From 4cdd2af600f9b2934f105a8512498a186197a8cd Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 29 Aug 2024 09:27:00 -0700 Subject: [PATCH 001/625] Upgrade to run under Linux 6.10.6 * Renamed install script to install_homa to avoid conflicts with the Linux install program * Reworked the timetracing mechanism used by the main Linux kernel to avoid header file conflicts --- README.md | 10 +-- cloudlab/bin/install | 41 ----------- homa_grant.c | 4 +- homa_impl.h | 85 +++++++++++++--------- homa_incoming.c | 5 +- homa_offload.c | 7 +- homa_plumbing.c | 31 ++------ homa_pool.c | 2 +- homa_skb.c | 45 +++++++----- test/mock.c | 145 ++++++++++++++++++++++---------------- test/mock.h | 10 ++- test/unit_homa_grant.c | 22 +++--- test/unit_homa_incoming.c | 49 +++++++------ test/unit_homa_offload.c | 82 ++++++++++----------- test/unit_homa_outgoing.c | 30 ++++---- test/unit_homa_peertab.c | 16 ++--- test/unit_homa_plumbing.c | 16 ++--- test/unit_homa_pool.c | 53 +++++++------- test/unit_homa_skb.c | 52 +++++++------- test/unit_homa_socktab.c | 8 +-- test/unit_homa_timer.c | 6 +- test/unit_homa_utils.c | 16 ++--- test/utils.h | 4 ++ timetrace.c | 15 ++-- 24 files changed, 378 insertions(+), 376 deletions(-) delete mode 100755 cloudlab/bin/install diff --git a/README.md b/README.md index 0e1a6f95..913fd2bc 100644 --- a/README.md +++ b/README.md @@ -25,10 +25,10 @@ This repo contains an implementation of the Homa transport protocol as a Linux k - Please contact me if you have any problems using this repo; I'm happy to provide advice and support. -- The head is known to work under Linux 6.1.38. In the past, Homa has - run under several earlier versions of Linux, including 5.17.7, - 5.4.80, and 4.15.18. There is a separate branch for each of these - older versions, with a names such as linux_4.15.18. Older branches are +- The head is known to work under Linux 6.10.6. In the past, Homa has + run under several earlier versions of Linux. There is a separate branch + for each of these + older versions, with names such as linux_4.15.18. Older branches are out of date feature-wise: recent commits have not been back-ported to them. Other versions of Linux have not been tested and may require code changes (these upgrades rarely take more than a couple @@ -47,7 +47,7 @@ This repo contains an implementation of the Homa transport protocol as a Linux k invoke it with no parameters to install and configure Homa on the current machine. -- The script `cloudlab/bin/install` will copy relevant Homa files +- The script `cloudlab/bin/install_homa` will copy relevant Homa files across a cluster of machines and configure Homa on each node. It assumes that nodes have names `nodeN` where N is a small integer, and it also assumes that you have already run `make` both in the top-level directory and diff --git a/cloudlab/bin/install b/cloudlab/bin/install deleted file mode 100755 index 120b6948..00000000 --- a/cloudlab/bin/install +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2020-2023 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause - -# This script installs all of the files needed to run Homa tests on one -# or more target machines; it also loads the Homa kernel module. -# -# Usage: -# install num_nodes [first] -# -# The "num_nodes" arguments indicates how many servers should be updated. -# The "first" argument is optional; it is an integer identifying the -# first node on which installation will occur (e.g. "install 4 2" means -# node2 through node5 will be updated. "first" defaults to 0. 
-# This script assumes that Homa has been built in ~/homaModule on the -# current machine (this includes both homa.ko and all of the binaries in util). - -root=~/homaModule - -set -e -if [ $# -eq 2 ]; then - first=$2 -elif [ $# -eq 1 ]; then - first=0 -else - echo "Usage: install num_nodes [first]" - exit 1 -fi -last=`expr $first + $1 - 1` || true - -for ((i = $first ; i <= $last; i++)); do - node=node$i - echo - echo '*** Installing on' $node '***' - rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" -rtv ~/.bashrc ~/.bash_profile ~/.gdbinit $node: - rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" -rtv --exclude __pycache__ ~/bin/ $node:bin/ - rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" -rtv $root/homa.ko $root/util/cp_node $root/util/homa_prio $root/util/*.py $node:bin/ - ssh -4 $node 'echo $PATH' - ssh -4 $node 'config default' -done \ No newline at end of file diff --git a/homa_grant.c b/homa_grant.c index 27df0b31..bf542e2c 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -15,7 +15,7 @@ * @rpc1: First RPC to consider. * @rpc2: Second RPC to consider. */ -int inline homa_grant_outranks(struct homa_rpc *rpc1, struct homa_rpc *rpc2) +inline int homa_grant_outranks(struct homa_rpc *rpc1, struct homa_rpc *rpc2) { /* Fewest bytes remaining is the primary criterion; if those are * equal, then favor the older RPC. @@ -39,7 +39,7 @@ int inline homa_grant_outranks(struct homa_rpc *rpc1, struct homa_rpc *rpc2) * may be possible to send out additional grants to some RPCs (doing * this is left to the caller). */ -int inline homa_grant_update_incoming(struct homa_rpc *rpc, struct homa *homa) { +inline int homa_grant_update_incoming(struct homa_rpc *rpc, struct homa *homa) { int incoming = rpc->msgin.granted - (rpc->msgin.length - rpc->msgin.bytes_remaining); if (incoming < 0) diff --git a/homa_impl.h b/homa_impl.h index 9fd46862..a0e68e0a 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -38,13 +38,15 @@ #include #include #include -#include #include +#include +#include #include #include #include #include #include +#include #pragma GCC diagnostic warning "-Wpointer-sign" #pragma GCC diagnostic warning "-Wunused-variable" @@ -53,53 +55,68 @@ typedef unsigned int __poll_t; #endif #ifdef __UNIT_TEST__ -#define spin_unlock mock_spin_unlock -extern void mock_spin_unlock(spinlock_t *lock); - -#undef get_cycles -#define get_cycles mock_get_cycles -extern cycles_t mock_get_cycles(void); +#undef alloc_pages +#define alloc_pages mock_alloc_pages +extern struct page *mock_alloc_pages(gfp_t gfp, unsigned order); -#define signal_pending(xxx) mock_signal_pending -extern int mock_signal_pending; - -#define rcu_read_lock mock_rcu_read_lock -extern void mock_rcu_read_lock(void); +#define compound_order mock_compound_order +extern unsigned int mock_compound_order(struct page *page); -#define rcu_read_unlock mock_rcu_read_unlock -extern void mock_rcu_read_unlock(void); +#define cpu_to_node mock_cpu_to_node +extern int mock_cpu_to_node(int cpu); #undef current #define current current_task +extern struct task_struct *current_task; -#define kthread_complete_and_exit(comp, code) +#undef get_cycles +#define get_cycles mock_get_cycles +extern cycles_t mock_get_cycles(void); +#define get_page mock_get_page + extern void mock_get_page(struct page *page); + +#undef kmalloc #define kmalloc mock_kmalloc extern void *mock_kmalloc(size_t size, gfp_t flags); -#define get_page mock_get_page -extern void mock_get_page(struct page *page); +#define kthread_complete_and_exit(comp, code) -#define put_page mock_put_page -extern 
void mock_put_page(struct page *page); +#ifdef page_address +#undef page_address +#endif +#define page_address(page) ((void *) page) -#define compound_order mock_compound_order -extern unsigned int mock_compound_order(struct page *page); +#define page_ref_count mock_page_refs +extern int mock_page_refs(struct page *page); #define page_to_nid mock_page_to_nid extern int mock_page_to_nid(struct page *page); -#define page_ref_count mock_page_refs -extern int mock_page_refs(struct page *page); +#define put_page mock_put_page +extern void mock_put_page(struct page *page); -#define cpu_to_node mock_cpu_to_node -extern int mock_cpu_to_node(int cpu); +#define rcu_read_lock mock_rcu_read_lock +extern void mock_rcu_read_lock(void); -#ifdef page_address -#undef page_address -#endif -#define page_address(page) ((void *) page) -#endif +#define rcu_read_unlock mock_rcu_read_unlock +extern void mock_rcu_read_unlock(void); + +#undef register_net_sysctl +#define register_net_sysctl mock_register_net_sysctl +extern struct ctl_table_header *mock_register_net_sysctl(struct net *net, + const char *path, struct ctl_table *table); + +#define signal_pending(xxx) mock_signal_pending +extern int mock_signal_pending; + +#define spin_unlock mock_spin_unlock +extern void mock_spin_unlock(spinlock_t *lock); + +#undef vmalloc +#define vmalloc mock_vmalloc +extern void *mock_vmalloc(size_t size); +#endif /* __UNIT_TEST__ */ /* Null out things that confuse VSCode Intellisense */ #ifdef __VSCODE__ @@ -879,7 +896,7 @@ struct homa_interest { * of a struct homa_interest. * @interest: Struct to initialize. */ -static void inline homa_interest_init(struct homa_interest *interest) +inline static void homa_interest_init(struct homa_interest *interest) { interest->thread = current; atomic_long_set(&interest->ready_rpc, 0); @@ -3682,8 +3699,8 @@ extern enum hrtimer_restart homa_hrtimer(struct hrtimer *timer); extern int homa_init(struct homa *homa); extern void homa_incoming_sysctl_changed(struct homa *homa); -extern int homa_ioc_abort(struct sock *sk, unsigned long arg); -extern int homa_ioctl(struct sock *sk, int cmd, unsigned long arg); +extern int homa_ioc_abort(struct sock *sk, int *karg); +extern int homa_ioctl(struct sock *sk, int cmd, int *karg); extern void homa_log_throttled(struct homa *homa); extern int homa_message_in_init(struct homa_rpc *rpc, int length, int unsched); @@ -3777,8 +3794,6 @@ extern struct homa_rpc extern int homa_rpc_reap(struct homa_sock *hsk, int count); extern void homa_send_ipis(void); extern int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); -extern int homa_sendpage(struct sock *sk, struct page *page, int offset, - size_t size, int flags); extern int homa_setsockopt(struct sock *sk, int level, int optname, sockptr_t __user optval, unsigned int optlen); extern int homa_shutdown(struct socket *sock, int how); diff --git a/homa_incoming.c b/homa_incoming.c index d5dbb3e7..81c9ddec 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -256,7 +256,6 @@ int homa_copy_to_user(struct homa_rpc *rpc) int pkt_length = homa_data_len(skbs[i]); int copied = 0; char *dst; - struct iovec iov; struct iov_iter iter; int buf_bytes, chunk_size; @@ -276,8 +275,8 @@ int homa_copy_to_user(struct homa_rpc *rpc) } chunk_size = buf_bytes; } - error = import_single_range(READ, dst, - chunk_size, &iov, &iter); + error = import_ubuf(READ, dst, chunk_size, + &iter); if (error) goto free_skbs; error = skb_copy_datagram_iter(skbs[i], diff --git a/homa_offload.c b/homa_offload.c index 502acde9..a6e34053 100644 
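[Note on the homa_incoming.c hunk above] It follows the removal of import_single_range() from the iov_iter API: import_ubuf() builds a single-segment iterator directly, so the scratch struct iovec disappears. A minimal sketch of the new pattern, reusing the variable names from homa_copy_to_user() (the trailing arguments of the copy call are cut off by the hunk boundary and are assumed here):

	struct iov_iter iter;
	int error;

	error = import_ubuf(READ, dst, chunk_size, &iter);
	if (error)
		goto free_skbs;
	/* skb offset argument assumed; it is whatever the existing call passes */
	error = skb_copy_datagram_iter(skbs[i], offset, &iter, chunk_size);

The homa_offload.c hunk below adapts to the 6.9 net_hotdata consolidation, which absorbed the formerly global rps_sock_flow_table and rps_cpu_mask used for RPS flow steering into struct net_hotdata.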
--- a/homa_offload.c +++ b/homa_offload.c @@ -140,13 +140,14 @@ static inline void homa_set_softirq_cpu(struct sk_buff *skb, int cpu) struct rps_sock_flow_table *sock_flow_table; int hash; - sock_flow_table = rcu_dereference(rps_sock_flow_table); + sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); if (sock_flow_table == NULL) return; - hash = cpu + rps_cpu_mask + 1; + hash = cpu + net_hotdata.rps_cpu_mask + 1; if (sock_flow_table->ents[hash] != hash) { rcu_read_lock(); - sock_flow_table = rcu_dereference(rps_sock_flow_table); + sock_flow_table = rcu_dereference( + net_hotdata.rps_sock_flow_table); sock_flow_table->ents[hash] = hash; rcu_read_unlock(); } diff --git a/homa_plumbing.c b/homa_plumbing.c index 533d2d48..ba820ffa 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -63,7 +63,6 @@ const struct proto_ops homa_proto_ops = { .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, .set_peek_off = sk_set_peek_off, }; @@ -85,7 +84,6 @@ const struct proto_ops homav6_proto_ops = { .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, .set_peek_off = sk_set_peek_off, }; @@ -108,7 +106,6 @@ struct proto homa_prot = { .getsockopt = homa_getsockopt, .sendmsg = homa_sendmsg, .recvmsg = homa_recvmsg, - .sendpage = homa_sendpage, .backlog_rcv = homa_backlog_rcv, .release_cb = ip4_datagram_release_cb, .hash = homa_hash, @@ -136,7 +133,6 @@ struct proto homav6_prot = { .getsockopt = homa_getsockopt, .sendmsg = homa_sendmsg, .recvmsg = homa_recvmsg, - .sendpage = homa_sendpage, .backlog_rcv = homa_backlog_rcv, .release_cb = ip6_datagram_release_cb, .hash = homa_hash, @@ -744,17 +740,17 @@ int homa_disconnect(struct sock *sk, int flags) { * homa_ioc_abort() - The top-level function for the ioctl that implements * the homa_abort user-level API. * @sk: Socket for this request. - * @arg: Used to pass information from user space. + * @karg: Used to pass information from user space. * * Return: 0 on success, otherwise a negative errno. */ -int homa_ioc_abort(struct sock *sk, unsigned long arg) { +int homa_ioc_abort(struct sock *sk, int *karg) { int ret = 0; struct homa_sock *hsk = homa_sk(sk); struct homa_abort_args args; struct homa_rpc *rpc; - if (unlikely(copy_from_user(&args, (void *) arg, sizeof(args)))) + if (unlikely(copy_from_user(&args, (void *) karg, sizeof(args)))) return -EFAULT; if (args._pad1 || args._pad2[0] || args._pad2[1]) { @@ -781,18 +777,18 @@ int homa_ioc_abort(struct sock *sk, unsigned long arg) { * homa_ioctl() - Implements the ioctl system call for Homa sockets. * @sk: Socket on which the system call was invoked. * @cmd: Identifier for a particular ioctl operation. - * @arg: Operation-specific argument; typically the address of a block + * @karg: Operation-specific argument; typically the address of a block * of data in user address space. * * Return: 0 on success, otherwise a negative errno. */ -int homa_ioctl(struct sock *sk, int cmd, unsigned long arg) { +int homa_ioctl(struct sock *sk, int cmd, int *karg) { int result; __u64 start = get_cycles(); switch (cmd) { case HOMAIOCABORT: - result = homa_ioc_abort(sk, arg); + result = homa_ioc_abort(sk, karg); INC_METRIC(abort_calls, 1); INC_METRIC(abort_cycles, get_cycles() - start); break; @@ -1157,21 +1153,6 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, return result; } -/** - * homa_sendpage() - ??. - * @sk: Socket for the operation - * @page: ?? - * @offset: ?? - * @size: ?? 
- * @flags: ?? - * Return: 0 on success, otherwise a negative errno. - */ -int homa_sendpage(struct sock *sk, struct page *page, int offset, - size_t size, int flags) { - printk(KERN_WARNING "unimplemented sendpage invoked on Homa socket\n"); - return -ENOSYS; -} - /** * homa_hash() - ??. * @sk: Socket for the operation diff --git a/homa_pool.c b/homa_pool.c index 9c7ce739..a8dec585 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -29,7 +29,7 @@ * The caller must own the lock for @pool->hsk. * @pool: Pool to update. */ -static void inline set_bpages_needed(struct homa_pool *pool) { +inline static void set_bpages_needed(struct homa_pool *pool) { struct homa_rpc *rpc = list_first_entry(&pool->hsk->waiting_for_bufs, struct homa_rpc, buf_links); pool->bpages_needed = (rpc->msgin.length + HOMA_BPAGE_SIZE - 1) diff --git a/homa_skb.c b/homa_skb.c index 455b2939..e49455f1 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -13,6 +13,11 @@ extern int mock_max_skb_frags; #define HOMA_MAX_SKB_FRAGS MAX_SKB_FRAGS #endif +static inline void frag_page_set(skb_frag_t *frag, struct page *page) +{ + frag->netmem = page_to_netmem(page); +} + /** * homa_skb_page_pool_init() - Invoked when a struct homa is created to * initialize a page pool. @@ -144,15 +149,15 @@ void *homa_skb_extend_frags(struct homa *homa, struct sk_buff *skb, int *length) int actual_size = *length; /* Can we just extend the skb's last fragment? */ - if ((shinfo->nr_frags > 0) && (frag->bv_page == core->skb_page) + if ((shinfo->nr_frags > 0) && (skb_frag_page(frag) == core->skb_page) && (core->page_inuse < core->page_size) - && ((frag->bv_offset + frag->bv_len) + && ((frag->offset + skb_frag_size(frag)) == core->page_inuse)) { if ((core->page_size - core->page_inuse) < actual_size) actual_size = core->page_size - core->page_inuse; *length = actual_size; - frag->bv_len += actual_size; - result = page_address(frag->bv_page) + core->page_inuse; + skb_frag_size_add(frag, actual_size); + result = page_address(skb_frag_page(frag)) + core->page_inuse; core->page_inuse += actual_size; skb_len_add(skb, actual_size); return result; @@ -168,12 +173,12 @@ void *homa_skb_extend_frags(struct homa *homa, struct sk_buff *skb, int *length) actual_size = core->page_size - core->page_inuse; frag = &shinfo->frags[shinfo->nr_frags]; shinfo->nr_frags++; - frag->bv_page = core->skb_page; + frag_page_set(frag, core->skb_page); get_page(core->skb_page); - frag->bv_offset = core->page_inuse; + frag->offset = core->page_inuse; *length = actual_size; - frag->bv_len = actual_size; - result = page_address(frag->bv_page) + core->page_inuse; + skb_frag_size_set(frag, actual_size); + result = page_address(skb_frag_page(frag)) + core->page_inuse; core->page_inuse += actual_size; skb_len_add(skb, actual_size); return result; @@ -350,22 +355,23 @@ int homa_skb_append_from_skb(struct homa *homa, struct sk_buff *dst_skb, src_frag_offset = head_len; for (src_frags_left = src_shinfo->nr_frags, src_frag = &src_shinfo->frags[0]; (src_frags_left > 0) && (length > 0); - src_frags_left--, src_frag_offset += src_frag->bv_len, src_frag++) + src_frags_left--, src_frag_offset += skb_frag_size(src_frag), + src_frag++) { - if (offset >= (src_frag_offset + src_frag->bv_len)) + if (offset >= (src_frag_offset + skb_frag_size(src_frag))) continue; - chunk_size = src_frag->bv_len - (offset - src_frag_offset); + chunk_size = skb_frag_size(src_frag) - (offset - src_frag_offset); if (chunk_size > length) chunk_size = length; if (dst_shinfo->nr_frags == HOMA_MAX_SKB_FRAGS) return -EINVAL; dst_frag = 
&dst_shinfo->frags[dst_shinfo->nr_frags]; dst_shinfo->nr_frags++; - dst_frag->bv_page = src_frag->bv_page; - get_page(src_frag->bv_page); - dst_frag->bv_offset = src_frag->bv_offset + frag_page_set(dst_frag, skb_frag_page(src_frag)); + get_page(skb_frag_page(src_frag)); + dst_frag->offset = src_frag->offset + (offset - src_frag_offset); - dst_frag->bv_len = chunk_size; + skb_frag_size_set(dst_frag, chunk_size); offset += chunk_size; length -= chunk_size; skb_len_add(dst_skb, chunk_size); @@ -498,13 +504,14 @@ void homa_skb_get(struct sk_buff *skb, void *dest, int offset, int length) frag_offset = head_len; for (frags_left = shinfo->nr_frags, frag = &shinfo->frags[0]; (frags_left > 0) && (length > 0); - frags_left--, frag_offset += frag->bv_len, frag++) { - if (offset >= (frag_offset + frag->bv_len)) + frags_left--, + frag_offset += skb_frag_size(frag), frag++) { + if (offset >= (frag_offset + skb_frag_size(frag))) continue; - chunk_size = frag->bv_len - (offset - frag_offset); + chunk_size = skb_frag_size(frag) - (offset - frag_offset); if (chunk_size > length) chunk_size = length; - memcpy(dst, page_address(frag->bv_page) + frag->bv_offset + memcpy(dst, page_address(skb_frag_page(frag)) + frag->offset + (offset - frag_offset), chunk_size); offset += chunk_size; diff --git a/test/mock.c b/test/mock.c index abe99df8..6e2b3e01 100644 --- a/test/mock.c +++ b/test/mock.c @@ -38,7 +38,7 @@ int mock_copy_data_errors = 0; int mock_copy_to_iter_errors = 0; int mock_copy_to_user_errors = 0; int mock_cpu_idle = 0; -int mock_import_single_range_errors = 0; +int mock_import_ubuf_errors = 0; int mock_import_iovec_errors = 0; int mock_ip6_xmit_errors = 0; int mock_ip_queue_xmit_errors = 0; @@ -136,9 +136,6 @@ bool mock_ipv6 = true; /* The value to use for mock_ipv6 in each test unless overridden. */ bool mock_ipv6_default; -/* Linux's idea of the current CPU number. */ -int cpu_number = 1; - /* List of priorities for all outbound packets. 
*/ char mock_xmit_prios[1000]; int mock_xmit_prios_offset = 0; @@ -184,26 +181,16 @@ unsigned long page_offset_base = 0; unsigned long phys_base = 0; unsigned long vmemmap_base = 0; int __preempt_count = 0; +struct pcpu_hot pcpu_hot = {.cpu_number = 1}; char sock_flow_table[RPS_SOCK_FLOW_TABLE_SIZE(1024)]; -struct rps_sock_flow_table *rps_sock_flow_table - = (struct rps_sock_flow_table *) sock_flow_table; -__u32 rps_cpu_mask = 0x1f; +struct net_hotdata net_hotdata = { + .rps_cpu_mask = 0x1f, + .rps_sock_flow_table = (struct rps_sock_flow_table *) sock_flow_table +}; extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) {} -struct page *alloc_pages(gfp_t gfp, unsigned order) -{ - struct page *page; - if (mock_check_error(&mock_alloc_page_errors)) - return NULL; - page = (struct page *) malloc(PAGE_SIZE << order); - if (!pages_in_use) - pages_in_use = unit_hash_new(); - unit_hash_set(pages_in_use, page, (char *) 1); - return page; -} - struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, int node) { @@ -257,7 +244,7 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *iter) return 0; } while (bytes_left > 0) { - struct iovec *iov = (struct iovec *) iter->iov; + struct iovec *iov = (struct iovec *) iter_iov(iter); __u64 int_base = (__u64) iov->iov_base; size_t chunk_bytes = iov->iov_len; if (chunk_bytes > bytes_left) @@ -269,7 +256,7 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *iter) iov->iov_base = (void *) (int_base + chunk_bytes); iov->iov_len -= chunk_bytes; if (iov->iov_len == 0) - iter->iov++; + iter->__iov++; } return bytes; } @@ -336,8 +323,8 @@ void dst_release(struct dst_entry *dst) { if (!dst) return; - dst->__refcnt.counter--; - if (dst->__refcnt.counter > 0) + atomic_dec(&dst->__rcuref.refcnt); + if (atomic_read(&dst->__rcuref.refcnt) > 0) return; if (!routes_in_use || unit_hash_get(routes_in_use, dst) == NULL) { FAIL("dst_release on unknown route"); @@ -422,14 +409,11 @@ ssize_t import_iovec(int type, const struct iovec __user * uvector, return size; } -int import_single_range(int type, void __user *buf, size_t len, - struct iovec *iov, struct iov_iter *i) +int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i) { - if (mock_check_error(&mock_import_single_range_errors)) + if (mock_check_error(&mock_import_ubuf_errors)) return -EACCES; - iov->iov_base = buf; - iov->iov_len = len; - iov_iter_init(i, type, iov, 1, len); + iov_iter_ubuf(i, rw, buf, len); return 0; } @@ -540,7 +524,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction, { direction &= READ | WRITE; i->iter_type = ITER_IOVEC | direction; - i->iov = iov; + i->__iov = iov; i->nr_segs = nr_segs; i->iov_offset = 0; i->count = count; @@ -572,7 +556,7 @@ struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, FAIL("malloc failed"); return ERR_PTR(-ENOMEM); } - route->dst.__refcnt.counter = 1; + atomic_set(&route->dst.__rcuref.refcnt, 1); route->dst.ops = &mock_dst_ops; route->dst.dev = &mock_net_device; route->dst.obsolete = 0; @@ -667,7 +651,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, FAIL("malloc failed"); return ERR_PTR(-ENOMEM); } - route->dst.__refcnt.counter = 1; + atomic_set(&route->dst.__rcuref.refcnt, 1); route->dst.ops = &mock_dst_ops; route->dst.dev = &mock_net_device; route->dst.obsolete = 0; @@ -738,7 +722,7 @@ void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) shinfo->frag_list = next; } for (i = 0; i < 
shinfo->nr_frags; i++) { - put_page(shinfo->frags[i].bv_page); + put_page(skb_frag_page(&shinfo->frags[i])); } free(skb->head); free(skb); @@ -917,14 +901,13 @@ int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) return 1; } -void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t) {} - -struct ctl_table_header *register_net_sysctl(struct net *net, - const char *path, struct ctl_table *table) +bool rcuref_get_slowpath(rcuref_t *ref) { - return NULL; + return true; } +void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t) {} + void release_sock(struct sock *sk) { mock_active_locks--; @@ -939,7 +922,8 @@ void schedule(void) UNIT_HOOK("schedule"); } -void security_sk_classify_flow(struct sock *sk, struct flowi_common *flic) {} +void security_sk_classify_flow(const struct sock *sk, + struct flowi_common *flic) {} void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) {} @@ -965,7 +949,7 @@ int skb_copy_datagram_iter(const struct sk_buff *from, int offset, return 0; } while (bytes_left > 0) { - struct iovec *iov = (struct iovec *) iter->iov; + struct iovec *iov = (struct iovec *) iter_iov(iter); __u64 int_base = (__u64) iov->iov_base; size_t chunk_bytes = iov->iov_len; if (chunk_bytes > bytes_left) @@ -980,7 +964,7 @@ int skb_copy_datagram_iter(const struct sk_buff *from, int offset, iov->iov_base = (void *) (int_base + chunk_bytes); iov->iov_len -= chunk_bytes; if (iov->iov_len == 0) - iter->iov++; + iter->__iov++; } return 0; } @@ -1049,8 +1033,8 @@ int sock_common_setsockopt(struct socket *sock, int level, int optname, return 0; } -int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +int sock_no_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { return 0; } @@ -1108,21 +1092,6 @@ int vfs_fsync(struct file *file, int datasync) return 0; } -void *vmalloc(size_t size) -{ - if (mock_check_error(&mock_vmalloc_errors)) - return NULL; - void *block = malloc(size); - if (!block) { - FAIL("malloc failed"); - return NULL; - } - if (!vmallocs_in_use) - vmallocs_in_use = unit_hash_new(); - unit_hash_set(vmallocs_in_use, block, "used"); - return block; -} - void wait_for_completion(struct completion *x) {} long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, @@ -1151,6 +1120,22 @@ int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, return 0; } +/** + * mock_alloc_pages() - Called instead of alloc_pages when Homa is compiled + * for unit testing. + */ +struct page *mock_alloc_pages(gfp_t gfp, unsigned order) +{ + struct page *page; + if (mock_check_error(&mock_alloc_page_errors)) + return NULL; + page = (struct page *)malloc(PAGE_SIZE << order); + if (!pages_in_use) + pages_in_use = unit_hash_new(); + unit_hash_set(pages_in_use, page, (char *)1); + return page; +} + /** * mock_check_error() - Determines whether a method should simulate an error * return. @@ -1304,6 +1289,25 @@ void mock_rcu_read_unlock(void) mock_active_rcu_locks--; } +/** + * mock_register_net_sysctl() - Called instead of register_net_sysctl + * when Homa is compiled for unit testing. + */ +struct ctl_table_header *mock_register_net_sysctl(struct net *net, + const char *path, struct ctl_table *table) +{ + return NULL; +} + +/** + * mock_set_core() - Set internal state that indicates the "current core". + * @num: Integer identifier for a core. 
+ */ +void mock_set_core(int num) +{ + pcpu_hot.cpu_number = num; +} + /** * mock_skb_new() - Allocate and return a packet buffer. The buffer is * initialized as if it just arrived from the network. @@ -1451,7 +1455,7 @@ void mock_spin_unlock(spinlock_t *lock) */ void mock_teardown(void) { - cpu_number = 1; + pcpu_hot.cpu_number = 1; cpu_khz = 1000000; mock_alloc_page_errors = 0; mock_alloc_skb_errors = 0; @@ -1461,7 +1465,7 @@ void mock_teardown(void) mock_cpu_idle = 0; mock_cycles = 0; mock_ipv6 = mock_ipv6_default; - mock_import_single_range_errors = 0; + mock_import_ubuf_errors = 0; mock_import_iovec_errors = 0; mock_ip6_xmit_errors = 0; mock_ip_queue_xmit_errors = 0; @@ -1535,3 +1539,24 @@ void mock_teardown(void) unit_hook_clear(); } + +/** + * mock_vmalloc() - Called instead of vmalloc when Homa is compiled + * for unit testing. + * @size: Number of bytes to allocate. + */ +void *mock_vmalloc(size_t size) +{ + if (mock_check_error(&mock_vmalloc_errors)) + return NULL; + void *block = malloc(size); + if (!block) + { + FAIL("malloc failed"); + return NULL; + } + if (!vmallocs_in_use) + vmallocs_in_use = unit_hash_new(); + unit_hash_set(vmallocs_in_use, block, "used"); + return block; +} diff --git a/test/mock.h b/test/mock.h index c2e19a7d..20dcb2bf 100644 --- a/test/mock.h +++ b/test/mock.h @@ -4,7 +4,6 @@ /* Functions for mocking that are exported to test code. */ -extern int cpu_number; extern int mock_alloc_page_errors; extern int mock_alloc_skb_errors; extern int mock_bpage_size; @@ -16,7 +15,7 @@ extern int mock_copy_to_user_errors; extern int mock_cpu_idle; extern cycles_t mock_cycles; extern int mock_import_iovec_errors; -extern int mock_import_single_range_errors; +extern int mock_import_ubuf_errors; extern int mock_ip6_xmit_errors; extern int mock_ip_queue_xmit_errors; extern bool mock_ipv6; @@ -40,6 +39,8 @@ extern int mock_vmalloc_errors; extern int mock_xmit_log_verbose; extern int mock_xmit_log_homa_info; +extern struct page * + mock_alloc_pages(gfp_t gfp, unsigned order); extern int mock_check_error(int *errorMask); extern void mock_clear_xmit_prios(void); extern void mock_data_ready(struct sock *sk); @@ -52,6 +53,10 @@ extern int mock_page_refs(struct page *page); extern void mock_put_page(struct page *page); extern void mock_rcu_read_lock(void); extern void mock_rcu_read_unlock(void); +extern struct ctl_table_header * + mock_register_net_sysctl(struct net *net, + const char *path, struct ctl_table *table); +extern void mock_set_core(int num); extern void mock_spin_lock(spinlock_t *lock); extern void mock_spin_unlock(spinlock_t *lock); extern int mock_skb_count(void); @@ -63,3 +68,4 @@ extern void mock_sock_destroy(struct homa_sock *hsk, extern void mock_sock_init(struct homa_sock *hsk, struct homa *homa, int port); extern void mock_teardown(void); +extern void *mock_vmalloc(size_t size); diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 5b1bb526..fcfc529a 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -192,7 +192,7 @@ TEST_F(homa_grant, homa_grant_add_rpc__update_metrics) mock_cycles = 200; test_rpc(self, 100, self->server_ip, 100000); EXPECT_EQ(4, self->homa.num_grantable_rpcs); - EXPECT_EQ(300, homa_cores[cpu_number]->metrics.grantable_rpcs_integral); + EXPECT_EQ(300, core_metrics.grantable_rpcs_integral); EXPECT_EQ(200, self->homa.last_grantable_change); } TEST_F(homa_grant, homa_grant_add_rpc__insert_in_peer_list) @@ -335,7 +335,7 @@ TEST_F(homa_grant, homa_grant_remove_rpc__update_metrics) homa_grant_remove_rpc(rpc); 
EXPECT_EQ(2, self->homa.num_grantable_rpcs); - EXPECT_EQ(300, homa_cores[cpu_number]->metrics.grantable_rpcs_integral); + EXPECT_EQ(300, core_metrics.grantable_rpcs_integral); EXPECT_EQ(200, self->homa.last_grantable_change); } TEST_F(homa_grant, homa_grant_remove_rpc__not_first_in_peer_list) @@ -739,7 +739,7 @@ TEST_F(homa_grant, homa_grant_recalc__basics) EXPECT_EQ(2, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(-1, atomic_read(&rpc4->msgin.rank)); - EXPECT_NE(0, homa_cores[cpu_number]->metrics.grant_recalc_cycles); + EXPECT_NE(0, core_metrics.grant_recalc_cycles); } TEST_F(homa_grant, homa_grant_recalc__already_locked) { @@ -763,7 +763,7 @@ TEST_F(homa_grant, homa_grant_recalc__skip_recalc) EXPECT_STREQ("", unit_log_get()); EXPECT_EQ(0, rpc->msgin.granted); EXPECT_EQ(2, atomic_read(&self->homa.grant_recalc_count)); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.grant_recalc_skips); + EXPECT_EQ(1, core_metrics.grant_recalc_skips); } TEST_F(homa_grant, homa_grant_recalc__clear_existing_active_rpcs) { @@ -871,14 +871,14 @@ TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted_but_skip_recalc) unit_hook_register(grantable_spinlock_hook); hook_homa = &self->homa; mock_trylock_errors = 0xfe0; - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.grant_recalc_skips); + EXPECT_EQ(0, core_metrics.grant_recalc_skips); homa_grant_recalc(&self->homa, 0); EXPECT_EQ(10000, rpc1->msgin.granted); EXPECT_EQ(10000, rpc2->msgin.granted); EXPECT_EQ(0, rpc3->msgin.granted); EXPECT_EQ(0, rpc4->msgin.granted); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.grant_recalc_skips); + EXPECT_EQ(1, core_metrics.grant_recalc_skips); } TEST_F(homa_grant, homa_grant_pick_rpcs__basics) @@ -1073,8 +1073,8 @@ TEST_F(homa_grant, homa_grantable_lock_slow__basics) EXPECT_EQ(1, homa_grantable_lock_slow(&self->homa, 0)); homa_grantable_unlock(&self->homa); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.grantable_lock_misses); - EXPECT_EQ(500, homa_cores[cpu_number]->metrics.grantable_lock_miss_cycles); + EXPECT_EQ(1, core_metrics.grantable_lock_misses); + EXPECT_EQ(500, core_metrics.grantable_lock_miss_cycles); } TEST_F(homa_grant, homa_grantable_lock_slow__recalc_count) { @@ -1086,12 +1086,12 @@ TEST_F(homa_grant, homa_grantable_lock_slow__recalc_count) EXPECT_EQ(0, homa_grantable_lock_slow(&self->homa, 1)); hook_homa = NULL; - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.grantable_lock_misses); - EXPECT_EQ(500, homa_cores[cpu_number]->metrics.grantable_lock_miss_cycles); + EXPECT_EQ(1, core_metrics.grantable_lock_misses); + EXPECT_EQ(500, core_metrics.grantable_lock_miss_cycles); /* Make sure the check only occurs if the recalc argument is set. 
*/ mock_trylock_errors = 0xff; EXPECT_EQ(1, homa_grantable_lock_slow(&self->homa, 0)); - EXPECT_EQ(2, homa_cores[cpu_number]->metrics.grantable_lock_misses); + EXPECT_EQ(2, core_metrics.grantable_lock_misses); homa_grantable_unlock(&self->homa); } \ No newline at end of file diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 003b04ee..86e88663 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -222,11 +222,11 @@ TEST_F(homa_incoming, homa_message_in_init__update_metrics) EXPECT_EQ(0, homa_message_in_init(crpc, 0x3000, 0)); EXPECT_EQ(0, homa_message_in_init(crpc, 1000000, 0)); EXPECT_EQ(0, homa_message_in_init(crpc, 900000, 0)); - EXPECT_EQ(270, homa_cores[cpu_number]->metrics.small_msg_bytes[2]); - EXPECT_EQ(0xfff, homa_cores[cpu_number]->metrics.small_msg_bytes[63]); - EXPECT_EQ(0x3000, homa_cores[cpu_number]->metrics.medium_msg_bytes[11]); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.medium_msg_bytes[15]); - EXPECT_EQ(1900000, homa_cores[cpu_number]->metrics.large_msg_bytes); + EXPECT_EQ(270, core_metrics.small_msg_bytes[2]); + EXPECT_EQ(0xfff, core_metrics.small_msg_bytes[63]); + EXPECT_EQ(0x3000, core_metrics.medium_msg_bytes[11]); + EXPECT_EQ(0, core_metrics.medium_msg_bytes[15]); + EXPECT_EQ(1900000, core_metrics.large_msg_bytes); } TEST_F(homa_incoming, homa_gap_retry) @@ -567,21 +567,21 @@ TEST_F(homa_incoming, homa_add_packet__metrics) homa_add_packet(crpc, mock_skb_new(self->client_ip, &self->data.common, 1400, 0)); EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.resent_discards); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.packet_discards); + EXPECT_EQ(0, core_metrics.resent_discards); + EXPECT_EQ(1, core_metrics.packet_discards); self->data.retransmit = 1; homa_add_packet(crpc, mock_skb_new(self->client_ip, &self->data.common, 1400, 0)); EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.resent_discards); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.packet_discards); + EXPECT_EQ(1, core_metrics.resent_discards); + EXPECT_EQ(1, core_metrics.packet_discards); self->data.seg.offset = htonl(4200); homa_add_packet(crpc, mock_skb_new(self->client_ip, &self->data.common, 1400, 4200)); EXPECT_EQ(1, skb_queue_len(&crpc->msgin.packets)); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.resent_packets_used); + EXPECT_EQ(1, core_metrics.resent_packets_used); } TEST_F(homa_incoming, homa_copy_to_user__basics) @@ -718,7 +718,7 @@ TEST_F(homa_incoming, homa_copy_to_user__error_in_import_single_range) ASSERT_NE(NULL, crpc); unit_log_clear(); - mock_import_single_range_errors = 1; + mock_import_ubuf_errors = 1; EXPECT_EQ(13, -homa_copy_to_user(crpc)); EXPECT_STREQ("", unit_log_get()); EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); @@ -859,7 +859,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__cant_create_server_rpc) 1400, 0), &self->homa); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); EXPECT_EQ(0, mock_skb_count()); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.server_cant_create_rpcs); + EXPECT_EQ(1, core_metrics.server_cant_create_rpcs); } TEST_F(homa_incoming, homa_dispatch_pkts__existing_server_rpc) { @@ -922,7 +922,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_client_rpc) mock_xmit_log_verbose = 1; homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), &self->homa); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.unknown_rpcs); + EXPECT_EQ(1, core_metrics.unknown_rpcs); } TEST_F(homa_incoming, 
homa_dispatch_pkts__unknown_server_rpc) { @@ -933,7 +933,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_server_rpc) mock_xmit_log_verbose = 1; homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), &self->homa); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.unknown_rpcs); + EXPECT_EQ(0, core_metrics.unknown_rpcs); } TEST_F(homa_incoming, homa_dispatch_pkts__cutoffs_for_unknown_client_rpc) { @@ -1009,7 +1009,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_type) .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = 99}; homa_dispatch_pkts(mock_skb_new(self->client_ip, &h, 0, 0), &self->homa); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.unknown_packet_types); + EXPECT_EQ(1, core_metrics.unknown_packet_types); } TEST_F(homa_incoming, homa_dispatch_pkts__handle_ack) { @@ -1077,7 +1077,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__forced_reap) homa_dispatch_pkts(mock_skb_new(self->client_ip, &self->data.common, 1400, 0), &self->homa); EXPECT_EQ(31, self->hsk.dead_skbs); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.data_pkt_reap_cycles); + EXPECT_EQ(0, core_metrics.data_pkt_reap_cycles); /* Second packet: must reap. */ self->homa.dead_buffs_limit = 15; @@ -1085,7 +1085,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__forced_reap) homa_dispatch_pkts(mock_skb_new(self->client_ip, &self->data.common, 1400, 0), &self->homa); EXPECT_EQ(21, self->hsk.dead_skbs); - EXPECT_NE(0, homa_cores[cpu_number]->metrics.data_pkt_reap_cycles); + EXPECT_NE(0, core_metrics.data_pkt_reap_cycles); } TEST_F(homa_incoming, homa_data_pkt__basics) @@ -1104,7 +1104,7 @@ TEST_F(homa_incoming, homa_data_pkt__basics) EXPECT_EQ(200, crpc->msgin.bytes_remaining); EXPECT_EQ(1, skb_queue_len(&crpc->msgin.packets)); EXPECT_EQ(1600, crpc->msgin.granted); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.responses_received); + EXPECT_EQ(1, core_metrics.responses_received); } TEST_F(homa_incoming, homa_data_pkt__wrong_client_rpc_state) { @@ -1169,7 +1169,7 @@ TEST_F(homa_incoming, homa_data_pkt__no_buffers) atomic_set(&self->hsk.buffer_pool.free_bpages, 0); homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, 1400, 0), crpc); - EXPECT_EQ(1400, homa_cores[cpu_number]->metrics.dropped_data_no_bufs); + EXPECT_EQ(1400, core_metrics.dropped_data_no_bufs); EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); } TEST_F(homa_incoming, homa_data_pkt__update_delta) @@ -1598,7 +1598,7 @@ TEST_F(homa_incoming, homa_cutoffs__cant_find_peer) struct sk_buff *skb = mock_skb_new(self->server_ip, &h.common, 0, 0); mock_kmalloc_errors = 1; homa_cutoffs_pkt(skb, &self->hsk); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.peer_kmalloc_errors); + EXPECT_EQ(1, core_metrics.peer_kmalloc_errors); peer = homa_peer_find(&self->homa.peers, self->server_ip, &self->hsk.inet); ASSERT_FALSE(IS_ERR(peer)); @@ -1622,7 +1622,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_fully_received) &self->homa); EXPECT_STREQ("xmit ACK from 0.0.0.0:32768, dport 99, id 1234, acks", unit_log_get()); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.packets_received[ + EXPECT_EQ(1, core_metrics.packets_received[ NEED_ACK - DATA]); } TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_not_fully_received) @@ -1641,7 +1641,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_not_fully_received) homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), &self->homa); EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.packets_received[ + EXPECT_EQ(1, 
core_metrics.packets_received[ NEED_ACK - DATA]); } TEST_F(homa_incoming, homa_need_ack_pkt__rpc_not_incoming) @@ -1660,7 +1660,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_not_incoming) homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), &self->homa); EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.packets_received[ + EXPECT_EQ(1, core_metrics.packets_received[ NEED_ACK - DATA]); } TEST_F(homa_incoming, homa_need_ack_pkt__rpc_doesnt_exist) @@ -1701,8 +1701,7 @@ TEST_F(homa_incoming, homa_ack_pkt__target_rpc_exists) homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_EQ(0, unit_list_length(&self->hsk2.active_rpcs)); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.packets_received[ - ACK - DATA]); + EXPECT_EQ(1, core_metrics.packets_received[ACK - DATA]); } TEST_F(homa_incoming, homa_ack_pkt__target_rpc_doesnt_exist) { diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index 70de2ca8..ae409a55 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -87,7 +87,7 @@ FIXTURE_SETUP(homa_offload) /* Configure so core isn't considered too busy for bypasses. */ mock_cycles = 1000; self->homa.gro_busy_cycles = 500; - homa_cores[cpu_number]->last_gro = 400; + cur_core->last_gro = 400; } FIXTURE_TEARDOWN(homa_offload) { @@ -159,10 +159,10 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_homa_ipv6) h->flags = HOMA_TCP_FLAGS; h->urgent = htons(HOMA_TCP_URGENT); NAPI_GRO_CB(skb)->same_flow = 0; - homa_cores[cpu_number]->held_skb = NULL; - homa_cores[cpu_number]->held_bucket = 99; + cur_core->held_skb = NULL; + cur_core->held_bucket = 99; EXPECT_EQ(NULL, homa_tcp_gro_receive(&self->empty_list, skb)); - EXPECT_EQ(skb, homa_cores[cpu_number]->held_skb); + EXPECT_EQ(skb, cur_core->held_skb); EXPECT_STREQ("", unit_log_get()); EXPECT_EQ(IPPROTO_HOMA, ipv6_hdr(skb)->nexthdr); kfree_skb(skb); @@ -181,10 +181,10 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_homa_ipv4) h->flags = HOMA_TCP_FLAGS; h->urgent = htons(HOMA_TCP_URGENT); NAPI_GRO_CB(skb)->same_flow = 0; - homa_cores[cpu_number]->held_skb = NULL; - homa_cores[cpu_number]->held_bucket = 99; + cur_core->held_skb = NULL; + cur_core->held_bucket = 99; EXPECT_EQ(NULL, homa_tcp_gro_receive(&self->empty_list, skb)); - EXPECT_EQ(skb, homa_cores[cpu_number]->held_skb); + EXPECT_EQ(skb, cur_core->held_skb); EXPECT_STREQ("", unit_log_get()); EXPECT_EQ(IPPROTO_HOMA, ip_hdr(skb)->protocol); EXPECT_EQ(2303, ip_hdr(skb)->check); @@ -220,8 +220,8 @@ TEST_F(homa_offload, homa_gro_receive__update_offset_from_sequence) self->header.seg.offset = -1; skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); NAPI_GRO_CB(skb)->same_flow = 0; - homa_cores[cpu_number]->held_skb = NULL; - homa_cores[cpu_number]->held_bucket = 99; + cur_core->held_skb = NULL; + cur_core->held_bucket = 99; EXPECT_EQ(NULL, homa_gro_receive(&self->empty_list, skb)); h = (struct data_header *) skb_transport_header(skb); EXPECT_EQ(6000, htonl(h->seg.offset)); @@ -267,33 +267,33 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS) skb = mock_skb_new(&self->ip, &h.common, 1400, 2000); struct sk_buff *result = homa_gro_receive(&self->empty_list, skb); EXPECT_EQ(0, -PTR_ERR(result)); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.gro_data_bypasses); + EXPECT_EQ(0, core_metrics.gro_data_bypasses); /* Second attempt: HOMA_GRO_SHORT_BYPASS enabled but message longer * than one packet. 
*/ self->homa.gro_policy |= HOMA_GRO_SHORT_BYPASS; - homa_cores[cpu_number]->last_gro = 400; + cur_core->last_gro = 400; skb2 = mock_skb_new(&self->ip, &h.common, 1400, 2000); result = homa_gro_receive(&self->empty_list, skb2); EXPECT_EQ(0, -PTR_ERR(result)); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.gro_data_bypasses); + EXPECT_EQ(0, core_metrics.gro_data_bypasses); /* Third attempt: bypass should happen. */ h.message_length = htonl(1400); h.incoming = htonl(1400); - homa_cores[cpu_number]->last_gro = 400; + cur_core->last_gro = 400; skb3 = mock_skb_new(&self->ip, &h.common, 1400, 4000); result = homa_gro_receive(&self->empty_list, skb3); EXPECT_EQ(EINPROGRESS, -PTR_ERR(result)); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.gro_data_bypasses); + EXPECT_EQ(1, core_metrics.gro_data_bypasses); /* Third attempt: no bypass because core busy. */ - homa_cores[cpu_number]->last_gro = 600; + cur_core->last_gro = 600; skb4 = mock_skb_new(&self->ip, &h.common, 1400, 4000); result = homa_gro_receive(&self->empty_list, skb3); EXPECT_EQ(0, -PTR_ERR(result)); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.gro_data_bypasses); + EXPECT_EQ(1, core_metrics.gro_data_bypasses); kfree_skb(skb); kfree_skb(skb2); @@ -326,24 +326,24 @@ TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization) struct sk_buff *skb = mock_skb_new(&client_ip, &h.common, 0, 0); struct sk_buff *result = homa_gro_receive(&self->empty_list, skb); EXPECT_EQ(0, -PTR_ERR(result)); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.gro_grant_bypasses); + EXPECT_EQ(0, core_metrics.gro_grant_bypasses); EXPECT_STREQ("", unit_log_get()); /* Second attempt: HOMA_FAST_GRANTS is enabled. */ self->homa.gro_policy = HOMA_GRO_FAST_GRANTS; - homa_cores[cpu_number]->last_gro = 400; + cur_core->last_gro = 400; struct sk_buff *skb2 = mock_skb_new(&client_ip, &h.common, 0, 0); result = homa_gro_receive(&self->empty_list, skb2); EXPECT_EQ(EINPROGRESS, -PTR_ERR(result)); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.gro_grant_bypasses); + EXPECT_EQ(1, core_metrics.gro_grant_bypasses); EXPECT_SUBSTR("xmit DATA 1400@10000", unit_log_get()); /* Third attempt: core is too busy for fast grants. 
*/ - homa_cores[cpu_number]->last_gro = 600; + cur_core->last_gro = 600; struct sk_buff *skb3 = mock_skb_new(&client_ip, &h.common, 0, 0); result = homa_gro_receive(&self->empty_list, skb3); EXPECT_EQ(0, -PTR_ERR(result)); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.gro_grant_bypasses); + EXPECT_EQ(1, core_metrics.gro_grant_bypasses); kfree_skb(skb); kfree_skb(skb3); } @@ -354,13 +354,13 @@ TEST_F(homa_offload, homa_gro_receive__no_held_skb) self->header.seg.offset = htonl(6000); skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); NAPI_GRO_CB(skb)->same_flow = 0; - homa_cores[cpu_number]->held_skb = NULL; - homa_cores[cpu_number]->held_bucket = 99; + cur_core->held_skb = NULL; + cur_core->held_bucket = 99; EXPECT_EQ(NULL, homa_gro_receive(&self->empty_list, skb)); same_flow = NAPI_GRO_CB(skb)->same_flow; EXPECT_EQ(0, same_flow); - EXPECT_EQ(skb, homa_cores[cpu_number]->held_skb); - EXPECT_EQ(3, homa_cores[cpu_number]->held_bucket); + EXPECT_EQ(skb, cur_core->held_skb); + EXPECT_EQ(3, cur_core->held_bucket); kfree_skb(skb); } TEST_F(homa_offload, homa_gro_receive__empty_merge_list) @@ -370,21 +370,21 @@ TEST_F(homa_offload, homa_gro_receive__empty_merge_list) self->header.seg.offset = htonl(6000); skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); NAPI_GRO_CB(skb)->same_flow = 0; - homa_cores[cpu_number]->held_skb = skb; - homa_cores[cpu_number]->held_bucket = 3; + cur_core->held_skb = skb; + cur_core->held_bucket = 3; EXPECT_EQ(NULL, homa_gro_receive(&self->empty_list, skb)); same_flow = NAPI_GRO_CB(skb)->same_flow; EXPECT_EQ(0, same_flow); - EXPECT_EQ(skb, homa_cores[cpu_number]->held_skb); - EXPECT_EQ(3, homa_cores[cpu_number]->held_bucket); + EXPECT_EQ(skb, cur_core->held_skb); + EXPECT_EQ(3, cur_core->held_bucket); kfree_skb(skb); } TEST_F(homa_offload, homa_gro_receive__merge) { struct sk_buff *skb, *skb2; int same_flow; - homa_cores[cpu_number]->held_skb = self->skb2; - homa_cores[cpu_number]->held_bucket = 2; + cur_core->held_skb = self->skb2; + cur_core->held_bucket = 2; self->header.seg.offset = htonl(6000); self->header.common.sender_id = cpu_to_be64(1002); @@ -419,8 +419,8 @@ TEST_F(homa_offload, homa_gro_receive__max_gro_skbs) // First packet: fits below the limit. homa->max_gro_skbs = 3; - homa_cores[cpu_number]->held_skb = self->skb2; - homa_cores[cpu_number]->held_bucket = 2; + cur_core->held_skb = self->skb2; + cur_core->held_bucket = 2; self->header.seg.offset = htonl(6000); skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); homa_gro_receive(&self->napi.gro_hash[3].list, skb); @@ -444,7 +444,7 @@ TEST_F(homa_offload, homa_gro_receive__max_gro_skbs) // Third packet also hits the limit for skb, causing the bucket // to become empty. 
homa->max_gro_skbs = 2; - homa_cores[cpu_number]->held_skb = self->skb; + cur_core->held_skb = self->skb; skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); unit_log_clear(); EXPECT_EQ(EINPROGRESS, -PTR_ERR(homa_gro_receive( @@ -462,7 +462,7 @@ TEST_F(homa_offload, homa_gro_gen2) homa->gro_policy = HOMA_GRO_GEN2; mock_cycles = 1000; homa->busy_cycles = 100; - cpu_number = 5; + mock_set_core(5); atomic_set(&homa_cores[6]->softirq_backlog, 1); homa_cores[6]->last_gro = 0; atomic_set(&homa_cores[7]->softirq_backlog, 0); @@ -497,7 +497,7 @@ TEST_F(homa_offload, homa_gro_gen2) TEST_F(homa_offload, homa_gro_gen3__basics) { homa->gro_policy = HOMA_GRO_GEN3; - struct homa_core *core = homa_cores[cpu_number]; + struct homa_core *core = cur_core; core->gen3_softirq_cores[0] = 3; core->gen3_softirq_cores[1] = 7; core->gen3_softirq_cores[2] = 5; @@ -515,7 +515,7 @@ TEST_F(homa_offload, homa_gro_gen3__basics) TEST_F(homa_offload, homa_gro_gen3__stop_on_negative_core_id) { homa->gro_policy = HOMA_GRO_GEN3; - struct homa_core *core = homa_cores[cpu_number]; + struct homa_core *core = cur_core; core->gen3_softirq_cores[0] = 3; core->gen3_softirq_cores[1] = -1; core->gen3_softirq_cores[2] = 5; @@ -531,7 +531,7 @@ TEST_F(homa_offload, homa_gro_gen3__stop_on_negative_core_id) TEST_F(homa_offload, homa_gro_gen3__all_cores_busy_so_pick_first) { homa->gro_policy = HOMA_GRO_GEN3; - struct homa_core *core = homa_cores[cpu_number]; + struct homa_core *core = cur_core; core->gen3_softirq_cores[0] = 3; core->gen3_softirq_cores[1] = 7; core->gen3_softirq_cores[2] = 5; @@ -555,16 +555,16 @@ TEST_F(homa_offload, homa_gro_complete__GRO_IDLE) homa_cores[1]->last_active = 15; homa_cores[2]->last_active = 10; - cpu_number = 5; + mock_set_core(5); homa_gro_complete(self->skb, 0); EXPECT_EQ(1, self->skb->hash - 32); homa_cores[6]->last_active = 5; - cpu_number = 5; + mock_set_core(5); homa_gro_complete(self->skb, 0); EXPECT_EQ(6, self->skb->hash - 32); - cpu_number = 6; + mock_set_core(6); homa_gro_complete(self->skb, 0); EXPECT_EQ(2, self->skb->hash - 32); } diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 7b04e882..4ecd3053 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -559,7 +559,7 @@ TEST_F(homa_outgoing, __homa_xmit_control__ipv4_error) mock_ip_queue_xmit_errors = 1; EXPECT_EQ(ENETDOWN, -homa_xmit_control(GRANT, &h, sizeof(h), srpc)); EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.control_xmit_errors); + EXPECT_EQ(1, core_metrics.control_xmit_errors); } TEST_F(homa_outgoing, __homa_xmit_control__ipv6_error) { @@ -583,7 +583,7 @@ TEST_F(homa_outgoing, __homa_xmit_control__ipv6_error) mock_ip6_xmit_errors = 1; EXPECT_EQ(ENETDOWN, -homa_xmit_control(GRANT, &h, sizeof(h), srpc)); EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.control_xmit_errors); + EXPECT_EQ(1, core_metrics.control_xmit_errors); } TEST_F(homa_outgoing, homa_xmit_unknown) @@ -741,13 +741,13 @@ TEST_F(homa_outgoing, __homa_xmit_data__fill_dst) self->server_port, self->client_id, 1000, 1000); unit_log_clear(); dst = crpc->peer->dst; - old_refcount = dst->__refcnt.counter; + old_refcount = atomic_read(&dst->__rcuref.refcnt); skb_get(crpc->msgout.packets); __homa_xmit_data(crpc->msgout.packets, crpc, 6); EXPECT_STREQ("xmit DATA 1000@0", unit_log_get()); EXPECT_EQ(dst, skb_dst(crpc->msgout.packets)); - EXPECT_EQ(old_refcount+1, dst->__refcnt.counter); + EXPECT_EQ(old_refcount+1, atomic_read(&dst->__rcuref.refcnt)); } 
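[Note on the __rcuref reads above] They reflect the 6.3 conversion of struct dst_entry's reference count from a plain atomic_t (__refcnt) to an rcuref_t (__rcuref). rcuref_t wraps an atomic_t named refcnt, which is why the tests and mocks access it as shown:

	/* Before (up to 6.2): */
	old_refcount = dst->__refcnt.counter;
	/* After (6.3 and later), as used in these tests: */
	old_refcount = atomic_read(&dst->__rcuref.refcnt);

This is also why test/mock.c now supplies rcuref_get_slowpath(): the inline fast path of the real reference-taking helpers can fall through to it.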
TEST_F(homa_outgoing, __homa_xmit_data__ipv4_transmit_error) { @@ -763,7 +763,7 @@ TEST_F(homa_outgoing, __homa_xmit_data__ipv4_transmit_error) mock_ip_queue_xmit_errors = 1; skb_get(crpc->msgout.packets); __homa_xmit_data(crpc->msgout.packets, crpc, 5); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.data_xmit_errors); + EXPECT_EQ(1, core_metrics.data_xmit_errors); } TEST_F(homa_outgoing, __homa_xmit_data__ipv6_transmit_error) { @@ -779,7 +779,7 @@ TEST_F(homa_outgoing, __homa_xmit_data__ipv6_transmit_error) mock_ip6_xmit_errors = 1; skb_get(crpc->msgout.packets); __homa_xmit_data(crpc->msgout.packets, crpc, 5); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.data_xmit_errors); + EXPECT_EQ(1, core_metrics.data_xmit_errors); } TEST_F(homa_outgoing, homa_resend_data__basics) @@ -984,8 +984,8 @@ TEST_F(homa_outgoing, homa_check_nic_queue__pacer_metrics) EXPECT_EQ(1, homa_check_nic_queue(&self->homa, crpc->msgout.packets, true)); EXPECT_EQ(10500, atomic64_read(&self->homa.link_idle_time)); - EXPECT_EQ(500, homa_cores[cpu_number]->metrics.pacer_bytes); - EXPECT_EQ(200, homa_cores[cpu_number]->metrics.pacer_lost_cycles); + EXPECT_EQ(500, core_metrics.pacer_bytes); + EXPECT_EQ(200, core_metrics.pacer_lost_cycles); } TEST_F(homa_outgoing, homa_check_nic_queue__queue_empty) { @@ -1141,7 +1141,7 @@ TEST_F(homa_outgoing, homa_pacer_xmit__rpc_locked) mock_trylock_errors = ~1; homa_pacer_xmit(&self->homa); EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.pacer_skipped_rpcs); + EXPECT_EQ(1, core_metrics.pacer_skipped_rpcs); unit_log_clear(); mock_trylock_errors = 0; homa_pacer_xmit(&self->homa); @@ -1234,16 +1234,16 @@ TEST_F(homa_outgoing, homa_add_to_throttled__inc_metrics) self->server_port, self->client_id+4, 15000, 1000); homa_add_to_throttled(crpc1); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.throttle_list_adds); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.throttle_list_checks); + EXPECT_EQ(1, core_metrics.throttle_list_adds); + EXPECT_EQ(0, core_metrics.throttle_list_checks); homa_add_to_throttled(crpc2); - EXPECT_EQ(2, homa_cores[cpu_number]->metrics.throttle_list_adds); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.throttle_list_checks); + EXPECT_EQ(2, core_metrics.throttle_list_adds); + EXPECT_EQ(1, core_metrics.throttle_list_checks); homa_add_to_throttled(crpc3); - EXPECT_EQ(3, homa_cores[cpu_number]->metrics.throttle_list_adds); - EXPECT_EQ(3, homa_cores[cpu_number]->metrics.throttle_list_checks); + EXPECT_EQ(3, core_metrics.throttle_list_adds); + EXPECT_EQ(3, core_metrics.throttle_list_checks); } TEST_F(homa_outgoing, homa_remove_from_throttled) diff --git a/test/unit_homa_peertab.c b/test/unit_homa_peertab.c index 006312a9..d1645cab 100644 --- a/test/unit_homa_peertab.c +++ b/test/unit_homa_peertab.c @@ -72,7 +72,7 @@ TEST_F(homa_peertab, homa_peer_find__basics) peer2 = homa_peer_find(&self->peertab, ip2222, &self->hsk.inet); EXPECT_NE(peer, peer2); - EXPECT_EQ(2, homa_cores[cpu_number]->metrics.peer_new_entries); + EXPECT_EQ(2, core_metrics.peer_new_entries); } static struct _test_data_homa_peertab *test_data; @@ -191,7 +191,7 @@ TEST_F(homa_peertab, homa_peer_find__kmalloc_error) peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); EXPECT_EQ(ENOMEM, -PTR_ERR(peer)); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.peer_kmalloc_errors); + EXPECT_EQ(1, core_metrics.peer_kmalloc_errors); } TEST_F(homa_peertab, homa_peer_find__route_error) { @@ -201,7 +201,7 @@ TEST_F(homa_peertab, homa_peer_find__route_error) peer = 
homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(peer)); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.peer_route_errors); + EXPECT_EQ(1, core_metrics.peer_route_errors); } TEST_F(homa_peertab, homa_dst_refresh__basics) @@ -229,7 +229,7 @@ TEST_F(homa_peertab, homa_dst_refresh__routing_error) mock_route_errors = 1; homa_dst_refresh(&self->homa.peers, peer, &self->hsk); EXPECT_EQ(old_dst, peer->dst); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.peer_route_errors); + EXPECT_EQ(1, core_metrics.peer_route_errors); EXPECT_EQ(0, dead_count(&self->homa.peers)); } TEST_F(homa_peertab, homa_dst_refresh__malloc_error) @@ -324,15 +324,15 @@ TEST_F(homa_peertab, homa_peer_lock_slow) ASSERT_NE(NULL, peer); homa_peer_lock(peer); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.peer_ack_lock_misses); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.peer_ack_lock_miss_cycles); + EXPECT_EQ(0, core_metrics.peer_ack_lock_misses); + EXPECT_EQ(0, core_metrics.peer_ack_lock_miss_cycles); homa_peer_unlock(peer); mock_trylock_errors = 1; unit_hook_register(peer_spinlock_hook); homa_peer_lock(peer); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.peer_ack_lock_misses); - EXPECT_EQ(1000, homa_cores[cpu_number]->metrics.peer_ack_lock_miss_cycles); + EXPECT_EQ(1, core_metrics.peer_ack_lock_misses); + EXPECT_EQ(1000, core_metrics.peer_ack_lock_miss_cycles); homa_peer_unlock(peer); } diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index a671de47..556752f6 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -189,7 +189,7 @@ TEST_F(homa_plumbing, homa_ioc_abort__basics) UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 10000, 200); ASSERT_NE(NULL, crpc); - EXPECT_EQ(0, homa_ioc_abort(&self->hsk.inet.sk, (unsigned long) &args)); + EXPECT_EQ(0, homa_ioc_abort(&self->hsk.inet.sk, (int *) &args)); EXPECT_EQ(RPC_DEAD, crpc->state); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } @@ -197,8 +197,7 @@ TEST_F(homa_plumbing, homa_ioc_abort__cant_read_user_args) { struct homa_abort_args args = {self->client_id, 0}; mock_copy_data_errors = 1; - EXPECT_EQ(EFAULT, -homa_ioc_abort(&self->hsk.inet.sk, - (unsigned long) &args)); + EXPECT_EQ(EFAULT, -homa_ioc_abort(&self->hsk.inet.sk, (int *) &args)); } TEST_F(homa_plumbing, homa_ioc_abort__abort_multiple_rpcs) { @@ -211,7 +210,7 @@ TEST_F(homa_plumbing, homa_ioc_abort__abort_multiple_rpcs) self->server_port, self->client_id, 10000, 200); ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); - EXPECT_EQ(0, homa_ioc_abort(&self->hsk.inet.sk, (unsigned long) &args)); + EXPECT_EQ(0, homa_ioc_abort(&self->hsk.inet.sk, (int *) &args)); EXPECT_EQ(-ECANCELED, crpc1->error); EXPECT_EQ(-ECANCELED, crpc2->error); EXPECT_EQ(2, unit_list_length(&self->hsk.active_rpcs)); @@ -219,8 +218,7 @@ TEST_F(homa_plumbing, homa_ioc_abort__abort_multiple_rpcs) TEST_F(homa_plumbing, homa_ioc_abort__nonexistent_rpc) { struct homa_abort_args args = {99, 0}; - EXPECT_EQ(EINVAL, -homa_ioc_abort(&self->hsk.inet.sk, - (unsigned long) &args)); + EXPECT_EQ(EINVAL, -homa_ioc_abort(&self->hsk.inet.sk, (int *) &args)); } TEST_F(homa_plumbing, homa_set_sock_opt__bad_level) @@ -270,7 +268,7 @@ TEST_F(homa_plumbing, homa_set_sock_opt__success) sizeof(struct homa_set_buf_args))); EXPECT_EQ(args.start, self->hsk.buffer_pool.region); EXPECT_EQ(64, self->hsk.buffer_pool.num_bpages); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.so_set_buf_calls); + EXPECT_EQ(1, core_metrics.so_set_buf_calls); } 
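[Note on the (int *) casts in the homa_ioc_abort tests above] They track the 6.5 change to struct proto's ioctl hook, which now takes an int *karg rather than an unsigned long:

	/* 6.5+ hook signature in struct proto: */
	int (*ioctl)(struct sock *sk, int cmd, int *karg);

HOMAIOCABORT still interprets the argument as a user-space pointer to a struct homa_abort_args, which is why homa_ioc_abort() casts karg back to a user pointer for copy_from_user(). The same kernel release dropped .sendpage from struct proto and proto_ops (superseded by MSG_SPLICE_PAGES), hence the deletion of homa_sendpage and the sock_no_sendpage entries earlier in this patch.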
TEST_F(homa_plumbing, homa_sendmsg__args_not_in_user_space) @@ -688,7 +686,7 @@ TEST_F(homa_plumbing, homa_softirq__packet_too_short) skb->len -= 1; homa_softirq(skb); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.short_packets); + EXPECT_EQ(1, core_metrics.short_packets); } TEST_F(homa_plumbing, homa_softirq__bogus_packet_type) { @@ -697,7 +695,7 @@ TEST_F(homa_plumbing, homa_softirq__bogus_packet_type) skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); homa_softirq(skb); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.short_packets); + EXPECT_EQ(1, core_metrics.short_packets); } TEST_F(homa_plumbing, homa_softirq__process_short_messages_first) { diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index 628b1318..3970d5bd 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -61,7 +61,8 @@ static void change_owner_hook(char *id) return; if (!cur_pool) return; - cur_pool->descriptors[cur_pool->cores[cpu_number].page_hint].owner = -1; + cur_pool->descriptors[cur_pool->cores[raw_smp_processor_id()] + .page_hint].owner = -1; } TEST_F(homa_pool, homa_pool_set_bpages_needed) @@ -126,7 +127,7 @@ TEST_F(homa_pool, homa_pool_get_pages__basics) EXPECT_EQ(1, pages[1]); EXPECT_EQ(1, atomic_read(&pool->descriptors[1].refs)); EXPECT_EQ(-1, pool->descriptors[1].owner); - EXPECT_EQ(2, pool->cores[cpu_number].next_candidate); + EXPECT_EQ(2, pool->cores[raw_smp_processor_id()].next_candidate); EXPECT_EQ(98, atomic_read(&pool->free_bpages)); } TEST_F(homa_pool, homa_pool_get_pages__not_enough_space) @@ -143,7 +144,7 @@ TEST_F(homa_pool, homa_pool_get_pages__set_limit) struct homa_pool *pool = &self->hsk.buffer_pool; __u32 pages[10]; atomic_set(&pool->free_bpages, 62); - pool->cores[cpu_number].next_candidate = 49; + pool->cores[raw_smp_processor_id()].next_candidate = 49; EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); EXPECT_EQ(49, pages[0]); EXPECT_EQ(0, pages[1]); @@ -153,7 +154,7 @@ TEST_F(homa_pool, homa_pool_get_pages__set_limit_with_MIN_EXTRA) struct homa_pool *pool = &self->hsk.buffer_pool; __u32 pages[10]; atomic_set(&pool->free_bpages, 92); - pool->cores[cpu_number].next_candidate = 13; + pool->cores[raw_smp_processor_id()].next_candidate = 13; EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); EXPECT_EQ(13, pages[0]); EXPECT_EQ(0, pages[1]); @@ -234,9 +235,9 @@ TEST_F(homa_pool, homa_pool_allocate__basics) EXPECT_EQ(0, crpc->msgin.bpage_offsets[0]); EXPECT_EQ(-1, pool->descriptors[0].owner); EXPECT_EQ(2*HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[2]); - EXPECT_EQ(2, pool->cores[cpu_number].page_hint); + EXPECT_EQ(2, pool->cores[raw_smp_processor_id()].page_hint); EXPECT_EQ(150000 - 2*HOMA_BPAGE_SIZE, - pool->cores[cpu_number].allocated); + pool->cores[raw_smp_processor_id()].allocated); } TEST_F(homa_pool, homa_pool_no_buffer_pool) { @@ -278,7 +279,7 @@ TEST_F(homa_pool, homa_pool_allocate__no_partial_page) TEST_F(homa_pool, homa_pool_allocate__owned_page_locked_and_page_stolen) { struct homa_pool *pool = &self->hsk.buffer_pool; - pool->cores[cpu_number].next_candidate = 2; + pool->cores[raw_smp_processor_id()].next_candidate = 2; atomic_set(&pool->free_bpages, 40); struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, @@ -286,7 +287,7 @@ TEST_F(homa_pool, homa_pool_allocate__owned_page_locked_and_page_stolen) ASSERT_NE(NULL, crpc); // First allocation just sets up a partially-allocated 
bpage. - EXPECT_EQ(2, pool->cores[cpu_number].page_hint); + EXPECT_EQ(2, pool->cores[raw_smp_processor_id()].page_hint); // Try a second allocation; the lock hook steals the partial bpage, // so a new one has to be allocated. @@ -296,8 +297,8 @@ TEST_F(homa_pool, homa_pool_allocate__owned_page_locked_and_page_stolen) EXPECT_EQ(0, homa_pool_allocate(crpc)); EXPECT_EQ(1, crpc->msgin.num_bpages); EXPECT_EQ(3*HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[0]); - EXPECT_EQ(3, pool->cores[cpu_number].page_hint); - EXPECT_EQ(2000, pool->cores[cpu_number].allocated); + EXPECT_EQ(3, pool->cores[raw_smp_processor_id()].page_hint); + EXPECT_EQ(2000, pool->cores[raw_smp_processor_id()].allocated); EXPECT_EQ(1, -pool->descriptors[2].owner); EXPECT_EQ(1, pool->descriptors[3].owner); EXPECT_EQ(38, atomic_read(&pool->free_bpages)); @@ -305,40 +306,40 @@ TEST_F(homa_pool, homa_pool_allocate__owned_page_locked_and_page_stolen) TEST_F(homa_pool, homa_pool_allocate__page_wrap_around) { struct homa_pool *pool = &self->hsk.buffer_pool; - pool->cores[cpu_number].page_hint = 2; - pool->cores[cpu_number].allocated = HOMA_BPAGE_SIZE-1900; + pool->cores[raw_smp_processor_id()].page_hint = 2; + pool->cores[raw_smp_processor_id()].allocated = HOMA_BPAGE_SIZE-1900; atomic_set(&pool->descriptors[2].refs, 1); - pool->descriptors[2].owner = cpu_number; + pool->descriptors[2].owner = raw_smp_processor_id(); struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, 4000, 98, 1000, 2000); ASSERT_NE(NULL, crpc); - EXPECT_EQ(2, pool->cores[cpu_number].page_hint); + EXPECT_EQ(2, pool->cores[raw_smp_processor_id()].page_hint); EXPECT_EQ(1, crpc->msgin.num_bpages); EXPECT_EQ(2*HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[0]); - EXPECT_EQ(2000, pool->cores[cpu_number].allocated); - EXPECT_EQ(cpu_number, pool->descriptors[2].owner); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.bpage_reuses); + EXPECT_EQ(2000, pool->cores[raw_smp_processor_id()].allocated); + EXPECT_EQ(raw_smp_processor_id(), pool->descriptors[2].owner); + EXPECT_EQ(1, core_metrics.bpage_reuses); } TEST_F(homa_pool, homa_pool_allocate__owned_page_overflow) { struct homa_pool *pool = &self->hsk.buffer_pool; - pool->cores[cpu_number].next_candidate = 2; + pool->cores[raw_smp_processor_id()].next_candidate = 2; atomic_set(&pool->free_bpages, 50); struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, 4000, 98, 1000, 2000); ASSERT_NE(NULL, crpc); - EXPECT_EQ(2, pool->cores[cpu_number].page_hint); + EXPECT_EQ(2, pool->cores[raw_smp_processor_id()].page_hint); crpc->msgin.num_bpages = 0; - pool->cores[cpu_number].allocated = HOMA_BPAGE_SIZE-1900; + pool->cores[raw_smp_processor_id()].allocated = HOMA_BPAGE_SIZE-1900; EXPECT_EQ(0, homa_pool_allocate(crpc)); EXPECT_EQ(1, crpc->msgin.num_bpages); EXPECT_EQ(3*HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[0]); - EXPECT_EQ(3, pool->cores[cpu_number].page_hint); - EXPECT_EQ(2000, pool->cores[cpu_number].allocated); + EXPECT_EQ(3, pool->cores[raw_smp_processor_id()].page_hint); + EXPECT_EQ(2000, pool->cores[raw_smp_processor_id()].allocated); EXPECT_EQ(-1, pool->descriptors[2].owner); EXPECT_EQ(1, atomic_read(&pool->descriptors[2].refs)); EXPECT_EQ(1, pool->descriptors[3].owner); @@ -347,7 +348,7 @@ TEST_F(homa_pool, homa_pool_allocate__owned_page_overflow) TEST_F(homa_pool, homa_pool_allocate__reuse_owned_page) { struct homa_pool *pool = &self->hsk.buffer_pool; - pool->cores[cpu_number].next_candidate = 2; + 
pool->cores[raw_smp_processor_id()].next_candidate = 2; struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, 4000, 98, 1000, 2000); @@ -362,8 +363,8 @@ TEST_F(homa_pool, homa_pool_allocate__reuse_owned_page) EXPECT_EQ(1, crpc2->msgin.num_bpages); EXPECT_EQ(2*HOMA_BPAGE_SIZE + 2000, crpc2->msgin.bpage_offsets[0]); EXPECT_EQ(3, atomic_read(&pool->descriptors[2].refs)); - EXPECT_EQ(2, pool->cores[cpu_number].page_hint); - EXPECT_EQ(5000, pool->cores[cpu_number].allocated); + EXPECT_EQ(2, pool->cores[raw_smp_processor_id()].page_hint); + EXPECT_EQ(5000, pool->cores[raw_smp_processor_id()].allocated); } TEST_F(homa_pool, homa_pool_allocate__cant_allocate_partial_bpage) { @@ -404,7 +405,7 @@ TEST_F(homa_pool, homa_pool_allocate__out_of_space) rpc = list_next_entry(rpc, buf_links); EXPECT_EQ(100, rpc->id); EXPECT_TRUE(list_is_last(&rpc->buf_links, &self->hsk.waiting_for_bufs)); - EXPECT_EQ(3, homa_cores[cpu_number]->metrics.buffer_alloc_failures); + EXPECT_EQ(3, core_metrics.buffer_alloc_failures); EXPECT_EQ(1, pool->bpages_needed); } diff --git a/test/unit_homa_skb.c b/test/unit_homa_skb.c index 34259f00..6de0ef12 100644 --- a/test/unit_homa_skb.c +++ b/test/unit_homa_skb.c @@ -95,7 +95,7 @@ TEST_F(homa_skb, homa_skb_cleanup) core->skb_page = alloc_pages(GFP_KERNEL, 2); add_to_pool(&self->homa, 5, 2); add_to_pool(&self->homa, 4, 3); - cpu_number = 3; + mock_set_core(3); homa_skb_stash_pages(&self->homa, 2 * HOMA_SKB_PAGE_SIZE - 100); EXPECT_EQ(5, homa_cores[2]->numa->page_pool.avail); EXPECT_EQ(2, homa_cores[3]->numa->page_pool.avail); @@ -195,7 +195,7 @@ TEST_F(homa_skb, homa_skb_extend_frags__cant_merge_allocate_new_page) EXPECT_NE(NULL, p3); EXPECT_EQ(1000, length); EXPECT_EQ(2, skb_shinfo(self->skb)->nr_frags); - EXPECT_EQ(0, skb_shinfo(self->skb)->frags[1].bv_offset); + EXPECT_EQ(0, skb_shinfo(self->skb)->frags[1].offset); EXPECT_EQ(2000, self->skb->len); EXPECT_EQ(1000, core->page_inuse); @@ -223,7 +223,7 @@ TEST_F(homa_skb, homa_skb_extend_frags__cant_merge_use_same_page_reduce_length) EXPECT_EQ(p2 + 512, p3); EXPECT_EQ(512, length); EXPECT_EQ(2, skb_shinfo(self->skb)->nr_frags); - EXPECT_EQ(1536, skb_shinfo(self->skb)->frags[1].bv_offset); + EXPECT_EQ(1536, skb_shinfo(self->skb)->frags[1].offset); EXPECT_EQ(2048, core->page_inuse); kfree_skb(skb2); @@ -246,7 +246,7 @@ TEST_F(homa_skb, homa_skb_page_alloc__free_previous_page) } TEST_F(homa_skb, homa_skb_page_alloc__reuse_existing_page) { - struct homa_core *core = homa_cores[cpu_number]; + struct homa_core *core = cur_core; struct sk_buff *skb = homa_skb_new_tx(100); struct page *page; int length = 100; @@ -262,8 +262,8 @@ TEST_F(homa_skb, homa_skb_page_alloc__reuse_existing_page) } TEST_F(homa_skb, homa_skb_page_alloc__from_stash) { - struct homa_core *core = homa_cores[cpu_number]; - add_to_pool(&self->homa, 5, cpu_number); + struct homa_core *core = cur_core; + add_to_pool(&self->homa, 5, raw_smp_processor_id()); homa_skb_stash_pages(&self->homa, 3*HOMA_SKB_PAGE_SIZE - 100); EXPECT_TRUE(homa_skb_page_alloc(&self->homa, core)); EXPECT_NE(NULL, core->skb_page); @@ -273,8 +273,8 @@ TEST_F(homa_skb, homa_skb_page_alloc__from_stash) } TEST_F(homa_skb, homa_skb_page_alloc__from_pool) { - struct homa_core *core = homa_cores[cpu_number]; - add_to_pool(&self->homa, 5, cpu_number); + struct homa_core *core = cur_core; + add_to_pool(&self->homa, 5, raw_smp_processor_id()); EXPECT_EQ(5, core->numa->page_pool.avail); EXPECT_EQ(0, core->num_stashed_pages); 
EXPECT_TRUE(homa_skb_page_alloc(&self->homa, core)); @@ -283,8 +283,8 @@ TEST_F(homa_skb, homa_skb_page_alloc__from_pool) } TEST_F(homa_skb, homa_skb_page_alloc__pool_page_taken_while_locking) { - struct homa_core *core = homa_cores[cpu_number]; - add_to_pool(&self->homa, 1, cpu_number); + struct homa_core *core = cur_core; + add_to_pool(&self->homa, 1, raw_smp_processor_id()); EXPECT_EQ(1, core->numa->page_pool.avail); EXPECT_EQ(0, core->num_stashed_pages); hook_pool = &core->numa->page_pool; @@ -297,15 +297,15 @@ TEST_F(homa_skb, homa_skb_page_alloc__pool_page_taken_while_locking) } TEST_F(homa_skb, homa_skb_page_alloc__new_large_page) { - struct homa_core *core = homa_cores[cpu_number]; + struct homa_core *core = cur_core; mock_cycles = ~0; EXPECT_EQ(0, core->numa->page_pool.avail); EXPECT_EQ(0, core->num_stashed_pages); EXPECT_TRUE(homa_skb_page_alloc(&self->homa, core)); EXPECT_NE(NULL, core->skb_page); EXPECT_EQ(HOMA_SKB_PAGE_SIZE, core->page_size); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.skb_page_allocs); - EXPECT_NE(0, homa_cores[cpu_number]->metrics.skb_page_alloc_cycles); + EXPECT_EQ(1, core_metrics.skb_page_allocs); + EXPECT_NE(0, core_metrics.skb_page_alloc_cycles); } TEST_F(homa_skb, homa_skb_page_alloc__high_order_page_not_available) { @@ -317,8 +317,8 @@ TEST_F(homa_skb, homa_skb_page_alloc__high_order_page_not_available) EXPECT_NE(NULL, core->skb_page); EXPECT_EQ(PAGE_SIZE, core->page_size); EXPECT_EQ(0, core->page_inuse); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.skb_page_allocs); - EXPECT_NE(0, homa_cores[cpu_number]->metrics.skb_page_alloc_cycles); + EXPECT_EQ(1, core_metrics.skb_page_allocs); + EXPECT_NE(0, core_metrics.skb_page_alloc_cycles); } TEST_F(homa_skb, homa_skb_page_alloc__no_pages_available) { @@ -342,15 +342,15 @@ TEST_F(homa_skb, homa_skb_append_to_frag__basics) "0123456789ABCDEFGHIJ", 21)); EXPECT_EQ(2, shinfo->nr_frags); - EXPECT_EQ(10, shinfo->frags[0].bv_len); - char *p = ((char *) page_address(shinfo->frags[0].bv_page)) - + shinfo->frags[0].bv_offset; - p[shinfo->frags[0].bv_len] = 0; + EXPECT_EQ(10, skb_frag_size(&shinfo->frags[0])); + char *p = ((char *) page_address(skb_frag_page(&shinfo->frags[0]))) + + shinfo->frags[0].offset; + p[skb_frag_size(&shinfo->frags[0])] = 0; EXPECT_STREQ("abcd012345", p); - EXPECT_EQ(15, shinfo->frags[1].bv_len); - p = ((char *) page_address(shinfo->frags[1].bv_page)) - + shinfo->frags[1].bv_offset; + EXPECT_EQ(15, skb_frag_size(&shinfo->frags[1])); + p = ((char *) page_address(skb_frag_page(&shinfo->frags[1]))) + + shinfo->frags[1].offset; EXPECT_STREQ("6789ABCDEFGHIJ", p); } TEST_F(homa_skb, homa_skb_append_to_frag__no_memory) @@ -383,8 +383,8 @@ TEST_F(homa_skb, homa_skb_append_from_iter__basics) unit_log_get()); EXPECT_EQ(2, shinfo->nr_frags); - EXPECT_EQ(4096, shinfo->frags[0].bv_len); - EXPECT_EQ(904, shinfo->frags[1].bv_len); + EXPECT_EQ(4096, skb_frag_size(&shinfo->frags[0])); + EXPECT_EQ(904, skb_frag_size(&shinfo->frags[1])); } TEST_F(homa_skb, homa_skb_append_from_iter__no_memory) { @@ -513,7 +513,7 @@ TEST_F(homa_skb, homa_skb_free_many_tx__skb_ref_count_not_one) length = HOMA_SKB_PAGE_SIZE; homa_skb_extend_frags(&self->homa, skb, &length); EXPECT_EQ(HOMA_SKB_PAGE_SIZE, length); - page = skb_shinfo(skb)->frags[0].bv_page; + page = skb_frag_page(&skb_shinfo(skb)->frags[0]); EXPECT_EQ(2, page_ref_count(page)); skb_get(skb); EXPECT_EQ(2, refcount_read(&skb->users)); @@ -535,7 +535,7 @@ TEST_F(homa_skb, homa_skb_free_many_tx__check_page_order) homa_skb_extend_frags(&self->homa, skb, &length); } 
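/* A minimal sketch of the fragment walk that the assertions in these
 * skb tests rely on, using only the standard accessors
 * skb_frag_page(), skb_frag_size(), and skb_frag_off() in place of
 * the old bv_page/bv_len/bv_offset fields; frag_walk_sketch() is
 * illustrative and not part of this repo:
 */
static void frag_walk_sketch(struct sk_buff *skb)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;

	for (i = 0; i < shinfo->nr_frags; i++) {
		skb_frag_t *frag = &shinfo->frags[i];
		char *data = (char *) page_address(skb_frag_page(frag))
				+ skb_frag_off(frag);

		pr_info("frag %d: %u bytes at %p\n", i,
			skb_frag_size(frag), data);
	}
}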
EXPECT_EQ(HOMA_SKB_PAGE_SIZE, length); - struct page *page = skb_shinfo(skb)->frags[2].bv_page; + struct page *page = skb_frag_page(&skb_shinfo(skb)->frags[2]); mock_compound_order_mask = 3; homa_skb_free_many_tx(&self->homa, &skb, 1); diff --git a/test/unit_homa_socktab.c b/test/unit_homa_socktab.c index ef7f018f..646182a7 100644 --- a/test/unit_homa_socktab.c +++ b/test/unit_homa_socktab.c @@ -294,13 +294,13 @@ TEST_F(homa_socktab, homa_sock_lock_slow) mock_cycles = ~0; homa_sock_lock(&self->hsk, "unit test"); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.socket_lock_misses); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.socket_lock_miss_cycles); + EXPECT_EQ(0, core_metrics.socket_lock_misses); + EXPECT_EQ(0, core_metrics.socket_lock_miss_cycles); homa_sock_unlock(&self->hsk); mock_trylock_errors = 1; homa_sock_lock(&self->hsk, "unit test"); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.socket_lock_misses); - EXPECT_NE(0, homa_cores[cpu_number]->metrics.socket_lock_miss_cycles); + EXPECT_EQ(1, core_metrics.socket_lock_misses); + EXPECT_NE(0, core_metrics.socket_lock_miss_cycles); homa_sock_unlock(&self->hsk); } \ No newline at end of file diff --git a/test/unit_homa_timer.c b/test/unit_homa_timer.c index 28c00155..cd5193e9 100644 --- a/test/unit_homa_timer.c +++ b/test/unit_homa_timer.c @@ -136,11 +136,11 @@ TEST_F(homa_timer, homa_check_rpc__timeout) unit_log_clear(); crpc->silent_ticks = self->homa.timeout_ticks-1; homa_check_rpc(crpc); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.rpc_timeouts); + EXPECT_EQ(0, core_metrics.rpc_timeouts); EXPECT_EQ(0, crpc->error); crpc->silent_ticks = self->homa.timeout_ticks; homa_check_rpc(crpc); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.rpc_timeouts); + EXPECT_EQ(1, core_metrics.rpc_timeouts); EXPECT_EQ(ETIMEDOUT, -crpc->error); } TEST_F(homa_timer, homa_check_rpc__issue_resend) @@ -250,7 +250,7 @@ TEST_F(homa_timer, homa_timer__basics) unit_log_clear(); crpc->peer->outstanding_resends = self->homa.timeout_resends; homa_timer(&self->homa); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.rpc_timeouts); + EXPECT_EQ(1, core_metrics.rpc_timeouts); EXPECT_EQ(ETIMEDOUT, -crpc->error); } TEST_F(homa_timer, homa_timer__reap_dead_rpcs) diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index b5297ee9..fcc82212 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -280,18 +280,18 @@ TEST_F(homa_utils, homa_bucket_lock_slow) ASSERT_FALSE(IS_ERR(srpc)); homa_rpc_unlock(srpc); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.client_lock_misses); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.client_lock_miss_cycles); + EXPECT_EQ(0, core_metrics.client_lock_misses); + EXPECT_EQ(0, core_metrics.client_lock_miss_cycles); homa_bucket_lock_slow(crpc->bucket, crpc->id); homa_rpc_unlock(crpc); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.client_lock_misses); - EXPECT_NE(0, homa_cores[cpu_number]->metrics.client_lock_miss_cycles); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.server_lock_misses); - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.server_lock_miss_cycles); + EXPECT_EQ(1, core_metrics.client_lock_misses); + EXPECT_NE(0, core_metrics.client_lock_miss_cycles); + EXPECT_EQ(0, core_metrics.server_lock_misses); + EXPECT_EQ(0, core_metrics.server_lock_miss_cycles); homa_bucket_lock_slow(srpc->bucket, srpc->id); homa_rpc_unlock(srpc); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.server_lock_misses); - EXPECT_NE(0, homa_cores[cpu_number]->metrics.server_lock_miss_cycles); + EXPECT_EQ(1, core_metrics.server_lock_misses); + 
EXPECT_NE(0, core_metrics.server_lock_miss_cycles); } TEST_F(homa_utils, homa_rpc_acked__basics) diff --git a/test/utils.h b/test/utils.h index 8e23de70..f782266f 100644 --- a/test/utils.h +++ b/test/utils.h @@ -30,6 +30,10 @@ enum unit_rpc_state { UNIT_IN_SERVICE = 24, }; +#define core_metrics homa_cores[raw_smp_processor_id()]->metrics + +#define cur_core homa_cores[raw_smp_processor_id()] + extern char *unit_ack_string(struct homa_ack *ack); extern struct homa_rpc *unit_client_rpc(struct homa_sock *hsk, diff --git a/timetrace.c b/timetrace.c index 737fd0c7..e429786c 100644 --- a/timetrace.c +++ b/timetrace.c @@ -9,18 +9,19 @@ * timetrace stubs; we will then connect the timetrace mechanism here with * those stubs to allow the rest of the kernel to log in our buffers. */ -#define TT_KERNEL 1 +//#define TT_KERNEL 1 #endif #ifdef TT_KERNEL -extern int tt_linux_buffer_mask; extern struct tt_buffer *tt_linux_buffers[]; extern void (*tt_linux_freeze)(void); extern atomic_t *tt_linux_freeze_count; extern atomic_t tt_linux_freeze_no_homa; extern int *tt_linux_homa_temp; extern int tt_linux_homa_temp_default[]; -extern void tt_inc_metric(int metric, __u64 count); extern void (*tt_linux_inc_metrics)(int metric, __u64 count); +extern void (*tt_linux_record)(struct tt_buffer *buffer, __u64 timestamp, + const char* format, __u32 arg0, __u32 arg1, __u32 arg2, + __u32 arg3); extern void tt_linux_skip_metrics(int metric, __u64 count); extern void (*tt_linux_printk)(void); extern void (*tt_linux_dbg1)(char *msg, ...); @@ -28,7 +29,12 @@ extern void (*tt_linux_dbg2)(char *msg, ...); extern void (*tt_linux_dbg3)(char *msg, ...); extern void tt_linux_nop(void); extern void homa_trace(__u64 u0, __u64 u1, int i0, int i1); + +extern void ltt_record_nop(struct tt_buffer *buffer, __u64 timestamp, + const char *format, __u32 arg0, __u32 arg1, + __u32 arg2, __u32 arg3); #endif +extern void tt_inc_metric(int metric, __u64 count); /* Separate buffers for each core: this eliminates the need for * synchronization in tt_record, which improves performance significantly. @@ -135,7 +141,7 @@ int tt_init(char *proc_file, int *temp) for (i = 0; i < nr_cpu_ids; i++) { tt_linux_buffers[i] = tt_buffers[i]; } - tt_linux_buffer_mask = TT_BUF_SIZE-1; + tt_linux_record = tt_record_buf; tt_linux_freeze = tt_freeze; tt_linux_freeze_count = &tt_freeze_count; tt_linux_inc_metrics = tt_inc_metric; @@ -177,6 +183,7 @@ void tt_destroy(void) tt_freeze_count.counter = 1; #ifdef TT_KERNEL + tt_linux_record = ltt_record_nop; tt_linux_freeze = tt_linux_nop; tt_linux_freeze_count = &tt_linux_freeze_no_homa; for (i = 0; i < nr_cpu_ids; i++) { From 90fa29c9ae25faf750454b77ea2c9ac39e0f1f1f Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 29 Aug 2024 09:40:57 -0700 Subject: [PATCH 002/625] Updates to README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 913fd2bc..a417d495 100644 --- a/README.md +++ b/README.md @@ -125,6 +125,7 @@ This repo contains an implementation of the Homa transport protocol as a Linux k sysctl mechanism. For details, see the man page `homa.7`. ## Significant recent improvements +- August 2024: upgraded to Linux 6.10.6. - July 2024: introduced "TCP hijacking", where Homa packets are sent as legitimate TCP segments (using TCP as the IP protocol) and then reclaimed from TCP on the destination. 
This allows Homa to make better use of @@ -132,6 +133,7 @@ This repo contains an implementation of the Homa transport protocol as a Linux k - June 2024: refactored sk_buff management to use frags; improves efficiency significantly. - April 2024: replaced `master` branch with `main` +- July 2023: upgraded to Linux 6.1.38. - December 2022: Version 2.0. This includes a new mechanism for managing buffer space for incoming messages, which improves throughput by 50-100% in many situations. In addition, Homa now uses the sendmsg From df0daa5a9b4f26fd6dfd8e3630906cac01fc4324 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 29 Aug 2024 09:27:00 -0700 Subject: [PATCH 003/625] Upgrade to run under Linux 6.10.6 * Renamed install script to install_homa to avoid conflicts with the Linux install program * Reworked the timetracing mechanism used by the main Linux kernel to avoid header file conflicts --- README.md | 10 +- cloudlab/bin/{install => install_homa} | 6 +- homa_grant.c | 4 +- homa_impl.h | 85 +++++++++------ homa_incoming.c | 5 +- homa_offload.c | 7 +- homa_plumbing.c | 31 +----- homa_pool.c | 2 +- homa_skb.c | 45 ++++---- test/mock.c | 145 +++++++++++++++---------- test/mock.h | 10 +- test/unit_homa_grant.c | 22 ++-- test/unit_homa_incoming.c | 49 ++++----- test/unit_homa_offload.c | 82 +++++++------- test/unit_homa_outgoing.c | 30 ++--- test/unit_homa_peertab.c | 16 +-- test/unit_homa_plumbing.c | 16 ++- test/unit_homa_pool.c | 53 ++++----- test/unit_homa_skb.c | 52 ++++----- test/unit_homa_socktab.c | 8 +- test/unit_homa_timer.c | 6 +- test/unit_homa_utils.c | 16 +-- test/utils.h | 4 + timetrace.c | 15 ++- 24 files changed, 381 insertions(+), 338 deletions(-) rename cloudlab/bin/{install => install_homa} (91%) diff --git a/README.md b/README.md index 0e1a6f95..913fd2bc 100644 --- a/README.md +++ b/README.md @@ -25,10 +25,10 @@ This repo contains an implementation of the Homa transport protocol as a Linux k - Please contact me if you have any problems using this repo; I'm happy to provide advice and support. -- The head is known to work under Linux 6.1.38. In the past, Homa has - run under several earlier versions of Linux, including 5.17.7, - 5.4.80, and 4.15.18. There is a separate branch for each of these - older versions, with a names such as linux_4.15.18. Older branches are +- The head is known to work under Linux 6.10.6. In the past, Homa has + run under several earlier versions of Linux. There is a separate branch + for each of these + older versions, with names such as linux_4.15.18. Older branches are out of date feature-wise: recent commits have not been back-ported to them. Other versions of Linux have not been tested and may require code changes (these upgrades rarely take more than a couple @@ -47,7 +47,7 @@ This repo contains an implementation of the Homa transport protocol as a Linux k invoke it with no parameters to install and configure Homa on the current machine. -- The script `cloudlab/bin/install` will copy relevant Homa files +- The script `cloudlab/bin/install_homa` will copy relevant Homa files across a cluster of machines and configure Homa on each node. 
It assumes that nodes have names `nodeN` where N is a small integer, and it also assumes that you have already run `make` both in the top-level directory and diff --git a/cloudlab/bin/install b/cloudlab/bin/install_homa similarity index 91% rename from cloudlab/bin/install rename to cloudlab/bin/install_homa index 120b6948..eb89935e 100755 --- a/cloudlab/bin/install +++ b/cloudlab/bin/install_homa @@ -7,7 +7,7 @@ # or more target machines; it also loads the Homa kernel module. # # Usage: -# install num_nodes [first] +# install_homa num_nodes [first] # # The "num_nodes" argument indicates how many servers should be updated. # The "first" argument is optional; it is an integer identifying the @@ -24,7 +24,7 @@ if [ $# -eq 2 ]; then elif [ $# -eq 1 ]; then first=0 else - echo "Usage: install num_nodes [first]" + echo "Usage: install_homa num_nodes [first]" exit 1 fi last=`expr $first + $1 - 1` || true @@ -32,7 +32,7 @@ for ((i = $first ; i <= $last; i++)); do node=node$i echo - echo '*** Installing on' $node '***' + echo '*** Installing Homa on' $node '***' rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" -rtv ~/.bashrc ~/.bash_profile ~/.gdbinit $node: rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" -rtv --exclude __pycache__ ~/bin/ $node:bin/ rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" -rtv $root/homa.ko $root/util/cp_node $root/util/homa_prio $root/util/*.py $node:bin/ diff --git a/homa_grant.c b/homa_grant.c index 27df0b31..bf542e2c 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -15,7 +15,7 @@ * @rpc1: First RPC to consider. * @rpc2: Second RPC to consider. */ -int inline homa_grant_outranks(struct homa_rpc *rpc1, struct homa_rpc *rpc2) +inline int homa_grant_outranks(struct homa_rpc *rpc1, struct homa_rpc *rpc2) { /* Fewest bytes remaining is the primary criterion; if those are * equal, then favor the older RPC. @@ -39,7 +39,7 @@ int inline homa_grant_outranks(struct homa_rpc *rpc1, struct homa_rpc *rpc2) * may be possible to send out additional grants to some RPCs (doing * this is left to the caller). 
*/ -int inline homa_grant_update_incoming(struct homa_rpc *rpc, struct homa *homa) { +inline int homa_grant_update_incoming(struct homa_rpc *rpc, struct homa *homa) { int incoming = rpc->msgin.granted - (rpc->msgin.length - rpc->msgin.bytes_remaining); if (incoming < 0) diff --git a/homa_impl.h b/homa_impl.h index 9fd46862..a0e68e0a 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -38,13 +38,15 @@ #include #include #include -#include #include +#include +#include #include #include #include #include #include +#include #pragma GCC diagnostic warning "-Wpointer-sign" #pragma GCC diagnostic warning "-Wunused-variable" @@ -53,53 +55,68 @@ typedef unsigned int __poll_t; #endif #ifdef __UNIT_TEST__ -#define spin_unlock mock_spin_unlock -extern void mock_spin_unlock(spinlock_t *lock); - -#undef get_cycles -#define get_cycles mock_get_cycles -extern cycles_t mock_get_cycles(void); +#undef alloc_pages +#define alloc_pages mock_alloc_pages +extern struct page *mock_alloc_pages(gfp_t gfp, unsigned order); -#define signal_pending(xxx) mock_signal_pending -extern int mock_signal_pending; - -#define rcu_read_lock mock_rcu_read_lock -extern void mock_rcu_read_lock(void); +#define compound_order mock_compound_order +extern unsigned int mock_compound_order(struct page *page); -#define rcu_read_unlock mock_rcu_read_unlock -extern void mock_rcu_read_unlock(void); +#define cpu_to_node mock_cpu_to_node +extern int mock_cpu_to_node(int cpu); #undef current #define current current_task +extern struct task_struct *current_task; -#define kthread_complete_and_exit(comp, code) +#undef get_cycles +#define get_cycles mock_get_cycles +extern cycles_t mock_get_cycles(void); +#define get_page mock_get_page + extern void mock_get_page(struct page *page); + +#undef kmalloc #define kmalloc mock_kmalloc extern void *mock_kmalloc(size_t size, gfp_t flags); -#define get_page mock_get_page -extern void mock_get_page(struct page *page); +#define kthread_complete_and_exit(comp, code) -#define put_page mock_put_page -extern void mock_put_page(struct page *page); +#ifdef page_address +#undef page_address +#endif +#define page_address(page) ((void *) page) -#define compound_order mock_compound_order -extern unsigned int mock_compound_order(struct page *page); +#define page_ref_count mock_page_refs +extern int mock_page_refs(struct page *page); #define page_to_nid mock_page_to_nid extern int mock_page_to_nid(struct page *page); -#define page_ref_count mock_page_refs -extern int mock_page_refs(struct page *page); +#define put_page mock_put_page +extern void mock_put_page(struct page *page); -#define cpu_to_node mock_cpu_to_node -extern int mock_cpu_to_node(int cpu); +#define rcu_read_lock mock_rcu_read_lock +extern void mock_rcu_read_lock(void); -#ifdef page_address -#undef page_address -#endif -#define page_address(page) ((void *) page) -#endif +#define rcu_read_unlock mock_rcu_read_unlock +extern void mock_rcu_read_unlock(void); + +#undef register_net_sysctl +#define register_net_sysctl mock_register_net_sysctl +extern struct ctl_table_header *mock_register_net_sysctl(struct net *net, + const char *path, struct ctl_table *table); + +#define signal_pending(xxx) mock_signal_pending +extern int mock_signal_pending; + +#define spin_unlock mock_spin_unlock +extern void mock_spin_unlock(spinlock_t *lock); + +#undef vmalloc +#define vmalloc mock_vmalloc +extern void *mock_vmalloc(size_t size); +#endif /* __UNIT_TEST__ */ /* Null out things that confuse VSCode Intellisense */ #ifdef __VSCODE__ @@ -879,7 +896,7 @@ struct homa_interest { * of a 
struct homa_interest. * @interest: Struct to initialize. */ -static void inline homa_interest_init(struct homa_interest *interest) +inline static void homa_interest_init(struct homa_interest *interest) { interest->thread = current; atomic_long_set(&interest->ready_rpc, 0); @@ -3682,8 +3699,8 @@ extern enum hrtimer_restart homa_hrtimer(struct hrtimer *timer); extern int homa_init(struct homa *homa); extern void homa_incoming_sysctl_changed(struct homa *homa); -extern int homa_ioc_abort(struct sock *sk, unsigned long arg); -extern int homa_ioctl(struct sock *sk, int cmd, unsigned long arg); +extern int homa_ioc_abort(struct sock *sk, int *karg); +extern int homa_ioctl(struct sock *sk, int cmd, int *karg); extern void homa_log_throttled(struct homa *homa); extern int homa_message_in_init(struct homa_rpc *rpc, int length, int unsched); @@ -3777,8 +3794,6 @@ extern struct homa_rpc extern int homa_rpc_reap(struct homa_sock *hsk, int count); extern void homa_send_ipis(void); extern int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); -extern int homa_sendpage(struct sock *sk, struct page *page, int offset, - size_t size, int flags); extern int homa_setsockopt(struct sock *sk, int level, int optname, sockptr_t __user optval, unsigned int optlen); extern int homa_shutdown(struct socket *sock, int how); diff --git a/homa_incoming.c b/homa_incoming.c index d5dbb3e7..81c9ddec 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -256,7 +256,6 @@ int homa_copy_to_user(struct homa_rpc *rpc) int pkt_length = homa_data_len(skbs[i]); int copied = 0; char *dst; - struct iovec iov; struct iov_iter iter; int buf_bytes, chunk_size; @@ -276,8 +275,8 @@ int homa_copy_to_user(struct homa_rpc *rpc) } chunk_size = buf_bytes; } - error = import_single_range(READ, dst, - chunk_size, &iov, &iter); + error = import_ubuf(READ, dst, chunk_size, + &iter); if (error) goto free_skbs; error = skb_copy_datagram_iter(skbs[i], diff --git a/homa_offload.c b/homa_offload.c index 502acde9..a6e34053 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -140,13 +140,14 @@ static inline void homa_set_softirq_cpu(struct sk_buff *skb, int cpu) struct rps_sock_flow_table *sock_flow_table; int hash; - sock_flow_table = rcu_dereference(rps_sock_flow_table); + sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); if (sock_flow_table == NULL) return; - hash = cpu + rps_cpu_mask + 1; + hash = cpu + net_hotdata.rps_cpu_mask + 1; if (sock_flow_table->ents[hash] != hash) { rcu_read_lock(); - sock_flow_table = rcu_dereference(rps_sock_flow_table); + sock_flow_table = rcu_dereference( + net_hotdata.rps_sock_flow_table); sock_flow_table->ents[hash] = hash; rcu_read_unlock(); } diff --git a/homa_plumbing.c b/homa_plumbing.c index 533d2d48..ba820ffa 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -63,7 +63,6 @@ const struct proto_ops homa_proto_ops = { .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, .set_peek_off = sk_set_peek_off, }; @@ -85,7 +84,6 @@ const struct proto_ops homav6_proto_ops = { .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, .set_peek_off = sk_set_peek_off, }; @@ -108,7 +106,6 @@ struct proto homa_prot = { .getsockopt = homa_getsockopt, .sendmsg = homa_sendmsg, .recvmsg = homa_recvmsg, - .sendpage = homa_sendpage, .backlog_rcv = homa_backlog_rcv, .release_cb = ip4_datagram_release_cb, .hash = homa_hash, @@ -136,7 +133,6 @@ struct proto homav6_prot = { .getsockopt = homa_getsockopt, .sendmsg 
= homa_sendmsg, .recvmsg = homa_recvmsg, - .sendpage = homa_sendpage, .backlog_rcv = homa_backlog_rcv, .release_cb = ip6_datagram_release_cb, .hash = homa_hash, @@ -744,17 +740,17 @@ int homa_disconnect(struct sock *sk, int flags) { * homa_ioc_abort() - The top-level function for the ioctl that implements * the homa_abort user-level API. * @sk: Socket for this request. - * @arg: Used to pass information from user space. + * @karg: Used to pass information from user space. * * Return: 0 on success, otherwise a negative errno. */ -int homa_ioc_abort(struct sock *sk, unsigned long arg) { +int homa_ioc_abort(struct sock *sk, int *karg) { int ret = 0; struct homa_sock *hsk = homa_sk(sk); struct homa_abort_args args; struct homa_rpc *rpc; - if (unlikely(copy_from_user(&args, (void *) arg, sizeof(args)))) + if (unlikely(copy_from_user(&args, (void *) karg, sizeof(args)))) return -EFAULT; if (args._pad1 || args._pad2[0] || args._pad2[1]) { @@ -781,18 +777,18 @@ int homa_ioc_abort(struct sock *sk, unsigned long arg) { * homa_ioctl() - Implements the ioctl system call for Homa sockets. * @sk: Socket on which the system call was invoked. * @cmd: Identifier for a particular ioctl operation. - * @arg: Operation-specific argument; typically the address of a block + * @karg: Operation-specific argument; typically the address of a block * of data in user address space. * * Return: 0 on success, otherwise a negative errno. */ -int homa_ioctl(struct sock *sk, int cmd, unsigned long arg) { +int homa_ioctl(struct sock *sk, int cmd, int *karg) { int result; __u64 start = get_cycles(); switch (cmd) { case HOMAIOCABORT: - result = homa_ioc_abort(sk, arg); + result = homa_ioc_abort(sk, karg); INC_METRIC(abort_calls, 1); INC_METRIC(abort_cycles, get_cycles() - start); break; @@ -1157,21 +1153,6 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, return result; } -/** - * homa_sendpage() - ??. - * @sk: Socket for the operation - * @page: ?? - * @offset: ?? - * @size: ?? - * @flags: ?? - * Return: 0 on success, otherwise a negative errno. - */ -int homa_sendpage(struct sock *sk, struct page *page, int offset, - size_t size, int flags) { - printk(KERN_WARNING "unimplemented sendpage invoked on Homa socket\n"); - return -ENOSYS; -} - /** * homa_hash() - ??. * @sk: Socket for the operation diff --git a/homa_pool.c b/homa_pool.c index 9c7ce739..a8dec585 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -29,7 +29,7 @@ * The caller must own the lock for @pool->hsk. * @pool: Pool to update. */ -static void inline set_bpages_needed(struct homa_pool *pool) { +inline static void set_bpages_needed(struct homa_pool *pool) { struct homa_rpc *rpc = list_first_entry(&pool->hsk->waiting_for_bufs, struct homa_rpc, buf_links); pool->bpages_needed = (rpc->msgin.length + HOMA_BPAGE_SIZE - 1) diff --git a/homa_skb.c b/homa_skb.c index 455b2939..e49455f1 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -13,6 +13,11 @@ extern int mock_max_skb_frags; #define HOMA_MAX_SKB_FRAGS MAX_SKB_FRAGS #endif +static inline void frag_page_set(skb_frag_t *frag, struct page *page) +{ + frag->netmem = page_to_netmem(page); +} + /** * homa_skb_page_pool_init() - Invoked when a struct homa is created to * initialize a page pool. @@ -144,15 +149,15 @@ void *homa_skb_extend_frags(struct homa *homa, struct sk_buff *skb, int *length) int actual_size = *length; /* Can we just extend the skb's last fragment? 
*/ - if ((shinfo->nr_frags > 0) && (frag->bv_page == core->skb_page) + if ((shinfo->nr_frags > 0) && (skb_frag_page(frag) == core->skb_page) && (core->page_inuse < core->page_size) - && ((frag->bv_offset + frag->bv_len) + && ((frag->offset + skb_frag_size(frag)) == core->page_inuse)) { if ((core->page_size - core->page_inuse) < actual_size) actual_size = core->page_size - core->page_inuse; *length = actual_size; - frag->bv_len += actual_size; - result = page_address(frag->bv_page) + core->page_inuse; + skb_frag_size_add(frag, actual_size); + result = page_address(skb_frag_page(frag)) + core->page_inuse; core->page_inuse += actual_size; skb_len_add(skb, actual_size); return result; @@ -168,12 +173,12 @@ void *homa_skb_extend_frags(struct homa *homa, struct sk_buff *skb, int *length) actual_size = core->page_size - core->page_inuse; frag = &shinfo->frags[shinfo->nr_frags]; shinfo->nr_frags++; - frag->bv_page = core->skb_page; + frag_page_set(frag, core->skb_page); get_page(core->skb_page); - frag->bv_offset = core->page_inuse; + frag->offset = core->page_inuse; *length = actual_size; - frag->bv_len = actual_size; - result = page_address(frag->bv_page) + core->page_inuse; + skb_frag_size_set(frag, actual_size); + result = page_address(skb_frag_page(frag)) + core->page_inuse; core->page_inuse += actual_size; skb_len_add(skb, actual_size); return result; @@ -350,22 +355,23 @@ int homa_skb_append_from_skb(struct homa *homa, struct sk_buff *dst_skb, src_frag_offset = head_len; for (src_frags_left = src_shinfo->nr_frags, src_frag = &src_shinfo->frags[0]; (src_frags_left > 0) && (length > 0); - src_frags_left--, src_frag_offset += src_frag->bv_len, src_frag++) + src_frags_left--, src_frag_offset += skb_frag_size(src_frag), + src_frag++) { - if (offset >= (src_frag_offset + src_frag->bv_len)) + if (offset >= (src_frag_offset + skb_frag_size(src_frag))) continue; - chunk_size = src_frag->bv_len - (offset - src_frag_offset); + chunk_size = skb_frag_size(src_frag) - (offset - src_frag_offset); if (chunk_size > length) chunk_size = length; if (dst_shinfo->nr_frags == HOMA_MAX_SKB_FRAGS) return -EINVAL; dst_frag = &dst_shinfo->frags[dst_shinfo->nr_frags]; dst_shinfo->nr_frags++; - dst_frag->bv_page = src_frag->bv_page; - get_page(src_frag->bv_page); - dst_frag->bv_offset = src_frag->bv_offset + frag_page_set(dst_frag, skb_frag_page(src_frag)); + get_page(skb_frag_page(src_frag)); + dst_frag->offset = src_frag->offset + (offset - src_frag_offset); - dst_frag->bv_len = chunk_size; + skb_frag_size_set(dst_frag, chunk_size); offset += chunk_size; length -= chunk_size; skb_len_add(dst_skb, chunk_size); @@ -498,13 +504,14 @@ void homa_skb_get(struct sk_buff *skb, void *dest, int offset, int length) frag_offset = head_len; for (frags_left = shinfo->nr_frags, frag = &shinfo->frags[0]; (frags_left > 0) && (length > 0); - frags_left--, frag_offset += frag->bv_len, frag++) { - if (offset >= (frag_offset + frag->bv_len)) + frags_left--, + frag_offset += skb_frag_size(frag), frag++) { + if (offset >= (frag_offset + skb_frag_size(frag))) continue; - chunk_size = frag->bv_len - (offset - frag_offset); + chunk_size = skb_frag_size(frag) - (offset - frag_offset); if (chunk_size > length) chunk_size = length; - memcpy(dst, page_address(frag->bv_page) + frag->bv_offset + memcpy(dst, page_address(skb_frag_page(frag)) + frag->offset + (offset - frag_offset), chunk_size); offset += chunk_size; diff --git a/test/mock.c b/test/mock.c index abe99df8..6e2b3e01 100644 --- a/test/mock.c +++ b/test/mock.c @@ -38,7 +38,7 @@ int 
mock_copy_data_errors = 0; int mock_copy_to_iter_errors = 0; int mock_copy_to_user_errors = 0; int mock_cpu_idle = 0; -int mock_import_single_range_errors = 0; +int mock_import_ubuf_errors = 0; int mock_import_iovec_errors = 0; int mock_ip6_xmit_errors = 0; int mock_ip_queue_xmit_errors = 0; @@ -136,9 +136,6 @@ bool mock_ipv6 = true; /* The value to use for mock_ipv6 in each test unless overridden. */ bool mock_ipv6_default; -/* Linux's idea of the current CPU number. */ -int cpu_number = 1; - /* List of priorities for all outbound packets. */ char mock_xmit_prios[1000]; int mock_xmit_prios_offset = 0; @@ -184,26 +181,16 @@ unsigned long page_offset_base = 0; unsigned long phys_base = 0; unsigned long vmemmap_base = 0; int __preempt_count = 0; +struct pcpu_hot pcpu_hot = {.cpu_number = 1}; char sock_flow_table[RPS_SOCK_FLOW_TABLE_SIZE(1024)]; -struct rps_sock_flow_table *rps_sock_flow_table - = (struct rps_sock_flow_table *) sock_flow_table; -__u32 rps_cpu_mask = 0x1f; +struct net_hotdata net_hotdata = { + .rps_cpu_mask = 0x1f, + .rps_sock_flow_table = (struct rps_sock_flow_table *) sock_flow_table +}; extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) {} -struct page *alloc_pages(gfp_t gfp, unsigned order) -{ - struct page *page; - if (mock_check_error(&mock_alloc_page_errors)) - return NULL; - page = (struct page *) malloc(PAGE_SIZE << order); - if (!pages_in_use) - pages_in_use = unit_hash_new(); - unit_hash_set(pages_in_use, page, (char *) 1); - return page; -} - struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, int node) { @@ -257,7 +244,7 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *iter) return 0; } while (bytes_left > 0) { - struct iovec *iov = (struct iovec *) iter->iov; + struct iovec *iov = (struct iovec *) iter_iov(iter); __u64 int_base = (__u64) iov->iov_base; size_t chunk_bytes = iov->iov_len; if (chunk_bytes > bytes_left) @@ -269,7 +256,7 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *iter) iov->iov_base = (void *) (int_base + chunk_bytes); iov->iov_len -= chunk_bytes; if (iov->iov_len == 0) - iter->iov++; + iter->__iov++; } return bytes; } @@ -336,8 +323,8 @@ void dst_release(struct dst_entry *dst) { if (!dst) return; - dst->__refcnt.counter--; - if (dst->__refcnt.counter > 0) + atomic_dec(&dst->__rcuref.refcnt); + if (atomic_read(&dst->__rcuref.refcnt) > 0) return; if (!routes_in_use || unit_hash_get(routes_in_use, dst) == NULL) { FAIL("dst_release on unknown route"); @@ -422,14 +409,11 @@ ssize_t import_iovec(int type, const struct iovec __user * uvector, return size; } -int import_single_range(int type, void __user *buf, size_t len, - struct iovec *iov, struct iov_iter *i) +int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i) { - if (mock_check_error(&mock_import_single_range_errors)) + if (mock_check_error(&mock_import_ubuf_errors)) return -EACCES; - iov->iov_base = buf; - iov->iov_len = len; - iov_iter_init(i, type, iov, 1, len); + iov_iter_ubuf(i, rw, buf, len); return 0; } @@ -540,7 +524,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction, { direction &= READ | WRITE; i->iter_type = ITER_IOVEC | direction; - i->iov = iov; + i->__iov = iov; i->nr_segs = nr_segs; i->iov_offset = 0; i->count = count; @@ -572,7 +556,7 @@ struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, FAIL("malloc failed"); return ERR_PTR(-ENOMEM); } - route->dst.__refcnt.counter = 1; + atomic_set(&route->dst.__rcuref.refcnt, 
1); route->dst.ops = &mock_dst_ops; route->dst.dev = &mock_net_device; route->dst.obsolete = 0; @@ -667,7 +651,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, FAIL("malloc failed"); return ERR_PTR(-ENOMEM); } - route->dst.__refcnt.counter = 1; + atomic_set(&route->dst.__rcuref.refcnt, 1); route->dst.ops = &mock_dst_ops; route->dst.dev = &mock_net_device; route->dst.obsolete = 0; @@ -738,7 +722,7 @@ void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) shinfo->frag_list = next; } for (i = 0; i < shinfo->nr_frags; i++) { - put_page(shinfo->frags[i].bv_page); + put_page(skb_frag_page(&shinfo->frags[i])); } free(skb->head); free(skb); @@ -917,14 +901,13 @@ int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) return 1; } -void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t) {} - -struct ctl_table_header *register_net_sysctl(struct net *net, - const char *path, struct ctl_table *table) +bool rcuref_get_slowpath(rcuref_t *ref) { - return NULL; + return true; } +void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t) {} + void release_sock(struct sock *sk) { mock_active_locks--; @@ -939,7 +922,8 @@ void schedule(void) UNIT_HOOK("schedule"); } -void security_sk_classify_flow(struct sock *sk, struct flowi_common *flic) {} +void security_sk_classify_flow(const struct sock *sk, + struct flowi_common *flic) {} void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) {} @@ -965,7 +949,7 @@ int skb_copy_datagram_iter(const struct sk_buff *from, int offset, return 0; } while (bytes_left > 0) { - struct iovec *iov = (struct iovec *) iter->iov; + struct iovec *iov = (struct iovec *) iter_iov(iter); __u64 int_base = (__u64) iov->iov_base; size_t chunk_bytes = iov->iov_len; if (chunk_bytes > bytes_left) @@ -980,7 +964,7 @@ int skb_copy_datagram_iter(const struct sk_buff *from, int offset, iov->iov_base = (void *) (int_base + chunk_bytes); iov->iov_len -= chunk_bytes; if (iov->iov_len == 0) - iter->iov++; + iter->__iov++; } return 0; } @@ -1049,8 +1033,8 @@ int sock_common_setsockopt(struct socket *sock, int level, int optname, return 0; } -int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +int sock_no_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { return 0; } @@ -1108,21 +1092,6 @@ int vfs_fsync(struct file *file, int datasync) return 0; } -void *vmalloc(size_t size) -{ - if (mock_check_error(&mock_vmalloc_errors)) - return NULL; - void *block = malloc(size); - if (!block) { - FAIL("malloc failed"); - return NULL; - } - if (!vmallocs_in_use) - vmallocs_in_use = unit_hash_new(); - unit_hash_set(vmallocs_in_use, block, "used"); - return block; -} - void wait_for_completion(struct completion *x) {} long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, @@ -1151,6 +1120,22 @@ int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, return 0; } +/** + * mock_alloc_pages() - Called instead of alloc_pages when Homa is compiled + * for unit testing. + */ +struct page *mock_alloc_pages(gfp_t gfp, unsigned order) +{ + struct page *page; + if (mock_check_error(&mock_alloc_page_errors)) + return NULL; + page = (struct page *)malloc(PAGE_SIZE << order); + if (!pages_in_use) + pages_in_use = unit_hash_new(); + unit_hash_set(pages_in_use, page, (char *)1); + return page; +} + /** * mock_check_error() - Determines whether a method should simulate an error * return. 
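The hunk above elides the body of mock_check_error(), but the convention behind all the mock_*_errors masks is one bit per call. A plausible sketch, assumed from how the masks are primed throughout these tests rather than copied from the repo:

int mock_check_error(int *errorMask)
{
	/* Bit 0 governs this call, bit 1 the next call, and so on;
	 * a set bit means "simulate a failure".
	 */
	int result = *errorMask & 1;

	*errorMask >>= 1;
	return result;
}

Under this scheme mock_trylock_errors = ~1 makes the first trylock succeed and every later one fail, while 0xfe0 (binary 111111100000) lets the first five attempts succeed before failing the next seven.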
@@ -1304,6 +1289,25 @@ void mock_rcu_read_unlock(void) mock_active_rcu_locks--; } +/** + * mock_register_net_sysctl() - Called instead of register_net_sysctl + * when Homa is compiled for unit testing. + */ +struct ctl_table_header *mock_register_net_sysctl(struct net *net, + const char *path, struct ctl_table *table) +{ + return NULL; +} + +/** + * mock_set_core() - Set internal state that indicates the "current core". + * @num: Integer identifier for a core. + */ +void mock_set_core(int num) +{ + pcpu_hot.cpu_number = num; +} + /** * mock_skb_new() - Allocate and return a packet buffer. The buffer is * initialized as if it just arrived from the network. @@ -1451,7 +1455,7 @@ void mock_spin_unlock(spinlock_t *lock) */ void mock_teardown(void) { - cpu_number = 1; + pcpu_hot.cpu_number = 1; cpu_khz = 1000000; mock_alloc_page_errors = 0; mock_alloc_skb_errors = 0; @@ -1461,7 +1465,7 @@ void mock_teardown(void) mock_cpu_idle = 0; mock_cycles = 0; mock_ipv6 = mock_ipv6_default; - mock_import_single_range_errors = 0; + mock_import_ubuf_errors = 0; mock_import_iovec_errors = 0; mock_ip6_xmit_errors = 0; mock_ip_queue_xmit_errors = 0; @@ -1535,3 +1539,24 @@ void mock_teardown(void) unit_hook_clear(); } + +/** + * mock_vmalloc() - Called instead of vmalloc when Homa is compiled + * for unit testing. + * @size: Number of bytes to allocate. + */ +void *mock_vmalloc(size_t size) +{ + if (mock_check_error(&mock_vmalloc_errors)) + return NULL; + void *block = malloc(size); + if (!block) + { + FAIL("malloc failed"); + return NULL; + } + if (!vmallocs_in_use) + vmallocs_in_use = unit_hash_new(); + unit_hash_set(vmallocs_in_use, block, "used"); + return block; +} diff --git a/test/mock.h b/test/mock.h index c2e19a7d..20dcb2bf 100644 --- a/test/mock.h +++ b/test/mock.h @@ -4,7 +4,6 @@ /* Functions for mocking that are exported to test code. 
*/ -extern int cpu_number; extern int mock_alloc_page_errors; extern int mock_alloc_skb_errors; extern int mock_bpage_size; @@ -16,7 +15,7 @@ extern int mock_copy_to_user_errors; extern int mock_cpu_idle; extern cycles_t mock_cycles; extern int mock_import_iovec_errors; -extern int mock_import_single_range_errors; +extern int mock_import_ubuf_errors; extern int mock_ip6_xmit_errors; extern int mock_ip_queue_xmit_errors; extern bool mock_ipv6; @@ -40,6 +39,8 @@ extern int mock_vmalloc_errors; extern int mock_xmit_log_verbose; extern int mock_xmit_log_homa_info; +extern struct page * + mock_alloc_pages(gfp_t gfp, unsigned order); extern int mock_check_error(int *errorMask); extern void mock_clear_xmit_prios(void); extern void mock_data_ready(struct sock *sk); @@ -52,6 +53,10 @@ extern int mock_page_refs(struct page *page); extern void mock_put_page(struct page *page); extern void mock_rcu_read_lock(void); extern void mock_rcu_read_unlock(void); +extern struct ctl_table_header * + mock_register_net_sysctl(struct net *net, + const char *path, struct ctl_table *table); +extern void mock_set_core(int num); extern void mock_spin_lock(spinlock_t *lock); extern void mock_spin_unlock(spinlock_t *lock); extern int mock_skb_count(void); @@ -63,3 +68,4 @@ extern void mock_sock_destroy(struct homa_sock *hsk, extern void mock_sock_init(struct homa_sock *hsk, struct homa *homa, int port); extern void mock_teardown(void); +extern void *mock_vmalloc(size_t size); diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 5b1bb526..fcfc529a 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -192,7 +192,7 @@ TEST_F(homa_grant, homa_grant_add_rpc__update_metrics) mock_cycles = 200; test_rpc(self, 100, self->server_ip, 100000); EXPECT_EQ(4, self->homa.num_grantable_rpcs); - EXPECT_EQ(300, homa_cores[cpu_number]->metrics.grantable_rpcs_integral); + EXPECT_EQ(300, core_metrics.grantable_rpcs_integral); EXPECT_EQ(200, self->homa.last_grantable_change); } TEST_F(homa_grant, homa_grant_add_rpc__insert_in_peer_list) @@ -335,7 +335,7 @@ TEST_F(homa_grant, homa_grant_remove_rpc__update_metrics) homa_grant_remove_rpc(rpc); EXPECT_EQ(2, self->homa.num_grantable_rpcs); - EXPECT_EQ(300, homa_cores[cpu_number]->metrics.grantable_rpcs_integral); + EXPECT_EQ(300, core_metrics.grantable_rpcs_integral); EXPECT_EQ(200, self->homa.last_grantable_change); } TEST_F(homa_grant, homa_grant_remove_rpc__not_first_in_peer_list) @@ -739,7 +739,7 @@ TEST_F(homa_grant, homa_grant_recalc__basics) EXPECT_EQ(2, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(-1, atomic_read(&rpc4->msgin.rank)); - EXPECT_NE(0, homa_cores[cpu_number]->metrics.grant_recalc_cycles); + EXPECT_NE(0, core_metrics.grant_recalc_cycles); } TEST_F(homa_grant, homa_grant_recalc__already_locked) { @@ -763,7 +763,7 @@ TEST_F(homa_grant, homa_grant_recalc__skip_recalc) EXPECT_STREQ("", unit_log_get()); EXPECT_EQ(0, rpc->msgin.granted); EXPECT_EQ(2, atomic_read(&self->homa.grant_recalc_count)); - EXPECT_EQ(1, homa_cores[cpu_number]->metrics.grant_recalc_skips); + EXPECT_EQ(1, core_metrics.grant_recalc_skips); } TEST_F(homa_grant, homa_grant_recalc__clear_existing_active_rpcs) { @@ -871,14 +871,14 @@ TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted_but_skip_recalc) unit_hook_register(grantable_spinlock_hook); hook_homa = &self->homa; mock_trylock_errors = 0xfe0; - EXPECT_EQ(0, homa_cores[cpu_number]->metrics.grant_recalc_skips); + EXPECT_EQ(0, core_metrics.grant_recalc_skips); homa_grant_recalc(&self->homa, 0); EXPECT_EQ(10000, rpc1->msgin.granted); 
 EXPECT_EQ(10000, rpc2->msgin.granted);
 EXPECT_EQ(0, rpc3->msgin.granted);
 EXPECT_EQ(0, rpc4->msgin.granted);
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.grant_recalc_skips);
+	EXPECT_EQ(1, core_metrics.grant_recalc_skips);
 }
 
 TEST_F(homa_grant, homa_grant_pick_rpcs__basics)
@@ -1073,8 +1073,8 @@ TEST_F(homa_grant, homa_grantable_lock_slow__basics)
 	EXPECT_EQ(1, homa_grantable_lock_slow(&self->homa, 0));
 	homa_grantable_unlock(&self->homa);
 
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.grantable_lock_misses);
-	EXPECT_EQ(500, homa_cores[cpu_number]->metrics.grantable_lock_miss_cycles);
+	EXPECT_EQ(1, core_metrics.grantable_lock_misses);
+	EXPECT_EQ(500, core_metrics.grantable_lock_miss_cycles);
 }
 TEST_F(homa_grant, homa_grantable_lock_slow__recalc_count)
 {
@@ -1086,12 +1086,12 @@ TEST_F(homa_grant, homa_grantable_lock_slow__recalc_count)
 	EXPECT_EQ(0, homa_grantable_lock_slow(&self->homa, 1));
 	hook_homa = NULL;
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.grantable_lock_misses);
-	EXPECT_EQ(500, homa_cores[cpu_number]->metrics.grantable_lock_miss_cycles);
+	EXPECT_EQ(1, core_metrics.grantable_lock_misses);
+	EXPECT_EQ(500, core_metrics.grantable_lock_miss_cycles);
 
 	/* Make sure the check only occurs if the recalc argument is set. */
 	mock_trylock_errors = 0xff;
 	EXPECT_EQ(1, homa_grantable_lock_slow(&self->homa, 0));
-	EXPECT_EQ(2, homa_cores[cpu_number]->metrics.grantable_lock_misses);
+	EXPECT_EQ(2, core_metrics.grantable_lock_misses);
 	homa_grantable_unlock(&self->homa);
 }
\ No newline at end of file
diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c
index 003b04ee..86e88663 100644
--- a/test/unit_homa_incoming.c
+++ b/test/unit_homa_incoming.c
@@ -222,11 +222,11 @@ TEST_F(homa_incoming, homa_message_in_init__update_metrics)
 	EXPECT_EQ(0, homa_message_in_init(crpc, 0x3000, 0));
 	EXPECT_EQ(0, homa_message_in_init(crpc, 1000000, 0));
 	EXPECT_EQ(0, homa_message_in_init(crpc, 900000, 0));
-	EXPECT_EQ(270, homa_cores[cpu_number]->metrics.small_msg_bytes[2]);
-	EXPECT_EQ(0xfff, homa_cores[cpu_number]->metrics.small_msg_bytes[63]);
-	EXPECT_EQ(0x3000, homa_cores[cpu_number]->metrics.medium_msg_bytes[11]);
-	EXPECT_EQ(0, homa_cores[cpu_number]->metrics.medium_msg_bytes[15]);
-	EXPECT_EQ(1900000, homa_cores[cpu_number]->metrics.large_msg_bytes);
+	EXPECT_EQ(270, core_metrics.small_msg_bytes[2]);
+	EXPECT_EQ(0xfff, core_metrics.small_msg_bytes[63]);
+	EXPECT_EQ(0x3000, core_metrics.medium_msg_bytes[11]);
+	EXPECT_EQ(0, core_metrics.medium_msg_bytes[15]);
+	EXPECT_EQ(1900000, core_metrics.large_msg_bytes);
 }
 
 TEST_F(homa_incoming, homa_gap_retry)
@@ -567,21 +567,21 @@ TEST_F(homa_incoming, homa_add_packet__metrics)
 	homa_add_packet(crpc, mock_skb_new(self->client_ip,
 			&self->data.common, 1400, 0));
 	EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets));
-	EXPECT_EQ(0, homa_cores[cpu_number]->metrics.resent_discards);
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.packet_discards);
+	EXPECT_EQ(0, core_metrics.resent_discards);
+	EXPECT_EQ(1, core_metrics.packet_discards);
 
 	self->data.retransmit = 1;
 	homa_add_packet(crpc, mock_skb_new(self->client_ip,
 			&self->data.common, 1400, 0));
 	EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets));
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.resent_discards);
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.packet_discards);
+	EXPECT_EQ(1, core_metrics.resent_discards);
+	EXPECT_EQ(1, core_metrics.packet_discards);
 
 	self->data.seg.offset = htonl(4200);
 	homa_add_packet(crpc, mock_skb_new(self->client_ip,
 			&self->data.common, 1400, 4200));
 	EXPECT_EQ(1,
			skb_queue_len(&crpc->msgin.packets));
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.resent_packets_used);
+	EXPECT_EQ(1, core_metrics.resent_packets_used);
 }
 
 TEST_F(homa_incoming, homa_copy_to_user__basics)
@@ -718,7 +718,7 @@ TEST_F(homa_incoming, homa_copy_to_user__error_in_import_single_range)
 	ASSERT_NE(NULL, crpc);
 
 	unit_log_clear();
-	mock_import_single_range_errors = 1;
+	mock_import_ubuf_errors = 1;
 	EXPECT_EQ(13, -homa_copy_to_user(crpc));
 	EXPECT_STREQ("", unit_log_get());
 	EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets));
@@ -859,7 +859,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__cant_create_server_rpc)
 			1400, 0), &self->homa);
 	EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs));
 	EXPECT_EQ(0, mock_skb_count());
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.server_cant_create_rpcs);
+	EXPECT_EQ(1, core_metrics.server_cant_create_rpcs);
 }
 TEST_F(homa_incoming, homa_dispatch_pkts__existing_server_rpc)
 {
@@ -922,7 +922,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_client_rpc)
 	mock_xmit_log_verbose = 1;
 	homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0),
 			&self->homa);
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.unknown_rpcs);
+	EXPECT_EQ(1, core_metrics.unknown_rpcs);
 }
 TEST_F(homa_incoming, homa_dispatch_pkts__unknown_server_rpc)
 {
@@ -933,7 +933,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_server_rpc)
 	mock_xmit_log_verbose = 1;
 	homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0),
 			&self->homa);
-	EXPECT_EQ(0, homa_cores[cpu_number]->metrics.unknown_rpcs);
+	EXPECT_EQ(0, core_metrics.unknown_rpcs);
 }
 TEST_F(homa_incoming, homa_dispatch_pkts__cutoffs_for_unknown_client_rpc)
 {
@@ -1009,7 +1009,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_type)
 			.dport = htons(self->hsk.port),
 			.sender_id = cpu_to_be64(self->server_id), .type = 99};
 	homa_dispatch_pkts(mock_skb_new(self->client_ip, &h, 0, 0), &self->homa);
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.unknown_packet_types);
+	EXPECT_EQ(1, core_metrics.unknown_packet_types);
 }
 TEST_F(homa_incoming, homa_dispatch_pkts__handle_ack)
 {
@@ -1077,7 +1077,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__forced_reap)
 	homa_dispatch_pkts(mock_skb_new(self->client_ip, &self->data.common,
 			1400, 0), &self->homa);
 	EXPECT_EQ(31, self->hsk.dead_skbs);
-	EXPECT_EQ(0, homa_cores[cpu_number]->metrics.data_pkt_reap_cycles);
+	EXPECT_EQ(0, core_metrics.data_pkt_reap_cycles);
 
 	/* Second packet: must reap.
	 */
 	self->homa.dead_buffs_limit = 15;
@@ -1085,7 +1085,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__forced_reap)
 	homa_dispatch_pkts(mock_skb_new(self->client_ip, &self->data.common,
 			1400, 0), &self->homa);
 	EXPECT_EQ(21, self->hsk.dead_skbs);
-	EXPECT_NE(0, homa_cores[cpu_number]->metrics.data_pkt_reap_cycles);
+	EXPECT_NE(0, core_metrics.data_pkt_reap_cycles);
 }
 
 TEST_F(homa_incoming, homa_data_pkt__basics)
@@ -1104,7 +1104,7 @@ TEST_F(homa_incoming, homa_data_pkt__basics)
 	EXPECT_EQ(200, crpc->msgin.bytes_remaining);
 	EXPECT_EQ(1, skb_queue_len(&crpc->msgin.packets));
 	EXPECT_EQ(1600, crpc->msgin.granted);
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.responses_received);
+	EXPECT_EQ(1, core_metrics.responses_received);
 }
 TEST_F(homa_incoming, homa_data_pkt__wrong_client_rpc_state)
 {
@@ -1169,7 +1169,7 @@ TEST_F(homa_incoming, homa_data_pkt__no_buffers)
 	atomic_set(&self->hsk.buffer_pool.free_bpages, 0);
 	homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common,
 			1400, 0), crpc);
-	EXPECT_EQ(1400, homa_cores[cpu_number]->metrics.dropped_data_no_bufs);
+	EXPECT_EQ(1400, core_metrics.dropped_data_no_bufs);
 	EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets));
 }
 TEST_F(homa_incoming, homa_data_pkt__update_delta)
@@ -1598,7 +1598,7 @@ TEST_F(homa_incoming, homa_cutoffs__cant_find_peer)
 	struct sk_buff *skb = mock_skb_new(self->server_ip, &h.common, 0, 0);
 	mock_kmalloc_errors = 1;
 	homa_cutoffs_pkt(skb, &self->hsk);
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.peer_kmalloc_errors);
+	EXPECT_EQ(1, core_metrics.peer_kmalloc_errors);
 	peer = homa_peer_find(&self->homa.peers, self->server_ip,
 			&self->hsk.inet);
 	ASSERT_FALSE(IS_ERR(peer));
@@ -1622,7 +1622,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_fully_received)
 			&self->homa);
 	EXPECT_STREQ("xmit ACK from 0.0.0.0:32768, dport 99, id 1234, acks",
 			unit_log_get());
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.packets_received[
+	EXPECT_EQ(1, core_metrics.packets_received[
 			NEED_ACK - DATA]);
 }
 TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_not_fully_received)
@@ -1641,7 +1641,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_not_fully_received)
 	homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0),
 			&self->homa);
 	EXPECT_STREQ("", unit_log_get());
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.packets_received[
+	EXPECT_EQ(1, core_metrics.packets_received[
 			NEED_ACK - DATA]);
 }
 TEST_F(homa_incoming, homa_need_ack_pkt__rpc_not_incoming)
@@ -1660,7 +1660,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_not_incoming)
 	homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0),
 			&self->homa);
 	EXPECT_STREQ("", unit_log_get());
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.packets_received[
+	EXPECT_EQ(1, core_metrics.packets_received[
 			NEED_ACK - DATA]);
 }
 TEST_F(homa_incoming, homa_need_ack_pkt__rpc_doesnt_exist)
@@ -1701,8 +1701,7 @@ TEST_F(homa_incoming, homa_ack_pkt__target_rpc_exists)
 	homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0),
 			&self->homa);
 	EXPECT_EQ(0, unit_list_length(&self->hsk2.active_rpcs));
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.packets_received[
-			ACK - DATA]);
+	EXPECT_EQ(1, core_metrics.packets_received[ACK - DATA]);
 }
 TEST_F(homa_incoming, homa_ack_pkt__target_rpc_doesnt_exist)
 {
diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c
index 70de2ca8..ae409a55 100644
--- a/test/unit_homa_offload.c
+++ b/test/unit_homa_offload.c
@@ -87,7 +87,7 @@ FIXTURE_SETUP(homa_offload)
 	/* Configure so core isn't considered too busy for bypasses.
	 */
 	mock_cycles = 1000;
 	self->homa.gro_busy_cycles = 500;
-	homa_cores[cpu_number]->last_gro = 400;
+	cur_core->last_gro = 400;
 }
 FIXTURE_TEARDOWN(homa_offload)
 {
@@ -159,10 +159,10 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_homa_ipv6)
 	h->flags = HOMA_TCP_FLAGS;
 	h->urgent = htons(HOMA_TCP_URGENT);
 	NAPI_GRO_CB(skb)->same_flow = 0;
-	homa_cores[cpu_number]->held_skb = NULL;
-	homa_cores[cpu_number]->held_bucket = 99;
+	cur_core->held_skb = NULL;
+	cur_core->held_bucket = 99;
 	EXPECT_EQ(NULL, homa_tcp_gro_receive(&self->empty_list, skb));
-	EXPECT_EQ(skb, homa_cores[cpu_number]->held_skb);
+	EXPECT_EQ(skb, cur_core->held_skb);
 	EXPECT_STREQ("", unit_log_get());
 	EXPECT_EQ(IPPROTO_HOMA, ipv6_hdr(skb)->nexthdr);
 	kfree_skb(skb);
@@ -181,10 +181,10 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_homa_ipv4)
 	h->flags = HOMA_TCP_FLAGS;
 	h->urgent = htons(HOMA_TCP_URGENT);
 	NAPI_GRO_CB(skb)->same_flow = 0;
-	homa_cores[cpu_number]->held_skb = NULL;
-	homa_cores[cpu_number]->held_bucket = 99;
+	cur_core->held_skb = NULL;
+	cur_core->held_bucket = 99;
 	EXPECT_EQ(NULL, homa_tcp_gro_receive(&self->empty_list, skb));
-	EXPECT_EQ(skb, homa_cores[cpu_number]->held_skb);
+	EXPECT_EQ(skb, cur_core->held_skb);
 	EXPECT_STREQ("", unit_log_get());
 	EXPECT_EQ(IPPROTO_HOMA, ip_hdr(skb)->protocol);
 	EXPECT_EQ(2303, ip_hdr(skb)->check);
@@ -220,8 +220,8 @@ TEST_F(homa_offload, homa_gro_receive__update_offset_from_sequence)
 	self->header.seg.offset = -1;
 	skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0);
 	NAPI_GRO_CB(skb)->same_flow = 0;
-	homa_cores[cpu_number]->held_skb = NULL;
-	homa_cores[cpu_number]->held_bucket = 99;
+	cur_core->held_skb = NULL;
+	cur_core->held_bucket = 99;
 	EXPECT_EQ(NULL, homa_gro_receive(&self->empty_list, skb));
 	h = (struct data_header *) skb_transport_header(skb);
 	EXPECT_EQ(6000, htonl(h->seg.offset));
@@ -267,33 +267,33 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS)
 	skb = mock_skb_new(&self->ip, &h.common, 1400, 2000);
 	struct sk_buff *result = homa_gro_receive(&self->empty_list, skb);
 	EXPECT_EQ(0, -PTR_ERR(result));
-	EXPECT_EQ(0, homa_cores[cpu_number]->metrics.gro_data_bypasses);
+	EXPECT_EQ(0, core_metrics.gro_data_bypasses);
 
 	/* Second attempt: HOMA_GRO_SHORT_BYPASS enabled but message longer
	 * than one packet.
	 */
 	self->homa.gro_policy |= HOMA_GRO_SHORT_BYPASS;
-	homa_cores[cpu_number]->last_gro = 400;
+	cur_core->last_gro = 400;
 	skb2 = mock_skb_new(&self->ip, &h.common, 1400, 2000);
 	result = homa_gro_receive(&self->empty_list, skb2);
 	EXPECT_EQ(0, -PTR_ERR(result));
-	EXPECT_EQ(0, homa_cores[cpu_number]->metrics.gro_data_bypasses);
+	EXPECT_EQ(0, core_metrics.gro_data_bypasses);
 
 	/* Third attempt: bypass should happen. */
 	h.message_length = htonl(1400);
 	h.incoming = htonl(1400);
-	homa_cores[cpu_number]->last_gro = 400;
+	cur_core->last_gro = 400;
 	skb3 = mock_skb_new(&self->ip, &h.common, 1400, 4000);
 	result = homa_gro_receive(&self->empty_list, skb3);
 	EXPECT_EQ(EINPROGRESS, -PTR_ERR(result));
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.gro_data_bypasses);
+	EXPECT_EQ(1, core_metrics.gro_data_bypasses);
 
 	/* Third attempt: no bypass because core busy.
	 */
-	homa_cores[cpu_number]->last_gro = 600;
+	cur_core->last_gro = 600;
 	skb4 = mock_skb_new(&self->ip, &h.common, 1400, 4000);
 	result = homa_gro_receive(&self->empty_list, skb3);
 	EXPECT_EQ(0, -PTR_ERR(result));
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.gro_data_bypasses);
+	EXPECT_EQ(1, core_metrics.gro_data_bypasses);
 
 	kfree_skb(skb);
 	kfree_skb(skb2);
@@ -326,24 +326,24 @@ TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization)
 	struct sk_buff *skb = mock_skb_new(&client_ip, &h.common, 0, 0);
 	struct sk_buff *result = homa_gro_receive(&self->empty_list, skb);
 	EXPECT_EQ(0, -PTR_ERR(result));
-	EXPECT_EQ(0, homa_cores[cpu_number]->metrics.gro_grant_bypasses);
+	EXPECT_EQ(0, core_metrics.gro_grant_bypasses);
 	EXPECT_STREQ("", unit_log_get());
 
 	/* Second attempt: HOMA_FAST_GRANTS is enabled. */
 	self->homa.gro_policy = HOMA_GRO_FAST_GRANTS;
-	homa_cores[cpu_number]->last_gro = 400;
+	cur_core->last_gro = 400;
 	struct sk_buff *skb2 = mock_skb_new(&client_ip, &h.common, 0, 0);
 	result = homa_gro_receive(&self->empty_list, skb2);
 	EXPECT_EQ(EINPROGRESS, -PTR_ERR(result));
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.gro_grant_bypasses);
+	EXPECT_EQ(1, core_metrics.gro_grant_bypasses);
 	EXPECT_SUBSTR("xmit DATA 1400@10000", unit_log_get());
 
 	/* Third attempt: core is too busy for fast grants. */
-	homa_cores[cpu_number]->last_gro = 600;
+	cur_core->last_gro = 600;
 	struct sk_buff *skb3 = mock_skb_new(&client_ip, &h.common, 0, 0);
 	result = homa_gro_receive(&self->empty_list, skb3);
 	EXPECT_EQ(0, -PTR_ERR(result));
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.gro_grant_bypasses);
+	EXPECT_EQ(1, core_metrics.gro_grant_bypasses);
 	kfree_skb(skb);
 	kfree_skb(skb3);
 }
@@ -354,13 +354,13 @@ TEST_F(homa_offload, homa_gro_receive__no_held_skb)
 	self->header.seg.offset = htonl(6000);
 	skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0);
 	NAPI_GRO_CB(skb)->same_flow = 0;
-	homa_cores[cpu_number]->held_skb = NULL;
-	homa_cores[cpu_number]->held_bucket = 99;
+	cur_core->held_skb = NULL;
+	cur_core->held_bucket = 99;
 	EXPECT_EQ(NULL, homa_gro_receive(&self->empty_list, skb));
 	same_flow = NAPI_GRO_CB(skb)->same_flow;
 	EXPECT_EQ(0, same_flow);
-	EXPECT_EQ(skb, homa_cores[cpu_number]->held_skb);
-	EXPECT_EQ(3, homa_cores[cpu_number]->held_bucket);
+	EXPECT_EQ(skb, cur_core->held_skb);
+	EXPECT_EQ(3, cur_core->held_bucket);
 	kfree_skb(skb);
 }
 TEST_F(homa_offload, homa_gro_receive__empty_merge_list)
@@ -370,21 +370,21 @@ TEST_F(homa_offload, homa_gro_receive__empty_merge_list)
 	self->header.seg.offset = htonl(6000);
 	skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0);
 	NAPI_GRO_CB(skb)->same_flow = 0;
-	homa_cores[cpu_number]->held_skb = skb;
-	homa_cores[cpu_number]->held_bucket = 3;
+	cur_core->held_skb = skb;
+	cur_core->held_bucket = 3;
 	EXPECT_EQ(NULL, homa_gro_receive(&self->empty_list, skb));
 	same_flow = NAPI_GRO_CB(skb)->same_flow;
 	EXPECT_EQ(0, same_flow);
-	EXPECT_EQ(skb, homa_cores[cpu_number]->held_skb);
-	EXPECT_EQ(3, homa_cores[cpu_number]->held_bucket);
+	EXPECT_EQ(skb, cur_core->held_skb);
+	EXPECT_EQ(3, cur_core->held_bucket);
 	kfree_skb(skb);
 }
 TEST_F(homa_offload, homa_gro_receive__merge)
 {
 	struct sk_buff *skb, *skb2;
 	int same_flow;
-	homa_cores[cpu_number]->held_skb = self->skb2;
-	homa_cores[cpu_number]->held_bucket = 2;
+	cur_core->held_skb = self->skb2;
+	cur_core->held_bucket = 2;
 
 	self->header.seg.offset = htonl(6000);
 	self->header.common.sender_id = cpu_to_be64(1002);
@@ -419,8 +419,8 @@ TEST_F(homa_offload, homa_gro_receive__max_gro_skbs)
 	// First packet: fits below the limit.
	homa->max_gro_skbs = 3;
-	homa_cores[cpu_number]->held_skb = self->skb2;
-	homa_cores[cpu_number]->held_bucket = 2;
+	cur_core->held_skb = self->skb2;
+	cur_core->held_bucket = 2;
 	self->header.seg.offset = htonl(6000);
 	skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0);
 	homa_gro_receive(&self->napi.gro_hash[3].list, skb);
@@ -444,7 +444,7 @@ TEST_F(homa_offload, homa_gro_receive__max_gro_skbs)
 	// Third packet also hits the limit for skb, causing the bucket
 	// to become empty.
 	homa->max_gro_skbs = 2;
-	homa_cores[cpu_number]->held_skb = self->skb;
+	cur_core->held_skb = self->skb;
 	skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0);
 	unit_log_clear();
 	EXPECT_EQ(EINPROGRESS, -PTR_ERR(homa_gro_receive(
@@ -462,7 +462,7 @@ TEST_F(homa_offload, homa_gro_gen2)
 	homa->gro_policy = HOMA_GRO_GEN2;
 	mock_cycles = 1000;
 	homa->busy_cycles = 100;
-	cpu_number = 5;
+	mock_set_core(5);
 	atomic_set(&homa_cores[6]->softirq_backlog, 1);
 	homa_cores[6]->last_gro = 0;
 	atomic_set(&homa_cores[7]->softirq_backlog, 0);
@@ -497,7 +497,7 @@ TEST_F(homa_offload, homa_gro_gen2)
 TEST_F(homa_offload, homa_gro_gen3__basics)
 {
 	homa->gro_policy = HOMA_GRO_GEN3;
-	struct homa_core *core = homa_cores[cpu_number];
+	struct homa_core *core = cur_core;
 	core->gen3_softirq_cores[0] = 3;
 	core->gen3_softirq_cores[1] = 7;
 	core->gen3_softirq_cores[2] = 5;
@@ -515,7 +515,7 @@ TEST_F(homa_offload, homa_gro_gen3__basics)
 TEST_F(homa_offload, homa_gro_gen3__stop_on_negative_core_id)
 {
 	homa->gro_policy = HOMA_GRO_GEN3;
-	struct homa_core *core = homa_cores[cpu_number];
+	struct homa_core *core = cur_core;
 	core->gen3_softirq_cores[0] = 3;
 	core->gen3_softirq_cores[1] = -1;
 	core->gen3_softirq_cores[2] = 5;
@@ -531,7 +531,7 @@ TEST_F(homa_offload, homa_gro_gen3__stop_on_negative_core_id)
 TEST_F(homa_offload, homa_gro_gen3__all_cores_busy_so_pick_first)
 {
 	homa->gro_policy = HOMA_GRO_GEN3;
-	struct homa_core *core = homa_cores[cpu_number];
+	struct homa_core *core = cur_core;
 	core->gen3_softirq_cores[0] = 3;
 	core->gen3_softirq_cores[1] = 7;
 	core->gen3_softirq_cores[2] = 5;
@@ -555,16 +555,16 @@ TEST_F(homa_offload, homa_gro_complete__GRO_IDLE)
 	homa_cores[1]->last_active = 15;
 	homa_cores[2]->last_active = 10;
 
-	cpu_number = 5;
+	mock_set_core(5);
 	homa_gro_complete(self->skb, 0);
 	EXPECT_EQ(1, self->skb->hash - 32);
 
 	homa_cores[6]->last_active = 5;
-	cpu_number = 5;
+	mock_set_core(5);
 	homa_gro_complete(self->skb, 0);
 	EXPECT_EQ(6, self->skb->hash - 32);
 
-	cpu_number = 6;
+	mock_set_core(6);
 	homa_gro_complete(self->skb, 0);
 	EXPECT_EQ(2, self->skb->hash - 32);
 }
diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c
index 7b04e882..4ecd3053 100644
--- a/test/unit_homa_outgoing.c
+++ b/test/unit_homa_outgoing.c
@@ -559,7 +559,7 @@ TEST_F(homa_outgoing, __homa_xmit_control__ipv4_error)
 	mock_ip_queue_xmit_errors = 1;
 	EXPECT_EQ(ENETDOWN, -homa_xmit_control(GRANT, &h, sizeof(h), srpc));
 	EXPECT_STREQ("", unit_log_get());
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.control_xmit_errors);
+	EXPECT_EQ(1, core_metrics.control_xmit_errors);
 }
 TEST_F(homa_outgoing, __homa_xmit_control__ipv6_error)
 {
@@ -583,7 +583,7 @@ TEST_F(homa_outgoing, __homa_xmit_control__ipv6_error)
 	mock_ip6_xmit_errors = 1;
 	EXPECT_EQ(ENETDOWN, -homa_xmit_control(GRANT, &h, sizeof(h), srpc));
 	EXPECT_STREQ("", unit_log_get());
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.control_xmit_errors);
+	EXPECT_EQ(1, core_metrics.control_xmit_errors);
 }
 
 TEST_F(homa_outgoing, homa_xmit_unknown)
@@ -741,13 +741,13 @@ TEST_F(homa_outgoing, __homa_xmit_data__fill_dst)
 			self->server_port, self->client_id, 1000, 1000);
 	unit_log_clear();
 	dst = crpc->peer->dst;
-	old_refcount = dst->__refcnt.counter;
+	old_refcount = atomic_read(&dst->__rcuref.refcnt);
 
 	skb_get(crpc->msgout.packets);
 	__homa_xmit_data(crpc->msgout.packets, crpc, 6);
 	EXPECT_STREQ("xmit DATA 1000@0", unit_log_get());
 	EXPECT_EQ(dst, skb_dst(crpc->msgout.packets));
-	EXPECT_EQ(old_refcount+1, dst->__refcnt.counter);
+	EXPECT_EQ(old_refcount+1, atomic_read(&dst->__rcuref.refcnt));
 }
 TEST_F(homa_outgoing, __homa_xmit_data__ipv4_transmit_error)
 {
@@ -763,7 +763,7 @@ TEST_F(homa_outgoing, __homa_xmit_data__ipv4_transmit_error)
 	mock_ip_queue_xmit_errors = 1;
 	skb_get(crpc->msgout.packets);
 	__homa_xmit_data(crpc->msgout.packets, crpc, 5);
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.data_xmit_errors);
+	EXPECT_EQ(1, core_metrics.data_xmit_errors);
 }
 TEST_F(homa_outgoing, __homa_xmit_data__ipv6_transmit_error)
 {
@@ -779,7 +779,7 @@ TEST_F(homa_outgoing, __homa_xmit_data__ipv6_transmit_error)
 	mock_ip6_xmit_errors = 1;
 	skb_get(crpc->msgout.packets);
 	__homa_xmit_data(crpc->msgout.packets, crpc, 5);
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.data_xmit_errors);
+	EXPECT_EQ(1, core_metrics.data_xmit_errors);
 }
 
 TEST_F(homa_outgoing, homa_resend_data__basics)
@@ -984,8 +984,8 @@ TEST_F(homa_outgoing, homa_check_nic_queue__pacer_metrics)
 	EXPECT_EQ(1, homa_check_nic_queue(&self->homa, crpc->msgout.packets,
 			true));
 	EXPECT_EQ(10500, atomic64_read(&self->homa.link_idle_time));
-	EXPECT_EQ(500, homa_cores[cpu_number]->metrics.pacer_bytes);
-	EXPECT_EQ(200, homa_cores[cpu_number]->metrics.pacer_lost_cycles);
+	EXPECT_EQ(500, core_metrics.pacer_bytes);
+	EXPECT_EQ(200, core_metrics.pacer_lost_cycles);
 }
 TEST_F(homa_outgoing, homa_check_nic_queue__queue_empty)
 {
@@ -1141,7 +1141,7 @@ TEST_F(homa_outgoing, homa_pacer_xmit__rpc_locked)
 	mock_trylock_errors = ~1;
 	homa_pacer_xmit(&self->homa);
 	EXPECT_STREQ("", unit_log_get());
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.pacer_skipped_rpcs);
+	EXPECT_EQ(1, core_metrics.pacer_skipped_rpcs);
 	unit_log_clear();
 	mock_trylock_errors = 0;
 	homa_pacer_xmit(&self->homa);
@@ -1234,16 +1234,16 @@ TEST_F(homa_outgoing, homa_add_to_throttled__inc_metrics)
 			self->server_port, self->client_id+4, 15000, 1000);
 
 	homa_add_to_throttled(crpc1);
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.throttle_list_adds);
-	EXPECT_EQ(0, homa_cores[cpu_number]->metrics.throttle_list_checks);
+	EXPECT_EQ(1, core_metrics.throttle_list_adds);
+	EXPECT_EQ(0, core_metrics.throttle_list_checks);
 
 	homa_add_to_throttled(crpc2);
-	EXPECT_EQ(2, homa_cores[cpu_number]->metrics.throttle_list_adds);
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.throttle_list_checks);
+	EXPECT_EQ(2, core_metrics.throttle_list_adds);
+	EXPECT_EQ(1, core_metrics.throttle_list_checks);
 
 	homa_add_to_throttled(crpc3);
-	EXPECT_EQ(3, homa_cores[cpu_number]->metrics.throttle_list_adds);
-	EXPECT_EQ(3, homa_cores[cpu_number]->metrics.throttle_list_checks);
+	EXPECT_EQ(3, core_metrics.throttle_list_adds);
+	EXPECT_EQ(3, core_metrics.throttle_list_checks);
 }
 
 TEST_F(homa_outgoing, homa_remove_from_throttled)
diff --git a/test/unit_homa_peertab.c b/test/unit_homa_peertab.c
index 006312a9..d1645cab 100644
--- a/test/unit_homa_peertab.c
+++ b/test/unit_homa_peertab.c
@@ -72,7 +72,7 @@ TEST_F(homa_peertab, homa_peer_find__basics)
 	peer2 = homa_peer_find(&self->peertab, ip2222, &self->hsk.inet);
 	EXPECT_NE(peer, peer2);
 
-	EXPECT_EQ(2, homa_cores[cpu_number]->metrics.peer_new_entries);
+	EXPECT_EQ(2, core_metrics.peer_new_entries);
 }
 
 static struct
		_test_data_homa_peertab *test_data;
@@ -191,7 +191,7 @@ TEST_F(homa_peertab, homa_peer_find__kmalloc_error)
 	peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet);
 	EXPECT_EQ(ENOMEM, -PTR_ERR(peer));
 
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.peer_kmalloc_errors);
+	EXPECT_EQ(1, core_metrics.peer_kmalloc_errors);
 }
 TEST_F(homa_peertab, homa_peer_find__route_error)
 {
@@ -201,7 +201,7 @@ TEST_F(homa_peertab, homa_peer_find__route_error)
 	peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet);
 	EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(peer));
 
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.peer_route_errors);
+	EXPECT_EQ(1, core_metrics.peer_route_errors);
 }
 
 TEST_F(homa_peertab, homa_dst_refresh__basics)
@@ -229,7 +229,7 @@ TEST_F(homa_peertab, homa_dst_refresh__routing_error)
 	mock_route_errors = 1;
 	homa_dst_refresh(&self->homa.peers, peer, &self->hsk);
 	EXPECT_EQ(old_dst, peer->dst);
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.peer_route_errors);
+	EXPECT_EQ(1, core_metrics.peer_route_errors);
 	EXPECT_EQ(0, dead_count(&self->homa.peers));
 }
 TEST_F(homa_peertab, homa_dst_refresh__malloc_error)
@@ -324,15 +324,15 @@ TEST_F(homa_peertab, homa_peer_lock_slow)
 	ASSERT_NE(NULL, peer);
 
 	homa_peer_lock(peer);
-	EXPECT_EQ(0, homa_cores[cpu_number]->metrics.peer_ack_lock_misses);
-	EXPECT_EQ(0, homa_cores[cpu_number]->metrics.peer_ack_lock_miss_cycles);
+	EXPECT_EQ(0, core_metrics.peer_ack_lock_misses);
+	EXPECT_EQ(0, core_metrics.peer_ack_lock_miss_cycles);
 	homa_peer_unlock(peer);
 	mock_trylock_errors = 1;
 	unit_hook_register(peer_spinlock_hook);
 	homa_peer_lock(peer);
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.peer_ack_lock_misses);
-	EXPECT_EQ(1000, homa_cores[cpu_number]->metrics.peer_ack_lock_miss_cycles);
+	EXPECT_EQ(1, core_metrics.peer_ack_lock_misses);
+	EXPECT_EQ(1000, core_metrics.peer_ack_lock_miss_cycles);
 	homa_peer_unlock(peer);
 }
diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c
index a671de47..556752f6 100644
--- a/test/unit_homa_plumbing.c
+++ b/test/unit_homa_plumbing.c
@@ -189,7 +189,7 @@ TEST_F(homa_plumbing, homa_ioc_abort__basics)
 			UNIT_OUTGOING, self->client_ip, self->server_ip,
 			self->server_port, self->client_id, 10000, 200);
 	ASSERT_NE(NULL, crpc);
-	EXPECT_EQ(0, homa_ioc_abort(&self->hsk.inet.sk, (unsigned long) &args));
+	EXPECT_EQ(0, homa_ioc_abort(&self->hsk.inet.sk, (int *) &args));
 	EXPECT_EQ(RPC_DEAD, crpc->state);
 	EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs));
 }
@@ -197,8 +197,7 @@ TEST_F(homa_plumbing, homa_ioc_abort__cant_read_user_args)
 {
 	struct homa_abort_args args = {self->client_id, 0};
 	mock_copy_data_errors = 1;
-	EXPECT_EQ(EFAULT, -homa_ioc_abort(&self->hsk.inet.sk,
-			(unsigned long) &args));
+	EXPECT_EQ(EFAULT, -homa_ioc_abort(&self->hsk.inet.sk, (int *) &args));
 }
 TEST_F(homa_plumbing, homa_ioc_abort__abort_multiple_rpcs)
 {
@@ -211,7 +210,7 @@ TEST_F(homa_plumbing, homa_ioc_abort__abort_multiple_rpcs)
 			self->server_port, self->client_id, 10000, 200);
 	ASSERT_NE(NULL, crpc1);
 	ASSERT_NE(NULL, crpc2);
-	EXPECT_EQ(0, homa_ioc_abort(&self->hsk.inet.sk, (unsigned long) &args));
+	EXPECT_EQ(0, homa_ioc_abort(&self->hsk.inet.sk, (int *) &args));
 	EXPECT_EQ(-ECANCELED, crpc1->error);
 	EXPECT_EQ(-ECANCELED, crpc2->error);
 	EXPECT_EQ(2, unit_list_length(&self->hsk.active_rpcs));
@@ -219,8 +218,7 @@ TEST_F(homa_plumbing, homa_ioc_abort__nonexistent_rpc)
 {
 	struct homa_abort_args args = {99, 0};
-	EXPECT_EQ(EINVAL, -homa_ioc_abort(&self->hsk.inet.sk,
-			(unsigned long) &args));
+	EXPECT_EQ(EINVAL, -homa_ioc_abort(&self->hsk.inet.sk, (int *) &args));
 }
 
 TEST_F(homa_plumbing, homa_set_sock_opt__bad_level)
@@ -270,7 +268,7 @@ TEST_F(homa_plumbing, homa_set_sock_opt__success)
 			sizeof(struct homa_set_buf_args)));
 	EXPECT_EQ(args.start, self->hsk.buffer_pool.region);
 	EXPECT_EQ(64, self->hsk.buffer_pool.num_bpages);
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.so_set_buf_calls);
+	EXPECT_EQ(1, core_metrics.so_set_buf_calls);
 }
 
 TEST_F(homa_plumbing, homa_sendmsg__args_not_in_user_space)
@@ -688,7 +686,7 @@ TEST_F(homa_plumbing, homa_softirq__packet_too_short)
 	skb->len -= 1;
 	homa_softirq(skb);
 	EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs));
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.short_packets);
+	EXPECT_EQ(1, core_metrics.short_packets);
 }
 TEST_F(homa_plumbing, homa_softirq__bogus_packet_type)
 {
@@ -697,7 +695,7 @@ TEST_F(homa_plumbing, homa_softirq__bogus_packet_type)
 	skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400);
 	homa_softirq(skb);
 	EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs));
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.short_packets);
+	EXPECT_EQ(1, core_metrics.short_packets);
 }
 TEST_F(homa_plumbing, homa_softirq__process_short_messages_first)
 {
diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c
index 628b1318..3970d5bd 100644
--- a/test/unit_homa_pool.c
+++ b/test/unit_homa_pool.c
@@ -61,7 +61,8 @@ static void change_owner_hook(char *id)
 		return;
 	if (!cur_pool)
 		return;
-	cur_pool->descriptors[cur_pool->cores[cpu_number].page_hint].owner = -1;
+	cur_pool->descriptors[cur_pool->cores[raw_smp_processor_id()]
+			.page_hint].owner = -1;
 }
 
 TEST_F(homa_pool, homa_pool_set_bpages_needed)
@@ -126,7 +127,7 @@ TEST_F(homa_pool, homa_pool_get_pages__basics)
 	EXPECT_EQ(1, pages[1]);
 	EXPECT_EQ(1, atomic_read(&pool->descriptors[1].refs));
 	EXPECT_EQ(-1, pool->descriptors[1].owner);
-	EXPECT_EQ(2, pool->cores[cpu_number].next_candidate);
+	EXPECT_EQ(2, pool->cores[raw_smp_processor_id()].next_candidate);
 	EXPECT_EQ(98, atomic_read(&pool->free_bpages));
 }
 TEST_F(homa_pool, homa_pool_get_pages__not_enough_space)
@@ -143,7 +144,7 @@ TEST_F(homa_pool, homa_pool_get_pages__set_limit)
 	struct homa_pool *pool = &self->hsk.buffer_pool;
 	__u32 pages[10];
 	atomic_set(&pool->free_bpages, 62);
-	pool->cores[cpu_number].next_candidate = 49;
+	pool->cores[raw_smp_processor_id()].next_candidate = 49;
 	EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0));
 	EXPECT_EQ(49, pages[0]);
 	EXPECT_EQ(0, pages[1]);
@@ -153,7 +154,7 @@ TEST_F(homa_pool, homa_pool_get_pages__set_limit_with_MIN_EXTRA)
 	struct homa_pool *pool = &self->hsk.buffer_pool;
 	__u32 pages[10];
 	atomic_set(&pool->free_bpages, 92);
-	pool->cores[cpu_number].next_candidate = 13;
+	pool->cores[raw_smp_processor_id()].next_candidate = 13;
 	EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0));
 	EXPECT_EQ(13, pages[0]);
 	EXPECT_EQ(0, pages[1]);
@@ -234,9 +235,9 @@ TEST_F(homa_pool, homa_pool_allocate__basics)
 	EXPECT_EQ(0, crpc->msgin.bpage_offsets[0]);
 	EXPECT_EQ(-1, pool->descriptors[0].owner);
 	EXPECT_EQ(2*HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[2]);
-	EXPECT_EQ(2, pool->cores[cpu_number].page_hint);
+	EXPECT_EQ(2, pool->cores[raw_smp_processor_id()].page_hint);
 	EXPECT_EQ(150000 - 2*HOMA_BPAGE_SIZE,
-			pool->cores[cpu_number].allocated);
+			pool->cores[raw_smp_processor_id()].allocated);
 }
 TEST_F(homa_pool, homa_pool_no_buffer_pool)
 {
@@ -278,7 +279,7 @@ TEST_F(homa_pool, homa_pool_allocate__no_partial_page)
 TEST_F(homa_pool, homa_pool_allocate__owned_page_locked_and_page_stolen)
 {
 	struct homa_pool *pool
			= &self->hsk.buffer_pool;
-	pool->cores[cpu_number].next_candidate = 2;
+	pool->cores[raw_smp_processor_id()].next_candidate = 2;
 	atomic_set(&pool->free_bpages, 40);
 	struct homa_rpc *crpc = unit_client_rpc(&self->hsk,
 			UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip,
@@ -286,7 +287,7 @@ TEST_F(homa_pool, homa_pool_allocate__owned_page_locked_and_page_stolen)
 	ASSERT_NE(NULL, crpc);
 
 	// First allocation just sets up a partially-allocated bpage.
-	EXPECT_EQ(2, pool->cores[cpu_number].page_hint);
+	EXPECT_EQ(2, pool->cores[raw_smp_processor_id()].page_hint);
 
 	// Try a second allocation; the lock hook steals the partial bpage,
 	// so a new one has to be allocated.
@@ -296,8 +297,8 @@ TEST_F(homa_pool, homa_pool_allocate__owned_page_locked_and_page_stolen)
 	EXPECT_EQ(0, homa_pool_allocate(crpc));
 	EXPECT_EQ(1, crpc->msgin.num_bpages);
 	EXPECT_EQ(3*HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[0]);
-	EXPECT_EQ(3, pool->cores[cpu_number].page_hint);
-	EXPECT_EQ(2000, pool->cores[cpu_number].allocated);
+	EXPECT_EQ(3, pool->cores[raw_smp_processor_id()].page_hint);
+	EXPECT_EQ(2000, pool->cores[raw_smp_processor_id()].allocated);
 	EXPECT_EQ(1, -pool->descriptors[2].owner);
 	EXPECT_EQ(1, pool->descriptors[3].owner);
 	EXPECT_EQ(38, atomic_read(&pool->free_bpages));
@@ -305,40 +306,40 @@ TEST_F(homa_pool, homa_pool_allocate__page_wrap_around)
 {
 	struct homa_pool *pool = &self->hsk.buffer_pool;
-	pool->cores[cpu_number].page_hint = 2;
-	pool->cores[cpu_number].allocated = HOMA_BPAGE_SIZE-1900;
+	pool->cores[raw_smp_processor_id()].page_hint = 2;
+	pool->cores[raw_smp_processor_id()].allocated = HOMA_BPAGE_SIZE-1900;
 	atomic_set(&pool->descriptors[2].refs, 1);
-	pool->descriptors[2].owner = cpu_number;
+	pool->descriptors[2].owner = raw_smp_processor_id();
 	struct homa_rpc *crpc = unit_client_rpc(&self->hsk,
 			UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip,
 			4000, 98, 1000, 2000);
 	ASSERT_NE(NULL, crpc);
 
-	EXPECT_EQ(2, pool->cores[cpu_number].page_hint);
+	EXPECT_EQ(2, pool->cores[raw_smp_processor_id()].page_hint);
 	EXPECT_EQ(1, crpc->msgin.num_bpages);
 	EXPECT_EQ(2*HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[0]);
-	EXPECT_EQ(2000, pool->cores[cpu_number].allocated);
-	EXPECT_EQ(cpu_number, pool->descriptors[2].owner);
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.bpage_reuses);
+	EXPECT_EQ(2000, pool->cores[raw_smp_processor_id()].allocated);
+	EXPECT_EQ(raw_smp_processor_id(), pool->descriptors[2].owner);
+	EXPECT_EQ(1, core_metrics.bpage_reuses);
 }
 TEST_F(homa_pool, homa_pool_allocate__owned_page_overflow)
 {
 	struct homa_pool *pool = &self->hsk.buffer_pool;
-	pool->cores[cpu_number].next_candidate = 2;
+	pool->cores[raw_smp_processor_id()].next_candidate = 2;
 	atomic_set(&pool->free_bpages, 50);
 	struct homa_rpc *crpc = unit_client_rpc(&self->hsk,
 			UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip,
 			4000, 98, 1000, 2000);
 	ASSERT_NE(NULL, crpc);
 
-	EXPECT_EQ(2, pool->cores[cpu_number].page_hint);
+	EXPECT_EQ(2, pool->cores[raw_smp_processor_id()].page_hint);
 	crpc->msgin.num_bpages = 0;
-	pool->cores[cpu_number].allocated = HOMA_BPAGE_SIZE-1900;
+	pool->cores[raw_smp_processor_id()].allocated = HOMA_BPAGE_SIZE-1900;
 	EXPECT_EQ(0, homa_pool_allocate(crpc));
 	EXPECT_EQ(1, crpc->msgin.num_bpages);
 	EXPECT_EQ(3*HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[0]);
-	EXPECT_EQ(3, pool->cores[cpu_number].page_hint);
-	EXPECT_EQ(2000, pool->cores[cpu_number].allocated);
+	EXPECT_EQ(3, pool->cores[raw_smp_processor_id()].page_hint);
+	EXPECT_EQ(2000,
			pool->cores[raw_smp_processor_id()].allocated);
 	EXPECT_EQ(-1, pool->descriptors[2].owner);
 	EXPECT_EQ(1, atomic_read(&pool->descriptors[2].refs));
 	EXPECT_EQ(1, pool->descriptors[3].owner);
@@ -347,7 +348,7 @@ TEST_F(homa_pool, homa_pool_allocate__reuse_owned_page)
 {
 	struct homa_pool *pool = &self->hsk.buffer_pool;
-	pool->cores[cpu_number].next_candidate = 2;
+	pool->cores[raw_smp_processor_id()].next_candidate = 2;
 	struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk,
 			UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip,
 			4000, 98, 1000, 2000);
@@ -362,8 +363,8 @@ TEST_F(homa_pool, homa_pool_allocate__reuse_owned_page)
 	EXPECT_EQ(1, crpc2->msgin.num_bpages);
 	EXPECT_EQ(2*HOMA_BPAGE_SIZE + 2000, crpc2->msgin.bpage_offsets[0]);
 	EXPECT_EQ(3, atomic_read(&pool->descriptors[2].refs));
-	EXPECT_EQ(2, pool->cores[cpu_number].page_hint);
-	EXPECT_EQ(5000, pool->cores[cpu_number].allocated);
+	EXPECT_EQ(2, pool->cores[raw_smp_processor_id()].page_hint);
+	EXPECT_EQ(5000, pool->cores[raw_smp_processor_id()].allocated);
 }
 TEST_F(homa_pool, homa_pool_allocate__cant_allocate_partial_bpage)
 {
@@ -404,7 +405,7 @@ TEST_F(homa_pool, homa_pool_allocate__out_of_space)
 	rpc = list_next_entry(rpc, buf_links);
 	EXPECT_EQ(100, rpc->id);
 	EXPECT_TRUE(list_is_last(&rpc->buf_links, &self->hsk.waiting_for_bufs));
-	EXPECT_EQ(3, homa_cores[cpu_number]->metrics.buffer_alloc_failures);
+	EXPECT_EQ(3, core_metrics.buffer_alloc_failures);
 	EXPECT_EQ(1, pool->bpages_needed);
 }
 
diff --git a/test/unit_homa_skb.c b/test/unit_homa_skb.c
index 34259f00..6de0ef12 100644
--- a/test/unit_homa_skb.c
+++ b/test/unit_homa_skb.c
@@ -95,7 +95,7 @@ TEST_F(homa_skb, homa_skb_cleanup)
 	core->skb_page = alloc_pages(GFP_KERNEL, 2);
 	add_to_pool(&self->homa, 5, 2);
 	add_to_pool(&self->homa, 4, 3);
-	cpu_number = 3;
+	mock_set_core(3);
 	homa_skb_stash_pages(&self->homa, 2 * HOMA_SKB_PAGE_SIZE - 100);
 	EXPECT_EQ(5, homa_cores[2]->numa->page_pool.avail);
 	EXPECT_EQ(2, homa_cores[3]->numa->page_pool.avail);
@@ -195,7 +195,7 @@ TEST_F(homa_skb, homa_skb_extend_frags__cant_merge_allocate_new_page)
 	EXPECT_NE(NULL, p3);
 	EXPECT_EQ(1000, length);
 	EXPECT_EQ(2, skb_shinfo(self->skb)->nr_frags);
-	EXPECT_EQ(0, skb_shinfo(self->skb)->frags[1].bv_offset);
+	EXPECT_EQ(0, skb_shinfo(self->skb)->frags[1].offset);
 	EXPECT_EQ(2000, self->skb->len);
 	EXPECT_EQ(1000, core->page_inuse);
 
@@ -223,7 +223,7 @@ TEST_F(homa_skb, homa_skb_extend_frags__cant_merge_use_same_page_reduce_length)
 	EXPECT_EQ(p2 + 512, p3);
 	EXPECT_EQ(512, length);
 	EXPECT_EQ(2, skb_shinfo(self->skb)->nr_frags);
-	EXPECT_EQ(1536, skb_shinfo(self->skb)->frags[1].bv_offset);
+	EXPECT_EQ(1536, skb_shinfo(self->skb)->frags[1].offset);
 	EXPECT_EQ(2048, core->page_inuse);
 
 	kfree_skb(skb2);
@@ -246,7 +246,7 @@ TEST_F(homa_skb, homa_skb_page_alloc__free_previous_page)
 }
 TEST_F(homa_skb, homa_skb_page_alloc__reuse_existing_page)
 {
-	struct homa_core *core = homa_cores[cpu_number];
+	struct homa_core *core = cur_core;
 	struct sk_buff *skb = homa_skb_new_tx(100);
 	struct page *page;
 	int length = 100;
@@ -262,8 +262,8 @@ TEST_F(homa_skb, homa_skb_page_alloc__reuse_existing_page)
 }
 TEST_F(homa_skb, homa_skb_page_alloc__from_stash)
 {
-	struct homa_core *core = homa_cores[cpu_number];
-	add_to_pool(&self->homa, 5, cpu_number);
+	struct homa_core *core = cur_core;
+	add_to_pool(&self->homa, 5, raw_smp_processor_id());
 	homa_skb_stash_pages(&self->homa, 3*HOMA_SKB_PAGE_SIZE - 100);
 	EXPECT_TRUE(homa_skb_page_alloc(&self->homa, core));
 	EXPECT_NE(NULL, core->skb_page);
@@ -273,8 +273,8 @@ TEST_F(homa_skb, homa_skb_page_alloc__from_stash)
 }
 TEST_F(homa_skb, homa_skb_page_alloc__from_pool)
 {
-	struct homa_core *core = homa_cores[cpu_number];
-	add_to_pool(&self->homa, 5, cpu_number);
+	struct homa_core *core = cur_core;
+	add_to_pool(&self->homa, 5, raw_smp_processor_id());
 	EXPECT_EQ(5, core->numa->page_pool.avail);
 	EXPECT_EQ(0, core->num_stashed_pages);
 	EXPECT_TRUE(homa_skb_page_alloc(&self->homa, core));
@@ -283,8 +283,8 @@ TEST_F(homa_skb, homa_skb_page_alloc__from_pool)
 }
 TEST_F(homa_skb, homa_skb_page_alloc__pool_page_taken_while_locking)
 {
-	struct homa_core *core = homa_cores[cpu_number];
-	add_to_pool(&self->homa, 1, cpu_number);
+	struct homa_core *core = cur_core;
+	add_to_pool(&self->homa, 1, raw_smp_processor_id());
 	EXPECT_EQ(1, core->numa->page_pool.avail);
 	EXPECT_EQ(0, core->num_stashed_pages);
 	hook_pool = &core->numa->page_pool;
@@ -297,15 +297,15 @@ TEST_F(homa_skb, homa_skb_page_alloc__pool_page_taken_while_locking)
 }
 TEST_F(homa_skb, homa_skb_page_alloc__new_large_page)
 {
-	struct homa_core *core = homa_cores[cpu_number];
+	struct homa_core *core = cur_core;
 	mock_cycles = ~0;
 	EXPECT_EQ(0, core->numa->page_pool.avail);
 	EXPECT_EQ(0, core->num_stashed_pages);
 	EXPECT_TRUE(homa_skb_page_alloc(&self->homa, core));
 	EXPECT_NE(NULL, core->skb_page);
 	EXPECT_EQ(HOMA_SKB_PAGE_SIZE, core->page_size);
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.skb_page_allocs);
-	EXPECT_NE(0, homa_cores[cpu_number]->metrics.skb_page_alloc_cycles);
+	EXPECT_EQ(1, core_metrics.skb_page_allocs);
+	EXPECT_NE(0, core_metrics.skb_page_alloc_cycles);
 }
 TEST_F(homa_skb, homa_skb_page_alloc__high_order_page_not_available)
 {
@@ -317,8 +317,8 @@ TEST_F(homa_skb, homa_skb_page_alloc__high_order_page_not_available)
 	EXPECT_NE(NULL, core->skb_page);
 	EXPECT_EQ(PAGE_SIZE, core->page_size);
 	EXPECT_EQ(0, core->page_inuse);
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.skb_page_allocs);
-	EXPECT_NE(0, homa_cores[cpu_number]->metrics.skb_page_alloc_cycles);
+	EXPECT_EQ(1, core_metrics.skb_page_allocs);
+	EXPECT_NE(0, core_metrics.skb_page_alloc_cycles);
 }
 TEST_F(homa_skb, homa_skb_page_alloc__no_pages_available)
 {
@@ -342,15 +342,15 @@ TEST_F(homa_skb, homa_skb_append_to_frag__basics)
 			"0123456789ABCDEFGHIJ", 21));
 
 	EXPECT_EQ(2, shinfo->nr_frags);
-	EXPECT_EQ(10, shinfo->frags[0].bv_len);
-	char *p = ((char *) page_address(shinfo->frags[0].bv_page))
-			+ shinfo->frags[0].bv_offset;
-	p[shinfo->frags[0].bv_len] = 0;
+	EXPECT_EQ(10, skb_frag_size(&shinfo->frags[0]));
+	char *p = ((char *) page_address(skb_frag_page(&shinfo->frags[0])))
+			+ shinfo->frags[0].offset;
+	p[skb_frag_size(&shinfo->frags[0])] = 0;
 	EXPECT_STREQ("abcd012345", p);
 
-	EXPECT_EQ(15, shinfo->frags[1].bv_len);
-	p = ((char *) page_address(shinfo->frags[1].bv_page))
-			+ shinfo->frags[1].bv_offset;
+	EXPECT_EQ(15, skb_frag_size(&shinfo->frags[1]));
+	p = ((char *) page_address(skb_frag_page(&shinfo->frags[1])))
+			+ shinfo->frags[1].offset;
 	EXPECT_STREQ("6789ABCDEFGHIJ", p);
 }
 TEST_F(homa_skb, homa_skb_append_to_frag__no_memory)
@@ -383,8 +383,8 @@ TEST_F(homa_skb, homa_skb_append_from_iter__basics)
 			unit_log_get());
 
 	EXPECT_EQ(2, shinfo->nr_frags);
-	EXPECT_EQ(4096, shinfo->frags[0].bv_len);
-	EXPECT_EQ(904, shinfo->frags[1].bv_len);
+	EXPECT_EQ(4096, skb_frag_size(&shinfo->frags[0]));
+	EXPECT_EQ(904, skb_frag_size(&shinfo->frags[1]));
 }
 TEST_F(homa_skb, homa_skb_append_from_iter__no_memory)
 {
@@ -513,7 +513,7 @@ TEST_F(homa_skb, homa_skb_free_many_tx__skb_ref_count_not_one)
 	length = HOMA_SKB_PAGE_SIZE;
	homa_skb_extend_frags(&self->homa, skb, &length);
 	EXPECT_EQ(HOMA_SKB_PAGE_SIZE, length);
-	page = skb_shinfo(skb)->frags[0].bv_page;
+	page = skb_frag_page(&skb_shinfo(skb)->frags[0]);
 	EXPECT_EQ(2, page_ref_count(page));
 	skb_get(skb);
 	EXPECT_EQ(2, refcount_read(&skb->users));
@@ -535,7 +535,7 @@ TEST_F(homa_skb, homa_skb_free_many_tx__check_page_order)
 		homa_skb_extend_frags(&self->homa, skb, &length);
 	}
 	EXPECT_EQ(HOMA_SKB_PAGE_SIZE, length);
-	struct page *page = skb_shinfo(skb)->frags[2].bv_page;
+	struct page *page = skb_frag_page(&skb_shinfo(skb)->frags[2]);
 
 	mock_compound_order_mask = 3;
 	homa_skb_free_many_tx(&self->homa, &skb, 1);
diff --git a/test/unit_homa_socktab.c b/test/unit_homa_socktab.c
index ef7f018f..646182a7 100644
--- a/test/unit_homa_socktab.c
+++ b/test/unit_homa_socktab.c
@@ -294,13 +294,13 @@ TEST_F(homa_socktab, homa_sock_lock_slow)
 	mock_cycles = ~0;
 
 	homa_sock_lock(&self->hsk, "unit test");
-	EXPECT_EQ(0, homa_cores[cpu_number]->metrics.socket_lock_misses);
-	EXPECT_EQ(0, homa_cores[cpu_number]->metrics.socket_lock_miss_cycles);
+	EXPECT_EQ(0, core_metrics.socket_lock_misses);
+	EXPECT_EQ(0, core_metrics.socket_lock_miss_cycles);
 	homa_sock_unlock(&self->hsk);
 	mock_trylock_errors = 1;
 	homa_sock_lock(&self->hsk, "unit test");
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.socket_lock_misses);
-	EXPECT_NE(0, homa_cores[cpu_number]->metrics.socket_lock_miss_cycles);
+	EXPECT_EQ(1, core_metrics.socket_lock_misses);
+	EXPECT_NE(0, core_metrics.socket_lock_miss_cycles);
 	homa_sock_unlock(&self->hsk);
 }
\ No newline at end of file
diff --git a/test/unit_homa_timer.c b/test/unit_homa_timer.c
index 28c00155..cd5193e9 100644
--- a/test/unit_homa_timer.c
+++ b/test/unit_homa_timer.c
@@ -136,11 +136,11 @@ TEST_F(homa_timer, homa_check_rpc__timeout)
 	unit_log_clear();
 	crpc->silent_ticks = self->homa.timeout_ticks-1;
 	homa_check_rpc(crpc);
-	EXPECT_EQ(0, homa_cores[cpu_number]->metrics.rpc_timeouts);
+	EXPECT_EQ(0, core_metrics.rpc_timeouts);
 	EXPECT_EQ(0, crpc->error);
 	crpc->silent_ticks = self->homa.timeout_ticks;
 	homa_check_rpc(crpc);
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.rpc_timeouts);
+	EXPECT_EQ(1, core_metrics.rpc_timeouts);
 	EXPECT_EQ(ETIMEDOUT, -crpc->error);
 }
 TEST_F(homa_timer, homa_check_rpc__issue_resend)
@@ -250,7 +250,7 @@ TEST_F(homa_timer, homa_timer__basics)
 	unit_log_clear();
 	crpc->peer->outstanding_resends = self->homa.timeout_resends;
 	homa_timer(&self->homa);
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.rpc_timeouts);
+	EXPECT_EQ(1, core_metrics.rpc_timeouts);
 	EXPECT_EQ(ETIMEDOUT, -crpc->error);
 }
 TEST_F(homa_timer, homa_timer__reap_dead_rpcs)
diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c
index b5297ee9..fcc82212 100644
--- a/test/unit_homa_utils.c
+++ b/test/unit_homa_utils.c
@@ -280,18 +280,18 @@ TEST_F(homa_utils, homa_bucket_lock_slow)
 	ASSERT_FALSE(IS_ERR(srpc));
 	homa_rpc_unlock(srpc);
 
-	EXPECT_EQ(0, homa_cores[cpu_number]->metrics.client_lock_misses);
-	EXPECT_EQ(0, homa_cores[cpu_number]->metrics.client_lock_miss_cycles);
+	EXPECT_EQ(0, core_metrics.client_lock_misses);
+	EXPECT_EQ(0, core_metrics.client_lock_miss_cycles);
 	homa_bucket_lock_slow(crpc->bucket, crpc->id);
 	homa_rpc_unlock(crpc);
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.client_lock_misses);
-	EXPECT_NE(0, homa_cores[cpu_number]->metrics.client_lock_miss_cycles);
-	EXPECT_EQ(0, homa_cores[cpu_number]->metrics.server_lock_misses);
-	EXPECT_EQ(0, homa_cores[cpu_number]->metrics.server_lock_miss_cycles);
+	EXPECT_EQ(1, core_metrics.client_lock_misses);
+	EXPECT_NE(0, core_metrics.client_lock_miss_cycles);
+	EXPECT_EQ(0, core_metrics.server_lock_misses);
+	EXPECT_EQ(0, core_metrics.server_lock_miss_cycles);
 	homa_bucket_lock_slow(srpc->bucket, srpc->id);
 	homa_rpc_unlock(srpc);
-	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.server_lock_misses);
-	EXPECT_NE(0, homa_cores[cpu_number]->metrics.server_lock_miss_cycles);
+	EXPECT_EQ(1, core_metrics.server_lock_misses);
+	EXPECT_NE(0, core_metrics.server_lock_miss_cycles);
 }
 
 TEST_F(homa_utils, homa_rpc_acked__basics)
diff --git a/test/utils.h b/test/utils.h
index 8e23de70..f782266f 100644
--- a/test/utils.h
+++ b/test/utils.h
@@ -30,6 +30,10 @@ enum unit_rpc_state {
 	UNIT_IN_SERVICE      = 24,
 };
 
+#define core_metrics homa_cores[raw_smp_processor_id()]->metrics
+
+#define cur_core homa_cores[raw_smp_processor_id()]
+
 extern char     *unit_ack_string(struct homa_ack *ack);
 extern struct homa_rpc
                *unit_client_rpc(struct homa_sock *hsk,
diff --git a/timetrace.c b/timetrace.c
index 737fd0c7..e429786c 100644
--- a/timetrace.c
+++ b/timetrace.c
@@ -9,18 +9,19 @@
 * timetrace stubs; we will then connect the timetrace mechanism here with
 * those stubs to allow the rest of the kernel to log in our buffers.
 */
-#define TT_KERNEL 1
+//#define TT_KERNEL 1
 #endif
 
 #ifdef TT_KERNEL
-extern int tt_linux_buffer_mask;
 extern struct tt_buffer *tt_linux_buffers[];
 extern void (*tt_linux_freeze)(void);
 extern atomic_t *tt_linux_freeze_count;
 extern atomic_t tt_linux_freeze_no_homa;
 extern int *tt_linux_homa_temp;
 extern int tt_linux_homa_temp_default[];
-extern void tt_inc_metric(int metric, __u64 count);
 extern void (*tt_linux_inc_metrics)(int metric, __u64 count);
+extern void (*tt_linux_record)(struct tt_buffer *buffer, __u64 timestamp,
+		const char* format, __u32 arg0, __u32 arg1, __u32 arg2,
+		__u32 arg3);
 extern void tt_linux_skip_metrics(int metric, __u64 count);
 extern void (*tt_linux_printk)(void);
 extern void (*tt_linux_dbg1)(char *msg, ...);
@@ -28,7 +29,12 @@ extern void (*tt_linux_dbg2)(char *msg, ...);
 extern void (*tt_linux_dbg3)(char *msg, ...);
 extern void tt_linux_nop(void);
 extern void homa_trace(__u64 u0, __u64 u1, int i0, int i1);
+
+extern void ltt_record_nop(struct tt_buffer *buffer, __u64 timestamp,
+		const char *format, __u32 arg0, __u32 arg1,
+		__u32 arg2, __u32 arg3);
 #endif
+extern void tt_inc_metric(int metric, __u64 count);
 
 /* Separate buffers for each core: this eliminates the need for
 * synchronization in tt_record, which improves performance significantly.
@@ -135,7 +141,7 @@ int tt_init(char *proc_file, int *temp)
 	for (i = 0; i < nr_cpu_ids; i++) {
 		tt_linux_buffers[i] = tt_buffers[i];
 	}
-	tt_linux_buffer_mask = TT_BUF_SIZE-1;
+	tt_linux_record = tt_record_buf;
 	tt_linux_freeze = tt_freeze;
 	tt_linux_freeze_count = &tt_freeze_count;
 	tt_linux_inc_metrics = tt_inc_metric;
@@ -177,6 +183,7 @@ void tt_destroy(void)
 	tt_freeze_count.counter = 1;
 
 #ifdef TT_KERNEL
+	tt_linux_record = ltt_record_nop;
 	tt_linux_freeze = tt_linux_nop;
 	tt_linux_freeze_count = &tt_linux_freeze_no_homa;
 	for (i = 0; i < nr_cpu_ids; i++) {

From c004f6d8458137b4a0274675992ac4e5a689f8ac Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 6 Sep 2024 16:09:01 -0700
Subject: [PATCH 004/625] Update README.md

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 913fd2bc..a417d495 100644
--- a/README.md
+++ b/README.md
@@ -125,6 +125,7 @@ This repo contains an implementation of the Homa transport protocol as a Linux k
   sysctl mechanism. For details, see the man page `homa.7`.
 
 ## Significant recent improvements
+- August 2024: upgraded to Linux 6.10.6.
 - July 2024: introduced "TCP hijacking", where Homa packets are sent as
   legitimate TCP segments (using TCP as the IP protocol) and then reclaimed
   from TCP on the destination. This allows Homa to make better use of
@@ -132,6 +133,7 @@ This repo contains an implementation of the Homa transport protocol as a Linux k
 - June 2024: refactored sk_buff management to use frags; improves
   efficiency significantly.
 - April 2024: replaced `master` branch with `main`
+- July 2023: upgraded to Linux 6.1.38.
 - December 2022: Version 2.0. This includes a new mechanism for managing
   buffer space for incoming messages, which improves throughput by
   50-100% in many situations. In addition, Homa now uses the sendmsg

From 6fbaff0357c5d1d4bb5fae7e71f613d00c4de05b Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 6 Sep 2024 16:10:53 -0700
Subject: [PATCH 005/625] Extend cp_node to handle concurrent Homa and TCP
 traffic

---
 util/cp_node.cc | 577 +++++++++++++++++++++++++++---------------------
 util/cperf.py   |  12 +-
 2 files changed, 327 insertions(+), 262 deletions(-)

diff --git a/util/cp_node.cc b/util/cp_node.cc
index d92da5a6..6c562665 100644
--- a/util/cp_node.cc
+++ b/util/cp_node.cc
@@ -54,9 +54,9 @@ using std::string;
 uint32_t client_max = 1;
 uint32_t client_port_max = 1;
 int client_ports = 0;
-int first_port = 4000;
+int first_port = -1;
 bool is_server = false;
-int id = -1;
+int node_id = -1;
 double net_gbps = 0.0;
 bool tcp_trunc = true;
 bool one_way = false;
@@ -75,7 +75,7 @@ int inet_family = AF_INET;
 int server_core = -1;
 int buf_bpages = 1000;
 
-/* Node ids for clients to send requests to. */
+/* Node ids for each client to send requests to. */
 std::vector<int> server_ids;
 
 /** @rand_gen: random number generator. */
@@ -123,95 +123,76 @@ struct conn_id {
 };
 
 /**
- * @server_addrs: Internet addresses for each of the server threads available
- * to receive a Homa RPC.
- */
-std::vector<sockaddr_in_union> server_addrs;
-
-/**
- * @server_conns: for each entry in @server_addrs, a connection identifier
- * with all fields filled in except client_port, which will be 0.
- */
-std::vector<conn_id> server_conns;
-
-/**
- * @freeze: one entry for each node index; 1 means messages to that
- * node should contain a flag telling the node to freeze its time trace.
- */
-std::vector<int> freeze;
-
-/**
- * @first_id: entry i contains the index in server_addrs of the first
- * entry for the server ports on node i. Used to map from node+port to
- * server id.
- */
-std::vector<int> first_id;
+ * enum protocol - Used to distinguish things using Homa vs. TCP.
+ */
+enum protocol {PROT_HOMA, PROT_TCP};
+#define NUM_PROTOCOLS 2
 
 /** @message_id: used to generate unique identifiers for outgoing messages.*/
 std::atomic<uint32_t> message_id;
 
 /**
  * @last_stats_time: time (in rdtsc cycles) when we last printed
- * staticsics. Zero means that none of the statistics below are valid.
+ * statistics. Zero means that none of the statistics below are valid.
  */
 uint64_t last_stats_time = 0;
 
 /**
  * @last_client_rpcs: total number of client RPCS completed by this
- * application as of the last time we printed statistics.
+ * application for each protocol as of the last time we printed statistics.
 */
-uint64_t last_client_rpcs = 0;
+uint64_t last_client_rpcs[NUM_PROTOCOLS];
 
 /**
  * @last_client_bytes_out: total amount of data in request messages for
- * client RPCS completed by this application as of the last time we printed
- * statistics.
+ * client RPCS completed by this application for each protocol as of the
+ * last time we printed statistics.
 */
-uint64_t last_client_bytes_out = 0;
+uint64_t last_client_bytes_out[NUM_PROTOCOLS];
 
 /**
  * @last_client_bytes_in: total amount of data in response messages for
- * client RPCS completed by this application as of the last time we printed
- * statistics.
+ * client RPCS completed by this application for each protocol as of the
+ * last time we printed statistics.
 */
-uint64_t last_client_bytes_in = 0;
+uint64_t last_client_bytes_in[NUM_PROTOCOLS];
 
 /**
  * @last_total_elapsed: total amount of elapsed time for all client RPCs
- * issued by this application (in units of rdtsc cycles), as of the last
- * time we printed statistics.
+ * issued by this application (in units of rdtsc cycles) for each protocol
+ * as of the last time we printed statistics.
 */
-uint64_t last_total_rtt = 0;
+uint64_t last_total_rtt[NUM_PROTOCOLS];
 
 /**
  * @last_lag: total lag across all clients (measured in rdtsc cycles)
- * as of the last time we printed statistics.
+ * for each protocol as of the last time we printed statistics.
 */
-uint64_t last_lag = 0;
+uint64_t last_lag[NUM_PROTOCOLS];
 
 /**
- * @last_backups: total # of backed-up sends as of the last time we
- * printed statistics.
+ * @last_backups: total # of backed-up sends for each protocol as of the
+ * last time we printed statistics.
 */
-uint64_t last_backups = 0;
+uint64_t last_backups[NUM_PROTOCOLS];
 
 /**
  * @last_server_rpcs: total number of server RPCS handled by this
- * application as of the last time we printed statistics.
+ * application for each protocol as of the last time we printed statistics.
 */
-uint64_t last_server_rpcs = 0;
+uint64_t last_server_rpcs[NUM_PROTOCOLS];
 
 /**
  * @last_server_bytes_in: total amount of data in incoming requests handled by
- * this application as of the last time we printed statistics.
+ * this application for each protocol as of the last time we printed statistics.
 */
-uint64_t last_server_bytes_in = 0;
+uint64_t last_server_bytes_in[NUM_PROTOCOLS];
 
 /**
  * @last_server_bytes_out: total amount of data in responses sent by
- * this application as of the last time we printed statistics.
+ * this application for each protocol as of the last time we printed statistics.
 */
-uint64_t last_server_bytes_out = 0;
+uint64_t last_server_bytes_out[NUM_PROTOCOLS];
 
 /**
  * @last_per_server_rpcs: server->requests for each individual server,
@@ -277,8 +258,8 @@ void print_help(const char *name)
	printf("    --client-max      Maximum number of outstanding requests from a single\n"
	       "                      client machine (divided equally among client ports)\n"
	       "                      (default: %d)\n", client_max);
-	printf("    --first-port      Lowest port number to use for each server (default: %d)\n",
-			first_port);
+	printf("    --first-port      Lowest port number to use for each server (default:\n");
+	printf("                      4000 for Homa, 5000 for TCP)\n");
	printf("    --first-server    Id of first server node (default: 1, meaning node1)\n");
	printf("    --gbps            Target network utilization, including only message data,\n"
	       "                      Gbps; 0 means send continuously (default: %.1f)\n",
@@ -313,7 +294,8 @@ void print_help(const char *name)
	       workload);
	printf("debug value value ... "
	       "Set one or more int64_t values that may be used for\n"
	       "                      various debugging purposes\n\n");
-	printf("dump_times file       Log RTT times (and lengths) to file\n\n");
+	printf("dump_times prot file  Log RTT times (and lengths) for clients running the\n"
+	       "                      protocol given by prot (homa or tcp) to file\n\n");
	printf("exit                  Exit the application\n\n");
	printf("log [options] [msg]   Configure logging as determined by the options. If\n"
	       "                      there is an \"option\" that doesn't start with \"--\",\n"
@@ -326,8 +308,8 @@ void print_help(const char *name)
	printf("    --buf-bpages      Number of bpages to allocate in the buffer pool for\n"
	       "                      incoming messages (default: %d)\n",
	       buf_bpages);
-	printf("    --first-port      Lowest port number to use (default: %d)\n",
-			first_port);
+	printf("    --first-port      Lowest port number to use (default: 4000 for Homa,\n");
+	printf("                      5000 for TCP)\n");
	printf("    --iovec           Use homa_replyv instead of homa_reply\n");
	printf("    --ipv6            Use IPv6 instead of IPv4\n");
	printf("    --pin             All server threads will be restricted to run only\n"
@@ -498,55 +480,6 @@ struct message_header {
	uint32_t msg_id;
 };
 
-/**
- * init_server_addrs() - Set up the server_addrs table (addresses of the
- * server/port combinations that clients will communicate with), based on
- * current configuration parameters. Any previous contents of the table
- * are discarded. This also initializes related arrays @server_ids and
- * @freeze.
- */
-void init_server_addrs(void)
-{
-	server_addrs.clear();
-	server_conns.clear();
-	freeze.clear();
-	first_id.clear();
-	for (int node: server_ids) {
-		char host[100];
-		struct addrinfo hints;
-		struct addrinfo *matching_addresses;
-		sockaddr_in_union *dest;
-
-		if (node == id)
-			continue;
-		snprintf(host, sizeof(host), "node%d", node);
-		memset(&hints, 0, sizeof(struct addrinfo));
-		hints.ai_family = inet_family;
-		hints.ai_socktype = SOCK_DGRAM;
-		int status = getaddrinfo(host, NULL, &hints,
-				&matching_addresses);
-		if (status != 0) {
-			log(NORMAL, "FATAL: couldn't look up address "
-					"for %s: %s\n",
-					host, gai_strerror(status));
-			exit(1);
-		}
-		dest = reinterpret_cast<sockaddr_in_union *>
-				(matching_addresses->ai_addr);
-		while (((int) first_id.size()) < node)
-			first_id.push_back(-1);
-		first_id.push_back((int) server_addrs.size());
-		for (int thread = 0; thread < server_ports; thread++) {
-			dest->in4.sin_port = htons(first_port + thread);
-			server_addrs.push_back(*dest);
-			server_conns.emplace_back(node, thread, id, 0);
-		}
-		while (((int) freeze.size()) <= node)
-			freeze.push_back(0);
-		freeaddrinfo(matching_addresses);
-	}
-}
-
 /**
  * class spin_lock - Implements simple spin lock guards: lock is acquired by
  * constructor, released by destructor.
@@ -938,6 +871,9 @@ bool tcp_connection::xmit()
 */
 class server_metrics {
     public:
+	/** @protocol: Was Homa used by this server, or TCP? */
+	enum protocol protocol;
+
	/** @requests: Total number of requests handled so far. */
	uint64_t requests;
 
@@ -953,7 +889,8 @@ class server_metrics {
	 */
	uint64_t bytes_out;
 
-	server_metrics() :requests(0), bytes_in(0), bytes_out(0) {}
+	server_metrics(enum protocol protocol) : protocol(protocol),
+		requests(0), bytes_in(0), bytes_out(0) {}
 };
 
 /**
@@ -1063,7 +1000,7 @@ homa_server::homa_server(int port, int id, int inet_family,
	}
 
	for (int i = 0; i < num_threads; i++) {
-		server_metrics *thread_metrics = new server_metrics;
+		server_metrics *thread_metrics = new server_metrics(PROT_HOMA);
		metrics.push_back(thread_metrics);
		threads.emplace_back([this, i, thread_metrics] () {
			server(i, thread_metrics);
@@ -1300,7 +1237,7 @@ tcp_server::tcp_server(int port, int id, int num_threads)
		exit(1);
	}
 
-	metrics = new server_metrics;
+	metrics = new server_metrics(PROT_TCP);
	::metrics.push_back(metrics);
 
	for (int i = 0; i < num_threads; i++)
@@ -1555,21 +1492,46 @@ class client {
		rinfo() : start_time(0), request_length(0), active(false) {}
	};
 
-	client(int id);
+	client(enum protocol protocol, int id);
	virtual ~client();
	void check_completion(const char *protocol);
	int get_rinfo();
	void record(uint64_t end_time, message_header *header);
	virtual void stop_sender(void) {}
 
+	/** @protocol: indicates whether Homa or TCP is used by this client. */
+	enum protocol protocol;
+
	/**
	 * @id: unique identifier for this client (index starting at
	 * 0 for the first client).
	 */
	int id;
 
-	/** @num_servers: Number of servers this client will send requests to. */
-	size_t num_servers;
+	/**
+	 * @server_addrs: Internet addresses for each of the server ports
+	 * where this client will send RPCs.
+	 */
+	std::vector<sockaddr_in_union> server_addrs;
+
+	/**
+	 * @server_conns: for each entry in @server_addrs, a connection
+	 * identifier with all fields filled in except client_port, which
+	 * will be 0.
+	 */
+	std::vector<conn_id> server_conns;
+
+	/**
+	 * @freeze: one entry for each node index; 1 means messages to that
+	 * node should contain a flag telling the node to freeze its time trace.
+	 */
+	std::vector<int> freeze;
+
+	/**
+	 * @first_id: entry i contains the index in server_addrs of the first
+	 * entry for the server ports on node i.
+	 */
+	std::vector<int> first_id;
 
	/**
	 * @rinfos: storage for more than enough rinfos to handle all of the
@@ -1672,17 +1634,22 @@ class client {
 std::vector<client *> clients;
 
 /**
- * client::client() - Constructor for client objects.
+ * client::client() - Constructor for client objects. Uses configuration
+ * information from global variables to initialize.
 *
 * @id: Unique identifier for this client (index starting at 0?)
 */
-client::client(int id)
-	: id(id)
-	, num_servers(server_addrs.size())
+client::client(enum protocol protocol, int id)
+	: protocol(protocol)
+	, id(id)
+	, server_addrs()
+	, server_conns()
+	, freeze()
+	, first_id()
	, last_rinfo(0)
	, receivers_running(0)
	, cycles_per_second(get_cycles_per_sec())
-	, server_dist(0, static_cast<int>(num_servers - 1))
+	, server_dist()
	, length_dist(workload, HOMA_MAX_MESSAGE_LENGTH)
	, actual_lengths(NUM_CLIENT_STATS, 0)
	, actual_rtts(NUM_CLIENT_STATS, 0)
@@ -1693,13 +1660,55 @@ client::client(int id)
	, total_rtt(0)
	, lag(0)
 {
+	server_addrs.clear();
+	server_conns.clear();
+	freeze.clear();
+	first_id.clear();
+	for (int node: server_ids) {
+		char host[100];
+		struct addrinfo hints;
+		struct addrinfo *matching_addresses;
+		sockaddr_in_union *dest;
+
+		if (node == node_id)
+			continue;
+		snprintf(host, sizeof(host), "node%d", node);
+		memset(&hints, 0, sizeof(struct addrinfo));
+		hints.ai_family = inet_family;
+		hints.ai_socktype = SOCK_DGRAM;
+		int status = getaddrinfo(host, NULL, &hints,
+				&matching_addresses);
+		if (status != 0) {
+			log(NORMAL, "FATAL: couldn't look up address "
+					"for %s: %s\n",
+					host, gai_strerror(status));
+			exit(1);
+		}
+		dest = reinterpret_cast<sockaddr_in_union *>
+				(matching_addresses->ai_addr);
+		while (((int) first_id.size()) < node)
+			first_id.push_back(-1);
+		first_id.push_back((int) server_addrs.size());
+		for (int thread = 0; thread < server_ports; thread++) {
+			dest->in4.sin_port = htons(first_port + thread);
+			server_addrs.push_back(*dest);
+			server_conns.emplace_back(node, thread, node_id, 0);
+		}
+		while (((int) freeze.size()) <= node)
+			freeze.push_back(0);
+		freeaddrinfo(matching_addresses);
+	}
+
+	server_dist.param(std::uniform_int_distribution<>::param_type(0,
+			static_cast<int>(server_addrs.size() - 1)));
+
	rinfos.resize(2*client_port_max + 5);
	double avg_length = length_dist.get_mean();
	double rate = 1e09*(net_gbps/8.0)/(avg_length*client_ports);
	interval_dist = std::exponential_distribution<double>(rate);
	requests.resize(server_addrs.size());
-	responses = new std::atomic<uint64_t>[num_servers];
-	for (size_t i = 0; i < num_servers; i++)
+	responses = new std::atomic<uint64_t>[server_addrs.size()];
+	for (size_t i = 0; i < server_addrs.size(); i++)
		responses[i] = 0;
	log(NORMAL, "Average message length %.1f KB, rate %.2f K/sec, "
			"expected BW %.1f Gbps\n",
@@ -1889,7 +1898,7 @@ class homa_client : public client {
 * @id: Unique identifier for this client (index starting at 0?)
 */
 homa_client::homa_client(int id)
-	: client(id)
+	: client(PROT_HOMA, id)
	, fd(-1)
	, buf_region(nullptr)
	, buf_size(buf_bpages*HOMA_BPAGE_SIZE)
@@ -2087,7 +2096,7 @@ void homa_client::sender()
					&server_addrs[server], &rpc_id, 0);
		} else
			status = homa_send(fd, sender_buffer, header->length,
-				&server_addrs[server], &rpc_id, 0);
+					&server_addrs[server], &rpc_id, 0);
		if (status < 0) {
			log(NORMAL, "FATAL: error in homa_send: %s (request "
					"length %d)\n", strerror(errno),
@@ -2114,7 +2123,7 @@ void homa_client::receiver(int receiver_id)
 {
	char thread_name[50];
-	snprintf(thread_name, sizeof(thread_name), "R%d.%d", id, receiver_id);
+	snprintf(thread_name, sizeof(thread_name), "R%d.%d", node_id, receiver_id);
	time_trace::thread_buffer thread_buffer(thread_name);
	homa::receiver receiver(fd, buf_region);
 
@@ -2291,7 +2300,7 @@ class tcp_client : public client {
 * @id: Unique identifier for this client (index starting at 0?)
*/ tcp_client::tcp_client(int id) - : client(id) + : client(PROT_TCP, id) , connections() , blocked() , bytes_sent() @@ -2303,8 +2312,8 @@ tcp_client::tcp_client(int id) , receiving_threads() , sending_thread() { - bytes_rcvd = new std::atomic[num_servers]; - for (size_t i = 0; i < num_servers; i++) { + bytes_rcvd = new std::atomic[server_addrs.size()]; + for (size_t i = 0; i < server_addrs.size(); i++) { bytes_sent.push_back(0); bytes_rcvd[i] = 0; } @@ -2582,46 +2591,60 @@ void tcp_client::read(tcp_connection *connection, int pid) */ void server_stats(uint64_t now) { - char details[10000]; - int offset = 0; - int length; - uint64_t server_rpcs = 0; - uint64_t server_bytes_in = 0; - uint64_t server_bytes_out = 0; - details[0] = 0; - for (uint32_t i = 0; i < metrics.size(); i++) { - server_metrics *server = metrics[i]; - server_rpcs += server->requests; - server_bytes_in += server->bytes_in; - server_bytes_out += server->bytes_out; - length = snprintf(details + offset, sizeof(details) - offset, - "%s%lu", (offset != 0) ? " " : "", - server->requests - last_per_server_rpcs[i]); - offset += length; - if (i > last_per_server_rpcs.size()) - printf("last_per_server_rpcs has %lu entries, needs %lu\n", - last_per_server_rpcs.size(), - metrics.size()); - last_per_server_rpcs[i] = server->requests; - } - if ((last_stats_time != 0) && (server_bytes_in != last_server_bytes_in)) { - double elapsed = to_seconds(now - last_stats_time); - double rpcs = (double) (server_rpcs - last_server_rpcs); - double in_delta = (double) (server_bytes_in - - last_server_bytes_in); - double out_delta = (double) (server_bytes_out - - last_server_bytes_out); - log(NORMAL, "Servers: %.2f Kops/sec, %.2f Gbps in, " - "%.2f Gbps out, avg. req. length %.1f bytes\n", - rpcs/(1000.0*elapsed), - 8.0*in_delta/(1e09*elapsed), - 8.0*out_delta/(1e09*elapsed), - in_delta/rpcs); - log(NORMAL, "RPCs per server: %s\n", details); - } - last_server_rpcs = server_rpcs; - last_server_bytes_in = server_bytes_in; - last_server_bytes_out = server_bytes_out; + int prot; + for (prot = PROT_HOMA; prot <= PROT_TCP; prot++) { + char details[10000]; + int offset = 0; + int length; + uint64_t server_rpcs = 0; + uint64_t server_bytes_in = 0; + uint64_t server_bytes_out = 0; + const char *prot_string = (prot == PROT_HOMA) ? "Homa" : "TCP"; + + details[0] = 0; + for (uint32_t i = 0; i < metrics.size(); i++) { + server_metrics *server = metrics[i]; + if (server->protocol != prot) + continue; + server_rpcs += server->requests; + server_bytes_in += server->bytes_in; + server_bytes_out += server->bytes_out; + length = snprintf(details + offset, + sizeof(details) - offset, + "%s%lu", (offset != 0) ? " " : "", + server->requests - last_per_server_rpcs[i]); + offset += length; + if (i > last_per_server_rpcs.size()) + printf("last_per_server_rpcs has %lu " + "entries, needs %lu\n", + last_per_server_rpcs.size(), + metrics.size()); + last_per_server_rpcs[i] = server->requests; + } + if ((last_stats_time != 0) && (server_bytes_in + != last_server_bytes_in[prot])) { + double elapsed = to_seconds(now - last_stats_time); + double rpcs = (double) (server_rpcs + - last_server_rpcs[prot]); + double in_delta = (double) (server_bytes_in + - last_server_bytes_in[prot]); + double out_delta = (double) (server_bytes_out + - last_server_bytes_out[prot]); + log(NORMAL, "%s servers: %.2f Kops/sec, %.2f Gbps in, " + "%.2f Gbps out, avg. req. 
length " + "%.1f bytes\n", + prot_string, + rpcs/(1000.0*elapsed), + 8.0*in_delta/(1e09*elapsed), + 8.0*out_delta/(1e09*elapsed), + in_delta/rpcs); + log(NORMAL, "RPCs per %s server: %s\n", prot_string, + details); + } + last_server_rpcs[prot] = server_rpcs; + last_server_bytes_in[prot] = server_bytes_in; + last_server_bytes_out[prot] = server_bytes_out; + } } /** @@ -2633,95 +2656,117 @@ void server_stats(uint64_t now) void client_stats(uint64_t now) { #define CDF_VALUES 100000 - uint64_t client_rpcs = 0; - uint64_t request_bytes = 0; - uint64_t response_bytes = 0; - uint64_t total_rtt = 0; - uint64_t lag = 0; - uint64_t outstanding_rpcs = 0; - uint64_t cdf_times[CDF_VALUES]; - uint64_t backups = 0; - int times_per_client; - int cdf_index = 0; - - if (clients.size() == 0) - return; + int prot; + for (prot = PROT_HOMA; prot <= PROT_TCP; prot++) { + uint64_t client_rpcs = 0; + uint64_t request_bytes = 0; + uint64_t response_bytes = 0; + uint64_t total_rtt = 0; + uint64_t lag = 0; + uint64_t outstanding_rpcs = 0; + uint64_t cdf_times[CDF_VALUES]; + uint64_t backups = 0; + int num_clients = 0; + int times_per_client; + int cdf_index = 0; + const char *prot_string = (prot == PROT_HOMA) ? "Homa" : "TCP"; + + for (client *client: clients) { + if (client->protocol == prot) + num_clients++; + } + if (num_clients == 0) + continue; - times_per_client = CDF_VALUES/clients.size(); - if (times_per_client > NUM_CLIENT_STATS) - times_per_client = NUM_CLIENT_STATS; - for (client *client: clients) { - for (size_t i = 0; i < client->num_servers; i++) - client_rpcs += client->responses[i]; - request_bytes += client->request_bytes; - response_bytes += client->response_bytes; - total_rtt += client->total_rtt; - lag += client->lag; - outstanding_rpcs += client->total_requests - - client->total_responses; - for (int i = 1; i <= times_per_client; i++) { - /* Collect the most recent RTTs from the client for - * computing a CDF. - */ - int src = (client->total_responses - i) - % NUM_CLIENT_STATS; - if (client->actual_rtts[src] == 0) { - /* Client hasn't accumulated times_per_client - * entries yet; just use what it has. */ - break; + times_per_client = CDF_VALUES/num_clients; + if (times_per_client > NUM_CLIENT_STATS) + times_per_client = NUM_CLIENT_STATS; + for (client *client: clients) { + if (client->protocol != prot) + continue; + for (size_t i = 0; i < client->server_addrs.size(); i++) + client_rpcs += client->responses[i]; + request_bytes += client->request_bytes; + response_bytes += client->response_bytes; + total_rtt += client->total_rtt; + lag += client->lag; + outstanding_rpcs += client->total_requests + - client->total_responses; + for (int i = 1; i <= times_per_client; i++) { + /* Collect the most recent RTTs from the client + * for computing a CDF. + */ + int src = (client->total_responses - i) + % NUM_CLIENT_STATS; + if (client->actual_rtts[src] == 0) { + /* Client hasn't accumulated + * times_per_client entries yet; just + * use what it has. 
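+					 * (The CDF is then simply computed
+					 * from fewer samples.)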
+ */ + break; + } + cdf_times[cdf_index] = client->actual_rtts[src]; + cdf_index++; } - cdf_times[cdf_index] = client->actual_rtts[src]; - cdf_index++; + tcp_client *tclient = dynamic_cast(client); + if (tclient) + backups += tclient->backups; } - tcp_client *tclient = dynamic_cast(client); - if (tclient) - backups += tclient->backups; - } - std::sort(cdf_times, cdf_times + cdf_index); - if ((last_stats_time != 0) && ((request_bytes != last_client_bytes_out) - || (outstanding_rpcs != 0))){ - double elapsed = to_seconds(now - last_stats_time); - double rpcs = (double) (client_rpcs - last_client_rpcs); - double delta_out = (double) (request_bytes - - last_client_bytes_out); - double delta_in = (double) (response_bytes - - last_client_bytes_in); - log(NORMAL, "Clients: %.2f Kops/sec, %.2f Gbps out, " - "%.2f Gbps in, RTT (us) P50 %.2f P99 %.2f " - "P99.9 %.2f, avg. req. length %.1f bytes\n", - rpcs/(1000.0*elapsed), - 8.0*delta_out/(1e09*elapsed), - 8.0*delta_in/(1e09*elapsed), - to_seconds(cdf_times[cdf_index/2])*1e06, - to_seconds(cdf_times[99*cdf_index/100])*1e06, - to_seconds(cdf_times[999*cdf_index/1000])*1e06, - delta_out/rpcs); - double lag_fraction; - if (lag > last_lag) - lag_fraction = (to_seconds(lag - last_lag)/elapsed) - / clients.size(); - else - lag_fraction = -(to_seconds(last_lag - lag)/elapsed) - / clients.size(); - if (lag_fraction >= .01) - log(NORMAL, "Lag due to overload: %.1f%%\n", - lag_fraction*100.0); - if (backups != 0) { - log(NORMAL, "Backed-up sends: %lu/%lu (%.1f%%)\n", - backups - last_backups, - client_rpcs - last_client_rpcs, - 100.0*(backups - last_backups) - /(client_rpcs - last_client_rpcs)); + if (num_clients == 0) + continue; + std::sort(cdf_times, cdf_times + cdf_index); + if ((last_stats_time != 0) && ((request_bytes + != last_client_bytes_out[prot]) + || (outstanding_rpcs != 0))){ + double elapsed = to_seconds(now - last_stats_time); + double rpcs = (double) (client_rpcs - last_client_rpcs[prot]); + double delta_out = (double) (request_bytes + - last_client_bytes_out[prot]); + double delta_in = (double) (response_bytes + - last_client_bytes_in[prot]); + log(NORMAL, "%s clients: %.2f Kops/sec, %.2f Gbps out, " + "%.2f Gbps in, RTT (us) P50 %.2f " + "P99 %.2f P99.9 %.2f, avg. req. 
length " + "%.1f bytes\n", + prot_string, + rpcs/(1000.0*elapsed), + 8.0*delta_out/(1e09*elapsed), + 8.0*delta_in/(1e09*elapsed), + to_seconds(cdf_times[cdf_index/2])*1e06, + to_seconds(cdf_times[99*cdf_index/100])*1e06, + to_seconds(cdf_times[999*cdf_index/1000])*1e06, + delta_out/rpcs); + double lag_fraction; + if (lag > last_lag[prot]) + lag_fraction = (to_seconds(lag + - last_lag[prot])/elapsed) + / num_clients; + else + lag_fraction = -(to_seconds(last_lag[prot] + - lag)/elapsed) / num_clients; + if (lag_fraction >= .01) + log(NORMAL, "%s lag due to overload: %.1f%%\n", + prot_string, + lag_fraction*100.0); + if (backups != 0) { + log(NORMAL, "Backed-up %s sends: %lu/%lu (%.1f%%)\n", + prot_string, + backups - last_backups[prot], + client_rpcs - last_client_rpcs[prot], + 100.0*(backups - last_backups[prot]) + /(client_rpcs - last_client_rpcs[prot])); + } } + if (outstanding_rpcs != 0) + log(NORMAL, "Outstanding %s client RPCs: %lu\n", + prot_string, outstanding_rpcs); + last_client_rpcs[prot] = client_rpcs; + last_client_bytes_out[prot] = request_bytes; + last_client_bytes_in[prot] = response_bytes; + last_total_rtt[prot] = total_rtt; + last_lag[prot] = lag; + last_backups[prot] = backups; } - if (outstanding_rpcs != 0) - log(NORMAL, "Outstanding client RPCs: %lu\n", outstanding_rpcs); - last_client_rpcs = client_rpcs; - last_client_bytes_out = request_bytes; - last_client_bytes_in = response_bytes; - last_total_rtt = total_rtt; - last_lag = lag; - last_backups = backups; } /** @@ -2757,7 +2802,7 @@ int client_cmd(std::vector &words) client_iovec = false; client_max = 1; client_ports = 1; - first_port = 4000; + first_port = -1; inet_family = AF_INET; net_gbps = 0.0; port_receivers = 1; @@ -2791,7 +2836,7 @@ int client_cmd(std::vector &words) return 0; i++; } else if (strcmp(option, "--id") == 0) { - if (!parse(words, i+1, &id, option, "integer")) + if (!parse(words, i+1, &node_id, option, "integer")) return 0; i++; } else if (strcmp(option, "--iovec") == 0) { @@ -2855,7 +2900,7 @@ int client_cmd(std::vector &words) } /* Figure out which nodes to use for servers (--servers, - * --server-ports, --first-server). + * --server-nodes, --first-server). */ server_ids.clear(); if (!servers.empty()) { @@ -2879,17 +2924,21 @@ int client_cmd(std::vector &words) server_ids.push_back(first_server + i); } - init_server_addrs(); client_port_max = client_max/client_ports; if (client_port_max < 1) client_port_max = 1; /* Create clients. 
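+	 * Protocol-specific default ports (4000 for Homa, 5000 for TCP)
+	 * are chosen below when --first-port was not specified.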
*/ for (int i = 0; i < client_ports; i++) { - if (strcmp(protocol, "homa") == 0) + if (strcmp(protocol, "homa") == 0) { + if (first_port == -1) + first_port = 4000; clients.push_back(new homa_client(i)); - else + } else { + if (first_port == -1) + first_port = 5000; clients.push_back(new tcp_client(i)); + } } last_stats_time = 0; time_trace::cleanup(); @@ -2931,12 +2980,22 @@ int dump_times_cmd(std::vector &words) FILE *f; time_t now; char time_buffer[100]; + int prot; - if (words.size() != 2) { - printf("Wrong # args; must be 'dump_times file'\n"); + if (words.size() != 3) { + printf("Wrong # args; must be 'dump_times protocol file'\n"); return 0; } - f = fopen(words[1].c_str(), "w"); + if (words[1] == "homa") + prot = PROT_HOMA; + else if (words[1] == "tcp") + prot = PROT_TCP; + else { + printf("Unknown protocol %s: must be homa or tcp\n", + words[1].c_str()); + return 0; + } + f = fopen(words[2].c_str(), "w"); if (f == NULL) { printf("Couldn't open file %s: %s\n", words[1].c_str(), strerror(errno)); @@ -2954,6 +3013,8 @@ int dump_times_cmd(std::vector &words) server_ids.size(), server_ports, client_max); fprintf(f, "# Length RTT (usec)\n"); for (client *client: clients) { + if (client->protocol != prot) + continue; __u32 start = client->total_responses % NUM_CLIENT_STATS; __u32 i = start; while (1) { @@ -3087,7 +3148,7 @@ int log_cmd(std::vector &words) int server_cmd(std::vector &words) { buf_bpages = 1000; - first_port = 4000; + first_port = -1; inet_family = AF_INET; protocol = "homa"; port_threads = 1; @@ -3138,12 +3199,16 @@ int server_cmd(std::vector &words) } if (strcmp(protocol, "homa") == 0) { + if (first_port == -1) + first_port = 4000; for (int i = 0; i < server_ports; i++) { homa_server *server = new homa_server(first_port + i, i, inet_family, port_threads); homa_servers.push_back(server); } } else { + if (first_port == -1) + first_port = 5000; for (int i = 0; i < server_ports; i++) { tcp_server *server = new tcp_server(first_port + i, i, port_threads); diff --git a/util/cperf.py b/util/cperf.py index 9969a4c6..508abbb4 100644 --- a/util/cperf.py +++ b/util/cperf.py @@ -704,7 +704,7 @@ def run_experiment(name, clients, options): for id in exp_nodes: do_subprocess(["ssh", "node%d" % (id), "metrics.py"]) if not "no_rtt_files" in options: - do_cmd("dump_times /dev/null", clients) + do_cmd("dump_times %s /dev/null" % (options.protocol), clients) do_cmd("log Starting %s experiment" % (name), server_nodes, clients) debug_delay = 0 if debug_delay > 0: @@ -720,7 +720,7 @@ def run_experiment(name, clients, options): do_cmd("log Ending %s experiment" % (name), server_nodes, clients) log("Retrieving data for %s experiment" % (name)) if not "no_rtt_files" in options: - do_cmd("dump_times rtts", clients) + do_cmd("dump_times %s rtts" % (options.protocol), clients) if (options.protocol == "homa") and not "unloaded" in options: vlog("Recording final metrics from nodes %s" % (exp_nodes)) for id in exp_nodes: @@ -777,12 +777,12 @@ def scan_log(file, node, experiments): experiment = "" if experiment != "": gbps = -1.0 - match = re.match('.*Clients: ([0-9.]+) Kops/sec, ' + match = re.match('.*clients: ([0-9.]+) Kops/sec, ' '([0-9.]+) Gbps.*P50 ([0-9.]+)', line) if match: gbps = float(match.group(2)) else: - match = re.match('.*Clients: ([0-9.]+) Kops/sec, ' + match = re.match('.*clients: ([0-9.]+) Kops/sec, ' '([0-9.]+) MB/sec.*P50 ([0-9.]+)', line) if match: gbps = 8.0*float(match.group(2)) @@ -799,12 +799,12 @@ def scan_log(file, node, experiments): continue gbps = -1.0 - match = 
re.match('.*Servers: ([0-9.]+) Kops/sec, ' + match = re.match('.*servers: ([0-9.]+) Kops/sec, ' '([0-9.]+) Gbps', line) if match: gbps = float(match.group(2)) else: - match = re.match('.*Servers: ([0-9.]+) Kops/sec, ' + match = re.match('.*servers: ([0-9.]+) Kops/sec, ' '([0-9.]+) MB/sec', line) if match: gbps = 8.0*float(match.group(2)) From f028e5398b3740f763d156ac776bacd75703dc40 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 6 Sep 2024 17:01:43 -0700 Subject: [PATCH 006/625] Add --skip-unloaded option in cp_vs_tcp (true by default) --- util/cp_vs_tcp | 30 +++++++++++++++++++----------- util/cperf.py | 4 ++-- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/util/cp_vs_tcp b/util/cp_vs_tcp index 6258a457..32cab54b 100755 --- a/util/cp_vs_tcp +++ b/util/cp_vs_tcp @@ -25,6 +25,9 @@ parser.add_argument('--servers', dest='num_servers', type=int, metavar='count', "and this value indicates the number of server nodes; all other " "nodes will be clients. If 0, each node runs both a client and a " "server (default: 0)") +parser.add_argument('--skip-unloaded', dest='skip_unloaded', type=boolean, + default=True, help="Boolean value:: true means don't measure" + "Homa latency under low (default: true)") options = parser.parse_args() init(options) @@ -59,15 +62,16 @@ if not options.plot_only: dctcp_exp = "dctcp_" + workload try: options.protocol = "homa" - start_servers(options.nodes[1:2], options) - o = copy.deepcopy(options) - o.gbps = 0.0 - o.client_ports = 1 - o.client_max = 1 - o.server_ports = 1 - o.unloaded = 500 - run_experiment(unloaded_exp, options.nodes[0:1], o) + if not options.skip_unloaded: + start_servers(options.nodes[1:2], options) + o = copy.deepcopy(options) + o.gbps = 0.0 + o.client_ports = 1 + o.client_max = 1 + o.server_ports = 1 + o.unloaded = 500 + run_experiment(unloaded_exp, options.nodes[0:1], o) start_servers(options.servers, options) run_experiment(homa_exp, options.clients, options) @@ -104,7 +108,8 @@ for workload, bw, seconds in load_info: dctcp_exp = "dctcp_" + workload scan_metrics(homa_exp) - set_unloaded(unloaded_exp) + if not options.skip_unloaded: + set_unloaded(unloaded_exp) # Generate slowdown plot. log("Generating slowdown plot for %s" % (workload)) @@ -125,7 +130,8 @@ for workload, bw, seconds in load_info: # Generate CDF of small-message RTTs. log("Generating short message CDF for %s" % (workload)) - unloaded_x, unloaded_y = get_short_cdf(unloaded_exp) + if not options.skip_unloaded: + unloaded_x, unloaded_y = get_short_cdf(unloaded_exp) homa_x, homa_y = get_short_cdf(homa_exp) if options.tcp: tcp_x, tcp_y = get_short_cdf(tcp_exp) @@ -138,6 +144,8 @@ for workload, bw, seconds in load_info: if options.dctcp: plt.plot(dctcp_x, dctcp_y, label="DCTCP", color=dctcp_color) plt.plot(homa_x, homa_y, label="Homa", color=homa_color) - plt.plot(unloaded_x, unloaded_y, label="Homa best case", color=unloaded_color) + if not options.skip_unloaded: + plt.plot(unloaded_x, unloaded_y, label="Homa best case", + color=unloaded_color) plt.legend(loc="upper right", prop={'size': 9}) plt.savefig("%s/reports/short_cdf_%s.pdf" % (options.log_dir, workload)) diff --git a/util/cperf.py b/util/cperf.py index 508abbb4..06cefb70 100644 --- a/util/cperf.py +++ b/util/cperf.py @@ -1142,7 +1142,7 @@ def get_digest(experiment): log("Outlier alt-slowdown in %s: %.1f vs. 
%.1f overall average" % (info[0], info[1], overall_avg)) - if len(unloaded_p50) == 0: + if old_slowdown and (len(unloaded_p50) == 0): raise Exception("No unloaded data: must invoke set_unloaded") rtts = digest["rtts"] @@ -1184,7 +1184,7 @@ def get_digest(experiment): next_bucket += 1 if old_slowdown: optimal = unloaded_p50[length] - elif length in unloaded_p50: + else: optimal = 15 + length*8/link_mbps bucket_count += len(rtts[length]) for rtt in rtts[length]: From cd4e50730315c28714b295327b9a9e973bcc8f96 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 11 Sep 2024 15:31:43 -0700 Subject: [PATCH 007/625] Created util/cp_both benchmark * Also introduced experiment names into cp_node * Added run_experiments function to cperf.py --- util/cp_both | 84 +++++++++++ util/cp_node.cc | 372 +++++++++++++++++++++++++++------------------- util/cp_vs_tcp | 12 +- util/cperf.py | 382 ++++++++++++++++++++++++++++++++++-------------- 4 files changed, 586 insertions(+), 264 deletions(-) create mode 100755 util/cp_both diff --git a/util/cp_both b/util/cp_both new file mode 100755 index 00000000..ad945b92 --- /dev/null +++ b/util/cp_both @@ -0,0 +1,84 @@ +#!/usr/bin/python3 + +# Copyright (c) 2024 Homa Developers +# SPDX-License-Identifier: BSD-1-Clause + +# This cperf benchmark runs both TCP and Homa on each client and server +# node in order to measure interference between the protocols. +# Type "cp_both --help" for documentation. + +from cperf import * + +for option in ['client_max', 'client_ports', 'port_threads', 'server_ports', + 'tcp_client_ports', 'tcp_server_ports']: + default_defaults[option] = (default_defaults[option]+1)/2 +parser = get_parser(description= + 'Measures slowdown when TCP and Homa are competing for resources ' + 'on the same nodes.', + usage='%(prog)s [options]', defaults={'homa_gbps': 0}) +parser.add_argument('--homa-gbps', type=float, dest='homa_gbps', + metavar='B', default=0, + help='Configure Homa to generate B Gbps of total outgoing bandwidth ' + 'on each node (clients and servers combined); the remainder of ' + '--gbps will be generated by TCP. 0 means split --gbps between ' + 'Homa and TCP (default: 0)') +default_defaults['client_max'] +options = parser.parse_args() +init(options) + +# First, run the experiment +if not options.plot_only: + homa_options = copy.deepcopy(options) + homa_options.name = "homa_" + options.workload + homa_options.protocol = "homa" + + tcp_options = copy.deepcopy(options) + tcp_options.name = "tcp_" + options.workload + tcp_options.protocol = "tcp" + + if not options.homa_gbps: + homa_options.gbps = options.gbps/4.0 + tcp_options.gbps = homa_options.gbps + else: + tcp_options.gbps = (options.gbps - options.homa_gbps)/2 + if tcp_options.gbps < 0: + tcp_options.gbps = 0 + homa_options.gbps = options.gbps/2 - tcp_options.gbps + try: + run_experiments(homa_options, tcp_options) + except Exception as e: + log(traceback.format_exc()) + log("Stopping nodes") + stop_nodes() + scan_logs() + +# Generate plots and reports + homa_exp = "homa_" + options.workload + scan_metrics(homa_exp) + tcp_exp = "tcp_" + options.workload + scan_metrics(tcp_exp) + + # Generate slowdown plot. 
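+    # (The slowdown for a message is its measured RTT divided by the
+    # best-case RTT for a message of the same length.)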
+    log("Generating slowdown plot for %s" % (options.workload))
+    title = "TCP and Homa together %s %d nodes, %.1f Gbps" % (
+            options.workload.capitalize(), options.num_nodes, options.gbps)
+    ax = start_slowdown_plot(title, 1000, homa_exp)
+    plot_slowdown(ax, tcp_exp, "p99", "TCP P99", color=tcp_color)
+    plot_slowdown(ax, tcp_exp, "p50", "TCP P50", color=tcp_color2)
+    plot_slowdown(ax, homa_exp, "p99", "Homa P99", color=homa_color)
+    plot_slowdown(ax, homa_exp, "p50", "Homa P50", color=homa_color2)
+    ax.legend(loc="upper right", prop={'size': 9})
+    plt.tight_layout()
+    plt.savefig("%s/reports/both_%s.pdf" % (options.log_dir, options.workload))
+
+    # Generate CDF of small-message RTTs.
+    log("Generating short message CDF for %s" % (options.workload))
+    homa_x, homa_y = get_short_cdf(homa_exp)
+    tcp_x, tcp_y = get_short_cdf(tcp_exp)
+    start_cdf_plot(title, 10, 0.99e05, 1e-05, "RTT (usecs)",
+            "Cumulative Fraction Short Messages")
+    plt.plot(tcp_x, tcp_y, label="TCP", color=tcp_color)
+    plt.plot(homa_x, homa_y, label="Homa", color=homa_color)
+    plt.legend(loc="upper right", prop={'size': 9})
+    plt.savefig("%s/reports/short_cdf_%s.pdf" % (options.log_dir,
+            options.workload))
diff --git a/util/cp_node.cc b/util/cp_node.cc
index 6c562665..da4fd364 100644
--- a/util/cp_node.cc
+++ b/util/cp_node.cc
@@ -122,15 +122,15 @@ struct conn_id {
 	}
 };

-/**
- * enum protocol - Used to distinguish things using Homa vs. TCP.
-*/
-enum protocol {PROT_HOMA, PROT_TCP};
-#define NUM_PROTOCOLS 2
-
 /** @message_id: used to generate unique identifiers for outgoing messages.*/
 std::atomic<uint64_t> message_id;

+/**
+ * @experiments: names of all known experiments (may include some that are
+ * no longer in use)
+ */
+std::vector<std::string> experiments;
+
 /**
  * @last_stats_time: time (in rdtsc cycles) when we last printed
  * statistics. Zero means that none of the statistics below are valid.
@@ -138,61 +138,66 @@ std::atomic<uint64_t> message_id;
 uint64_t last_stats_time = 0;

 /**
- * @last_client_rpcs: total number of client RPCs completed by this
- * application for each protocol as of the last time we printed statistics.
+ * @last_client_rpcs: entries correspond to @experiments; total number of
+ * client RPCs completed by that experiment as of the last time we printed
+ * statistics.
  */
-uint64_t last_client_rpcs[NUM_PROTOCOLS];
+std::vector<uint64_t> last_client_rpcs;

 /**
- * @last_client_bytes_out: total amount of data in request messages for
- * client RPCs completed by this application for each protocol as of the
- * last time we printed statistics.
+ * @last_client_bytes_out: entries correspond to @experiments; total amount
+ * of data sent in request messages by client RPCs in that experiment as
+ * of the last time we printed statistics.
  */
-uint64_t last_client_bytes_out[NUM_PROTOCOLS];
+std::vector<uint64_t> last_client_bytes_out;

 /**
- * @last_client_bytes_in: total amount of data in response messages for
- * client RPCs completed by this application for each protocol as of the
- * last time we printed statistics.
+ * @last_client_bytes_in: entries correspond to @experiments; total
+ * amount of data received in response messages for client RPCs in that
+ * experiment as of the last time we printed statistics.
  */
-uint64_t last_client_bytes_in[NUM_PROTOCOLS];
+std::vector<uint64_t> last_client_bytes_in;

 /**
- * @last_total_elapsed: total amount of elapsed time for all client RPCs
- * issued by this application (in units of rdtsc cycles) for each protocol
+ * @last_total_rtt: entries correspond to @experiments; total amount of
+ * elapsed time for all client RPCs in that experiment (units of rdtsc cycles)
  * as of the last time we printed statistics.
  */
-uint64_t last_total_rtt[NUM_PROTOCOLS];
+std::vector<uint64_t> last_total_rtt;

 /**
- * @last_lag: total lag across all clients (measured in rdtsc cycles)
- * for each protocol as of the last time we printed statistics.
+ * @last_lag: entries correspond to @experiments; total lag (measured in rdtsc
+ * cycles) for all clients in that experiment, as of the last time we printed
+ * statistics.
  */
-uint64_t last_lag[NUM_PROTOCOLS];
+std::vector<uint64_t> last_lag;

 /**
- * @last_backups: total # of backed-up sends for each protocol as of the
- * last time we printed statistics.
+ * @last_backups: entries correspond to @experiments; total # of backed-up
+ * sends for client RPCs issued by that experiment as of the last time we
+ * printed statistics.
  */
-uint64_t last_backups[NUM_PROTOCOLS];
+std::vector<uint64_t> last_backups;

 /**
- * @last_server_rpcs: total number of server RPCs handled by this
- * application for each protocol as of the last time we printed statistics.
+ * @last_server_rpcs: entries correspond to @experiments; total # of server
+ * RPCs handled by that experiment as of the last time we printed statistics.
  */
-uint64_t last_server_rpcs[NUM_PROTOCOLS];
+std::vector<uint64_t> last_server_rpcs;

 /**
- * @last_server_bytes_in: total amount of data in incoming requests handled by
- * this application for each protocol as of the last time we printed statistics.
+ * @last_server_bytes_in: entries correspond to @experiments; total amount
+ * of data in incoming requests handled by that experiment as of the last
+ * time we printed statistics.
  */
-uint64_t last_server_bytes_in[NUM_PROTOCOLS];
+std::vector<uint64_t> last_server_bytes_in;

 /**
- * @last_server_bytes_out: total amount of data in responses sent by
- * this application for each protocol as of the last time we printed statistics.
+ * @last_server_bytes_out: entries correspond to @experiments; total amount
+ * of data in responses sent by that experiment as of the last time we printed
+ * statistics.
  */
-uint64_t last_server_bytes_out[NUM_PROTOCOLS];
+std::vector<uint64_t> last_server_bytes_out;

 /**
  * @last_per_server_rpcs: server->requests for each individual server,
@@ -258,6 +263,9 @@ void print_help(const char *name)
 	printf(" --client-max Maximum number of outstanding requests from a single\n"
 			" client machine (divided equally among client ports)\n"
 			" (default: %d)\n",
 			client_max);
+	printf(" --exp Name of the experiment in which these client threads\n");
+	printf(" will be participating; used to label measurement data\n");
+	printf(" (defaults to <protocol>_<workload>)\n");
 	printf(" --first-port Lowest port number to use for each server (default: \n");
 	printf(" 4000 for Homa, 5000 for TCP)\n");
 	printf(" --first-server Id of first server node (default: 1, meaning node1)\n");
@@ -294,8 +302,9 @@ void print_help(const char *name)
 			workload);
 	printf("debug value value ...
Set one or more int64_t values that may be used for\n"
 			" various debugging purposes\n\n");
-	printf("dump_times prot file Log RTT times (and lengths) for clients running the\n"
-			" protocol given by prot (homa or tcp) to file\n\n");
+	printf("dump_times file [exp] Log RTT times (and lengths) for clients running\n");
+	printf(" experiment exp to file; if exp is omitted, dump\n");
+	printf(" all RTTs\n\n");
 	printf("exit Exit the application\n\n");
 	printf("log [options] [msg] Configure logging as determined by the options. If\n"
 			" there is an \"option\" that doesn't start with \"--\",\n"
@@ -308,6 +317,9 @@
 	printf(" --buf-bpages Number of bpages to allocate in the buffer pool for\n"
 			" incoming messages (default: %d)\n",
 			buf_bpages);
+	printf(" --exp Name of the experiment in which these server ports\n");
+	printf(" will be participating; used to label measurement data\n");
+	printf(" (defaults to <protocol>_<workload>)\n");
 	printf(" --first-port Lowest port number to use (default: 4000 for Homa,\n");
 	printf(" 5000 for TCP)\n");
 	printf(" --iovec Use homa_replyv instead of homa_reply\n");
@@ -871,8 +883,8 @@ bool tcp_connection::xmit()
 */
class server_metrics {
    public:
-	/** @protocol: Was Homa used by this server, or TCP? */
-	enum protocol protocol;
+	/** @experiment: Name of experiment for this server thread */
+	std::string experiment;

 	/** @requests: Total number of requests handled so far. */
 	uint64_t requests;
@@ -889,7 +901,7 @@ class server_metrics {
 	 */
 	uint64_t bytes_out;

-	server_metrics(enum protocol protocol) : protocol(protocol),
+	server_metrics(std::string& experiment) : experiment(experiment),
 			requests(0), bytes_in(0), bytes_out(0) {}
 };

@@ -906,7 +918,8 @@ std::vector<server_metrics *> metrics;
 */
class homa_server {
    public:
-	homa_server(int port, int id, int inet_family, int num_threads);
+	homa_server(int port, int id, int inet_family, int num_threads,
+			std::string& experiment);
 	~homa_server();
 	void server(int thread_id, server_metrics *metrics);

@@ -919,6 +932,9 @@ class homa_server {
 	/** @port: Homa port number managed by this object. */
 	int port;

+	/** @experiment: name of the experiment this server is running. */
+	string experiment;
+
 	/**
 	 * @buf_region: mmapped region of memory in which receive buffers
 	 * are allocated.
@@ -944,12 +960,14 @@ std::vector<homa_server *> homa_servers;
 * @inet_family: AF_INET or AF_INET6: determines whether we use IPv4 or IPv6.
 * @num_threads: How many threads should collectively service requests on
 * @port.
+ * @experiment: Name of the experiment in which this server is participating.
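+ *     The name is added to the global @experiments list if it is not
+ *     already present.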
*/ -homa_server::homa_server(int port, int id, int inet_family, - int num_threads) +homa_server::homa_server(int port, int id, int inet_family, int num_threads, + std::string& experiment) : id(id) , fd(-1) , port(port) + , experiment(experiment) , buf_region(NULL) , buf_size(0) , threads() @@ -957,6 +975,10 @@ homa_server::homa_server(int port, int id, int inet_family, sockaddr_in_union addr; struct homa_set_buf_args arg; + if (std::find(experiments.begin(), experiments.end(), experiment) + == experiments.end()) + experiments.emplace_back(experiment); + fd = socket(inet_family, SOCK_DGRAM, IPPROTO_HOMA); if (fd < 0) { log(NORMAL, "FATAL: homa_server couldn't open Homa " @@ -1000,7 +1022,7 @@ homa_server::homa_server(int port, int id, int inet_family, } for (int i = 0; i < num_threads; i++) { - server_metrics *thread_metrics = new server_metrics(PROT_HOMA); + server_metrics *thread_metrics = new server_metrics(experiment); metrics.push_back(thread_metrics); threads.emplace_back([this, i, thread_metrics] () { server(i, thread_metrics); @@ -1105,7 +1127,7 @@ void homa_server::server(int thread_id, server_metrics *metrics) */ class tcp_server { public: - tcp_server(int port, int id, int num_threads); + tcp_server(int port, int id, int num_threads, std::string& experiment); ~tcp_server(); void accept(int epoll_fd); void read(int fd, int pid); @@ -1123,6 +1145,9 @@ class tcp_server { /** @id: Unique identifier for this server. */ int id; + /** @experiment: name of the experiment this server is running. */ + string experiment; + /** @listen_fd: File descriptor for the listen socket. */ int listen_fd; @@ -1166,8 +1191,10 @@ std::vector tcp_servers; * @id: Unique identifier for this server. * @num_threads: Number of threads to service this listening socket and * all of the other sockets excepted from it. + * @experiment: Name of the experiment in which this server is participating. */ -tcp_server::tcp_server(int port, int id, int num_threads) +tcp_server::tcp_server(int port, int id, int num_threads, + std::string& experiment) : mutex(0) , port(port) , id(id) @@ -1179,6 +1206,10 @@ tcp_server::tcp_server(int port, int id, int num_threads) , threads() , stop(false) { + if (std::find(experiments.begin(), experiments.end(), experiment) + == experiments.end()) + experiments.emplace_back(experiment); + memset(connections, 0, sizeof(connections)); listen_fd = socket(inet_family, SOCK_STREAM, 0); if (listen_fd == -1) { @@ -1237,7 +1268,7 @@ tcp_server::tcp_server(int port, int id, int num_threads) exit(1); } - metrics = new server_metrics(PROT_TCP); + metrics = new server_metrics(experiment); ::metrics.push_back(metrics); for (int i = 0; i < num_threads; i++) @@ -1492,22 +1523,22 @@ class client { rinfo() : start_time(0), request_length(0), active(false) {} }; - client(enum protocol protocol, int id); + client(int id, std::string& experiment); virtual ~client(); void check_completion(const char *protocol); int get_rinfo(); void record(uint64_t end_time, message_header *header); virtual void stop_sender(void) {} - /** @protocol: indicates whether Homa or TCP is used by this client. */ - enum protocol protocol; - /** * @id: unique identifier for this client (index starting at * 0 for the first client. */ int id; + /** @experiment: name of the experiment this client is running. */ + string experiment; + /** * @server_addrs: Internet addresses for each of the server ports * where this client will send RPCs. @@ -1637,11 +1668,12 @@ std::vector clients; * client::client() - Constructor for client objects. 
Uses configuration
 * information from global variables to initialize.
 *
- * @id: Unique identifier for this client (index starting at 0?)
+ * @id: Unique identifier for this client (index starting at 0?)
+ * @experiment: Name of experiment in which this client will participate.
 */
-client::client(enum protocol protocol, int id)
-	: protocol(protocol)
-	, id(id)
+client::client(int id, std::string& experiment)
	: id(id)
+	, experiment(experiment)
 	, server_addrs()
 	, server_conns()
 	, freeze()
 	, first_id()
@@ -1660,6 +1692,10 @@ client::client(enum protocol protocol, int id)
 	, total_rtt(0)
 	, lag(0)
{
+	if (std::find(experiments.begin(), experiments.end(), experiment)
+			== experiments.end())
+		experiments.emplace_back(experiment);
+
 	server_addrs.clear();
 	server_conns.clear();
 	freeze.clear();
 	first_id.clear();
@@ -1845,7 +1881,7 @@ void client::record(uint64_t end_time, message_header *header)
 */
class homa_client : public client {
    public:
-	homa_client(int id);
+	homa_client(int id, std::string& experiment);
 	virtual ~homa_client();
 	void measure_unloaded(int count);
 	uint64_t measure_rtt(int server, int length, char *buffer,
@@ -1895,10 +1931,11 @@ class homa_client : public client {
/**
 * homa_client::homa_client() - Constructor for homa_client objects.
 *
- * @id: Unique identifier for this client (index starting at 0?)
+ * @id: Unique identifier for this client (index starting at 0?).
+ * @experiment: Name of experiment in which this client will participate.
 */
-homa_client::homa_client(int id)
-	: client(PROT_HOMA, id)
+homa_client::homa_client(int id, std::string& experiment)
+	: client(id, experiment)
 	, fd(-1)
 	, buf_region(nullptr)
 	, buf_size(buf_bpages*HOMA_BPAGE_SIZE)
@@ -2233,7 +2270,7 @@ void homa_client::measure_unloaded(int count)
 */
class tcp_client : public client {
    public:
-	tcp_client(int id);
+	tcp_client(int id, std::string& experiment);
 	virtual ~tcp_client();
 	void read(tcp_connection *connection, int pid);
 	void receiver(int id);
@@ -2297,10 +2334,11 @@ class tcp_client : public client {
/**
 * tcp_client::tcp_client() - Constructor for tcp_client objects.
 *
- * @id: Unique identifier for this client (index starting at 0?)
+ * @id: Unique identifier for this client (index starting at 0?)
+ * @experiment: Name of experiment in which this client will participate.
 */
-tcp_client::tcp_client(int id)
-	: client(PROT_TCP, id)
+tcp_client::tcp_client(int id, std::string& experiment)
+	: client(id, experiment)
 	, connections()
 	, blocked()
 	, bytes_sent()
@@ -2591,59 +2629,58 @@ void tcp_client::read(tcp_connection *connection, int pid)
 */
void server_stats(uint64_t now)
{
-	int prot;
-	for (prot = PROT_HOMA; prot <= PROT_TCP; prot++) {
+	last_per_server_rpcs.resize(metrics.size(), 0);
+	last_server_rpcs.resize(experiments.size(), 0);
+	last_server_bytes_in.resize(experiments.size(), 0);
+	last_server_bytes_out.resize(experiments.size(), 0);
+
+	for (size_t i = 0; i < experiments.size(); i++) {
+		std::string& exp = experiments[i];
 		char details[10000];
 		int offset = 0;
 		int length;
 		uint64_t server_rpcs = 0;
 		uint64_t server_bytes_in = 0;
 		uint64_t server_bytes_out = 0;
-		const char *prot_string = (prot == PROT_HOMA) ? "Homa" : "TCP";

 		details[0] = 0;
 		for (uint32_t i = 0; i < metrics.size(); i++) {
-			server_metrics *server = metrics[i];
-			if (server->protocol != prot)
+			server_metrics *smetrics = metrics[i];
+			if (smetrics->experiment != exp)
 				continue;
-			server_rpcs += server->requests;
-			server_bytes_in += server->bytes_in;
-			server_bytes_out += server->bytes_out;
+			server_rpcs += smetrics->requests;
+			server_bytes_in += smetrics->bytes_in;
+			server_bytes_out += smetrics->bytes_out;
 			length = snprintf(details + offset,
 					sizeof(details) - offset,
 					"%s%lu", (offset != 0) ? " " : "",
-					server->requests - last_per_server_rpcs[i]);
+					smetrics->requests - last_per_server_rpcs[i]);
 			offset += length;
-			if (i > last_per_server_rpcs.size())
-				printf("last_per_server_rpcs has %lu "
-						"entries, needs %lu\n",
-						last_per_server_rpcs.size(),
-						metrics.size());
-			last_per_server_rpcs[i] = server->requests;
+			last_per_server_rpcs[i] = smetrics->requests;
 		}
 		if ((last_stats_time != 0) && (server_bytes_in
-				!= last_server_bytes_in[prot])) {
+				!= last_server_bytes_in[i])) {
 			double elapsed = to_seconds(now - last_stats_time);
 			double rpcs = (double) (server_rpcs
-					- last_server_rpcs[prot]);
+					- last_server_rpcs[i]);
 			double in_delta = (double) (server_bytes_in
-					- last_server_bytes_in[prot]);
+					- last_server_bytes_in[i]);
 			double out_delta = (double) (server_bytes_out
-					- last_server_bytes_out[prot]);
+					- last_server_bytes_out[i]);
 			log(NORMAL, "%s servers: %.2f Kops/sec, %.2f Gbps in, "
 					"%.2f Gbps out, avg. req. length "
 					"%.1f bytes\n",
-					prot_string,
+					exp.c_str(),
 					rpcs/(1000.0*elapsed),
 					8.0*in_delta/(1e09*elapsed),
 					8.0*out_delta/(1e09*elapsed),
 					in_delta/rpcs);
-			log(NORMAL, "RPCs per %s server: %s\n", prot_string,
-					details);
+			log(NORMAL, "RPCs per %s server thread: %s\n",
+					exp.c_str(), details);
 		}
-		last_server_rpcs[prot] = server_rpcs;
-		last_server_bytes_in[prot] = server_bytes_in;
-		last_server_bytes_out[prot] = server_bytes_out;
+		last_server_rpcs[i] = server_rpcs;
+		last_server_bytes_in[i] = server_bytes_in;
+		last_server_bytes_out[i] = server_bytes_out;
 	}
}

@@ -2656,8 +2693,28 @@ void server_stats(uint64_t now)
void client_stats(uint64_t now)
{
#define CDF_VALUES 100000
-	int prot;
-	for (prot = PROT_HOMA; prot <= PROT_TCP; prot++) {
+	std::vector<int> num_clients(experiments.size() + clients.size(), 0);
+	size_t i;
+
+	for (client *client: clients) {
+		for (i = 0; i < experiments.size(); i++) {
+			if (experiments[i] == client->experiment)
+				break;
+		}
+		if (i == experiments.size())
+			experiments.emplace_back(client->experiment);
+		num_clients[i]++;
+	}
+
+	last_client_rpcs.resize(experiments.size(), 0);
+	last_client_bytes_out.resize(experiments.size(), 0);
+	last_client_bytes_in.resize(experiments.size(), 0);
+	last_total_rtt.resize(experiments.size(), 0);
+	last_lag.resize(experiments.size(), 0);
+	last_backups.resize(experiments.size(), 0);
+
+	for (i = 0; i < experiments.size(); i++) {
+		std::string& exp = experiments[i];
 		uint64_t client_rpcs = 0;
 		uint64_t request_bytes = 0;
 		uint64_t response_bytes = 0;
@@ -2666,26 +2723,20 @@ void client_stats(uint64_t now)
 		uint64_t outstanding_rpcs = 0;
 		uint64_t cdf_times[CDF_VALUES];
 		uint64_t backups = 0;
-		int num_clients = 0;
 		int times_per_client;
 		int cdf_index = 0;
-		const char *prot_string = (prot == PROT_HOMA) ?
"Homa" : "TCP"; - for (client *client: clients) { - if (client->protocol == prot) - num_clients++; - } - if (num_clients == 0) + if (num_clients[i] == 0) continue; - times_per_client = CDF_VALUES/num_clients; + times_per_client = CDF_VALUES/num_clients[i]; if (times_per_client > NUM_CLIENT_STATS) times_per_client = NUM_CLIENT_STATS; for (client *client: clients) { - if (client->protocol != prot) + if (client->experiment != exp) continue; - for (size_t i = 0; i < client->server_addrs.size(); i++) - client_rpcs += client->responses[i]; + for (size_t j = 0; j < client->server_addrs.size(); j++) + client_rpcs += client->responses[j]; request_bytes += client->request_bytes; response_bytes += client->response_bytes; total_rtt += client->total_rtt; @@ -2712,23 +2763,21 @@ void client_stats(uint64_t now) if (tclient) backups += tclient->backups; } - if (num_clients == 0) - continue; std::sort(cdf_times, cdf_times + cdf_index); if ((last_stats_time != 0) && ((request_bytes - != last_client_bytes_out[prot]) + != last_client_bytes_out[i]) || (outstanding_rpcs != 0))){ double elapsed = to_seconds(now - last_stats_time); - double rpcs = (double) (client_rpcs - last_client_rpcs[prot]); + double rpcs = (double) (client_rpcs - last_client_rpcs[i]); double delta_out = (double) (request_bytes - - last_client_bytes_out[prot]); + - last_client_bytes_out[i]); double delta_in = (double) (response_bytes - - last_client_bytes_in[prot]); + - last_client_bytes_in[i]); log(NORMAL, "%s clients: %.2f Kops/sec, %.2f Gbps out, " "%.2f Gbps in, RTT (us) P50 %.2f " "P99 %.2f P99.9 %.2f, avg. req. length " "%.1f bytes\n", - prot_string, + exp.c_str(), rpcs/(1000.0*elapsed), 8.0*delta_out/(1e09*elapsed), 8.0*delta_in/(1e09*elapsed), @@ -2737,35 +2786,36 @@ void client_stats(uint64_t now) to_seconds(cdf_times[999*cdf_index/1000])*1e06, delta_out/rpcs); double lag_fraction; - if (lag > last_lag[prot]) + if (lag > last_lag[i]) lag_fraction = (to_seconds(lag - - last_lag[prot])/elapsed) - / num_clients; + - last_lag[i])/elapsed) + / num_clients[i]; else - lag_fraction = -(to_seconds(last_lag[prot] - - lag)/elapsed) / num_clients; + lag_fraction = -(to_seconds(last_lag[i] + - lag)/elapsed) / num_clients[i]; if (lag_fraction >= .01) - log(NORMAL, "%s lag due to overload: %.1f%%\n", - prot_string, - lag_fraction*100.0); + log(NORMAL, "Lag due to overload for %s " + "experiment: %.1f%%\n", + exp.c_str(), lag_fraction*100.0); if (backups != 0) { log(NORMAL, "Backed-up %s sends: %lu/%lu (%.1f%%)\n", - prot_string, - backups - last_backups[prot], - client_rpcs - last_client_rpcs[prot], - 100.0*(backups - last_backups[prot]) - /(client_rpcs - last_client_rpcs[prot])); + exp.c_str(), + backups - last_backups[i], + client_rpcs - last_client_rpcs[i], + 100.0*(backups - last_backups[i]) + /(client_rpcs - last_client_rpcs[i])); } } if (outstanding_rpcs != 0) - log(NORMAL, "Outstanding %s client RPCs: %lu\n", - prot_string, outstanding_rpcs); - last_client_rpcs[prot] = client_rpcs; - last_client_bytes_out[prot] = request_bytes; - last_client_bytes_in[prot] = response_bytes; - last_total_rtt[prot] = total_rtt; - last_lag[prot] = lag; - last_backups[prot] = backups; + log(NORMAL, "Outstanding client RPCS for %s " + "experiment: %lu\n", + exp.c_str(), outstanding_rpcs); + last_client_rpcs[i] = client_rpcs; + last_client_bytes_out[i] = request_bytes; + last_client_bytes_in[i] = response_bytes; + last_total_rtt[i] = total_rtt; + last_lag[i] = lag; + last_backups[i] = backups; } } @@ -2797,6 +2847,7 @@ int client_cmd(std::vector &words) int 
first_server = 1; int server_nodes = 1; std::string servers; + std::string experiment; buf_bpages = 1000; client_iovec = false; @@ -2823,6 +2874,14 @@ int client_cmd(std::vector &words) option, "integer")) return 0; i++; + } else if (strcmp(option, "--exp") == 0) { + if ((i + 1) >= words.size()) { + printf("No value provided for %s\n", + option); + return 0; + } + experiment = words[i+1]; + i++; } else if (strcmp(option, "--first-port") == 0) { if (!parse(words, i+1, &first_port, option, "integer")) return 0; @@ -2898,6 +2957,11 @@ int client_cmd(std::vector &words) return 0; } } + if (experiment.empty()) { + experiment = protocol; + experiment += "_"; + experiment += workload; + } /* Figure out which nodes to use for servers (--servers, * --server-nodes, --first-server). @@ -2933,11 +2997,11 @@ int client_cmd(std::vector &words) if (strcmp(protocol, "homa") == 0) { if (first_port == -1) first_port = 4000; - clients.push_back(new homa_client(i)); + clients.push_back(new homa_client(i, experiment)); } else { if (first_port == -1) first_port = 5000; - clients.push_back(new tcp_client(i)); + clients.push_back(new tcp_client(i, experiment)); } } last_stats_time = 0; @@ -2980,22 +3044,15 @@ int dump_times_cmd(std::vector &words) FILE *f; time_t now; char time_buffer[100]; - int prot; + std::string exp; - if (words.size() != 3) { - printf("Wrong # args; must be 'dump_times protocol file'\n"); + if (words.size() == 3) + exp = words[2]; + else if (words.size() != 2) { + printf("Wrong # args; must be 'dump_times file [experiment]'\n"); return 0; } - if (words[1] == "homa") - prot = PROT_HOMA; - else if (words[1] == "tcp") - prot = PROT_TCP; - else { - printf("Unknown protocol %s: must be homa or tcp\n", - words[1].c_str()); - return 0; - } - f = fopen(words[2].c_str(), "w"); + f = fopen(words[1].c_str(), "w"); if (f == NULL) { printf("Couldn't open file %s: %s\n", words[1].c_str(), strerror(errno)); @@ -3005,15 +3062,16 @@ int dump_times_cmd(std::vector &words) time(&now); strftime(time_buffer, sizeof(time_buffer), "%Y-%m-%d %H:%M:%S", localtime(&now)); - fprintf(f, "# Round-trip times measured by cp_node at %s\n", - time_buffer); + fprintf(f, "# Round-trip times measured by cp_node at %s for " + "experiment %s\n", + time_buffer, exp.empty() ? 
"" : exp.c_str()); fprintf(f, "# --protocol %s, --workload %s, --gpbs %.1f --threads %d,\n", protocol, workload, net_gbps, client_ports); fprintf(f, "# --server-nodes %lu --server-ports %d, --client-max %d\n", server_ids.size(), server_ports, client_max); fprintf(f, "# Length RTT (usec)\n"); for (client *client: clients) { - if (client->protocol != prot) + if (!exp.empty() && (client->experiment != exp)) continue; __u32 start = client->total_responses % NUM_CLIENT_STATS; __u32 i = start; @@ -3147,6 +3205,7 @@ int log_cmd(std::vector &words) */ int server_cmd(std::vector &words) { + std::string experiment; buf_bpages = 1000; first_port = -1; inet_family = AF_INET; @@ -3163,6 +3222,14 @@ int server_cmd(std::vector &words) if (!parse(words, i+1, &buf_bpages, option, "integer")) return 0; i++; + } else if (strcmp(option, "--exp") == 0) { + if ((i + 1) >= words.size()) { + printf("No value provided for %s\n", + option); + return 0; + } + experiment = words[i+1]; + i++; } else if (strcmp(option, "--first-port") == 0) { if (!parse(words, i+1, &first_port, option, "integer")) return 0; @@ -3197,13 +3264,19 @@ int server_cmd(std::vector &words) return 0; } } + if (experiment.empty()) { + experiment = protocol; + experiment += "_"; + experiment += workload; + } if (strcmp(protocol, "homa") == 0) { if (first_port == -1) first_port = 4000; for (int i = 0; i < server_ports; i++) { homa_server *server = new homa_server(first_port + i, - i, inet_family, port_threads); + i, inet_family, port_threads, + experiment); homa_servers.push_back(server); } } else { @@ -3211,11 +3284,10 @@ int server_cmd(std::vector &words) first_port = 5000; for (int i = 0; i < server_ports; i++) { tcp_server *server = new tcp_server(first_port + i, - i, port_threads); + i, port_threads, experiment); tcp_servers.push_back(server); } } - last_per_server_rpcs.resize(server_ports*port_threads, 0); last_stats_time = 0; return 1; } diff --git a/util/cp_vs_tcp b/util/cp_vs_tcp index 32cab54b..18effb9a 100755 --- a/util/cp_vs_tcp +++ b/util/cp_vs_tcp @@ -27,7 +27,7 @@ parser.add_argument('--servers', dest='num_servers', type=int, metavar='count', "server (default: 0)") parser.add_argument('--skip-unloaded', dest='skip_unloaded', type=boolean, default=True, help="Boolean value:: true means don't measure" - "Homa latency under low (default: true)") + "Homa latency under low load (default: true)") options = parser.parse_args() init(options) @@ -55,7 +55,7 @@ if not options.plot_only: for workload, bw, seconds in load_info: options.workload = workload options.gbps = bw * bw_multiplier - options.seconds = seconds; + options.seconds = seconds unloaded_exp = "unloaded_" + workload homa_exp = "homa_" + workload tcp_exp = "tcp_" + workload @@ -64,7 +64,7 @@ if not options.plot_only: options.protocol = "homa" if not options.skip_unloaded: - start_servers(options.nodes[1:2], options) + start_servers(unloaded_exp, options.nodes[1:2], options) o = copy.deepcopy(options) o.gbps = 0.0 o.client_ports = 1 @@ -73,21 +73,21 @@ if not options.plot_only: o.unloaded = 500 run_experiment(unloaded_exp, options.nodes[0:1], o) - start_servers(options.servers, options) + start_servers(homa_exp, options.servers, options) run_experiment(homa_exp, options.clients, options) if options.tcp: options.protocol = "tcp" set_sysctl_parameter("net.ipv4.tcp_congestion_control", "cubic", range(0, options.num_nodes)) - start_servers(options.servers, options) + start_servers(tcp_exp, options.servers, options) run_experiment(tcp_exp, options.clients, options) if options.dctcp: 
options.protocol = "tcp" set_sysctl_parameter("net.ipv4.tcp_congestion_control", "dctcp", range(0, options.num_nodes)) - start_servers(options.servers, options) + start_servers(tcp_exp, options.servers, options) run_experiment(dctcp_exp, options.clients, options) except Exception as e: log(traceback.format_exc()) diff --git a/util/cperf.py b/util/cperf.py index 06cefb70..f371f4ba 100644 --- a/util/cperf.py +++ b/util/cperf.py @@ -7,6 +7,7 @@ # tests for the Linux kernel implementation of Homa. import argparse +from collections import defaultdict import copy import datetime import glob @@ -38,7 +39,7 @@ # If a server's id appears as a key in this dictionary, it means we # have started homa_prio running on that node. The value of each entry is # a Popen object for the homa_prio instance; if this is terminated, then -# the homa_prio process will end +# the homa_prio process will end. homa_prios = {} # The range of nodes currently running cp_node servers. @@ -208,8 +209,8 @@ def get_parser(description, usage, defaults = {}): '(default: %.2f)' % (defaults['gbps'])) parser.add_argument('--client-max', type=int, dest='client_max', metavar='count', default=defaults['client_max'], - help='Maximum number of requests each client machine can have ' - 'outstanding at a time (divided evenly among its ports) ' + help='Maximum number of Homa requests each client machine can have ' + 'outstanding at a time (divided evenly among the Homa ports) ' '(default: %d)' % (defaults['client_max'])) parser.add_argument('--client-ports', type=int, dest='client_ports', metavar='count', default=defaults['client_ports'], @@ -275,6 +276,12 @@ def get_parser(description, usage, defaults = {}): metavar='nodes', help='List of node numbers not to use in the experiment; can ' ' contain ranges, such as "3,5-8,12"') + parser.add_argument('--tcp-client-max', dest='tcp_client_max', type=int, + metavar='count', default=0, help="Maximum number of TCP requests " + "that can be outstanding from a client node at once (divided evenly " + "among the TCP ports); if zero, the " + "--client-max option is used for TCP as well (i.e. each protocol " + "can have that many outstanding requests) (default: 0)") parser.add_argument('--tcp-client-ports', type=int, dest='tcp_client_ports', metavar='count', default=defaults['tcp_client_ports'], help='Number of ports on which each TCP client should issue requests ' @@ -412,7 +419,7 @@ def wait_output(string, nodes, cmd, time_limit=10.0): if print_data.endswith(string): print_data = print_data[:(len(data) - len(string))] if print_data != "": - log("output from node%d: '%s'" % (id, print_data)) + log("extra output from node%d: '%s'" % (id, print_data)) outputs[id] += data bad_node = -1 for id in nodes: @@ -433,44 +440,43 @@ def wait_output(string, nodes, cmd, time_limit=10.0): def start_nodes(ids, options): """ - Start up cp_node on a group of nodes. + Start up cp_node on a group of nodes. Also starts homa_prio on the + nodes, if protocol is "homa". 
ids: List of node ids on which to start cp_node, if it isn't already running - options: Command-line options that may affect experiment + options: Command-line options that may affect node configuration """ - global active_nodes + global active_nodes, homa_prios, verbose started = [] for id in ids: - if id in active_nodes: - continue - vlog("Starting cp_node on node%d" % (id)) - node = subprocess.Popen(["ssh", "-o", "StrictHostKeyChecking=no", - "node%d" % (id), "cp_node"], encoding="utf-8", - stdin=subprocess.PIPE, stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - fl = fcntl.fcntl(node.stdin, fcntl.F_GETFL) - fcntl.fcntl(node.stdin, fcntl.F_SETFL, fl | os.O_NONBLOCK) - fl = fcntl.fcntl(node.stdout, fcntl.F_GETFL) - fcntl.fcntl(node.stdout, fcntl.F_SETFL, fl | os.O_NONBLOCK) - active_nodes[id] = node - if not options.no_homa_prio: - f = open("%s/homa_prio-%d.log" % (log_dir,id), "w") - homa_prios[id] = subprocess.Popen(["ssh", "-o", - "StrictHostKeyChecking=no", "node%d" % (id), "sudo", - "bin/homa_prio", "--interval", "500", "--unsched", - str(options.unsched), "--unsched-boost", - str(options.unsched_boost)], encoding="utf-8", - stdout=f, stderr=subprocess.STDOUT) - f.close - if options.set_ids: - set_sysctl_parameter(".net.homa.next_id", str(100000000*(id+1)), - [id]) - started.append(id) + if not id in active_nodes: + vlog("Starting cp_node on node%d" % (id)) + node = subprocess.Popen(["ssh", "-o", "StrictHostKeyChecking=no", + "node%d" % (id), "cp_node"], encoding="utf-8", + stdin=subprocess.PIPE, stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + fl = fcntl.fcntl(node.stdin, fcntl.F_GETFL) + fcntl.fcntl(node.stdin, fcntl.F_SETFL, fl | os.O_NONBLOCK) + fl = fcntl.fcntl(node.stdout, fcntl.F_GETFL) + fcntl.fcntl(node.stdout, fcntl.F_SETFL, fl | os.O_NONBLOCK) + active_nodes[id] = node + started.append(id) + if options.protocol == "homa": + if options.set_ids: + set_sysctl_parameter(".net.homa.next_id", + str(100000000*(id+1)), [id]) + if not options.no_homa_prio: + f = open("%s/homa_prio-%d.log" % (log_dir,id), "w") + homa_prios[id] = subprocess.Popen(["ssh", "-o", + "StrictHostKeyChecking=no", "node%d" % (id), "sudo", + "bin/homa_prio", "--interval", "500", "--unsched", + str(options.unsched), "--unsched-boost", + str(options.unsched_boost)], encoding="utf-8", + stdout=f, stderr=subprocess.STDOUT) + f.close wait_output("% ", started, "ssh") - log_level = "normal" - if verbose: - log_level = "verbose" + log_level = "verbose" if verbose else "normal" command = "log --file node.log --level %s" % (log_level) for id in started: active_nodes[id].stdin.write(command + "\n") @@ -597,10 +603,11 @@ def do_subprocess(words): log("Error output from %s: %s" % (words, result.stderr.rstrip())) return result.stdout.rstrip() -def start_servers(ids, options): +def start_servers(exp, ids, options): """ Starts cp_node servers running on a group of nodes + exp: Name of experiment these servers will be part of ids: A list of node ids on which to start cp_node servers options: A namespace that must contain at least the following keys, which will be used to configure the servers: @@ -609,19 +616,19 @@ def start_servers(ids, options): protocol """ global server_nodes - log("Starting %s servers on nodes %s" % (options.protocol, ids)) + log("Starting servers for %s experiment on nodes %s" % (exp, ids)) if len(server_nodes) > 0: do_cmd("stop servers", server_nodes) server_nodes = [] start_nodes(ids, options) if options.protocol == "homa": - do_cmd("server --ports %d --port-threads %d --protocol %s %s" % ( 
- options.server_ports, options.port_threads, - options.protocol, options.ipv6), ids) + do_cmd("server --ports %d --port-threads %d --protocol %s --exp %s %s" + % (options.server_ports, options.port_threads, + options.protocol, exp, options.ipv6), ids) else: - do_cmd("server --ports %d --port-threads %d --protocol %s %s" % ( - options.tcp_server_ports, options.tcp_port_threads, - options.protocol, options.ipv6), ids) + do_cmd("server --ports %d --port-threads %d --protocol %s --exp %s %s" + % (options.tcp_server_ports, options.tcp_port_threads, + options.protocol, exp, options.ipv6), ids) server_nodes = ids def run_experiment(name, clients, options): @@ -651,12 +658,12 @@ def run_experiment(name, clients, options): exp_nodes = list(set(options.servers + list(clients))) start_nodes(clients, options) nodes = [] - log("Starting %s experiment with clients %s" % (name, clients)) + log("Starting clients for %s experiment on nodes %s" % (name, clients)) for id in clients: if options.protocol == "homa": command = "client --ports %d --port-receivers %d --server-ports %d " \ "--workload %s --servers %s --gbps %.3f --client-max %d " \ - "--protocol %s --id %d %s" % ( + "--protocol %s --id %d --exp %s %s" % ( options.client_ports, options.port_receivers, options.server_ports, @@ -666,6 +673,7 @@ def run_experiment(name, clients, options): options.client_max, options.protocol, id, + name, options.ipv6) if "unloaded" in options: command += " --unloaded %d" % (options.unloaded) @@ -674,9 +682,12 @@ def run_experiment(name, clients, options): trunc = '--no-trunc' else: trunc = '' + client_max = options.tcp_client_max + if not client_max: + client_max = options.client_max command = "client --ports %d --port-receivers %d --server-ports %d " \ "--workload %s --servers %s --gbps %.3f %s --client-max %d " \ - "--protocol %s --id %d %s" % ( + "--protocol %s --id %d --exp %s %s" % ( options.tcp_client_ports, options.tcp_port_receivers, options.tcp_server_ports, @@ -684,9 +695,10 @@ def run_experiment(name, clients, options): ",".join([str(x) for x in options.servers]), options.gbps, trunc, - options.client_max, + client_max, options.protocol, id, + name, options.ipv6) active_nodes[id].stdin.write(command + "\n") try: @@ -704,8 +716,10 @@ def run_experiment(name, clients, options): for id in exp_nodes: do_subprocess(["ssh", "node%d" % (id), "metrics.py"]) if not "no_rtt_files" in options: - do_cmd("dump_times %s /dev/null" % (options.protocol), clients) - do_cmd("log Starting %s experiment" % (name), server_nodes, clients) + do_cmd("dump_times /dev/null %s" % (name), clients) + do_cmd("log Starting measurements for %s experiment" % (name), + server_nodes, clients) + log("Starting measurements") debug_delay = 0 if debug_delay > 0: time.sleep(debug_delay) @@ -717,10 +731,11 @@ def run_experiment(name, clients, options): if options.protocol == "homa" and options.tt_freeze: log("Freezing timetraces via node%d" % nodes[0]) set_sysctl_parameter(".net.homa.action", "7", nodes[0:1]) - do_cmd("log Ending %s experiment" % (name), server_nodes, clients) + do_cmd("log Ending measurements for %s experiment" % (name), + server_nodes, clients) log("Retrieving data for %s experiment" % (name)) if not "no_rtt_files" in options: - do_cmd("dump_times %s rtts" % (options.protocol), clients) + do_cmd("dump_times rtts %s" % (name), clients) if (options.protocol == "homa") and not "unloaded" in options: vlog("Recording final metrics from nodes %s" % (exp_nodes)) for id in exp_nodes: @@ -743,6 +758,161 @@ def run_experiment(name, 
clients, options):
             do_subprocess(["rsync", "-rtvq", "node%d:rtts" % (id),
                     "%s/%s-%d.rtts" % (options.log_dir, name, id)])
 
+def run_experiments(*args):
+    """
+    Run multiple experiments simultaneously and collect statistics.
+
+    args:   Each argument is a namespace describing an experiment to
+            run. The namespace must contain the following values:
+            name:        The name of the experiment; used to create files
+                         with the experiment's results.
+            clients:     List of node numbers on which to run clients for the
+                         experiment.
+            servers:     List of node numbers on which to run servers for the
+                         experiment (if the same server is in multiple
+                         experiments, the parameters from the first experiment
+                         are used to start the server).
+            protocol:    tcp or homa
+            gbps
+            seconds
+            workload
+
+            For Homa experiments the following values must be present:
+            client_max
+            client_ports
+            port_receivers
+            server_ports
+            port_threads
+
+            For TCP experiments the following values must be present:
+            tcp_client_max (or client_max)
+            tcp_client_ports
+            tcp_server_ports
+
+            There may be additional optional values that are used if present.
+    """
+
+    global active_nodes
+
+    homa_nodes = []
+    homa_clients = []
+    homa_servers = []
+    tcp_nodes = []
+    for exp in args:
+        if exp.protocol == "homa":
+            homa_clients.extend(exp.clients)
+            homa_nodes.extend(exp.clients)
+            homa_servers.extend(exp.servers)
+            homa_nodes.extend(exp.servers)
+        elif exp.protocol == "tcp":
+            tcp_nodes.extend(exp.clients)
+            tcp_nodes.extend(exp.servers)
+    homa_clients = sorted(list(set(homa_clients)))
+    homa_servers = sorted(list(set(homa_servers)))
+    homa_nodes = sorted(list(set(homa_nodes)))
+    tcp_nodes = sorted(list(set(tcp_nodes)))
+    all_nodes = sorted(list(set(homa_nodes + tcp_nodes)))
+
+    # Start servers for all experiments
+    stop_nodes()
+    for exp in args:
+        if exp.servers:
+            log("Starting servers for %s experiment on nodes %s" % (exp.name,
+                    exp.servers))
+            start_nodes(exp.servers, exp)
+            if exp.protocol == "homa":
+                do_cmd("server --ports %d --port-threads %d --protocol homa "
+                        "--exp %s %s"
+                        % (exp.server_ports, exp.port_threads,
+                        exp.name, exp.ipv6), exp.servers)
+            else:
+                do_cmd("server --ports %d --port-threads %d --protocol tcp "
+                        "--exp %s %s"
+                        % (exp.tcp_server_ports, exp.tcp_port_threads,
+                        exp.name, exp.ipv6), exp.servers)
+
+    # Start clients for all experiments
+    for exp in args:
+        log("Starting clients for %s experiment on nodes %s" % (exp.name,
+                exp.clients))
+        start_nodes(exp.clients, exp)
+        for id in exp.clients:
+            if exp.protocol == "homa":
+                command = "client --ports %d --port-receivers %d --server-ports %d " \
+                        "--workload %s --servers %s --gbps %.3f --client-max %d " \
+                        "--protocol homa --id %d --exp %s %s" % (
+                        exp.client_ports,
+                        exp.port_receivers,
+                        exp.server_ports,
+                        exp.workload,
+                        ",".join([str(x) for x in exp.servers]),
+                        exp.gbps,
+                        exp.client_max,
+                        id,
+                        exp.name,
+                        exp.ipv6)
+            else:
+                client_max = exp.tcp_client_max
+                if not client_max:
+                    client_max = exp.client_max
+                command = "client --ports %d --port-receivers %d --server-ports %d " \
+                        "--workload %s --servers %s --gbps %.3f --client-max %d " \
+                        "--protocol tcp --id %d --exp %s %s" % (
+                        exp.tcp_client_ports,
+                        exp.tcp_port_receivers,
+                        exp.tcp_server_ports,
+                        exp.workload,
+                        ",".join([str(x) for x in exp.servers]),
+                        exp.gbps,
+                        client_max,
+                        id,
+                        exp.name,
+                        exp.ipv6)
+            active_nodes[id].stdin.write(command + "\n")
+            try:
+                active_nodes[id].stdin.flush()
+            except BrokenPipeError:
+                log("Broken pipe to node%d while starting %s client" % (id,
+                        exp.protocol))
+            vlog("Command 
for node%d: %s" % (id, command)) + wait_output("% ", exp.clients, command, 40.0) + if homa_clients: + # Wait a bit so that homa_prio can set priorities appropriately + time.sleep(2) + if homa_nodes: + vlog("Initializing metrics and timetracing") + do_ssh(["metrics.py; ttprint.py > /dev/null"], homa_nodes) + do_cmd("dump_times /dev/null", all_nodes) + do_cmd("log Starting measurements", all_nodes) + log("Starting measurements") + + time.sleep(exp.seconds) + + # Collect results + if homa_nodes and exp.tt_freeze: + log("Freezing timetraces via node%d" % all_nodes[0]) + set_sysctl_parameter(".net.homa.action", "7", all_nodes[0:1]) + do_cmd("log Ending measurements", all_nodes) + log("Retrieving data") + for exp in args: + do_cmd("dump_times %s.rtts %s" % (exp.name, exp.name), exp.clients) + if homa_nodes: + vlog("Recording final metrics from nodes %s" % (homa_nodes)) + for id in homa_nodes: + f = open("%s/%d.metrics" % (exp.log_dir, id), 'w') + subprocess.run(["ssh", "node%d" % (id), "metrics.py"], stdout=f) + f.close() + shutil.copyfile("%s/%d.metrics" % (exp.log_dir, homa_clients[0]), + "%s/reports/%d.metrics" % (exp.log_dir, homa_clients[0])) + shutil.copyfile("%s/%d.metrics" % (exp.log_dir, homa_servers[0]), + "%s/reports/%d.metrics" % (exp.log_dir, homa_servers[0])) + do_cmd("stop senders", all_nodes) + do_cmd("stop clients", all_nodes) + for exp in args: + for id in exp.clients: + do_subprocess(["rsync", "-rtvq", "node%d:%s.rtts" % (id, exp.name), + "%s/%s-%d.rtts" % (exp.log_dir, exp.name, id)]) + def scan_log(file, node, experiments): """ Read a log file and extract various useful information, such as fatal @@ -762,83 +932,78 @@ def scan_log(file, node, experiments): exited = False experiment = "" node_data = None + active = False for line in open(file): - match = re.match('.*Starting (.*) experiment', line) + if "FATAL:" in line: + log("%s: %s" % (file, line[:-1])) + exited = True + if "ERROR:" in line: + log("%s: %s" % (file, line[:-1])) + if "cp_node exiting" in line: + exited = True + + match = re.match('.*Starting measurements', line) + if match: + active = True + continue + + match = re.match('.*Ending measurements', line) if match: - experiment = match.group(1) - if not experiment in experiments: - experiments[experiment] = {} - if not node in experiments[experiment]: - experiments[experiment][node] = {} - node_data = experiments[experiment][node] + active = False continue - if re.match('.*Ending .* experiment', line): - experiment = "" - if experiment != "": - gbps = -1.0 - match = re.match('.*clients: ([0-9.]+) Kops/sec, ' + + if active: + match = re.match('[0-9.]+ (.*) clients: ([0-9.]+) Kops/sec, ' '([0-9.]+) Gbps.*P50 ([0-9.]+)', line) if match: - gbps = float(match.group(2)) - else: - match = re.match('.*clients: ([0-9.]+) Kops/sec, ' - '([0-9.]+) MB/sec.*P50 ([0-9.]+)', line) - if match: - gbps = 8.0*float(match.group(2)) - if gbps >= 0.0: - if not "client_kops" in node_data: - node_data["client_kops"] = [] - node_data["client_kops"].append(float(match.group(1))) - if not "client_gbps" in node_data: - node_data["client_gbps"] = [] - node_data["client_gbps"].append(gbps) - if not "client_latency" in node_data: - node_data["client_latency"] = [] - node_data["client_latency"].append(float(match.group(3))) + node_data = experiments[match.group(1)][node] + gbps = float(match.group(3)) + if gbps >= 0.0: + if not "client_kops" in node_data: + node_data["client_kops"] = [] + node_data["client_kops"].append(float(match.group(2))) + if not "client_gbps" in node_data: + 
node_data["client_gbps"] = [] + node_data["client_gbps"].append(gbps) + if not "client_latency" in node_data: + node_data["client_latency"] = [] + node_data["client_latency"].append(float(match.group(4))) continue - gbps = -1.0 - match = re.match('.*servers: ([0-9.]+) Kops/sec, ' + match = re.match('[0-9.]+ (.*) servers: ([0-9.]+) Kops/sec, ' '([0-9.]+) Gbps', line) if match: - gbps = float(match.group(2)) - else: - match = re.match('.*servers: ([0-9.]+) Kops/sec, ' - '([0-9.]+) MB/sec', line) - if match: - gbps = 8.0*float(match.group(2)) - if gbps >= 0.0: - if not "server_kops" in node_data: - node_data["server_kops"] = [] - node_data["server_kops"].append(float(match.group(1))) - if not "server_gbps" in node_data: - node_data["server_gbps"] = [] - node_data["server_gbps"].append(gbps) + node_data = experiments[match.group(1)][node] + gbps = float(match.group(3)) + if gbps >= 0.0: + if not "server_kops" in node_data: + node_data["server_kops"] = [] + node_data["server_kops"].append(float(match.group(2))) + if not "server_gbps" in node_data: + node_data["server_gbps"] = [] + node_data["server_gbps"].append(gbps) continue - match = re.match('.*Outstanding client RPCs: ([0-9.]+)', line) + match = re.match('.*Outstanding client RPCs for (.*) ' + 'experiment: ([0-9.]+)', line) if match: + node_data = experiments[match.group(1)][node] if not "outstanding_rpcs" in node_data: node_data["outstanding_rpcs"] = [] - node_data["outstanding_rpcs"].append(int(match.group(1))) + node_data["outstanding_rpcs"].append(int(match.group(2))) continue - match = re.match('.*Backed-up sends: ([0-9.]+)/([0-9.]+)', line) + match = re.match('.*Backed-up (.*) sends: ([0-9.]+)/([0-9.]+)', + line) if match: + node_data = experiments[match.group(1)][node] if not "backups" in node_data: node_data["backups"] = [] - total = float(match.group(2)) + total = float(match.group(3)) if total > 0: - node_data["backups"].append(float(match.group(1))/total) + node_data["backups"].append(float(match.group(2))/total) continue - if "FATAL:" in line: - log("%s: %s" % (file, line[:-1])) - exited = True - if "ERROR:" in line: - log("%s: %s" % (file, line[:-1])) - if "cp_node exiting" in line: - exited = True if not exited: log("%s appears to have crashed (didn't exit)" % (node)) @@ -849,8 +1014,9 @@ def scan_logs(): """ global log_dir, verbose - # This value is described in the header doc for scan_log. - experiments = {} + # Data collected so far for all experiments. See scan_log header + # comment for more info. + experiments = defaultdict(lambda : defaultdict(dict)) for file in sorted(glob.glob(log_dir + "/node*.log")): node = re.match('.*/(node[0-9]+)\.log', file).group(1) From e7ba01cde9142daf7402ead067634859b9ba17e1 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 11 Sep 2024 15:33:00 -0700 Subject: [PATCH 008/625] Updates to notes.txt --- notes.txt | 52 ++++++++++++++++++++-------------------------------- 1 file changed, 20 insertions(+), 32 deletions(-) diff --git a/notes.txt b/notes.txt index 21ccae15..10c3d069 100755 --- a/notes.txt +++ b/notes.txt @@ -1,6 +1,26 @@ Notes for Homa implementation in Linux: --------------------------------------- +* Ideas for making TCP and Homa play well together: + * Goals: + * Balance queue lengths for the protocols? + * If one protocol is using a lot less bandwidth, give it preference + for transmission? + * Keep track of recent bandwidth consumed by each protocol; when there + is overload, restrict each protocol to its fraction of recent bandwidth. 
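  (A rough sketch of the fraction-of-recent-bandwidth idea above, in
  illustrative Python rather than kernel code; the window size, the even
  split, and all names here are assumptions, not Homa parameters:)

    # Track recent offered bytes per protocol; under overload, let a
    # protocol send only if it is within its share of recent usage.
    from collections import deque
    import time

    WINDOW = 0.01                       # seconds of history to keep
    recent = {'homa': deque(), 'tcp': deque()}

    def record_offered(proto, nbytes):
        """Record nbytes offered by proto at the current time."""
        now = time.monotonic()
        q = recent[proto]
        q.append((now, nbytes))
        while q and q[0][0] < now - WINDOW:
            q.popleft()

    def may_send(proto):
        """True if proto has used no more than its even share recently."""
        totals = {p: sum(n for _, n in q) for p, q in recent.items()}
        total = sum(totals.values())
        return total == 0 or totals[proto] <= total / len(recent)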
+    * "Consumed" has to be measured in terms of bytes offered, not bytes actually
+      transmitted (otherwise a protocol could get "stuck" at a low transmission
+      rate?).
+  * Maybe use a mechanism like fair-share scheduling? Keep track of
+    recent usage by each protocol, give priority to the protocol that
+    used least bandwidth recently?
+  * Could this be implemented with a mechanism like a token bucket?
+    * Use a token bucket for each protocol with 50% of available bandwidth
+      (or maybe less?). Split any extra available bandwidth among the
+      protocols. Maybe adjust rates for the token buckets based on recent
+      traffic?
+  * Also consider the amount of data that has been "stuck" in the NIC?
+
 * Remedies to consider for the performance problems at 100 Gbps, where
   one tx channel gets very backed up:
   * Implement zero-copy on output in order to reduce memory bandwidth
@@ -35,38 +55,6 @@ Notes for Homa implementation in Linux:
     (perhaps because the slots for its host are all taken?)
   * Could it abort early if there is no incoming headroom?
 
-* Loose ends from experiment ending 4/29/2024:
-  * traces/100g and traces/100g2 both show significant qdisc queueing,
-    and the system appears to have difficulty clearing the queues (it
-    takes a very long time).
-  * Looked into packet drops:
-    * The switch is not reporting any packet drops.
-    * ifconfig is not showing any packet drops, either tx or rx.
-  * Tried to configure switch for priority queues, but it didn't seem
-    to help; see traces/100g4.
-
-* Experiments to try with c6525-100g cluster:
-  * For packets that seem to be experiencing long times in the NIC,
-    see if perhaps the NIC doorbell isn't getting rung.
-  * Check whether Homa may be exceeding max_incoming
-  * Analyze homa_grant_recalc: it seems to be using a lot of time, and
-    it sometimes runs simultaneously in several threads, resulting in
-    long waits for RPC locks.
-  * Figure out how to program the switch: are buffer sizes too small?
-  * Better data on actual packet loss rates
-
-* Notes on poor 100 Gbps throughput as of Feb. 2024:
-  * One node (node3) has 1400 incoming RPCs blocked waiting for buffers
-  * Tried increasing max_overcommit to 16, but throughput cratered: only
-    a few Gbps.
-  * On node3, one core (12) has a huge backup of data packets waiting for
-    SoftIRQ processing (1-2 MB, delays of 1 ms or more):
-  * Need to analyze why there are large delays in sending control packets
-  * Also, homa_grant_check_rpc seems to take a long time.
-  * Found times (e.g. in 100g2/node5 around 13368.354) when homa_grant_recalc
-    runs simultaenously in several threads, resulting in long blocks for
-    RPC locks.
-
 * Notes on refactoring of grant mechanism:
   * Need to reimplement FIFO grants
   * Replace fifo_grant_increment with fifo_grant_interval

From cbd4dc08d2ee14cce06b11597149aec2ce3bd6cb Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Thu, 12 Sep 2024 21:33:19 -0700
Subject: [PATCH 009/625] Improve robustness of txpkts analyzer in tthoma.py
 (handle packets with missing fields)

---
 util/tthoma.py | 71 +++++++++++++++++++++++++++++---------------------
 1 file changed, 41 insertions(+), 30 deletions(-)

diff --git a/util/tthoma.py b/util/tthoma.py
index fa6ce273..6f799118 100755
--- a/util/tthoma.py
+++ b/util/tthoma.py
@@ -1086,9 +1086,6 @@ def __send_data(self, trace, time, core, match, interests):
         id = int(match.group(1))
         offset = int(match.group(2))
         length = int(match.group(3))
-        if length == 0:
-            # Temporary fix to compensate for Homa bug; delete this code soon.
- return for interest in interests: interest.tt_send_data(trace, time, core, id, offset, length) @@ -5311,8 +5308,8 @@ def init_trace(self, trace): def tt_ip_xmit(self, trace, t, core, id, offset): global packets, rpcs p = packets[pkt_id(id, offset)] - # If packet retransmitted, only record first transmission - if not 'retransmits' in p: + # Only record first transmission (packet might be retransmitted) + if not 'xmit' in p: p['xmit'] = t p['tx_node'] = trace['node'] p['tx_core'] = core @@ -7248,7 +7245,7 @@ def __init__(self, dispatcher): dispatcher.interest('AnalyzePackets') def output(self): - global packets, options + global packets, options, traces # node -> list of packets transmitted by that node node_pkts = defaultdict(list) @@ -7257,8 +7254,6 @@ def output(self): for pkt in packets.values(): if (not 'xmit' in pkt) or not ('tso_length' in pkt): continue - if (not 'nic' in pkt) or (not 'gro' in pkt) or (not 'tx_qid' in pkt): - continue node_pkts[pkt['tx_node']].append(pkt) sort_keys = {'Xmit': 'xmit', 'Nic': 'nic', 'MaxGro': 'gro', @@ -7317,11 +7312,13 @@ def output(self): if (options.node != None) and (node != options.node): continue - # Create a data file for this node with packets in time order. + # Create a data file for this node with packets in time order + # (or whatever order was requested on the command line). + pkts = sorted(node_pkts[node], key = lambda pkt : pkt['xmit']) if sort_key == 'gro': - pkts = sorted(node_pkts[node], key = lambda pkt : get_max_gro(pkt)) - else: - pkts = sorted(node_pkts[node], key = lambda pkt : pkt[sort_key]) + pkts = sorted(pkts, key = lambda pkt : get_max_gro(pkt)) + elif sort_key != 'xmit': + pkts = sorted(pkts, key = lambda pkt : pkt[sort_key]) if len(pkts) == 0: continue @@ -7381,14 +7378,14 @@ def output(self): f.write('# Rx: Number of times segments in the packet were ' 'retransmitted\n\n') - f.write('# Xmit RpcId Offset Length Qid') - f.write(' Nic NDelay MaxGro GDelay') - f.write(' Free FDelay Rx\n') + f.write('# Xmit RpcId Offset Length Qid') + f.write(' Nic NDelay MaxGro GDelay') + f.write(' Free FDelay Rx\n') for pkt in pkts: xmit = pkt['xmit'] - nic = pkt['nic'] + nic = pkt['nic'] if 'nic' in pkt else None max_gro = get_max_gro(pkt) - free = pkt['free_tx_skb'] + free = pkt['free_tx_skb'] if 'free_tx_skb' in pkt else None length = pkt['tso_length'] if 'tx_qid' in pkt: @@ -7419,23 +7416,37 @@ def output(self): rx += len(seg['retransmits']) rx_msg = str(rx) if rx > 0 else "" - if rx == 0 and qid != None: + gro_string = "" + if rx == 0 and qid != None and nic != None: delays[qid]['nic'].append(nic - xmit) - delays[qid]['gro'].append(max_gro - nic) - delays[qid]['free'].append(free - nic) - - excess = (free - nic) - options.threshold - if excess > 0: - qid_backlog[qid] += excess * length - qid_slow_bytes[qid] += length + if max_gro != None: + delays[qid]['gro'].append(max_gro - nic) + gro_string = '%.1f' % (max_gro - nic) + if free != None: + delays[qid]['free'].append(free - nic) + + if nic != None: + t = free if free != None else traces[node]['last_time'] + excess = (t - nic) - options.threshold + if excess > 0: + qid_backlog[qid] += excess * length + qid_slow_bytes[qid] += length qid_total_bytes[qid] += length - f.write('%9.3f %10d %6d %6d %3s' % (xmit, pkt['id'], + + f.write('%10.3f %10d %6d %6d %3s' % (xmit, pkt['id'], pkt['offset'], pkt['tso_length'], qid_string)) - f.write(' %9.3f %7.1f %9.3f %7.1f' % (nic, nic - xmit, - max_gro, max_gro - nic)) - f.write(' %9.3f %7.1f %2s\n' % (pkt['free_tx_skb'], - pkt['free_tx_skb'] - nic, 
rx_msg)) + nic_delay_string = '' + if (nic != None) and (xmit != None): + nic_delay_string = '%.1f' % (nic - xmit) + f.write(' %10s %7s %10s %7s' % (print_if(nic, '%.3f'), + nic_delay_string, print_if(max_gro, '%.3f'), + gro_string)) + free_delay_string = '' + if (nic != None) and (free != None): + free_delay_string = '%.1f' % (free - nic) + f.write(' %10s %7s %2s\n' % (print_if(free, '%.3f'), + free_delay_string, rx_msg)) f.close() def print_type(delays): From 8e97e0b6732dd0ea9bcd753dc69c1702878fc8ca Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 18 Sep 2024 11:41:10 -0700 Subject: [PATCH 010/625] homa_gro_receive was accidentally manipulating TCP packets --- homa_offload.c | 24 ++++++++++++++-- test/unit_homa_offload.c | 59 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 76 insertions(+), 7 deletions(-) diff --git a/homa_offload.c b/homa_offload.c index a6e34053..e94102ee 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -68,6 +68,7 @@ void homa_gro_hook_tcp(void) if (tcp_net_offload != NULL) return; + printk(KERN_NOTICE "Homa setting up TCP hijacking\n"); tcp_net_offload = inet_offloads[IPPROTO_TCP]; hook_tcp_net_offload = *tcp_net_offload; hook_tcp_net_offload.callbacks.gro_receive = homa_tcp_gro_receive; @@ -88,6 +89,7 @@ void homa_gro_unhook_tcp(void) { if (tcp_net_offload == NULL) return; + printk(KERN_NOTICE "Homa cancelling TCP hijacking\n"); inet_offloads[IPPROTO_TCP] = tcp_net_offload; tcp_net_offload = NULL; inet6_offloads[IPPROTO_TCP] = tcp6_net_offload; @@ -251,7 +253,6 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, * gro_list by the caller, so it will be considered for merges * in the future. */ -// int hdr_offset, hdr_end; struct sk_buff *held_skb; struct sk_buff *result = NULL; struct homa_core *core = homa_cores[raw_smp_processor_id()]; @@ -332,11 +333,29 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, struct napi_struct *napi = container_of(gro_list, struct napi_struct, gro_hash[hash]); - /* Make sure that core->held_skb is on the list. */ + /* Must verify that core->held_skb points to a packet on + * the list, and that the packet is a Homa packet. + * homa_gro_complete isn't always invoked before removing + * packets from the list, so core->held_skb could be a + * dangling pointer (or the skb could have been reused for + * some other protocol). + */ list_for_each_entry(held_skb, &napi->gro_hash[core->held_bucket].list, list) { + int protocol; + if (held_skb != core->held_skb) continue; + if (skb_is_ipv6(held_skb)) + protocol = ipv6_hdr(held_skb)->nexthdr; + else + protocol = ip_hdr(held_skb)->protocol; + if (protocol != IPPROTO_HOMA) { + tt_record3("homa_gro_receive held_skb 0x%0x%0x " + "isn't Homa: protocol %d", + SPLIT_64(held_skb), protocol); + continue; + } /* Aggregate skb into held_skb. 
We don't update the * length of held_skb because we'll eventually split @@ -528,6 +547,7 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset) // ntohl(h->seg.offset), // NAPI_GRO_CB(skb)->count); + homa_cores[raw_smp_processor_id()]->held_skb = NULL; if (homa->gro_policy & HOMA_GRO_GEN3) { homa_gro_gen3(skb); } else if (homa->gro_policy & HOMA_GRO_GEN2) { diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index ae409a55..59deb723 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -353,14 +353,15 @@ TEST_F(homa_offload, homa_gro_receive__no_held_skb) int same_flow; self->header.seg.offset = htonl(6000); skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); + skb->hash = 2; NAPI_GRO_CB(skb)->same_flow = 0; cur_core->held_skb = NULL; - cur_core->held_bucket = 99; - EXPECT_EQ(NULL, homa_gro_receive(&self->empty_list, skb)); + cur_core->held_bucket = 2; + EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro_hash[2].list, skb)); same_flow = NAPI_GRO_CB(skb)->same_flow; EXPECT_EQ(0, same_flow); EXPECT_EQ(skb, cur_core->held_skb); - EXPECT_EQ(3, cur_core->held_bucket); + EXPECT_EQ(2, cur_core->held_bucket); kfree_skb(skb); } TEST_F(homa_offload, homa_gro_receive__empty_merge_list) @@ -369,10 +370,49 @@ TEST_F(homa_offload, homa_gro_receive__empty_merge_list) int same_flow; self->header.seg.offset = htonl(6000); skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); + skb->hash = 2; NAPI_GRO_CB(skb)->same_flow = 0; - cur_core->held_skb = skb; + cur_core->held_skb = self->skb; cur_core->held_bucket = 3; - EXPECT_EQ(NULL, homa_gro_receive(&self->empty_list, skb)); + EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro_hash[2].list, skb)); + same_flow = NAPI_GRO_CB(skb)->same_flow; + EXPECT_EQ(0, same_flow); + EXPECT_EQ(skb, cur_core->held_skb); + EXPECT_EQ(2, cur_core->held_bucket); + kfree_skb(skb); +} +TEST_F(homa_offload, homa_gro_receive__held_skb_not_in_merge_list) +{ + struct sk_buff *skb; + int same_flow; + self->header.seg.offset = htonl(6000); + skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); + skb->hash = 3; + NAPI_GRO_CB(skb)->same_flow = 0; + cur_core->held_skb = skb; + cur_core->held_bucket = 2; + EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro_hash[3].list, skb)); + same_flow = NAPI_GRO_CB(skb)->same_flow; + EXPECT_EQ(0, same_flow); + EXPECT_EQ(skb, cur_core->held_skb); + EXPECT_EQ(3, cur_core->held_bucket); + kfree_skb(skb); +} +TEST_F(homa_offload, homa_gro_receive__held_skb__in_merge_list_but_wrong_proto) +{ + struct sk_buff *skb; + int same_flow; + self->header.seg.offset = htonl(6000); + skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); + skb->hash = 3; + NAPI_GRO_CB(skb)->same_flow = 0; + cur_core->held_skb = self->skb; + if (skb_is_ipv6(self->skb)) + ipv6_hdr(self->skb)->nexthdr = IPPROTO_TCP; + else + ip_hdr(self->skb)->protocol = IPPROTO_TCP; + cur_core->held_bucket = 2; + EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro_hash[3].list, skb)); same_flow = NAPI_GRO_CB(skb)->same_flow; EXPECT_EQ(0, same_flow); EXPECT_EQ(skb, cur_core->held_skb); @@ -546,6 +586,15 @@ TEST_F(homa_offload, homa_gro_gen3__all_cores_busy_so_pick_first) EXPECT_EQ(5000, homa_cores[3]->last_active); } + +TEST_F(homa_offload, homa_gro_complete__clear_held_skb) +{ + struct homa_core *core = homa_cores[raw_smp_processor_id()]; + + core->held_skb = self->skb2; + homa_gro_complete(self->skb, 0); + EXPECT_EQ(NULL, core->held_skb); +} TEST_F(homa_offload, homa_gro_complete__GRO_IDLE) { homa->gro_policy = HOMA_GRO_IDLE; From 
f7ed8c2a1a2a1e622ce7921b846c45a385afc91a Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 18 Sep 2024 11:42:50 -0700 Subject: [PATCH 011/625] Small improvements in cp_node error messages --- util/cp_node.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/util/cp_node.cc b/util/cp_node.cc index da4fd364..36808d8a 100644 --- a/util/cp_node.cc +++ b/util/cp_node.cc @@ -2053,14 +2053,15 @@ bool homa_client::wait_response(homa::receiver *receiver, uint64_t rpc_id) if (length < 0) { if (exit_receivers) return false; - log(NORMAL, "FATAL: error in recvmsg: %s (id %lu, server %s)\n", + log(NORMAL, "FATAL: error in Homa recvmsg: %s (id %lu, " + "server %s)\n", strerror(errno), rpc_id, print_address(receiver->src_addr())); exit(1); } header = receiver->get(0); if (header == nullptr) { - log(NORMAL, "FATAL: response message contained %lu bytes; " + log(NORMAL, "FATAL: Homa response message contained %lu bytes; " "need at least %lu", length, sizeof(*header)); exit(1); } From d41e540028dc4ecdbaad77710bcc8fb925389ae6 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 18 Sep 2024 11:54:09 -0700 Subject: [PATCH 012/625] Don't clear timetraces during cperf.py experiment startup --- util/cperf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util/cperf.py b/util/cperf.py index f371f4ba..4eb50db8 100644 --- a/util/cperf.py +++ b/util/cperf.py @@ -880,8 +880,8 @@ def run_experiments(*args): # Wait a bit so that homa_prio can set priorities appropriately time.sleep(2) if homa_nodes: - vlog("Initializing metrics and timetracing") - do_ssh(["metrics.py; ttprint.py > /dev/null"], homa_nodes) + vlog("Initializing metrics") + do_ssh(["metrics.py > /dev/null"], homa_nodes) do_cmd("dump_times /dev/null", all_nodes) do_cmd("log Starting measurements", all_nodes) log("Starting measurements") From 4abb6be308548eff8a14b546950e25c59a910356 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 20 Sep 2024 16:49:26 -0700 Subject: [PATCH 013/625] tt_record when packets back up because tx queues have been stopped --- homa_outgoing.c | 24 +++++++++++++++++------- test/mock.c | 7 ++++++- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/homa_outgoing.c b/homa_outgoing.c index 97836ea0..a6f7c481 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -373,10 +373,8 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, int result, priority; struct dst_entry *dst; struct sk_buff *skb; + struct netdev_queue *txq; - /* Allocate the same size sk_buffs as for the smallest data - * packets (better reuse of sk_buffs?). 
- */ dst = homa_get_dst(peer, hsk); skb = homa_skb_new_tx(HOMA_MAX_HEADER); if (unlikely(!skb)) @@ -432,6 +430,13 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, } } } + txq = netdev_get_tx_queue(skb->dev, skb->queue_mapping); + if (!netif_tx_queue_stopped(txq)) + tt_record4("__homa_xmit_control found stopped txq for id %d, " + "qid %d, num_queued %d, limit %d", + be64_to_cpu(h->sender_id), + skb->queue_mapping, txq->dql.num_queued, + txq->dql.adj_limit); INC_METRIC(packets_sent[h->type - DATA], 1); INC_METRIC(priority_bytes[priority], skb->len); INC_METRIC(priority_packets[priority], 1); @@ -489,8 +494,7 @@ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk) void homa_xmit_data(struct homa_rpc *rpc, bool force) { struct homa *homa = rpc->hsk->homa; - - tt_record("homa_xmit_data starting"); + struct netdev_queue *txq; atomic_inc(&rpc->msgout.active_xmits); while (*rpc->msgout.next_xmit) { @@ -528,6 +532,12 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) homa_rpc_unlock(rpc); skb_get(skb); __homa_xmit_data(skb, rpc, priority); + txq = netdev_get_tx_queue(skb->dev, skb->queue_mapping); + if (netif_tx_queue_stopped(txq)) + tt_record4("homa_xmit_data found stopped txq for id %d, " + "qid %d, num_queued %d, limit %d", + rpc->id, skb->queue_mapping, + txq->dql.num_queued, txq->dql.adj_limit); force = false; homa_rpc_lock(rpc, "homa_xmit_data"); if (rpc->state == RPC_DEAD) @@ -584,10 +594,10 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority) err = ip_queue_xmit(&rpc->hsk->inet.sk, skb, &rpc->peer->flow); } tt_record4("Finished queueing packet: rpc id %llu, offset %d, len %d, " - "granted %d", + "qid %d", rpc->id, homa_info->offset, homa_get_skb_info(skb)->data_bytes, - rpc->msgout.granted); + skb->queue_mapping); if (err) { INC_METRIC(data_xmit_errors, 1); } diff --git a/test/mock.c b/test/mock.c index 6e2b3e01..686e9024 100644 --- a/test/mock.c +++ b/test/mock.c @@ -164,9 +164,11 @@ int mock_compound_order_mask = 0; int mock_page_nid_mask = 0; struct dst_ops mock_dst_ops = {.mtu = mock_get_mtu}; +struct netdev_queue mock_net_queue = {.state = 0}; struct net_device mock_net_device = { .gso_max_segs = 1000, - .gso_max_size = 0}; + .gso_max_size = 0, + ._tx = &mock_net_queue}; const struct net_offload *inet_offloads[MAX_INET_PROTOS]; const struct net_offload *inet6_offloads[MAX_INET_PROTOS]; @@ -221,6 +223,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, skb->_skb_refdst = 0; ip_hdr(skb)->saddr = 0; skb->truesize = size; + skb->dev = &mock_net_device; return skb; } @@ -1395,6 +1398,7 @@ struct sk_buff *mock_skb_new(struct in6_addr *saddr, struct common_header *h, skb->_skb_refdst = 0; skb->hash = 3; skb->next = NULL; + skb->dev = &mock_net_device; return skb; } @@ -1489,6 +1493,7 @@ void mock_teardown(void) mock_compound_order_mask = 0; mock_page_nid_mask = 0; mock_net_device.gso_max_size = 0; + mock_net_device.gso_max_segs = 1000; memset(inet_offloads, 0, sizeof(inet_offloads)); memset(inet6_offloads, 0, sizeof(inet6_offloads)); From f7cfa2bca917bbb158ebcb9dda7fba4d53e9ca7d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 20 Sep 2024 16:50:13 -0700 Subject: [PATCH 014/625] Enable better console logging in install_homa --- cloudlab/bin/install_homa | 1 + 1 file changed, 1 insertion(+) diff --git a/cloudlab/bin/install_homa b/cloudlab/bin/install_homa index eb89935e..71582537 100755 --- a/cloudlab/bin/install_homa +++ b/cloudlab/bin/install_homa @@ -36,6 +36,7 @@ for ((i = 
$first ; i <= $last; i++)); do
     node=node$i
     echo
     echo '*** Installing on' $node '***'
     rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" -rtv ~/.bashrc ~/.bash_profile ~/.gdbinit $node:
     rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" -rtv --exclude __pycache__ ~/bin/ $node:bin/
     rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" -rtv $root/homa.ko $root/util/cp_node $root/util/homa_prio $root/util/*.py $node:bin/
+    ssh -4 $node 'sudo sysctl .kernel.printk="5 4 1 7"'
     ssh -4 $node 'echo $PATH'
     ssh -4 $node 'config default'
 done
\ No newline at end of file

From 2023939f16675b88c1e66e9661416d0e64bc81d4 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Tue, 24 Sep 2024 16:00:15 -0700
Subject: [PATCH 015/625] Improves txqueues and txintervals analyzers for
 tthoma.py (e.g., include TCP packets when estimating NIC queue lengths)

---
 util/tthoma.py | 136 +++++++++++++++++++++++++++++++------------------
 1 file changed, 86 insertions(+), 50 deletions(-)

diff --git a/util/tthoma.py b/util/tthoma.py
index 6f799118..36aa161e 100755
--- a/util/tthoma.py
+++ b/util/tthoma.py
@@ -17,6 +17,7 @@
 import os
 from pathlib import Path
 import re
+from socket import NI_NUMERICHOST
 import string
 import sys
 import textwrap
@@ -241,15 +242,15 @@ def __missing__(self, key):
 # tx_gro_bytes    Bytes of data from this node received by GRO on other nodes
 #                 during the interval
 # tx_free_bytes:  Bytes of data freed after NIC notified tx completion
-# tx_max_free:    Largest value of pkt['free_tx_skb'] - pkt['nic'] for
-#                 a packet freed in this interval (0 if no packets freed)
-# tx_min_free:    Smallest value of pkt['free_tx_skb'] - pkt['nic'] for
-#                 a packet freed in this interval (0 if no packets freed)
-# tx_max_gro_free:Largest value of pkt['gro'] - pkt['free_tx_skb'] for
-#                 any segment of a packet freed in this interval (None if
+# tx_max_free:    Largest value of pkt['free_tx_skb'] - pkt['nic'] for a
+#                 packet passed to NIC in this interval (0 if no packets freed)
+# tx_min_free:    Smallest value of pkt['free_tx_skb'] - pkt['nic'] for a
+#                 packet passed to NIC in this interval (0 if no packets freed)
+# tx_max_gro_free:Largest value of pkt['gro'] - pkt['free_tx_skb'] for any
+#                 segment of a packet passed to NIC in this interval (None if
 #                 no packets freed)
-# tx_min_gro_free:Smallest value of pkt['gro'] - pkt['free_tx_skb'] for
-#                 any segment of a packet freed in this interval (None if
+# tx_min_gro_free:Smallest value of pkt['gro'] - pkt['free_tx_skb'] for any
+#                 segment of a packet passed to NIC in this interval (None if
 #                 no packets freed)
 # tx_grant_xmit:  Bytes of grant that have been passed to ip*xmit but not yet
 #                 received by GRO, as of the end of the interval
@@ -1523,6 +1524,16 @@ def __pacer_xmit(self, trace, time, core, match, interests):
             '([0-9]+), offset ([0-9]+), bytes_left ([0-9]+)'
     })
 
+    def __tcp_xmit(self, trace, time, core, match, interests):
+        length = int(match.group(1))
+        for interest in interests:
+            interest.tt_tcp_xmit(trace, time, core, length)
+
+    patterns.append({
+        'name': 'tcp_xmit',
+        'regexp': '__tcp_transmit_skb sent packet with ([0-9]+) bytes'
+    })
+
 #------------------------------------------------
 # Analyzer: activity
 #------------------------------------------------
@@ -3876,8 +3887,16 @@ def __init__(self, dispatcher):
         dispatcher.interest('AnalyzeRpcs')
         dispatcher.interest('AnalyzePackets')
         self.tx_qid = None
+
+        # Node name -> list of pairs, where time gives the
+        # time when a packet was handed off to the NIC and length gives
+        # the total length of the packet in bytes.
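# (Aside, not part of the patch: a self-contained sketch of how such
# per-node records could be produced. The regexp matches the pattern the
# patch registers above; the "time [core] message" line layout is an
# assumption about the timetrace format.)
import re
from collections import defaultdict

tcp_xmits = defaultdict(list)       # node name -> list of [time, length]
pat = re.compile(r'([0-9.]+) .*__tcp_transmit_skb sent packet with '
        r'([0-9]+) bytes')

def scan_tt(node, lines):
    for line in lines:
        m = pat.match(line)
        if m:
            tcp_xmits[node].append([float(m.group(1)), int(m.group(2))])

scan_tt('node1', ['1467.003 [C17] __tcp_transmit_skb sent packet with 1500 bytes'])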
+        self.tcp_xmits = defaultdict(list)
         return
 
+    def tt_tcp_xmit(self, trace, t, core, length):
+        self.tcp_xmits[trace['node']].append([t, length])
+
     def restrict_qid(self, qid):
         """
         Ignore all packets except those that use the given transmit queue.
@@ -4020,9 +4039,11 @@ def analyze(self):
         global rpcs, packets, grants, max_unsched, traces, options, intervals
 
         # Node name -> list of pairs, where time gives the
-        # time when a packet was handed off to the NIC and length gives
-        # the total length of the packet in bytes.
+        # time when a packet was handed off to the NIC (or passed to ip*xmit)
+        # and length gives the total length of the packet in bytes.
         node_xmits = defaultdict(list)
+        for node, xmits in self.tcp_xmits.items():
+            node_xmits[node].extend(xmits)
 
         # Total number of bytes a grant packet occupies on the wire, including
         # headers, inter-packet gap, etc.
@@ -4054,6 +4075,11 @@
                 continue
             length = pkt['length']
             txmit = pkt['xmit'] if 'xmit' in pkt else None
+            if 'nic' in pkt:
+                tnic = pkt['nic']
+                nic_interval = get_interval(tx_node, tnic)
+            else:
+                tnic = None
             tnic = pkt['nic'] if 'nic' in pkt else None
             tfree = pkt['free_tx_skb'] if 'free_tx_skb' in pkt else None
             tgro = pkt['gro'] if 'gro' in pkt else None
@@ -4075,9 +4101,8 @@
                 tnic = pkt['nic']
                 node_xmits[tx_node].append([pkt['nic'],
                         tso_length + data_overhead_bytes])
-                interval = get_interval(tx_node, tnic)
-                interval['tx_nic_pkts'] += 1
-                interval['tx_nic_bytes'] += tso_length
+                nic_interval['tx_nic_pkts'] += 1
+                nic_interval['tx_nic_bytes'] += tso_length
             elif txmit != None:
                 node_xmits[tx_node].append([txmit,
                         tso_length + data_overhead_bytes])
@@ -4089,11 +4114,11 @@
                 add_to_intervals(tx_node, tnic, tfree, 'tx_in_nic',
                         tso_length)
                 delay = tfree - tnic
-                if delay > interval['tx_max_free']:
-                    interval['tx_max_free'] = delay
-                if (interval['tx_min_free'] == 0) or (delay <
-                        interval['tx_min_free']):
-                    interval['tx_min_free'] = delay
+                if delay > nic_interval['tx_max_free']:
+                    nic_interval['tx_max_free'] = delay
+                if (nic_interval['tx_min_free'] == 0) or (delay <
+                        nic_interval['tx_min_free']):
+                    nic_interval['tx_min_free'] = delay
             else:
                 start = traces[tx_node]['first_time']
                 add_to_intervals(tx_node, start, tfree, 'tx_in_nic',
@@ -4130,16 +4155,14 @@
                 add_to_intervals(rx_node, tnic+late_usecs, tgro,
                         'rx_overdue', length)
 
-            if (tgro != None) and (tfree != None):
-                interval = get_interval(tx_node, tfree)
-                if interval != None:
-                    delay = tgro - tfree
-                    if (interval['tx_max_gro_free'] == None) or (delay >
-                            interval['tx_max_gro_free']):
-                        interval['tx_max_gro_free'] = delay
-                    if (interval['tx_min_gro_free'] == None) or (delay <
-                            interval['tx_min_gro_free']):
-                        interval['tx_min_gro_free'] = delay
+            if (tgro != None) and (tfree != None) and (tnic != None):
+                delay = tgro - tfree
+                if (nic_interval['tx_max_gro_free'] == None) or (delay >
+                        nic_interval['tx_max_gro_free']):
+                    nic_interval['tx_max_gro_free'] = delay
+                if (nic_interval['tx_min_gro_free'] == None) or (delay <
+                        nic_interval['tx_min_gro_free']):
+                    nic_interval['tx_min_gro_free'] = delay
 
             if 'softirq' in pkt:
                 tsoftirq = pkt['softirq']
@@ -7161,17 +7184,17 @@
             f.write('# FreeKB:  KB of skb data freed after NIC notified '
                     'transmission complete\n')
             f.write('# MinFr:   Smallest p[\'free_tx_skb\'] - p[\'nic\'] for a '
-                    'packet freed in\n')
-            f.write('#          this interval\n')
+                    'packet passed to\n')
+            f.write('#          NIC in this interval\n')
            f.write('# MaxFr:   Largest p[\'free_tx_skb\'] - p[\'nic\'] for a '
-                    'packet freed in\n')
-            f.write('#          this interval\n')
+                    'packet passed to\n')
+            f.write('#          NIC in this interval\n')
             f.write('# MinGF:   Smallest p[\'gro\'] - p[\'free_tx_skb\'] '
-                    'for any segment of\n')
-            f.write('#          a packet freed in this interval\n')
+                    'for any segment of a\n')
+            f.write('#          packet passed to NIC in this interval\n')
             f.write('# MaxGF:   Largest p[\'gro\'] - p[\'free_tx_skb\'] '
-                    'for any segment of\n')
-            f.write('#          a packet freed in this interval\n')
+                    'for any segment of a\n')
+            f.write('#          packet passed to NIC in this interval\n')
             f.write('# GXmit:   KB of grants that have been sent by peer '
                     'but not yet\n')
             f.write('           received by GRO\n')
@@ -7509,18 +7532,22 @@ class AnalyzeTxqueues:
     """
 
     def __init__(self, dispatcher):
-        # Node name -> list of tuples for all
-        # transmitted packets. Length is the packet length including Homa
-        # header but not IP or Ethernet overheads. Queue_length is the
-        # # bytes in the NIC queue as of time (includes this packet).
-        # Queue_length starts off zero and is updated later.
+        # Node name -> list of tuples for
+        # all transmitted packets. Length is the packet length including
+        # Homa/TCP header but not IP or Ethernet overheads. Queue_length is
+        # the # bytes in the NIC queue as of time (includes this packet).
+        # Queue_length starts off zero and is updated later. Type indicates
+        # the kind of packet: "homa_data", "homa_grant", or "tcp"
         self.nodes = defaultdict(list)
 
-    def tt_send_data(self, trace, time, core, id, offset, length):
-        self.nodes[trace['node']].append([time, length + 60, 0])
+    def tt_send_data(self, trace, t, core, id, offset, length):
+        self.nodes[trace['node']].append([t, length + 60, 0, "homa_data"])
 
-    def tt_send_grant(self, trace, time, core, id, offset, priority, increment):
-        self.nodes[trace['node']].append([time, 34, 0])
+    def tt_send_grant(self, trace, t, core, id, offset, priority, increment):
+        self.nodes[trace['node']].append([t, 34, 0, "homa_grant"])
+
+    def tt_tcp_xmit(self, trace, t, core, length):
+        self.nodes[trace['node']].append([t, length, 0, "tcp"])
 
     def output(self):
         global options, traces
@@ -7537,8 +7564,11 @@ def output(self):
         print('Time:      Time when worst-case queue length occurred')
         print('Delay:     Delay (usec until fully transmitted) experienced by packet ')
         print('           transmitted at Time')
+        print('P50:       Median delay experienced by Homa data packets')
+        print('P90:       90th percentile delay experienced by Homa data packets')
+        print('P99:       99th percentile delay experienced by Homa data packets')
         print('')
-        print('Node       MaxLength   Time       Delay')
+        print('Node       MaxLength   Time       Delay     P50     P90     P99')
 
         for node in get_sorted_nodes():
             pkts = self.nodes[node]
@@ -7550,7 +7580,7 @@ def output(self):
             cur_queue = 0
             prev_time = traces[node]['first_time']
             for i in range(len(pkts)):
-                time, length, ignore = pkts[i]
+                time, length, ignore, ignore2 = pkts[i]
 
                 # 20 bytes for IPv4 header, 42 bytes for Ethernet overhead (CRC,
                 # preamble, interpacket gap)
@@ -7575,8 +7605,14 @@ def output(self):
                     max_time = time
                 prev_time = time
                 pkts[i][2] = cur_queue
-            print('%-10s %9d  %9.3f %7.1f ' % (node, max_queue, max_time,
-                    (max_queue*8)/(options.gbps*1000)))
+            data_pkts = sorted(filter(lambda t: t[3] == 'homa_data', pkts),
+                    key=lambda t: t[2])
+            print('%-10s %9d  %9.3f %7.1f %7.1f %7.1f %7.1f' % (
+                    node, max_queue, max_time,
+                    (max_queue*8)/(options.gbps*1000),
+                    data_pkts[50*len(data_pkts)//100][2]*8/(options.gbps*1000),
+                    data_pkts[90*len(data_pkts)//100][2]*8/(options.gbps*1000),
+                    data_pkts[99*len(data_pkts)//100][2]*8/(options.gbps*1000)))
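# (Aside: a distilled sketch of the queue model used by this analyzer,
# assuming the NIC drains at the configured link rate; function and
# variable names are illustrative, not tthoma.py code.)
def estimate_queue(xmits, gbps):
    """xmits: time-sorted list of (time_usec, wire_bytes) transmissions.
    Returns the estimated NIC queue length in bytes just after each
    packet is queued."""
    drain = gbps * 125.0            # Gbps -> bytes per usec
    queue = 0.0
    prev = xmits[0][0] if xmits else 0.0
    result = []
    for t, nbytes in xmits:
        queue = max(0.0, queue - (t - prev)*drain) + nbytes
        result.append(queue)
        prev = t
    return result
# The queueing delay printed above is then queue_bytes*8/(gbps*1000) usec,
# which is the same conversion used for the Delay/P50/P90/P99 columns.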
        if options.data:
            # Print stats for each node at regular intervals
@@ -7602,7 +7638,7 @@ def output(self):
                     i = cur[node]
                     xmits = self.nodes[node]
                     while i < len(xmits):
-                        time, ignore, queue_length = xmits[i]
+                        time, ignore, queue_length, type = xmits[i]
                         if time > interval_end:
                             break
                         if queue_length > max:

From 0620c504f71f673be69a53fb7c1b149c9197a780 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Tue, 24 Sep 2024 16:16:03 -0700
Subject: [PATCH 016/625] Minor fix to timetracing in homa_outgoing.c

---
 homa_outgoing.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/homa_outgoing.c b/homa_outgoing.c
index a6f7c481..b8d9eaf9 100644
--- a/homa_outgoing.c
+++ b/homa_outgoing.c
@@ -431,7 +431,7 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer,
 		}
 	}
 	txq = netdev_get_tx_queue(skb->dev, skb->queue_mapping);
-	if (!netif_tx_queue_stopped(txq))
+	if (netif_tx_queue_stopped(txq))
 		tt_record4("__homa_xmit_control found stopped txq for id %d, "
 				"qid %d, num_queued %d, limit %d",
 				be64_to_cpu(h->sender_id),

From c904850eda8de09f239132c451a6e218aa99b6ba Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Tue, 24 Sep 2024 16:17:09 -0700
Subject: [PATCH 017/625] Updates to perf.txt (cp_both measurements)

---
 perf.txt | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/perf.txt b/perf.txt
index c5d7f748..4b10fb0b 100644
--- a/perf.txt
+++ b/perf.txt
@@ -2,6 +2,50 @@
 This file contains various notes and lessons learned concerning performance
 of the Homa Linux kernel module. The notes are in reverse chronological
 order.
 
+58. (September 2024): Interference between Homa and TCP when both run
+    concurrently on the same nodes (no special kernel code to mitigate
+    interference)
+    Experiment on xl170 cluster:
+    cp_both -n 9 --skip 0 -w w4 -b 20 -s 30
+
+    HomaGbps: Gbps generated by Homa (20 - HomaGbps is generated by TCP)
+    HAvg:     Average slowdown for Homa
+    HP50:     Median RTT for Homa short messages
+    HP99:     P99 RTT for Homa short messages
+    TAvg:     Average slowdown for TCP
+    TP50:     Median RTT for TCP short messages
+    TP99:     P99 RTT for TCP short messages
+
+    HomaGbps   HAvg   HP50   HP99   TAvg   TP50   TP99
+        0                          63.4    797   6089
+        2       8.1     66    335  80.5   1012  10131
+        4       8.6     65    507  80.0   1021   9315
+        6       9.9     66    765  80.8   1022   9328
+        8      12.1     68   1065  79.8   1042   8309
+       10      14.3     70   1324  76.7    993   6881
+       12      15.1     72   1394  73.4    971   5866
+       14      14.8     75   1305  73.1    927   6076
+       16      12.9     75   1077  70.2    816   6564
+       18      10.0     70    755  69.7    748   7387
+       20       4.4     44    119
+
+    Overall observations:
+    * Short messages:
+      * Homa: 10x increase for P99, not much change for P50
+      * TCP: 25-60% increases for both P50 and P99
+    * Long messages:
+      * TCP latency improves up to 2x as Homa traffic share increases (perhaps
+        because Homa throttles itself to link speed?)
+      * Homa latency not much affected
+    * Other workloads:
+      * W5 similar to W4
+      * W3 and W2 show less Homa degradation, more TCP degradation
+    * Estimated NIC queue lengths have gotten much longer (e.g. P99 queueing
+      delay of 235-750 us now, vs. < 10 us when Homa runs alone)
+      * Homa packets are experiencing even longer delays than this because
+        packets aren't distributed evenly across tx queues, while the NIC serves
+        queues evenly.
+
 57. 
(August 2024): Best known parameters for c6525-100g cluster: Homa: hijack_tcp=1 .unsched_bytes=20000 window=0 max_incoming=1000000 From 54a4c93d8c20819a4246091874c678cd4a3778d0 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 24 Sep 2024 16:17:09 -0700 Subject: [PATCH 018/625] Bugs in cp_both --- util/cp_both | 75 ++++++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/util/cp_both b/util/cp_both index ad945b92..dfe8e5fc 100755 --- a/util/cp_both +++ b/util/cp_both @@ -17,11 +17,11 @@ parser = get_parser(description= 'on the same nodes.', usage='%(prog)s [options]', defaults={'homa_gbps': 0}) parser.add_argument('--homa-gbps', type=float, dest='homa_gbps', - metavar='B', default=0, + metavar='B', default=None, help='Configure Homa to generate B Gbps of total outgoing bandwidth ' 'on each node (clients and servers combined); the remainder of ' - '--gbps will be generated by TCP. 0 means split --gbps between ' - 'Homa and TCP (default: 0)') + '--gbps will be generated by TCP (default: split --gbps between ' + 'Homa and TCP)') default_defaults['client_max'] options = parser.parse_args() init(options) @@ -36,14 +36,12 @@ if not options.plot_only: tcp_options.name = "tcp_" + options.workload tcp_options.protocol = "tcp" - if not options.homa_gbps: - homa_options.gbps = options.gbps/4.0 - tcp_options.gbps = homa_options.gbps - else: - tcp_options.gbps = (options.gbps - options.homa_gbps)/2 - if tcp_options.gbps < 0: - tcp_options.gbps = 0 - homa_options.gbps = options.gbps/2 - tcp_options.gbps + if options.homa_gbps == None: + options.homa_gbps = options.gbps/2.0 + tcp_options.gbps = (options.gbps - options.homa_gbps)/2 + if tcp_options.gbps < 0: + tcp_options.gbps = 0 + homa_options.gbps = options.gbps/2 - tcp_options.gbps try: run_experiments(homa_options, tcp_options) except Exception as e: @@ -53,32 +51,33 @@ if not options.plot_only: scan_logs() # Generate plots and reports - homa_exp = "homa_" + options.workload - scan_metrics(homa_exp) - tcp_exp = "tcp_" + options.workload - scan_metrics(tcp_exp) +homa_exp = "homa_" + options.workload +scan_metrics(homa_exp) +tcp_exp = "tcp_" + options.workload +scan_metrics(tcp_exp) - # Generate slowdown plot. - log("Generating slowdown plot for %s" % (options.workload)) - title = "TCP and Homa together %s %d nodes, %.1f Gbps" % ( - options.workload.capitalize(), options.num_nodes, options.gbps) - ax = start_slowdown_plot(title, 1000, homa_exp) - plot_slowdown(ax, tcp_exp, "p99", "TCP P99", color=tcp_color) - plot_slowdown(ax, tcp_exp, "p50", "TCP P50", color=tcp_color2) - plot_slowdown(ax, homa_exp, "p99", "Homa P99", color=homa_color) - plot_slowdown(ax, homa_exp, "p50", "Homa P50", color=homa_color2) - ax.legend(loc="upper right", prop={'size': 9}) - plt.tight_layout() - plt.savefig("%s/reports/both_%s.pdf" % (options.log_dir, options.workload)) +# Generate slowdown plot. 
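# (Aside: a generic matplotlib sketch of the short-message RTT CDF that
# this script builds later with its own helpers; the axis details and the
# sample RTTs below are made up.)
import matplotlib.pyplot as plt
import numpy as np

def plot_rtt_cdf(rtts_usec, label):
    x = np.sort(np.asarray(rtts_usec, dtype=float))
    y = np.arange(1, len(x) + 1) / len(x)
    plt.plot(x, y, label=label)

plot_rtt_cdf([40, 55, 60, 90, 400, 2000], 'Homa')
plot_rtt_cdf([800, 1000, 1200, 2500, 6000, 10000], 'TCP')
plt.xscale('log')
plt.xlabel('RTT (usecs)')
plt.ylabel('Cumulative Fraction Short Messages')
plt.legend(loc='upper right')
plt.savefig('short_cdf.pdf')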
+log("Generating slowdown plot for %s" % (options.workload)) +title = "TCP (%.1f Gbps) and Homa (%.1f Gbps) together, %s %d nodes" % ( + options.gbps - options.homa_gbps, options.homa_gbps, + options.workload.capitalize(), options.num_nodes) +ax = start_slowdown_plot(title, 1000, homa_exp) +plot_slowdown(ax, tcp_exp, "p99", "TCP P99", color=tcp_color) +plot_slowdown(ax, tcp_exp, "p50", "TCP P50", color=tcp_color2) +plot_slowdown(ax, homa_exp, "p99", "Homa P99", color=homa_color) +plot_slowdown(ax, homa_exp, "p50", "Homa P50", color=homa_color2) +ax.legend(loc="upper right", prop={'size': 9}) +plt.tight_layout() +plt.savefig("%s/reports/both_%s.pdf" % (options.log_dir, options.workload)) - # Generate CDF of small-message RTTs. - log("Generating short message CDF for %s" % (options.workload)) - homa_x, homa_y = get_short_cdf(homa_exp) - tcp_x, tcp_y = get_short_cdf(tcp_exp) - start_cdf_plot(title, 10, 0.99e05, 1e-05, "RTT (usecs)", - "Cumulative Fraction Short Messages") - plt.plot(tcp_x, tcp_y, label="TCP", color=tcp_color) - plt.plot(homa_x, homa_y, label="Homa", color=homa_color) - plt.legend(loc="upper right", prop={'size': 9}) - plt.savefig("%s/reports/short_cdf_%s.pdf" % (options.log_dir, - options.workload)) +# Generate CDF of small-message RTTs. +log("Generating short message CDF for %s" % (options.workload)) +homa_x, homa_y = get_short_cdf(homa_exp) +tcp_x, tcp_y = get_short_cdf(tcp_exp) +start_cdf_plot(title, 10, 0.99e05, 1e-05, "RTT (usecs)", + "Cumulative Fraction Short Messages") +plt.plot(tcp_x, tcp_y, label="TCP", color=tcp_color) +plt.plot(homa_x, homa_y, label="Homa", color=homa_color) +plt.legend(loc="upper right", prop={'size': 9}) +plt.savefig("%s/reports/short_cdf_%s.pdf" % (options.log_dir, + options.workload)) From 17c11030d7fda6537374b426b7e39f4fb5337716 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 25 Sep 2024 17:31:19 -0700 Subject: [PATCH 019/625] Fix style problems found by checkpatch --- homa.h | 18 +- homa_api.c | 5 +- homa_grant.c | 70 ++--- homa_impl.h | 304 +++++++++++----------- homa_incoming.c | 143 +++++------ homa_offload.c | 68 +++-- homa_outgoing.c | 93 +++---- homa_peertab.c | 68 ++--- homa_plumbing.c | 220 ++++++++-------- homa_pool.c | 61 +++-- homa_skb.c | 53 ++-- homa_socktab.c | 25 +- homa_timer.c | 43 ++-- homa_utils.c | 526 ++++++++++++++------------------------ test/unit_homa_grant.c | 2 +- test/unit_homa_incoming.c | 2 +- test/unit_homa_outgoing.c | 2 +- test/unit_homa_plumbing.c | 14 +- test/unit_homa_pool.c | 4 +- test/unit_homa_timer.c | 2 +- test/unit_homa_utils.c | 2 +- test/unit_timetrace.c | 2 +- test/utils.c | 2 +- timetrace.c | 154 ++++++----- timetrace.h | 27 +- 25 files changed, 860 insertions(+), 1050 deletions(-) diff --git a/homa.h b/homa.h index 4eae9544..c8ac00a2 100644 --- a/homa.h +++ b/homa.h @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2022 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +/* SPDX-License-Identifier: BSD-2-Clause */ /* This file defines the kernel call interface for the Homa * transport protocol. @@ -57,11 +55,11 @@ extern "C" * Holds either an IPv4 or IPv6 address (smaller and easier to use than * sockaddr_storage). */ -typedef union sockaddr_in_union { +union sockaddr_in_union { struct sockaddr sa; struct sockaddr_in in4; struct sockaddr_in6 in6; -} sockaddr_in_union; +}; /** * struct homa_sendmsg_args - Provides information needed by Homa's @@ -122,7 +120,7 @@ struct homa_recvmsg_args { * always be set when peer information is available, which includes * some error cases. 
*/ - sockaddr_in_union peer_addr; + union sockaddr_in_union peer_addr; /** * @num_bpages: (in/out) Number of valid entries in @bpage_offsets. @@ -216,16 +214,16 @@ struct homa_set_buf_args { extern int homa_abortp(int fd, struct homa_abort_args *args); extern int homa_send(int sockfd, const void *message_buf, - size_t length, const sockaddr_in_union *dest_addr, + size_t length, const union sockaddr_in_union *dest_addr, uint64_t *id, uint64_t completion_cookie); extern int homa_sendv(int sockfd, const struct iovec *iov, - int iovcnt, const sockaddr_in_union *dest_addr, + int iovcnt, const union sockaddr_in_union *dest_addr, uint64_t *id, uint64_t completion_cookie); extern ssize_t homa_reply(int sockfd, const void *message_buf, - size_t length, const sockaddr_in_union *dest_addr, + size_t length, const union sockaddr_in_union *dest_addr, uint64_t id); extern ssize_t homa_replyv(int sockfd, const struct iovec *iov, - int iovcnt, const sockaddr_in_union *dest_addr, + int iovcnt, const union sockaddr_in_union *dest_addr, uint64_t id); extern int homa_abort(int sockfd, uint64_t id, int error); diff --git a/homa_api.c b/homa_api.c index cdb719ea..c2b31f43 100644 --- a/homa_api.c +++ b/homa_api.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2022 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause /* This file contains functions that implement the Homa API visible to * applications. It is intended to be part of the user-level run-time library. @@ -204,5 +202,6 @@ int homa_sendv(int sockfd, const struct iovec *iov, int iovcnt, int homa_abort(int sockfd, uint64_t id, int error) { struct homa_abort_args args = {id, error}; + return ioctl(sockfd, HOMAIOCABORT, &args); } diff --git a/homa_grant.c b/homa_grant.c index bf542e2c..6bb84672 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2024 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause /* This file contains functions related to issuing grants for incoming * messages. @@ -39,14 +37,17 @@ inline int homa_grant_outranks(struct homa_rpc *rpc1, struct homa_rpc *rpc2) * may be possible to send out additional grants to some RPCs (doing * this is left to the caller). */ -inline int homa_grant_update_incoming(struct homa_rpc *rpc, struct homa *homa) { +inline int homa_grant_update_incoming(struct homa_rpc *rpc, struct homa *homa) +{ int incoming = rpc->msgin.granted - (rpc->msgin.length - rpc->msgin.bytes_remaining); + if (incoming < 0) incoming = 0; if (incoming != rpc->msgin.rec_incoming) { int delta = incoming - rpc->msgin.rec_incoming; int old = atomic_fetch_add(delta, &homa->total_incoming); + rpc->msgin.rec_incoming = incoming; return ((old >= homa->max_incoming) && ((old + delta) < homa->max_incoming)); @@ -76,6 +77,7 @@ void homa_grant_add_rpc(struct homa_rpc *rpc) * the peer's list. */ __u64 time = get_cycles(); + INC_METRIC(grantable_rpcs_integral, homa->num_grantable_rpcs * (time - homa->last_grantable_change)); homa->last_grantable_change = time; @@ -94,20 +96,22 @@ void homa_grant_add_rpc(struct homa_rpc *rpc) } } list_add_tail(&rpc->grantable_links, &peer->grantable_rpcs); - } else while (rpc != list_first_entry(&peer->grantable_rpcs, - struct homa_rpc, grantable_links)) { - /* Message is on the list, but its priority may have - * increased because of the recent packet arrival. If so, - * adjust its position in the list. 
- */ - candidate = list_prev_entry(rpc, grantable_links); - if (!homa_grant_outranks(rpc, candidate)) - goto position_peer; - __list_del_entry(&candidate->grantable_links); - list_add(&candidate->grantable_links, &rpc->grantable_links); + } else { + while (rpc != list_first_entry(&peer->grantable_rpcs, + struct homa_rpc, grantable_links)) { + /* Message is on the list, but its priority may have + * increased because of the recent packet arrival. If + * so, adjust its position in the list. + */ + candidate = list_prev_entry(rpc, grantable_links); + if (!homa_grant_outranks(rpc, candidate)) + goto position_peer; + __list_del_entry(&candidate->grantable_links); + list_add(&candidate->grantable_links, &rpc->grantable_links); + } } - position_peer: +position_peer: /* At this point rpc is positioned correctly on the list for its peer. * However, the peer may need to be added to, or moved upward on, * homa->grantable_peers. @@ -127,8 +131,8 @@ void homa_grant_add_rpc(struct homa_rpc *rpc) list_add_tail(&peer->grantable_links, &homa->grantable_peers); goto done; } - /* The peer is on Homa's list, but it may need to move upward. */ - while (peer != list_first_entry(&homa->grantable_peers, + /* The peer is on Homa's list, but it may need to move upward. */ + while (peer != list_first_entry(&homa->grantable_peers, struct homa_peer, grantable_links)) { struct homa_peer *prev_peer = list_prev_entry( peer, grantable_links); @@ -139,7 +143,7 @@ void homa_grant_add_rpc(struct homa_rpc *rpc) __list_del_entry(&prev_peer->grantable_links); list_add(&prev_peer->grantable_links, &peer->grantable_links); } - done: +done: } /** @@ -188,7 +192,7 @@ void homa_grant_remove_rpc(struct homa_rpc *rpc) */ head = list_first_entry(&peer->grantable_rpcs, struct homa_rpc, grantable_links); - while (peer != list_last_entry(&homa->grantable_peers, struct homa_peer, + while (peer != list_last_entry(&homa->grantable_peers, struct homa_peer, grantable_links)) { struct homa_peer *next_peer = list_next_entry( peer, grantable_links); @@ -245,10 +249,10 @@ int homa_grant_send(struct homa_rpc *rpc, struct homa *homa) grant.priority = rpc->msgin.priority; grant.resend_all = rpc->msgin.resend_all; rpc->msgin.resend_all = 0; - tt_record4("sending grant for id %llu, offset %d, priority %d, " - "increment %d", rpc->id, rpc->msgin.granted, - rpc->msgin.priority, increment); - homa_xmit_control(GRANT, &grant, sizeof(grant),rpc); + tt_record4("sending grant for id %llu, offset %d, priority %d, increment %d", + rpc->id, rpc->msgin.granted, rpc->msgin.priority, + increment); + homa_xmit_control(GRANT, &grant, sizeof(grant), rpc); return 1; } @@ -286,20 +290,20 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) } if (rpc->msgin.granted >= rpc->msgin.length) { - homa_grant_update_incoming(rpc,homa); + homa_grant_update_incoming(rpc, homa); homa_rpc_unlock(rpc); goto done; } - tt_record4("homa_grant_check_rpc starting for id %d, granted %d, " - "recv_end %d, length %d", rpc->id, rpc->msgin.granted, - rpc->msgin.recv_end, rpc->msgin.length); + tt_record4("homa_grant_check_rpc starting for id %d, granted %d, recv_end %d, length %d", + rpc->id, rpc->msgin.granted, rpc->msgin.recv_end, + rpc->msgin.length); /* This message requires grants; if it is a new message, set up * granting. 
*/ if (list_empty(&rpc->grantable_links)) { - homa_grant_update_incoming(rpc,homa); + homa_grant_update_incoming(rpc, homa); homa_grantable_lock(homa, 0); homa_grant_add_rpc(rpc); recalc = ((homa->num_active_rpcs < homa->max_overcommit) @@ -355,7 +359,7 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) homa_rpc_unlock(rpc); if (recalc) homa_grant_recalc(homa, 0); - done: +done: tt_record1("homa_grant_check_rpc finished with id %d", rpc->id); } @@ -405,9 +409,8 @@ void homa_grant_recalc(struct homa *homa, int locked) atomic_inc(&homa->grant_recalc_count); /* Clear the existing grant calculation. */ - for (i = 0; i < homa->num_active_rpcs; i++) { + for (i = 0; i < homa->num_active_rpcs; i++) atomic_set(&homa->active_rpcs[i]->msgin.rank, -1); - } /* Recompute which RPCs we'll grant to and initialize info * about them. @@ -653,8 +656,7 @@ int homa_grantable_lock_slow(struct homa *homa, int recalc) } if (recalc && atomic_read(&homa->grant_recalc_count) != starting_count) { - tt_record("skipping wait for grantable lock: recalc " - "elsewhere"); + tt_record("skipping wait for grantable lock: recalc elsewhere"); break; } } diff --git a/homa_impl.h b/homa_impl.h index a0e68e0a..b8ec2208 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +/* SPDX-License-Identifier: BSD-2-Clause */ /* This file contains definitions that are shared across the files * that implement Homa for Linux. @@ -50,14 +48,10 @@ #pragma GCC diagnostic warning "-Wpointer-sign" #pragma GCC diagnostic warning "-Wunused-variable" -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,16,0) -typedef unsigned int __poll_t; -#endif - #ifdef __UNIT_TEST__ #undef alloc_pages #define alloc_pages mock_alloc_pages -extern struct page *mock_alloc_pages(gfp_t gfp, unsigned order); +extern struct page *mock_alloc_pages(gfp_t gfp, unsigned int order); #define compound_order mock_compound_order extern unsigned int mock_compound_order(struct page *page); @@ -74,12 +68,15 @@ extern struct task_struct *current_task; extern cycles_t mock_get_cycles(void); #define get_page mock_get_page - extern void mock_get_page(struct page *page); +extern void mock_get_page(struct page *page); #undef kmalloc #define kmalloc mock_kmalloc extern void *mock_kmalloc(size_t size, gfp_t flags); +#undef kmalloc_array +#define kmalloc_array(count, size, type) mock_kmalloc(count*size, type) + #define kthread_complete_and_exit(comp, code) #ifdef page_address @@ -317,7 +314,8 @@ struct common_header { /** * @checksum: not used by Homa, but must occupy the same bytes as - * the checksum in a TCP header (TSO may modify this?).*/ + * the checksum in a TCP header (TSO may modify this?). + */ __be16 checksum; /** @@ -335,7 +333,7 @@ struct common_header { * this RPC). */ __be64 sender_id; -} __attribute__((packed)); +} __packed; /** * struct homa_ack - Identifies an RPC that can be safely deleted by its @@ -359,7 +357,7 @@ struct homa_ack { /** @server_port: The server-side port for the RPC. */ __be16 server_port; -} __attribute__((packed)); +} __packed; /* struct data_header - Contains data for part or all of a Homa message. * An incoming packet consists of a data_header followed by message data. @@ -415,7 +413,7 @@ struct seg_header { * value will always be valid once the packet reaches homa_softirq. */ __be32 offset; -} __attribute__((packed)); +} __packed; struct data_header { struct common_header common; @@ -459,17 +457,14 @@ struct data_header { /** @seg: First of possibly many segments. 
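
One caution about the kmalloc_array mock above: its third parameter is really a gfp flags word despite being named "type", and the expansion does not parenthesize its arguments, so an expression argument such as n + 1 would expand incorrectly. A more defensive form (a sketch; the simpler form is adequate for the existing unit tests, which pass plain identifiers) would be:

	#undef kmalloc_array
	#define kmalloc_array(count, size, flags) \
		mock_kmalloc((count) * (size), (flags))
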
*/ struct seg_header seg; -} __attribute__((packed)); +} __packed; _Static_assert(sizeof(struct data_header) <= HOMA_MAX_HEADER, - "data_header too large for HOMA_MAX_HEADER; must " - "adjust HOMA_MAX_HEADER"); + "data_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); _Static_assert(sizeof(struct data_header) >= HOMA_MIN_PKT_LENGTH, - "data_header too small: Homa doesn't currently have code" - "to pad data packets"); + "data_header too small: Homa doesn't currently have code to pad data packets"); _Static_assert(((sizeof(struct data_header) - sizeof(struct seg_header)) & 0x3) == 0, - " data_header length not a multiple of 4 bytes (required " - "for TCP/TSO compatibility"); + "data_header length not a multiple of 4 bytes (required for TCP/TSO compatibility)"); /** * homa_data_len() - Returns the total number of bytes in a DATA packet @@ -511,10 +506,9 @@ struct grant_header { * that no packets have been successfully received). */ __u8 resend_all; -} __attribute__((packed)); +} __packed; _Static_assert(sizeof(struct grant_header) <= HOMA_MAX_HEADER, - "grant_header too large for HOMA_MAX_HEADER; must " - "adjust HOMA_MAX_HEADER"); + "grant_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** * struct resend_header - Wire format for RESEND packets. @@ -550,10 +544,9 @@ struct resend_header { * priority. */ __u8 priority; -} __attribute__((packed)); +} __packed; _Static_assert(sizeof(struct resend_header) <= HOMA_MAX_HEADER, - "resend_header too large for HOMA_MAX_HEADER; must " - "adjust HOMA_MAX_HEADER"); + "resend_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** * struct unknown_header - Wire format for UNKNOWN packets. @@ -567,10 +560,9 @@ _Static_assert(sizeof(struct resend_header) <= HOMA_MAX_HEADER, struct unknown_header { /** @common: Fields common to all packet types. */ struct common_header common; -} __attribute__((packed)); +} __packed; _Static_assert(sizeof(struct unknown_header) <= HOMA_MAX_HEADER, - "unknown_header too large for HOMA_MAX_HEADER; must " - "adjust HOMA_MAX_HEADER"); + "unknown_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** * struct busy_header - Wire format for BUSY packets. @@ -581,10 +573,9 @@ _Static_assert(sizeof(struct unknown_header) <= HOMA_MAX_HEADER, struct busy_header { /** @common: Fields common to all packet types. */ struct common_header common; -} __attribute__((packed)); +} __packed; _Static_assert(sizeof(struct busy_header) <= HOMA_MAX_HEADER, - "busy_header too large for HOMA_MAX_HEADER; must " - "adjust HOMA_MAX_HEADER"); + "busy_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** * struct cutoffs_header - Wire format for CUTOFFS packets. @@ -609,10 +600,9 @@ struct cutoffs_header { * this packet. */ __be16 cutoff_version; -} __attribute__((packed)); +} __packed; _Static_assert(sizeof(struct cutoffs_header) <= HOMA_MAX_HEADER, - "cutoffs_header too large for HOMA_MAX_HEADER; must " - "adjust HOMA_MAX_HEADER"); + "cutoffs_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** * struct freeze_header - Wire format for FREEZE packets. @@ -623,10 +613,9 @@ _Static_assert(sizeof(struct cutoffs_header) <= HOMA_MAX_HEADER, struct freeze_header { /** @common: Fields common to all packet types.
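
Every field in these __packed wire headers is big-endian (__be16/__be32/__be64), so host-side code must convert before use. A minimal sketch using the common_header declared above and the standard kernel byte-order helper:

	static inline __u64 example_sender_id(const void *pkt)
	{
		const struct common_header *h = pkt;

		/* Wire format is big-endian; convert for host-side use. */
		return be64_to_cpu(h->sender_id);
	}
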
*/ struct common_header common; -} __attribute__((packed)); +} __packed; _Static_assert(sizeof(struct freeze_header) <= HOMA_MAX_HEADER, - "freeze_header too large for HOMA_MAX_HEADER; must " - "adjust HOMA_MAX_HEADER"); + "freeze_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** * struct need_ack_header - Wire format for NEED_ACK packets. @@ -637,10 +626,9 @@ _Static_assert(sizeof(struct freeze_header) <= HOMA_MAX_HEADER, struct need_ack_header { /** @common: Fields common to all packet types. */ struct common_header common; -} __attribute__((packed)); +} __packed; _Static_assert(sizeof(struct need_ack_header) <= HOMA_MAX_HEADER, - "need_ack_header too large for HOMA_MAX_HEADER; must " - "adjust HOMA_MAX_HEADER"); + "need_ack_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** * struct ack_header - Wire format for ACK packets. @@ -657,10 +645,9 @@ struct ack_header { __be16 num_acks; struct homa_ack acks[NUM_PEER_UNACKED_IDS]; -} __attribute__((packed)); +} __packed; _Static_assert(sizeof(struct ack_header) <= HOMA_MAX_HEADER, - "ack_header too large for HOMA_MAX_HEADER; must " - "adjust HOMA_MAX_HEADER"); + "ack_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** * struct homa_message_out - Describes a message (either request or response) @@ -799,7 +786,7 @@ struct homa_message_in { * Never larger than @length. Note: once initialized, this * may not be modified without holding @homa->grantable_lock. */ - int granted; + int granted; /** * @rec_incoming: Number of bytes in homa->total_incoming currently @@ -896,7 +883,7 @@ struct homa_interest { * of a struct homa_interest. * @interest: Struct to initialize. */ -inline static void homa_interest_init(struct homa_interest *interest) +static inline void homa_interest_init(struct homa_interest *interest) { interest->thread = current; atomic_long_set(&interest->ready_rpc, 0); @@ -1132,10 +1119,11 @@ struct homa_rpc { * trace. * @rpc: RPC to validate. */ -inline static void homa_rpc_validate(struct homa_rpc *rpc) { +static inline void homa_rpc_validate(struct homa_rpc *rpc) +{ if (rpc->magic == HOMA_RPC_MAGIC) return; - printk(KERN_ERR "Accessing reaped Homa RPC!\n"); + pr_err("Accessing reaped Homa RPC!\n"); BUG(); } @@ -1158,7 +1146,7 @@ struct homa_socktab { * for socket lookups (RCU is used instead). Also used to * synchronize port allocation. */ - struct spinlock write_lock; + spinlock_t write_lock; /** * @buckets: Heads of chains for hash table buckets. Chains @@ -1221,7 +1209,7 @@ struct homa_rpc_bucket { * this bucket. This dual purpose permits clean and safe * deletion and garbage collection of RPCs. */ - struct spinlock lock; + spinlock_t lock; /** @rpcs: list of RPCs that hash to this bucket. */ struct hlist_head rpcs; @@ -1247,7 +1235,7 @@ struct homa_bpage { struct homa_cache_line cache_line; struct { /** @lock: to synchronize shared access. */ - struct spinlock lock; + spinlock_t lock; /** * @refs: Counts number of distinct uses of this @@ -1393,11 +1381,12 @@ struct homa_sock { * spin lock). See sync.txt for more on Homa's synchronization * strategy. */ - struct spinlock lock; + spinlock_t lock; /** * @last_locker: identifies the code that most recently acquired - * @lock successfully. Occasionally used for debugging. */ + * @lock successfully. Occasionally used for debugging. + */ char *last_locker; /** @@ -1552,7 +1541,7 @@ struct homa_peertab { * @write_lock: Synchronizes addition of new entries; not needed * for lookups (RCU is used instead). 
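
The struct spinlock to spinlock_t conversions in this section rely on the standard kernel locking idiom; for reference, a minimal self-contained sketch (hypothetical structure, standard spin_lock_* API):

	struct example_table {
		spinlock_t lock;	/* protects @count */
		int count;
	};

	static void example_init(struct example_table *t)
	{
		spin_lock_init(&t->lock);
		t->count = 0;
	}

	static void example_bump(struct example_table *t)
	{
		spin_lock_bh(&t->lock);	/* bottom-half safe, as Homa requires */
		t->count++;
		spin_unlock_bh(&t->lock);
	}
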
*/ - struct spinlock write_lock; + spinlock_t write_lock; /** * @dead_dsts: List of dst_entries that are waiting to be deleted. @@ -1687,7 +1676,7 @@ struct homa_peer { /** * @ack_lock: used to synchronize access to @num_acks and @acks. */ - struct spinlock ack_lock; + spinlock_t ack_lock; }; /** @@ -1709,8 +1698,7 @@ enum homa_freeze_type { * use by a single NUMA node. Access to these objects is synchronized with * @homa->page_pool_mutex. */ -struct homa_page_pool -{ +struct homa_page_pool { /** @avail: Number of free pages currently in the pool. */ int avail; @@ -1751,13 +1739,13 @@ struct homa { * it could be a severe underestimate if there is competing traffic * from, say, TCP. Access only with atomic ops. */ - atomic64_t link_idle_time __attribute__((aligned(CACHE_LINE_SIZE))); + atomic64_t link_idle_time __aligned(CACHE_LINE_SIZE); /** * @grantable_lock: Used to synchronize access to grant-related * fields below, from @grantable_peers to @last_grantable_change. */ - struct spinlock grantable_lock __attribute__((aligned(CACHE_LINE_SIZE))); + spinlock_t grantable_lock __aligned(CACHE_LINE_SIZE); /** * @grantable_lock_time: get_cycles() time when grantable_lock @@ -1856,7 +1844,7 @@ struct homa { * @pacer_mutex: Ensures that only one instance of homa_pacer_xmit * runs at a time. Only used in "try" mode: never block on this. */ - struct spinlock pacer_mutex __attribute__((aligned(CACHE_LINE_SIZE))); + spinlock_t pacer_mutex __aligned(CACHE_LINE_SIZE); /** * @pacer_fifo_fraction: The fraction of time (in thousandths) when @@ -1882,7 +1870,7 @@ struct homa { * insert or remove an RPC from throttled_rpcs, must first acquire * the RPC's socket lock, then this lock. */ - struct spinlock throttle_lock; + spinlock_t throttle_lock; /** * @throttled_rpcs: Contains all homa_rpcs that have bytes ready @@ -1918,7 +1906,7 @@ struct homa { * a peer sends more bytes than granted (see synchronization note in * homa_send_grants for why we have to allow this possibility). */ - atomic_t total_incoming __attribute__((aligned(CACHE_LINE_SIZE))); + atomic_t total_incoming __aligned(CACHE_LINE_SIZE); /** * @next_client_port: A client port number to consider for the @@ -1926,23 +1914,23 @@ struct homa { * be in the range allocated for servers; must check before using. * This port may also be in use already; must check. */ - __u16 next_client_port __attribute__((aligned(CACHE_LINE_SIZE))); + __u16 next_client_port __aligned(CACHE_LINE_SIZE); /** * @port_map: Information about all open sockets. */ - struct homa_socktab port_map __attribute__((aligned(CACHE_LINE_SIZE))); + struct homa_socktab port_map __aligned(CACHE_LINE_SIZE); /** * @peertab: Info about all the other hosts we have communicated with. */ struct homa_peertab peers; - /** + /** * @page_pool_mutex: Synchronizes access to any/all of the page_pools * used for outgoing sk_buff data. */ - struct spinlock page_pool_mutex __attribute__((aligned(CACHE_LINE_SIZE))); + spinlock_t page_pool_mutex __aligned(CACHE_LINE_SIZE); /** * @skb_page_frees_per_sec: Rate at which to return pages from sk_buff @@ -1957,7 +1945,7 @@ struct homa { */ struct page **skb_pages_to_free; - /** + /** * @pages_to_free_slot: Maximum number of pages that can be * stored in skb_pages_to_free; */ @@ -2290,7 +2278,7 @@ struct homa { * @metrics_lock: Used to synchronize accesses to @metrics_active_opens * and updates to @metrics. 
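
The __aligned(CACHE_LINE_SIZE) conversions above serve the same purpose as the old __attribute__ form: placing heavily written fields on separate cache lines so cores updating different fields do not false-share. A stand-alone sketch (hypothetical struct; CACHE_LINE_SIZE is the module's own constant, and 64 bytes is typical on x86):

	#define EXAMPLE_CACHE_LINE 64

	struct hot_counters {
		/* Updated by the pacer; gets its own cache line. */
		atomic64_t link_idle_time __aligned(EXAMPLE_CACHE_LINE);

		/* Updated by receivers; a separate line avoids ping-ponging. */
		atomic_t total_incoming __aligned(EXAMPLE_CACHE_LINE);
	};
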
*/ - struct spinlock metrics_lock; + spinlock_t metrics_lock; /* * @metrics: a human-readable string containing recent values @@ -2298,7 +2286,7 @@ struct homa { * homa_append_metric. This string is kmalloc-ed; NULL means * homa_append_metric has never been called. */ - char* metrics; + char *metrics; /** @metrics_capacity: number of bytes available at metrics. */ size_t metrics_capacity; @@ -2547,7 +2535,8 @@ struct homa_metrics { __u64 send_cycles; /** @send_calls: total number of invocations of homa_semdmsg - * for requests. */ + * for requests. + */ __u64 send_calls; /** @@ -3068,17 +3057,17 @@ struct homa_core { */ __u64 last_app_active; - /** - * held_skb: last packet buffer known to be available for - * merging other packets into on this core (note: may not still - * be available), or NULL if none. - */ - struct sk_buff *held_skb; + /** + * held_skb: last packet buffer known to be available for + * merging other packets into on this core (note: may not still + * be available), or NULL if none. + */ + struct sk_buff *held_skb; /** * @held_bucket: the index, within napi->gro_hash, of the list - * containing @held_skb; undefined if @held_skb is NULL. Used to - * verify that @held_skb is still available. + * containing @held_skb; undefined if @held_skb is NULL. Used to + * verify that @held_skb is still available. */ int held_bucket; @@ -3101,7 +3090,7 @@ struct homa_core { */ int rpcs_locked; - /** + /** * @skb_page: a page of data available being used for skb frags. * This pointer is included in the page's reference count. */ @@ -3130,7 +3119,7 @@ struct homa_core { */ int num_stashed_pages; - /** + /** * @stashed_pages: use to prefetch from the cache all of the pages a * message will need with a single operation, to avoid having to * synchronize separately for each page. Note: these pages are all @@ -3168,7 +3157,7 @@ struct homa_skb_info { */ int data_bytes; - /** @seg_length: maximum number of data bytes in each GSO segment. */ + /** @seg_length: maximum number of data bytes in each GSO segment. */ int seg_length; /** @@ -3224,13 +3213,14 @@ static inline __u64 homa_local_id(__be64 sender_id) * @locker: Static string identifying the locking code. Normally ignored, * but used occasionally for diagnostics and debugging. */ -inline static void homa_bucket_lock(struct homa_rpc_bucket *bucket, - __u64 id, char *locker) +static inline void homa_bucket_lock(struct homa_rpc_bucket *bucket, + __u64 id, const char *locker) { int core = raw_smp_processor_id(); + if (!spin_trylock_bh(&bucket->lock)) homa_bucket_lock_slow(bucket, id); - homa_cores[core]->rpcs_locked ++; + homa_cores[core]->rpcs_locked++; BUG_ON(homa_cores[core]->rpcs_locked > 1); } @@ -3244,13 +3234,14 @@ inline static void homa_bucket_lock(struct homa_rpc_bucket *bucket, * Return: Nonzero if lock was successfully acquired, zero if it is * currently owned by someone else. */ -inline static int homa_bucket_try_lock(struct homa_rpc_bucket *bucket, - __u64 id, char *locker) +static inline int homa_bucket_try_lock(struct homa_rpc_bucket *bucket, + __u64 id, const char *locker) { int core = raw_smp_processor_id(); + if (!spin_trylock_bh(&bucket->lock)) return 0; - homa_cores[core]->rpcs_locked ++; + homa_cores[core]->rpcs_locked++; BUG_ON(homa_cores[core]->rpcs_locked > 1); return 1; } @@ -3260,7 +3251,7 @@ inline static int homa_bucket_try_lock(struct homa_rpc_bucket *bucket, * @bucket: Bucket to unlock. * @id: ID of the RPC that was using the lock. 
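
homa_bucket_lock above follows Homa's usual trylock-then-slow-path pattern: the uncontended case stays cheap, while the out-of-line slow path can spin and record contention. A generic sketch of the idiom (names are stand-ins, not module API):

	static inline void tracked_lock(spinlock_t *lock, atomic64_t *misses)
	{
		if (!spin_trylock_bh(lock)) {
			atomic64_inc(misses);	/* contention: count, then block */
			spin_lock_bh(lock);
		}
	}
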
*/ -inline static void homa_bucket_unlock(struct homa_rpc_bucket *bucket, __u64 id) +static inline void homa_bucket_unlock(struct homa_rpc_bucket *bucket, __u64 id) { homa_cores[raw_smp_processor_id()]->rpcs_locked--; spin_unlock_bh(&bucket->lock); @@ -3279,7 +3270,8 @@ inline static void homa_bucket_unlock(struct homa_rpc_bucket *bucket, __u64 id) * @locker: Static string identifying the locking code. Normally ignored, * but used occasionally for diagnostics and debugging. */ -inline static void homa_rpc_lock(struct homa_rpc *rpc, char *locker) { +static inline void homa_rpc_lock(struct homa_rpc *rpc, const char *locker) +{ homa_bucket_lock(rpc->bucket, rpc->id, locker); } @@ -3287,7 +3279,8 @@ inline static void homa_rpc_lock(struct homa_rpc *rpc, char *locker) { * homa_rpc_unlock() - Release the lock for an RPC. * @rpc: RPC to unlock. */ -inline static void homa_rpc_unlock(struct homa_rpc *rpc) { +static inline void homa_rpc_unlock(struct homa_rpc *rpc) +{ homa_bucket_unlock(rpc->bucket, rpc->id); } @@ -3321,7 +3314,7 @@ static inline struct homa_rpc_bucket *homa_client_rpc_bucket( */ static inline struct sk_buff **homa_next_skb(struct sk_buff *skb) { - return (struct sk_buff **) (skb_end_pointer(skb) - sizeof(char*)); + return (struct sk_buff **) (skb_end_pointer(skb) - sizeof(char *)); } /** @@ -3384,7 +3377,8 @@ static inline struct homa_sock *homa_sk(const struct sock *sk) * @locker: Static string identifying where the socket was locked; * used to track down deadlocks. */ -static inline void homa_sock_lock(struct homa_sock *hsk, char *locker) { +static inline void homa_sock_lock(struct homa_sock *hsk, const char *locker) +{ if (!spin_trylock_bh(&hsk->lock)) { // printk(KERN_NOTICE "Slow path for socket %d, last locker %s", // hsk->client_port, hsk->last_locker); @@ -3397,7 +3391,8 @@ static inline void homa_sock_lock(struct homa_sock *hsk, char *locker) { * homa_sock_unlock() - Release the lock for a socket. * @hsk: Socket to lock. */ -static inline void homa_sock_unlock(struct homa_sock *hsk) { +static inline void homa_sock_unlock(struct homa_sock *hsk) +{ spin_unlock_bh(&hsk->lock); } @@ -3408,9 +3403,8 @@ static inline void homa_sock_unlock(struct homa_sock *hsk) { */ static inline void homa_peer_lock(struct homa_peer *peer) { - if (!spin_trylock_bh(&peer->ack_lock)) { + if (!spin_trylock_bh(&peer->ack_lock)) homa_peer_lock_slow(peer); - } } /** @@ -3424,10 +3418,9 @@ static inline void homa_peer_unlock(struct homa_peer *peer) /** * homa_protect_rpcs() - Ensures that no RPCs will be reaped for a given - * socket until until homa_sock_unprotect is called. Typically - * used by functions that want to scan the active RPCs for a socket - * without holding the socket lock. Multiple calls to this function may - * be in effect at once. + * socket until homa_sock_unprotect is called. Typically used by functions + * that want to scan the active RPCs for a socket without holding the socket + * lock. Multiple calls to this function may be in effect at once. * @hsk: Socket whose RPCs should be protected. Must not be locked * by the caller; will be locked here. 
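
A sketch of the intended calling pattern for the reap protection described above; homa_unprotect_rpcs is the matching release, and the scan body is elided:

	static void example_scan(struct homa_sock *hsk)
	{
		if (!homa_protect_rpcs(hsk))
			return;		/* socket is shutting down */

		/* ... walk the socket's active RPCs without the socket lock ... */

		homa_unprotect_rpcs(hsk);
	}
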
* @@ -3437,7 +3430,8 @@ static inline void homa_peer_unlock(struct homa_peer *peer) static inline int homa_protect_rpcs(struct homa_sock *hsk) { int result; - homa_sock_lock(hsk, "homa_sock_protect"); + + homa_sock_lock(hsk, __func__); result = !hsk->shutdown; if (result) atomic_inc(&hsk->protect_count); @@ -3496,9 +3490,8 @@ static inline void homa_grantable_unlock(struct homa *homa) */ static inline void homa_throttle_lock(struct homa *homa) { - if (!spin_trylock_bh(&homa->throttle_lock)) { + if (!spin_trylock_bh(&homa->throttle_lock)) homa_throttle_lock_slow(homa); - } } /** @@ -3526,7 +3519,9 @@ static inline bool skb_is_ipv6(const struct sk_buff *skb) static inline struct in6_addr ipv4_to_ipv6(__be32 ip4) { struct in6_addr ret = {}; - if (ip4 == INADDR_ANY) return in6addr_any; + + if (ip4 == INADDR_ANY) + return in6addr_any; ret.in6_u.u6_addr32[2] = htonl(0xffff); ret.in6_u.u6_addr32[3] = ip4; return ret; @@ -3548,7 +3543,7 @@ static inline __be32 ipv6_to_ipv4(const struct in6_addr ip6) * was IPv4, convert it to an IPv4-mapped IPv6 address. * @addr: Address to canonicalize. */ -static inline struct in6_addr canonical_ipv6_addr(const sockaddr_in_union *addr) +static inline struct in6_addr canonical_ipv6_addr(const union sockaddr_in_union *addr) { if (addr) { return (addr->sa.sa_family == AF_INET6) @@ -3587,6 +3582,7 @@ static inline bool is_mapped_ipv4(const struct in6_addr x) static inline bool is_homa_pkt(struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); + return ((iph->protocol == IPPROTO_HOMA) || ((iph->protocol == IPPROTO_TCP) && (tcp_hdr(skb)->urg_ptr == htons(HOMA_TCP_URGENT)))); @@ -3605,8 +3601,8 @@ static inline __be32 tt_addr(const struct in6_addr x) } #ifdef __UNIT_TEST__ -extern void unit_log_printf(const char *separator, const char* format, ...) - __attribute__((format(printf, 2, 3))); +extern void unit_log_printf(const char *separator, const char *format, ...) 
+ __printf(2, 3); #define UNIT_LOG unit_log_printf extern void unit_hook(char *id); #define UNIT_HOOK(msg) unit_hook(msg) @@ -3622,42 +3618,43 @@ extern void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, struct homa_rpc *rpc); extern void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb); extern void homa_add_to_throttled(struct homa_rpc *rpc); -extern void homa_append_metric(struct homa *homa, const char* format, ...); +extern void homa_append_metric(struct homa *homa, const char *format, ...); extern int homa_backlog_rcv(struct sock *sk, struct sk_buff *skb); extern int homa_bind(struct socket *sk, struct sockaddr *addr, - int addr_len); + int addr_len); extern void homa_bucket_unlock(struct homa_rpc_bucket *bucket, __u64 id); extern void homa_check_rpc(struct homa_rpc *rpc); extern int homa_check_nic_queue(struct homa *homa, struct sk_buff *skb, - bool force); + bool force); extern struct homa_rpc *homa_choose_fifo_grant(struct homa *homa); extern struct homa_interest - *homa_choose_interest(struct homa *homa, struct list_head *head, - int offset); + *homa_choose_interest(struct homa *homa, struct list_head *head, + int offset); extern void homa_close(struct sock *sock, long timeout); extern int homa_copy_to_user(struct homa_rpc *rpc); extern void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk); extern void homa_data_from_server(struct sk_buff *skb, - struct homa_rpc *crpc); + struct homa_rpc *crpc); extern void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc); extern void homa_destroy(struct homa *homa); extern int homa_diag_destroy(struct sock *sk, int err); extern int homa_disconnect(struct sock *sk, int flags); extern void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa); extern int homa_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void __user *buffer, size_t *lenp, loff_t *ppos); extern void homa_dst_refresh(struct homa_peertab *peertab, - struct homa_peer *peer, struct homa_sock *hsk); + struct homa_peer *peer, struct homa_sock *hsk); extern int homa_err_handler_v4(struct sk_buff *skb, u32 info); -extern int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm * - , u8, u8, int, __be32); +extern int homa_err_handler_v6(struct sk_buff *skb, + struct inet6_skb_parm *opt, u8 type, u8 code, int offset, + __be32 info); extern int homa_fill_data_interleaved(struct homa_rpc *rpc, struct sk_buff *skb, struct iov_iter *iter); extern struct homa_rpc - *homa_find_client_rpc(struct homa_sock *hsk, __u64 id); + *homa_find_client_rpc(struct homa_sock *hsk, __u64 id); extern struct homa_rpc - *homa_find_server_rpc(struct homa_sock *hsk, + *homa_find_server_rpc(struct homa_sock *hsk, const struct in6_addr *saddr, __u16 sport, __u64 id); extern void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, char *format); @@ -3667,7 +3664,7 @@ extern struct homa_gap extern void homa_gap_retry(struct homa_rpc *rpc); extern int homa_get_port(struct sock *sk, unsigned short snum); extern int homa_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *option); + char __user *optval, int __user *option); extern void homa_grant_add_rpc(struct homa_rpc *rpc); extern void homa_grant_check_rpc(struct homa_rpc *rpc); extern void homa_grant_find_oldest(struct homa *homa); @@ -3689,14 +3686,14 @@ extern void homa_gro_gen3(struct sk_buff *skb); extern void homa_gro_hook_tcp(void); extern void homa_gro_unhook_tcp(void); extern struct sk_buff - 
*homa_gro_receive(struct list_head *gro_list, - struct sk_buff *skb); + *homa_gro_receive(struct list_head *gro_list, + struct sk_buff *skb); extern struct sk_buff - *homa_gso_segment(struct sk_buff *skb, + *homa_gso_segment(struct sk_buff *skb, netdev_features_t features); extern int homa_hash(struct sock *sk); extern enum hrtimer_restart - homa_hrtimer(struct hrtimer *timer); + homa_hrtimer(struct hrtimer *timer); extern int homa_init(struct homa *homa); extern void homa_incoming_sysctl_changed(struct homa *homa); extern int homa_ioc_abort(struct sock *sk, int *karg); @@ -3711,12 +3708,12 @@ extern loff_t homa_metrics_lseek(struct file *file, loff_t offset, int whence); extern int homa_metrics_open(struct inode *inode, struct file *file); extern ssize_t homa_metrics_read(struct file *file, char __user *buffer, - size_t length, loff_t *offset); + size_t length, loff_t *offset); extern int homa_metrics_release(struct inode *inode, struct file *file); extern void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, struct homa_rpc *rpc); extern struct sk_buff - *homa_new_data_packet(struct homa_rpc *rpc, + *homa_new_data_packet(struct homa_rpc *rpc, struct iov_iter *iter, int offset, int length, int max_seg_data); extern int homa_offload_end(void); @@ -3732,18 +3729,18 @@ extern struct homa_peer ** extern int homa_peertab_init(struct homa_peertab *peertab); extern void homa_peer_add_ack(struct homa_rpc *rpc); extern struct homa_peer - *homa_peer_find(struct homa_peertab *peertab, + *homa_peer_find(struct homa_peertab *peertab, const struct in6_addr *addr, struct inet_sock *inet); extern int homa_peer_get_acks(struct homa_peer *peer, int count, struct homa_ack *dst); extern struct dst_entry - *homa_peer_get_dst(struct homa_peer *peer, + *homa_peer_get_dst(struct homa_peer *peer, struct inet_sock *inet); extern void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, - int c2, int c3, int c4, int c5, int c6, int c7); + int c2, int c3, int c4, int c5, int c6, int c7); extern void homa_peertab_gc_dsts(struct homa_peertab *peertab, __u64 now); extern __poll_t homa_poll(struct file *file, struct socket *sock, - struct poll_table_struct *wait); + struct poll_table_struct *wait); extern int homa_pool_allocate(struct homa_rpc *rpc); extern void homa_pool_check_waiting(struct homa_pool *pool); extern void homa_pool_destroy(struct homa_pool *pool); @@ -3760,20 +3757,20 @@ extern char *homa_print_ipv6_addr(const struct in6_addr *addr); extern char *homa_print_metrics(struct homa *homa); extern char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len); extern char *homa_print_packet_short(struct sk_buff *skb, char *buffer, - int buf_len); + int buf_len); extern void homa_prios_changed(struct homa *homa); extern int homa_proc_read_metrics(char *buffer, char **start, off_t offset, - int count, int *eof, void *data); + int count, int *eof, void *data); extern int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len); + int flags, int *addr_len); extern int homa_register_interests(struct homa_interest *interest, - struct homa_sock *hsk, int flags, __u64 id); + struct homa_sock *hsk, int flags, __u64 id); extern void homa_rehash(struct sock *sk); extern void homa_remove_from_throttled(struct homa_rpc *rpc); extern void homa_resend_data(struct homa_rpc *rpc, int start, int end, - int priority); + int priority); extern void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, - struct homa_sock *hsk); + struct homa_sock *hsk); extern 
void homa_rpc_abort(struct homa_rpc *crpc, int error); extern void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, struct homa_ack *ack); @@ -3785,17 +3782,17 @@ extern void homa_rpc_log_tt(struct homa_rpc *rpc); extern void homa_rpc_log_active(struct homa *homa, uint64_t id); extern void homa_rpc_log_active_tt(struct homa *homa, int freeze_count); extern struct homa_rpc - *homa_rpc_new_client(struct homa_sock *hsk, - const sockaddr_in_union *dest); + *homa_rpc_new_client(struct homa_sock *hsk, + const union sockaddr_in_union *dest); extern struct homa_rpc - *homa_rpc_new_server(struct homa_sock *hsk, + *homa_rpc_new_server(struct homa_sock *hsk, const struct in6_addr *source, struct data_header *h, int *created); extern int homa_rpc_reap(struct homa_sock *hsk, int count); extern void homa_send_ipis(void); extern int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); extern int homa_setsockopt(struct sock *sk, int level, int optname, - sockptr_t __user optval, unsigned int optlen); + sockptr_t __user optval, unsigned int optlen); extern int homa_shutdown(struct socket *sock, int how); extern int homa_skb_append_from_iter(struct homa *homa, struct sk_buff *skb, struct iov_iter *iter, int length); @@ -3821,52 +3818,51 @@ extern void homa_skb_page_pool_init(struct homa_page_pool *pool); extern void homa_skb_release_pages(struct homa *homa); extern void homa_skb_stash_pages(struct homa *homa, int length); extern int homa_snprintf(char *buffer, int size, int used, - const char *format, ...) - __attribute__((format(printf, 4, 5))); + const char *format, ...) __printf(4, 5); extern int homa_sock_bind(struct homa_socktab *socktab, - struct homa_sock *hsk, __u16 port); + struct homa_sock *hsk, __u16 port); extern void homa_sock_destroy(struct homa_sock *hsk); extern struct homa_sock * - homa_sock_find(struct homa_socktab *socktab, __u16 port); + homa_sock_find(struct homa_socktab *socktab, __u16 port); extern void homa_sock_init(struct homa_sock *hsk, struct homa *homa); extern void homa_sock_shutdown(struct homa_sock *hsk); extern int homa_socket(struct sock *sk); extern void homa_socktab_destroy(struct homa_socktab *socktab); extern void homa_socktab_init(struct homa_socktab *socktab); extern struct homa_sock - *homa_socktab_next(struct homa_socktab_scan *scan); + *homa_socktab_next(struct homa_socktab_scan *scan); extern struct homa_sock - *homa_socktab_start_scan(struct homa_socktab *socktab, - struct homa_socktab_scan *scan); + *homa_socktab_start_scan(struct homa_socktab *socktab, + struct homa_socktab_scan *scan); extern int homa_softirq(struct sk_buff *skb); extern void homa_spin(int ns); extern char *homa_symbol_for_state(struct homa_rpc *rpc); extern char *homa_symbol_for_type(uint8_t type); extern int homa_sysctl_softirq_cores(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void __user *buffer, size_t *lenp, loff_t *ppos); extern struct sk_buff - *homa_tcp_gro_receive(struct list_head *held_list, + *homa_tcp_gro_receive(struct list_head *held_list, struct sk_buff *skb); extern void homa_timer(struct homa *homa); extern int homa_timer_main(void *transportInfo); extern void homa_unhash(struct sock *sk); extern void homa_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc); extern int homa_unsched_priority(struct homa *homa, - struct homa_peer *peer, int length); + struct homa_peer *peer, int length); extern int homa_v4_early_demux(struct sk_buff *skb); extern int homa_v4_early_demux_handler(struct sk_buff *skb); 
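
The address helpers earlier in this header (ipv4_to_ipv6, ipv6_to_ipv4, is_mapped_ipv4) compose as one would expect; a small sketch that round-trips an IPv4 address through its IPv4-mapped IPv6 form (::ffff:a.b.c.d). Note that INADDR_ANY maps to in6addr_any, which this test would reject by design:

	static inline bool example_roundtrip(__be32 ip4)
	{
		struct in6_addr v6 = ipv4_to_ipv6(ip4);

		return is_mapped_ipv4(v6) && ipv6_to_ipv4(v6) == ip4;
	}
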
extern int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors); extern struct homa_rpc - *homa_wait_for_message(struct homa_sock *hsk, int flags, - __u64 id); + *homa_wait_for_message(struct homa_sock *hsk, int flags, + __u64 id); extern int homa_xmit_control(enum homa_packet_type type, void *contents, - size_t length, struct homa_rpc *rpc); + size_t length, struct homa_rpc *rpc); extern int __homa_xmit_control(void *contents, size_t length, - struct homa_peer *peer, struct homa_sock *hsk); + struct homa_peer *peer, struct homa_sock *hsk); extern void homa_xmit_data(struct homa_rpc *rpc, bool force); extern void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, - int priority); + int priority); extern void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk); /** diff --git a/homa_incoming.c b/homa_incoming.c index 81c9ddec..cbf7f7b2 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -1,9 +1,8 @@ -/* Copyright (c) 2019-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause /* This file contains functions that handle incoming Homa messages, including - * both receiving information for those messages and sending grants. */ + * both receiving information for those messages and sending grants. + */ #include "homa_impl.h" @@ -58,10 +57,11 @@ int homa_message_in_init(struct homa_rpc *rpc, int length, int unsched) * @end: Offset of byte just after the last one covered by the gap. * Return: Pointer to the new gap. */ -struct homa_gap * homa_gap_new(struct list_head *next, int start, int end) +struct homa_gap *homa_gap_new(struct list_head *next, int start, int end) { struct homa_gap *gap; - gap = (struct homa_gap *) kmalloc(sizeof(struct homa_gap), GFP_KERNEL); + + gap = kmalloc(sizeof(struct homa_gap), GFP_KERNEL); gap->start = start; gap->end = end; gap->time = get_cycles(); @@ -79,13 +79,11 @@ void homa_gap_retry(struct homa_rpc *rpc) struct homa_gap *gap; struct resend_header resend; - list_for_each_entry(gap, &rpc->msgin.gaps, links) - { + list_for_each_entry(gap, &rpc->msgin.gaps, links) { resend.offset = htonl(gap->start); resend.length = htonl(gap->end - gap->start); resend.priority = rpc->hsk->homa->num_priorities - 1; - tt_record3("homa_gap_retry sending RESEND for id %d, start %d, " - "end %d", + tt_record3("homa_gap_retry sending RESEND for id %d, start %d, end %d", rpc->id, gap->start, gap->end); homa_xmit_control(RESEND, &resend, sizeof(resend), rpc); } @@ -107,8 +105,7 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) struct homa_gap *gap, *dummy, *gap2; if ((start + length) > rpc->msgin.length) { - tt_record3("Packet extended past message end; id %d, " - "offset %d, length %d", + tt_record3("Packet extended past message end; id %d, offset %d, length %d", rpc->id, start, length); goto discard; } @@ -130,19 +127,17 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) * an existing gap. */ list_for_each_entry_safe(gap, dummy, &rpc->msgin.gaps, links) { - /* Is packet at the start of this gap? */ + /* Is packet at the start of this gap? 
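
The list manipulation in homa_add_packet implements standard gap tracking: a packet landing strictly inside a gap splits it in two, via homa_gap_new. (Note also that homa_gap_new above uses the kmalloc result without a NULL check, so an allocation failure would oops rather than fail gracefully.) A stand-alone sketch of the split case, with a hypothetical struct mirroring homa_gap:

	struct gap { int start, end; };

	static void split_gap(struct gap *g, int pkt_start, int pkt_end,
			      struct gap *tail)
	{
		tail->start = pkt_end;	/* remainder after the packet */
		tail->end = g->end;
		g->end = pkt_start;	/* original gap shrinks to the head */
	}
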
*/ if (start <= gap->start) { if (end <= gap->start) continue; if (start < gap->start) { - tt_record4("Packet overlaps gap start: id %d, " - "start %d, end %d, gap_start %d", + tt_record4("Packet overlaps gap start: id %d, start %d, end %d, gap_start %d", rpc->id, start, end, gap->start); goto discard; } if (end > gap->end) { - tt_record4("Packet overlaps gap end: id %d, " - "start %d, end %d, gap_end %d", + tt_record4("Packet overlaps gap end: id %d, start %d, end %d, gap_end %d", rpc->id, start, end, gap->start); goto discard; } @@ -154,15 +149,14 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) goto keep; } - /* Is packet at the end of this gap? BTW, at this point we know + /* Is packet at the end of this gap? BTW, at this point we know * the packet can't cover the entire gap. */ if (end >= gap->end) { if (start >= gap->end) continue; if (end > gap->end) { - tt_record4("Packet overlaps gap end: id %d, " - "start %d, end %d, gap_end %d", + tt_record4("Packet overlaps gap end: id %d, start %d, end %d, gap_end %d", rpc->id, start, end, gap->start); goto discard; } @@ -177,18 +171,17 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) goto keep; } - discard: +discard: if (h->retransmit) INC_METRIC(resent_discards, 1); else INC_METRIC(packet_discards, 1); - tt_record4("homa_add_packet discarding packet for id %d, " - "offset %d, length %d, retransmit %d", + tt_record4("homa_add_packet discarding packet for id %d, offset %d, length %d, retransmit %d", rpc->id, start, length, h->retransmit); kfree_skb(skb); return; - keep: +keep: if (h->retransmit) INC_METRIC(resent_packets_used, 1); __skb_queue_tail(&rpc->msgin.packets, skb); @@ -229,6 +222,7 @@ int homa_copy_to_user(struct homa_rpc *rpc) */ while (true) { struct sk_buff *skb = __skb_dequeue(&rpc->msgin.packets); + if (skb != NULL) { skbs[n] = skb; n++; @@ -297,7 +291,7 @@ int homa_copy_to_user(struct homa_rpc *rpc) end_offset = offset + pkt_length; } - free_skbs: +free_skbs: if (end_offset != 0) { tt_record3("copied out bytes %d-%d for id %d", start_offset, end_offset, rpc->id); @@ -365,10 +359,9 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) else icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); - tt_record3("Discarding packet(s) for unknown port %u, " - "id %llu, type %d", dport, - homa_local_id(h->common.sender_id), - h->common.type); + tt_record3("Discarding packet(s) for unknown port %u, id %llu, type %d", + dport, homa_local_id(h->common.sender_id), + h->common.type); while (skb != NULL) { next = skb->next; kfree_skb(skb); @@ -377,9 +370,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) return; } - /* Each iteration through through the following loop processes one - * packet. - */ + /* Each iteration through the following loop processes one packet. */ for (; skb != NULL; skb = next) { h = (struct data_header *) skb->data; next = skb->next; @@ -389,10 +380,10 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) */ if (rpc != NULL) { int flags = atomic_read(&rpc->flags); + if (flags & APP_NEEDS_LOCK) { homa_rpc_unlock(rpc); - tt_record2("softirq released lock for id %d, " - "flags 0x%x", rpc->id, flags); + tt_record2("softirq released lock for id %d, flags 0x%x", rpc->id, flags); homa_spin(200); rpc = NULL; } @@ -406,12 +397,12 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) int created; /* Create a new RPC if one doesn't - * already exist. */ + * already exist. 
+ */ rpc = homa_rpc_new_server(hsk, &saddr, h, &created); if (IS_ERR(rpc)) { - printk(KERN_WARNING "homa_pkt_dispatch couldn't " - "create server rpc: error %lu", + pr_warn("homa_pkt_dispatch couldn't create server rpc: error %lu", -PTR_ERR(rpc)); INC_METRIC(server_cant_create_rpcs, 1); rpc = NULL; @@ -430,8 +421,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) && (h->common.type != NEED_ACK) && (h->common.type != ACK) && (h->common.type != RESEND)) { - tt_record4("Discarding packet for unknown RPC, " - "id %u, type %d, peer 0x%x:%d", + tt_record4("Discarding packet for unknown RPC, id %u, type %d, peer 0x%x:%d", id, h->common.type, tt_addr(saddr), ntohs(h->common.sport)); @@ -447,7 +437,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) } switch (h->common.type) { - case DATA: + case DATA: if (h->ack.client_id != 0) { /* Save the ack for processing later, when we * have released the RPC lock. @@ -504,7 +494,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) } continue; - discard: +discard: kfree_skb(skb); } if (rpc != NULL) @@ -570,8 +560,7 @@ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) * exceed available cache space, resulting in poor * performance. */ - tt_record4("Dropping packet because no buffer space available: " - "id %d, offset %d, length %d, old incoming %d", + tt_record4("Dropping packet because no buffer space available: id %d, offset %d, length %d, old incoming %d", rpc->id, ntohl(h->seg.offset), homa_data_len(skb), rpc->msgin.granted); @@ -613,7 +602,7 @@ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) } return; - discard: +discard: kfree_skb(skb); UNIT_LOG("; ", "homa_data_pkt discarded packet"); } @@ -629,8 +618,7 @@ void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc) struct grant_header *h = (struct grant_header *) skb->data; int new_offset = ntohl(h->offset); - tt_record4("processing grant for id %llu, offset %d, priority %d, " - "increment %d", + tt_record4("processing grant for id %llu, offset %d, priority %d, increment %d", homa_local_id(h->common.sender_id), ntohl(h->offset), h->priority, new_offset - rpc->msgout.granted); if (rpc->state == RPC_OUTGOING) { @@ -666,8 +654,7 @@ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, struct busy_header busy; if (rpc == NULL) { - tt_record4("resend request for unknown id %d, peer 0x%x:%d, " - "offset %d; responding with UNKNOWN", + tt_record4("resend request for unknown id %d, peer 0x%x:%d, offset %d; responding with UNKNOWN", homa_local_id(h->common.sender_id), tt_addr(saddr), ntohs(h->common.sport), ntohl(h->offset)); @@ -691,9 +678,8 @@ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, /* We have chosen not to transmit data from this message; * send BUSY instead. */ - tt_record3("sending BUSY from resend, id %d, offset %d, " - "granted %d", rpc->id, - rpc->msgout.next_xmit_offset, + tt_record3("sending BUSY from resend, id %d, offset %d, granted %d", + rpc->id, rpc->msgout.next_xmit_offset, rpc->msgout.granted); homa_xmit_control(BUSY, &busy, sizeof(busy), rpc); } else { @@ -709,7 +695,7 @@ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, h->priority); } - done: +done: kfree_skb(skb); } @@ -728,31 +714,26 @@ void homa_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc) /* It appears that everything we've already transmitted * has been lost; retransmit it. 
*/ - tt_record4("Restarting id %d to server 0x%x:%d, " - "lost %d bytes", + tt_record4("Restarting id %d to server 0x%x:%d, lost %d bytes", rpc->id, tt_addr(rpc->peer->addr), rpc->dport, rpc->msgout.next_xmit_offset); - homa_freeze(rpc, RESTART_RPC, "Freezing because of " - "RPC restart, id %d, peer 0x%x"); + homa_freeze(rpc, RESTART_RPC, "Freezing because of RPC restart, id %d, peer 0x%x"); homa_resend_data(rpc, 0, rpc->msgout.next_xmit_offset, homa_unsched_priority(rpc->hsk->homa, rpc->peer, rpc->msgout.length)); goto done; } - printk(KERN_ERR "Received unknown for RPC id %llu, peer %s:%d " - "in bogus state %d; discarding unknown\n", + pr_err("Received unknown for RPC id %llu, peer %s:%d in bogus state %d; discarding unknown\n", rpc->id, homa_print_ipv6_addr(&rpc->peer->addr), rpc->dport, rpc->state); - tt_record4("Discarding unknown for RPC id %d, peer 0x%x:%d: " - "bad state %d", + tt_record4("Discarding unknown for RPC id %d, peer 0x%x:%d: bad state %d", rpc->id, tt_addr(rpc->peer->addr), rpc->dport, rpc->state); } else { if (rpc->hsk->homa->verbose) - printk(KERN_NOTICE "Freeing rpc id %llu from client " - "%s:%d: unknown to client", + pr_notice("Freeing rpc id %llu from client %s:%d: unknown to client", rpc->id, homa_print_ipv6_addr(&rpc->peer->addr), rpc->dport); @@ -779,7 +760,7 @@ void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk) if (!IS_ERR(peer)) { peer->unsched_cutoffs[0] = INT_MAX; - for (i = 1; i unsched_cutoffs[i] = ntohl(h->unsched_cutoffs[i]); peer->cutoff_version = h->cutoff_version; } @@ -811,12 +792,10 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, */ if ((rpc != NULL) && ((rpc->state != RPC_INCOMING) || rpc->msgin.bytes_remaining)) { - tt_record3("NEED_ACK arrived for id %d before message " - "received, state %d, remaining %d", + tt_record3("NEED_ACK arrived for id %d before message received, state %d, remaining %d", rpc->id, rpc->state, rpc->msgin.bytes_remaining); homa_freeze(rpc, NEED_ACK_MISSING_DATA, - "Freezing because NEED_ACK received before " - "message complete, id %d, peer 0x%x"); + "Freezing because NEED_ACK received before message complete, id %d, peer 0x%x"); goto done; } else { peer = homa_peer_find(&hsk->homa->peers, &saddr, &hsk->inet); @@ -837,10 +816,10 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, ack.num_acks = htons(homa_peer_get_acks(peer, NUM_PEER_UNACKED_IDS, ack.acks)); __homa_xmit_control(&ack, sizeof(ack), peer, hsk); - tt_record3("Responded to NEED_ACK for id %d, peer %0x%x with %d " - "other acks", id, tt_addr(saddr), ntohs(ack.num_acks)); + tt_record3("Responded to NEED_ACK for id %d, peer %0x%x with %d other acks", + id, tt_addr(saddr), ntohs(ack.num_acks)); - done: +done: kfree_skb(skb); } @@ -1051,18 +1030,17 @@ void homa_abort_sock_rpcs(struct homa_sock *hsk, int error) homa_rpc_unlock(rpc); continue; } - tt_record4("homa_abort_sock_rpcs aborting id %u on port %d, " - "peer 0x%x, error %d", + tt_record4("homa_abort_sock_rpcs aborting id %u on port %d, peer 0x%x, error %d", rpc->id, hsk->port, tt_addr(rpc->peer->addr), error); - if (error) { + if (error) homa_rpc_abort(rpc, error); - } else + else homa_rpc_free(rpc); homa_rpc_unlock(rpc); } homa_unprotect_rpcs(hsk); - done: +done: rcu_read_unlock(); } @@ -1154,7 +1132,7 @@ int homa_register_interests(struct homa_interest *interest, homa_sock_unlock(hsk); return 0; - claim_rpc: +claim_rpc: list_del_init(&rpc->ready_links); if (!list_empty(&hsk->ready_requests) || !list_empty(&hsk->ready_responses)) { @@ -1164,7 +1142,8 @@ 
int homa_register_interests(struct homa_interest *interest, /* This flag is needed to keep the RPC from being reaped during the * gap between when we release the socket lock and we acquire the - * RPC lock.*/ + * RPC lock. + */ atomic_or(RPC_HANDING_OFF, &rpc->flags); homa_sock_unlock(hsk); if (!interest->locked) { @@ -1209,9 +1188,8 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, while (1) { error = homa_register_interests(&interest, hsk, flags, id); rpc = (struct homa_rpc *) atomic_long_read(&interest.ready_rpc); - if (rpc) { + if (rpc) goto found_rpc; - } if (error < 0) { result = ERR_PTR(error); goto found_rpc; @@ -1220,11 +1198,12 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, // tt_record3("Preparing to poll, socket %d, flags 0x%x, pid %d", // hsk->client_port, flags, current->pid); - /* There is no ready RPC so far. Clean up dead RPCs before + /* There is no ready RPC so far. Clean up dead RPCs before * going to sleep (or returning, if in nonblocking mode). */ while (1) { int reaper_result; + rpc = (struct homa_rpc *) atomic_long_read( &interest.ready_rpc); if (rpc) { @@ -1251,6 +1230,7 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, poll_start = now = get_cycles(); while (1) { __u64 blocked; + rpc = (struct homa_rpc *) atomic_long_read( &interest.ready_rpc); if (rpc) { @@ -1286,6 +1266,7 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, if (!rpc && !hsk->shutdown) { __u64 end; __u64 start = get_cycles(); + tt_record1("homa_wait_for_message sleeping, pid %d", current->pid); schedule(); @@ -1358,7 +1339,7 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, if (signal_pending(current)) return ERR_PTR(-EINTR); - /* No message and no error; try again. */ + /* No message and no error; try again. */ } done: diff --git a/homa_offload.c b/homa_offload.c index e94102ee..55cb8a83 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause /* This file implements GSO (Generic Segmentation Offload) and GRO (Generic * Receive Offload) for Homa. @@ -23,8 +21,8 @@ extern struct homa *homa; /* Pointers to TCP's net_offload structures. NULL means homa_gro_hook_tcp * hasn't been called yet. */ -const struct net_offload *tcp_net_offload = NULL; -const struct net_offload *tcp6_net_offload = NULL; +const struct net_offload *tcp_net_offload; +const struct net_offload *tcp6_net_offload; /* * Identical to *tcp_net_offload except that the gro_receive function @@ -42,6 +40,7 @@ int homa_offload_init(void) { int res1 = inet_add_offload(&homa_offload, IPPROTO_HOMA); int res2 = inet6_add_offload(&homa_offload, IPPROTO_HOMA); + return res1 ? res1 : res2; } @@ -55,6 +54,7 @@ int homa_offload_end(void) { int res1 = inet_del_offload(&homa_offload, IPPROTO_HOMA); int res2 = inet6_del_offload(&homa_offload, IPPROTO_HOMA); + return res1 ? 
res1 : res2; } @@ -68,7 +68,7 @@ void homa_gro_hook_tcp(void) if (tcp_net_offload != NULL) return; - printk(KERN_NOTICE "Homa setting up TCP hijacking\n"); + pr_notice("Homa setting up TCP hijacking\n"); tcp_net_offload = inet_offloads[IPPROTO_TCP]; hook_tcp_net_offload = *tcp_net_offload; hook_tcp_net_offload.callbacks.gro_receive = homa_tcp_gro_receive; @@ -89,7 +89,7 @@ void homa_gro_unhook_tcp(void) { if (tcp_net_offload == NULL) return; - printk(KERN_NOTICE "Homa cancelling TCP hijacking\n"); + pr_notice("Homa cancelling TCP hijacking\n"); inet_offloads[IPPROTO_TCP] = tcp_net_offload; tcp_net_offload = NULL; inet6_offloads[IPPROTO_TCP] = tcp6_net_offload; @@ -110,8 +110,8 @@ struct sk_buff *homa_tcp_gro_receive(struct list_head *held_list, struct common_header *h = (struct common_header *) skb_transport_header(skb); // tt_record4("homa_tcp_gro_receive got type 0x%x, flags 0x%x, " - // "urgent 0x%x, id %d", h->type, h->flags, - // ntohs(h->urgent), homa_local_id(h->sender_id)); + // "urgent 0x%x, id %d", h->type, h->flags, + // ntohs(h->urgent), homa_local_id(h->sender_id)); if ((h->flags != HOMA_TCP_FLAGS) || (ntohs(h->urgent) != HOMA_TCP_URGENT)) return tcp_net_offload->callbacks.gro_receive(held_list, skb); @@ -199,6 +199,7 @@ struct sk_buff *homa_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs; + tt_record2("homa_gso_segment invoked, frags %d, headlen %d", skb_shinfo(skb)->nr_frags, skb_headlen(skb)); @@ -215,6 +216,7 @@ struct sk_buff *homa_gso_segment(struct sk_buff *skb, if (ip_hdr(segs)->version == 4) { struct sk_buff *seg; int i = 0; + for (seg = segs; seg != NULL; seg = seg->next) { ip_hdr(seg)->id = htons(i); i++; @@ -285,8 +287,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, ntohl(h_new->common.sequence)); h_new->seg.offset = h_new->common.sequence; } - tt_record4("homa_gro_receive got packet from 0x%x " - "id %llu, offset %d, priority %d", + tt_record4("homa_gro_receive got packet from 0x%x id %llu, offset %d, priority %d", saddr, homa_local_id(h_new->common.sender_id), ntohl(h_new->seg.offset), priority); if ((homa_data_len(skb) == ntohl(h_new->message_length)) @@ -296,8 +297,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, goto bypass; } } else if (h_new->common.type == GRANT) { - tt_record4("homa_gro_receive got grant from 0x%x " - "id %llu, offset %d, priority %d", + tt_record4("homa_gro_receive got grant from 0x%x id %llu, offset %d, priority %d", saddr, homa_local_id(h_new->common.sender_id), ntohl(((struct grant_header *) h_new)->offset), priority); @@ -312,8 +312,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, goto bypass; } } else - tt_record4("homa_gro_receive got packet from 0x%x " - "id %llu, type 0x%x, priority %d", + tt_record4("homa_gro_receive got packet from 0x%x id %llu, type 0x%x, priority %d", saddr, homa_local_id(h_new->common.sender_id), h_new->common.type, priority); @@ -339,7 +338,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, * packets from the list, so core->held_skb could be a * dangling pointer (or the skb could have been reused for * some other protocol). 
- */ + */ list_for_each_entry(held_skb, &napi->gro_hash[core->held_bucket].list, list) { int protocol; @@ -351,8 +350,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, else protocol = ip_hdr(held_skb)->protocol; if (protocol != IPPROTO_HOMA) { - tt_record3("homa_gro_receive held_skb 0x%0x%0x " - "isn't Homa: protocol %d", + tt_record3("homa_gro_receive held_skb 0x%0x%0x isn't Homa: protocol %d", SPLIT_64(held_skb), protocol); continue; } @@ -406,13 +404,13 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, if (likely(homa->gro_policy & HOMA_GRO_SAME_CORE)) homa_set_softirq_cpu(skb, raw_smp_processor_id()); - done: +done: homa_check_pacer(homa, 1); core->last_gro = get_cycles(); return result; - bypass: - /* Record SoftIRQ cycles in a different metric to reflect that +bypass: + /* Record SoftIRQ cycles in a different metric to reflect that * they happened during bypass. */ saved_softirq_metric = core->metrics.softirq_cycles; @@ -449,6 +447,7 @@ void homa_gro_gen2(struct sk_buff *skb) int candidate = this_core; __u64 now = get_cycles(); struct homa_core *core; + for (i = CORES_TO_CHECK; i > 0; i--) { candidate++; if (unlikely(candidate >= nr_cpu_ids)) @@ -458,8 +457,7 @@ void homa_gro_gen2(struct sk_buff *skb) continue; if ((core->last_gro + homa->busy_cycles) > now) continue; - tt_record3("homa_gro_gen2 chose core %d for id %d " - "offset %d", + tt_record3("homa_gro_gen2 chose core %d for id %d offset %d", candidate, homa_local_id(h->common.sender_id), ntohl(h->seg.offset)); break; @@ -469,16 +467,15 @@ void homa_gro_gen2(struct sk_buff *skb) * rotate among them. */ int offset = homa_cores[this_core]->softirq_offset; + offset += 1; if (offset > CORES_TO_CHECK) offset = 1; homa_cores[this_core]->softirq_offset = offset; candidate = this_core + offset; - while (candidate >= nr_cpu_ids) { + while (candidate >= nr_cpu_ids) candidate -= nr_cpu_ids; - } - tt_record3("homa_gro_gen2 chose core %d for id %d " - "offset %d (all cores busy)", + tt_record3("homa_gro_gen2 chose core %d for id %d offset %d (all cores busy)", candidate, homa_local_id(h->common.sender_id), ntohl(h->seg.offset)); } @@ -510,9 +507,9 @@ void homa_gro_gen3(struct sk_buff *skb) core = candidates[0]; for (i = 0; i < NUM_GEN3_SOFTIRQ_CORES; i++) { int candidate = candidates[i]; - if (candidate < 0) { + + if (candidate < 0) break; - } if (homa_cores[candidate]->last_app_active < busy_time) { core = candidate; break; @@ -543,9 +540,9 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset) { struct data_header *h = (struct data_header *) skb_transport_header(skb); // tt_record4("homa_gro_complete type %d, id %d, offset %d, count %d", - // h->common.type, homa_local_id(h->common.sender_id), - // ntohl(h->seg.offset), - // NAPI_GRO_CB(skb)->count); + // h->common.type, homa_local_id(h->common.sender_id), + // ntohl(h->seg.offset), + // NAPI_GRO_CB(skb)->count); homa_cores[raw_smp_processor_id()]->held_skb = NULL; if (homa->gro_policy & HOMA_GRO_GEN3) { @@ -576,8 +573,7 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset) } } homa_set_softirq_cpu(skb, best); - tt_record3("homa_gro_complete chose core %d for id %d " - "offset %d with IDLE policy", + tt_record3("homa_gro_complete chose core %d for id %d offset %d with IDLE policy", best, homa_local_id(h->common.sender_id), ntohl(h->seg.offset)); } else if (homa->gro_policy & HOMA_GRO_NEXT) { @@ -585,11 +581,11 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset) * SoftIRQ processing. 
*/ int target = raw_smp_processor_id() + 1; + if (unlikely(target >= nr_cpu_ids)) target = 0; homa_set_softirq_cpu(skb, target); - tt_record3("homa_gro_complete chose core %d for id %d " - "offset %d with NEXT policy", + tt_record3("homa_gro_complete chose core %d for id %d offset %d with NEXT policy", target, homa_local_id(h->common.sender_id), ntohl(h->seg.offset)); } diff --git a/homa_outgoing.c b/homa_outgoing.c index b8d9eaf9..a4285548 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause /* This file contains functions related to the sender side of message * transmission. It also contains utility functions for sending packets. @@ -16,7 +14,7 @@ * @priority: Priority level for the packet; must be less than * HOMA_MAX_PRIORITIES. */ -inline static void set_priority(struct sk_buff *skb, struct homa_sock *hsk, +static inline void set_priority(struct sk_buff *skb, struct homa_sock *hsk, int priority) { /* Note: this code initially specified the priority in the VLAN @@ -76,6 +74,7 @@ int homa_fill_data_interleaved(struct homa_rpc *rpc, struct sk_buff *skb, */ while (1) { struct seg_header seg; + if (bytes_left < seg_length) seg_length = bytes_left; err = homa_skb_append_from_iter(rpc->hsk->homa, skb, iter, @@ -185,7 +184,7 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, } return skb; - error: +error: homa_skb_free_tx(rpc->hsk->homa, skb); return ERR_PTR(err); } @@ -204,7 +203,7 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, * Zero means the caller will initiate transmission after this * function returns. * - * Return: 0 for success, or a negative errno for failure. It is is possible + * Return: 0 for success, or a negative errno for failure. It is possible * for the RPC to be freed while this function is active. If that * happens, copying will cease, -EINVAL will be returned, and * rpc->state will be RPC_DEAD. @@ -264,8 +263,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) homa_skb_stash_pages(rpc->hsk->homa, rpc->msgout.length); /* Each iteration of the loop below creates one GSO packet. 
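
homa_gro_gen2 above scans a small window of cores after the current one and steers SoftIRQ work to the first that looks idle, falling back to rotation when every candidate is busy. A simplified stand-alone sketch of the selection, with the busyness test abstracted into a callback (not the module's exact code):

	static int example_pick_core(int this_core, int ncores, int window,
				     bool (*busy)(int core))
	{
		int i, candidate = this_core;

		for (i = 0; i < window; i++) {
			candidate++;
			if (candidate >= ncores)
				candidate = 0;
			if (!busy(candidate))
				return candidate;
		}
		return -1;	/* all busy: caller rotates among the window */
	}
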
*/ - tt_record3("starting copy from user space for id %d, length %d, " - "unscheduled %d", + tt_record3("starting copy from user space for id %d, length %d, unscheduled %d", rpc->id, rpc->msgout.length, rpc->msgout.unscheduled); last_link = &rpc->msgout.packets; for (bytes_left = rpc->msgout.length; bytes_left > 0; ) { @@ -320,7 +318,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) homa_xmit_data(rpc, false); return 0; - error: +error: atomic_andnot(RPC_COPYING_FROM_USER, &rpc->flags); return err; } @@ -343,6 +341,7 @@ int homa_xmit_control(enum homa_packet_type type, void *contents, size_t length, struct homa_rpc *rpc) { struct common_header *h = (struct common_header *) contents; + h->type = type; h->sport = htons(rpc->hsk->port); h->dport = htons(rpc->dport); @@ -414,26 +413,19 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, */ if (refcount_read(&skb->users) > 1) { if (hsk->inet.sk.sk_family == AF_INET6) { - printk(KERN_NOTICE "ip6_xmit didn't free " - "Homa control packet (type %d) " - "after error %d\n", + pr_notice("ip6_xmit didn't free Homa control packet (type %d) after error %d\n", h->type, result); } else { - printk(KERN_NOTICE "ip_queue_xmit didn't free " - "Homa control packet (type %d) " - "after error %d\n", + pr_notice("ip_queue_xmit didn't free Homa control packet (type %d) after error %d\n", h->type, result); - tt_record2("ip_queue_xmit didn't free Homa " - "control packet (type %d) " - "after error %d\n", + tt_record2("ip_queue_xmit didn't free Homa control packet (type %d) after error %d\n", h->type, result); } } } txq = netdev_get_tx_queue(skb->dev, skb->queue_mapping); if (netif_tx_queue_stopped(txq)) - tt_record4("__homa_xmit_control found stopped txq for id %d, " - "qid %d, num_queued %d, limit %d", + tt_record4("__homa_xmit_control found stopped txq for id %d, qid %d, num_queued %d, limit %d", be64_to_cpu(h->sender_id), skb->queue_mapping, txq->dql.num_queued, txq->dql.adj_limit); @@ -458,8 +450,7 @@ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk) struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); if (hsk->homa->verbose) - printk(KERN_NOTICE "sending UNKNOWN to peer " - "%s:%d for id %llu", + pr_notice("sending UNKNOWN to peer %s:%d for id %llu", homa_print_ipv6_addr(&saddr), ntohs(h->sport), homa_local_id(h->sender_id)); tt_record3("sending unknown to 0x%x:%d for id %llu", @@ -473,7 +464,7 @@ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk) unknown.common.sender_id = cpu_to_be64(homa_local_id(h->sender_id)); peer = homa_peer_find(&hsk->homa->peers, &saddr, &hsk->inet); if (!IS_ERR(peer)) - __homa_xmit_control(&unknown, sizeof(unknown), peer, hsk); + __homa_xmit_control(&unknown, sizeof(unknown), peer, hsk); } /** @@ -502,8 +493,7 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) struct sk_buff *skb = *rpc->msgout.next_xmit; if (rpc->msgout.next_xmit_offset >= rpc->msgout.granted) { - tt_record3("homa_xmit_data stopping at offset %d " - "for id %u: granted is %d", + tt_record3("homa_xmit_data stopping at offset %d for id %u: granted is %d", rpc->msgout.next_xmit_offset, rpc->id, rpc->msgout.granted); break; @@ -512,8 +502,7 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) if ((rpc->msgout.length - rpc->msgout.next_xmit_offset) >= homa->throttle_min_bytes) { if (!homa_check_nic_queue(homa, skb, force)) { - tt_record1("homa_xmit_data adding id %u to " - "throttle queue", rpc->id); + tt_record1("homa_xmit_data adding id %u to throttle queue", 
rpc->id); homa_add_to_throttled(rpc); break; } @@ -534,8 +523,7 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) __homa_xmit_data(skb, rpc, priority); txq = netdev_get_tx_queue(skb->dev, skb->queue_mapping); if (netif_tx_queue_stopped(txq)) - tt_record4("homa_xmit_data found stopped txq for id %d, " - "qid %d, num_queued %d, limit %d", + tt_record4("homa_xmit_data found stopped txq for id %d, qid %d, num_queued %d, limit %d", rpc->id, skb->queue_mapping, txq->dql.num_queued, txq->dql.adj_limit); force = false; @@ -575,8 +563,7 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority) skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct common_header, checksum); if (rpc->hsk->inet.sk.sk_family == AF_INET6) { - tt_record4("calling ip6_xmit: wire_bytes %d, peer 0x%x, id %d, " - "offset %d", + tt_record4("calling ip6_xmit: wire_bytes %d, peer 0x%x, id %d, offset %d", homa_get_skb_info(skb)->wire_bytes, tt_addr(rpc->peer->addr), rpc->id, homa_info->offset); @@ -584,8 +571,7 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority) 0, NULL, rpc->hsk->homa->priority_map[priority] << 4, 0); } else { - tt_record4("calling ip_queue_xmit: wire_bytes %d, peer 0x%x, " - "id %d, offset %d", + tt_record4("calling ip_queue_xmit: wire_bytes %d, peer 0x%x, id %d, offset %d", homa_get_skb_info(skb)->wire_bytes, tt_addr(rpc->peer->addr), rpc->id, homa_info->offset); @@ -593,14 +579,12 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority) rpc->hsk->inet.tos = rpc->hsk->homa->priority_map[priority]<<5; err = ip_queue_xmit(&rpc->hsk->inet.sk, skb, &rpc->peer->flow); } - tt_record4("Finished queueing packet: rpc id %llu, offset %d, len %d, " - "qid %d", + tt_record4("Finished queueing packet: rpc id %llu, offset %d, len %d, qid %d", rpc->id, homa_info->offset, homa_get_skb_info(skb)->data_bytes, skb->queue_mapping); - if (err) { + if (err) INC_METRIC(data_xmit_errors, 1); - } INC_METRIC(packets_sent[0], 1); INC_METRIC(priority_bytes[priority], skb->len); INC_METRIC(priority_packets[priority], 1); @@ -669,8 +653,8 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end, - sizeof(struct seg_header)); if (unlikely(!new_skb)) { if (rpc->hsk->homa->verbose) - printk(KERN_NOTICE "homa_resend_data " - "couldn't allocate skb\n"); + pr_notice("%s couldn't allocate skb\n", + __func__); UNIT_LOG("; ", "skb allocation error"); goto resend_done; } @@ -689,11 +673,10 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end, err = homa_skb_append_from_skb(rpc->hsk->homa, new_skb, skb, seg_offset, seg_length); if (err != 0) { - printk(KERN_ERR "homa_resend_data got error %d " - "from homa_skb_append_from_skb\n", - err); - UNIT_LOG("; ", "homa_resend_data got error %d " - "while copying data", -err); + pr_err("%s got error %d from homa_skb_append_from_skb\n", + __func__, err); + UNIT_LOG("; ", "%s got error %d while copying data", + __func__, -err); kfree_skb(new_skb); goto resend_done; } @@ -842,7 +825,7 @@ int homa_pacer_main(void *transportInfo) void homa_pacer_xmit(struct homa *homa) { struct homa_rpc *rpc; - int i; + int i; /* Make sure only one instance of this function executes at a * time. 
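 *
 * (The exclusion appears to be a nonblocking acquire of
 * homa->pacer_mutex, which is released at "done:" below; a concurrent
 * caller would return immediately, roughly:
 *
 *     if (!spin_trylock_bh(&homa->pacer_mutex))
 *             return;
 *
 * The acquire itself sits above this hunk, so its exact form is an
 * inference.)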
@@ -914,8 +897,7 @@ void homa_pacer_xmit(struct homa *homa) } homa_throttle_unlock(homa); - tt_record4("pacer calling homa_xmit_data for rpc id %llu, " - "port %d, offset %d, bytes_left %d", + tt_record4("pacer calling homa_xmit_data for rpc id %llu, port %d, offset %d, bytes_left %d", rpc->id, rpc->hsk->port, rpc->msgout.next_xmit_offset, rpc->msgout.length - rpc->msgout.next_xmit_offset); @@ -931,8 +913,7 @@ void homa_pacer_xmit(struct homa *homa) */ homa_throttle_lock(homa); if (!list_empty(&rpc->throttled_links)) { - tt_record2("pacer removing id %d from " - "throttled list, offset %d", + tt_record2("pacer removing id %d from throttled list, offset %d", rpc->id, rpc->msgout.next_xmit_offset); list_del_rcu(&rpc->throttled_links); @@ -954,7 +935,7 @@ void homa_pacer_xmit(struct homa *homa) } homa_rpc_unlock(rpc); } - done: +done: spin_unlock_bh(&homa->pacer_mutex); } @@ -985,9 +966,8 @@ void homa_add_to_throttled(struct homa_rpc *rpc) int checks = 0; __u64 now; - if (!list_empty(&rpc->throttled_links)) { + if (!list_empty(&rpc->throttled_links)) return; - } now = get_cycles(); if (!list_empty(&homa->throttled_rpcs)) INC_METRIC(throttled_cycles, now - homa->throttle_add); @@ -997,6 +977,7 @@ void homa_add_to_throttled(struct homa_rpc *rpc) list_for_each_entry_rcu(candidate, &homa->throttled_rpcs, throttled_links) { int bytes_left_cand; + checks++; /* Watch out: the pacer might have just transmitted the last @@ -1049,13 +1030,13 @@ void homa_log_throttled(struct homa *homa) int rpcs = 0; int64_t bytes = 0; - printk(KERN_NOTICE "Printing throttled list\n"); + pr_notice("Printing throttled list\n"); homa_throttle_lock(homa); list_for_each_entry_rcu(rpc, &homa->throttled_rpcs, throttled_links) { rpcs++; if (!homa_bucket_try_lock(rpc->bucket, rpc->id, "homa_log_throttled")) { - printk(KERN_NOTICE "Skipping throttled RPC: locked\n"); + pr_notice("Skipping throttled RPC: locked\n"); continue; } if (*rpc->msgout.next_xmit != NULL) @@ -1066,6 +1047,6 @@ void homa_log_throttled(struct homa *homa) homa_rpc_unlock(rpc); } homa_throttle_unlock(homa); - printk(KERN_NOTICE "Finished printing throttle list: %d rpcs, " - "%lld bytes\n", rpcs, bytes); + pr_notice("Finished printing throttle list: %d rpcs, %lld bytes\n", + rpcs, bytes); } diff --git a/homa_peertab.c b/homa_peertab.c index ba4d07d7..c9ac76e5 100644 --- a/homa_peertab.c +++ b/homa_peertab.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2022 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause /* This file manages homa_peertab objects and is responsible for creating * and deleting homa_peer objects. @@ -21,15 +19,15 @@ int homa_peertab_init(struct homa_peertab *peertab) * an error. */ int i; + spin_lock_init(&peertab->write_lock); INIT_LIST_HEAD(&peertab->dead_dsts); - peertab->buckets = (struct hlist_head *) vmalloc( + peertab->buckets = vmalloc( HOMA_PEERTAB_BUCKETS * sizeof(*peertab->buckets)); if (!peertab->buckets) return -ENOMEM; - for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) { + for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) INIT_HLIST_HEAD(&peertab->buckets[i]); - } return 0; } @@ -45,6 +43,7 @@ void homa_peertab_destroy(struct homa_peertab *peertab) int i; struct homa_peer *peer; struct hlist_node *next; + if (!peertab->buckets) return; @@ -68,7 +67,7 @@ void homa_peertab_destroy(struct homa_peertab *peertab) * caller must free this. If there is an error, or if there * are no peers, NULL is returned. 
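 *
 * Typical use (hypothetical caller, shown only to make the ownership
 * rule above concrete):
 *
 *     int n;
 *     struct homa_peer **peers = homa_peertab_get_peers(peertab, &n);
 *     if (peers) {
 *             for (int i = 0; i < n; i++)
 *                     examine(peers[i]);
 *             kfree(peers);
 *     }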
*/ -struct homa_peer ** homa_peertab_get_peers(struct homa_peertab *peertab, +struct homa_peer **homa_peertab_get_peers(struct homa_peertab *peertab, int *num_peers) { int i, count; @@ -91,7 +90,7 @@ struct homa_peer ** homa_peertab_get_peers(struct homa_peertab *peertab, if (count == 0) return NULL; - result = (struct homa_peer **) kmalloc(count*sizeof(peer), GFP_KERNEL); + result = kmalloc_array(count, sizeof(peer), GFP_KERNEL); if (result == NULL) return NULL; *num_peers = count; @@ -149,16 +148,17 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab, */ struct homa_peer *peer; struct dst_entry *dst; + // Should use siphash or jhash here: __u32 bucket = hash_32(addr->in6_u.u6_addr32[0], HOMA_PEERTAB_BUCKET_BITS); + bucket ^= hash_32(addr->in6_u.u6_addr32[1], HOMA_PEERTAB_BUCKET_BITS); bucket ^= hash_32(addr->in6_u.u6_addr32[2], HOMA_PEERTAB_BUCKET_BITS); bucket ^= hash_32(addr->in6_u.u6_addr32[3], HOMA_PEERTAB_BUCKET_BITS); hlist_for_each_entry_rcu(peer, &peertab->buckets[bucket], peertab_links) { - if (ipv6_addr_equal(&peer->addr, addr)) { + if (ipv6_addr_equal(&peer->addr, addr)) return peer; - } INC_METRIC(peer_hash_links, 1); } @@ -206,7 +206,7 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab, spin_lock_init(&peer->ack_lock); INC_METRIC(peer_new_entries, 1); - done: +done: spin_unlock_bh(&peertab->write_lock); return peer; } @@ -228,8 +228,8 @@ void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, if (IS_ERR(dst)) { /* Retain the existing dst if we can't create a new one. */ if (hsk->homa->verbose) - printk(KERN_NOTICE "homa_refresh couldn't recreate " - "dst: error %ld", PTR_ERR(dst)); + pr_notice("%s couldn't recreate dst: error %ld", + __func__, PTR_ERR(dst)); INC_METRIC(peer_route_errors, 1); } else { struct homa_dead_dst *dead = (struct homa_dead_dst *) @@ -266,6 +266,7 @@ int homa_unsched_priority(struct homa *homa, struct homa_peer *peer, int length) { int i; + for (i = homa->num_priorities-1; ; i--) { if (peer->unsched_cutoffs[i] >= length) return i; @@ -287,6 +288,7 @@ struct dst_entry *homa_peer_get_dst(struct homa_peer *peer, memset(&peer->flow, 0, sizeof(peer->flow)); if (inet->sk.sk_family == AF_INET) { struct rtable *rt; + flowi4_init_output(&peer->flow.u.ip4, inet->sk.sk_bound_dev_if, inet->sk.sk_mark, inet->tos, RT_SCOPE_UNIVERSE, inet->sk.sk_protocol, 0, @@ -298,27 +300,26 @@ struct dst_entry *homa_peer_get_dst(struct homa_peer *peer, if (IS_ERR(rt)) return (struct dst_entry *)(PTR_ERR(rt)); return &rt->dst; - } else { - peer->flow.u.ip6.flowi6_oif = inet->sk.sk_bound_dev_if; - peer->flow.u.ip6.flowi6_iif = LOOPBACK_IFINDEX; - peer->flow.u.ip6.flowi6_mark = inet->sk.sk_mark; - peer->flow.u.ip6.flowi6_scope = RT_SCOPE_UNIVERSE; - peer->flow.u.ip6.flowi6_proto = inet->sk.sk_protocol; - peer->flow.u.ip6.flowi6_flags = 0; - peer->flow.u.ip6.flowi6_secid = 0; - peer->flow.u.ip6.flowi6_tun_key.tun_id = 0; - peer->flow.u.ip6.flowi6_uid = inet->sk.sk_uid; - peer->flow.u.ip6.daddr = peer->addr; - peer->flow.u.ip6.saddr = inet->pinet6->saddr; - peer->flow.u.ip6.fl6_dport = 0; - peer->flow.u.ip6.fl6_sport = 0; - peer->flow.u.ip6.mp_hash = 0; - peer->flow.u.ip6.__fl_common.flowic_tos = inet->tos; - peer->flow.u.ip6.flowlabel = ip6_make_flowinfo(inet->tos, 0); - security_sk_classify_flow(&inet->sk, &peer->flow.u.__fl_common); - return ip6_dst_lookup_flow(sock_net(&inet->sk), &inet->sk, - &peer->flow.u.ip6, NULL); } + peer->flow.u.ip6.flowi6_oif = inet->sk.sk_bound_dev_if; + peer->flow.u.ip6.flowi6_iif = LOOPBACK_IFINDEX; + 
peer->flow.u.ip6.flowi6_mark = inet->sk.sk_mark; + peer->flow.u.ip6.flowi6_scope = RT_SCOPE_UNIVERSE; + peer->flow.u.ip6.flowi6_proto = inet->sk.sk_protocol; + peer->flow.u.ip6.flowi6_flags = 0; + peer->flow.u.ip6.flowi6_secid = 0; + peer->flow.u.ip6.flowi6_tun_key.tun_id = 0; + peer->flow.u.ip6.flowi6_uid = inet->sk.sk_uid; + peer->flow.u.ip6.daddr = peer->addr; + peer->flow.u.ip6.saddr = inet->pinet6->saddr; + peer->flow.u.ip6.fl6_dport = 0; + peer->flow.u.ip6.fl6_sport = 0; + peer->flow.u.ip6.mp_hash = 0; + peer->flow.u.ip6.__fl_common.flowic_tos = inet->tos; + peer->flow.u.ip6.flowlabel = ip6_make_flowinfo(inet->tos, 0); + security_sk_classify_flow(&inet->sk, &peer->flow.u.__fl_common); + return ip6_dst_lookup_flow(sock_net(&inet->sk), &inet->sk, + &peer->flow.u.ip6, NULL); } /** @@ -357,6 +358,7 @@ void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, int c2, void homa_peer_lock_slow(struct homa_peer *peer) { __u64 start = get_cycles(); + tt_record("beginning wait for peer lock"); spin_lock_bh(&peer->ack_lock); tt_record("ending wait for peer lock"); diff --git a/homa_plumbing.c b/homa_plumbing.c index ba820ffa..743cb8ad 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause /* This file consists mostly of "glue" that hooks Homa into the rest of * the Linux kernel. The guts of the protocol are in other files. @@ -29,7 +27,7 @@ struct homa *homa = &homa_data; /* True means that the Homa module is in the process of unloading itself, * so everyone should clean up. */ -static bool exiting = false; +static bool exiting; /* Thread that runs timer code to detect lost packets and crashed peers. */ static struct task_struct *timer_kthread; @@ -190,7 +188,7 @@ static const struct proc_ops homa_metrics_pops = { }; /* Used to remove /proc/net/homa_metrics when the module is unloaded. */ -static struct proc_dir_entry *metrics_dir_entry = NULL; +static struct proc_dir_entry *metrics_dir_entry; /* Used to configure sysctl access to Homa configuration parameters.*/ static struct ctl_table homa_ctl_table[] = { @@ -520,18 +518,12 @@ static DECLARE_COMPLETION(timer_thread_done); * homa_load() - invoked when this module is loaded into the Linux kernel * Return: 0 on success, otherwise a negative errno. 
*/ -static int __init homa_load(void) { +static int __init homa_load(void) +{ int status; - printk(KERN_NOTICE "Homa module loading\n"); - printk(KERN_NOTICE "Homa structure sizes: data_header %u, " - "seg_header %u, ack %u, " - "grant_header %u, peer %u, ip_hdr %u flowi %u " - "ipv6_hdr %u, flowi6 %u " - "tcp_sock %u homa_rpc %u sk_buff %u " - "rcvmsg_control %u sockaddr_in_union %u " - "HOMA_MAX_BPAGES %u NR_CPUS %u " - "nr_cpu_ids %u, MAX_NUMNODES %d\n", + pr_notice("Homa module loading\n"); + pr_notice("Homa structure sizes: data_header %u, seg_header %u, ack %u, grant_header %u, peer %u, ip_hdr %u flowi %u ipv6_hdr %u, flowi6 %u tcp_sock %u homa_rpc %u sk_buff %u rcvmsg_control %u union sockaddr_in_union %u HOMA_MAX_BPAGES %u NR_CPUS %u nr_cpu_ids %u, MAX_NUMNODES %d\n", sizeof32(struct data_header), sizeof32(struct seg_header), sizeof32(struct homa_ack), @@ -545,45 +537,43 @@ static int __init homa_load(void) { sizeof32(struct homa_rpc), sizeof32(struct sk_buff), sizeof32(struct homa_recvmsg_args), - sizeof32(sockaddr_in_union), + sizeof32(union sockaddr_in_union), HOMA_MAX_BPAGES, NR_CPUS, nr_cpu_ids, MAX_NUMNODES); status = proto_register(&homa_prot, 1); if (status != 0) { - printk(KERN_ERR "proto_register failed for homa_prot: %d\n", - status); + pr_err("proto_register failed for homa_prot: %d\n", status); goto out; } status = proto_register(&homav6_prot, 1); if (status != 0) { - printk(KERN_ERR "proto_register failed for homav6_prot: %d\n", - status); + pr_err("proto_register failed for homav6_prot: %d\n", status); goto out; } inet_register_protosw(&homa_protosw); inet6_register_protosw(&homav6_protosw); status = inet_add_protocol(&homa_protocol, IPPROTO_HOMA); if (status != 0) { - printk(KERN_ERR "inet_add_protocol failed in homa_load: %d\n", - status); + pr_err("inet_add_protocol failed in %s: %d\n", __func__, + status); goto out_cleanup; } status = inet6_add_protocol(&homav6_protocol, IPPROTO_HOMA); if (status != 0) { - printk(KERN_ERR "inet6_add_protocol failed in homa_load: %d\n", - status); + pr_err("inet6_add_protocol failed in %s: %d\n", __func__, + status); goto out_cleanup; } status = homa_init(homa); if (status) goto out_cleanup; - metrics_dir_entry = proc_create("homa_metrics", S_IRUGO, + metrics_dir_entry = proc_create("homa_metrics", 0444, init_net.proc_net, &homa_metrics_pops); if (!metrics_dir_entry) { - printk(KERN_ERR "couldn't create /proc/net/homa_metrics\n"); + pr_err("couldn't create /proc/net/homa_metrics\n"); status = -ENOMEM; goto out_cleanup; } @@ -591,21 +581,21 @@ static int __init homa_load(void) { homa_ctl_header = register_net_sysctl(&init_net, "net/homa", homa_ctl_table); if (!homa_ctl_header) { - printk(KERN_ERR "couldn't register Homa sysctl parameters\n"); + pr_err("couldn't register Homa sysctl parameters\n"); status = -ENOMEM; goto out_cleanup; } status = homa_offload_init(); if (status != 0) { - printk(KERN_ERR "Homa couldn't init offloads\n"); + pr_err("Homa couldn't init offloads\n"); goto out_cleanup; } timer_kthread = kthread_run(homa_timer_main, homa, "homa_timer"); if (IS_ERR(timer_kthread)) { status = PTR_ERR(timer_kthread); - printk(KERN_ERR "couldn't create homa pacer thread: error %d\n", + pr_err("couldn't create homa pacer thread: error %d\n", status); timer_kthread = NULL; goto out_cleanup; @@ -635,8 +625,9 @@ static int __init homa_load(void) { /** * homa_unload() - invoked when this module is unloaded from the Linux kernel. 
*/ -static void __exit homa_unload(void) { - printk(KERN_NOTICE "Homa module unloading\n"); +static void __exit homa_unload(void) +{ + pr_notice("Homa module unloading\n"); exiting = true; tt_destroy(); @@ -645,7 +636,7 @@ static void __exit homa_unload(void) { if (timer_kthread) wake_up_process(timer_kthread); if (homa_offload_end() != 0) - printk(KERN_ERR "Homa couldn't stop offloads\n"); + pr_err("Homa couldn't stop offloads\n"); wait_for_completion(&timer_thread_done); unregister_net_sysctl_table(homa_ctl_header); proc_remove(metrics_dir_entry); @@ -674,21 +665,18 @@ module_exit(homa_unload); int homa_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct homa_sock *hsk = homa_sk(sock->sk); - sockaddr_in_union *addr_in = (sockaddr_in_union *) addr; + union sockaddr_in_union *addr_in = (union sockaddr_in_union *) addr; int port = 0; - if (unlikely(addr->sa_family != sock->sk->sk_family)) { + if (unlikely(addr->sa_family != sock->sk->sk_family)) return -EAFNOSUPPORT; - } if (addr_in->in6.sin6_family == AF_INET6) { - if (addr_len < sizeof(struct sockaddr_in6)) { + if (addr_len < sizeof(struct sockaddr_in6)) return -EINVAL; - } port = ntohs(addr_in->in4.sin_port); } else if (addr_in->in4.sin_family == AF_INET) { - if (addr_len < sizeof(struct sockaddr_in)) { + if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; - } port = ntohs(addr_in->in6.sin6_port); } return homa_sock_bind(&homa->port_map, hsk, port); @@ -699,8 +687,10 @@ int homa_bind(struct socket *sock, struct sockaddr *addr, int addr_len) * @sk: Socket being closed * @timeout: ?? */ -void homa_close(struct sock *sk, long timeout) { +void homa_close(struct sock *sk, long timeout) +{ struct homa_sock *hsk = homa_sk(sk); + homa_sock_destroy(hsk); sk_common_release(sk); tt_record1("closed socket, port %d\n", hsk->port); @@ -731,9 +721,10 @@ int homa_shutdown(struct socket *sock, int how) * * Return: 0 on success, otherwise a negative errno. */ -int homa_disconnect(struct sock *sk, int flags) { - printk(KERN_WARNING "unimplemented disconnect invoked on Homa socket\n"); - return -ENOSYS; +int homa_disconnect(struct sock *sk, int flags) +{ + pr_warn("unimplemented disconnect invoked on Homa socket\n"); + return -EINVAL; } /** @@ -744,7 +735,8 @@ int homa_disconnect(struct sock *sk, int flags) { * * Return: 0 on success, otherwise a negative errno. */ -int homa_ioc_abort(struct sock *sk, int *karg) { +int homa_ioc_abort(struct sock *sk, int *karg) +{ int ret = 0; struct homa_sock *hsk = homa_sk(sk); struct homa_abort_args args; @@ -753,9 +745,8 @@ int homa_ioc_abort(struct sock *sk, int *karg) { if (unlikely(copy_from_user(&args, (void *) karg, sizeof(args)))) return -EFAULT; - if (args._pad1 || args._pad2[0] || args._pad2[1]) { + if (args._pad1 || args._pad2[0] || args._pad2[1]) return -EINVAL; - } if (args.id == 0) { homa_abort_sock_rpcs(hsk, -args.error); return 0; @@ -764,11 +755,10 @@ int homa_ioc_abort(struct sock *sk, int *karg) { rpc = homa_find_client_rpc(hsk, args.id); if (rpc == NULL) return -EINVAL; - if (args.error == 0) { + if (args.error == 0) homa_rpc_free(rpc); - } else { + else homa_rpc_abort(rpc, -args.error); - } homa_rpc_unlock(rpc); return ret; } @@ -782,7 +772,8 @@ int homa_ioc_abort(struct sock *sk, int *karg) { * * Return: 0 on success, otherwise a negative errno. 
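 *
 * User-space view, to make the argument handling below concrete (the
 * request code HOMAIOCABORT and the header declaring it are
 * assumptions; only the in-kernel handler appears in this patch):
 *
 *     struct homa_abort_args args = {.id = rpc_id, .error = ECANCELED};
 *     ioctl(fd, HOMAIOCABORT, &args);
 *
 * With args.id == 0 all of the socket's RPCs are aborted; with
 * args.error == 0 the matching RPC is freed instead of aborted.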
*/ -int homa_ioctl(struct sock *sk, int cmd, int *karg) { +int homa_ioctl(struct sock *sk, int cmd, int *karg) +{ int result; __u64 start = get_cycles(); @@ -793,13 +784,12 @@ int homa_ioctl(struct sock *sk, int cmd, int *karg) { INC_METRIC(abort_cycles, get_cycles() - start); break; case HOMAIOCFREEZE: - tt_record1("Freezing timetrace because of HOMAIOCFREEZE ioctl, " - "pid %d", current->pid); + tt_record1("Freezing timetrace because of HOMAIOCFREEZE ioctl, pid %d", current->pid); tt_freeze(); result = 0; break; default: - printk(KERN_NOTICE "Unknown Homa ioctl: %d\n", cmd); + pr_notice("Unknown Homa ioctl: %d\n", cmd); result = -EINVAL; break; } @@ -816,6 +806,7 @@ int homa_ioctl(struct sock *sk, int cmd, int *karg) { int homa_socket(struct sock *sk) { struct homa_sock *hsk = homa_sk(sk); + homa_sock_init(hsk, homa); return 0; } @@ -870,9 +861,10 @@ int homa_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, * Return: 0 on success, otherwise a negative errno. */ int homa_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *option) { - printk(KERN_WARNING "unimplemented getsockopt invoked on Homa socket:" - " level %d, optname %d\n", level, optname); + char __user *optval, int __user *option) +{ + pr_warn("unimplemented getsockopt invoked on Homa socket: level %d, optname %d\n", + level, optname); return -EINVAL; } @@ -885,14 +877,15 @@ int homa_getsockopt(struct sock *sk, int level, int optname, * @length: Number of bytes of the message. * Return: 0 on success, otherwise a negative errno. */ -int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) { +int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) +{ struct homa_sock *hsk = homa_sk(sk); struct homa_sendmsg_args args; __u64 start = get_cycles(); __u64 finish; int result = 0; struct homa_rpc *rpc = NULL; - sockaddr_in_union *addr = (sockaddr_in_union *) msg->msg_name; + union sockaddr_in_union *addr = (union sockaddr_in_union *) msg->msg_name; homa_cores[raw_smp_processor_id()]->last_app_active = start; if (unlikely(!msg->msg_control_is_user)) { @@ -969,9 +962,8 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) { * this could be totally valid (e.g. client is * no longer interested in it). 
*/ - tt_record2("homa_sendmsg error: RPC id %d, peer 0x%x, " - "doesn't exist", args.id, - tt_addr(canonical_dest)); + tt_record2("homa_sendmsg error: RPC id %d, peer 0x%x, doesn't exist", + args.id, tt_addr(canonical_dest)); return 0; } if (rpc->error) { @@ -979,8 +971,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) { goto error; } if (rpc->state != RPC_IN_SERVICE) { - tt_record2("homa_sendmsg error: RPC id %d in bad " - "state %d", rpc->id, rpc->state); + tt_record2("homa_sendmsg error: RPC id %d in bad state %d", rpc->id, rpc->state); homa_rpc_unlock(rpc); rpc = 0; result = -EINVAL; @@ -1081,18 +1072,17 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, */ if (rpc->hsk->homa->freeze_type == SLOW_RPC) { uint64_t elapsed = (get_cycles() - rpc->start_cycles)>>10; + if ((elapsed <= hsk->homa->temp[1]) && (elapsed >= hsk->homa->temp[0]) && homa_is_client(rpc->id) && (rpc->msgin.length >= hsk->homa->temp[2]) && (rpc->msgin.length < hsk->homa->temp[3])) { - tt_record4("Long RTT: kcycles %d, id %d, peer 0x%x, " - "length %d", + tt_record4("Long RTT: kcycles %d, id %d, peer 0x%x, length %d", elapsed, rpc->id, tt_addr(rpc->peer->addr), rpc->msgin.length); - homa_freeze(rpc, SLOW_RPC, "Freezing because of long " - "elapsed time for RPC id %d, peer 0x%x"); + homa_freeze(rpc, SLOW_RPC, "Freezing because of long elapsed time for RPC id %d, peer 0x%x"); } } @@ -1106,12 +1096,14 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, } if (sk->sk_family == AF_INET6) { struct sockaddr_in6 *in6 = msg->msg_name; + in6->sin6_family = AF_INET6; in6->sin6_port = htons(rpc->dport); in6->sin6_addr = rpc->peer->addr; *addr_len = sizeof(*in6); } else { struct sockaddr_in *in4 = msg->msg_name; + in4->sin_family = AF_INET; in4->sin_port = htons(rpc->dport); in4->sin_addr.s_addr = ipv6_to_ipv4( @@ -1141,7 +1133,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, done: if (unlikely(copy_to_user(msg->msg_control, &control, sizeof(control)))) { /* Note: in this case the message's buffers will be leaked. */ - printk(KERN_NOTICE "homa_recvmsg couldn't copy back args\n"); + pr_notice("%s couldn't copy back args\n", __func__); result = -EFAULT; } @@ -1158,8 +1150,9 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, * @sk: Socket for the operation * Return: ?? */ -int homa_hash(struct sock *sk) { - printk(KERN_WARNING "unimplemented hash invoked on Homa socket\n"); +int homa_hash(struct sock *sk) +{ + pr_warn("unimplemented hash invoked on Homa socket\n"); return 0; } @@ -1167,17 +1160,18 @@ int homa_hash(struct sock *sk) { * homa_unhash() - ??. * @sk: Socket for the operation */ -void homa_unhash(struct sock *sk) { - return; - printk(KERN_WARNING "unimplemented unhash invoked on Homa socket\n"); +void homa_unhash(struct sock *sk) +{ + pr_warn("unimplemented unhash invoked on Homa socket\n"); } /** * homa_rehash() - ??. * @sk: Socket for the operation */ -void homa_rehash(struct sock *sk) { - printk(KERN_WARNING "unimplemented rehash invoked on Homa socket\n"); +void homa_rehash(struct sock *sk) +{ + pr_warn("unimplemented rehash invoked on Homa socket\n"); } /** @@ -1187,7 +1181,8 @@ void homa_rehash(struct sock *sk) { * @snum: Unclear what this is. * Return: Zero for success, or a negative errno for an error. 
*/ -int homa_get_port(struct sock *sk, unsigned short snum) { +int homa_get_port(struct sock *sk, unsigned short snum) +{ /* Homa always assigns ports immediately when a socket is created, * so there is nothing to do here. */ @@ -1200,9 +1195,10 @@ int homa_get_port(struct sock *sk, unsigned short snum) { * @err: ?? * Return: ?? */ -int homa_diag_destroy(struct sock *sk, int err) { - printk(KERN_WARNING "unimplemented diag_destroy invoked on Homa socket\n"); - return -ENOSYS; +int homa_diag_destroy(struct sock *sk, int err) +{ + pr_warn("unimplemented diag_destroy invoked on Homa socket\n"); + return -EINVAL; } @@ -1211,8 +1207,9 @@ int homa_diag_destroy(struct sock *sk, int err) { * @skb: Socket buffer. * Return: Always 0? */ -int homa_v4_early_demux(struct sk_buff *skb) { - printk(KERN_WARNING "unimplemented early_demux invoked on Homa socket\n"); +int homa_v4_early_demux(struct sk_buff *skb) +{ + pr_warn("unimplemented early_demux invoked on Homa socket\n"); return 0; } @@ -1221,8 +1218,9 @@ int homa_v4_early_demux(struct sk_buff *skb) { * @skb: Socket buffer. * @return: Always 0? */ -int homa_v4_early_demux_handler(struct sk_buff *skb) { - printk(KERN_WARNING "unimplemented early_demux_handler invoked on Homa socket\n"); +int homa_v4_early_demux_handler(struct sk_buff *skb) +{ + pr_warn("unimplemented early_demux_handler invoked on Homa socket\n"); return 0; } @@ -1232,11 +1230,12 @@ int homa_v4_early_demux_handler(struct sk_buff *skb) { * @skb: The incoming packet. * Return: Always 0 */ -int homa_softirq(struct sk_buff *skb) { +int homa_softirq(struct sk_buff *skb) +{ struct common_header *h; struct sk_buff *packets, *other_pkts, *next; struct sk_buff **prev_link, **other_link; - static __u64 last = 0; + static __u64 last; __u64 start; int header_offset; int first_packet = 1; @@ -1247,12 +1246,13 @@ int homa_softirq(struct sk_buff *skb) { homa_cores[raw_smp_processor_id()]->last_active = start; if ((start - last) > 1000000) { int scaled_ms = (int) (10*(start-last)/cpu_khz); + if ((scaled_ms >= 50) && (scaled_ms < 10000)) { // tt_record3("Gap in incoming packets: %d cycles " // "(%d.%1d ms)", // (int) (start - last), scaled_ms/10, // scaled_ms%10); -// printk(KERN_NOTICE "Gap in incoming packets: %llu " +// pr_notice("Gap in incoming packets: %llu " // "cycles, (%d.%1d ms)", (start - last), // scaled_ms/10, scaled_ms%10); } @@ -1271,6 +1271,7 @@ int homa_softirq(struct sk_buff *skb) { prev_link = &packets; for (skb = packets; skb != NULL; skb = next) { const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); + next = skb->next; /* Make the header available at skb->data, even if the packet @@ -1284,9 +1285,7 @@ int homa_softirq(struct sk_buff *skb) { pull_length = skb->len; if (!pskb_may_pull(skb, pull_length)) { if (homa->verbose) - printk(KERN_NOTICE "Homa can't handle fragmented " - "packet (no space for header); " - "discarding\n"); + pr_notice("Homa can't handle fragmented packet (no space for header); discarding\n"); UNIT_LOG("", "pskb discard"); goto discard; } @@ -1300,9 +1299,7 @@ int homa_softirq(struct sk_buff *skb) { || (h->type >= BOGUS) || (skb->len < header_lengths[h->type-DATA]))) { if (homa->verbose) - printk(KERN_WARNING - "Homa %s packet from %s too " - "short: %d bytes\n", + pr_warn("Homa %s packet from %s too short: %d bytes\n", homa_symbol_for_type(h->type), homa_print_ipv6_addr(&saddr), skb->len - header_offset); @@ -1311,8 +1308,7 @@ int homa_softirq(struct sk_buff *skb) { } if (first_packet) { - tt_record4("homa_softirq: first packet from 0x%x:%d, " - "id 
%llu, type %d", + tt_record4("homa_softirq: first packet from 0x%x:%d, id %llu, type %d", tt_addr(saddr), ntohs(h->sport), homa_local_id(h->sender_id), h->type); first_packet = 0; @@ -1324,8 +1320,7 @@ int homa_softirq(struct sk_buff *skb) { if (unlikely(h->type == FREEZE)) { if (!tt_frozen) { homa_rpc_log_active_tt(homa, 0); - tt_record4("Freezing because of request on " - "port %d from 0x%x:%d, id %d", + tt_record4("Freezing because of request on port %d from 0x%x:%d, id %d", ntohs(h->dport), tt_addr(saddr), ntohs(h->sport), homa_local_id(h->sender_id)); @@ -1348,7 +1343,7 @@ int homa_softirq(struct sk_buff *skb) { prev_link = &skb->next; continue; - discard: +discard: *prev_link = skb->next; kfree_skb(skb); } @@ -1412,7 +1407,7 @@ int homa_softirq(struct sk_buff *skb) { */ int homa_backlog_rcv(struct sock *sk, struct sk_buff *skb) { - printk(KERN_WARNING "unimplemented backlog_rcv invoked on Homa socket\n"); + pr_warn("unimplemented backlog_rcv invoked on Homa socket\n"); kfree_skb(skb); return 0; } @@ -1435,12 +1430,14 @@ int homa_err_handler_v4(struct sk_buff *skb, u32 info) if ((type == ICMP_DEST_UNREACH) && (code == ICMP_PORT_UNREACH)) { struct common_header *h; char *icmp = (char *) icmp_hdr(skb); + iph = (struct iphdr *) (icmp + sizeof(struct icmphdr)); h = (struct common_header *) (icmp + sizeof(struct icmphdr) + iph->ihl*4); homa_abort_rpcs(homa, &saddr, htons(h->dport), -ENOTCONN); } else if (type == ICMP_DEST_UNREACH) { int error; + if (code == ICMP_PROT_UNREACH) error = -EPROTONOSUPPORT; else @@ -1449,9 +1446,8 @@ int homa_err_handler_v4(struct sk_buff *skb, u32 info) iph->saddr, iph->daddr); homa_abort_rpcs(homa, &saddr, 0, error); } else { - printk(KERN_NOTICE "homa_err_handler_v4 invoked with " - "info %x, ICMP type %d, ICMP code %d\n", - info, type, code); + pr_notice("%s invoked with info %x, ICMP type %d, ICMP code %d\n", + __func__, info, type, code); } return 0; } @@ -1476,12 +1472,14 @@ int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, if ((type == ICMPV6_DEST_UNREACH) && (code == ICMPV6_PORT_UNREACH)) { struct common_header *h; char *icmp = (char *) icmp_hdr(skb); + iph = (struct ipv6hdr *) (icmp + sizeof(struct icmphdr)); h = (struct common_header *) (icmp + sizeof(struct icmphdr) + HOMA_IPV6_HEADER_LENGTH); homa_abort_rpcs(homa, &iph->daddr, htons(h->dport), -ENOTCONN); } else if (type == ICMPV6_DEST_UNREACH) { int error; + if (code == ICMP_PROT_UNREACH) error = -EPROTONOSUPPORT; else @@ -1491,9 +1489,8 @@ int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, homa_abort_rpcs(homa, &iph->daddr, 0, error); } else { if (homa->verbose) - printk(KERN_NOTICE "homa_err_handler_v6 invoked with " - "info %x, ICMP type %d, ICMP code %d\n", - info, type, code); + pr_notice("%s invoked with info %x, ICMP type %d, ICMP code %d\n", + __func__, info, type, code); } return 0; } @@ -1510,7 +1507,8 @@ int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, * state of the socket. */ __poll_t homa_poll(struct file *file, struct socket *sock, - struct poll_table_struct *wait) { + struct poll_table_struct *wait) +{ struct sock *sk = sock->sk; __poll_t mask; @@ -1546,9 +1544,8 @@ int homa_metrics_open(struct inode *inode, struct file *file) * completely closed. 
*/ spin_lock(&homa->metrics_lock); - if (homa->metrics_active_opens == 0) { + if (homa->metrics_active_opens == 0) homa_print_metrics(homa); - } homa->metrics_active_opens++; spin_unlock(&homa->metrics_lock); return 0; @@ -1628,6 +1625,7 @@ int homa_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int result; + result = proc_dointvec(table, write, buffer, lenp, ppos); if (write) { /* Don't worry which particular value changed; update @@ -1664,8 +1662,7 @@ int homa_dointvec(struct ctl_table *table, int write, else if (action == 5) tt_printk(); else if (action == 6) { - tt_record("Calling homa_rpc_log_active because " - "of action 6"); + tt_record("Calling homa_rpc_log_active because of action 6"); homa_rpc_log_active_tt(homa, 0); tt_record("Freezing because of action 6"); tt_freeze(); @@ -1676,7 +1673,7 @@ int homa_dointvec(struct ctl_table *table, int write, tt_record("Finished freezing cluster"); tt_freeze(); } else if (action == 8) { - printk(KERN_NOTICE"homa_total_incoming is %d\n", + pr_notice("homa_total_incoming is %d\n", atomic_read(&homa->total_incoming)); } else if (action == 9) { tt_print_file("/users/ouster/node.tt"); @@ -1709,7 +1706,7 @@ int homa_sysctl_softirq_cores(struct ctl_table *table, int write, int max_values, *values; max_values = (NUM_GEN3_SOFTIRQ_CORES + 1) * nr_cpu_ids; - values = (int *) kmalloc(max_values * sizeof(int), GFP_KERNEL); + values = kmalloc_array(max_values, sizeof(int), GFP_KERNEL); if (values == NULL) return -ENOMEM; @@ -1728,6 +1725,7 @@ int homa_sysctl_softirq_cores(struct ctl_table *table, int write, for (i = 0; i < max_values; i += NUM_GEN3_SOFTIRQ_CORES + 1) { int j; + if (values[i] < 0) break; core = homa_cores[values[i]]; diff --git a/homa_pool.c b/homa_pool.c index a8dec585..5ea9ab47 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2022-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" @@ -29,7 +27,8 @@ * The caller must own the lock for @pool->hsk. * @pool: Pool to update. */ -inline static void set_bpages_needed(struct homa_pool *pool) { +static inline void set_bpages_needed(struct homa_pool *pool) +{ struct homa_rpc *rpc = list_first_entry(&pool->hsk->waiting_for_bufs, struct homa_rpc, buf_links); pool->bpages_needed = (rpc->msgin.length + HOMA_BPAGE_SIZE - 1) @@ -61,15 +60,15 @@ int homa_pool_init(struct homa_sock *hsk, void *region, __u64 region_size) result = -EINVAL; goto error; } - pool->descriptors = (struct homa_bpage *) kmalloc( - pool->num_bpages * sizeof(struct homa_bpage), - GFP_ATOMIC); + pool->descriptors = kmalloc_array(pool->num_bpages, + sizeof(struct homa_bpage), GFP_ATOMIC); if (!pool->descriptors) { result = -ENOMEM; goto error; } for (i = 0; i < pool->num_bpages; i++) { struct homa_bpage *bp = &pool->descriptors[i]; + spin_lock_init(&bp->lock); atomic_set(&bp->refs, 0); bp->owner = -1; @@ -79,8 +78,8 @@ int homa_pool_init(struct homa_sock *hsk, void *region, __u64 region_size) pool->bpages_needed = INT_MAX; /* Allocate and initialize core-specific data. 
*/ - pool->cores = (struct homa_pool_core *) kmalloc(nr_cpu_ids * - sizeof(struct homa_pool_core), GFP_ATOMIC); + pool->cores = kmalloc_array(nr_cpu_ids, sizeof(struct homa_pool_core), + GFP_ATOMIC); if (!pool->cores) { result = -ENOMEM; goto error; @@ -95,11 +94,9 @@ int homa_pool_init(struct homa_sock *hsk, void *region, __u64 region_size) return 0; - error: - if (pool->descriptors) - kfree(pool->descriptors); - if (pool->cores) - kfree(pool->cores); +error: + kfree(pool->descriptors); + kfree(pool->cores); pool->region = NULL; return result; } @@ -130,7 +127,7 @@ void homa_pool_destroy(struct homa_pool *pool) * of the allocated pages (and the expiration time is also * set). Otherwise the pages are left unowned. * Return: 0 for success, -1 if there wasn't enough free space in the pool. -*/ + */ int homa_pool_get_pages(struct homa_pool *pool, int num_pages, __u32 *pages, int set_owner) { @@ -163,6 +160,7 @@ int homa_pool_get_pages(struct homa_pool *pool, int num_pages, __u32 *pages, */ if (limit == 0) { int extra; + limit = pool->num_bpages - atomic_read(&pool->free_bpages); extra = limit>>2; @@ -295,7 +293,7 @@ int homa_pool_allocate(struct homa_rpc *rpc) goto allocate_partial; /* Can't use the current page; get another one. */ - new_page: +new_page: if (homa_pool_get_pages(pool, 1, pages, 1) != 0) { homa_pool_release_buffers(pool, rpc->msgin.num_bpages, rpc->msgin.bpage_offsets); @@ -305,15 +303,14 @@ int homa_pool_allocate(struct homa_rpc *rpc) core->page_hint = pages[0]; core->allocated = 0; - allocate_partial: +allocate_partial: rpc->msgin.bpage_offsets[rpc->msgin.num_bpages] = core->allocated + (core->page_hint << HOMA_BPAGE_SHIFT); rpc->msgin.num_bpages++; core->allocated += partial; - success: - tt_record4("Allocated %d bpage pointers on port %d for id %d, " - "free_bpages now %d", +success: + tt_record4("Allocated %d bpage pointers on port %d for id %d, free_bpages now %d", rpc->msgin.num_bpages, pool->hsk->port, rpc->id, atomic_read(&pool->free_bpages)); return 0; @@ -321,10 +318,10 @@ int homa_pool_allocate(struct homa_rpc *rpc) /* We get here if there wasn't enough buffer space for this * message; add the RPC to hsk->waiting_for_bufs. 
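 *
 * (For scale: assuming the usual 64 KiB bpages, i.e. HOMA_BPAGE_SHIFT
 * of 16, a 150000-byte message needs (150000 + 65535) >> 16 = 3
 * bpages, the same rounding that set_bpages_needed applies above. The
 * 64 KiB figure is an assumption; this patch never states the size.)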
*/ - out_of_space: +out_of_space: INC_METRIC(buffer_alloc_failures, 1); - tt_record4("Buffer allocation failed, port %d, id %d, length %d, " - "free_bpages %d", pool->hsk->port, rpc->id, + tt_record4("Buffer allocation failed, port %d, id %d, length %d, free_bpages %d", + pool->hsk->port, rpc->id, rpc->msgin.length, atomic_read(&pool->free_bpages)); homa_sock_lock(pool->hsk, "homa_pool_allocate"); @@ -336,7 +333,7 @@ int homa_pool_allocate(struct homa_rpc *rpc) } list_add_tail_rcu(&rpc->buf_links, &pool->hsk->waiting_for_bufs); - queued: +queued: set_bpages_needed(pool); homa_sock_unlock(pool->hsk); return 0; @@ -386,10 +383,11 @@ void homa_pool_release_buffers(struct homa_pool *pool, int num_buffers, return; for (i = 0; i < num_buffers; i++) { __u32 bpage_index = buffers[i] >> HOMA_BPAGE_SHIFT; - struct homa_bpage *bpage= &pool->descriptors[bpage_index]; + struct homa_bpage *bpage = &pool->descriptors[bpage_index]; + if (bpage_index < pool->num_bpages) { - if (atomic_dec_return(&bpage->refs) == 0) - atomic_inc(&pool->free_bpages); + if (atomic_dec_return(&bpage->refs) == 0) + atomic_inc(&pool->free_bpages); } } tt_record3("Released %d bpages, free_bpages for port %d now %d", @@ -413,6 +411,7 @@ void homa_pool_check_waiting(struct homa_pool *pool) #endif while (atomic_read(&pool->free_bpages) >= pool->bpages_needed) { struct homa_rpc *rpc; + homa_sock_lock(pool->hsk, "buffer pool"); if (list_empty(&pool->hsk->waiting_for_bufs)) { pool->bpages_needed = INT_MAX; @@ -429,8 +428,7 @@ void homa_pool_check_waiting(struct homa_pool *pool) * operation again. */ homa_sock_unlock(pool->hsk); - UNIT_LOG("; ", "rpc lock unavailable in " - "homa_pool_release_buffers"); + UNIT_LOG("; ", "rpc lock unavailable in %s", __func__); continue; } list_del_init(&rpc->buf_links); @@ -439,8 +437,7 @@ void homa_pool_check_waiting(struct homa_pool *pool) else set_bpages_needed(pool); homa_sock_unlock(pool->hsk); - tt_record4("Retrying buffer allocation for id %d, length %d, " - "free_bpages %d, new bpages_needed %d", + tt_record4("Retrying buffer allocation for id %d, length %d, free_bpages %d, new bpages_needed %d", rpc->id, rpc->msgin.length, atomic_read(&pool->free_bpages), pool->bpages_needed); @@ -452,4 +449,4 @@ void homa_pool_check_waiting(struct homa_pool *pool) } else homa_rpc_unlock(rpc); } -} \ No newline at end of file +} diff --git a/homa_skb.c b/homa_skb.c index e49455f1..d3a40f1e 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2024 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause /* This file contains functions for allocating and freeing sk_buffs. 
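 *
 * Rough structure, as reflected in the functions below: each core
 * keeps a current skb_page plus a small stash of pages filled by
 * homa_skb_stash_pages, and each NUMA node keeps a homa_page_pool that
 * cores refill from under homa->page_pool_mutex; pages whose refcount
 * falls back to one are recycled into the pools by
 * homa_skb_free_many_tx and trimmed periodically by
 * homa_skb_release_pages.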
*/ @@ -38,8 +36,10 @@ void homa_skb_page_pool_init(struct homa_page_pool *pool) void homa_skb_cleanup(struct homa *homa) { int i, j; + for (i = 0; i < nr_cpu_ids; i++) { struct homa_core *core = homa_cores[i]; + if (core->skb_page != NULL) { put_page(core->skb_page); core->skb_page = NULL; @@ -53,6 +53,7 @@ void homa_skb_cleanup(struct homa *homa) for (i = 0; i < MAX_NUMNODES; i++) { struct homa_numa *numa = homa_numas[i]; + if (!numa) continue; for (j = numa->page_pool.avail - 1; j >= 0; j--) @@ -120,7 +121,7 @@ void homa_skb_stash_pages(struct homa *homa, int length) spin_lock_bh(&homa->page_pool_mutex); while (pool->avail && (core->num_stashed_pages < pages_needed)) { pool->avail--; - if (pool->avail < pool->low_mark ) + if (pool->avail < pool->low_mark) pool->low_mark = pool->avail; core->stashed_pages[core->num_stashed_pages] = pool->pages[pool->avail]; @@ -191,10 +192,11 @@ void *homa_skb_extend_frags(struct homa *homa, struct sk_buff *skb, int *length) * @core: Allocate page in this core. * Return: True if successful, false if memory not available. */ -bool homa_skb_page_alloc(struct homa *homa, struct homa_core * core) +bool homa_skb_page_alloc(struct homa *homa, struct homa_core *core) { struct homa_page_pool *pool; - __u64 start; + __u64 start; + if (core->skb_page) { if (page_ref_count(core->skb_page) == 1) { /* The existing page is no longer in use, so we can @@ -219,6 +221,7 @@ bool homa_skb_page_alloc(struct homa *homa, struct homa_core * core) pool = &core->numa->page_pool; if (pool->avail) { struct homa_page_pool *pool = &core->numa->page_pool; + spin_lock_bh(&homa->page_pool_mutex); if (pool->avail) { pool->avail--; @@ -251,7 +254,7 @@ bool homa_skb_page_alloc(struct homa *homa, struct homa_core * core) core->page_size = core->page_inuse = 0; return false; - success: +success: return true; } @@ -300,8 +303,7 @@ int homa_skb_append_from_iter(struct homa *homa, struct sk_buff *skb, int chunk_length; char *dst; - while (length > 0) - { + while (length > 0) { chunk_length = length; dst = (char *) homa_skb_extend_frags(homa, skb, &chunk_length); if (!dst) @@ -321,7 +323,7 @@ int homa_skb_append_from_iter(struct homa *homa, struct sk_buff *skb, * @dst_skb: Data gets added to the end of this skb. * @src_skb: Data is copied out of this skb. * @offset: Offset within @src_skb of first byte to copy, relative - * to the transport header. + * to the transport header. * @length: Total number of bytes to copy; fewer bytes than this may * be copied if @src_skb isn't long enough to hold all of the * desired bytes. @@ -337,8 +339,7 @@ int homa_skb_append_from_skb(struct homa *homa, struct sk_buff *dst_skb, /* Copy bytes from the linear part of the source, if any. */ head_len = skb_tail_pointer(src_skb) - skb_transport_header(src_skb); - if (offset < head_len) - { + if (offset < head_len) { chunk_size = length; if (chunk_size > (head_len - offset)) chunk_size = head_len - offset; @@ -351,13 +352,12 @@ int homa_skb_append_from_skb(struct homa *homa, struct sk_buff *dst_skb, length -= chunk_size; } - /* Virtually copy bytes from source frags, if needed. */ + /* Virtually copy bytes from source frags, if needed. 
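 *
 * ("Virtual" copy: for data that lives in source frags no bytes are
 * duplicated; the destination skb ends up referencing the same pages.
 * That reading is inferred from the name and the frag walk below,
 * since part of the loop body falls outside this hunk.)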
*/ src_frag_offset = head_len; for (src_frags_left = src_shinfo->nr_frags, src_frag = &src_shinfo->frags[0]; - (src_frags_left > 0) && (length > 0); - src_frags_left--, src_frag_offset += skb_frag_size(src_frag), - src_frag++) - { + (src_frags_left > 0) && (length > 0); + src_frags_left--, src_frag_offset += skb_frag_size(src_frag), + src_frag++) { if (offset >= (src_frag_offset + skb_frag_size(src_frag))) continue; chunk_size = skb_frag_size(src_frag) - (offset - src_frag_offset); @@ -415,7 +415,8 @@ void homa_skb_free_many_tx(struct homa *homa, struct sk_buff **skbs, int count) if (refcount_read(&skb->users) != 1) { /* This sk_buff is still in use somewhere, so can't - * reclaim its pages. */ + * reclaim its pages. + */ kfree_skb(skb); continue; } @@ -423,6 +424,7 @@ void homa_skb_free_many_tx(struct homa *homa, struct sk_buff **skbs, int count) /* Reclaim cacheable pages. */ for (j = 0; j < shinfo->nr_frags; j++) { struct page *page = skb_frag_page(&shinfo->frags[j]); + if ((compound_order(page) == HOMA_SKB_PAGE_ORDER) && (page_ref_count(page) == 1)) { pages_to_cache[num_pages] = page; @@ -459,6 +461,7 @@ void homa_skb_cache_pages(struct homa *homa, struct page **pages, int count) #define LIMIT HOMA_PAGE_POOL_SIZE #endif int i; + spin_lock_bh(&homa->page_pool_mutex); for (i = 0; i < count; i++) { struct page *page = pages[i]; @@ -536,14 +539,14 @@ void homa_skb_release_pages(struct homa *homa) if (now < homa->skb_page_free_time) return; - /* Free pages every 0.5 second. */ + /* Free pages every 0.5 second. */ interval = cpu_khz*500; homa->skb_page_free_time = now + interval; release_max = homa->skb_page_frees_per_sec/2; if (homa->pages_to_free_slots < release_max) { if (homa->skb_pages_to_free != NULL) kfree(homa->skb_pages_to_free); - homa->skb_pages_to_free = kmalloc(release_max * + homa->skb_pages_to_free = kmalloc_array(release_max, sizeof(struct page *), GFP_KERNEL); homa->pages_to_free_slots = release_max; } @@ -553,12 +556,13 @@ void homa_skb_release_pages(struct homa *homa) spin_lock_bh(&homa->page_pool_mutex); for (i = 0; i < homa_num_numas; i++) { struct homa_page_pool *pool = &homa_numas[i]->page_pool; + if (pool->low_mark > max_low_mark) { max_low_mark = pool->low_mark; max_pool = pool; } - tt_record3("NUMA node %d has %d pages in skb page pool, " - "low mark %d", i, pool->avail, pool->low_mark); + tt_record3("NUMA node %d has %d pages in skb page pool, low mark %d", + i, pool->avail, pool->low_mark); pool->low_mark = pool->avail; } @@ -566,7 +570,7 @@ void homa_skb_release_pages(struct homa *homa) * releasing the lock, since freeing is expensive). */ min_pages = ((homa->skb_page_pool_min_kb * 1000) - + (HOMA_SKB_PAGE_SIZE - 1))/ HOMA_SKB_PAGE_SIZE; + + (HOMA_SKB_PAGE_SIZE - 1)) / HOMA_SKB_PAGE_SIZE; release = max_low_mark - min_pages; if (release > release_max) release = release_max; @@ -580,8 +584,9 @@ void homa_skb_release_pages(struct homa *homa) /* Free the pages that were collected. 
*/ for (i = 0; i < release; i++) { struct page *page = homa->skb_pages_to_free[i]; + tt_record2("homa_skb_release_pages releasing page 0x%08x%08x", tt_hi(page), tt_lo(page)); put_page(page); } -} \ No newline at end of file +} diff --git a/homa_socktab.c b/homa_socktab.c index 4f710803..aa01bf12 100644 --- a/homa_socktab.c +++ b/homa_socktab.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause /* This file manages homa_socktab objects; it also implements several * operations on homa_sock objects, such as construction and destruction. @@ -15,10 +13,10 @@ void homa_socktab_init(struct homa_socktab *socktab) { int i; + spin_lock_init(&socktab->write_lock); - for (i = 0; i < HOMA_SOCKTAB_BUCKETS; i++) { + for (i = 0; i < HOMA_SOCKTAB_BUCKETS; i++) INIT_HLIST_HEAD(&socktab->buckets[i]); - } } /** @@ -78,6 +76,7 @@ struct homa_sock *homa_socktab_next(struct homa_socktab_scan *scan) { struct homa_sock *hsk; struct homa_socktab_links *links; + while (1) { while (scan->next == NULL) { scan->current_bucket++; @@ -118,12 +117,10 @@ void homa_sock_init(struct homa_sock *hsk, struct homa *homa) ? HOMA_IPV4_HEADER_LENGTH : HOMA_IPV6_HEADER_LENGTH; hsk->shutdown = false; while (1) { - if (homa->next_client_port < HOMA_MIN_DEFAULT_PORT) { + if (homa->next_client_port < HOMA_MIN_DEFAULT_PORT) homa->next_client_port = HOMA_MIN_DEFAULT_PORT; - } - if (!homa_sock_find(socktab, homa->next_client_port)) { + if (!homa_sock_find(socktab, homa->next_client_port)) break; - } homa->next_client_port++; } hsk->port = homa->next_client_port; @@ -143,12 +140,14 @@ void homa_sock_init(struct homa_sock *hsk, struct homa *homa) INIT_LIST_HEAD(&hsk->response_interests); for (i = 0; i < HOMA_CLIENT_RPC_BUCKETS; i++) { struct homa_rpc_bucket *bucket = &hsk->client_rpc_buckets[i]; + spin_lock_init(&bucket->lock); INIT_HLIST_HEAD(&bucket->rpcs); bucket->id = i; } for (i = 0; i < HOMA_SERVER_RPC_BUCKETS; i++) { struct homa_rpc_bucket *bucket = &hsk->server_rpc_buckets[i]; + spin_lock_init(&bucket->lock); INIT_HLIST_HEAD(&bucket->rpcs); bucket->id = i + 1000000; @@ -251,9 +250,8 @@ int homa_sock_bind(struct homa_socktab *socktab, struct homa_sock *hsk, if (port == 0) return result; - if (port >= HOMA_MIN_DEFAULT_PORT) { + if (port >= HOMA_MIN_DEFAULT_PORT) return -EINVAL; - } homa_sock_lock(hsk, "homa_sock_bind"); spin_lock_bh(&socktab->write_lock); if (hsk->shutdown) { @@ -273,7 +271,7 @@ int homa_sock_bind(struct homa_socktab *socktab, struct homa_sock *hsk, hsk->inet.inet_sport = htons(hsk->port); hlist_add_head_rcu(&hsk->socktab_links.hash_links, &socktab->buckets[homa_port_hash(port)]); - done: +done: spin_unlock_bh(&socktab->write_lock); homa_sock_unlock(hsk); return result; @@ -293,9 +291,11 @@ struct homa_sock *homa_sock_find(struct homa_socktab *socktab, __u16 port) { struct homa_socktab_links *link; struct homa_sock *result = NULL; + hlist_for_each_entry_rcu(link, &socktab->buckets[homa_port_hash(port)], hash_links) { struct homa_sock *hsk = link->sock; + if (hsk->port == port) { result = hsk; break; @@ -314,6 +314,7 @@ struct homa_sock *homa_sock_find(struct homa_socktab *socktab, __u16 port) void homa_sock_lock_slow(struct homa_sock *hsk) { __u64 start = get_cycles(); + tt_record("beginning wait for socket lock"); spin_lock_bh(&hsk->lock); tt_record("ending wait for socket lock"); diff --git a/homa_timer.c b/homa_timer.c index 91540160..f6f4b555 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -1,9 +1,8 @@ -/* Copyright 
(c) 2019-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause /* This file handles timing-related functions for Homa, such as retries - * and timeouts. */ + * and timeouts. + */ #include "homa_impl.h" @@ -31,9 +30,9 @@ void homa_check_rpc(struct homa_rpc *rpc) if ((rpc->done_timer_ticks + homa->request_ack_ticks - 1 - homa->timer_ticks) & 1<<31) { struct need_ack_header h; + homa_xmit_control(NEED_ACK, &h, sizeof(h), rpc); - tt_record4("Sent NEED_ACK for RPC id %d to " - "peer 0x%x, port %d, ticks %d", + tt_record4("Sent NEED_ACK for RPC id %d to peer 0x%x, port %d, ticks %d", rpc->id, tt_addr(rpc->peer->addr), rpc->dport, homa->timer_ticks @@ -78,16 +77,14 @@ void homa_check_rpc(struct homa_rpc *rpc) return; if (rpc->silent_ticks >= homa->timeout_ticks) { INC_METRIC(rpc_timeouts, 1); - tt_record3("RPC id %d, peer 0x%x, aborted because of timeout, " - "state %d", + tt_record3("RPC id %d, peer 0x%x, aborted because of timeout, state %d", rpc->id, tt_addr(rpc->peer->addr), rpc->state); homa_rpc_log_active_tt(homa, 0); tt_record1("Freezing because of RPC abort (id %d)", rpc->id); homa_freeze_peers(homa); tt_freeze(); if (homa->verbose) - printk(KERN_NOTICE "RPC id %llu, peer %s, aborted " - "because of timeout, state %d\n", + pr_notice("RPC id %llu, peer %s, aborted because of timeout, state %d\n", rpc->id, homa_print_ipv6_addr(&rpc->peer->addr), rpc->state); @@ -121,8 +118,7 @@ void homa_check_rpc(struct homa_rpc *rpc) if (homa_is_client(rpc->id)) { us = "client"; them = "server"; - tt_record4("Sent RESEND for client RPC id %llu, server 0x%x:%d, " - "offset %d", + tt_record4("Sent RESEND for client RPC id %llu, server 0x%x:%d, offset %d", rpc->id, tt_addr(rpc->peer->addr), rpc->dport, rpc->msgin.recv_end); tt_record4("length %d, granted %d, rem %d, rec_incoming %d", @@ -132,8 +128,7 @@ void homa_check_rpc(struct homa_rpc *rpc) } else { us = "server"; them = "client"; - tt_record4("Sent RESEND for server RPC id %llu, client 0x%x:%d " - "offset %d", + tt_record4("Sent RESEND for server RPC id %llu, client 0x%x:%d offset %d", rpc->id, tt_addr(rpc->peer->addr), rpc->dport, rpc->msgin.recv_end); tt_record4("length %d, granted %d, rem %d, rec_incoming %d", @@ -142,8 +137,7 @@ void homa_check_rpc(struct homa_rpc *rpc) rpc->msgin.rec_incoming); } if (homa->verbose) - printk(KERN_NOTICE "Homa %s RESEND to %s %s:%d for id %llu, " - "offset %d, length %d\n", us, them, + pr_notice("Homa %s RESEND to %s %s:%d for id %llu, offset %d, length %d\n", us, them, homa_print_ipv6_addr(&rpc->peer->addr), rpc->dport, rpc->id, rpc->msgin.recv_end, rpc->msgin.granted - rpc->msgin.recv_end); @@ -165,8 +159,8 @@ void homa_timer(struct homa *homa) int total_incoming_rpcs = 0; int sum_incoming = 0; int sum_incoming_rec = 0; - static __u64 prev_grant_count = 0; - static int zero_count = 0; + static __u64 prev_grant_count; + static int zero_count; int core; __u64 total_grants; @@ -176,11 +170,11 @@ void homa_timer(struct homa *homa) total_grants = 0; for (core = 0; core < nr_cpu_ids; core++) { struct homa_metrics *m = &homa_cores[core]->metrics; + total_grants += m->packets_sent[GRANT-DATA]; } - tt_record4("homa_timer found total_incoming %d, num_grantable_rpcs %d, " - "num_active_rpcs %d, new grants %d", + tt_record4("homa_timer found total_incoming %d, num_grantable_rpcs %d, num_active_rpcs %d, new grants %d", atomic_read(&homa->total_incoming), homa->num_grantable_rpcs, homa->num_active_rpcs, @@ -189,7 +183,7 @@ void homa_timer(struct homa *homa) && 
(homa->num_grantable_rpcs > 20)) { zero_count++; if ((zero_count > 3) && !tt_frozen && 0) { - printk(KERN_ERR "homa timer found no grants going out\n"); + pr_err("%s found no grants going out\n", __func__); homa_rpc_log_active_tt(homa, 0); tt_record("freezing because no grants are going out"); homa_freeze_peers(homa); @@ -208,8 +202,10 @@ void homa_timer(struct homa *homa) while (hsk->dead_skbs >= homa->dead_buffs_limit) { /* If we get here, it means that homa_wait_for_message * isn't keeping up with RPC reaping, so we'll help - * out. See reap.txt for more info. */ + * out. See reap.txt for more info. + */ uint64_t start = get_cycles(); + tt_record("homa_timer calling homa_rpc_reap"); if (homa_rpc_reap(hsk, hsk->homa->reap_limit) == 0) break; @@ -253,8 +249,7 @@ void homa_timer(struct homa *homa) homa_unprotect_rpcs(hsk); } rcu_read_unlock(); - tt_record4("homa_timer found %d incoming RPCs, incoming sum %d, " - "rec_sum %d, homa->total_incoming %d", + tt_record4("homa_timer found %d incoming RPCs, incoming sum %d, rec_sum %d, homa->total_incoming %d", total_incoming_rpcs, sum_incoming, sum_incoming_rec, atomic_read(&homa->total_incoming)); diff --git a/homa_utils.c b/homa_utils.c index cfa1e02c..df9cd0ba 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause /* This file contains miscellaneous utility functions for the Homa protocol. */ @@ -15,7 +13,7 @@ struct homa_core *homa_cores[NR_CPUS]; struct homa_numa *homa_numas[MAX_NUMNODES]; /* Total number of NUMA nodes actually defined in homa_numas. */ -int homa_num_numas = 0; +int homa_num_numas; /* Points to block of memory holding all homa_cores; used to free it. */ char *core_memory; @@ -35,6 +33,7 @@ int homa_init(struct homa *homa) size_t aligned_size; char *first; int i, err, num_numas; + _Static_assert(HOMA_MAX_PRIORITIES >= 8, "homa_init assumes at least 8 priority levels"); @@ -44,6 +43,7 @@ int homa_init(struct homa *homa) for (i = 0; i < nr_cpu_ids; i++) { struct homa_numa *numa; int n = cpu_to_node(i); + if (homa_numas[n]) continue; numa = kmalloc(sizeof(struct homa_numa), GFP_KERNEL); @@ -53,8 +53,7 @@ int homa_init(struct homa *homa) homa_num_numas = n+1; num_numas++; } - printk(KERN_NOTICE "Homa initialized %d homa_numas, highest " - "number %d\n", num_numas, homa_num_numas-1); + pr_notice("Homa initialized %d homa_numas, highest number %d\n", num_numas, homa_num_numas-1); /* Initialize core-specific info (if no-one else has already done it), * making sure that each core has private cache lines. 
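 * (The rounding used below, (size + 0x3f) & ~0x3f, pads each
 * homa_core up to a multiple of 64 bytes, so for example a 200-byte
 * struct occupies 256; no two cores then share a 64-byte cache line.
 * The extra 0x3f added to the vmalloc size leaves room to align the
 * first entry the same way.)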
@@ -63,8 +62,7 @@ int homa_init(struct homa *homa) aligned_size = (sizeof(struct homa_core) + 0x3f) & ~0x3f; core_memory = vmalloc(0x3f + (nr_cpu_ids*aligned_size)); if (!core_memory) { - printk(KERN_ERR "Homa couldn't allocate memory " - "for core-specific data\n"); + pr_err("Homa couldn't allocate memory for core-specific data\n"); return -ENOMEM; } first = (char *) (((__u64) core_memory + 0x3f) & ~0x3f); @@ -127,8 +125,7 @@ int homa_init(struct homa *homa) homa_socktab_init(&homa->port_map); err = homa_peertab_init(&homa->peers); if (err) { - printk(KERN_ERR "Couldn't initialize peer table (errno %d)\n", - -err); + pr_err("Couldn't initialize peer table (errno %d)\n", -err); return err; } spin_lock_init(&homa->page_pool_mutex); @@ -177,8 +174,7 @@ int homa_init(struct homa *homa) if (IS_ERR(homa->pacer_kthread)) { err = PTR_ERR(homa->pacer_kthread); homa->pacer_kthread = NULL; - printk(KERN_ERR "couldn't create homa pacer thread: error %d\n", - err); + pr_err("couldn't create homa pacer thread: error %d\n", err); return err; } homa->pacer_exit = false; @@ -214,6 +210,7 @@ int homa_init(struct homa *homa) void homa_destroy(struct homa *homa) { int i; + if (homa->pacer_kthread) { homa_pacer_stop(homa); wait_for_completion(&homa_pacer_kthread_done); @@ -226,6 +223,7 @@ void homa_destroy(struct homa *homa) for (i = 0; i < MAX_NUMNODES; i++) { struct homa_numa *numa = homa_numas[i]; + if (numa != NULL) { kfree(numa); homa_numas[i] = NULL; @@ -234,12 +232,10 @@ void homa_destroy(struct homa *homa) if (core_memory) { vfree(core_memory); core_memory = NULL; - for (i = 0; i < nr_cpu_ids; i++) { + for (i = 0; i < nr_cpu_ids; i++) homa_cores[i] = NULL; - } } - if (homa->metrics) - kfree(homa->metrics); + kfree(homa->metrics); } /** @@ -254,14 +250,14 @@ void homa_destroy(struct homa *homa) * caller must eventually unlock it. */ struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk, - const sockaddr_in_union *dest) + const union sockaddr_in_union *dest) { int err; struct homa_rpc *crpc; struct homa_rpc_bucket *bucket; struct in6_addr dest_addr_as_ipv6 = canonical_ipv6_addr(dest); - crpc = (struct homa_rpc *) kmalloc(sizeof(*crpc), GFP_KERNEL); + crpc = kmalloc(sizeof(*crpc), GFP_KERNEL); if (unlikely(!crpc)) return ERR_PTR(-ENOMEM); @@ -275,7 +271,7 @@ struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk, atomic_set(&crpc->grants_in_progress, 0); crpc->peer = homa_peer_find(&hsk->homa->peers, &dest_addr_as_ipv6, &hsk->inet); - if (unlikely(IS_ERR(crpc->peer))) { + if (IS_ERR(crpc->peer)) { tt_record("error in homa_peer_find"); err = PTR_ERR(crpc->peer); goto error; @@ -365,7 +361,7 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, } /* Initialize fields that don't require the socket lock. 
*/ - srpc = (struct homa_rpc *) kmalloc(sizeof(*srpc), GFP_KERNEL); + srpc = kmalloc(sizeof(*srpc), GFP_KERNEL); if (!srpc) { err = -ENOMEM; goto error; @@ -376,7 +372,7 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, atomic_set(&srpc->flags, 0); atomic_set(&srpc->grants_in_progress, 0); srpc->peer = homa_peer_find(&hsk->homa->peers, source, &hsk->inet); - if (unlikely(IS_ERR(srpc->peer))) { + if (IS_ERR(srpc->peer)) { err = PTR_ERR(srpc->peer); goto error; } @@ -426,8 +422,7 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, error: homa_bucket_unlock(bucket, id); - if (srpc) - kfree(srpc); + kfree(srpc); return ERR_PTR(err); } @@ -443,6 +438,7 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, __u64 id) { __u64 start = get_cycles(); + tt_record2("beginning wait for rpc lock, id %d (bucket %d)", id, bucket->id); spin_lock_bh(&bucket->lock); @@ -492,7 +488,7 @@ void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, homa_rpc_unlock(rpc); } - done: +done: if (hsk->port != server_port) rcu_read_unlock(); } @@ -553,9 +549,8 @@ void homa_rpc_free(struct homa_rpc *rpc) while (1) { struct homa_gap *gap = list_first_entry_or_null( &rpc->msgin.gaps, struct homa_gap, links); - if (gap == NULL) { + if (gap == NULL) break; - } list_del(&gap->links); kfree(gap); } @@ -619,8 +614,7 @@ int homa_rpc_reap(struct homa_sock *hsk, int count) homa_sock_lock(hsk, "homa_rpc_reap"); if (atomic_read(&hsk->protect_count)) { INC_METRIC(disabled_reaps, 1); - tt_record2("homa_rpc_reap returning: protect_count " - "%d, dead_skbs %d", + tt_record2("homa_rpc_reap returning: protect_count %d, dead_skbs %d", atomic_read(&hsk->protect_count), hsk->dead_skbs); homa_sock_unlock(hsk); @@ -663,6 +657,7 @@ int homa_rpc_reap(struct homa_sock *hsk, int count) if (rpc->msgin.length >= 0) { while (1) { struct sk_buff *skb; + skb = skb_dequeue(&rpc->msgin.packets); if (!skb) break; @@ -684,7 +679,7 @@ int homa_rpc_reap(struct homa_sock *hsk, int count) /* Free all of the collected resources; release the socket * lock while doing this. 
*/ - release: +release: hsk->dead_skbs -= num_skbs + rx_frees; result = !list_empty(&hsk->dead_rpcs) && ((num_skbs + num_rpcs) != 0); @@ -713,9 +708,8 @@ int homa_rpc_reap(struct homa_sock *hsk, int count) struct homa_gap *gap = list_first_entry_or_null( &rpc->msgin.gaps, struct homa_gap, links); - if (gap == NULL) { + if (gap == NULL) break; - } list_del(&gap->links); kfree(gap); } @@ -749,7 +743,8 @@ struct homa_rpc *homa_find_client_rpc(struct homa_sock *hsk, __u64 id) { struct homa_rpc *crpc; struct homa_rpc_bucket *bucket = homa_client_rpc_bucket(hsk, id); - homa_bucket_lock(bucket, id, "homa_find_client_rpc"); + + homa_bucket_lock(bucket, id, __func__); hlist_for_each_entry_rcu(crpc, &bucket->rpcs, hash_links) { if (crpc->id == id) return crpc; @@ -775,7 +770,8 @@ struct homa_rpc *homa_find_server_rpc(struct homa_sock *hsk, { struct homa_rpc *srpc; struct homa_rpc_bucket *bucket = homa_server_rpc_bucket(hsk, id); - homa_bucket_lock(bucket, id, "homa_find_server_rpc"); + + homa_bucket_lock(bucket, id, __func__); hlist_for_each_entry_rcu(srpc, &bucket->rpcs, hash_links) { if ((srpc->id == id) && (srpc->dport == sport) && ipv6_addr_equal(&srpc->peer->addr, saddr)) @@ -796,16 +792,13 @@ void homa_rpc_log(struct homa_rpc *rpc) char *peer = homa_print_ipv6_addr(&rpc->peer->addr); if (rpc->state == RPC_INCOMING) - printk(KERN_NOTICE "%s RPC INCOMING, id %llu, peer %s:%d, " - "%d/%d bytes received, incoming %d\n", + pr_notice("%s RPC INCOMING, id %llu, peer %s:%d, %d/%d bytes received, incoming %d\n", type, rpc->id, peer, rpc->dport, rpc->msgin.length - rpc->msgin.bytes_remaining, rpc->msgin.length, rpc->msgin.granted); else if (rpc->state == RPC_OUTGOING) { - printk(KERN_NOTICE "%s RPC OUTGOING, id %llu, peer %s:%d, " - "out length %d, left %d, granted %d, " - "in left %d, resend_ticks %u, silent_ticks %d\n", + pr_notice("%s RPC OUTGOING, id %llu, peer %s:%d, out length %d, left %d, granted %d, in left %d, resend_ticks %u, silent_ticks %d\n", type, rpc->id, peer, rpc->dport, rpc->msgout.length, rpc->msgout.length - rpc->msgout.next_xmit_offset, @@ -814,8 +807,7 @@ void homa_rpc_log(struct homa_rpc *rpc) rpc->resend_timer_ticks, rpc->silent_ticks); } else { - printk(KERN_NOTICE "%s RPC %s, id %llu, peer %s:%d, " - "incoming length %d, outgoing length %d\n", + pr_notice("%s RPC %s, id %llu, peer %s:%d, incoming length %d, outgoing length %d\n", type, homa_symbol_for_state(rpc), rpc->id, peer, rpc->dport, rpc->msgin.length, rpc->msgout.length); @@ -836,7 +828,7 @@ void homa_rpc_log_active(struct homa *homa, uint64_t id) struct homa_rpc *rpc; int count = 0; - printk("Logging active Homa RPCs:\n"); + pr_notice("Logging active Homa RPCs:\n"); rcu_read_lock(); for (hsk = homa_socktab_start_scan(&homa->port_map, &scan); hsk != NULL; hsk = homa_socktab_next(&scan)) { @@ -854,7 +846,7 @@ void homa_rpc_log_active(struct homa *homa, uint64_t id) homa_unprotect_rpcs(hsk); } rcu_read_unlock(); - printk("Finished logging active Homa RPCs: %d active RPCs\n", count); + pr_notice("Finished logging active Homa RPCs: %d active RPCs\n", count); } /** @@ -866,13 +858,11 @@ void homa_rpc_log_tt(struct homa_rpc *rpc) if (rpc->state == RPC_INCOMING) { int received = rpc->msgin.length - rpc->msgin.bytes_remaining; - tt_record4("Incoming RPC id %d, peer 0x%x, %d/%d bytes " - "received", + tt_record4("Incoming RPC id %d, peer 0x%x, %d/%d bytes received", rpc->id, tt_addr(rpc->peer->addr), received, rpc->msgin.length); if (1) - tt_record4("RPC id %d has incoming %d, " - "granted %d, prio %d", rpc->id, + tt_record4("RPC 
id %d has incoming %d, granted %d, prio %d", rpc->id,
 					rpc->msgin.granted - received,
 					rpc->msgin.granted, rpc->msgin.priority);
 		tt_record4("RPC id %d: length %d, remaining %d, rank %d",
@@ -886,8 +876,7 @@ void homa_rpc_log_tt(struct homa_rpc *rpc)
 		tt_record2("RPC id %d has %d bpages allocated",
 				rpc->id, rpc->msgin.num_bpages);
 	} else if (rpc->state == RPC_OUTGOING) {
-		tt_record4("Outgoing RPC id %d, peer 0x%x, %d/%d bytes "
-				"sent",
+		tt_record4("Outgoing RPC id %d, peer 0x%x, %d/%d bytes sent",
 				rpc->id, tt_addr(rpc->peer->addr),
 				rpc->msgout.next_xmit_offset,
 				rpc->msgout.length);
@@ -927,18 +916,18 @@ void homa_rpc_log_active_tt(struct homa *homa, int freeze_count)
 			continue;
 		list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) {
 			struct freeze_header freeze;
+
 			count++;
 			homa_rpc_log_tt(rpc);
-			if (freeze_count == 0) {
+			if (freeze_count == 0)
 				continue;
-			}
 			if (rpc->state != RPC_INCOMING)
 				continue;
 			if (rpc->msgin.granted <= (rpc->msgin.length
 					- rpc->msgin.bytes_remaining))
 				continue;
 			freeze_count--;
-			printk(KERN_NOTICE "Emitting FREEZE in homa_rpc_log_active_tt\n");
+			pr_notice("Emitting FREEZE in %s\n", __func__);
 			homa_xmit_control(FREEZE, &freeze, sizeof(freeze), rpc);
 		}
 		homa_unprotect_rpcs(hsk);
@@ -979,6 +968,7 @@ int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors)
 			continue;
 		list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) {
 			int incoming;
+
 			if (rpc->state != RPC_INCOMING)
 				continue;
 			incoming = rpc->msgin.granted -
@@ -990,23 +980,19 @@ int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors)
 				continue;
 			total_incoming += rpc->msgin.rec_incoming;
 			if (verbose)
-				tt_record3("homa_validate_incoming: RPC id %d, "
-						"incoming %d, "
-						"rec_incoming %d",
+				tt_record3("homa_validate_incoming: RPC id %d, incoming %d, rec_incoming %d",
 						rpc->id, incoming,
 						rpc->msgin.rec_incoming);
 			if (rpc->msgin.granted >= rpc->msgin.length)
 				continue;
 			if (list_empty(&rpc->grantable_links)) {
-				tt_record1("homa_validate_incoming: RPC id %d "
-						"not linked in grantable list",
+				tt_record1("homa_validate_incoming: RPC id %d not linked in grantable list",
 						rpc->id);
 				*link_errors = 1;
 			}
 			if (list_empty(&rpc->grantable_links)) {
-				tt_record1("homa_validate_incoming: RPC id %d "
-						"peer not linked in grantable list",
-						rpc->id);\
+				tt_record1("homa_validate_incoming: RPC id %d peer not linked in grantable list",
+						rpc->id);
 				*link_errors = 1;
 			}
 		}
@@ -1038,9 +1024,10 @@ char *homa_print_ipv4_addr(__be32 addr)
 #define NUM_BUFS_IPV4 4
 #define BUF_SIZE_IPV4 30
 	static char buffers[NUM_BUFS_IPV4][BUF_SIZE_IPV4];
-	static int next_buf = 0;
+	static int next_buf;
 	__u32 a2 = ntohl(addr);
 	char *buffer = buffers[next_buf];
+
 	next_buf++;
 	if (next_buf >= NUM_BUFS_IPV4)
 		next_buf = 0;
@@ -1064,23 +1051,27 @@ char *homa_print_ipv6_addr(const struct in6_addr *addr)
 #define NUM_BUFS (1 << 2)
 #define BUF_SIZE 64
 	static char buffers[NUM_BUFS][BUF_SIZE];
-	static int next_buf = 0;
+	static int next_buf;
 	char *buffer = buffers[next_buf];
+
 	next_buf++;
 	if (next_buf >= NUM_BUFS)
 		next_buf = 0;
 #ifdef __UNIT_TEST__
 	struct in6_addr zero = {};
+
 	if (ipv6_addr_equal(addr, &zero)) {
 		snprintf(buffer, BUF_SIZE, "0.0.0.0");
 	} else if ((addr->s6_addr32[0] == 0) &&
 			(addr->s6_addr32[1] == 0) &&
 			(addr->s6_addr32[2] == htonl(0x0000ffff))) {
 		__u32 a2 = ntohl(addr->s6_addr32[3]);
+
 		snprintf(buffer, BUF_SIZE, "%u.%u.%u.%u", (a2 >> 24) & 0xff,
 				(a2 >> 16) & 0xff, (a2 >> 8) & 0xff, a2 & 0xff);
 	} else {
-		const char *inet_ntop(int, const void *, char *, size_t);
+		const char *inet_ntop(int af, const void 
*src, char *dst, + size_t size); inet_ntop(AF_INET6, addr, buffer + 1, BUF_SIZE); buffer[0] = '['; strcat(buffer, "]"); @@ -1127,6 +1118,7 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) struct data_header *h = (struct data_header *) header; struct homa_skb_info *homa_info = homa_get_skb_info(skb); int data_left, i, seg_length, pos, offset; + if (skb_shinfo(skb)->gso_segs == 0) { seg_length = homa_data_len(skb); data_left = 0; @@ -1140,8 +1132,7 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) if (offset == -1) offset = ntohl(h->common.sequence); used = homa_snprintf(buffer, buf_len, used, - ", message_length %d, offset %d, " - "data_length %d, incoming %d", + ", message_length %d, offset %d, data_length %d, incoming %d", ntohl(h->message_length), offset, seg_length, ntohl(h->incoming)); if (ntohs(h->cutoff_version != 0)) @@ -1161,6 +1152,7 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) for (i = skb_shinfo(skb)->gso_segs - 1; i > 0; i--) { if (homa_info->seg_length < skb_shinfo(skb)->gso_size) { struct seg_header seg; + homa_skb_get(skb, &seg, pos, sizeof(seg)); offset = ntohl(seg.offset); } else { @@ -1178,6 +1170,7 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) case GRANT: { struct grant_header *h = (struct grant_header *) header; char *resend = (h->resend_all) ? ", resend_all" : ""; + used = homa_snprintf(buffer, buf_len, used, ", offset %d, grant_prio %u%s", ntohl(h->offset), h->priority, resend); @@ -1185,6 +1178,7 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) } case RESEND: { struct resend_header *h = (struct resend_header *) header; + used = homa_snprintf(buffer, buf_len, used, ", offset %d, length %d, resend_prio %u", ntohl(h->offset), ntohl(h->length), @@ -1199,6 +1193,7 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) break; case CUTOFFS: { struct cutoffs_header *h = (struct cutoffs_header *) header; + used = homa_snprintf(buffer, buf_len, used, ", cutoffs %d %d %d %d %d %d %d %d, version %u", ntohl(h->unsched_cutoffs[0]), @@ -1221,6 +1216,7 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) case ACK: { struct ack_header *h = (struct ack_header *) header; int i, count; + count = ntohs(h->num_acks); used = homa_snprintf(buffer, buf_len, used, ", acks"); for (i = 0; i < count; i++) { @@ -1278,6 +1274,7 @@ char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len) for (i = skb_shinfo(skb)->gso_segs - 1; i > 0; i--) { if (homa_info->seg_length < skb_shinfo(skb)->gso_size) { struct seg_header seg; + homa_skb_get(skb, &seg, pos, sizeof(seg)); offset = ntohl(seg.offset); } else { @@ -1295,12 +1292,14 @@ char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len) case GRANT: { struct grant_header *h = (struct grant_header *) header; char *resend = h->resend_all ? 
" resend_all" : ""; + snprintf(buffer, buf_len, "GRANT %d@%d%s", ntohl(h->offset), h->priority, resend); break; } case RESEND: { struct resend_header *h = (struct resend_header *) header; + snprintf(buffer, buf_len, "RESEND %d-%d@%d", ntohl(h->offset), ntohl(h->offset) + ntohl(h->length) - 1, h->priority); @@ -1321,7 +1320,6 @@ char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len) case NEED_ACK: snprintf(buffer, buf_len, "NEED_ACK"); break; - break; case ACK: snprintf(buffer, buf_len, "ACK"); break; @@ -1358,7 +1356,7 @@ void homa_freeze_peers(struct homa *homa) return; } freeze.common.type = FREEZE; - freeze.common.sport = htons(hsk->port);; + freeze.common.sport = htons(hsk->port); freeze.common.dport = 0; freeze.common.flags = HOMA_TCP_FLAGS; freeze.common.urgent = htons(HOMA_TCP_URGENT); @@ -1367,8 +1365,7 @@ void homa_freeze_peers(struct homa *homa) tt_record1("Sending freeze to 0x%x", tt_addr(peers[i]->addr)); err = __homa_xmit_control(&freeze, sizeof(freeze), peers[i], hsk); if (err != 0) - tt_record2("homa_freeze_peers got error %d in xmit " - "to 0x%x\n", err, + tt_record2("homa_freeze_peers got error %d in xmit to 0x%x\n", err, tt_addr(peers[i]->addr)); } kfree(peers); @@ -1392,11 +1389,11 @@ void homa_freeze_peers(struct homa *homa) * Return: The number of characters now occupied in @buffer, not * including the terminating null character. */ -int homa_snprintf(char *buffer, int size, int used, const char* format, ...) +int homa_snprintf(char *buffer, int size, int used, const char *format, ...) { int new_chars; - va_list ap; + va_start(ap, format); if (used >= (size-1)) @@ -1420,6 +1417,7 @@ int homa_snprintf(char *buffer, int size, int used, const char* format, ...) char *homa_symbol_for_state(struct homa_rpc *rpc) { static char buffer[20]; + switch (rpc->state) { case RPC_OUTGOING: return "OUTGOING"; @@ -1446,6 +1444,7 @@ char *homa_symbol_for_state(struct homa_rpc *rpc) char *homa_symbol_for_type(uint8_t type) { static char buffer[20]; + switch (type) { case DATA: return "DATA"; @@ -1471,7 +1470,8 @@ char *homa_symbol_for_type(uint8_t type) * but (a) it's unlikely (this code only executes if the opcode is * bogus), (b) this is mostly for testing and debugging, and (c) the * code below ensures that the string cannot run past the end of the - * buffer, so the code is safe. */ + * buffer, so the code is safe. + */ snprintf(buffer, sizeof(buffer)-1, "unknown(%u)", type); buffer[sizeof(buffer)-1] = 0; return buffer; @@ -1485,7 +1485,7 @@ char *homa_symbol_for_type(uint8_t type) * new metric. Arguments after this provide the usual * values expected for printf-like functions. */ -void homa_append_metric(struct homa *homa, const char* format, ...) +void homa_append_metric(struct homa *homa, const char *format, ...) { char *new_buffer; size_t new_chars; @@ -1499,8 +1499,7 @@ void homa_append_metric(struct homa *homa, const char* format, ...) #endif homa->metrics = kmalloc(homa->metrics_capacity, GFP_KERNEL); if (!homa->metrics) { - printk(KERN_WARNING "homa_append_metric couldn't " - "allocate memory\n"); + pr_warn("%s couldn't allocate memory\n", __func__); return; } homa->metrics_length = 0; @@ -1523,8 +1522,7 @@ void homa_append_metric(struct homa *homa, const char* format, ...) 
homa->metrics_capacity *= 2;
 		new_buffer = kmalloc(homa->metrics_capacity, GFP_KERNEL);
 		if (!new_buffer) {
-			printk(KERN_WARNING "homa_append_metric couldn't "
-					"allocate memory\n");
+			pr_warn("%s couldn't allocate memory\n", __func__);
 			return;
 		}
 		memcpy(new_buffer, homa->metrics, homa->metrics_length);
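The hunk above is the growth path of homa_append_metric(): when a formatted line no longer fits, the capacity is doubled, a fresh buffer is allocated, and the existing bytes are copied across before the append is retried. Below is a minimal userspace sketch of that append-with-doubling pattern; the names and the initial capacity are invented for illustration, and realloc stands in for the kernel's kmalloc/memcpy/kfree sequence (allocation-failure handling is elided).

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

struct metrics {
	char *data;       /* accumulated text, NUL-terminated */
	size_t length;    /* bytes used, excluding the NUL */
	size_t capacity;  /* bytes allocated */
};

/* Append one printf-style line, doubling the buffer until it fits. */
static void metrics_append(struct metrics *m, const char *fmt, ...)
{
	for (;;) {
		va_list ap;
		int n;

		va_start(ap, fmt);
		n = vsnprintf(m->data + m->length,
			      m->capacity - m->length, fmt, ap);
		va_end(ap);
		if (n >= 0 && (size_t)n < m->capacity - m->length) {
			m->length += (size_t)n;
			return;
		}
		m->capacity *= 2;            /* didn't fit: grow and retry */
		m->data = realloc(m->data, m->capacity);
	}
}

int main(void)
{
	struct metrics m = { malloc(16), 0, 16 };

	metrics_append(&m, "rdtsc_cycles %20llu sample counter\n", 12345ULL);
	metrics_append(&m, "cpu_khz %15llu sample clock rate\n", 2400000ULL);
	fputs(m.data, stdout);
	free(m.data);
	return 0;
}
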
@@ -1548,25 +1546,21 @@ char *homa_print_metrics(struct homa *homa)
 	homa->metrics_length = 0;
 	homa_append_metric(homa,
-			"rdtsc_cycles %20llu "
-			"RDTSC cycle counter when metrics were gathered\n",
+			"rdtsc_cycles %20llu RDTSC cycle counter when metrics were gathered\n",
 			get_cycles());
 	homa_append_metric(homa,
-			"cpu_khz %15llu "
-			"Clock rate for RDTSC counter, in khz\n",
+			"cpu_khz %15llu Clock rate for RDTSC counter, in khz\n",
 			cpu_khz);
 	for (core = 0; core < nr_cpu_ids; core++) {
 		struct homa_metrics *m = &homa_cores[core]->metrics;
 		__s64 delta;
+
 		homa_append_metric(homa,
-				"core %15d "
-				"Core id for following metrics\n",
+				"core %15d Core id for following metrics\n",
 				core);
 		for (i = 0; i < HOMA_NUM_SMALL_COUNTS; i++) {
 			homa_append_metric(homa,
-					"msg_bytes_%-9d %15llu "
-					"Bytes in incoming messages containing "
-					"%d-%d bytes\n",
+					"msg_bytes_%-9d %15llu Bytes in incoming messages containing %d-%d bytes\n",
 					(i+1)*64, m->small_msg_bytes[i], lower,
 					(i+1)*64);
 			lower = (i+1)*64 + 1;
@@ -1574,145 +1568,111 @@ char *homa_print_metrics(struct homa *homa)
 		for (i = (HOMA_NUM_SMALL_COUNTS*64)/1024;
 				i < HOMA_NUM_MEDIUM_COUNTS; i++) {
 			homa_append_metric(homa,
-					"msg_bytes_%-9d %15llu "
-					"Bytes in incoming messages containing "
-					"%d-%d bytes\n",
+					"msg_bytes_%-9d %15llu Bytes in incoming messages containing %d-%d bytes\n",
 					(i+1)*1024, m->medium_msg_bytes[i],
 					lower, (i+1)*1024);
 			lower = (i+1)*1024 + 1;
 		}
 		homa_append_metric(homa,
-				"large_msg_count %15llu "
-				"# of incoming messages >= %d bytes\n",
+				"large_msg_count %15llu # of incoming messages >= %d bytes\n",
 				m->large_msg_count, lower);
 		homa_append_metric(homa,
-				"large_msg_bytes %15llu "
-				"Bytes in incoming messages >= %d bytes\n",
+				"large_msg_bytes %15llu Bytes in incoming messages >= %d bytes\n",
 				m->large_msg_bytes, lower);
 		homa_append_metric(homa,
-				"sent_msg_bytes %15llu "
-				"Total bytes in all outgoing messages\n",
+				"sent_msg_bytes %15llu Total bytes in all outgoing messages\n",
 				m->sent_msg_bytes);
 		for (i = DATA; i < BOGUS; i++) {
 			char *symbol = homa_symbol_for_type(i);
+
 			homa_append_metric(homa,
-					"packets_sent_%-7s %15llu "
-					"%s packets sent\n",
+					"packets_sent_%-7s %15llu %s packets sent\n",
 					symbol, m->packets_sent[i-DATA],
 					symbol);
 		}
 		for (i = DATA; i < BOGUS; i++) {
 			char *symbol = homa_symbol_for_type(i);
+
 			homa_append_metric(homa,
-					"packets_rcvd_%-7s %15llu "
-					"%s packets received\n",
+					"packets_rcvd_%-7s %15llu %s packets received\n",
 					symbol, m->packets_received[i-DATA],
 					symbol);
 		}
 		for (i = 0; i < HOMA_MAX_PRIORITIES; i++) {
 			homa_append_metric(homa,
-					"priority%d_bytes %15llu "
-					"Bytes sent at priority %d "
-					"(including headers)\n",
+					"priority%d_bytes %15llu Bytes sent at priority %d (including headers)\n",
 					i, m->priority_bytes[i], i);
 		}
 		for (i = 0; i < HOMA_MAX_PRIORITIES; i++) {
 			homa_append_metric(homa,
-					"priority%d_packets %15llu "
-					"Packets sent at priority %d\n",
+					"priority%d_packets %15llu Packets sent at priority %d\n",
 					i, m->priority_packets[i], i);
 		}
 		homa_append_metric(homa,
-				"skb_allocs %15llu "
-				"sk_buffs allocated\n",
+				"skb_allocs %15llu sk_buffs allocated\n",
 				m->skb_allocs);
 		homa_append_metric(homa,
-				"skb_alloc_cycles %15llu "
-				"Time spent allocating sk_buffs\n",
+				"skb_alloc_cycles %15llu Time spent allocating sk_buffs\n",
 				m->skb_alloc_cycles);
 		homa_append_metric(homa,
-				"skb_frees %15llu "
-				"Data sk_buffs freed in normal paths\n",
+				"skb_frees %15llu Data sk_buffs freed in normal paths\n",
 				m->skb_frees);
 		homa_append_metric(homa,
-				"skb_free_cycles %15llu "
-				"Time spent freeing data sk_buffs\n",
+				"skb_free_cycles %15llu Time spent freeing data sk_buffs\n",
 				m->skb_free_cycles);
 		homa_append_metric(homa,
-				"skb_page_allocs %15llu "
-				"Pages allocated for sk_buff frags\n",
+				"skb_page_allocs %15llu Pages allocated for sk_buff frags\n",
 				m->skb_page_allocs);
 		homa_append_metric(homa,
-				"skb_page_alloc_cycles %15llu "
-				"Time spent allocating pages for sk_buff frags\n",
+				"skb_page_alloc_cycles %15llu Time spent allocating pages for sk_buff frags\n",
 				m->skb_page_alloc_cycles);
 		homa_append_metric(homa,
-				"requests_received %15llu "
-				"Incoming request messages\n",
+				"requests_received %15llu Incoming request messages\n",
 				m->requests_received);
 		homa_append_metric(homa,
-				"requests_queued %15llu "
-				"Requests for which no thread was waiting\n",
+				"requests_queued %15llu Requests for which no thread was waiting\n",
 				m->requests_queued);
 		homa_append_metric(homa,
-				"responses_received %15llu "
-				"Incoming response messages\n",
+				"responses_received %15llu Incoming response messages\n",
 				m->responses_received);
 		homa_append_metric(homa,
-				"responses_queued %15llu "
-				"Responses for which no thread was waiting\n",
+				"responses_queued %15llu Responses for which no thread was waiting\n",
 				m->responses_queued);
 		homa_append_metric(homa,
-				"fast_wakeups %15llu "
-				"Messages received while polling\n",
+				"fast_wakeups %15llu Messages received while polling\n",
 				m->fast_wakeups);
 		homa_append_metric(homa,
-				"slow_wakeups %15llu "
-				"Messages received after thread went to sleep\n",
+				"slow_wakeups %15llu Messages received after thread went to sleep\n",
 				m->slow_wakeups);
 		homa_append_metric(homa,
-				"handoffs_thread_waiting %15llu "
-				"RPC handoffs to waiting threads (vs. queue)\n",
+				"handoffs_thread_waiting %15llu RPC handoffs to waiting threads (vs. queue)\n",
 				m->handoffs_thread_waiting);
 		homa_append_metric(homa,
-				"handoffs_alt_thread %15llu "
-				"RPC handoffs not to first on list (avoid busy "
-				"core)\n",
+				"handoffs_alt_thread %15llu RPC handoffs not to first on list (avoid busy core)\n",
 				m->handoffs_alt_thread);
 		homa_append_metric(homa,
-				"poll_cycles %15llu "
-				"Time spent polling for incoming messages\n",
+				"poll_cycles %15llu Time spent polling for incoming messages\n",
 				m->poll_cycles);
 		homa_append_metric(homa,
-				"softirq_calls %15llu "
-				"Calls to homa_softirq (i.e. # GRO pkts "
-				"received)\n",
+				"softirq_calls %15llu Calls to homa_softirq (i.e. 
# GRO pkts received)\n",
 				m->softirq_calls);
 		homa_append_metric(homa,
-				"softirq_cycles %15llu "
-				"Time spent in homa_softirq during SoftIRQ\n",
+				"softirq_cycles %15llu Time spent in homa_softirq during SoftIRQ\n",
 				m->softirq_cycles);
 		homa_append_metric(homa,
-				"bypass_softirq_cycles %15llu "
-				"Time spent in homa_softirq during bypass "
-				"from GRO\n",
+				"bypass_softirq_cycles %15llu Time spent in homa_softirq during bypass from GRO\n",
 				m->bypass_softirq_cycles);
 		homa_append_metric(homa,
-				"linux_softirq_cycles %15llu "
-				"Time spent in all Linux SoftIRQ\n",
+				"linux_softirq_cycles %15llu Time spent in all Linux SoftIRQ\n",
 				m->linux_softirq_cycles);
 		homa_append_metric(homa,
-				"napi_cycles %15llu "
-				"Time spent in NAPI-level packet handling\n",
+				"napi_cycles %15llu Time spent in NAPI-level packet handling\n",
 				m->napi_cycles);
 		homa_append_metric(homa,
-				"send_cycles %15llu "
-				"Time spent in homa_sendmsg for requests\n",
+				"send_cycles %15llu Time spent in homa_sendmsg for requests\n",
 				m->send_cycles);
 		homa_append_metric(homa,
-				"send_calls %15llu "
-				"Total invocations of homa_sendmsg for "
-				"requests\n",
+				"send_calls %15llu Total invocations of homa_sendmsg for requests\n",
 				m->send_calls);
 		// It is possible for us to get here at a time when a
 		// thread has been blocked for a long time and has
@@ -1724,331 +1684,233 @@ char *homa_print_metrics(struct homa *homa)
 		if (delta < 0)
 			delta = 0;
 		homa_append_metric(homa,
-				"recv_cycles %15llu "
-				"Unblocked time spent in recvmsg kernel call\n",
+				"recv_cycles %15llu Unblocked time spent in recvmsg kernel call\n",
 				delta);
 		homa_append_metric(homa,
-				"recv_calls %15llu "
-				"Total invocations of recvmsg kernel call\n",
+				"recv_calls %15llu Total invocations of recvmsg kernel call\n",
 				m->recv_calls);
 		homa_append_metric(homa,
-				"blocked_cycles %15llu "
-				"Time spent blocked in homa_recvmsg\n",
+				"blocked_cycles %15llu Time spent blocked in homa_recvmsg\n",
 				m->blocked_cycles);
 		homa_append_metric(homa,
-				"reply_cycles %15llu "
-				"Time spent in homa_sendmsg for responses\n",
+				"reply_cycles %15llu Time spent in homa_sendmsg for responses\n",
 				m->reply_cycles);
 		homa_append_metric(homa,
-				"reply_calls %15llu "
-				"Total invocations of homa_sendmsg for "
-				"responses\n",
+				"reply_calls %15llu Total invocations of homa_sendmsg for responses\n",
 				m->reply_calls);
 		homa_append_metric(homa,
-				"abort_cycles %15llu "
-				"Time spent in homa_ioc_abort kernel call\n",
+				"abort_cycles %15llu Time spent in homa_ioc_abort kernel call\n",
 				m->reply_cycles);
 		homa_append_metric(homa,
-				"abort_calls %15llu "
-				"Total invocations of abort kernel call\n",
+				"abort_calls %15llu Total invocations of abort kernel call\n",
 				m->reply_calls);
 		homa_append_metric(homa,
-				"so_set_buf_cycles %15llu "
-				"Time spent in setsockopt SO_HOMA_SET_BUF\n",
+				"so_set_buf_cycles %15llu Time spent in setsockopt SO_HOMA_SET_BUF\n",
 				m->so_set_buf_cycles);
 		homa_append_metric(homa,
-				"so_set_buf_calls %15llu "
-				"Total invocations of setsockopt SO_HOMA_SET_BUF\n",
+				"so_set_buf_calls %15llu Total invocations of setsockopt SO_HOMA_SET_BUF\n",
 				m->so_set_buf_calls);
 		homa_append_metric(homa,
-				"grantable_lock_cycles %15llu "
-				"Time spent with homa->grantable_lock locked\n",
+				"grantable_lock_cycles %15llu Time spent with homa->grantable_lock locked\n",
 				m->grantable_lock_cycles);
 		homa_append_metric(homa,
-				"timer_cycles %15llu "
-				"Time spent in homa_timer\n",
+				"timer_cycles %15llu Time spent in homa_timer\n",
 				m->timer_cycles);
 		homa_append_metric(homa,
-				"timer_reap_cycles %15llu "
-				"Time in homa_timer spent 
reaping RPCs\n", + "timer_reap_cycles %15llu Time in homa_timer spent reaping RPCs\n", m->timer_reap_cycles); homa_append_metric(homa, - "data_pkt_reap_cycles %15llu " - "Time in homa_data_pkt spent reaping RPCs\n", + "data_pkt_reap_cycles %15llu Time in homa_data_pkt spent reaping RPCs\n", m->data_pkt_reap_cycles); homa_append_metric(homa, - "pacer_cycles %15llu " - "Time spent in homa_pacer_main\n", + "pacer_cycles %15llu Time spent in homa_pacer_main\n", m->pacer_cycles); homa_append_metric(homa, - "homa_cycles %15llu " - "Total time in all Homa-related functions\n", + "homa_cycles %15llu Total time in all Homa-related functions\n", m->softirq_cycles + m->napi_cycles + m->send_cycles + m->recv_cycles + m->reply_cycles - m->blocked_cycles + m->timer_cycles + m->pacer_cycles); homa_append_metric(homa, - "pacer_lost_cycles %15llu " - "Lost transmission time because pacer was " - "slow\n", + "pacer_lost_cycles %15llu Lost transmission time because pacer was slow\n", m->pacer_lost_cycles); homa_append_metric(homa, - "pacer_bytes %15llu " - "Bytes transmitted when the pacer was active\n", + "pacer_bytes %15llu Bytes transmitted when the pacer was active\n", m->pacer_bytes); homa_append_metric(homa, - "pacer_skipped_rpcs %15llu " - "Pacer aborts because of locked RPCs\n", + "pacer_skipped_rpcs %15llu Pacer aborts because of locked RPCs\n", m->pacer_skipped_rpcs); homa_append_metric(homa, - "pacer_needed_help %15llu " - "homa_pacer_xmit invocations from " - "homa_check_pacer\n", + "pacer_needed_help %15llu homa_pacer_xmit invocations from homa_check_pacer\n", m->pacer_needed_help); homa_append_metric(homa, - "throttled_cycles %15llu " - "Time when the throttled queue was nonempty\n", + "throttled_cycles %15llu Time when the throttled queue was nonempty\n", m->throttled_cycles); homa_append_metric(homa, - "resent_packets %15llu " - "DATA packets sent in response to RESENDs\n", + "resent_packets %15llu DATA packets sent in response to RESENDs\n", m->resent_packets); homa_append_metric(homa, - "peer_hash_links %15llu " - "Hash chain link traversals in peer table\n", + "peer_hash_links %15llu Hash chain link traversals in peer table\n", m->peer_hash_links); homa_append_metric(homa, - "peer_new_entries %15llu " - "New entries created in peer table\n", + "peer_new_entries %15llu New entries created in peer table\n", m->peer_new_entries); homa_append_metric(homa, - "peer_kmalloc_errors %15llu " - "kmalloc failures creating peer table " - "entries\n", + "peer_kmalloc_errors %15llu kmalloc failures creating peer table entries\n", m->peer_kmalloc_errors); homa_append_metric(homa, - "peer_route_errors %15llu " - "Routing failures creating peer table " - "entries\n", + "peer_route_errors %15llu Routing failures creating peer table entries\n", m->peer_route_errors); homa_append_metric(homa, - "control_xmit_errors %15llu " - "Errors sending control packets\n", + "control_xmit_errors %15llu Errors sending control packets\n", m->control_xmit_errors); homa_append_metric(homa, - "data_xmit_errors %15llu " - "Errors sending data packets\n", + "data_xmit_errors %15llu Errors sending data packets\n", m->data_xmit_errors); homa_append_metric(homa, - "unknown_rpcs %15llu " - "Non-grant packets discarded because RPC unknown\n", + "unknown_rpcs %15llu Non-grant packets discarded because RPC unknown\n", m->unknown_rpcs); homa_append_metric(homa, - "server_cant_create_rpcs %15llu " - "Packets discarded because server " - "couldn't create RPC\n", + "server_cant_create_rpcs %15llu Packets discarded because server couldn't 
create RPC\n", m->server_cant_create_rpcs); homa_append_metric(homa, - "unknown_packet_types %15llu " - "Packets discarded because of unsupported " - "type\n", + "unknown_packet_types %15llu Packets discarded because of unsupported type\n", m->unknown_packet_types); homa_append_metric(homa, - "short_packets %15llu " - "Packets discarded because too short\n", + "short_packets %15llu Packets discarded because too short\n", m->short_packets); homa_append_metric(homa, - "packet_discards %15llu " - "Non-resent packets discarded because data " - "already received\n", + "packet_discards %15llu Non-resent packets discarded because data already received\n", m->packet_discards); homa_append_metric(homa, - "resent_discards %15llu " - "Resent packets discarded because data " - "already received\n", + "resent_discards %15llu Resent packets discarded because data already received\n", m->resent_discards); homa_append_metric(homa, - "resent_packets_used %15llu " - "Retransmitted packets that were actually used\n", + "resent_packets_used %15llu Retransmitted packets that were actually used\n", m->resent_packets_used); homa_append_metric(homa, - "rpc_timeouts %15llu " - "RPCs aborted because peer was nonresponsive\n", + "rpc_timeouts %15llu RPCs aborted because peer was nonresponsive\n", m->rpc_timeouts); homa_append_metric(homa, - "server_rpc_discards %15llu " - "RPCs discarded by server because of errors\n", + "server_rpc_discards %15llu RPCs discarded by server because of errors\n", m->server_rpc_discards); homa_append_metric(homa, - "server_rpcs_unknown %15llu " - "RPCs aborted by server because unknown to " - "client\n", + "server_rpcs_unknown %15llu RPCs aborted by server because unknown to client\n", m->server_rpcs_unknown); homa_append_metric(homa, - "client_lock_misses %15llu " - "Bucket lock misses for client RPCs\n", + "client_lock_misses %15llu Bucket lock misses for client RPCs\n", m->client_lock_misses); homa_append_metric(homa, - "client_lock_miss_cycles %15llu " - "Time lost waiting for client bucket locks\n", + "client_lock_miss_cycles %15llu Time lost waiting for client bucket locks\n", m->client_lock_miss_cycles); homa_append_metric(homa, - "server_lock_misses %15llu " - "Bucket lock misses for server RPCs\n", + "server_lock_misses %15llu Bucket lock misses for server RPCs\n", m->server_lock_misses); homa_append_metric(homa, - "server_lock_miss_cycles %15llu " - "Time lost waiting for server bucket locks\n", + "server_lock_miss_cycles %15llu Time lost waiting for server bucket locks\n", m->server_lock_miss_cycles); homa_append_metric(homa, - "socket_lock_misses %15llu " - "Socket lock misses\n", + "socket_lock_misses %15llu Socket lock misses\n", m->socket_lock_misses); homa_append_metric(homa, - "socket_lock_miss_cycles %15llu " - "Time lost waiting for socket locks\n", + "socket_lock_miss_cycles %15llu Time lost waiting for socket locks\n", m->socket_lock_miss_cycles); homa_append_metric(homa, - "throttle_lock_misses %15llu " - "Throttle lock misses\n", + "throttle_lock_misses %15llu Throttle lock misses\n", m->throttle_lock_misses); homa_append_metric(homa, - "throttle_lock_miss_cycles %15llu " - "Time lost waiting for throttle locks\n", + "throttle_lock_miss_cycles %15llu Time lost waiting for throttle locks\n", m->throttle_lock_miss_cycles); homa_append_metric(homa, - "peer_ack_lock_misses %15llu " - "Misses on peer ack locks\n", + "peer_ack_lock_misses %15llu Misses on peer ack locks\n", m->peer_ack_lock_misses); homa_append_metric(homa, - "peer_ack_lock_miss_cycles %15llu " - "Time 
lost waiting for peer ack locks\n",
+				"peer_ack_lock_miss_cycles %15llu Time lost waiting for peer ack locks\n",
 				m->peer_ack_lock_miss_cycles);
 		homa_append_metric(homa,
-				"grantable_lock_misses %15llu "
-				"Grantable lock misses\n",
+				"grantable_lock_misses %15llu Grantable lock misses\n",
 				m->grantable_lock_misses);
 		homa_append_metric(homa,
-				"grantable_lock_miss_cycles%15llu "
-				"Time lost waiting for grantable lock\n",
+				"grantable_lock_miss_cycles%15llu Time lost waiting for grantable lock\n",
 				m->grantable_lock_miss_cycles);
 		homa_append_metric(homa,
-				"grantable_rpcs_integral %15llu "
-				"Integral of homa->num_grantable_rpcs*dt\n",
+				"grantable_rpcs_integral %15llu Integral of homa->num_grantable_rpcs*dt\n",
 				m->grantable_rpcs_integral);
 		homa_append_metric(homa,
-				"grant_recalc_calls %15llu "
-				"Number of calls to homa_grant_recalc\n",
+				"grant_recalc_calls %15llu Number of calls to homa_grant_recalc\n",
 				m->grant_recalc_calls);
 		homa_append_metric(homa,
-				"grant_recalc_cycles %15llu "
-				"Time spent in homa_grant_recalc\n",
+				"grant_recalc_cycles %15llu Time spent in homa_grant_recalc\n",
 				m->grant_recalc_cycles);
 		homa_append_metric(homa,
-				"grant_recalc_skips %15llu "
-				"Number of times homa_grant_recalc skipped "
-				"redundant work\n",
+				"grant_recalc_skips %15llu Number of times homa_grant_recalc skipped redundant work\n",
 				m->grant_recalc_skips);
 		homa_append_metric(homa,
-				"grant_recalc_loops %15llu "
-				"Number of times homa_grant_recalc looped back\n",
+				"grant_recalc_loops %15llu Number of times homa_grant_recalc looped back\n",
 				m->grant_recalc_loops);
 		homa_append_metric(homa,
-				"grant_priority_bumps %15llu "
-				"Number of times an RPC moved up in the grant "
-				"priority order\n",
+				"grant_priority_bumps %15llu Number of times an RPC moved up in the grant priority order\n",
 				m->grant_priority_bumps);
 		homa_append_metric(homa,
-				"fifo_grants %15llu "
-				"Grants issued using FIFO priority\n",
+				"fifo_grants %15llu Grants issued using FIFO priority\n",
 				m->fifo_grants);
 		homa_append_metric(homa,
-				"fifo_grants_no_incoming %15llu "
-				"FIFO grants to messages with no "
-				"outstanding grants\n",
+				"fifo_grants_no_incoming %15llu FIFO grants to messages with no outstanding grants\n",
 				m->fifo_grants_no_incoming);
 		homa_append_metric(homa,
-				"disabled_reaps %15llu "
-				"Reaper invocations that were disabled\n",
+				"disabled_reaps %15llu Reaper invocations that were disabled\n",
 				m->disabled_reaps);
 		homa_append_metric(homa,
-				"disabled_rpc_reaps %15llu "
-				"Disabled RPCs skipped by reaper\n",
+				"disabled_rpc_reaps %15llu Disabled RPCs skipped by reaper\n",
 				m->disabled_rpc_reaps);
 		homa_append_metric(homa,
-				"reaper_calls %15llu "
-				"Reaper invocations that were not disabled\n",
+				"reaper_calls %15llu Reaper invocations that were not disabled\n",
 				m->reaper_calls);
 		homa_append_metric(homa,
-				"reaper_dead_skbs %15llu "
-				"Sum of hsk->dead_skbs across all reaper "
-				"calls\n",
+				"reaper_dead_skbs %15llu Sum of hsk->dead_skbs across all reaper calls\n",
 				m->reaper_dead_skbs);
 		homa_append_metric(homa,
-				"forced_reaps %15llu "
-				"Reaps forced by accumulation of dead RPCs\n",
+				"forced_reaps %15llu Reaps forced by accumulation of dead RPCs\n",
 				m->forced_reaps);
 		homa_append_metric(homa,
-				"throttle_list_adds %15llu "
-				"Calls to homa_add_to_throttled\n",
+				"throttle_list_adds %15llu Calls to homa_add_to_throttled\n",
 				m->throttle_list_adds);
 		homa_append_metric(homa,
-				"throttle_list_checks %15llu "
-				"List elements checked in "
-				"homa_add_to_throttled\n",
+				"throttle_list_checks %15llu List elements checked in 
homa_add_to_throttled\n", m->throttle_list_checks); homa_append_metric(homa, - "ack_overflows %15llu " - "Explicit ACKs sent because peer->acks was " - "full\n", + "ack_overflows %15llu Explicit ACKs sent because peer->acks was full\n", m->ack_overflows); homa_append_metric(homa, - "ignored_need_acks %15llu " - "NEED_ACKs ignored because RPC result not " - "yet received\n", + "ignored_need_acks %15llu NEED_ACKs ignored because RPC result not yet received\n", m->ignored_need_acks); homa_append_metric(homa, - "bpage_reuses %15llu " - "Buffer page could be reused because ref " - "count was zero\n", + "bpage_reuses %15llu Buffer page could be reused because ref count was zero\n", m->bpage_reuses); homa_append_metric(homa, - "buffer_alloc_failures %15llu " - "homa_pool_allocate didn't find enough buffer " - "space for an RPC\n", + "buffer_alloc_failures %15llu homa_pool_allocate didn't find enough buffer space for an RPC\n", m->buffer_alloc_failures); homa_append_metric(homa, - "linux_pkt_alloc_bytes %15llu " - "Bytes allocated in new packets by NIC driver " - "due to cache overflows\n", + "linux_pkt_alloc_bytes %15llu Bytes allocated in new packets by NIC driver due to cache overflows\n", m->linux_pkt_alloc_bytes); homa_append_metric(homa, - "dropped_data_no_bufs %15llu " - "Data bytes dropped because app buffers full\n", + "dropped_data_no_bufs %15llu Data bytes dropped because app buffers full\n", m->dropped_data_no_bufs); homa_append_metric(homa, - "gen3_handoffs %15llu " - "GRO->SoftIRQ handoffs made by Gen3 balancer\n", + "gen3_handoffs %15llu GRO->SoftIRQ handoffs made by Gen3 balancer\n", m->gen3_handoffs); homa_append_metric(homa, - "gen3_alt_handoffs %15llu " - "Gen3 handoffs to secondary core (primary was " - "busy)\n", + "gen3_alt_handoffs %15llu Gen3 handoffs to secondary core (primary was busy)\n", m->gen3_alt_handoffs); homa_append_metric(homa, - "gro_grant_bypasses %15llu " - "Grant packets passed directly to homa_softirq " - "by homa_gro_receive\n", + "gro_grant_bypasses %15llu Grant packets passed directly to homa_softirq by homa_gro_receive\n", m->gro_grant_bypasses); homa_append_metric(homa, - "gro_data_bypasses %15llu " - "Data packets passed directly to homa_softirq " - "by homa_gro_receive\n", + "gro_data_bypasses %15llu Data packets passed directly to homa_softirq by homa_gro_receive\n", m->gro_data_bypasses); for (i = 0; i < NUM_TEMP_METRICS; i++) homa_append_metric(homa, - "temp%-2d %15llu " - "Temporary use in testing\n", + "temp%-2d %15llu Temporary use in testing\n", i, m->temp[i]); } @@ -2085,7 +1947,7 @@ void homa_prios_changed(struct homa *homa) homa->max_sched_prio = 0; break; } - if ((homa->unsched_cutoffs[i] >= HOMA_MAX_MESSAGE_LENGTH)) { + if (homa->unsched_cutoffs[i] >= HOMA_MAX_MESSAGE_LENGTH) { homa->max_sched_prio = i-1; break; } @@ -2100,10 +1962,11 @@ void homa_prios_changed(struct homa *homa) void homa_spin(int ns) { __u64 end; + end = get_cycles() + (ns*cpu_khz)/1000000; - while (get_cycles() < end) { + while (get_cycles() < end) /* Empty loop body.*/ - } + ; } /** @@ -2116,6 +1979,7 @@ void homa_spin(int ns) void homa_throttle_lock_slow(struct homa *homa) { __u64 start = get_cycles(); + tt_record("beginning wait for throttle lock"); spin_lock_bh(&homa->throttle_lock); tt_record("ending wait for throttle lock"); @@ -2141,14 +2005,16 @@ void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, char *format) if (!tt_frozen) { // struct freeze_header freeze; int dummy; - printk(KERN_NOTICE "freezing in homa_freeze with freeze_type %d\n", type); 
+ + pr_notice("freezing in %s with freeze_type %d\n", __func__, + type); tt_record1("homa_freeze calling homa_rpc_log_active with freeze_type %d", type); homa_rpc_log_active_tt(rpc->hsk->homa, 0); homa_validate_incoming(rpc->hsk->homa, 1, &dummy); - printk(KERN_NOTICE "%s\n", format); + pr_notice("%s\n", format); tt_record2(format, rpc->id, tt_addr(rpc->peer->addr)); tt_freeze(); // homa_xmit_control(FREEZE, &freeze, sizeof(freeze), rpc); homa_freeze_peers(rpc->hsk->homa); } -} \ No newline at end of file +} diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index fcfc529a..ce97e7d1 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -42,7 +42,7 @@ FIXTURE(homa_grant) { int server_port; __u64 client_id; __u64 server_id; - sockaddr_in_union server_addr; + union sockaddr_in_union server_addr; struct homa homa; struct homa_sock hsk; struct data_header data; diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 86e88663..4b7f306a 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -129,7 +129,7 @@ FIXTURE(homa_incoming) { int server_port; __u64 client_id; __u64 server_id; - sockaddr_in_union server_addr; + union sockaddr_in_union server_addr; struct homa homa; struct homa_sock hsk; struct homa_sock hsk2; diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 4ecd3053..83002cbe 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -38,7 +38,7 @@ FIXTURE(homa_outgoing) { __u64 server_id; struct homa homa; struct homa_sock hsk; - sockaddr_in_union server_addr; + union sockaddr_in_union server_addr; struct homa_peer *peer; }; FIXTURE_SETUP(homa_outgoing) diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 556752f6..6dbf40ed 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -29,8 +29,8 @@ FIXTURE(homa_plumbing) { __u64 server_id; struct homa homa; struct homa_sock hsk; - sockaddr_in_union client_addr; - sockaddr_in_union server_addr; + union sockaddr_in_union client_addr; + union sockaddr_in_union server_addr; struct data_header data; int starting_skb_count; struct msghdr recvmsg_hdr; @@ -40,7 +40,7 @@ FIXTURE(homa_plumbing) { struct homa_sendmsg_args sendmsg_args; char buffer[2000]; sockptr_t optval; - sockaddr_in_union addr; + union sockaddr_in_union addr; }; FIXTURE_SETUP(homa_plumbing) { @@ -128,7 +128,7 @@ TEST_F(homa_plumbing, homa_bind__ipv6_address_too_short) homa_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, &self->homa, 0); - sockaddr_in_union addr = {}; + union sockaddr_in_union addr = {}; addr.in6.sin6_family = AF_INET6; struct socket sock = {}; sock.sk = &self->hsk.inet.sk; @@ -142,7 +142,7 @@ TEST_F(homa_plumbing, homa_bind__ipv6_ok) homa_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, &self->homa, 0); - sockaddr_in_union addr = {}; + union sockaddr_in_union addr = {}; addr.in6.sin6_family = AF_INET6; addr.in6.sin6_port = htons(123); struct socket sock = {}; @@ -158,7 +158,7 @@ TEST_F(homa_plumbing, homa_bind__ipv4_address_too_short) homa_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, &self->homa, 0); - sockaddr_in_union addr = {}; + union sockaddr_in_union addr = {}; addr.in4.sin_family = AF_INET; struct socket sock = {}; sock.sk = &self->hsk.inet.sk; @@ -172,7 +172,7 @@ TEST_F(homa_plumbing, homa_bind__ipv4_ok) homa_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, &self->homa, 0); - sockaddr_in_union addr = {}; + union sockaddr_in_union addr = {}; addr.in4.sin_family = AF_INET; addr.in4.sin_port = htons(345); 
struct socket sock = {}; diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index 3970d5bd..43ea267f 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -519,8 +519,8 @@ TEST_F(homa_pool, homa_pool_check_waiting__rpc_initially_locked) unit_log_clear(); atomic_set(&pool->free_bpages, 1); homa_pool_check_waiting(pool); - EXPECT_SUBSTR("rpc lock unavailable in homa_pool_release_buffers; " - "rpc lock unavailable in homa_pool_release_buffers", + EXPECT_SUBSTR("rpc lock unavailable in homa_pool_check_waiting; " + "rpc lock unavailable in homa_pool_check_waiting", unit_log_get()); EXPECT_EQ(1, crpc->msgin.num_bpages); EXPECT_TRUE(list_empty(&self->hsk.waiting_for_bufs)); diff --git a/test/unit_homa_timer.c b/test/unit_homa_timer.c index cd5193e9..33f00a8f 100644 --- a/test/unit_homa_timer.c +++ b/test/unit_homa_timer.c @@ -16,7 +16,7 @@ FIXTURE(homa_timer) { int server_port; __u64 client_id; __u64 server_id; - sockaddr_in_union server_addr; + union sockaddr_in_union server_addr; struct homa homa; struct homa_sock hsk; }; diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index fcc82212..aa03f90f 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -21,7 +21,7 @@ FIXTURE(homa_utils) { __u64 server_id; struct homa homa; struct homa_sock hsk; - sockaddr_in_union server_addr; + union sockaddr_in_union server_addr; struct data_header data; struct homa_rpc *crpc; struct iovec iovec; diff --git a/test/unit_timetrace.c b/test/unit_timetrace.c index aae1c001..650e96ef 100644 --- a/test/unit_timetrace.c +++ b/test/unit_timetrace.c @@ -90,7 +90,7 @@ TEST_F(timetrace, tt_record_buf__wraparound) TEST_F(timetrace, tt_find_oldest) { - int pos[NR_CPUS]; + int pos[nr_cpu_ids]; tt_buffer_size = 4; tt_record_buf(tt_buffers[0], 1500, "Buf0", 0, 0, 0, 0); diff --git a/test/utils.c b/test/utils.c index 841f7a23..8b4e3e6c 100644 --- a/test/utils.c +++ b/test/utils.c @@ -34,7 +34,7 @@ struct homa_rpc *unit_client_rpc(struct homa_sock *hsk, int req_length, int resp_length) { int bytes_received; - sockaddr_in_union server_addr; + union sockaddr_in_union server_addr; int saved_id = atomic64_read(&hsk->homa->next_outgoing_id); server_addr.in6.sin6_family = AF_INET6; diff --git a/timetrace.c b/timetrace.c index e429786c..7c44cdd2 100644 --- a/timetrace.c +++ b/timetrace.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" @@ -20,7 +18,7 @@ extern int *tt_linux_homa_temp; extern int tt_linux_homa_temp_default[]; extern void (*tt_linux_inc_metrics)(int metric, __u64 count); extern void (*tt_linux_record)(struct tt_buffer *buffer, __u64 timestamp, - const char* format, __u32 arg0, __u32 arg1, __u32 arg2, + const char *format, __u32 arg0, __u32 arg1, __u32 arg2, __u32 arg3); extern void tt_linux_skip_metrics(int metric, __u64 count); extern void (*tt_linux_printk)(void); @@ -40,7 +38,7 @@ extern void tt_inc_metric(int metric, __u64 count); * synchronization in tt_record, which improves performance significantly. * NR_CPUS is an overestimate of the actual number of cores; we use it * here, rather than nr_cpu_ids, because it allows for static allocation - * of this array. And + * of this array. */ struct tt_buffer *tt_buffers[NR_CPUS]; @@ -87,7 +85,7 @@ int tt_buffer_size = TT_BUF_SIZE; int tt_pf_storage = TT_PF_BUF_SIZE; /* Set during tests to disable "cpu_khz" line in trace output. 
*/ -bool tt_test_no_khz = false; +bool tt_test_no_khz; /** * tt_init(): Enable time tracing, create /proc file for reading traces. @@ -105,16 +103,15 @@ int tt_init(char *proc_file, int *temp) { int i; - if (init) { + if (init) return 0; - } for (i = 0; i < nr_cpu_ids; i++) { struct tt_buffer *buffer; + buffer = kmalloc(sizeof(*buffer), GFP_KERNEL); if (buffer == NULL) { - printk(KERN_ERR "timetrace couldn't allocate " - "tt_buffers\n"); + pr_err("timetrace couldn't allocate tt_buffers\n"); goto error; } memset(buffer, 0, sizeof(*buffer)); @@ -122,10 +119,10 @@ int tt_init(char *proc_file, int *temp) } if (proc_file != NULL) { - tt_dir_entry = proc_create(proc_file, S_IRUGO, NULL, &tt_pops); + tt_dir_entry = proc_create(proc_file, 0444, NULL, &tt_pops); if (!tt_dir_entry) { - printk(KERN_ERR "couldn't create /proc/%s for timetrace " - "reading\n", proc_file); + pr_err("couldn't create /proc/%s for timetrace reading\n", + proc_file); goto error; } } else { @@ -138,9 +135,8 @@ int tt_init(char *proc_file, int *temp) init = true; #ifdef TT_KERNEL - for (i = 0; i < nr_cpu_ids; i++) { + for (i = 0; i < nr_cpu_ids; i++) tt_linux_buffers[i] = tt_buffers[i]; - } tt_linux_record = tt_record_buf; tt_linux_freeze = tt_freeze; tt_linux_freeze_count = &tt_freeze_count; @@ -155,7 +151,7 @@ int tt_init(char *proc_file, int *temp) return 0; - error: +error: for (i = 0; i < nr_cpu_ids; i++) { kfree(tt_buffers[i]); tt_buffers[i] = NULL; @@ -170,6 +166,7 @@ int tt_init(char *proc_file, int *temp) void tt_destroy(void) { int i; + spin_lock(&tt_lock); if (init) { init = false; @@ -186,9 +183,8 @@ void tt_destroy(void) tt_linux_record = ltt_record_nop; tt_linux_freeze = tt_linux_nop; tt_linux_freeze_count = &tt_linux_freeze_no_homa; - for (i = 0; i < nr_cpu_ids; i++) { + for (i = 0; i < nr_cpu_ids; i++) tt_linux_buffers[i] = NULL; - } tt_linux_inc_metrics = tt_linux_skip_metrics; tt_linux_printk = tt_linux_nop; tt_linux_dbg1 = (void (*)(char *, ...)) tt_linux_nop; @@ -214,7 +210,7 @@ void tt_freeze(void) if (tt_frozen) return; tt_record("timetrace frozen"); - printk(KERN_NOTICE "tt_freeze invoked\n"); + pr_notice("%s invoked\n", __func__); spin_lock(&tt_lock); if (!tt_frozen) { tt_frozen = true; @@ -243,10 +239,11 @@ void tt_freeze(void) * @arg3: Argument to use when printing a message about this event. 
*/ void tt_record_buf(struct tt_buffer *buffer, __u64 timestamp, - const char* format, __u32 arg0, __u32 arg1, __u32 arg2, + const char *format, __u32 arg0, __u32 arg1, __u32 arg2, __u32 arg3) { struct tt_event *event; + if (unlikely(atomic_read(&tt_freeze_count) > 0)) { // In order to ensure that reads produce consistent // results, don't record concurrently (this could cause @@ -285,7 +282,7 @@ void tt_record_buf(struct tt_buffer *buffer, __u64 timestamp, */ void tt_find_oldest(int *pos) { - struct tt_buffer* buffer; + struct tt_buffer *buffer; int i; __u64 start_time = 0; @@ -297,10 +294,10 @@ void tt_find_oldest(int *pos) int index = (buffer->next_index + 1) & (tt_buffer_size-1); struct tt_event *event = &buffer->events[index]; + pos[i] = index; - if (event->timestamp > start_time) { + if (event->timestamp > start_time) start_time = event->timestamp; - } } } @@ -326,7 +323,7 @@ void tt_find_oldest(int *pos) */ int tt_proc_open(struct inode *inode, struct file *file) { - struct tt_proc_file* pf = NULL; + struct tt_proc_file *pf = NULL; int result = 0; spin_lock(&tt_lock); @@ -352,7 +349,7 @@ int tt_proc_open(struct inode *inode, struct file *file) "cpu_khz: %u\n", cpu_khz); } - done: +done: spin_unlock(&tt_lock); return result; } @@ -381,8 +378,8 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf, spin_lock(&tt_lock); if ((pf == NULL) || (pf->file != file)) { - printk(KERN_ERR "tt_metrics_read found damaged " - "private_data: 0x%p\n", file->private_data); + pr_err("tt_metrics_read found damaged private_data: 0x%p\n", + file->private_data); copied_to_user = -EINVAL; goto done; } @@ -403,16 +400,17 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf, /* Check all the traces to find the earliest available event. */ for (i = 0; i < nr_cpu_ids; i++) { struct tt_buffer *buffer = tt_buffers[i]; + event = &buffer->events[pf->pos[i]]; if ((pf->pos[i] != buffer->next_index) && (event->timestamp < earliest_time)) { - current_core = i; - earliest_time = event->timestamp; + current_core = i; + earliest_time = event->timestamp; } } if (current_core < 0) { - /* None of the traces have any more events to process. */ - goto flush; + /* None of the traces have any more events. */ + goto flush; } /* Format one event. */ @@ -420,13 +418,12 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf, pf->pos[current_core]]); available = tt_pf_storage - (pf->next_byte + pf->bytes_available - pf->msg_storage); - if (available == 0) { + if (available == 0) goto flush; - } entry_length = snprintf(pf->next_byte + pf->bytes_available, available, "%lu [C%02d] ", - (long unsigned int) event->timestamp, - current_core); + (unsigned long) event->timestamp, + current_core); if (available >= entry_length) entry_length += snprintf(pf->next_byte + pf->bytes_available + entry_length, @@ -437,7 +434,8 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf, /* Not enough room for this entry. */ if (pf->bytes_available == 0) { /* Even a full buffer isn't enough for - * this entry; truncate the entry. */ + * this entry; truncate the entry. 
+ */ entry_length = available - 1; } else { goto flush; @@ -450,11 +448,10 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf, & (tt_buffer_size-1); continue; - flush: +flush: chunk_size = pf->bytes_available; - if (chunk_size > (length - copied_to_user)) { + if (chunk_size > (length - copied_to_user)) chunk_size = length - copied_to_user; - } if (chunk_size == 0) goto done; failed_to_copy = copy_to_user(user_buf + copied_to_user, @@ -473,7 +470,7 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf, } } - done: +done: spin_unlock(&tt_lock); return copied_to_user; } @@ -503,11 +500,11 @@ loff_t tt_proc_lseek(struct file *file, loff_t offset, int whence) int tt_proc_release(struct inode *inode, struct file *file) { int i; - struct tt_proc_file *pf = file->private_data; + if ((pf == NULL) || (pf->file != file)) { - printk(KERN_ERR "tt_metrics_release found damaged " - "private_data: 0x%p\n", file->private_data); + pr_err("tt_metrics_release found damaged private_data: 0x%p\n", + file->private_data); return -EINVAL; } @@ -528,6 +525,7 @@ int tt_proc_release(struct inode *inode, struct file *file) */ for (i = 0; i < nr_cpu_ids; i++) { struct tt_buffer *buffer = tt_buffers[i]; + buffer->events[tt_buffer_size-1].format = NULL; buffer->next_index = 0; } @@ -569,10 +567,8 @@ void tt_print_file(char *path) int bytes_used = 0; loff_t offset = 0; - printk(KERN_ERR "tt_print_file starting, file %s\n", path); - if (atomic_xchg(&active, 1)) { - printk(KERN_ERR "concurrent call to tt_print_file aborting\n"); + pr_err("concurrent call to %s aborting\n", __func__); return; } if (!init) @@ -580,13 +576,12 @@ void tt_print_file(char *path) filp = filp_open(path, O_WRONLY | O_CREAT, 0666); if (IS_ERR(filp)) { - printk(KERN_ERR "tt_print_file couldn't open %s: " - "error %ld\n", path, -PTR_ERR(filp)); + pr_err("%s couldn't open %s: error %ld\n", __func__, path, + -PTR_ERR(filp)); filp = NULL; goto done; } - tt_record("tt_print_file printing timetrace"); atomic_inc(&tt_freeze_count); tt_find_oldest(pos); @@ -604,16 +599,17 @@ void tt_print_file(char *path) /* Check all the traces to find the earliest available event. */ for (i = 0; i < nr_cpu_ids; i++) { struct tt_buffer *buffer = tt_buffers[i]; + event = &buffer->events[pos[i]]; if ((pos[i] != buffer->next_index) && (event->timestamp < earliest_time)) { - current_core = i; - earliest_time = event->timestamp; + current_core = i; + earliest_time = event->timestamp; } } if (current_core < 0) { - /* None of the traces have any more events to process. */ - break; + /* None of the traces have any more events. 
*/ + break; } event = &(tt_buffers[current_core]->events[ pos[current_core]]); @@ -623,7 +619,7 @@ void tt_print_file(char *path) bytes_used += snprintf(buffer + bytes_used, sizeof(buffer) - bytes_used, "%lu [C%02d] ", - (long unsigned int) event->timestamp, + (unsigned long) event->timestamp, current_core); bytes_used += snprintf(buffer + bytes_used, sizeof(buffer) - bytes_used, @@ -637,9 +633,8 @@ void tt_print_file(char *path) err = kernel_write(filp, buffer, bytes_used, &offset); if (err < 0) { - printk(KERN_NOTICE "tt_print_file got " - "error %d writing %s\n", - -err, path); + pr_notice("%s got error %d writing %s\n", + __func__, -err, path); goto done; } bytes_used = 0; @@ -648,25 +643,22 @@ void tt_print_file(char *path) if (bytes_used > 0) { err = kernel_write(filp, buffer, bytes_used, &offset); if (err < 0) - printk(KERN_ERR "tt_print_file got error %d " - "writing %s\n", -err, path); + pr_err("%s got error %d writing %s\n", + __func__, -err, path); } - printk(KERN_ERR "tt_print_file finishing up\n"); - done: +done: if (filp != NULL) { err = vfs_fsync(filp, 0); if (err < 0) - printk(KERN_ERR "tt_print_file got error %d " - "in fsync\n", -err); + pr_err("%s got error %d in fsync\n", __func__, -err); err = filp_close(filp, NULL); if (err < 0) - printk(KERN_ERR "tt_print_file got error %d " - "in filp_close\n", -err); + pr_err("%s got error %d in filp_close\n", __func__, + -err); } atomic_dec(&tt_freeze_count); atomic_set(&active, 0); - printk(KERN_ERR "tt_print_file(%s) finished\n", path); } /** @@ -687,7 +679,7 @@ void tt_printk(void) static atomic_t active; if (atomic_xchg(&active, 1)) { - printk(KERN_NOTICE "concurrent call to tt_printk aborting\n"); + pr_notice("concurrent call to %s aborting\n", __func__); return; } if (!init) @@ -695,7 +687,7 @@ void tt_printk(void) atomic_inc(&tt_freeze_count); tt_find_oldest(pos); - printk(KERN_NOTICE "cpu_khz: %u\n", cpu_khz); + pr_notice("cpu_khz: %u\n", cpu_khz); /* Each iteration of this loop printk's one event. */ while (true) { @@ -708,16 +700,17 @@ void tt_printk(void) /* Check all the traces to find the earliest available event. */ for (i = 0; i < nr_cpu_ids; i++) { struct tt_buffer *buffer = tt_buffers[i]; + event = &buffer->events[pos[i]]; if ((pos[i] != buffer->next_index) && (event->timestamp < earliest_time)) { - current_core = i; - earliest_time = event->timestamp; + current_core = i; + earliest_time = event->timestamp; } } if (current_core < 0) { - /* None of the traces have any more events to process. */ - break; + /* None of the traces have any more events. */ + break; } event = &(tt_buffers[current_core]->events[ pos[current_core]]); @@ -726,8 +719,8 @@ void tt_printk(void) snprintf(msg, sizeof(msg), event->format, event->arg0, event->arg1, event->arg2, event->arg3); - printk(KERN_NOTICE "%lu [C%02d] %s\n", - (long unsigned int) event->timestamp, + pr_notice("%lu [C%02d] %s\n", + (unsigned long) event->timestamp, current_core, msg); } @@ -748,7 +741,7 @@ void tt_get_messages(char *buffer, size_t length) /* Index of the next entry to return from each tt_buffer (too * large to allocate on stack, so allocate dynamically). */ - int *pos = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); + int *pos = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); int printed = 0; *buffer = 0; @@ -767,16 +760,17 @@ void tt_get_messages(char *buffer, size_t length) /* Check all the traces to find the earliest available event. 
*/ for (i = 0; i < nr_cpu_ids; i++) { struct tt_buffer *buffer = tt_buffers[i]; + event = &buffer->events[pos[i]]; if ((pos[i] != buffer->next_index) && (event->timestamp < earliest_time)) { - current_core = i; - earliest_time = event->timestamp; + current_core = i; + earliest_time = event->timestamp; } } if (current_core < 0) { - /* None of the traces have any more events to process. */ - break; + /* None of the traces have any more events. */ + break; } event = &(tt_buffers[current_core]->events[ pos[current_core]]); @@ -800,7 +794,7 @@ void tt_get_messages(char *buffer, size_t length) atomic_dec(&tt_freeze_count); - done: +done: kfree(pos); } diff --git a/timetrace.h b/timetrace.h index fdabf97c..3c186a57 100644 --- a/timetrace.h +++ b/timetrace.h @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +/* SPDX-License-Identifier: BSD-2-Clause */ #ifndef HOMA_TIMETRACE_H #define HOMA_TIMETRACE_H @@ -15,7 +13,7 @@ * Timetrace implements a circular buffer of entries, each of which * consists of a fine-grain timestamp, a short descriptive string, and * a few additional values. It's typically used to record times at - * various points in in kernel operations, in order to find performance + * various points in kernel operations, in order to find performance * bottlenecks. It can record a trace relatively efficiently (< 10ns as * of 6/2018), and the trace can be retrieved by user programs for * analysis by reading a file in /proc. @@ -33,7 +31,7 @@ struct tt_event { * Format string describing the event. NULL means that this * entry has never been occupied. */ - const char* format; + const char *format; /** * Up to 4 additional arguments that may be referenced by @@ -74,7 +72,7 @@ struct tt_buffer { */ struct tt_proc_file { /* Identifies a particular open file. */ - struct file* file; + struct file *file; /* Index of the next entry to return from each tt_buffer. */ int pos[NR_CPUS]; @@ -97,8 +95,8 @@ struct tt_proc_file { extern void tt_destroy(void); extern void tt_freeze(void); extern int tt_init(char *proc_file, int *temp); -extern void tt_record_buf(struct tt_buffer* buffer, __u64 timestamp, - const char* format, __u32 arg0, __u32 arg1, +extern void tt_record_buf(struct tt_buffer *buffer, __u64 timestamp, + const char *format, __u32 arg0, __u32 arg1, __u32 arg2, __u32 arg3); /* Private methods and variables: exposed so they can be accessed @@ -127,7 +125,7 @@ extern bool tt_test_no_khz; * the kernel. */ extern int64_t tt_debug_int64[100]; -extern void * tt_debug_ptr[100]; +extern void *tt_debug_ptr[100]; /** * tt_rdtsc(): return the current value of the fine-grain CPU cycle counter @@ -136,6 +134,7 @@ extern void * tt_debug_ptr[100]; static inline __u64 tt_rdtsc(void) { __u32 lo, hi; + __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi)); return (((__u64)hi << 32) | lo); } @@ -157,7 +156,7 @@ static inline __u64 tt_rdtsc(void) * @arg2 Argument to use when printing a message about this event. * @arg3 Argument to use when printing a message about this event. 
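 * (When ENABLE_TIME_TRACE is zero, the tt_record* wrappers below
 * compile to empty inline functions, so trace calls can remain in
 * the code with no production overhead.)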
*/ -static inline void tt_record4(const char* format, __u32 arg0, __u32 arg1, +static inline void tt_record4(const char *format, __u32 arg0, __u32 arg1, __u32 arg2, __u32 arg3) { #if ENABLE_TIME_TRACE @@ -165,7 +164,7 @@ static inline void tt_record4(const char* format, __u32 arg0, __u32 arg1, arg0, arg1, arg2, arg3); #endif } -static inline void tt_record3(const char* format, __u32 arg0, __u32 arg1, +static inline void tt_record3(const char *format, __u32 arg0, __u32 arg1, __u32 arg2) { #if ENABLE_TIME_TRACE @@ -173,21 +172,21 @@ static inline void tt_record3(const char* format, __u32 arg0, __u32 arg1, arg0, arg1, arg2, 0); #endif } -static inline void tt_record2(const char* format, __u32 arg0, __u32 arg1) +static inline void tt_record2(const char *format, __u32 arg0, __u32 arg1) { #if ENABLE_TIME_TRACE tt_record_buf(tt_buffers[raw_smp_processor_id()], get_cycles(), format, arg0, arg1, 0, 0); #endif } -static inline void tt_record1(const char* format, __u32 arg0) +static inline void tt_record1(const char *format, __u32 arg0) { #if ENABLE_TIME_TRACE tt_record_buf(tt_buffers[raw_smp_processor_id()], get_cycles(), format, arg0, 0, 0, 0); #endif } -static inline void tt_record(const char* format) +static inline void tt_record(const char *format) { #if ENABLE_TIME_TRACE tt_record_buf(tt_buffers[raw_smp_processor_id()], get_cycles(), format, From 87b841e4690a008b2931f05f4d29e5424c7c556a Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 26 Sep 2024 09:44:49 -0700 Subject: [PATCH 020/625] Remove obsolete timetrace analyzers (If any of these features are needed in the future, they should be reimplemented in tthoma.py) --- util/README.md | 33 +- util/ttcore.py | 105 ------ util/ttgrants.py | 783 --------------------------------------------- util/ttmlxalloc.py | 155 --------- util/ttnicdelay.py | 124 ------- util/ttrcv.py | 98 ------ util/ttskbs.py | 248 -------------- util/ttsoftirq.py | 139 -------- util/ttxmit.py | 250 --------------- 9 files changed, 4 insertions(+), 1931 deletions(-) delete mode 100755 util/ttcore.py delete mode 100755 util/ttgrants.py delete mode 100755 util/ttmlxalloc.py delete mode 100755 util/ttnicdelay.py delete mode 100755 util/ttrcv.py delete mode 100755 util/ttskbs.py delete mode 100755 util/ttsoftirq.py delete mode 100755 util/ttxmit.py diff --git a/util/README.md b/util/README.md index 716060de..d5023130 100644 --- a/util/README.md +++ b/util/README.md @@ -57,39 +57,14 @@ The following scripts are Homa-specific: **ttprint.py**: extracts the most recent timetrace from the kernel and prints it to standard output. -**ttsync.py**: uses Homa-specific information in a collection of timetraces -simultaneously on different nodes, and adjusts time values to synchronize -clocks. +**ttsync.py**: analyzes Homa-specific information in a collection of +timetraces simultaneously on different nodes and rewrites the traces to +synchronize their clocks. **tthoma.py**: this is the primary script for analyzing Homa data. It -contains multiple analyzers that extracts different kinds of data from a +contains multiple analyzers that extract different kinds of data from a collection of timetraces. Invoke with --help for full documentation. -The following scripts are older Homa-specific scripts. All of these -should eventually be superceded by tt_homa.py, and many already suffer -from bit-rot, so they may not work. - -**ttcore.py**: extracts records containing certain substrings and computes how -often those records occur on each core. 
- -**ttgrants.py**: computes *grant lag* for a timetrace: how long it takes after a -grant is issued for the granted packet to arrive. Also computes statistics on -when grants arrive, compared to when they need to arrive to transmit at full -link speed. - -**ttnicdelay.py**:: analyzes synchronized client and server traces to -detect situations where the NIC is delaying interrupts. - -**ttrcv.py**: analyzes packet arrivals in a timetrace, outputs information -on arrival times for each offset within a message. - -**ttsoftirq.py**: analyzes SoftIRQ wakeup times in a timetrace. Also measures -total lifetime of receive buffers from GRO -> kfree_skb. - -**ttxmit.py**: analyzes packet transmissions from a timetrace to identify -uplink bubbles (gaps during which the uplink was idle even though there -were active outbound messages). - ### Other Useful Tools **diff_rtts.py**: compares two .rtts files collected by the cperf benchmarks, diff --git a/util/ttcore.py b/util/ttcore.py deleted file mode 100755 index 3897f966..00000000 --- a/util/ttcore.py +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/python3 - -# Copyright (c) 2019-2022 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause - -""" -Scan the timetrace data in a log file; for records containing certain -substrings, compute statistics for how often those records occur on each -core. -Usage: ttcore.py [substring substring ...] [file] -Each substring argument selects a collection of entries in the timetrace; -each collection will be analyzed separately for core usage. If no substrings -are specified, a default collection will be used. File gives the name of the -timetrace file to use (stdin is used if no file is specified). -""" - -from __future__ import division, print_function -from glob import glob -from optparse import OptionParser -import math -import os -import re -import string -import sys - -# The substrate arguments that we are matching against timetrace entries -substrings = [] - -# For each entry in substrings there is an entry in this array, which -# consists of an array of counts (how many times a timetrace entry matching -# the substring occurred on each core). -cores = [] - -# Highest core number seen -max_core = 0 - -def scan(f): - """ - Scan the log file given by 'f' and accumulate core statistics. 
- """ - - global substrings, cores, max_core - startTime = 0.0 - prevTime = 0.0 - writes = 0 - for line in f: - match = re.match(' *([-0-9.]+) us .* \[C([0-9]+)\] (.*)', - line) - if not match: - print("Line didn't match: %s" % (line)) - continue - time = float(match.group(1)) - core = int(match.group(2)) - if core > max_core: - max_core = core - event = match.group(3) - for i in range(0, len(substrings)): - if substrings[i] in event: - c = cores[i] - while len(c) <= core: - c.append(0) - c[core] += 1 - -f = sys.stdin -substrings = [] -if len(sys.argv) > 1: - try: - f = open(sys.argv[-1]) - substrings = sys.argv[1:-1] - except: - substrings = sys.argv[1:] - -if len(substrings) == 0: - substrings = ["mlx processed", - "homa_softirq: first", - "homa_recvmsg returning", - "homa_sendmsg request", - "mlx_xmit starting, id", - "pacer calling", - "tcp_v4_rcv invoked", - "tcp_recvmsg returning" - ] - -for i in range(0, len(substrings)): - cores.append([]) - -scan(f) - -max_length = 0 -for i in range(0, len(substrings)): - length = len(substrings[i]) - if length > max_length: - max_length = length - while len(cores[i]) <= max_core: - cores[i].append(0) - -line = "Event Substring Core 0" -for i in range (1, len(cores[0])): - line += " %5d" % (i) -print(line) -for i in range(0, len(substrings)): - line = "%-*s " % (max_length+1, substrings[i] + ":") - for count in cores[i]: - line += " %5d" % (count) - print(line) \ No newline at end of file diff --git a/util/ttgrants.py b/util/ttgrants.py deleted file mode 100755 index 7010596b..00000000 --- a/util/ttgrants.py +++ /dev/null @@ -1,783 +0,0 @@ -#!/usr/bin/python3 - -""" -Scans a timetrace to compute various statistics related to grants, such -as how long it takes after a grant is issued for the first newly granted -packet to arrive. It can be used on either a client-side or server-side trace. -Usage: ttgrant.py [tt_file] - -The existing timetrace is in tt_file (or stdin in tt_file is omitted). -""" - -from __future__ import division, print_function -from glob import glob -from optparse import OptionParser -import math -import os -import re -import string -import sys -from statistics import median - -# Parse command line options -parser = OptionParser(description= - 'Read a timetrace and output statistics related to grants (works' - ' on both clients and servers)', - usage='%prog [options] [trace]', - conflict_handler='resolve') -parser.add_option('--verbose', '-v', action='store_true', default=False, - dest='verbose', - help='print lots of output') -parser.add_option('--gbps', type='int', dest='gbps', default=100, - metavar = 'N', help='network speed in Gbps') -parser.add_option('--mtu_data', type='int', dest='mtu_data', default=8920, - help='amount of message data in a full-size packet') -parser.add_option('--window', type='int', dest='window', default=200000, - metavar = 'N', help='window sysctl parameter for Homa') - -(options, extra) = parser.parse_args() -f = sys.stdin -if len(extra) > 0: - f = open(extra[0]) - if len(extra) > 1: - print("Unrecognized argument %s" % (extra[1])) - exit(1) - -def percentile(list, pct, format, na): - """ - Finds the element of list corresponding to a given percentile pct - (0 is first, 100 or more is last), formats it according to format, - and returns the result. Returns na if the list is empty. 
- """ - if len(list) == 0: - return na - i = int(pct*len(list)/100) - if i >= len(list): - i = len(list) - 1 - return format % (list[i]) - -# Collects all the observed grant latencies (time from sending grant -# to receiving first data packet enabled by grant), in microseconds -latencies = [] - -# Keys are RPC ids. Each value is a list of lists, one per grant sent, -# for an incoming message, where each sublist consists of a -# triple identifying one grant. -recv_grants = {} - -# Keys are RPC ids, values are the highest offset seen in any grant -# for the RPC (including the initial "grant" for unscheduled data). -last_grant = {} - -# Largest observed incoming packet size (presumably a full GSO packet?). -packet_size = 0 - -# Keys are outgoing RPC ids; each value is the amount of unscheduled data -# transmitted for that RPC. -unscheduled = {} - -# Keys are incoming RPC ids; each value is the amount of unscheduled data -# that will be received for that RPC. -recv_unscheduled = {} - -# Keys are RPC ids; each value is a list of lists, one per grant received -# for that RPC, and each entry is a triple -# indicating when the grant was received and the range of bytes it covers. -in_grants = {} - -# Keys are RPC ids; each value is a list of lists, one per data packet -# received by homa_gro_receive for that RPC, and each entry is a -# pair describing that data packet. -gro_data = {} - -# Keys are RPC ids; each value is a list of lists, one per data packet -# received by homa_softirq for that RPC, and each entry is a -# pair describing that data packet. -softirq_data = {} - -# Keys are RPC ids; each value is a list of lists, one per data packet -# sent for that RPC, and each entry is an triple -# describing that data packet. -out_data = {} - -# Keys are RPC ids; each value is the first time at which we noticed that -# this RPC is transmitting data. -first_out = {} - -# Total number of bytes of grants that have been received so far. -total_in_grants = 0 - -# Total number of grants available at end of trace. -end_grants = 0 - -# Total bytes transmitted in data packets. -total_xmit = 0 - -# Keys are RPC ids for outgoing messages; each value is the length of the -# corresponding message. -send_lengths = {} - -# Keys are RPC ids for incoming messages; each value is the length of the -# corresponding message. -recv_lengths = {} - -# Keys are RPC ids; each value is the number of message bytes transmitted -# for that message. -send_xmits = {} - -# Keys are RPC ids; each value is the name of the peer for that id -peers = {} - -# Used for saving statistics about grants as the tt is read -latest_time = 0 -prev_time = 0 -interval_end = 1000.0 -prev_grants = 0 -prev_xmit = 0 -stats = "" -avail = "" - -# Active RPC statistics from trace: -recv_stats = {"active": 0, "granted": 0, "grants_pending": 0, "backlog": 0} - -def pkt_length(offset, msg_length, unsched): - """ - Returns the number of bytes in a packet: - offset: position of first data byte within message - msg_length: total length of the message - unsched: # of unscheduled bytes for this message - """ - length = options.mtu_data - if ((offset + length) > msg_length) and (msg_length >= offset): - length = msg_length - offset - if (offset < unsched) and (offset + length) > unsched: - length = unsched - offset - return length - -def peer_name(id): - """ - Return a human-readable name for the peer associated with a given RPC id. 
- """ - global peers - peer = int(peers[id], 0) - return "node%d" % ((peer & 0xff) - 1) - -def set_peer(id, peer): - """ - Sets the peer associated with a particular id. And, if this id has - already been associated with a different peer, clear its state - """ - if (id in peers): - if (peers[id] == peer): - return - recv_grants[id] = [] - last_grant[id] = [] - del unscheduled[id] - del recv_unscheduled[id] - in_grants[id] = [] - gro_data[id] = [] - softirq_data[id] = [] - out_data[id] = [] - del first_out[id] - if id in send_lengths: - del send_lengths[id] - del recv_lengths[id] - del send_xmits[id] - peers[id] = peer - -for line in f: - # Collect information about outgoing message lengths - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'homa_sendmsg request, .* id ([0-9]+), length ([0-9]+)', line) - if match: - time = float(match.group(1)) - latest_time = time - id = int(match.group(4)) - length = int(match.group(5)) - send_lengths[id] = length - last_grant[id] = 0 - send_xmits[id] = 0 - # print("%9.3f Outgoing message for id %d has %d bytes" - # % (time, id, length)) - - # Collect info about outgoing grants (including implicit grants - # for unscheduled bytes) - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'Incoming message for id ([0-9.]+) has ([0-9.]+) unscheduled', line) - if match: - time = float(match.group(1)) - latest_time = time - id = int(match.group(4)) - offset = int(match.group(5)) - recv_unscheduled[id] = offset - # print("%9.3f: unscheduled 'grant' for id %d, offset %d" % ( - # time, id, offset)) - - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'sending grant for id ([0-9.]+), offset ([0-9.]+), .* ' - 'increment ([0-9.]+)', line) - if match: - time = float(match.group(1)) - id = int(match.group(4)) - offset = int(match.group(5)) - increment = int(match.group(5)) - - if not id in recv_grants: - recv_grants[id] = [] - recv_grants[id].append([time, offset - increment, offset]) - - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'sending fifo grant for id ([0-9.]+), offset ([0-9.]+)', line) - if match: - time = float(match.group(1)) - id = int(match.group(4)) - offset = int(match.group(5)) - - if not id in recv_grants: - recv_grants[id] = [] - prev = offset - else: - prev = recv_grants[id][-1][2] - recv_grants[id].append([time, prev, offset]) - - # Collect info about incoming data packets processed by homa_softirq - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'incoming data packet, id ([0-9]+), .*, offset ([0-9.]+)/([0-9.]+)', - line) - if match: - time = float(match.group(1)) - latest_time = time - id = int(match.group(4)) - offset = int(match.group(5)) - length = int(match.group(6)) - - if not id in softirq_data: - softirq_data[id] = [] - softirq_data[id].append([time, offset]) - recv_lengths[id] = length - - # Collect info about incoming data packets processed by homa_gro_receive - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'homa_gro_receive got packet from ([^ ]+) id ([0-9]+), offset ([0-9.]+)', - line) - if match: - time = float(match.group(1)) - latest_time = time - peer = match.group(4) - id = int(match.group(5)) - offset = int(match.group(6)) - - set_peer(id, peer) - if not id in gro_data: - gro_data[id] = [] - gro_data[id].append([time, offset]) - - # Collect information about unscheduled data for outgoing RPCs - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'starting 
copy from user space .* id ([0-9]+), .* unscheduled ([0-9]+)', - line) - if match: - time = float(match.group(1)) - latest_time = time - id = int(match.group(4)) - unsched = int(match.group(5)) - unscheduled[id] = unsched - first_out[id] = time - last_grant[id] = unsched - total_in_grants += unsched - # print("%9.3f: %d unscheduled bytes for id %d" % (time, id, unsched)) - - # Collect info about incoming grants - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'processing grant for id ([0-9]+), offset ([0-9]+)', line) - if match: - time = float(match.group(1)) - latest_time = time - id = int(match.group(4)) - offset = int(match.group(5)) - - if not id in in_grants: - if not id in out_data: - # The trace doesn't include any outgoing data packets - # (started partway through an RPC) - continue - in_grants[id] = [] - if in_grants[id]: - start = in_grants[id][-1][2] - else: - if not id in unscheduled: - continue - start = unscheduled[id] - if start >= offset: - if options.verbose: - print("%9.3f: out of order grant for id %d: offset %d followed " - "by offset %d" % (time, id, start, offset)) - continue - in_grants[id].append([time, start, offset]) - - if not id in last_grant: - print("%9.3f no unscheduled grant found for id %d" % (time, id)) - continue - if offset > last_grant[id]: - total_in_grants += offset - last_grant[id] - last_grant[id] = offset - # print("%9.3f: incoming grant for id %d, range %d:%d" % ( - # time, id, start, offset)) - - # Collect info about outgoing data packets (and also the packet size) - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'Finished queueing packet: .* id ([0-9]+), offset ([0-9]+), ' - 'len ([0-9]+)', line) - if match: - time = float(match.group(1)) - latest_time = time - id = int(match.group(4)) - offset = int(match.group(5)) - length = int(match.group(6)) - - if length > packet_size: - packet_size = length - # print("Setting packet size to %d" % (packet_size)) - - if not id in out_data: - if offset != 0: - # The trace doesn't include all outgoing data packets - continue - out_data[id] = [] - out_data[id].append([time, offset, length]) - if not (id in first_out): - first_out[id] = time - - if not id in last_grant: - last_grant[id] = offset - print("%9.3f RPC id %d wasn't in last_grant (offset %d)" - % (time, id, offset)) - pkt_end = offset + length - if pkt_end > last_grant[id]: - total_in_grants += pkt_end - last_grant[id] - last_grant[id] = pkt_end - total_xmit += length - if not id in send_xmits: - send_xmits[id] = 0 - send_xmits[id] += length - - if (id in send_lengths) and (offset + length) == send_lengths[id]: - if last_grant[id] != send_lengths[id]: - print("%9.3f Final grants for id %d (%d) didn't match " - "length (%d)" % (time, id, last_grant[id], - send_lengths[id])) - if send_xmits[id] != send_lengths[id]: - print("%9.3f Xmit data for id %d (%d) didn't match message " - "length (%d)" % (time, id, send_xmits[id], send_lengths[id])) - # print("%9.3f: outgoing data for id %d, offset %d" % ( - # time, id, offset)) - - # Collect info about grants available at the end of the trace - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'RPC id ([0-9]+) has ([0-9]+) unsent grants', line) - if match: - time = float(match.group(1)) - latest_time = time - id = int(match.group(4)) - available = int(match.group(5)) - end_grants += available - avail += "%7d %7.1f\n" % (id, available/1000) - - # Collect info about active incoming RPCs at the end of the trace - match = re.match(' 
*([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'Incoming RPC id ([0-9]+), .* ([0-9]+)/([0-9]+) bytes', line) - if match: - recvd = int(match.group(5)) - length = int(match.group(6)) - recv_stats["active"] += 1 - recv_stats["backlog"] += length - recvd - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'RPC id ([0-9]+) has ([0-9]+) outstanding grants', line) - if match: - outstanding = int(match.group(5)) - recv_stats["granted"] += 1 - recv_stats["grants_pending"] += outstanding - - # Generate statistics at regular intervals. - if latest_time >= interval_end: - log_detail = False - if int(interval_end/1000 + 0.1) == 30: - log_detail = True - interval = latest_time - prev_time - send_active = 0 - send_granted = 0 - send_grant_bytes = 0 - send_backlog = 0 - - for id in send_lengths: - backlog = send_lengths[id] - send_xmits[id] - if backlog > 0: - send_active += 1 - send_backlog += backlog - granted = last_grant[id] - send_xmits[id] - if granted > 0: - send_granted += 1 - send_grant_bytes += granted - - recv_active = 0 - recv_granted = 0 - recv_grant_bytes = 0 - recv_backlog = 0 - for id in recv_lengths: - if id in recv_unscheduled: - unsched = recv_unscheduled[id] - else: - unsched = 20000 - length = recv_lengths[id] - recvd = 0 - for pkt in softirq_data[id]: - pkt_end = pkt[1] + pkt_length(pkt[1], length, unsched) - if pkt_end > recvd: - recvd = pkt_end - if log_detail and id == 1385988: - print("Packet info for id %d: recvd %d, pkt_end %d, pkt %s" % - (id, recvd, pkt_end, pkt)) - backlog = length - recvd - if backlog > 0: - recv_active += 1 - recv_backlog += backlog - granted = unsched - if (id in recv_grants) and recv_grants[id]: - granted = recv_grants[id][-1][2] - pending = granted - recvd - if 0 and log_detail: - print("%9.3f id %d: length %d, recvd %d, backlog %d, " - "granted %d, pending %d" - % (time, id, length, recvd, backlog, granted, - pending)) - if pending > 0: - recv_granted += 1 - recv_grant_bytes += pending - if log_detail: - print("%9.3f RPC id %d has %d outstanding grants" % - (time, id, pending)) - - if interval > 0: - stats += "%4.0f ms: %6.2f %6.2f %3d/%3d %7.3f %6.2f %3d/%3d %7.3f %7.2f\n" % ( - interval_end/1000, - 8*(total_in_grants - prev_grants)/(interval*1000), - 8*(total_xmit - prev_xmit)/(interval*1000), - send_granted, send_active, send_grant_bytes/1e6, - send_backlog/1e6, recv_granted, recv_active, - recv_grant_bytes/1e6, recv_backlog/1e6) - prev_time = latest_time - prev_grants = total_in_grants - prev_xmit = total_xmit - interval_end += 1000.0 - -# Get statistics about the time from first data packet to first -# incoming grant -first_grants = [] -for id in out_data: - if not ((id in in_grants) and in_grants[id]): - continue - delay = in_grants[id][0][0] - out_data[id][0][0] - first_grants.append(delay) - # print("Grant lag for id %d: %.3f us (ip_queue_xmit %.3f, " - # "grant received %.1f" % (id, delay, out_data[id][0][0], - # in_grants[id][0][0])) - -# Time to transmit a full-size packet, in microseconds. -xmit_time = (packet_size * 8)/(options.gbps * 1000) -print("Largest observed outgoing packet: %d bytes" % (packet_size)) -print("Wire serialization time for %d-byte packet at %d Gbps: %.1f us" % ( - packet_size, options.gbps, xmit_time)) - -# Collect info for all incoming grants about how much additional data -# is authorized by each grant. 
-in_deltas = [] -for key in in_grants: - rpc_grants = in_grants[key] - for grant in rpc_grants: - in_deltas.append(grant[2] - grant[1]) - -# Compute lag in incoming grants (when the grant arrives relative to -# when we need it). For this, we only consider second and later grants -# for an RPC (assume the first one may be delayed by SRPT). -in_lags = [] -total_lag = 0 -for id in out_data: - if not id in in_grants: - continue - data = out_data[id] - grants = in_grants[id] - # For each grant, find the last data packet that could be sent - # without needing that grant - d = 0 - prev_data_time = 0 - for g in range(1, len(in_grants[id])): - grant = grants[g] - grant_start = grant[1] - time = grant[1] - if d >= len(data): - print("Ran out of data packets for id %d" % (id)) - break - while (data[d][1] < grant_start) and (d < (len(data)-1)): - prev_data_time = data[d][0] - d += 1 - if data[d][1] < grant_start: - break - lag = grant[0] - prev_data_time - xmit_time - in_lags.append(lag) - if (lag > 0): - total_lag += lag - # print("%9.3f: grant offset %d arrived for id %d, data time %9.3f" % ( - # grant[1], grant_start, id, prev_data_time)) - -# Compute total amount of time during which at least one RPC was actively -# transmitting. -xmit_active_time = 0 -start_times = [] -end_times = [] -for id in out_data: - start_times.append(first_out[id]) - end_times.append(out_data[id][-1][0]) -start_times = sorted(start_times) -end_times = sorted(end_times) -num_active = 0 -active_start = 0 -while (len(start_times) > 0) or (len(end_times) > 0): - if len(start_times) > 0: - if (len(end_times) == 0) or (start_times[0] < end_times[0]): - if num_active == 0: - active_start = start_times[0] - num_active += 1 - start_times.pop(0) - continue - num_active -= 1 - if num_active == 0: - xmit_active_time += end_times[0] - active_start - end_times.pop(0) - -# Compute "Latency": delay between issuing a grant and receipt in homa_softirq -# of the first data packet that depended on that grant. - -for id in recv_grants: - if not id in softirq_data: - continue; - - data = softirq_data[id].copy() - for grant in recv_grants[id]: - while data and (data[0][1] <= grant[1]): - data.pop(0) - if not data: - break - latency = data[0][0] - grant[0] - if options.verbose: - print("%9.3f: grant lag %.1f us (%9.3f us), id %d, " - "range %d:%d" % (data[0][0], latency, grant[0], - id, grant[1], grant[2])) - latencies.append(latency) - -# Compute "Xmit Lag": time it takes after a data packet arrives in GRO to -# send a new grant enabled by that data packet. -xmit_lags = [] -for id in recv_grants: - if not id in gro_data: - continue - data = sorted(gro_data[id], key=lambda tuple : tuple[1]) - prev_data = None - for grant in recv_grants[id]: - if grant[1] == 0: - continue - while data and (data[0][1] + options.window) <= grant[1]: - prev_data = data[0] - data.pop(0) - if not data: - break - - # The current data packet is the one *just after* the one that - # triggered the current grant, so the lag is measured from the - # previous data packet. 
- if not prev_data: - # print("%9.3f: no prev_data for id %d, prev %d, grant %d, gro_data %s" % - # (grant[0], id, grant[1], grant[2], gro_data[id])) - continue - lag = grant[0] - prev_data[0] - xmit_lags.append(lag) - if options.verbose: - print("%9.3f: data packet %d-%d triggered grant %d-%d, at %9.3f" % - (prev_data[0], prev_data[1], data[0][1], - grant[1], grant[2], grant[0])) - -# Compute "Client Lag": time it takes after sending a data packet for -# homa_softirq to receive a new grant triggered by that packet. -client_lags = [] -for id in in_grants: - if not id in out_data: - continue - data = out_data[id].copy() - for grant in in_grants[id]: - while data and (data[0][1] + data[0][2] + options.window) <= grant[1]: - data.pop(0) - if not data: - break - - lag = grant[0] - data[0][0] - client_lags.append(lag) - if options.verbose: - print("%9.3f: client data packet %d-%d triggered grant %d-%d, at %9.3f" % - (data[0][0], data[0][1], data[0][1] + data[0][2], - grant[1], grant[2], grant[0])) - -latencies = sorted(latencies) -first_grants = sorted(first_grants) -in_lags = sorted(in_lags) -xmit_lags = sorted(xmit_lags) -client_lags = sorted(client_lags) -print("\nLatency: time from sending grant for an incoming message") -print(" (in homa_send_grants) to receiving first granted") -print(" data in Homa SoftIRQ") -print("First Lag: time from calling ip_queue_xmit for first data packet") -print(" until homa_softirq gets first grant") -print("Client Lag: time from calling ip_queue_xmit for a data packet") -print(" until homa_softirq gets grant triggered by that packet") -print("In Lag: time when a grant arrived, relative to time when") -print(" it was needed to send message at full bandwidth") -print(" (skips first grant for each message)") -print("Xmit Lag: time when a data packet arrives that allows a new") -print(" grant until the new grant is transmitted") -print("Pctile Latency First Lag Client Lag In Lag Xmit Lag") -for p in [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 99, 100]: - print("%3d %s %s %s %s %s" % (p, - percentile(latencies, p, "%6.1f us", " N/A"), - percentile(first_grants, p, "%7.1f us", " N/A"), - percentile(client_lags, p, "%7.1f us", " N/A"), - percentile(in_lags, p, "%6.1f us", " N/A"), - percentile(xmit_lags, p, "%6.1f us", " N/A"))) - -if latencies: - out_avg = "%6.1f us" % (sum(latencies)/len(latencies)) -else: - out_avg = " N/A" -if first_grants: - first_avg = "%6.1f us" % (sum(first_grants)/len(first_grants)) -else: - first_avg = " N/A" -if client_lags: - client_avg = "%6.1f us" % (sum(client_lags)/len(client_lags)) -else: - client_avg = " N/A" -if in_lags: - in_lags_avg = "%6.1f us" % (sum(in_lags)/len(in_lags)) -else: - in_lags_avg = " N/A" -if xmit_lags: - xmit_avg = "%6.1f us" % (sum(xmit_lags)/len(xmit_lags)) -else: - xmit_avg = " N/A" -print("Avg: %9s %9s %9s %9s %9s" % (out_avg, first_avg, client_avg, - in_lags_avg, xmit_avg)) - -if xmit_active_time != 0: - print("\nTotal data packet xmit delays because grants were slow:\n" - "%.1f us (%.1f%% of xmit active time)" % ( - total_lag, 100.0*total_lag/xmit_active_time)) - -in_deltas = sorted(in_deltas) -print("\nSizes of incoming grants (additional authorized data)") -print("Pctile Size") -for p in [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 99, 100]: - print("%3d %8s" %(p, percentile(in_deltas, p, "%d", "N/A"))) - -if len(in_deltas) == 0: - in_avg = "N/A" -else: - in_avg = "%.0f" % (sum(in_deltas)/len(in_deltas)) -print("Average %8s" % (in_avg)) - -print("\nStatistics taken every millisecond during trace:") 
-print("Grants: rate of incoming grants (Gbps)") -print("Xmit: rate of data transmission (Gbps)") -print("SGranted: outgoing msgs with grants / active outgoing msgs") -print("SGrants: available grants for outgoing msgs (MB)") -print("SBacklog: untransmitted data in outgoing messages (MB)") -print("RGranted: incoming msgs with grants / active incoming msgs") -print("RGrants: outstanding grants for incoming msgs (MB)") -print("RBacklog: unreceived data in incoming messages (MB)") -print("Interval Grants Xmit SGranted SGrants SBacklog RGranted RGrants RBacklog") -print(stats, end='') -print("Average: % 6.2f %6.2f" % (8*total_in_grants/(latest_time*1000), - 8*total_xmit/(latest_time*1000))) - -if end_grants != 0: - print("\nTransmit grants available at end of trace:") - print("RPC id Grants") - print(avail, end='') - print("Total %8.1f" % (end_grants/1000)) - -print("\nIncoming messages: active %d, granted %d, outstanding %.3f, backlog %.3f" - % (recv_stats["active"], recv_stats["granted"], - recv_stats["grants_pending"]/1e6, recv_stats["backlog"]/1e6)) - - -# Print information about messages with outstanding grants. The "age" for -# an RPC is how long ago the most recent grant was sent for the RPC. -partial = "" -num_partial = 0 -fully = "" -num_fully = 0 -for id in recv_lengths: - if id in recv_unscheduled: - unsched = recv_unscheduled[id] - else: - unsched = 20000 - length = recv_lengths[id] - recvd = 0 - - gaps = "" - if softirq_data[id]: - pkts = sorted(softirq_data[id], key=lambda tuple : tuple[1]) - next_offset = pkts[0][1] - for pkt in pkts: - while (next_offset < pkt[1]) and (next_offset < length): - if gaps: - gaps += " " - gaps += str(next_offset) - next_offset += pkt_length(next_offset, length, unsched) - next_offset += pkt_length(next_offset, length, unsched) - recvd = pkts[-1][1] + pkt_length(pkts[-1][1], length, unsched) - if recvd < 0: - print("Recvd %d, pkt_length %d" % (recvd, - pkt_length(pkts[-1][1], length, unsched))) - backlog = length - recvd - if backlog == 0: - continue - if (not id in recv_grants) or not recv_grants[id]: - continue - grant = recv_grants[id][-1] - granted = grant[2] - if granted <= recvd: - continue - if recvd < 0: - print("\nBogus recvd %d for id %d; pkts: %s" % (recvd, id, pkts)) - info = "%7.1f %10d %-7s %7d %7d %7d %s\n" % ( - latest_time - grant[0], id, peer_name(id), recvd, - granted-recvd, length, gaps) - if granted == length: - num_fully += 1 - fully += info - else: - num_partial += 1 - partial += info - -print("\nIncoming messages with outstanding grants:") -print("Age: usec since last grant sent") -print("Id: Identifier of RPC") -print("Peer: Sending host name") -print("Recvd: Offset just after the last byte received") -print("Out: Bytes granted beyond Recvd") -print("Length: Total bytes in message") -print("Gaps: Offsets of missing packets") -print("\nFully granted messages (%d):" % (num_fully)) -print(" Age Id Peer Recvd Out Length Gaps") -print(fully, end='') - -print("\nPartially granted messages (%d):" % (num_partial)) -print(" Age Id Peer Recvd Out Length Gaps") -print(partial, end='') \ No newline at end of file diff --git a/util/ttmlxalloc.py b/util/ttmlxalloc.py deleted file mode 100755 index 33fb83ac..00000000 --- a/util/ttmlxalloc.py +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/python3 - -# Copyright (c) 2022-2023 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause - -""" -Scans a time trace file for entries generated by the Mellanox driver about -packet allocations and deallocations (because the per-channel cache -overflowed 
or underflowed). If the file argument is present, it specifies -the name of the time trace file; otherwise time traces read from standard -input. If --verbose is specified, then the program outputs a new time trace -where adjacent allocate/free entries have been collapsed into a single entry -for ease of reading. Otherwise it just prints statistics about the cost -of allocation and deallocation -Usage: ttmlxalloc.py [--verbose] [file] -""" - -from __future__ import division, print_function -from glob import glob -from optparse import OptionParser -import math -import os -import re -import string -import sys - -verbose = False -f = sys.stdin - -# A dictionary where keys are core ids, and each value is the number -# of consecutive time trace entries for that core that are for page -# allocations/frees. -num_allocs = {} -num_frees = {} - -# A dictionary where keys are core ids, and each value is the time of -# the first page allocation/free entry in the current batch for that core. -first_alloc_time = {} -first_free_time = {} - -# A dictionary where keys are core ids, and each value is the time of -# the most recent page allocation/free for that core. -last_alloc_time = {} -last_free_time = {} - -# Time of previous time trace record that was printed. -prev_time = 0 - -# Each entry in this list is a count of the number of pages allocated/freed -# in one batch. -alloc_counts = [] -free_counts = [] - -# Each entry in this list is the time consumed by a single batch of page -# allocations/frees. -alloc_times = [] -free_times = [] - -# Dictionary whose keys are the ids of all the distinct RPCs seen in -# the trace. -ids = {} - -if (len(sys.argv) == 2) and (sys.argv[1] == "--help"): - print("Usage: %s [--stats] [file]" % (sys.argv[0])) - sys.exit(0) -if (len(sys.argv) >= 2) and (sys.argv[1] == "--verbose"): - verbose = True - sys.argv.pop(1) -if len(sys.argv) >= 2: - f = open(sys.argv[1]) - -for line in f: - match = re.match(' *([0-9.]+) us .* \[(C[0-9]+)\] (.*)', line) - if not match: - if verbose: - print(line) - continue - time = float(match.group(1)) - core = match.group(2) - msg = match.group(3) - if not core in num_allocs: - num_allocs[core] = 0 - num_frees[core] = 0 - match = re.match('.*id ([0-9.]+)', msg) - if match: - ids[match.group(1)] = 1 - - if 'mlx starting page alloc' in msg: - if num_allocs[core] == 0: - first_alloc_time[core] = time - last_alloc_time[core] = time - num_allocs[core] += 1 - continue - - if 'mlx starting page release' in msg: - if num_frees[core] == 0: - first_free_time[core] = time - last_free_time[core] = time - num_frees[core] += 1 - continue - - if 'mlx5e_rx_cache_get found ref count' in msg: - continue - - if num_allocs[core] != 0: - if verbose: - print("%9.3f us (+%8.3f us) [%s] mlx allocated %d pages (%.1f us)" % ( - time, time - prev_time, core, num_allocs[core], - last_alloc_time[core] - first_alloc_time[core])) - alloc_counts.append(num_allocs[core]) - alloc_times.append(last_alloc_time[core] - first_alloc_time[core]) - num_allocs[core] = 0 - prev_time = time - - if num_frees[core] != 0: - if verbose: - print("%9.3f us (+%8.3f us) [%s] mlx freed %d pages (%.1f us)" % ( - time, time - prev_time, core, num_frees[core], - last_free_time[core] - first_free_time[core])) - free_counts.append(num_frees[core]) - free_times.append(last_free_time[core] - first_free_time[core]) - num_frees[core] = 0 - prev_time = time - - if verbose: - print("%9.3f us (+%8.3f us) [%s] %s" % (time, time - prev_time, core, msg)) - -if verbose: - sys.exit(0) - -print("Total number of 
RPCs: %6d" % (len(ids))) -print("Total elapsed time: %8.1f us" % (prev_time)) -print("") -if len(alloc_counts) == 0: - print("No page allocations") -else: - print("Page allocations:") - print(" Total pages: %6d" % (sum(alloc_counts))) - print(" Batches: %5d" % (len(alloc_counts))) - print(" Average batch size: %7.1f" % (sum(alloc_counts)/len(alloc_counts))) - print(" Average batch time: %7.1f us" % (sum(alloc_times)/len(alloc_counts))) - print(" Alloc time per RPC: %7.1f us" % (sum(alloc_times)/len(ids))) - print(" Total time: %7.1f us (%.3f core)" % (sum(alloc_times), - sum(alloc_times)/prev_time)) -if len(free_counts) == 0: - print("No page frees") -else: - print("Page frees:") - print(" Total pages: %6d" % (sum(free_counts))) - print(" Batches: %5d" % (len(free_counts))) - print(" Average batch size: %7.1f" % (sum(free_counts)/len(free_counts))) - print(" Average batch time: %7.1f us" % (sum(free_times)/len(free_counts))) - print(" Free time per RPC: %7.1f us" % (sum(free_times)/len(ids))) - print(" Total time: %7.1f us (%.3f core)" % (sum(free_times), - sum(free_times)/prev_time)) \ No newline at end of file diff --git a/util/ttnicdelay.py b/util/ttnicdelay.py deleted file mode 100755 index ad113ed3..00000000 --- a/util/ttnicdelay.py +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/python3 - -# Copyright (c)2023 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause - -""" -This program looks for evidence suggesting that NICs are configured to -delay interrupts. It scans two timetraces for the same time interval, one -from a client and one from a server, looking for situations where the -server experiences a significant gap between two consecutive clients -even though the client transmitted them back-to-back. - -Usage: ttgap.py [--verbose] [client [server]] - -The "client" and "server" arguments give the names of the two timetrace -files; they default to client.tt and server.tt. One way to collect these -traces is by running "cp_node client --one-way --workload 500000" on the -client. -""" - -from __future__ import division, print_function -from glob import glob -from optparse import OptionParser -import math -import os -import re -import string -import sys -from statistics import median - -client_trace = "client.tt" -server_trace = "server.tt" -verbose = False -if (len(sys.argv) >= 2) and (sys.argv[1] == "--help"): - print("Usage: %s [--verbose] [client_trace [server_trace]]" % (sys.argv[0])) - sys.exit(0) -if (len(sys.argv) >= 2) and (sys.argv[1] == "--verbose"): - verbose = True - sys.argv.pop(1) -if len(sys.argv) >= 2: - client_trace = sys.argv[1] - sys.argv.pop(1) -if len(sys.argv) >= 2: - server_trace = sys.argv[1] - -# Information about each data packet sent by the client: the key has the -# form "id:offset", identifying a particular data packet. The value is -# a list of where time is the time when the packet was sent -# and gap is the elapsed time since the previous packet was sent. 
-client_packets = {} - -last_xmit = 0.0 -total_xmit_gap = 0.0 - -for line in open(client_trace): - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'Finished queueing packet: rpc id ([0-9]+), offset ([0-9]+)', line) - if match: - time = float(match.group(1)) - id = match.group(4) - offset = match.group(5) - key = id + ":" + offset - gap = time-last_xmit - if 0: - print("%9.3f: xmit %s, gap %.1f" % (time, key, gap)) - if (offset != "0") and (gap > 10.0): - total_xmit_gap += gap - if last_xmit > 0: - client_packets[id + ":" + offset] = [time, gap] - last_xmit = time - -last_recv = 0.0 -total_gap = 0.0 -num_gaps = 0 -num_pkts = 0 -last_gap_pkt = 0 -gap_offsets = [] - -for line in open(server_trace): - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'homa_gro_receive got packet .* id ([0-9]+), offset ([0-9]+)', line) - if match: - time = float(match.group(1)) - id = int(match.group(4)) - offset = match.group(5) - key = "%d:%s" % (id-1, offset) - gap = time - last_recv - last_recv = time - if (offset != "0") and (key in client_packets): - num_pkts += 1 - client_time, client_gap = client_packets[key] - if (gap > 20) and (client_gap < 5): - if verbose: - print("%9.3f: recv %s, gap %.1f, xmit_gap %.1f " - "(sent at %.3f), pkts since last gap %d" % ( - time, key, gap, client_gap, client_time, - num_pkts - last_gap_pkt)) - num_gaps += 1 - total_gap += gap - client_gap - last_gap_pkt = num_pkts - gap_offsets.append(int(offset)) - -print("%d unexpected gaps over %d packets" % (num_gaps, num_pkts)) -print("Total recv gap %.1f us (%.1f%% of elapsed time)" % (total_gap, - 100.0*total_gap/last_xmit)) -print("Average interval between gaps: %.1f packets" % (num_pkts/num_gaps)) -print("Average gap length: %.1f us" % (total_gap/num_gaps)) - -if verbose: - print("Total xmit gap %.1fus (%.1f%% of elapsed time)" % (total_xmit_gap, - 100.0*total_xmit_gap/last_xmit)) - -if 0: - gap_offsets = sorted(gap_offsets) - cur_offset = -1 - count = 0 - for offset in gap_offsets: - if offset != cur_offset: - if cur_offset >= 0: - print("%6d %d" % (cur_offset, count)) - cur_offset = offset - count = 0 - count += 1 - print("%6d %d" % (cur_offset, count)) \ No newline at end of file diff --git a/util/ttrcv.py b/util/ttrcv.py deleted file mode 100755 index 282eb733..00000000 --- a/util/ttrcv.py +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/python3 - -""" -Analyzes packet arrivals in a timetrace, outputs info on arrival times -for each offset in a message. - -Usage: ttrcv.py [--verbose] [trace] - -If no timetrace file is given, this script reads timetrace info from stdin. 
-""" - -from __future__ import division, print_function -from glob import glob -from optparse import OptionParser -import math -from operator import itemgetter -import os -import re -import string -import sys -from statistics import median - -# Parse command line options -parser = OptionParser(description= - 'Read a timetrace and output information about arrival times for ' - 'packets as a function of their offset in the message.', - usage='%prog [options] [trace]', - conflict_handler='resolve') -parser.add_option('--verbose', '-v', action='store_true', default=False, - dest='verbose', - help='print lots of output') - -# Most recent RPC id seen in a data packet -cur_id = 0 - -# True means a resend has been issued for cur_id -resend = False - -# Time when packet with offset 0 arrived for cur_id -offset0_time = 0 - -# Keys are offsets; values are lists of arrival times for that offset -arrivals = {} - -(options, extra) = parser.parse_args() -f = sys.stdin -if len(extra) > 0: - f = open(extra[0]) - if len(extra) > 1: - print("Unrecognized argument %s" % (extra[1])) - exit(1) - -for line in f: - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'homa_gro_receive got packet .* id ([0-9]+), offset ([0-9]+)', line) - if match: - time = float(match.group(1)) - core = match.group(3) - id = match.group(4) - offset = int(match.group(5)) - - if cur_id != id: - if offset != 0: - continue - cur_id = id - resend = False - offset0_time = time - if resend: - resend_info = " (after resend)" - else: - resend_info = "" - - if resend: - continue - - if not offset in arrivals: - arrivals[offset] = [time - offset0_time] - else: - arrivals[offset].append(time - offset0_time) - if options.verbose: - print("id %6s, offset %6d, time %9.3f%s" % (id, offset, - time - offset0_time, resend_info)) - - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'Sent RESEND for server RPC id ([0-9]+), .* offset ([0-9]+)*', line) - if match: - time = float(match.group(1)) - core = match.group(3) - id = match.group(4) - offset = int(match.group(5)) - - if id == cur_id: - resend = True - -offsets = sorted(arrivals.keys()) -for offset in offsets: - print("%6d: %8.3f - %8.3f" % (offset, min(arrivals[offset]), - max(arrivals[offset]))) \ No newline at end of file diff --git a/util/ttskbs.py b/util/ttskbs.py deleted file mode 100755 index 6e3c62ec..00000000 --- a/util/ttskbs.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/python3 - -# Copyright (c) 2022-2023 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause - -""" -Scans a time trace file to analyze the lifetimes of receive buffers -(e.g. how many are active at a time, how long they live, etc.) -Usage: ttskbs.py [--threshold t] [--verbose] [file] -The --threshold option specifies a time in usecs: info will be printed -for every buffer whose lifetime is at least that long. If --verbose is -specified then start and end times are printed for each buffer. 
-""" - -from __future__ import division, print_function -from glob import glob -from optparse import OptionParser -import math -import os -import re -import string -import sys - -# Parse command line options -parser = OptionParser(description= - 'Read a timetrace and output information about lifetimes of incoming ' - 'packet buffers.', - usage='%prog [options] [trace]', - conflict_handler='resolve') -parser.add_option('--id', type='string', dest='id', default=0, - help='print lifetime information on a packet-by-packet basis ' - 'for this RPC id') -parser.add_option('--threshold', type='float', dest='threshold', default=0, - help='print packets with lifetimes longer than this') -parser.add_option('--verbose', '-v', action='store_true', default=False, - dest='verbose', - help='print lots of output') - -(options, extra) = parser.parse_args() -f = sys.stdin -if len(extra) > 0: - f = open(extra[0]) - if len(extra) > 1: - print("Unrecognized argument %s" % (extra[1])) - exit(1) - -# Dictionary with one entry for each RPC, keyed by RPC id. -# Each entry is a dictionary keyed by offset, containing one entry for -# each packet buffer currently active for that RPC. -# Each of these entries is a dictionary with information about that packet: -# core: core on which the packet was received. -# gro: time when the buffer was seen by gro_receive -# softirq_start: time when homa_softirq woke up (eventually processed buffer) -# softirq: time when homa_softirq processed this buffer -# copy_start: time when homa_copy_out started processing a batch of -# buffers containing this one -# free: time when homa_copy_out freed this buffer -rpcs = {} - -num_active = 0 -max_active = 0 - -# List whose entries are the lifetimes of individual data packets. -lifetimes = [] - -# Dictionary with one entry for each core (keyed by core name); value -# is a list of lifetimes for packets received on that core. -core_lifetimes = {} - -# Dictionary where keys are core ids and values are the most recent time -# homa_softirq started executing on that core. -softirq_start = {} - -# Dictionary where keys are RPC ids and values are the most recent time -# homa_copy_out started copying out packets for that RPC. -copy_out_start = {} - -# Dictionary where each key is an RPC ids and each value is the offset of the -# last packet for that RPC that has been copied to user space. 
-last_offsets = {} - -earliest_time = 0 -latest_time = 0 - -for line in f: - match = re.match(' *([0-9.]+) us .* \[(C[0-9]+)\] (.*)', line) - if not match: - continue - time = float(match.group(1)) - core = match.group(2) - msg = match.group(3) - - match = re.match('.* id ([0-9.]+).*offset ([0-9.]+)', msg) - if match: - latest_time = time - id = match.group(1) - offset = int(match.group(2)) - if not id in rpcs: - rpcs[id] = {} - rpc = rpcs[id] - - if "_gro_receive got packet" in msg: - rpc[offset] = {"gro": time, "core": core} - num_active += 1 - if num_active > max_active: - max_active = num_active - if options.verbose: - print("%9.3f: allocate %s:%d (%d now active)" % ( - time, id, offset, num_active)) - if earliest_time == 0: - earliest_time = time - - if offset not in rpc: - continue - pkt = rpc[offset] - - if "incoming data packet" in msg: - pkt["softirq_start"] = softirq_start[core] - pkt["softirq"] = time - - if "homa_softirq: first packet" in msg: - softirq_start[core] = time - - match = re.match('.*starting copy to user space for id ([0-9.]+)', msg) - if match: - copy_out_start[match.group(1)] = time - - match = re.match('.*finished copying .* last offset ([0-9.]+)', msg) - if match: - offset = int(match.group(1)) - last_offsets[id] = offset - - match = re.match('.*finished freeing .* for id ([0-9.]+)', msg) - if match: - id = match.group(1) - if (not id in rpcs) or (not id in last_offsets): - continue - rpc = rpcs[id] - for offset, pkt in rpc.items(): - if (offset <= last_offsets[id]) and ("free" not in pkt): - pkt["copy_start"] = copy_out_start[id] - pkt["free"] = time - lifetime = time - pkt["gro"] - lifetimes.append(lifetime) - pkt_core = pkt["core"] - if not pkt_core in core_lifetimes: - core_lifetimes[pkt_core] = [] - core_lifetimes[pkt_core].append(lifetime) - num_active -= 1 - if options.verbose: - print("%9.3f: free %s:%d after %.1f us (%d now active)" % - (time, id, offset, lifetime, num_active)) - elif (options.threshold > 0) and (lifetime >= options.threshold): - print("%9.3f: packet %s:%d lifetime %5.1f usec " - "(alloced on %s at %9.3f)" % (time, - id, offset, lifetime, pkt_core, pkt["gro"])) - -if len(lifetimes) == 0: - print("No packets found with complete life cycle") - exit(1) - -print("Maximum number of active skbs: %d" % (max_active)) -print("Total lifetimes: %d" % (len(lifetimes))) - -# Lists of elapsed times from one event to another: -gro_to_softirq_start = [] -softirq_start_to_softirq = [] -softirq_to_copy_start = [] -copy_start_to_free = [] - -for id, rpc in rpcs.items(): - for offset, pkt in rpc.items(): - if (not "softirq_start" in pkt) or (not "softirq" in pkt) \ - or (not "copy_start" in pkt) or (not "free" in pkt): - continue - gro_to_softirq_start.append(pkt["softirq_start"] - pkt["gro"]) - softirq_start_to_softirq.append(pkt["softirq"] - pkt["softirq_start"]) - softirq_to_copy_start.append(pkt["copy_start"] - pkt["softirq"]) - copy_start_to_free.append(pkt["free"] - pkt["copy_start"]) -gro_to_softirq_start = sorted(gro_to_softirq_start) -softirq_start_to_softirq = sorted(softirq_start_to_softirq) -softirq_to_copy_start = sorted(softirq_to_copy_start) -copy_start_to_free = sorted(copy_start_to_free) -lifetimes = sorted(lifetimes) - -print(" Duration (usecs)") -print("Phase of packet lifetime P10 P50 P90 Max") -print("---------------------------------------------------------------------------") -l = len(gro_to_softirq_start) -print("GRO -> homa_softirq invocation: %6.1f %6.1f %6.1f %7.1f" % ( - gro_to_softirq_start[10*l//100], - 
gro_to_softirq_start[50*l//100], - gro_to_softirq_start[90*l//100], - gro_to_softirq_start[l-1])) -l = len(softirq_start_to_softirq) -print("homa_softirq_invocation -> SoftIRQ for packet %6.1f %6.1f %6.1f %7.1f" % ( - softirq_start_to_softirq[10*l//100], - softirq_start_to_softirq[50*l//100], - softirq_start_to_softirq[90*l//100], - softirq_start_to_softirq[l-1])) -l = len(softirq_to_copy_start) -print("SoftIRQ for packet -> copy_out invocation %6.1f %6.1f %6.1f %7.1f" % ( - softirq_to_copy_start[10*l//100], - softirq_to_copy_start[50*l//100], - softirq_to_copy_start[90*l//100], - softirq_to_copy_start[l-1])) -l = len(copy_start_to_free) -print("copy_out invocation -> packet free %6.1f %6.1f %6.1f %7.1f" % ( - copy_start_to_free[10*l//100], - copy_start_to_free[50*l//100], - copy_start_to_free[90*l//100], - copy_start_to_free[l-1])) -l = len(lifetimes) -print("End to end lifetime (GRO -> free) %6.1f %6.1f %6.1f %7.1f" % ( - lifetimes[10*l//100], - lifetimes[50*l//100], - lifetimes[90*l//100], - lifetimes[l-1])) - -# Print lifetime information by core - -cores = sorted(core_lifetimes.keys()) -print("\nLifetimes by core (usec):") -print("Core P10 P50 P90 Max Samples Kpkt/s") -print("-------------------------------------------------") -for core in cores: - sorted_lifetimes = sorted(core_lifetimes[core]) - l = len(sorted_lifetimes) - print("%s %6.1f %6.1f %6.1f %7.1f %5d %5.1f" % (core, - sorted_lifetimes[10*l//100], - sorted_lifetimes[50*l//100], - sorted_lifetimes[90*l//100], - sorted_lifetimes[-1], - l, 1000*l/(latest_time - earliest_time))) - -if options.id != 0: - hdr = "Packets for id %s:" % (options.id) - print("\n%s" % (hdr)) - print("-" * len(hdr)) - if options.id not in rpcs: - print("No packets for RPC id %d" % (options.id)) - else: - rpc = rpcs[options.id] - print("Offset Lifetime (usec)"); - for offset in sorted(rpc.keys()): - pkt = rpc[offset] - print("%6s %7.1f" % (offset, pkt["free"] - pkt["gro"])) - diff --git a/util/ttsoftirq.py b/util/ttsoftirq.py deleted file mode 100755 index 7d341330..00000000 --- a/util/ttsoftirq.py +++ /dev/null @@ -1,139 +0,0 @@ -#!/usr/bin/python3 - -""" -Analyzes softirq wakeup times in a timetrace. Also analyzes how long -it takes from when the NAPI layer receives a packet until the packet -is released in homa_copy_to_user. -Usage: softirq.py [tt_file] - -The existing timetrace is in tt_file (or stdin in tt_file is omitted). -""" - -from __future__ import division, print_function -from glob import glob -from optparse import OptionParser -import math -import os -import re -import string -import sys - -verbose = False -show_idle = False -f = sys.stdin - -while (len(sys.argv) > 1) and sys.argv[1].startswith("--"): - if sys.argv[1] == "--help": - print("Usage: %s [--verbose] [--show-idle] [file]" % (sys.argv[0])) - sys.exit(0) - if sys.argv[1] == "--verbose": - verbose = True - sys.argv.pop(1) - continue - if sys.argv[1] == "--show-idle": - show_idle = True - sys.argv.pop(1) - continue -if len(sys.argv) >= 2: - f = open(sys.argv[1]) - -queued = {} -delays = [] - -# One entry for each packet seen by homa_gro_receive. Key is "rpcId:offset", -# value is arrival time in homa_gro_receive. -arrivals = {} - -# One entry for each batch of packets freed by homa_copy_to_user. Value is -# elapsed time since packet was seen by homa_gro_receive. -lifetimes = [] - -# Keys are core numbers, values are time of last log entry seen for that core. 
-core_last = {} - -# Keys are core numbers, values are the core's idle time (elapsed time with -# no log entries) as of the most recent "enqueue_to_backlog" entry -# targeting that core. -idle_before_wakeup = {} - -# Contains one entry for each softirq wakeup, which is a dictionary -# with the following fields: -# core - The core on which homa_softirq ran -# time - Time when homa_softirq woke up -# delay - Elapsed time since enqueue_to_backlog was most recently invoked -# idle - How long the core was idle at the time of enqueue_to_backlog -wakeups = [] - -for line in f: - match = re.match(' *([-0-9.]+) us .* \[C([0-9]+)\]', line) - if not match: - continue - time = float(match.group(1)) - core = int(match.group(2)) - core_last[core] = time - - match = re.match('.*enqueue_to_backlog.* cpu ([0-9]+)', line) - if match: - dest = int(match.group(1)) - queued[dest] = time - if not dest in core_last: - core_last[dest] = 0.0 - idle_before_wakeup[dest] = time - core_last[dest] - - match = re.match('.*homa_softirq: first packet', line) - if match: - if core in queued: - delay = time - queued[core] - delays.append(delay) - if (delay > 10.0) and verbose: - print("%9.3f [C%02d] Long SoftIRQ delay: %.1f usec, " - "idle %.1f usec" % - (time, core, delay, idle_before_wakeup[core])) - wakeups.append({"time": time, "core": core, "delay": delay, - "idle": idle_before_wakeup[core]}); - del queued[core] - - match = re.match('.*homa_gro_receive got packet .* id ([0-9]+), ' - 'offset ([0-9]+),', line) - if match: - key = match.group(1) + ":" + match.group(2) - arrivals[key] = time - - match = re.match('.*finished copying .* bytes for id ([0-9]+), ' - '.* last offset ([0-9]+)', line) - if match: - key = match.group(1) + ":" + match.group(2) - if key in arrivals: - lifetime = time - arrivals[key] - lifetimes.append(lifetime) - if False and verbose: - print("%9.3f Packets freed with lifetime %5.1f us" - % (time, lifetime)) - -delays.sort() -print("Total SoftIRQ wakeup data points: %d" % (len(delays))) -print("Minimum delay: %4.1f usec" % (delays[0])) -print("Median delay: %4.1f usec" % (delays[len(delays)//2])) -print("P90 delay: %4.1f usec" % (delays[len(delays)*9//10])) -print("P99 delay: %4.1f usec" % (delays[len(delays)*99//100])) -print("Maximum delay: %4.1f usec" % (delays[-1])) - -print("") -if len(lifetimes) == 0: - print("Couldn't extract information on receive packet lifetimes"); - exit(1) - -lifetimes.sort() -print("Total packet lifetime data points: %d" % (len(lifetimes))) -print("Minimum lifetime: %4.1f usec" % (lifetimes[0])) -print("Median lifetime: %4.1f usec" % (lifetimes[len(lifetimes)//2])) -print("P90 lifetime: %4.1f usec" % (lifetimes[len(lifetimes)*9//10])) -print("P99 lifetime: %4.1f usec" % (lifetimes[len(lifetimes)*99//100])) -print("Maximum lifetime: %4.1f usec" % (lifetimes[-1])) - -if show_idle: - wakeups.sort(key=lambda item: item["delay"], reverse=True) - for wakeup in wakeups: - print("%9.3f -> %9.3f [C%02d] delay %5.1f, idle %5.1f" % ( - wakeup["time"] - wakeup["delay"], wakeup["time"], - wakeup["core"], wakeup["delay"], wakeup["idle"])) \ No newline at end of file diff --git a/util/ttxmit.py b/util/ttxmit.py deleted file mode 100755 index 964f475a..00000000 --- a/util/ttxmit.py +++ /dev/null @@ -1,250 +0,0 @@ -#!/usr/bin/python3 - -""" -Analyzes packet transmissions in a timetrace to find gaps where the -uplink was unnecessarily idle. - -Usage: ttxmit.py [--verbose] [--gbps n] [trace] - -If no timetrace file is given, this script reads timetrace info from stdin. 
-""" - -from __future__ import division, print_function -from glob import glob -from optparse import OptionParser -import math -from operator import itemgetter -import os -import re -import string -import sys -from statistics import median - -# Parse command line options -parser = OptionParser(description= - 'Read a timetrace and output information about gaps in data packet ' - 'transmissions.', - usage='%prog [options] [trace]', - conflict_handler='resolve') -parser.add_option('--verbose', '-v', action='store_true', default=False, - dest='verbose', - help='print lots of output') -parser.add_option('--gbps', type='int', dest='gbps', default=25, - help='network speed in Gbps') - -(options, extra) = parser.parse_args() -f = sys.stdin -if len(extra) > 0: - f = open(extra[0]) - if len(extra) > 1: - print("Unrecognized argument %s" % (extra[1])) - exit(1) - -# Time when all of the output packets presented to the NIC will have -# been fully transmitted. -idle_time = 0 - -# Will eventually hold the amount of data in a full-sized output -# packet (before GSO chops it up). -packet_size = 1000 - -# Dictionary holding one entry for each RPC that is currently active -# (some of its bytes have been transmitted, but not all). Index is -# RPC id, value is a list giving time when most recent -# packet was transmitted for the RPC, offset of the packet's data. -active_rpcs = {} - -# Total number of RPCs that completed during the trace. -completed_rpcs = 0 - -# Total time when there was at least one active RPC. -active_usecs = 0 - -# Total time in all gaps -gap_usecs = 0 - -# Time when len(total_active_time) went from 0 to 1. -active_start = 0 - -# Time when len(total_active_time) become 0. -active_end = 0 - -# Total number of data packets sent. -total_packets = 0 - -# Total amount of data transmitted. -total_bytes = 0 - -# Total number of packets that experienced gaps >= long_gap. -long_gaps = 0 - -# Threshold length for a gap to be considered "long". -long_gap = 2.0 - -# One entry for each period of time when the uplink was idle yet there -# were active outgoing RPCs. Value is a list : duration is the length of the gap, start end end give the range of -# the idle period, active counts the number of active RPCs at the end of the -# interval, and id and offset identify the packet whose transmission ended -# the gap. -gaps = [] - -# Holds the duration of all the gaps that were caused by lack of grants. -grant_gaps = [] - -# One entry for each period of time when there were no active RPCS. -# Each entry is a list : duration is the length -# of the gap, start and end give the range, and id identifies the RPC that -# ended the gap. -inactive_gaps = [] - -# Keys are RPC ids; each value is the total number of bytes granted for -# that RPC (i.e. the index of the first byte not yet granted). -granted = {} - -# Keys are RPC ids, values are meaningless. If an entry is present, it -# means that the most recently transmitted packet used up all of the -# granted bytes, so the next packet will have to wait for a grant. 
-needs_grant = {} - -for line in f: - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'calling .*_xmit: wire_bytes ([0-9]+), .* id ([0-9]+), ' - 'offset ([0-9]+)', line) - if match: - time = float(match.group(1)) - core = match.group(3) - length = int(match.group(4)) - id = match.group(5) - offset = int(match.group(6)) - - total_packets += 1 - total_bytes += length - if packet_size < length: - packet_size = length - - if (idle_time < time) and (len(active_rpcs) > 0): - gap_length = time - idle_time - gaps.append([gap_length, idle_time, time, len(active_rpcs), id, offset]) - gap_usecs += gap_length - if gap_length >= long_gap: - long_gaps += 1 - if id in needs_grant: - grant_gaps.append(gap_length) - - if (id in granted) and ((offset + length) >= granted[id]): - needs_grant[id] = True - else: - needs_grant.pop(id, None) - - if len(active_rpcs) == 0: - if idle_time < time: - active_start = time - if active_end != 0: - inactive_gaps.append([time - active_end, active_end, time, id]) - else: - active_start = idle_time - - xmit_time = (length * 8)/(options.gbps * 1000) - if (idle_time < time): - idle_time = time + xmit_time - else: - idle_time += xmit_time - - if length < packet_size: - active_rpcs.pop(id, None) - completed_rpcs += 1 - else: - active_rpcs[id] = [time, id] - - if len(active_rpcs) == 0: - active_usecs += idle_time - active_start - active_end = idle_time - - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'processing grant for id ([0-9]+), offset ([0-9]+)', line) - if match: - id = match.group(4) - offset = int(match.group(5)) - granted[id] = offset - - match = re.match(' *([-0-9.]+) us \(\+ *([-0-9.]+) us\) \[C([0-9]+)\] ' - 'starting copy from user .* id ([0-9]+),.* unscheduled ([0-9]+)', - line) - if match: - id = match.group(4) - unsched = int(match.group(5)) - granted[id] = unsched - -if len(active_rpcs): - active_usecs += time - active_start - -print("RPC active time: %9.1f us (%.1f%% of elapsed time)" % ( - active_usecs, 100.0*active_usecs/time)) -print("Total xmit gaps: %9.1f us (%.1f%% of active time)" % ( - gap_usecs, 100.0*gap_usecs/active_usecs)) -print("Average xmit gap: %9.1f us" % (gap_usecs/total_packets)) -grant_gap_usecs = sum(grant_gaps) -if grant_gap_usecs > 0: - print("Gaps caused by delayed grants: %9.1f us (%.1f%% of all gap time)" % ( - grant_gap_usecs, 100.0*grant_gap_usecs/gap_usecs)) -print("%d data packets (%.1f%% of all packets) were delayed waiting for grants" - % (len(grant_gaps), 100*len(grant_gaps)/total_packets)) -print('%d data packets (%.1f%% of all packets) were delayed by gaps ' - '>= %.1f us' % (long_gaps, 100*long_gaps/ total_packets, - long_gap)) -print("Network bandwidth consumed when RPCs active: %.1f Gbps" % ( - total_bytes*8.0/(active_usecs*1e03))) -if (completed_rpcs > 0): - print("Average delay/RPC caused by missing grants: %.1f usec" % ( - grant_gap_usecs/completed_rpcs)) - -gaps = sorted(gaps, key=itemgetter(0), reverse=True) -print("\nLongest gaps:") -count = 0 -for gap in gaps: - print("%9.3f: gap of %5.1f us (starting at %9.3f), id %s, offset %d" % ( - gap[2], gap[0], gap[1], gap[4], gap[5])) - count += 1 - if count >= 10: - break - -if len(gaps) > 0: - gaps.reverse() - print("\nGap CDF (% of total xmit gap time in gaps <= given size):") - print("Percent Gap") - pctl = 0 - total_usecs = 0 - for gap in gaps: - total_usecs += gap[0] - if (total_usecs >= pctl*gap_usecs/100): - print("%5d %5.1f us" % (pctl, gap[0])) - pctl += 10 - if pctl <= 100: - print("%5d %5.1f us" % (100, 
gaps[-1][0])) - -if len(grant_gaps) > 0: - grant_gaps = sorted(grant_gaps) - print("\nCDF of gaps caused by grants (% of total grant gap time " - "in gaps <= given size):") - print("Percent Gap") - pctl = 0 - total_usecs = 0 - for gap in grant_gaps: - total_usecs += gap - if (total_usecs >= pctl*grant_gap_usecs/100): - print("%5d %5.1f us" % (pctl, gap)) - pctl += 10 - if pctl <= 100: - print("%5d %5.1f us" % (100, grant_gaps[-1])) - -if inactive_gaps: - inactive_gaps = sorted(inactive_gaps, key=itemgetter(0), reverse=True) - print("\nLongest intervals with no active RPCs:") - count = 0 - for gap in inactive_gaps: - print("%9.3f: %5.1f us starting at %9.3f, ending with id %s" % ( - gap[2], gap[0], gap[1], gap[3])) - count += 1 - if count >= 10: - break \ No newline at end of file From ef10deabbc6eb69258720ab952d462570c3d4a3a Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 26 Sep 2024 10:07:57 -0700 Subject: [PATCH 021/625] Fix additional issues from checkpatch commit (related to sockaddr_in_union) --- homa_api.c | 8 ++++---- util/buffer_server.c | 2 +- util/send_raw.c | 4 ++-- util/test_utils.h | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/homa_api.c b/homa_api.c index c2b31f43..4969faca 100644 --- a/homa_api.c +++ b/homa_api.c @@ -35,7 +35,7 @@ * error occurred, -1 is returned and errno is set appropriately. */ ssize_t homa_reply(int sockfd, const void *message_buf, size_t length, - const sockaddr_in_union *dest_addr, uint64_t id) + const union sockaddr_in_union *dest_addr, uint64_t id) { struct homa_sendmsg_args args; struct iovec vec; @@ -78,7 +78,7 @@ ssize_t homa_reply(int sockfd, const void *message_buf, size_t length, * error occurred, -1 is returned and errno is set appropriately. */ ssize_t homa_replyv(int sockfd, const struct iovec *iov, int iovcnt, - const sockaddr_in_union *dest_addr, uint64_t id) + const union sockaddr_in_union *dest_addr, uint64_t id) { struct homa_sendmsg_args args; struct msghdr hdr; @@ -113,7 +113,7 @@ ssize_t homa_replyv(int sockfd, const struct iovec *iov, int iovcnt, * error occurred, -1 is returned and errno is set appropriately. */ int homa_send(int sockfd, const void *message_buf, size_t length, - const sockaddr_in_union *dest_addr, uint64_t *id, + const union sockaddr_in_union *dest_addr, uint64_t *id, uint64_t completion_cookie) { struct homa_sendmsg_args args; @@ -162,7 +162,7 @@ int homa_send(int sockfd, const void *message_buf, size_t length, * error occurred, -1 is returned and errno is set appropriately. 
*/ int homa_sendv(int sockfd, const struct iovec *iov, int iovcnt, - const sockaddr_in_union *dest_addr, uint64_t *id, + const union sockaddr_in_union *dest_addr, uint64_t *id, uint64_t completion_cookie) { struct homa_sendmsg_args args; diff --git a/util/buffer_server.c b/util/buffer_server.c index 77d0f488..e98f8e0e 100644 --- a/util/buffer_server.c +++ b/util/buffer_server.c @@ -25,7 +25,7 @@ int main(int argc, char** argv) { int fd, port; int optval = 1; - sockaddr_in_union bindAddress; + union sockaddr_in_union bindAddress; if (argc < 2) { printf("Usage: %s port\n", argv[0]); diff --git a/util/send_raw.c b/util/send_raw.c index f470bce3..41b6d8af 100644 --- a/util/send_raw.c +++ b/util/send_raw.c @@ -26,7 +26,7 @@ int main(int argc, char** argv) { char *message; char *host; int protocol; - sockaddr_in_union *addr; + union sockaddr_in_union *addr; uint8_t *bytes; if (argc < 3) { @@ -55,7 +55,7 @@ int main(int argc, char** argv) { host, gai_strerror(status)); exit(1); } - addr = (sockaddr_in_union*) result->ai_addr; + addr = (union sockaddr_in_union*) result->ai_addr; bytes = (uint8_t *) &addr->in4.sin_addr; printf("Destination address: %x (%d.%d.%d.%d)\n", addr->in4.sin_addr.s_addr, bytes[0], bytes[1], bytes[2], bytes[3]); diff --git a/util/test_utils.h b/util/test_utils.h index ec9a117b..a9c9a7c8 100644 --- a/util/test_utils.h +++ b/util/test_utils.h @@ -28,7 +28,7 @@ extern double get_cycles_per_sec(); extern int get_int(const char *s, const char *msg); extern void pin_thread(int core); extern const char* - print_address(const sockaddr_in_union *addr); + print_address(const union sockaddr_in_union *addr); extern void print_dist(uint64_t times[], int count); extern void seed_buffer(void *buffer, size_t length, int seed); #ifdef __cplusplus From dcebb4ed466b83f2bb9fc28e553c423503ee5818 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 26 Sep 2024 10:45:20 -0700 Subject: [PATCH 022/625] Extract homa_wire.h from homa_impl.h --- homa_grant.c | 1 + homa_impl.h | 478 +----------------------------------------------- homa_incoming.c | 3 +- homa_outgoing.c | 1 + homa_peertab.c | 2 +- homa_wire.h | 477 +++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 485 insertions(+), 477 deletions(-) create mode 100644 homa_wire.h diff --git a/homa_grant.c b/homa_grant.c index 6bb84672..224b526a 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -5,6 +5,7 @@ */ #include "homa_impl.h" +#include "homa_wire.h" /** * homa_grant_outranks() - Returns nonzero if rpc1 should be considered diff --git a/homa_impl.h b/homa_impl.h index b8ec2208..e91aaa76 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -48,6 +48,8 @@ #pragma GCC diagnostic warning "-Wpointer-sign" #pragma GCC diagnostic warning "-Wunused-variable" +#include "homa_wire.h" + #ifdef __UNIT_TEST__ #undef alloc_pages #define alloc_pages mock_alloc_pages @@ -144,87 +146,11 @@ extern struct homa_core *homa_cores[]; extern struct homa_numa *homa_numas[]; extern int homa_num_numas; -/** - * enum homa_packet_type - Defines the possible types of Homa packets. - * - * See the xxx_header structs below for more information about each type. - */ -enum homa_packet_type { - DATA = 0x10, - GRANT = 0x11, - RESEND = 0x12, - UNKNOWN = 0x13, - BUSY = 0x14, - CUTOFFS = 0x15, - FREEZE = 0x16, - NEED_ACK = 0x17, - ACK = 0x18, - BOGUS = 0x19, /* Used only in unit tests. */ - /* If you add a new type here, you must also do the following: - * 1. Change BOGUS so it is the highest opcode - * 2. 
Add support for the new opcode in homa_print_packet, - * homa_print_packet_short, homa_symbol_for_type, and mock_skb_new. - * 3. Add the header length to header_lengths in homa_plumbing.c. - */ -}; - -/** define HOMA_IPV6_HEADER_LENGTH - Size of IP header (V6). */ -#define HOMA_IPV6_HEADER_LENGTH 40 - -/** define HOMA_IPV4_HEADER_LENGTH - Size of IP header (V4). */ -#define HOMA_IPV4_HEADER_LENGTH 20 - -/** - * define HOMA_SKB_EXTRA - How many bytes of additional space to allow at the - * beginning of each sk_buff, before the IP header. This includes room for a - * VLAN header and also includes some extra space, "just to be safe" (not - * really sure if this is needed). - */ -#define HOMA_SKB_EXTRA 40 - -/** - * define HOMA_ETH_OVERHEAD - Number of bytes per Ethernet packet for Ethernet - * header, CRC, preamble, and inter-packet gap. - */ -#define HOMA_ETH_OVERHEAD 42 - -/** - * define HOMA_MIN_PKT_LENGTH - Every Homa packet must be padded to at least - * this length to meet Ethernet frame size limitations. This number includes - * Homa headers and data, but not IP or Ethernet headers. - */ -#define HOMA_MIN_PKT_LENGTH 26 - -/** - * define HOMA_MAX_HEADER - Number of bytes in the largest Homa header. - */ -#define HOMA_MAX_HEADER 90 - -/** - * define ETHERNET_MAX_PAYLOAD - Maximum length of an Ethernet packet, - * excluding preamble, frame delimeter, VLAN header, CRC, and interpacket gap; - * i.e. all of this space is available for Homa. - */ -#define ETHERNET_MAX_PAYLOAD 1500 - -/** - * define HOMA_MAX_PRIORITIES - The maximum number of priority levels that - * Homa can use (the actual number can be restricted to less than this at - * runtime). Changing this value will affect packet formats. - */ -#define HOMA_MAX_PRIORITIES 8 - #define sizeof32(type) ((int) (sizeof(type))) /** define CACHE_LINE_SIZE - The number of bytes in a cache line. */ #define CACHE_LINE_SIZE 64 -/** - * define NUM_PEER_UNACKED_IDS - The number of ids for unacked RPCs that - * can be stored in a struct homa_peer. - */ -#define NUM_PEER_UNACKED_IDS 5 - /** * define HOMA_MAX_GRANTS - Used to size various data structures for grant * management; the max_overcommit sysctl parameter must never be greater than @@ -251,404 +177,6 @@ struct homa_cache_line { char bytes[64]; }; -/** - * struct common_header - Wire format for the first bytes in every Homa - * packet. This must (mostly) match the format of a TCP header to enable - * Homa packets to actually be transmitted as TCP packets (and thereby - * take advantage of TSO and other features). - */ -struct common_header { - /** - * @sport: Port on source machine from which packet was sent. - * Must be in the same position as in a TCP header. - */ - __be16 sport; - - /** - * @dport: Port on destination that is to receive packet. Must be - * in the same position as in a TCP header. - */ - __be16 dport; - - /** - * @sequence: corresponds to the sequence number field in TCP headers; - * used in DATA packets to hold the offset in the message of the first - * byte of data. However, when TSO is used without TCP hijacking, this - * value will only be correct in the first segment of a GSO packet. - */ - __be32 sequence; - - /** - * The fields below correspond to the acknowledgment field in TCP - * headers; not used by Homa, except for the low-order 8 bits, which - * specify the Homa packet type (one of the values in the - * homa_packet_type enum). 
- */ - __be16 ack1; - __u8 ack2; - __u8 type; - - /** - * @doff: High order 4 bits holds the number of 4-byte chunks in a - * data_header (low-order bits unused). Used only for DATA packets; - * must be in the same position as the data offset in a TCP header. - * Used by TSO to determine where the replicated header portion ends. - */ - __u8 doff; - - /** - * @flags: Holds TCP flags such as URG, ACK, etc. The special value - * HOMA_TCP_FLAGS is stored here to distinguish Homa-over-TCP packets - * from real TCP packets. It includes the SYN and RST flags, - * which TCP would never use together; must not include URG or FIN - * (TSO will turn off FIN for all but the last segment). - */ - __u8 flags; -#define HOMA_TCP_FLAGS 6 - - /** - * @window: Corresponds to the window field in TCP headers. Not used - * by HOMA. - */ - __be16 window; - - /** - * @checksum: not used by Homa, but must occupy the same bytes as - * the checksum in a TCP header (TSO may modify this?). - */ - __be16 checksum; - - /** - * @urgent: occupies the same bytes as the urgent pointer in a TCP - * header. When Homa packets are transmitted over TCP, this has the - * special value HOMA_TCP_URGENT (which is set even though URG is - * not set) to indicate that the packet is actually a Homa packet. - */ - __be16 urgent; -#define HOMA_TCP_URGENT 0xb97d - - /** - * @sender_id: the identifier of this RPC as used on the sender (i.e., - * if the low-order bit is set, then the sender is the server for - * this RPC). - */ - __be64 sender_id; -} __packed; - -/** - * struct homa_ack - Identifies an RPC that can be safely deleted by its - * server. After sending the response for an RPC, the server must retain its - * state for the RPC until it knows that the client has successfully - * received the entire response. An ack indicates this. Clients will - * piggyback acks on future data packets, but if a client doesn't send - * any data to the server, the server will eventually request an ack - * explicitly with a NEED_ACK packet, in which case the client will - * return an explicit ACK. - */ -struct homa_ack { - /** - * @id: The client's identifier for the RPC. 0 means this ack - * is invalid. - */ - __be64 client_id; - - /** @client_port: The client-side port for the RPC. */ - __be16 client_port; - - /** @server_port: The server-side port for the RPC. */ - __be16 server_port; -} __packed; - -/* struct data_header - Contains data for part or all of a Homa message. - * An incoming packet consists of a data_header followed by message data. - * An outgoing packet can have this simple format as well, or it can be - * structured as a GSO packet. 
Homa supports two different formats for GSO - * packets, depending on whether TCP hijacking is enabled: - * - * No hijacking: TCP hijacking: - * - * |-----------------------| |-----------------------| - * | | | | - * | data_header | | data_header | - * | | | | - * |---------------------- | |-----------------------| - * | | | | - * | | | | - * | segment data | | segment data | - * | | | | - * | | | | - * |-----------------------| |-----------------------| - * | seg_header | | | - * |-----------------------| | | - * | | | segment data | - * | | | | - * | segment data | | | - * | | |-----------------------| - * | | | | - * |-----------------------| | | - * | seg_header | | segment data | - * |-----------------------| | | - * | | | | - * | | |-----------------------| - * | segment data | - * | | - * | | - * |-----------------------| - * - * With TCP hijacking, TSO will automatically adjust @common.sequence in - * the segments, so that value can be used as the offset of the data within - * the message. Without TCP hijacking, TSO will not adjust @common.sequence - * in the segments, so Homa sprinkles correct offsets (in seg_headers) - * throughout the segment data; TSO/GSO will include a different seg_header - * in each generated packet. - */ - -struct seg_header { - /** - * @offset: Offset within message of the first byte of data in - * this segment. If this field is -1 it means that the packet was - * generated by GSO with TCP hijacking. In this case the true offset - * is in @common.sequence. homa_gro_receive detects this situation - * and updates this value from @common.sequence if needed, so the - * value will always be valid once the packet reaches homa_softirq. - */ - __be32 offset; -} __packed; - -struct data_header { - struct common_header common; - - /** @message_length: Total #bytes in the message. */ - __be32 message_length; - - /** - * @incoming: The receiver can expect the sender to send all of the - * bytes in the message up to at least this offset (exclusive), - * even without additional grants. This includes unscheduled - * bytes, granted bytes, plus any additional bytes the sender - * transmits unilaterally (e.g., to round up to a full GSO batch). - */ - __be32 incoming; - - /** @ack: If the @client_id field of this is nonzero, provides info - * about an RPC that the recipient can now safely free. Note: in - * TSO packets this will get duplicated in each of the segments; - * in order to avoid repeated attempts to ack the same RPC, - * homa_gro_receive will clear this field in all segments but the - * first. - */ - struct homa_ack ack; - - /** - * @cutoff_version: The cutoff_version from the most recent - * CUTOFFS packet that the source of this packet has received - * from the destination of this packet, or 0 if the source hasn't - * yet received a CUTOFFS packet. - */ - __be16 cutoff_version; - - /** - * @retransmit: 1 means this packet was sent in response to a RESEND - * (it has already been sent previously). - */ - __u8 retransmit; - - __u8 pad; - - /** @seg: First of possibly many segments. 
*/ - struct seg_header seg; -} __packed; -_Static_assert(sizeof(struct data_header) <= HOMA_MAX_HEADER, - "data_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); -_Static_assert(sizeof(struct data_header) >= HOMA_MIN_PKT_LENGTH, - "data_header too small: Homa doesn't currently have codeto pad data packets"); -_Static_assert(((sizeof(struct data_header) - sizeof(struct seg_header)) - & 0x3) == 0, - " data_header length not a multiple of 4 bytes (required for TCP/TSO compatibility"); - -/** - * homa_data_len() - Returns the total number of bytes in a DATA packet - * after the data_header. Note: if the packet is a GSO packet, the result - * may include metadata as well as packet data. - */ -static inline int homa_data_len(struct sk_buff *skb) -{ - return skb->len - skb_transport_offset(skb) - sizeof(struct data_header); -} - -/** - * struct grant_header - Wire format for GRANT packets, which are sent by - * the receiver back to the sender to indicate that the sender may transmit - * additional bytes in the message. - */ -struct grant_header { - /** @common: Fields common to all packet types. */ - struct common_header common; - - /** - * @offset: Byte offset within the message. - * - * The sender should now transmit all data up to (but not including) - * this offset ASAP, if it hasn't already. - */ - __be32 offset; - - /** - * @priority: The sender should use this priority level for all future - * MESSAGE_FRAG packets for this message, until a GRANT is received - * with higher offset. Larger numbers indicate higher priorities. - */ - __u8 priority; - - /** - * @resend_all: Nonzero means that the sender should resend all previously - * transmitted data, starting at the beginning of the message (assume - * that no packets have been successfully received). - */ - __u8 resend_all; -} __packed; -_Static_assert(sizeof(struct grant_header) <= HOMA_MAX_HEADER, - "grant_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); - -/** - * struct resend_header - Wire format for RESEND packets. - * - * A RESEND is sent by the receiver when it believes that message data may - * have been lost in transmission (or if it is concerned that the sender may - * have crashed). The receiver should resend the specified portion of the - * message, even if it already sent it previously. - */ -struct resend_header { - /** @common: Fields common to all packet types. */ - struct common_header common; - - /** - * @offset: Offset within the message of the first byte of data that - * should be retransmitted. - */ - __be32 offset; - - /** - * @length: Number of bytes of data to retransmit; this could specify - * a range longer than the total message size. Zero is a special case - * used by servers; in this case, there is no need to actually resend - * anything; the purpose of this packet is to trigger an UNKNOWN - * response if the client no longer cares about this RPC. - */ - __be32 length; - - /** - * @priority: Packet priority to use. - * - * The sender should transmit all the requested data using this - * priority. - */ - __u8 priority; -} __packed; -_Static_assert(sizeof(struct resend_header) <= HOMA_MAX_HEADER, - "resend_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); - -/** - * struct unknown_header - Wire format for UNKNOWN packets. - * - * An UNKNOWN packet is sent by either server or client when it receives a - * packet for an RPC that is unknown to it. 
When a client receives an - * UNKNOWN packet it will typically restart the RPC from the beginning; - * when a server receives an UNKNOWN packet it will typically discard its - * state for the RPC. - */ -struct unknown_header { - /** @common: Fields common to all packet types. */ - struct common_header common; -} __packed; -_Static_assert(sizeof(struct unknown_header) <= HOMA_MAX_HEADER, - "unknown_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); - -/** - * struct busy_header - Wire format for BUSY packets. - * - * These packets tell the recipient that the sender is still alive (even if - * it isn't sending data expected by the recipient). - */ -struct busy_header { - /** @common: Fields common to all packet types. */ - struct common_header common; -} __packed; -_Static_assert(sizeof(struct busy_header) <= HOMA_MAX_HEADER, - "busy_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); - -/** - * struct cutoffs_header - Wire format for CUTOFFS packets. - * - * These packets tell the recipient how to assign priorities to - * unscheduled packets. - */ -struct cutoffs_header { - /** @common: Fields common to all packet types. */ - struct common_header common; - - /** - * @unsched_cutoffs: priorities to use for unscheduled packets - * sent to the sender of this packet. See documentation for - * @homa.unsched_cutoffs for the meanings of these values. - */ - __be32 unsched_cutoffs[HOMA_MAX_PRIORITIES]; - - /** - * @cutoff_version: unique identifier associated with @unsched_cutoffs. - * Must be included in future DATA packets sent to the sender of - * this packet. - */ - __be16 cutoff_version; -} __packed; -_Static_assert(sizeof(struct cutoffs_header) <= HOMA_MAX_HEADER, - "cutoffs_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); - -/** - * struct freeze_header - Wire format for FREEZE packets. - * - * These packets tell the recipient to freeze its timetrace; used - * for debugging. - */ -struct freeze_header { - /** @common: Fields common to all packet types. */ - struct common_header common; -} __packed; -_Static_assert(sizeof(struct freeze_header) <= HOMA_MAX_HEADER, - "freeze_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); - -/** - * struct need_ack_header - Wire format for NEED_ACK packets. - * - * These packets ask the recipient (a client) to return an ACK message if - * the packet's RPC is no longer active. - */ -struct need_ack_header { - /** @common: Fields common to all packet types. */ - struct common_header common; -} __packed; -_Static_assert(sizeof(struct need_ack_header) <= HOMA_MAX_HEADER, - "need_ack_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); - -/** - * struct ack_header - Wire format for ACK packets. - * - * These packets are sent from a client to a server to indicate that - * a set of RPCs is no longer active on the client, so the server can - * free any state it may have for them. - */ -struct ack_header { - /** @common: Fields common to all packet types. */ - struct common_header common; - - /** @num_acks: number of (leading) elements in @acks that are valid. */ - __be16 num_acks; - - struct homa_ack acks[NUM_PEER_UNACKED_IDS]; -} __packed; -_Static_assert(sizeof(struct ack_header) <= HOMA_MAX_HEADER, - "ack_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); - /** * struct homa_message_out - Describes a message (either request or response) * for which this machine is the sender. 
@@ -1671,7 +1199,7 @@ struct homa_peer { * @acks: info about client RPCs whose results have been completely * received. */ - struct homa_ack acks[NUM_PEER_UNACKED_IDS]; + struct homa_ack acks[HOMA_MAX_ACKS_PER_PKT]; /** * @ack_lock: used to synchronize access to @num_acks and @acks. diff --git a/homa_incoming.c b/homa_incoming.c index cbf7f7b2..b774db20 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -5,6 +5,7 @@ */ #include "homa_impl.h" +#include "homa_wire.h" /** * homa_message_in_init() - Constructor for homa_message_in. @@ -814,7 +815,7 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, ack.common.urgent = htons(HOMA_TCP_URGENT); ack.common.sender_id = cpu_to_be64(id); ack.num_acks = htons(homa_peer_get_acks(peer, - NUM_PEER_UNACKED_IDS, ack.acks)); + HOMA_MAX_ACKS_PER_PKT, ack.acks)); __homa_xmit_control(&ack, sizeof(ack), peer, hsk); tt_record3("Responded to NEED_ACK for id %d, peer %0x%x with %d other acks", id, tt_addr(saddr), ntohs(ack.num_acks)); diff --git a/homa_outgoing.c b/homa_outgoing.c index a4285548..bbc4c19e 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -5,6 +5,7 @@ */ #include "homa_impl.h" +#include "homa_wire.h" /** * set_priority() - Arrange for an outgoing packet to have a particular diff --git a/homa_peertab.c b/homa_peertab.c index c9ac76e5..b616441d 100644 --- a/homa_peertab.c +++ b/homa_peertab.c @@ -378,7 +378,7 @@ void homa_peer_add_ack(struct homa_rpc *rpc) struct ack_header ack; homa_peer_lock(peer); - if (peer->num_acks < NUM_PEER_UNACKED_IDS) { + if (peer->num_acks < HOMA_MAX_ACKS_PER_PKT) { peer->acks[peer->num_acks].client_id = cpu_to_be64(rpc->id); peer->acks[peer->num_acks].client_port = htons(rpc->hsk->port); peer->acks[peer->num_acks].server_port = htons(rpc->dport); diff --git a/homa_wire.h b/homa_wire.h new file mode 100644 index 00000000..a6314db3 --- /dev/null +++ b/homa_wire.h @@ -0,0 +1,477 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ + +/* This file defines the on-the-wire format of Homa packets. */ + +#ifndef _HOMA_WIRE_H +#define _HOMA_WIRE_H + +/** + * enum homa_packet_type - Defines the possible types of Homa packets. + * + * See the xxx_header structs below for more information about each type. + */ +enum homa_packet_type { + DATA = 0x10, + GRANT = 0x11, + RESEND = 0x12, + UNKNOWN = 0x13, + BUSY = 0x14, + CUTOFFS = 0x15, + FREEZE = 0x16, + NEED_ACK = 0x17, + ACK = 0x18, + BOGUS = 0x19, /* Used only in unit tests. */ + /* If you add a new type here, you must also do the following: + * 1. Change BOGUS so it is the highest opcode + * 2. Add support for the new opcode in homa_print_packet, + * homa_print_packet_short, homa_symbol_for_type, and mock_skb_new. + * 3. Add the header length to header_lengths in homa_plumbing.c. + */ +}; + +/** define HOMA_IPV6_HEADER_LENGTH - Size of IP header (V6). */ +#define HOMA_IPV6_HEADER_LENGTH 40 + +/** define HOMA_IPV4_HEADER_LENGTH - Size of IP header (V4). */ +#define HOMA_IPV4_HEADER_LENGTH 20 + +/** + * define HOMA_SKB_EXTRA - How many bytes of additional space to allow at the + * beginning of each sk_buff, before the IP header. This includes room for a + * VLAN header and also includes some extra space, "just to be safe" (not + * really sure if this is needed). + */ +#define HOMA_SKB_EXTRA 40 + +/** + * define HOMA_ETH_OVERHEAD - Number of bytes per Ethernet packet for Ethernet + * header, CRC, preamble, and inter-packet gap. 
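+ *
+ * As one illustrative accounting of those 42 bytes (an added note, not
+ * part of the protocol definition): preamble + start-of-frame delimiter
+ * (8) + Ethernet header (14) + VLAN tag (4) + CRC (4) + inter-packet
+ * gap (12) = 42.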
+ */
+#define HOMA_ETH_OVERHEAD 42
+
+/**
+ * define HOMA_MIN_PKT_LENGTH - Every Homa packet must be padded to at least
+ * this length to meet Ethernet frame size limitations. This number includes
+ * Homa headers and data, but not IP or Ethernet headers.
+ */
+#define HOMA_MIN_PKT_LENGTH 26
+
+/**
+ * define HOMA_MAX_HEADER - Number of bytes in the largest Homa header.
+ */
+#define HOMA_MAX_HEADER 90
+
+/**
+ * define ETHERNET_MAX_PAYLOAD - Maximum length of an Ethernet packet,
+ * excluding preamble, frame delimiter, VLAN header, CRC, and inter-packet
+ * gap; i.e. all of this space is available for Homa.
+ */
+#define ETHERNET_MAX_PAYLOAD 1500
+
+/**
+ * define HOMA_MAX_PRIORITIES - The maximum number of priority levels that
+ * Homa can use (the actual number can be restricted to less than this at
+ * runtime). Changing this value will affect packet formats.
+ */
+#define HOMA_MAX_PRIORITIES 8
+
+/**
+ * struct common_header - Wire format for the first bytes in every Homa
+ * packet. This must (mostly) match the format of a TCP header to enable
+ * Homa packets to actually be transmitted as TCP packets (and thereby
+ * take advantage of TSO and other features).
+ */
+struct common_header {
+        /**
+         * @sport: Port on source machine from which packet was sent.
+         * Must be in the same position as in a TCP header.
+         */
+        __be16 sport;
+
+        /**
+         * @dport: Port on destination that is to receive packet. Must be
+         * in the same position as in a TCP header.
+         */
+        __be16 dport;
+
+        /**
+         * @sequence: corresponds to the sequence number field in TCP headers;
+         * used in DATA packets to hold the offset in the message of the first
+         * byte of data. However, when TSO is used without TCP hijacking, this
+         * value will only be correct in the first segment of a GSO packet.
+         */
+        __be32 sequence;
+
+        /**
+         * The fields below correspond to the acknowledgment field in TCP
+         * headers; not used by Homa, except for the low-order 8 bits, which
+         * specify the Homa packet type (one of the values in the
+         * homa_packet_type enum).
+         */
+        __be16 ack1;
+        __u8 ack2;
+        __u8 type;
+
+        /**
+         * @doff: High order 4 bits holds the number of 4-byte chunks in a
+         * data_header (low-order bits unused). Used only for DATA packets;
+         * must be in the same position as the data offset in a TCP header.
+         * Used by TSO to determine where the replicated header portion ends.
+         */
+        __u8 doff;
+
+        /**
+         * @flags: Holds TCP flags such as URG, ACK, etc. The special value
+         * HOMA_TCP_FLAGS is stored here to distinguish Homa-over-TCP packets
+         * from real TCP packets. It includes the SYN and RST flags,
+         * which TCP would never use together; must not include URG or FIN
+         * (TSO will turn off FIN for all but the last segment).
+         */
+        __u8 flags;
+#define HOMA_TCP_FLAGS 6
+
+        /**
+         * @window: Corresponds to the window field in TCP headers. Not used
+         * by HOMA.
+         */
+        __be16 window;
+
+        /**
+         * @checksum: not used by Homa, but must occupy the same bytes as
+         * the checksum in a TCP header (TSO may modify this?).
+         */
+        __be16 checksum;
+
+        /**
+         * @urgent: occupies the same bytes as the urgent pointer in a TCP
+         * header. When Homa packets are transmitted over TCP, this has the
+         * special value HOMA_TCP_URGENT (which is set even though URG is
+         * not set) to indicate that the packet is actually a Homa packet.
+         */
+        __be16 urgent;
+#define HOMA_TCP_URGENT 0xb97d
+
+        /**
+         * @sender_id: the identifier of this RPC as used on the sender (i.e.,
+         * if the low-order bit is set, then the sender is the server for
+         * this RPC).
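+         *
+         * Illustrative example (an added note, assuming Homa's convention
+         * that clients assign even RPC ids): a client using id 100 sends
+         * packets with sender_id 100, while the server's packets for the
+         * same RPC carry sender_id 101; each side recovers its local id
+         * by flipping the low-order bit.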
+ */ + __be64 sender_id; +} __packed; + +/** + * struct homa_ack - Identifies an RPC that can be safely deleted by its + * server. After sending the response for an RPC, the server must retain its + * state for the RPC until it knows that the client has successfully + * received the entire response. An ack indicates this. Clients will + * piggyback acks on future data packets, but if a client doesn't send + * any data to the server, the server will eventually request an ack + * explicitly with a NEED_ACK packet, in which case the client will + * return an explicit ACK. + */ +struct homa_ack { + /** + * @id: The client's identifier for the RPC. 0 means this ack + * is invalid. + */ + __be64 client_id; + + /** @client_port: The client-side port for the RPC. */ + __be16 client_port; + + /** @server_port: The server-side port for the RPC. */ + __be16 server_port; +} __packed; + +/* struct data_header - Contains data for part or all of a Homa message. + * An incoming packet consists of a data_header followed by message data. + * An outgoing packet can have this simple format as well, or it can be + * structured as a GSO packet. Homa supports two different formats for GSO + * packets, depending on whether TCP hijacking is enabled: + * + * No hijacking: TCP hijacking: + * + * |-----------------------| |-----------------------| + * | | | | + * | data_header | | data_header | + * | | | | + * |---------------------- | |-----------------------| + * | | | | + * | | | | + * | segment data | | segment data | + * | | | | + * | | | | + * |-----------------------| |-----------------------| + * | seg_header | | | + * |-----------------------| | | + * | | | segment data | + * | | | | + * | segment data | | | + * | | |-----------------------| + * | | | | + * |-----------------------| | | + * | seg_header | | segment data | + * |-----------------------| | | + * | | | | + * | | |-----------------------| + * | segment data | + * | | + * | | + * |-----------------------| + * + * With TCP hijacking, TSO will automatically adjust @common.sequence in + * the segments, so that value can be used as the offset of the data within + * the message. Without TCP hijacking, TSO will not adjust @common.sequence + * in the segments, so Homa sprinkles correct offsets (in seg_headers) + * throughout the segment data; TSO/GSO will include a different seg_header + * in each generated packet. + */ + +struct seg_header { + /** + * @offset: Offset within message of the first byte of data in + * this segment. If this field is -1 it means that the packet was + * generated by GSO with TCP hijacking. In this case the true offset + * is in @common.sequence. homa_gro_receive detects this situation + * and updates this value from @common.sequence if needed, so the + * value will always be valid once the packet reaches homa_softirq. + */ + __be32 offset; +} __packed; + +struct data_header { + struct common_header common; + + /** @message_length: Total #bytes in the message. */ + __be32 message_length; + + /** + * @incoming: The receiver can expect the sender to send all of the + * bytes in the message up to at least this offset (exclusive), + * even without additional grants. This includes unscheduled + * bytes, granted bytes, plus any additional bytes the sender + * transmits unilaterally (e.g., to round up to a full GSO batch). + */ + __be32 incoming; + + /** @ack: If the @client_id field of this is nonzero, provides info + * about an RPC that the recipient can now safely free. 
Note: in
+         * TSO packets this will get duplicated in each of the segments;
+         * in order to avoid repeated attempts to ack the same RPC,
+         * homa_gro_receive will clear this field in all segments but the
+         * first.
+         */
+        struct homa_ack ack;
+
+        /**
+         * @cutoff_version: The cutoff_version from the most recent
+         * CUTOFFS packet that the source of this packet has received
+         * from the destination of this packet, or 0 if the source hasn't
+         * yet received a CUTOFFS packet.
+         */
+        __be16 cutoff_version;
+
+        /**
+         * @retransmit: 1 means this packet was sent in response to a RESEND
+         * (it has already been sent previously).
+         */
+        __u8 retransmit;
+
+        __u8 pad;
+
+        /** @seg: First of possibly many segments. */
+        struct seg_header seg;
+} __packed;
+_Static_assert(sizeof(struct data_header) <= HOMA_MAX_HEADER,
+        "data_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER");
+_Static_assert(sizeof(struct data_header) >= HOMA_MIN_PKT_LENGTH,
+        "data_header too small: Homa doesn't currently have code to pad data packets");
+_Static_assert(((sizeof(struct data_header) - sizeof(struct seg_header))
+        & 0x3) == 0,
+        "data_header length not a multiple of 4 bytes (required for TCP/TSO compatibility)");
+
+/**
+ * homa_data_len() - Returns the total number of bytes in a DATA packet
+ * after the data_header. Note: if the packet is a GSO packet, the result
+ * may include metadata as well as packet data.
+ */
+static inline int homa_data_len(struct sk_buff *skb)
+{
+        return skb->len - skb_transport_offset(skb) - sizeof(struct data_header);
+}
+
+/**
+ * struct grant_header - Wire format for GRANT packets, which are sent by
+ * the receiver back to the sender to indicate that the sender may transmit
+ * additional bytes in the message.
+ */
+struct grant_header {
+        /** @common: Fields common to all packet types. */
+        struct common_header common;
+
+        /**
+         * @offset: Byte offset within the message.
+         *
+         * The sender should now transmit all data up to (but not including)
+         * this offset ASAP, if it hasn't already.
+         */
+        __be32 offset;
+
+        /**
+         * @priority: The sender should use this priority level for all future
+         * DATA packets for this message, until a GRANT is received
+         * with higher offset. Larger numbers indicate higher priorities.
+         */
+        __u8 priority;
+
+        /**
+         * @resend_all: Nonzero means that the sender should resend all previously
+         * transmitted data, starting at the beginning of the message (assume
+         * that no packets have been successfully received).
+         */
+        __u8 resend_all;
+} __packed;
+_Static_assert(sizeof(struct grant_header) <= HOMA_MAX_HEADER,
+        "grant_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER");
+
+/**
+ * struct resend_header - Wire format for RESEND packets.
+ *
+ * A RESEND is sent by the receiver when it believes that message data may
+ * have been lost in transmission (or if it is concerned that the sender may
+ * have crashed). The sender should resend the specified portion of the
+ * message, even if it already sent it previously.
+ */
+struct resend_header {
+        /** @common: Fields common to all packet types. */
+        struct common_header common;
+
+        /**
+         * @offset: Offset within the message of the first byte of data that
+         * should be retransmitted.
+         */
+        __be32 offset;
+
+        /**
+         * @length: Number of bytes of data to retransmit; this could specify
+         * a range longer than the total message size.
Zero is a special case + * used by servers; in this case, there is no need to actually resend + * anything; the purpose of this packet is to trigger an UNKNOWN + * response if the client no longer cares about this RPC. + */ + __be32 length; + + /** + * @priority: Packet priority to use. + * + * The sender should transmit all the requested data using this + * priority. + */ + __u8 priority; +} __packed; +_Static_assert(sizeof(struct resend_header) <= HOMA_MAX_HEADER, + "resend_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); + +/** + * struct unknown_header - Wire format for UNKNOWN packets. + * + * An UNKNOWN packet is sent by either server or client when it receives a + * packet for an RPC that is unknown to it. When a client receives an + * UNKNOWN packet it will typically restart the RPC from the beginning; + * when a server receives an UNKNOWN packet it will typically discard its + * state for the RPC. + */ +struct unknown_header { + /** @common: Fields common to all packet types. */ + struct common_header common; +} __packed; +_Static_assert(sizeof(struct unknown_header) <= HOMA_MAX_HEADER, + "unknown_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); + +/** + * struct busy_header - Wire format for BUSY packets. + * + * These packets tell the recipient that the sender is still alive (even if + * it isn't sending data expected by the recipient). + */ +struct busy_header { + /** @common: Fields common to all packet types. */ + struct common_header common; +} __packed; +_Static_assert(sizeof(struct busy_header) <= HOMA_MAX_HEADER, + "busy_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); + +/** + * struct cutoffs_header - Wire format for CUTOFFS packets. + * + * These packets tell the recipient how to assign priorities to + * unscheduled packets. + */ +struct cutoffs_header { + /** @common: Fields common to all packet types. */ + struct common_header common; + + /** + * @unsched_cutoffs: priorities to use for unscheduled packets + * sent to the sender of this packet. See documentation for + * @homa.unsched_cutoffs for the meanings of these values. + */ + __be32 unsched_cutoffs[HOMA_MAX_PRIORITIES]; + + /** + * @cutoff_version: unique identifier associated with @unsched_cutoffs. + * Must be included in future DATA packets sent to the sender of + * this packet. + */ + __be16 cutoff_version; +} __packed; +_Static_assert(sizeof(struct cutoffs_header) <= HOMA_MAX_HEADER, + "cutoffs_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); + +/** + * struct freeze_header - Wire format for FREEZE packets. + * + * These packets tell the recipient to freeze its timetrace; used + * for debugging. + */ +struct freeze_header { + /** @common: Fields common to all packet types. */ + struct common_header common; +} __packed; +_Static_assert(sizeof(struct freeze_header) <= HOMA_MAX_HEADER, + "freeze_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); + +/** + * struct need_ack_header - Wire format for NEED_ACK packets. + * + * These packets ask the recipient (a client) to return an ACK message if + * the packet's RPC is no longer active. + */ +struct need_ack_header { + /** @common: Fields common to all packet types. */ + struct common_header common; +} __packed; +_Static_assert(sizeof(struct need_ack_header) <= HOMA_MAX_HEADER, + "need_ack_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); + +/** + * struct ack_header - Wire format for ACK packets. 
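+ *
+ * (Illustrative flow, inferred from the fields below: a client that has
+ * completed RPCs 100, 102, and 104 to the same peer can answer one
+ * NEED_ACK with a single ACK whose num_acks is 3 and whose first three
+ * @acks entries identify those RPCs.)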
+ * + * These packets are sent from a client to a server to indicate that + * a set of RPCs is no longer active on the client, so the server can + * free any state it may have for them. + */ +struct ack_header { + /** @common: Fields common to all packet types. */ + struct common_header common; + + /** @num_acks: number of (leading) elements in @acks that are valid. */ + __be16 num_acks; + +#define HOMA_MAX_ACKS_PER_PKT 5 + struct homa_ack acks[HOMA_MAX_ACKS_PER_PKT]; +} __packed; +_Static_assert(sizeof(struct ack_header) <= HOMA_MAX_HEADER, + "ack_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); + +#endif /* _HOMA_WIRE_H */ From 7e029c1b4d09a7e67db336632779afb095a27e36 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 27 Sep 2024 11:41:53 -0700 Subject: [PATCH 023/625] Add "node" to metric file names in cperf.py --- util/cperf.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/util/cperf.py b/util/cperf.py index 4eb50db8..f2f674a2 100644 --- a/util/cperf.py +++ b/util/cperf.py @@ -899,13 +899,13 @@ def run_experiments(*args): if homa_nodes: vlog("Recording final metrics from nodes %s" % (homa_nodes)) for id in homa_nodes: - f = open("%s/%d.metrics" % (exp.log_dir, id), 'w') + f = open("%s/node%d.metrics" % (exp.log_dir, id), 'w') subprocess.run(["ssh", "node%d" % (id), "metrics.py"], stdout=f) f.close() - shutil.copyfile("%s/%d.metrics" % (exp.log_dir, homa_clients[0]), - "%s/reports/%d.metrics" % (exp.log_dir, homa_clients[0])) - shutil.copyfile("%s/%d.metrics" % (exp.log_dir, homa_servers[0]), - "%s/reports/%d.metrics" % (exp.log_dir, homa_servers[0])) + shutil.copyfile("%s/node%d.metrics" % (exp.log_dir, homa_clients[0]), + "%s/reports/node%d.metrics" % (exp.log_dir, homa_clients[0])) + shutil.copyfile("%s/node%d.metrics" % (exp.log_dir, homa_servers[0]), + "%s/reports/node%d.metrics" % (exp.log_dir, homa_servers[0])) do_cmd("stop senders", all_nodes) do_cmd("stop clients", all_nodes) for exp in args: From 3a1961981acd4575ea890263f97c67f36fdd6635 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 27 Sep 2024 11:51:57 -0700 Subject: [PATCH 024/625] Extract homa_metrics.h from homa_impl.h, create homa_metrics.c --- Makefile | 1 + homa_impl.h | 686 +------------------------------------ homa_incoming.c | 1 - homa_metrics.c | 427 +++++++++++++++++++++++ homa_metrics.h | 692 ++++++++++++++++++++++++++++++++++++++ homa_offload.c | 8 +- homa_plumbing.c | 83 ----- homa_timer.c | 2 +- homa_utils.c | 441 ------------------------ homa_wire.h | 2 + test/Makefile | 2 + test/mock.c | 2 + test/unit_homa_grant.c | 22 +- test/unit_homa_incoming.c | 46 +-- test/unit_homa_metrics.c | 98 ++++++ test/unit_homa_offload.c | 14 +- test/unit_homa_outgoing.c | 26 +- test/unit_homa_peertab.c | 16 +- test/unit_homa_plumbing.c | 60 +--- test/unit_homa_pool.c | 4 +- test/unit_homa_skb.c | 8 +- test/unit_homa_socktab.c | 8 +- test/unit_homa_timer.c | 6 +- test/unit_homa_utils.c | 36 +- test/utils.h | 2 - timetrace.c | 3 +- 26 files changed, 1331 insertions(+), 1365 deletions(-) create mode 100644 homa_metrics.c create mode 100644 homa_metrics.h create mode 100644 test/unit_homa_metrics.c diff --git a/Makefile b/Makefile index 5af80ea3..fb277f17 100644 --- a/Makefile +++ b/Makefile @@ -5,6 +5,7 @@ ifneq ($(KERNELRELEASE),) obj-m += homa.o homa-y = homa_grant.o \ homa_incoming.o \ + homa_metrics.o \ homa_offload.o \ homa_outgoing.o \ homa_peertab.o \ diff --git a/homa_impl.h b/homa_impl.h index e91aaa76..856279f3 100644 --- a/homa_impl.h +++ b/homa_impl.h 
@@ -48,8 +48,6 @@ #pragma GCC diagnostic warning "-Wpointer-sign" #pragma GCC diagnostic warning "-Wunused-variable" -#include "homa_wire.h" - #ifdef __UNIT_TEST__ #undef alloc_pages #define alloc_pages mock_alloc_pages @@ -115,6 +113,15 @@ extern void mock_spin_unlock(spinlock_t *lock); #undef vmalloc #define vmalloc mock_vmalloc extern void *mock_vmalloc(size_t size); + +#undef DECLARE_PER_CPU +#define DECLARE_PER_CPU(type, name) extern type name[10]; + +#undef DEFINE_PER_CPU +#define DEFINE_PER_CPU(type, name) type name[10]; + +#undef per_cpu +#define per_cpu(name, core) (name[core]) #endif /* __UNIT_TEST__ */ /* Null out things that confuse VSCode Intellisense */ @@ -125,9 +132,6 @@ extern void *mock_vmalloc(size_t size); #define set_current_state(...) #endif -#include "homa.h" -#include "timetrace.h" - /* Forward declarations. */ struct homa_sock; struct homa_rpc; @@ -135,6 +139,11 @@ struct homa_rpc_bucket; struct homa; struct homa_peer; +#include "homa.h" +#include "timetrace.h" +#include "homa_wire.h" +#include "homa_metrics.h" + /* Declarations used in this file, so they can't be made at the end. */ extern void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, __u64 id); extern int homa_grantable_lock_slow(struct homa *homa, int recalc); @@ -1870,657 +1879,6 @@ struct homa { int temp[4]; }; -/** - * struct homa_metrics - various performance counters kept by Homa. - * - * There is one of these structures for each core, so counters can - * be updated without worrying about synchronization or extra cache - * misses. This isn't quite perfect (it's conceivable that a process - * could move from one CPU to another in the middle of updating a counter), - * but this is unlikely, and we can tolerate the occasional miscounts - * that might result. - * - * All counters are free-running: they never reset. - */ -#define HOMA_NUM_SMALL_COUNTS 64 -#define HOMA_NUM_MEDIUM_COUNTS 128 -struct homa_metrics { - /** - * @small_msg_bytes: entry i holds the total number of bytes - * received in messages whose length is between 64*i and 64*i + 63, - * inclusive. - */ - __u64 small_msg_bytes[HOMA_NUM_SMALL_COUNTS]; - - /** - * @medium_msg_bytes: entry i holds the total number of bytes - * received in messages whose length is between 1024*i and - * 1024*i + 1023, inclusive. The first four entries are always 0 - * (small_msg_counts covers this range). - */ - __u64 medium_msg_bytes[HOMA_NUM_MEDIUM_COUNTS]; - - /** - * @large_msg_count: the total number of messages received whose - * length is too large to appear in medium_msg_bytes. - */ - __u64 large_msg_count; - - /** - * @large_msg_bytes: the total number of bytes received in - * messages too large to be counted by medium_msg_bytes. - */ - __u64 large_msg_bytes; - - /** - * @sent_msg_bytes: The total number of bytes in outbound - * messages. - */ - __u64 sent_msg_bytes; - - /** - * @packets_sent: total number of packets sent for each packet type - * (entry 0 corresponds to DATA, and so on). - */ - __u64 packets_sent[BOGUS-DATA]; - - /** - * @packets_received: total number of packets received for each - * packet type (entry 0 corresponds to DATA, and so on). - */ - __u64 packets_received[BOGUS-DATA]; - - /** @priority_bytes: total bytes sent at each priority level. */ - __u64 priority_bytes[HOMA_MAX_PRIORITIES]; - - /** @priority_packets: total packets sent at each priority level. */ - __u64 priority_packets[HOMA_MAX_PRIORITIES]; - - /** - * @skb_allocs: total number of calls to homa_skb_new_tx. 
- */ - __u64 skb_allocs; - - /** - * @skb_alloc_cycles: total time spent in homa_skb_new_tx, as - * measured with get_cycles(). - */ - __u64 skb_alloc_cycles; - - /** - * @skb_frees: total number of sk_buffs for data packets that have - * been freed (counts normal paths only). - */ - __u64 skb_frees; - - /** - * @skb_free_cycles: total time spent freeing sk_buffs, as - * measured with get_cycles(). - */ - __u64 skb_free_cycles; - - /** - * @skb_page_allocs: total number of calls to homa_skb_page_alloc. - */ - __u64 skb_page_allocs; - - /** - * @skb_page_alloc_cycles: total time spent in homa_skb_page_alloc, as - * measured with get_cycles(). - */ - __u64 skb_page_alloc_cycles; - - /** - * @requests_received: total number of request messages received. - */ - __u64 requests_received; - - /** - * @requests_queued: total number of requests that were added to - * @homa->ready_requests (no thread was waiting). - */ - __u64 requests_queued; - - /** - * @responses_received: total number of response messages received. - */ - __u64 responses_received; - - /** - * @responses_queued: total number of responses that were added to - * @homa->ready_responses (no thread was waiting). - */ - __u64 responses_queued; - - /** - * @fast_wakeups: total number of times that a message arrived for - * a receiving thread that was polling in homa_wait_for_message. - */ - __u64 fast_wakeups; - - /** - * @slow_wakeups: total number of times that a receiving thread - * had to be put to sleep (no message arrived while it was polling). - */ - __u64 slow_wakeups; - - /** - * @handoffs_thread_waiting: total number of times that an RPC - * was handed off to a waiting thread (vs. being queued). - */ - __u64 handoffs_thread_waiting; - - /** - * @handoffs_alt_thread: total number of times that a thread other - * than the first on the list was chosen for a handoff (because the - * first thread was on a busy core). - */ - __u64 handoffs_alt_thread; - - /** - * @poll_cycles: total time spent in the polling loop in - * homa_wait_for_message, as measured with get_cycles(). - */ - __u64 poll_cycles; - - /** - * @softirq_calls: total number of calls to homa_softirq (i.e., - * total number of GRO packets processed, each of which could contain - * multiple Homa packets. - */ - __u64 softirq_calls; - - /** - * @softirq_cycles: total time spent executing homa_softirq when - * invoked under Linux's SoftIRQ handler, as measured with get_cycles(). - */ - __u64 softirq_cycles; - - /** - * @bypass_softirq_cycles: total time spent executing homa_softirq when - * invoked during GRO, bypassing the SoftIRQ mechanism. - */ - __u64 bypass_softirq_cycles; - - /** - * @linux_softirq_cycles: total time spent executing all softirq - * activities, as measured by the linux softirq module, in get_cycles() - * units. Only available with modified Linux kernels. - */ - __u64 linux_softirq_cycles; - - /** - * @napi_cycles: total time spent executing all NAPI activities, - * as measured by the linux softirq module, in get_cycles() units. - * Only available with modified Linux kernels. - */ - __u64 napi_cycles; - - /** - * @send_cycles: total time spent executing the homa_sendmsg kernel - * call handler to send requests, as measured with get_cycles(). - */ - __u64 send_cycles; - - /** @send_calls: total number of invocations of homa_semdmsg - * for requests. - */ - __u64 send_calls; - - /** - * @recv_cycles: total time spent executing homa_recvmsg (including - * time when the thread is blocked), as measured with get_cycles(). 
- */ - __u64 recv_cycles; - - /** @recv_calls: total number of invocations of homa_recvmsg. */ - __u64 recv_calls; - - /** - * @blocked_cycles: total time threads spend in blocked state - * while executing the homa_recvmsg kernel call handler. - */ - __u64 blocked_cycles; - - /** - * @reply_cycles: total time spent executing the homa_sendmsg kernel - * call handler to send responses, as measured with get_cycles(). - */ - __u64 reply_cycles; - - /** - * @reply_calls: total number of invocations of homa_semdmsg - * for responses. - */ - __u64 reply_calls; - - /** - * @abort_cycles: total time spent executing the homa_ioc_abort - * kernel call handler, as measured with get_cycles(). - */ - __u64 abort_cycles; - - /** - * @abort_calls: total number of invocations of the homa_ioc_abort - * kernel call. - */ - __u64 abort_calls; - - /** - * @so_set_buf_cycles: total time spent executing the homa_ioc_set_buf - * kernel call handler, as measured with get_cycles(). - */ - __u64 so_set_buf_cycles; - - /** - * @so_set_buf_calls: total number of invocations of the homa_ioc_set_buf - * kernel call. - */ - __u64 so_set_buf_calls; - - /** - * @grantable_lock_cycles: total time spent with homa->grantable_lock - * locked. - */ - __u64 grantable_lock_cycles; - - /** - * @timer_cycles: total time spent in homa_timer, as measured with - * get_cycles(). - */ - __u64 timer_cycles; - - /** - * @timer_reap_cycles: total time spent by homa_timer to reap dead - * RPCs, as measured with get_cycles(). This time is included in - * @timer_cycles. - */ - __u64 timer_reap_cycles; - - /** - * @data_pkt_reap_cycles: total time spent by homa_data_pkt to reap - * dead RPCs, as measured with get_cycles(). - */ - __u64 data_pkt_reap_cycles; - - /** - * @pacer_cycles: total time spent executing in homa_pacer_main - * (not including blocked time), as measured with get_cycles(). - */ - __u64 pacer_cycles; - - /** - * @pacer_lost_cycles: unnecessary delays in transmitting packets - * (i.e. wasted output bandwidth) because the pacer was slow or got - * descheduled. - */ - __u64 pacer_lost_cycles; - - /** - * @pacer_bytes: total number of bytes transmitted when - * @homa->throttled_rpcs is nonempty. - */ - __u64 pacer_bytes; - - /** - * @pacer_skipped_rpcs: total number of times that the pacer had to - * abort because it couldn't lock an RPC. - */ - __u64 pacer_skipped_rpcs; - - /** - * @pacer_needed_help: total number of times that homa_check_pacer - * found that the pacer was running behind, so it actually invoked - * homa_pacer_xmit. - */ - __u64 pacer_needed_help; - - /** - * @throttled_cycles: total amount of time that @homa->throttled_rpcs - * is nonempty, as measured with get_cycles(). - */ - __u64 throttled_cycles; - - /** - * @resent_packets: total number of data packets issued in response to - * RESEND packets. - */ - __u64 resent_packets; - - /** - * @peer_hash_links: total # of link traversals in homa_peer_find. - */ - __u64 peer_hash_links; - - /** - * @peer_new_entries: total # of new entries created in Homa's - * peer table (this value doesn't increment if the desired peer is - * found in the entry in its hash chain). - */ - __u64 peer_new_entries; - - /** - * @peer_kmalloc errors: total number of times homa_peer_find - * returned an error because it couldn't allocate memory for a new - * peer. - */ - __u64 peer_kmalloc_errors; - - /** - * @peer_route errors: total number of times homa_peer_find - * returned an error because it couldn't create a route to the peer. 
- */ - __u64 peer_route_errors; - - /** - * @control_xmit_errors errors: total number of times ip_queue_xmit - * failed when transmitting a control packet. - */ - __u64 control_xmit_errors; - - /** - * @data_xmit_errors errors: total number of times ip_queue_xmit - * failed when transmitting a data packet. - */ - __u64 data_xmit_errors; - - /** - * @unknown_rpc: total number of times an incoming packet was - * discarded because it referred to a nonexistent RPC. Doesn't - * count grant packets received by servers (since these are - * fairly common). - */ - __u64 unknown_rpcs; - - /** - * @cant_create_server_rpc: total number of times a server discarded - * an incoming packet because it couldn't create a homa_rpc object. - */ - __u64 server_cant_create_rpcs; - - /** - * @unknown_packet_type: total number of times a packet was discarded - * because its type wasn't one of the supported values. - */ - __u64 unknown_packet_types; - - /** - * @short_packets: total number of times a packet was discarded - * because it was too short to hold all the required information. - */ - __u64 short_packets; - - /** - * @packet_discards: total number of times a normal (non-retransmitted) - * packet was discarded because all its data had already been received. - */ - __u64 packet_discards; - - /** - * @resent_discards: total number of times a retransmitted packet - * was discarded because its data had already been received. - */ - __u64 resent_discards; - - /** - * @resent_packets_used: total number of times a resent packet was - * actually incorporated into the message at the target (i.e. it - * wasn't redundant). - */ - __u64 resent_packets_used; - - /** - * @rpc_timeouts: total number of times an RPC (either client or - * server) was aborted because the peer was nonresponsive. - */ - __u64 rpc_timeouts; - - /** - * @server_rpc_discards: total number of times an RPC was aborted on - * the server side because of a timeout. - */ - __u64 server_rpc_discards; - - /** - * @server_rpcs_unknown: total number of times an RPC was aborted on - * the server side because it is no longer known to the client. - */ - __u64 server_rpcs_unknown; - - /** - * @client_lock_misses: total number of times that Homa had to wait - * to acquire a client bucket lock. - */ - __u64 client_lock_misses; - - /** - * @client_lock_miss_cycles: total time spent waiting for client - * bucket lock misses, measured by get_cycles(). - */ - __u64 client_lock_miss_cycles; - - /** - * @server_lock_misses: total number of times that Homa had to wait - * to acquire a server bucket lock. - */ - __u64 server_lock_misses; - - /** - * @server_lock_miss_cycles: total time spent waiting for server - * bucket lock misses, measured by get_cycles(). - */ - __u64 server_lock_miss_cycles; - - /** - * @socket_lock_miss_cycles: total time spent waiting for socket - * lock misses, measured by get_cycles(). - */ - __u64 socket_lock_miss_cycles; - - /** - * @socket_lock_misses: total number of times that Homa had to wait - * to acquire a socket lock. - */ - __u64 socket_lock_misses; - - /** - * @throttle_lock_miss_cycles: total time spent waiting for throttle - * lock misses, measured by get_cycles(). - */ - __u64 throttle_lock_miss_cycles; - - /** - * @throttle_lock_misses: total number of times that Homa had to wait - * to acquire the throttle lock. - */ - __u64 throttle_lock_misses; - - /** - * @peer_acklock_miss_cycles: total time spent waiting for peer - * lock misses, measured by get_cycles(). 
- */ - __u64 peer_ack_lock_miss_cycles; - - /** - * @peer_ack_lock_misses: total number of times that Homa had to wait - * to acquire the lock used for managing acks for a peer. - */ - __u64 peer_ack_lock_misses; - - /** - * @grantable_lock_miss_cycles: total time spent waiting for grantable - * lock misses, measured by get_cycles(). - */ - __u64 grantable_lock_miss_cycles; - - /** - * @grantable_lock_misses: total number of times that Homa had to wait - * to acquire the grantable lock. - */ - __u64 grantable_lock_misses; - - /** - * @grantable_rpcs_integral: cumulative sum of time_delta*grantable, - * where time_delta is a get_cycles time and grantable is the - * value of homa->num_grantable_rpcs over that time period. - */ - __u64 grantable_rpcs_integral; - - /** - * @grant_recalc_calls: cumulative number of times homa_grant_recalc - * has been invoked. - */ - __u64 grant_recalc_calls; - - /** - * @grant_recalc_cycles: total time spent in homa_grant_recalc, - * in get_cycles() units. - */ - __u64 grant_recalc_cycles; - - /** - * @grant_recalc_loops: cumulative number of times homa_grant_recalc - * has looped back to recalculate again. - */ - __u64 grant_recalc_loops; - - /** - * @grant_recalc_skips: cumulative number of times that - * homa_grant_recalc skipped its work because in other thread - * already did it. - */ - __u64 grant_recalc_skips; - - /** - * @grant_priority_bumps: cumulative number of times the grant priority - * of an RPC has increased above its next-higher-priority neighbor. - */ - __u64 grant_priority_bumps; - - /** - * @fifo_grants: total number of times that grants were sent to - * the oldest message. - */ - __u64 fifo_grants; - - /** - * @fifo_grants_no_incoming: total number of times that, when a - * FIFO grant was issued, the message had no outstanding grants - * (everything granted had been received). - */ - __u64 fifo_grants_no_incoming; - - /** - * @disabled_reaps: total number of times that the reaper couldn't - * run at all because it was disabled. - */ - __u64 disabled_reaps; - - /** - * @disabled_rpc_reaps: total number of times that the reaper skipped - * an RPC because reaping was disabled for that particular RPC - */ - __u64 disabled_rpc_reaps; - - /** - * @reaper_runs: total number of times that the reaper was invoked - * and was not disabled. - */ - __u64 reaper_calls; - - /** - * @reaper_dead_skbs: incremented by hsk->dead_skbs each time that - * reaper_calls is incremented. - */ - __u64 reaper_dead_skbs; - - /** - * @forced_reaps: total number of times that homa_wait_for_message - * invoked the reaper because dead_skbs was too high. - */ - __u64 forced_reaps; - - /** - * @throttle_list_adds: total number of calls to homa_add_to_throttled. - */ - __u64 throttle_list_adds; - - /** - * @throttle_list_checks: number of list elements examined in - * calls to homa_add_to_throttled. - */ - __u64 throttle_list_checks; - - /** - * @unacked_overflows: total number of times that homa_peer_add_ack - * found insufficient space for the new id and hence had to send an - * ACK message. - */ - __u64 ack_overflows; - - /** - * @ignored_need_acks: total number of times that a NEED_ACK packet - * was ignored because the RPC's result hadn't been fully received. - */ - __u64 ignored_need_acks; - - /** - * @bpage_resuses: total number of times that, when an owned page - * reached the end, it could be reused because all existing - * allocations had been released. 
- */ - __u64 bpage_reuses; - - /** - * @buffer_alloc_failures: total number of times that - * homa_pool_allocate was unable to allocate buffer space for - * an incoming message. - */ - __u64 buffer_alloc_failures; - - /** - * @linux_pkt_alloc_bytes: total bytes allocated in new packet buffers - * by the NIC driver because of packet cache underflows. - */ - __u64 linux_pkt_alloc_bytes; - - /** - * @dropped_data_no_bufs: total bytes of incoming data dropped because - * there was no application buffer space available. - */ - __u64 dropped_data_no_bufs; - - /** - * @gen3_handoffs: total number of handoffs from GRO to SoftIRQ made - * by Gen3 load balancer. - */ - __u64 gen3_handoffs; - - /** - * @gen3_alt_handoffs: total number of GRO->SoftIRQ handoffs that - * didn't choose the primary SoftIRQ core because it was busy with - * app threads. - */ - __u64 gen3_alt_handoffs; - - /** - * @gro_grant_bypasses: total number of GRANT packets passed directly - * to homa_softirq by homa_gro_receive, bypassing the normal SoftIRQ - * mechanism (triggered by HOMA_GRO_FAST_GRANTS). - */ - __u64 gro_grant_bypasses; - - /** - * @gro_data_bypasses: total number of DATA packets passed directly - * to homa_softirq by homa_gro_receive, bypassing the normal SoftIRQ - * mechanism (triggered by HOMA_GRO_SHORT_BYPASS). - */ - __u64 gro_data_bypasses; - - /** @temp: For temporary use during testing. */ -#define NUM_TEMP_METRICS 10 - __u64 temp[NUM_TEMP_METRICS]; -}; /** * struct homa_numa - Homa allocates one of these structures for each @@ -2654,9 +2012,6 @@ struct homa_core { * HOMA_SKB_PAGE_SIZE in length. */ struct page *stashed_pages[HOMA_MAX_STASHED(HOMA_MAX_MESSAGE_LENGTH)]; - - /** @metrics: performance statistics for this core. */ - struct homa_metrics metrics; }; /** @@ -2695,9 +2050,6 @@ struct homa_skb_info { int offset; }; -#define INC_METRIC(metric, count) \ - (homa_cores[raw_smp_processor_id()]->metrics.metric) += (count) - /** * homa_get_skb_info() - Return the address of Homa's private information * for an sk_buff. 
@@ -3146,7 +2498,6 @@ extern void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk,
 struct homa_rpc *rpc);
 extern void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb);
 extern void homa_add_to_throttled(struct homa_rpc *rpc);
-extern void homa_append_metric(struct homa *homa, const char *format, ...);
 extern int homa_backlog_rcv(struct sock *sk, struct sk_buff *skb);
 extern int homa_bind(struct socket *sk, struct sockaddr *addr,
 int addr_len);
@@ -3232,12 +2583,6 @@ extern int homa_message_in_init(struct homa_rpc *rpc, int length,
 extern int homa_message_out_fill(struct homa_rpc *rpc,
 struct iov_iter *iter, int xmit);
 extern void homa_message_out_init(struct homa_rpc *rpc, int length);
-extern loff_t homa_metrics_lseek(struct file *file, loff_t offset,
- int whence);
-extern int homa_metrics_open(struct inode *inode, struct file *file);
-extern ssize_t homa_metrics_read(struct file *file, char __user *buffer,
- size_t length, loff_t *offset);
-extern int homa_metrics_release(struct inode *inode, struct file *file);
 extern void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk,
 struct homa_rpc *rpc);
 extern struct sk_buff
@@ -3282,13 +2627,10 @@ extern void homa_pool_release_buffers(struct homa_pool *pool,
 int num_buffers, __u32 *buffers);
 extern char *homa_print_ipv4_addr(__be32 addr);
 extern char *homa_print_ipv6_addr(const struct in6_addr *addr);
-extern char *homa_print_metrics(struct homa *homa);
 extern char *homa_print_packet(struct sk_buff *skb, char *buffer,
 int buf_len);
 extern char *homa_print_packet_short(struct sk_buff *skb, char *buffer,
 int buf_len);
 extern void homa_prios_changed(struct homa *homa);
-extern int homa_proc_read_metrics(char *buffer, char **start, off_t offset,
- int count, int *eof, void *data);
 extern int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 int flags, int *addr_len);
 extern int homa_register_interests(struct homa_interest *interest,
diff --git a/homa_incoming.c b/homa_incoming.c
index b774db20..3221b617 100644
--- a/homa_incoming.c
+++ b/homa_incoming.c
@@ -5,7 +5,6 @@
  */
 
 #include "homa_impl.h"
-#include "homa_wire.h"
 
 /**
  * homa_message_in_init() - Constructor for homa_message_in.
diff --git a/homa_metrics.c b/homa_metrics.c
new file mode 100644
index 00000000..42f3b7bf
--- /dev/null
+++ b/homa_metrics.c
@@ -0,0 +1,427 @@
+// SPDX-License-Identifier: BSD-2-Clause
+
+/* This file contains various functions for managing Homa's performance
+ * counters.
+ */
+
+#include "homa_impl.h"
+
+DEFINE_PER_CPU(struct homa_metrics, homa_metrics);
+
+/* Used by functions that are invoked directly by Linux and so can't be
+ * passed a struct homa argument.
+ */
+extern struct homa *homa;
+
+/**
+ * homa_metric_append() - Formats a new metric and appends it to homa->metrics.
+ * @homa: The new data will be appended to the @metrics field of
+ * this structure.
+ * @format: Standard printf-style format string describing the
+ * new metric. Arguments after this provide the usual
+ * values expected for printf-like functions.
+ */
+void homa_metric_append(struct homa *homa, const char *format, ...)
+{
+ char *new_buffer;
+ size_t new_chars;
+ va_list ap;
+
+ if (!homa->metrics) {
+#ifdef __UNIT_TEST__
+ homa->metrics_capacity = 30;
+#else
+ homa->metrics_capacity = 4096;
+#endif
+ homa->metrics = kmalloc(homa->metrics_capacity, GFP_KERNEL);
+ if (!homa->metrics) {
+ pr_warn("%s couldn't allocate memory\n", __func__);
+ return;
+ }
+ homa->metrics_length = 0;
+ }
+
+ /* May have to execute this loop multiple times if we run out
+ * of space in homa->metrics; each iteration expands the storage,
+ * until eventually it is large enough.
+ */
+ while (true) {
+ va_start(ap, format);
+ new_chars = vsnprintf(homa->metrics + homa->metrics_length,
+ homa->metrics_capacity - homa->metrics_length,
+ format, ap);
+ va_end(ap);
+ if ((homa->metrics_length + new_chars) < homa->metrics_capacity)
+ break;
+
+ /* Not enough room; expand buffer capacity. */
+ homa->metrics_capacity *= 2;
+ new_buffer = kmalloc(homa->metrics_capacity, GFP_KERNEL);
+ if (!new_buffer) {
+ pr_warn("%s couldn't allocate memory\n", __func__);
+ return;
+ }
+ memcpy(new_buffer, homa->metrics, homa->metrics_length);
+ kfree(homa->metrics);
+ homa->metrics = new_buffer;
+ }
+ homa->metrics_length += new_chars;
+}
+
+/**
+ * homa_metrics_print() - Sample all of the Homa performance metrics and
+ * generate a human-readable string describing all of them.
+ * @homa: Overall data about the Homa protocol implementation;
+ * the formatted string will be stored in homa->metrics.
+ *
+ * Return: The formatted string.
+ */
+char *homa_metrics_print(struct homa *homa)
+{
+ int core, i, lower = 0;
+
+ homa->metrics_length = 0;
+#define M(...) homa_metric_append(homa, __VA_ARGS__)
+ M("rdtsc_cycles %20llu RDTSC cycle counter when metrics were gathered\n",
+ get_cycles());
+ M("cpu_khz %15llu Clock rate for RDTSC counter, in khz\n",
+ cpu_khz);
+ for (core = 0; core < nr_cpu_ids; core++) {
+ struct homa_metrics *m = &per_cpu(homa_metrics, core);
+ __s64 delta;
+
+ M("core %15d Core id for following metrics\n",
+ core);
+ /* Restart the histogram bounds for each core. */
+ lower = 0;
+ for (i = 0; i < HOMA_NUM_SMALL_COUNTS; i++) {
+ M("msg_bytes_%-9d %15llu Bytes in incoming messages containing %d-%d bytes\n",
+ (i+1)*64, m->small_msg_bytes[i], lower,
+ (i+1)*64);
+ lower = (i+1)*64 + 1;
+ }
+ for (i = (HOMA_NUM_SMALL_COUNTS*64)/1024;
+ i < HOMA_NUM_MEDIUM_COUNTS; i++) {
+ M("msg_bytes_%-9d %15llu Bytes in incoming messages containing %d-%d bytes\n",
+ (i+1)*1024, m->medium_msg_bytes[i], lower,
+ (i+1)*1024);
+ lower = (i+1)*1024 + 1;
+ }
+ M("large_msg_count %15llu # of incoming messages >= %d bytes\n",
+ m->large_msg_count, lower);
+ M("large_msg_bytes %15llu Bytes in incoming messages >= %d bytes\n",
+ m->large_msg_bytes, lower);
+ M("sent_msg_bytes %15llu Total bytes in all outgoing messages\n",
+ m->sent_msg_bytes);
+ for (i = DATA; i < BOGUS; i++) {
+ char *symbol = homa_symbol_for_type(i);
+
+ M("packets_sent_%-7s %15llu %s packets sent\n",
+ symbol, m->packets_sent[i-DATA],
+ symbol);
+ }
+ for (i = DATA; i < BOGUS; i++) {
+ char *symbol = homa_symbol_for_type(i);
+
+ M("packets_rcvd_%-7s %15llu %s packets received\n",
+ symbol, m->packets_received[i-DATA],
+ symbol);
+ }
+ for (i = 0; i < HOMA_MAX_PRIORITIES; i++) {
+ M("priority%d_bytes %15llu Bytes sent at priority %d (including headers)\n",
+ i, m->priority_bytes[i], i);
+ }
+ for (i = 0; i < HOMA_MAX_PRIORITIES; i++) {
+ M("priority%d_packets %15llu Packets sent at priority %d\n",
+ i, m->priority_packets[i], i);
+ }
+ M("skb_allocs %15llu sk_buffs allocated\n",
+ m->skb_allocs);
+ M("skb_alloc_cycles %15llu Time spent allocating sk_buffs\n",
+ m->skb_alloc_cycles);
+ M("skb_frees %15llu Data sk_buffs freed in normal paths\n",
+ m->skb_frees);
+ M("skb_free_cycles %15llu Time spent freeing data sk_buffs\n",
+ m->skb_free_cycles);
+ M("skb_page_allocs %15llu Pages allocated for sk_buff frags\n",
+ m->skb_page_allocs);
+ M("skb_page_alloc_cycles %15llu Time spent allocating pages for sk_buff frags\n",
+ m->skb_page_alloc_cycles);
+ M("requests_received %15llu Incoming request messages\n",
+ m->requests_received);
+ M("requests_queued %15llu Requests for which no thread was waiting\n",
+ m->requests_queued);
+ M("responses_received %15llu Incoming response messages\n",
+ m->responses_received);
+ M("responses_queued %15llu Responses for which no thread was waiting\n",
+ m->responses_queued);
+ M("fast_wakeups %15llu Messages received while polling\n",
+ m->fast_wakeups);
+ M("slow_wakeups %15llu Messages received after thread went to sleep\n",
+ m->slow_wakeups);
+ M("handoffs_thread_waiting %15llu RPC handoffs to waiting threads (vs. queue)\n",
+ m->handoffs_thread_waiting);
+ M("handoffs_alt_thread %15llu RPC handoffs not to first on list (avoid busy core)\n",
+ m->handoffs_alt_thread);
+ M("poll_cycles %15llu Time spent polling for incoming messages\n",
+ m->poll_cycles);
+ M("softirq_calls %15llu Calls to homa_softirq (i.e. # GRO pkts received)\n",
+ m->softirq_calls);
+ M("softirq_cycles %15llu Time spent in homa_softirq during SoftIRQ\n",
+ m->softirq_cycles);
+ M("bypass_softirq_cycles %15llu Time spent in homa_softirq during bypass from GRO\n",
+ m->bypass_softirq_cycles);
+ M("linux_softirq_cycles %15llu Time spent in all Linux SoftIRQ\n",
+ m->linux_softirq_cycles);
+ M("napi_cycles %15llu Time spent in NAPI-level packet handling\n",
+ m->napi_cycles);
+ M("send_cycles %15llu Time spent in homa_sendmsg for requests\n",
+ m->send_cycles);
+ M("send_calls %15llu Total invocations of homa_sendmsg for requests\n",
+ m->send_calls);
+ // It is possible for us to get here at a time when a
+ // thread has been blocked for a long time and has
+ // recorded blocked_cycles, but hasn't finished the
+ // system call so recv_cycles hasn't been incremented
+ // yet. If that happens, just record 0 to prevent
+ // underflow errors.
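+ // The subtraction below therefore makes the printed
+ // "recv_cycles" reflect only unblocked time; blocked time
+ // is reported separately as "blocked_cycles".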
+ delta = m->recv_cycles - m->blocked_cycles;
+ if (delta < 0)
+ delta = 0;
+ M("recv_cycles %15llu Unblocked time spent in recvmsg kernel call\n",
+ delta);
+ M("recv_calls %15llu Total invocations of recvmsg kernel call\n",
+ m->recv_calls);
+ M("blocked_cycles %15llu Time spent blocked in homa_recvmsg\n",
+ m->blocked_cycles);
+ M("reply_cycles %15llu Time spent in homa_sendmsg for responses\n",
+ m->reply_cycles);
+ M("reply_calls %15llu Total invocations of homa_sendmsg for responses\n",
+ m->reply_calls);
+ M("abort_cycles %15llu Time spent in homa_ioc_abort kernel call\n",
+ m->abort_cycles);
+ M("abort_calls %15llu Total invocations of abort kernel call\n",
+ m->abort_calls);
+ M("so_set_buf_cycles %15llu Time spent in setsockopt SO_HOMA_SET_BUF\n",
+ m->so_set_buf_cycles);
+ M("so_set_buf_calls %15llu Total invocations of setsockopt SO_HOMA_SET_BUF\n",
+ m->so_set_buf_calls);
+ M("grantable_lock_cycles %15llu Time spent with homa->grantable_lock locked\n",
+ m->grantable_lock_cycles);
+ M("timer_cycles %15llu Time spent in homa_timer\n",
+ m->timer_cycles);
+ M("timer_reap_cycles %15llu Time in homa_timer spent reaping RPCs\n",
+ m->timer_reap_cycles);
+ M("data_pkt_reap_cycles %15llu Time in homa_data_pkt spent reaping RPCs\n",
+ m->data_pkt_reap_cycles);
+ M("pacer_cycles %15llu Time spent in homa_pacer_main\n",
+ m->pacer_cycles);
+ M("homa_cycles %15llu Total time in all Homa-related functions\n",
+ m->softirq_cycles + m->napi_cycles
+ + m->send_cycles + m->recv_cycles
+ + m->reply_cycles - m->blocked_cycles
+ + m->timer_cycles + m->pacer_cycles);
+ M("pacer_lost_cycles %15llu Lost transmission time because pacer was slow\n",
+ m->pacer_lost_cycles);
+ M("pacer_bytes %15llu Bytes transmitted when the pacer was active\n",
+ m->pacer_bytes);
+ M("pacer_skipped_rpcs %15llu Pacer aborts because of locked RPCs\n",
+ m->pacer_skipped_rpcs);
+ M("pacer_needed_help %15llu homa_pacer_xmit invocations from homa_check_pacer\n",
+ m->pacer_needed_help);
+ M("throttled_cycles %15llu Time when the throttled queue was nonempty\n",
+ m->throttled_cycles);
+ M("resent_packets %15llu DATA packets sent in response to RESENDs\n",
+ m->resent_packets);
+ M("peer_hash_links %15llu Hash chain link traversals in peer table\n",
+ m->peer_hash_links);
+ M("peer_new_entries %15llu New entries created in peer table\n",
+ m->peer_new_entries);
+ M("peer_kmalloc_errors %15llu kmalloc failures creating peer table entries\n",
+ m->peer_kmalloc_errors);
+ M("peer_route_errors %15llu Routing failures creating peer table entries\n",
+ m->peer_route_errors);
+ M("control_xmit_errors %15llu Errors sending control packets\n",
+ m->control_xmit_errors);
+ M("data_xmit_errors %15llu Errors sending data packets\n",
+ m->data_xmit_errors);
+ M("unknown_rpcs %15llu Non-grant packets discarded because RPC unknown\n",
+ m->unknown_rpcs);
+ M("server_cant_create_rpcs %15llu Packets discarded because server couldn't create RPC\n",
+ m->server_cant_create_rpcs);
+ M("unknown_packet_types %15llu Packets discarded because of unsupported type\n",
+ m->unknown_packet_types);
+ M("short_packets %15llu Packets discarded because too short\n",
+ m->short_packets);
+ M("packet_discards %15llu Non-resent packets discarded because data already received\n",
+ m->packet_discards);
+ M("resent_discards %15llu Resent packets discarded because data already received\n",
+ m->resent_discards);
+ M("resent_packets_used %15llu Retransmitted packets that were actually used\n",
+ m->resent_packets_used);
+ M("rpc_timeouts %15llu RPCs aborted because peer was nonresponsive\n",
+ m->rpc_timeouts);
+ M("server_rpc_discards %15llu RPCs discarded by server because of errors\n",
+ m->server_rpc_discards);
+ M("server_rpcs_unknown %15llu RPCs aborted by server because unknown to client\n",
+ m->server_rpcs_unknown);
+ M("client_lock_misses %15llu Bucket lock misses for client RPCs\n",
+ m->client_lock_misses);
+ M("client_lock_miss_cycles %15llu Time lost waiting for client bucket locks\n",
+ m->client_lock_miss_cycles);
+ M("server_lock_misses %15llu Bucket lock misses for server RPCs\n",
+ m->server_lock_misses);
+ M("server_lock_miss_cycles %15llu Time lost waiting for server bucket locks\n",
+ m->server_lock_miss_cycles);
+ M("socket_lock_misses %15llu Socket lock misses\n",
+ m->socket_lock_misses);
+ M("socket_lock_miss_cycles %15llu Time lost waiting for socket locks\n",
+ m->socket_lock_miss_cycles);
+ M("throttle_lock_misses %15llu Throttle lock misses\n",
+ m->throttle_lock_misses);
+ M("throttle_lock_miss_cycles %15llu Time lost waiting for throttle locks\n",
+ m->throttle_lock_miss_cycles);
+ M("peer_ack_lock_misses %15llu Misses on peer ack locks\n",
+ m->peer_ack_lock_misses);
+ M("peer_ack_lock_miss_cycles %15llu Time lost waiting for peer ack locks\n",
+ m->peer_ack_lock_miss_cycles);
+ M("grantable_lock_misses %15llu Grantable lock misses\n",
+ m->grantable_lock_misses);
+ M("grantable_lock_miss_cycles%15llu Time lost waiting for grantable lock\n",
+ m->grantable_lock_miss_cycles);
+ M("grantable_rpcs_integral %15llu Integral of homa->num_grantable_rpcs*dt\n",
+ m->grantable_rpcs_integral);
+ M("grant_recalc_calls %15llu Number of calls to homa_grant_recalc\n",
+ m->grant_recalc_calls);
+ M("grant_recalc_cycles %15llu Time spent in homa_grant_recalc\n",
+ m->grant_recalc_cycles);
+ M("grant_recalc_skips %15llu Number of times homa_grant_recalc skipped redundant work\n",
+ m->grant_recalc_skips);
+ M("grant_recalc_loops %15llu Number of times homa_grant_recalc looped back\n",
+ m->grant_recalc_loops);
+ M("grant_priority_bumps %15llu Number of times an RPC moved up in the grant priority order\n",
+ m->grant_priority_bumps);
+ M("fifo_grants %15llu Grants issued using FIFO priority\n",
+ m->fifo_grants);
+ M("fifo_grants_no_incoming %15llu FIFO grants to messages with no outstanding grants\n",
+ m->fifo_grants_no_incoming);
+ M("disabled_reaps %15llu Reaper invocations that were disabled\n",
+ m->disabled_reaps);
+ M("disabled_rpc_reaps %15llu Disabled RPCs skipped by reaper\n",
+ m->disabled_rpc_reaps);
+ M("reaper_calls %15llu Reaper invocations that were not disabled\n",
+ m->reaper_calls);
+ M("reaper_dead_skbs %15llu Sum of hsk->dead_skbs across all reaper calls\n",
+ m->reaper_dead_skbs);
+ M("forced_reaps %15llu Reaps forced by accumulation of dead RPCs\n",
+ m->forced_reaps);
+ M("throttle_list_adds %15llu Calls to homa_add_to_throttled\n",
+ m->throttle_list_adds);
+ M("throttle_list_checks %15llu List elements checked in homa_add_to_throttled\n",
+ m->throttle_list_checks);
+ M("ack_overflows %15llu Explicit ACKs sent because peer->acks was full\n",
+ m->ack_overflows);
+ M("ignored_need_acks %15llu NEED_ACKs ignored because RPC result not yet received\n",
+ m->ignored_need_acks);
+ M("bpage_reuses %15llu Buffer page could be reused because ref count was zero\n",
+ m->bpage_reuses);
+ M("buffer_alloc_failures %15llu homa_pool_allocate didn't find enough buffer space for an RPC\n",
+ m->buffer_alloc_failures);
+ M("linux_pkt_alloc_bytes %15llu Bytes allocated in new packets by NIC driver due to cache overflows\n",
+ m->linux_pkt_alloc_bytes);
+ M("dropped_data_no_bufs %15llu Data bytes dropped because app buffers full\n",
+ m->dropped_data_no_bufs);
+ M("gen3_handoffs %15llu GRO->SoftIRQ handoffs made by Gen3 balancer\n",
+ m->gen3_handoffs);
+ M("gen3_alt_handoffs %15llu Gen3 handoffs to secondary core (primary was busy)\n",
+ m->gen3_alt_handoffs);
+ M("gro_grant_bypasses %15llu Grant packets passed directly to homa_softirq by homa_gro_receive\n",
+ m->gro_grant_bypasses);
+ M("gro_data_bypasses %15llu Data packets passed directly to homa_softirq by homa_gro_receive\n",
+ m->gro_data_bypasses);
+ for (i = 0; i < NUM_TEMP_METRICS; i++)
+ M("temp%-2d %15llu Temporary use in testing\n",
+ i, m->temp[i]);
+ }
+
+ return homa->metrics;
+}
+
+/**
+ * homa_metrics_open() - This function is invoked when /proc/net/homa_metrics is
+ * opened.
+ * @inode: The inode corresponding to the file.
+ * @file: Information about the open file.
+ *
+ * Return: always 0.
+ */
+int homa_metrics_open(struct inode *inode, struct file *file)
+{
+ /* Collect all of the metrics when the file is opened, and save
+ * these for use by subsequent reads (don't want the metrics to
+ * change between reads). If there are concurrent opens on the
+ * file, only read the metrics once, during the first open, and
+ * use this copy for subsequent opens, until the file has been
+ * completely closed.
+ */
+ spin_lock(&homa->metrics_lock);
+ if (homa->metrics_active_opens == 0)
+ homa_metrics_print(homa);
+ homa->metrics_active_opens++;
+ spin_unlock(&homa->metrics_lock);
+ return 0;
+}
+
+/**
+ * homa_metrics_read() - This function is invoked to handle read kernel calls on
+ * /proc/net/homa_metrics.
+ * @file: Information about the file being read.
+ * @buffer: Address in user space of the buffer in which data from the file
+ * should be returned.
+ * @length: Number of bytes available at @buffer.
+ * @offset: Current read offset within the file.
+ *
+ * Return: the number of bytes returned at @buffer. 0 means the end of the
+ * file was reached, and a negative number indicates an error (-errno).
+ */
+ssize_t homa_metrics_read(struct file *file, char __user *buffer,
+ size_t length, loff_t *offset)
+{
+ size_t copied;
+
+ if (*offset >= homa->metrics_length)
+ return 0;
+ copied = homa->metrics_length - *offset;
+ if (copied > length)
+ copied = length;
+ if (copy_to_user(buffer, homa->metrics + *offset, copied))
+ return -EFAULT;
+ *offset += copied;
+ return copied;
+}
+
+/**
+ * homa_metrics_lseek() - This function is invoked to handle seeks on
+ * /proc/net/homa_metrics. Right now seeks are ignored: the file must be
+ * read sequentially.
+ * @file: Information about the file being read.
+ * @offset: Distance to seek, in bytes.
+ * @whence: Starting point from which to measure the distance to seek.
+ *
+ * Return: always 0 (seeks have no effect).
+ */
+loff_t homa_metrics_lseek(struct file *file, loff_t offset, int whence)
+{
+ return 0;
+}
+
+/**
+ * homa_metrics_release() - This function is invoked when the last reference to
+ * an open /proc/net/homa_metrics is closed. It performs cleanup.
+ * @inode: The inode corresponding to the file.
+ * @file: Information about the open file.
+ *
+ * Return: always 0.
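+ * Once the last open has been released, the next call to
+ * homa_metrics_open will collect a fresh snapshot of the metrics.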
+ */
+int homa_metrics_release(struct inode *inode, struct file *file)
+{
+ spin_lock(&homa->metrics_lock);
+ homa->metrics_active_opens--;
+ spin_unlock(&homa->metrics_lock);
+ return 0;
+}
diff --git a/homa_metrics.h b/homa_metrics.h
new file mode 100644
index 00000000..a7ddcb05
--- /dev/null
+++ b/homa_metrics.h
@@ -0,0 +1,692 @@
+/* SPDX-License-Identifier: BSD-2-Clause */
+
+/* This file contains declarations related to Homa's performance metrics. */
+
+#ifndef _HOMA_METRICS_H
+#define _HOMA_METRICS_H
+
+#include <linux/percpu.h>
+#include <linux/types.h>
+
+#include "homa_wire.h"
+
+/**
+ * struct homa_metrics - various performance counters kept by Homa.
+ *
+ * There is one of these structures for each core, so counters can
+ * be updated without worrying about synchronization or extra cache
+ * misses.
+ *
+ * All counters are free-running: they never reset.
+ */
+#define HOMA_NUM_SMALL_COUNTS 64
+#define HOMA_NUM_MEDIUM_COUNTS 128
+struct homa_metrics {
+ /**
+ * @small_msg_bytes: entry i holds the total number of bytes
+ * received in messages whose length is between 64*i and 64*i + 63,
+ * inclusive.
+ */
+ __u64 small_msg_bytes[HOMA_NUM_SMALL_COUNTS];
+
+ /**
+ * @medium_msg_bytes: entry i holds the total number of bytes
+ * received in messages whose length is between 1024*i and
+ * 1024*i + 1023, inclusive. The first four entries are always 0
+ * (small_msg_counts covers this range).
+ */
+ __u64 medium_msg_bytes[HOMA_NUM_MEDIUM_COUNTS];
+
+ /**
+ * @large_msg_count: the total number of messages received whose
+ * length is too large to appear in medium_msg_bytes.
+ */
+ __u64 large_msg_count;
+
+ /**
+ * @large_msg_bytes: the total number of bytes received in
+ * messages too large to be counted by medium_msg_bytes.
+ */
+ __u64 large_msg_bytes;
+
+ /**
+ * @sent_msg_bytes: The total number of bytes in outbound
+ * messages.
+ */
+ __u64 sent_msg_bytes;
+
+ /**
+ * @packets_sent: total number of packets sent for each packet type
+ * (entry 0 corresponds to DATA, and so on).
+ */
+ __u64 packets_sent[BOGUS-DATA];
+
+ /**
+ * @packets_received: total number of packets received for each
+ * packet type (entry 0 corresponds to DATA, and so on).
+ */
+ __u64 packets_received[BOGUS-DATA];
+
+ /** @priority_bytes: total bytes sent at each priority level. */
+ __u64 priority_bytes[HOMA_MAX_PRIORITIES];
+
+ /** @priority_packets: total packets sent at each priority level. */
+ __u64 priority_packets[HOMA_MAX_PRIORITIES];
+
+ /**
+ * @skb_allocs: total number of calls to homa_skb_new_tx.
+ */
+ __u64 skb_allocs;
+
+ /**
+ * @skb_alloc_cycles: total time spent in homa_skb_new_tx, as
+ * measured with get_cycles().
+ */
+ __u64 skb_alloc_cycles;
+
+ /**
+ * @skb_frees: total number of sk_buffs for data packets that have
+ * been freed (counts normal paths only).
+ */
+ __u64 skb_frees;
+
+ /**
+ * @skb_free_cycles: total time spent freeing sk_buffs, as
+ * measured with get_cycles().
+ */
+ __u64 skb_free_cycles;
+
+ /**
+ * @skb_page_allocs: total number of calls to homa_skb_page_alloc.
+ */
+ __u64 skb_page_allocs;
+
+ /**
+ * @skb_page_alloc_cycles: total time spent in homa_skb_page_alloc, as
+ * measured with get_cycles().
+ */
+ __u64 skb_page_alloc_cycles;
+
+ /**
+ * @requests_received: total number of request messages received.
+ */
+ __u64 requests_received;
+
+ /**
+ * @requests_queued: total number of requests that were added to
+ * @homa->ready_requests (no thread was waiting).
+ */
+ __u64 requests_queued;
+
+ /**
+ * @responses_received: total number of response messages received.
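+ * (Responses complete RPCs that this node initiated as requests.)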
+ */
+ __u64 responses_received;
+
+ /**
+ * @responses_queued: total number of responses that were added to
+ * @homa->ready_responses (no thread was waiting).
+ */
+ __u64 responses_queued;
+
+ /**
+ * @fast_wakeups: total number of times that a message arrived for
+ * a receiving thread that was polling in homa_wait_for_message.
+ */
+ __u64 fast_wakeups;
+
+ /**
+ * @slow_wakeups: total number of times that a receiving thread
+ * had to be put to sleep (no message arrived while it was polling).
+ */
+ __u64 slow_wakeups;
+
+ /**
+ * @handoffs_thread_waiting: total number of times that an RPC
+ * was handed off to a waiting thread (vs. being queued).
+ */
+ __u64 handoffs_thread_waiting;
+
+ /**
+ * @handoffs_alt_thread: total number of times that a thread other
+ * than the first on the list was chosen for a handoff (because the
+ * first thread was on a busy core).
+ */
+ __u64 handoffs_alt_thread;
+
+ /**
+ * @poll_cycles: total time spent in the polling loop in
+ * homa_wait_for_message, as measured with get_cycles().
+ */
+ __u64 poll_cycles;
+
+ /**
+ * @softirq_calls: total number of calls to homa_softirq (i.e.,
+ * total number of GRO packets processed, each of which could contain
+ * multiple Homa packets).
+ */
+ __u64 softirq_calls;
+
+ /**
+ * @softirq_cycles: total time spent executing homa_softirq when
+ * invoked under Linux's SoftIRQ handler, as measured with get_cycles().
+ */
+ __u64 softirq_cycles;
+
+ /**
+ * @bypass_softirq_cycles: total time spent executing homa_softirq when
+ * invoked during GRO, bypassing the SoftIRQ mechanism.
+ */
+ __u64 bypass_softirq_cycles;
+
+ /**
+ * @linux_softirq_cycles: total time spent executing all softirq
+ * activities, as measured by the linux softirq module, in get_cycles()
+ * units. Only available with modified Linux kernels.
+ */
+ __u64 linux_softirq_cycles;
+
+ /**
+ * @napi_cycles: total time spent executing all NAPI activities,
+ * as measured by the linux softirq module, in get_cycles() units.
+ * Only available with modified Linux kernels.
+ */
+ __u64 napi_cycles;
+
+ /**
+ * @send_cycles: total time spent executing the homa_sendmsg kernel
+ * call handler to send requests, as measured with get_cycles().
+ */
+ __u64 send_cycles;
+
+ /** @send_calls: total number of invocations of homa_sendmsg
+ * for requests.
+ */
+ __u64 send_calls;
+
+ /**
+ * @recv_cycles: total time spent executing homa_recvmsg (including
+ * time when the thread is blocked), as measured with get_cycles().
+ */
+ __u64 recv_cycles;
+
+ /** @recv_calls: total number of invocations of homa_recvmsg. */
+ __u64 recv_calls;
+
+ /**
+ * @blocked_cycles: total time threads spend in blocked state
+ * while executing the homa_recvmsg kernel call handler.
+ */
+ __u64 blocked_cycles;
+
+ /**
+ * @reply_cycles: total time spent executing the homa_sendmsg kernel
+ * call handler to send responses, as measured with get_cycles().
+ */
+ __u64 reply_cycles;
+
+ /**
+ * @reply_calls: total number of invocations of homa_sendmsg
+ * for responses.
+ */
+ __u64 reply_calls;
+
+ /**
+ * @abort_cycles: total time spent executing the homa_ioc_abort
+ * kernel call handler, as measured with get_cycles().
+ */
+ __u64 abort_cycles;
+
+ /**
+ * @abort_calls: total number of invocations of the homa_ioc_abort
+ * kernel call.
+ */
+ __u64 abort_calls;
+
+ /**
+ * @so_set_buf_cycles: total time spent executing the homa_ioc_set_buf
+ * kernel call handler, as measured with get_cycles().
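+ * (SO_HOMA_SET_BUF is the setsockopt that registers buffer space
+ * for incoming messages.)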
+ */
+ __u64 so_set_buf_cycles;
+
+ /**
+ * @so_set_buf_calls: total number of invocations of the homa_ioc_set_buf
+ * kernel call.
+ */
+ __u64 so_set_buf_calls;
+
+ /**
+ * @grantable_lock_cycles: total time spent with homa->grantable_lock
+ * locked.
+ */
+ __u64 grantable_lock_cycles;
+
+ /**
+ * @timer_cycles: total time spent in homa_timer, as measured with
+ * get_cycles().
+ */
+ __u64 timer_cycles;
+
+ /**
+ * @timer_reap_cycles: total time spent by homa_timer to reap dead
+ * RPCs, as measured with get_cycles(). This time is included in
+ * @timer_cycles.
+ */
+ __u64 timer_reap_cycles;
+
+ /**
+ * @data_pkt_reap_cycles: total time spent by homa_data_pkt to reap
+ * dead RPCs, as measured with get_cycles().
+ */
+ __u64 data_pkt_reap_cycles;
+
+ /**
+ * @pacer_cycles: total time spent executing in homa_pacer_main
+ * (not including blocked time), as measured with get_cycles().
+ */
+ __u64 pacer_cycles;
+
+ /**
+ * @pacer_lost_cycles: unnecessary delays in transmitting packets
+ * (i.e. wasted output bandwidth) because the pacer was slow or got
+ * descheduled.
+ */
+ __u64 pacer_lost_cycles;
+
+ /**
+ * @pacer_bytes: total number of bytes transmitted when
+ * @homa->throttled_rpcs is nonempty.
+ */
+ __u64 pacer_bytes;
+
+ /**
+ * @pacer_skipped_rpcs: total number of times that the pacer had to
+ * abort because it couldn't lock an RPC.
+ */
+ __u64 pacer_skipped_rpcs;
+
+ /**
+ * @pacer_needed_help: total number of times that homa_check_pacer
+ * found that the pacer was running behind, so it actually invoked
+ * homa_pacer_xmit.
+ */
+ __u64 pacer_needed_help;
+
+ /**
+ * @throttled_cycles: total amount of time that @homa->throttled_rpcs
+ * is nonempty, as measured with get_cycles().
+ */
+ __u64 throttled_cycles;
+
+ /**
+ * @resent_packets: total number of data packets issued in response to
+ * RESEND packets.
+ */
+ __u64 resent_packets;
+
+ /**
+ * @peer_hash_links: total # of link traversals in homa_peer_find.
+ */
+ __u64 peer_hash_links;
+
+ /**
+ * @peer_new_entries: total # of new entries created in Homa's
+ * peer table (this value doesn't increment if the desired peer is
+ * found in an existing entry in its hash chain).
+ */
+ __u64 peer_new_entries;
+
+ /**
+ * @peer_kmalloc_errors: total number of times homa_peer_find
+ * returned an error because it couldn't allocate memory for a new
+ * peer.
+ */
+ __u64 peer_kmalloc_errors;
+
+ /**
+ * @peer_route_errors: total number of times homa_peer_find
+ * returned an error because it couldn't create a route to the peer.
+ */
+ __u64 peer_route_errors;
+
+ /**
+ * @control_xmit_errors: total number of times ip_queue_xmit
+ * failed when transmitting a control packet.
+ */
+ __u64 control_xmit_errors;
+
+ /**
+ * @data_xmit_errors: total number of times ip_queue_xmit
+ * failed when transmitting a data packet.
+ */
+ __u64 data_xmit_errors;
+
+ /**
+ * @unknown_rpcs: total number of times an incoming packet was
+ * discarded because it referred to a nonexistent RPC. Doesn't
+ * count grant packets received by servers (since these are
+ * fairly common).
+ */
+ __u64 unknown_rpcs;
+
+ /**
+ * @server_cant_create_rpcs: total number of times a server discarded
+ * an incoming packet because it couldn't create a homa_rpc object.
+ */
+ __u64 server_cant_create_rpcs;
+
+ /**
+ * @unknown_packet_types: total number of times a packet was discarded
+ * because its type wasn't one of the supported values.
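+ * (The supported values are those of the packet type enum defined
+ * in homa_wire.h.)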
+ */
+ __u64 unknown_packet_types;
+
+ /**
+ * @short_packets: total number of times a packet was discarded
+ * because it was too short to hold all the required information.
+ */
+ __u64 short_packets;
+
+ /**
+ * @packet_discards: total number of times a normal (non-retransmitted)
+ * packet was discarded because all its data had already been received.
+ */
+ __u64 packet_discards;
+
+ /**
+ * @resent_discards: total number of times a retransmitted packet
+ * was discarded because its data had already been received.
+ */
+ __u64 resent_discards;
+
+ /**
+ * @resent_packets_used: total number of times a resent packet was
+ * actually incorporated into the message at the target (i.e. it
+ * wasn't redundant).
+ */
+ __u64 resent_packets_used;
+
+ /**
+ * @rpc_timeouts: total number of times an RPC (either client or
+ * server) was aborted because the peer was nonresponsive.
+ */
+ __u64 rpc_timeouts;
+
+ /**
+ * @server_rpc_discards: total number of times an RPC was aborted on
+ * the server side because of a timeout.
+ */
+ __u64 server_rpc_discards;
+
+ /**
+ * @server_rpcs_unknown: total number of times an RPC was aborted on
+ * the server side because it is no longer known to the client.
+ */
+ __u64 server_rpcs_unknown;
+
+ /**
+ * @client_lock_misses: total number of times that Homa had to wait
+ * to acquire a client bucket lock.
+ */
+ __u64 client_lock_misses;
+
+ /**
+ * @client_lock_miss_cycles: total time spent waiting for client
+ * bucket lock misses, measured by get_cycles().
+ */
+ __u64 client_lock_miss_cycles;
+
+ /**
+ * @server_lock_misses: total number of times that Homa had to wait
+ * to acquire a server bucket lock.
+ */
+ __u64 server_lock_misses;
+
+ /**
+ * @server_lock_miss_cycles: total time spent waiting for server
+ * bucket lock misses, measured by get_cycles().
+ */
+ __u64 server_lock_miss_cycles;
+
+ /**
+ * @socket_lock_miss_cycles: total time spent waiting for socket
+ * lock misses, measured by get_cycles().
+ */
+ __u64 socket_lock_miss_cycles;
+
+ /**
+ * @socket_lock_misses: total number of times that Homa had to wait
+ * to acquire a socket lock.
+ */
+ __u64 socket_lock_misses;
+
+ /**
+ * @throttle_lock_miss_cycles: total time spent waiting for throttle
+ * lock misses, measured by get_cycles().
+ */
+ __u64 throttle_lock_miss_cycles;
+
+ /**
+ * @throttle_lock_misses: total number of times that Homa had to wait
+ * to acquire the throttle lock.
+ */
+ __u64 throttle_lock_misses;
+
+ /**
+ * @peer_ack_lock_miss_cycles: total time spent waiting for peer
+ * lock misses, measured by get_cycles().
+ */
+ __u64 peer_ack_lock_miss_cycles;
+
+ /**
+ * @peer_ack_lock_misses: total number of times that Homa had to wait
+ * to acquire the lock used for managing acks for a peer.
+ */
+ __u64 peer_ack_lock_misses;
+
+ /**
+ * @grantable_lock_miss_cycles: total time spent waiting for grantable
+ * lock misses, measured by get_cycles().
+ */
+ __u64 grantable_lock_miss_cycles;
+
+ /**
+ * @grantable_lock_misses: total number of times that Homa had to wait
+ * to acquire the grantable lock.
+ */
+ __u64 grantable_lock_misses;
+
+ /**
+ * @grantable_rpcs_integral: cumulative sum of time_delta*grantable,
+ * where time_delta is a get_cycles time and grantable is the
+ * value of homa->num_grantable_rpcs over that time period.
+ */
+ __u64 grantable_rpcs_integral;
+
+ /**
+ * @grant_recalc_calls: cumulative number of times homa_grant_recalc
+ * has been invoked.
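+ * (homa_grant_recalc recomputes which incoming messages should
+ * receive grants, and at what priorities.)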
+ */
+ __u64 grant_recalc_calls;
+
+ /**
+ * @grant_recalc_cycles: total time spent in homa_grant_recalc,
+ * in get_cycles() units.
+ */
+ __u64 grant_recalc_cycles;
+
+ /**
+ * @grant_recalc_loops: cumulative number of times homa_grant_recalc
+ * has looped back to recalculate again.
+ */
+ __u64 grant_recalc_loops;
+
+ /**
+ * @grant_recalc_skips: cumulative number of times that
+ * homa_grant_recalc skipped its work because another thread
+ * already did it.
+ */
+ __u64 grant_recalc_skips;
+
+ /**
+ * @grant_priority_bumps: cumulative number of times the grant priority
+ * of an RPC has increased above its next-higher-priority neighbor.
+ */
+ __u64 grant_priority_bumps;
+
+ /**
+ * @fifo_grants: total number of times that grants were sent to
+ * the oldest message.
+ */
+ __u64 fifo_grants;
+
+ /**
+ * @fifo_grants_no_incoming: total number of times that, when a
+ * FIFO grant was issued, the message had no outstanding grants
+ * (everything granted had been received).
+ */
+ __u64 fifo_grants_no_incoming;
+
+ /**
+ * @disabled_reaps: total number of times that the reaper couldn't
+ * run at all because it was disabled.
+ */
+ __u64 disabled_reaps;
+
+ /**
+ * @disabled_rpc_reaps: total number of times that the reaper skipped
+ * an RPC because reaping was disabled for that particular RPC.
+ */
+ __u64 disabled_rpc_reaps;
+
+ /**
+ * @reaper_calls: total number of times that the reaper was invoked
+ * and was not disabled.
+ */
+ __u64 reaper_calls;
+
+ /**
+ * @reaper_dead_skbs: incremented by hsk->dead_skbs each time that
+ * reaper_calls is incremented.
+ */
+ __u64 reaper_dead_skbs;
+
+ /**
+ * @forced_reaps: total number of times that homa_wait_for_message
+ * invoked the reaper because dead_skbs was too high.
+ */
+ __u64 forced_reaps;
+
+ /**
+ * @throttle_list_adds: total number of calls to homa_add_to_throttled.
+ */
+ __u64 throttle_list_adds;
+
+ /**
+ * @throttle_list_checks: number of list elements examined in
+ * calls to homa_add_to_throttled.
+ */
+ __u64 throttle_list_checks;
+
+ /**
+ * @ack_overflows: total number of times that homa_peer_add_ack
+ * found insufficient space for the new id and hence had to send an
+ * ACK message.
+ */
+ __u64 ack_overflows;
+
+ /**
+ * @ignored_need_acks: total number of times that a NEED_ACK packet
+ * was ignored because the RPC's result hadn't been fully received.
+ */
+ __u64 ignored_need_acks;
+
+ /**
+ * @bpage_reuses: total number of times that, when an owned page
+ * reached the end, it could be reused because all existing
+ * allocations had been released.
+ */
+ __u64 bpage_reuses;
+
+ /**
+ * @buffer_alloc_failures: total number of times that
+ * homa_pool_allocate was unable to allocate buffer space for
+ * an incoming message.
+ */
+ __u64 buffer_alloc_failures;
+
+ /**
+ * @linux_pkt_alloc_bytes: total bytes allocated in new packet buffers
+ * by the NIC driver because of packet cache underflows.
+ */
+ __u64 linux_pkt_alloc_bytes;
+
+ /**
+ * @dropped_data_no_bufs: total bytes of incoming data dropped because
+ * there was no application buffer space available.
+ */
+ __u64 dropped_data_no_bufs;
+
+ /**
+ * @gen3_handoffs: total number of handoffs from GRO to SoftIRQ made
+ * by Gen3 load balancer.
+ */
+ __u64 gen3_handoffs;
+
+ /**
+ * @gen3_alt_handoffs: total number of GRO->SoftIRQ handoffs that
+ * didn't choose the primary SoftIRQ core because it was busy with
+ * app threads.
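+ * (A secondary core is chosen instead so that SoftIRQ processing
+ * doesn't compete with the application threads.)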
+ */
+ __u64 gen3_alt_handoffs;
+
+ /**
+ * @gro_grant_bypasses: total number of GRANT packets passed directly
+ * to homa_softirq by homa_gro_receive, bypassing the normal SoftIRQ
+ * mechanism (triggered by HOMA_GRO_FAST_GRANTS).
+ */
+ __u64 gro_grant_bypasses;
+
+ /**
+ * @gro_data_bypasses: total number of DATA packets passed directly
+ * to homa_softirq by homa_gro_receive, bypassing the normal SoftIRQ
+ * mechanism (triggered by HOMA_GRO_SHORT_BYPASS).
+ */
+ __u64 gro_data_bypasses;
+
+ /** @temp: For temporary use during testing. */
+#define NUM_TEMP_METRICS 10
+ __u64 temp[NUM_TEMP_METRICS];
+};
+
+DECLARE_PER_CPU(struct homa_metrics, homa_metrics);
+
+/**
+ * homa_metrics_per_cpu() - Return the metrics structure for the current
+ * core. The access is unsynchronized and does not disable preemption.
+ */
+static inline struct homa_metrics *homa_metrics_per_cpu(void)
+{
+ return &per_cpu(homa_metrics, raw_smp_processor_id());
+}
+
+/* It isn't necessary to disable preemption here, because we don't need
+ * perfect synchronization: if the invoking thread is moved to a
+ * different core and races with an INC_METRIC there, the worst that
+ * happens is that one of the INC_METRICs is lost, which isn't a big deal.
+ */
+#define INC_METRIC(metric, count) per_cpu(homa_metrics, \
+ raw_smp_processor_id()).metric += (count)
+
+extern void homa_metric_append(struct homa *homa, const char *format, ...);
+extern loff_t homa_metrics_lseek(struct file *file, loff_t offset,
+ int whence);
+extern int homa_metrics_open(struct inode *inode, struct file *file);
+extern char *homa_metrics_print(struct homa *homa);
+extern ssize_t homa_metrics_read(struct file *file, char __user *buffer,
+ size_t length, loff_t *offset);
+extern int homa_metrics_release(struct inode *inode, struct file *file);
+extern int homa_proc_read_metrics(char *buffer, char **start, off_t offset,
+ int count, int *eof, void *data);
+
+#endif /* _HOMA_METRICS_H */
\ No newline at end of file
diff --git a/homa_offload.c b/homa_offload.c
index 55cb8a83..eb3353c8 100644
--- a/homa_offload.c
+++ b/homa_offload.c
@@ -262,6 +262,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list,
 int busy = (now - core->last_gro) < homa->gro_busy_cycles;
 __u32 hash;
 __u64 saved_softirq_metric, softirq_cycles;
+ __u64 *softirq_cycles_metric;
 struct data_header *h_new = (struct data_header *)
 skb_transport_header(skb);
 int priority;
@@ -413,10 +414,11 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list,
 /* Record SoftIRQ cycles in a different metric to reflect that
 * they happened during bypass.
 */
- saved_softirq_metric = core->metrics.softirq_cycles;
+ softirq_cycles_metric = &homa_metrics_per_cpu()->softirq_cycles;
+ saved_softirq_metric = *softirq_cycles_metric;
 homa_softirq(skb);
- softirq_cycles = core->metrics.softirq_cycles - saved_softirq_metric;
- core->metrics.softirq_cycles = saved_softirq_metric;
+ softirq_cycles = *softirq_cycles_metric - saved_softirq_metric;
+ *softirq_cycles_metric = saved_softirq_metric;
 INC_METRIC(bypass_softirq_cycles, softirq_cycles);
 
 core->last_gro = get_cycles();
diff --git a/homa_plumbing.c b/homa_plumbing.c
index 743cb8ad..a4d38eb0 100644
--- a/homa_plumbing.c
+++ b/homa_plumbing.c
@@ -1526,89 +1526,6 @@ __poll_t homa_poll(struct file *file, struct socket *sock,
 return mask;
 }
 
-/**
- * homa_metrics_open() - This function is invoked when /proc/net/homa_metrics is
- * opened.
- * @inode: The inode corresponding to the file.
- * @file: Information about the open file.
- *
- * Return: always 0.
- */
-int homa_metrics_open(struct inode *inode, struct file *file)
-{
- /* Collect all of the metrics when the file is opened, and save
- * these for use by subsequent reads (don't want the metrics to
- * change between reads). If there are concurrent opens on the
- * file, only read the metrics once, during the first open, and
- * use this copy for subsequent opens, until the file has been
- * completely closed.
- */
- spin_lock(&homa->metrics_lock);
- if (homa->metrics_active_opens == 0)
- homa_print_metrics(homa);
- homa->metrics_active_opens++;
- spin_unlock(&homa->metrics_lock);
- return 0;
-}
-
-/**
- * homa_metrics_read() - This function is invoked to handle read kernel calls on
- * /proc/net/homa_metrics.
- * @file: Information about the file being read.
- * @buffer: Address in user space of the buffer in which data from the file
- * should be returned.
- * @length: Number of bytes available at @buffer.
- * @offset: Current read offset within the file.
- *
- * Return: the number of bytes returned at @buffer. 0 means the end of the
- * file was reached, and a negative number indicates an error (-errno).
- */
-ssize_t homa_metrics_read(struct file *file, char __user *buffer,
- size_t length, loff_t *offset)
-{
- size_t copied;
-
- if (*offset >= homa->metrics_length)
- return 0;
- copied = homa->metrics_length - *offset;
- if (copied > length)
- copied = length;
- if (copy_to_user(buffer, homa->metrics + *offset, copied))
- return -EFAULT;
- *offset += copied;
- return copied;
-}
-
-
-/**
- * homa_metrics_lseek() - This function is invoked to handle seeks on
- * /proc/net/homa_metrics. Right now seeks are ignored: the file must be
- * read sequentially.
- * @file: Information about the file being read.
- * @offset: Distance to seek, in bytes
- * @whence: Starting point from which to measure the distance to seek.
- */
-loff_t homa_metrics_lseek(struct file *file, loff_t offset, int whence)
-{
- return 0;
-}
-
-/**
- * homa_metrics_release() - This function is invoked when the last reference to
- * an open /proc/net/homa_metrics is closed. It performs cleanup.
- * @inode: The inode corresponding to the file.
- * @file: Information about the open file.
- *
- * Return: always 0.
- */
-int homa_metrics_release(struct inode *inode, struct file *file)
-{
- spin_lock(&homa->metrics_lock);
- homa->metrics_active_opens--;
- spin_unlock(&homa->metrics_lock);
- return 0;
-}
-
 /**
  * homa_dointvec() - This function is a wrapper around proc_dointvec. It is
  * invoked to read and write sysctl values and also update other values
diff --git a/homa_timer.c b/homa_timer.c
index f6f4b555..aefd6588 100644
--- a/homa_timer.c
+++ b/homa_timer.c
@@ -169,7 +169,7 @@ void homa_timer(struct homa *homa)
 total_grants = 0;
 for (core = 0; core < nr_cpu_ids; core++) {
- struct homa_metrics *m = &homa_cores[core]->metrics;
+ struct homa_metrics *m = &per_cpu(homa_metrics, core);
 total_grants += m->packets_sent[GRANT-DATA];
 }
diff --git a/homa_utils.c b/homa_utils.c
index df9cd0ba..afecba53 100644
--- a/homa_utils.c
+++ b/homa_utils.c
@@ -88,7 +88,6 @@ int homa_init(struct homa *homa)
 core->page_inuse = 0;
 core->page_size = 0;
 core->num_stashed_pages = 0;
- memset(&core->metrics, 0, sizeof(core->metrics));
 }
 }
@@ -1477,446 +1476,6 @@ char *homa_symbol_for_type(uint8_t type)
 return buffer;
 }
-/**
- * homa_append_metric() - Formats a new metric and appends it to homa->metrics.
- * @homa: The new data will appended to the @metrics field of
- * this structure.
- * @format: Standard printf-style format string describing the - * new metric. Arguments after this provide the usual - * values expected for printf-like functions. - */ -void homa_append_metric(struct homa *homa, const char *format, ...) -{ - char *new_buffer; - size_t new_chars; - va_list ap; - - if (!homa->metrics) { -#ifdef __UNIT_TEST__ - homa->metrics_capacity = 30; -#else - homa->metrics_capacity = 4096; -#endif - homa->metrics = kmalloc(homa->metrics_capacity, GFP_KERNEL); - if (!homa->metrics) { - pr_warn("%s couldn't allocate memory\n", __func__); - return; - } - homa->metrics_length = 0; - } - - /* May have to execute this loop multiple times if we run out - * of space in homa->metrics; each iteration expands the storage, - * until eventually it is large enough. - */ - while (true) { - va_start(ap, format); - new_chars = vsnprintf(homa->metrics + homa->metrics_length, - homa->metrics_capacity - homa->metrics_length, - format, ap); - va_end(ap); - if ((homa->metrics_length + new_chars) < homa->metrics_capacity) - break; - - /* Not enough room; expand buffer capacity. */ - homa->metrics_capacity *= 2; - new_buffer = kmalloc(homa->metrics_capacity, GFP_KERNEL); - if (!new_buffer) { - pr_warn("%s couldn't allocate memory\n", __func__); - return; - } - memcpy(new_buffer, homa->metrics, homa->metrics_length); - kfree(homa->metrics); - homa->metrics = new_buffer; - } - homa->metrics_length += new_chars; -} - -/** - * homa_print_metrics() - Sample all of the Homa performance metrics and - * generate a human-readable string describing all of them. - * @homa: Overall data about the Homa protocol implementation; - * the formatted string will be stored in homa->metrics. - * - * Return: The formatted string. - */ -char *homa_print_metrics(struct homa *homa) -{ - int core, i, lower = 0; - - homa->metrics_length = 0; - homa_append_metric(homa, - "rdtsc_cycles %20llu RDTSC cycle counter when metrics were gathered\n", - get_cycles()); - homa_append_metric(homa, - "cpu_khz %15llu Clock rate for RDTSC counter, in khz\n", - cpu_khz); - for (core = 0; core < nr_cpu_ids; core++) { - struct homa_metrics *m = &homa_cores[core]->metrics; - __s64 delta; - - homa_append_metric(homa, - "core %15d Core id for following metrics\n", - core); - for (i = 0; i < HOMA_NUM_SMALL_COUNTS; i++) { - homa_append_metric(homa, - "msg_bytes_%-9d %15llu Bytes in incoming messages containing %d-%d bytes\n", - (i+1)*64, m->small_msg_bytes[i], lower, - (i+1)*64); - lower = (i+1)*64 + 1; - } - for (i = (HOMA_NUM_SMALL_COUNTS*64)/1024; - i < HOMA_NUM_MEDIUM_COUNTS; i++) { - homa_append_metric(homa, - "msg_bytes_%-9d %15llu Bytes in incoming messages containing %d-%d bytes\n", - (i+1)*1024, m->medium_msg_bytes[i], lower, - (i+1)*1024); - lower = (i+1)*1024 + 1; - } - homa_append_metric(homa, - "large_msg_count %15llu # of incoming messages >= %d bytes\n", - m->large_msg_count, lower); - homa_append_metric(homa, - "large_msg_bytes %15llu Bytes in incoming messages >= %d bytes\n", - m->large_msg_bytes, lower); - homa_append_metric(homa, - "sent_msg_bytes %15llu otal bytes in all outgoing messages\n", - m->sent_msg_bytes); - for (i = DATA; i < BOGUS; i++) { - char *symbol = homa_symbol_for_type(i); - - homa_append_metric(homa, - "packets_sent_%-7s %15llu %s packets sent\n", - symbol, m->packets_sent[i-DATA], - symbol); - } - for (i = DATA; i < BOGUS; i++) { - char *symbol = homa_symbol_for_type(i); - - homa_append_metric(homa, - "packets_rcvd_%-7s %15llu %s packets received\n", - symbol, m->packets_received[i-DATA], - symbol); - } 
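
The msg_bytes loops above decode a three-tier histogram: 64-byte buckets for small messages, 1-KB buckets for medium ones, and a single catch-all beyond that. A sketch of the recording side, with bucket boundaries inferred from the printing loops (in Homa this accounting happens when an incoming message is initialized):

    /* Consistent with the loops above: bucket i of small_msg_bytes
     * covers lengths up to (i+1)*64, bucket i of medium_msg_bytes
     * covers lengths up to (i+1)*1024, and everything larger lands in
     * the "large" counters.
     */
    static void record_msg_length(struct homa_metrics *m, int length)
    {
            if (length <= HOMA_NUM_SMALL_COUNTS * 64)
                    m->small_msg_bytes[(length - 1) >> 6] += length;
            else if (length <= HOMA_NUM_MEDIUM_COUNTS * 1024)
                    m->medium_msg_bytes[(length - 1) >> 10] += length;
            else {
                    m->large_msg_count++;
                    m->large_msg_bytes += length;
            }
    }

For example, a 0x3000-byte (12288-byte) message adds 0x3000 to medium_msg_bytes[11], which matches the expectations in unit_homa_incoming.c below.
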
- for (i = 0; i < HOMA_MAX_PRIORITIES; i++) { - homa_append_metric(homa, - "priority%d_bytes %15llu Bytes sent at priority %d (including headers)\n", - i, m->priority_bytes[i], i); - } - for (i = 0; i < HOMA_MAX_PRIORITIES; i++) { - homa_append_metric(homa, - "priority%d_packets %15llu Packets sent at priority %d\n", - i, m->priority_packets[i], i); - } - homa_append_metric(homa, - "skb_allocs %15llu sk_buffs allocated\n", - m->skb_allocs); - homa_append_metric(homa, - "skb_alloc_cycles %15llu Time spent allocating sk_buffs\n", - m->skb_alloc_cycles); - homa_append_metric(homa, - "skb_frees %15llu Data sk_buffs freed in normal paths\n", - m->skb_frees); - homa_append_metric(homa, - "skb_free_cycles %15llu Time spent freeing data sk_buffs\n", - m->skb_free_cycles); - homa_append_metric(homa, - "skb_page_allocs %15llu Pages allocated for sk_buff frags\n", - m->skb_page_allocs); - homa_append_metric(homa, - "skb_page_alloc_cycles %15llu Time spent allocating pages for sk_buff frags\n", - m->skb_page_alloc_cycles); - homa_append_metric(homa, - "requests_received %15llu Incoming request messages\n", - m->requests_received); - homa_append_metric(homa, - "requests_queued %15llu Requests for which no thread was waiting\n", - m->requests_queued); - homa_append_metric(homa, - "responses_received %15llu Incoming response messages\n", - m->responses_received); - homa_append_metric(homa, - "responses_queued %15llu Responses for which no thread was waiting\n", - m->responses_queued); - homa_append_metric(homa, - "fast_wakeups %15llu Messages received while polling\n", - m->fast_wakeups); - homa_append_metric(homa, - "slow_wakeups %15llu Messages received after thread went to sleep\n", - m->slow_wakeups); - homa_append_metric(homa, - "handoffs_thread_waiting %15llu RPC handoffs to waiting threads (vs. queue)\n", - m->handoffs_thread_waiting); - homa_append_metric(homa, - "handoffs_alt_thread %15llu RPC handoffs not to first on list (avoid busy core)\n", - m->handoffs_alt_thread); - homa_append_metric(homa, - "poll_cycles %15llu Time spent polling for incoming messages\n", - m->poll_cycles); - homa_append_metric(homa, - "softirq_calls %15llu Calls to homa_softirq (i.e. # GRO pkts received)\n", - m->softirq_calls); - homa_append_metric(homa, - "softirq_cycles %15llu Time spent in homa_softirq during SoftIRQ\n", - m->softirq_cycles); - homa_append_metric(homa, - "bypass_softirq_cycles %15llu Time spent in homa_softirq during bypass from GRO\n", - m->bypass_softirq_cycles); - homa_append_metric(homa, - "linux_softirq_cycles %15llu Time spent in all Linux SoftIRQ\n", - m->linux_softirq_cycles); - homa_append_metric(homa, - "napi_cycles %15llu Time spent in NAPI-level packet handling\n", - m->napi_cycles); - homa_append_metric(homa, - "send_cycles %15llu Time spent in homa_sendmsg for requests\n", - m->send_cycles); - homa_append_metric(homa, - "send_calls %15llu Total invocations of homa_sendmsg for equests\n", - m->send_calls); - // It is possible for us to get here at a time when a - // thread has been blocked for a long time and has - // recorded blocked_cycles, but hasn't finished the - // system call so recv_cycles hasn't been incremented - // yet. If that happens, just record 0 to prevent - // underflow errors. 
- delta = m->recv_cycles - m->blocked_cycles; - if (delta < 0) - delta = 0; - homa_append_metric(homa, - "recv_cycles %15llu Unblocked time spent in recvmsg kernel call\n", - delta); - homa_append_metric(homa, - "recv_calls %15llu Total invocations of recvmsg kernel call\n", - m->recv_calls); - homa_append_metric(homa, - "blocked_cycles %15llu Time spent blocked in homa_recvmsg\n", - m->blocked_cycles); - homa_append_metric(homa, - "reply_cycles %15llu Time spent in homa_sendmsg for responses\n", - m->reply_cycles); - homa_append_metric(homa, - "reply_calls %15llu Total invocations of homa_sendmsg for responses\n", - m->reply_calls); - homa_append_metric(homa, - "abort_cycles %15llu Time spent in homa_ioc_abort kernel call\n", - m->reply_cycles); - homa_append_metric(homa, - "abort_calls %15llu Total invocations of abort kernel call\n", - m->reply_calls); - homa_append_metric(homa, - "so_set_buf_cycles %15llu Time spent in setsockopt SO_HOMA_SET_BUF\n", - m->so_set_buf_cycles); - homa_append_metric(homa, - "so_set_buf_calls %15llu Total invocations of setsockopt SO_HOMA_SET_BUF\n", - m->so_set_buf_calls); - homa_append_metric(homa, - "grantable_lock_cycles %15llu Time spent with homa->grantable_lock locked\n", - m->grantable_lock_cycles); - homa_append_metric(homa, - "timer_cycles %15llu Time spent in homa_timer\n", - m->timer_cycles); - homa_append_metric(homa, - "timer_reap_cycles %15llu Time in homa_timer spent reaping RPCs\n", - m->timer_reap_cycles); - homa_append_metric(homa, - "data_pkt_reap_cycles %15llu Time in homa_data_pkt spent reaping RPCs\n", - m->data_pkt_reap_cycles); - homa_append_metric(homa, - "pacer_cycles %15llu Time spent in homa_pacer_main\n", - m->pacer_cycles); - homa_append_metric(homa, - "homa_cycles %15llu Total time in all Homa-related functions\n", - m->softirq_cycles + m->napi_cycles + - m->send_cycles + m->recv_cycles + - m->reply_cycles - m->blocked_cycles + - m->timer_cycles + m->pacer_cycles); - homa_append_metric(homa, - "pacer_lost_cycles %15llu Lost transmission time because pacer was slow\n", - m->pacer_lost_cycles); - homa_append_metric(homa, - "pacer_bytes %15llu Bytes transmitted when the pacer was active\n", - m->pacer_bytes); - homa_append_metric(homa, - "pacer_skipped_rpcs %15llu Pacer aborts because of locked RPCs\n", - m->pacer_skipped_rpcs); - homa_append_metric(homa, - "pacer_needed_help %15llu homa_pacer_xmit invocations from homa_check_pacer\n", - m->pacer_needed_help); - homa_append_metric(homa, - "throttled_cycles %15llu Time when the throttled queue was nonempty\n", - m->throttled_cycles); - homa_append_metric(homa, - "resent_packets %15llu DATA packets sent in response to RESENDs\n", - m->resent_packets); - homa_append_metric(homa, - "peer_hash_links %15llu Hash chain link traversals in peer table\n", - m->peer_hash_links); - homa_append_metric(homa, - "peer_new_entries %15llu New entries created in peer table\n", - m->peer_new_entries); - homa_append_metric(homa, - "peer_kmalloc_errors %15llu kmalloc failures creating peer table entries\n", - m->peer_kmalloc_errors); - homa_append_metric(homa, - "peer_route_errors %15llu Routing failures creating peer table entries\n", - m->peer_route_errors); - homa_append_metric(homa, - "control_xmit_errors %15llu Errors sending control packets\n", - m->control_xmit_errors); - homa_append_metric(homa, - "data_xmit_errors %15llu Errors sending data packets\n", - m->data_xmit_errors); - homa_append_metric(homa, - "unknown_rpcs %15llu Non-grant packets discarded because RPC unknown\n", - 
m->unknown_rpcs); - homa_append_metric(homa, - "server_cant_create_rpcs %15llu Packets discarded because server couldn't create RPC\n", - m->server_cant_create_rpcs); - homa_append_metric(homa, - "unknown_packet_types %15llu Packets discarded because of unsupported type\n", - m->unknown_packet_types); - homa_append_metric(homa, - "short_packets %15llu Packets discarded because too short\n", - m->short_packets); - homa_append_metric(homa, - "packet_discards %15llu Non-resent packets discarded because data already received\n", - m->packet_discards); - homa_append_metric(homa, - "resent_discards %15llu Resent packets discarded because data already received\n", - m->resent_discards); - homa_append_metric(homa, - "resent_packets_used %15llu Retransmitted packets that were actually used\n", - m->resent_packets_used); - homa_append_metric(homa, - "rpc_timeouts %15llu RPCs aborted because peer was nonresponsive\n", - m->rpc_timeouts); - homa_append_metric(homa, - "server_rpc_discards %15llu RPCs discarded by server because of errors\n", - m->server_rpc_discards); - homa_append_metric(homa, - "server_rpcs_unknown %15llu RPCs aborted by server because unknown to client\n", - m->server_rpcs_unknown); - homa_append_metric(homa, - "client_lock_misses %15llu Bucket lock misses for client RPCs\n", - m->client_lock_misses); - homa_append_metric(homa, - "client_lock_miss_cycles %15llu Time lost waiting for client bucket locks\n", - m->client_lock_miss_cycles); - homa_append_metric(homa, - "server_lock_misses %15llu Bucket lock misses for server RPCs\n", - m->server_lock_misses); - homa_append_metric(homa, - "server_lock_miss_cycles %15llu Time lost waiting for server bucket locks\n", - m->server_lock_miss_cycles); - homa_append_metric(homa, - "socket_lock_misses %15llu Socket lock misses\n", - m->socket_lock_misses); - homa_append_metric(homa, - "socket_lock_miss_cycles %15llu Time lost waiting for socket locks\n", - m->socket_lock_miss_cycles); - homa_append_metric(homa, - "throttle_lock_misses %15llu Throttle lock misses\n", - m->throttle_lock_misses); - homa_append_metric(homa, - "throttle_lock_miss_cycles %15llu Time lost waiting for throttle locks\n", - m->throttle_lock_miss_cycles); - homa_append_metric(homa, - "peer_ack_lock_misses %15llu Misses on peer ack locks\n", - m->peer_ack_lock_misses); - homa_append_metric(homa, - "peer_ack_lock_miss_cycles %15llu Time lost waiting for peer ack locks\n", - m->peer_ack_lock_miss_cycles); - homa_append_metric(homa, - "grantable_lock_misses %15llu Grantable lock misses\n", - m->grantable_lock_misses); - homa_append_metric(homa, - "grantable_lock_miss_cycles%15llu Time lost waiting for grantable lock\n", - m->grantable_lock_miss_cycles); - homa_append_metric(homa, - "grantable_rpcs_integral %15llu Integral of homa->num_grantable_rpcs*dt\n", - m->grantable_rpcs_integral); - homa_append_metric(homa, - "grant_recalc_calls %15llu Number of calls to homa_grant_recalc\n", - m->grant_recalc_calls); - homa_append_metric(homa, - "grant_recalc_cycles %15llu Time spent in homa_grant_recalc\n", - m->grant_recalc_cycles); - homa_append_metric(homa, - "grant_recalc_skips %15llu Number of times homa_grant_recalc skipped redundant work\n", - m->grant_recalc_skips); - homa_append_metric(homa, - "grant_recalc_loops %15llu Number of times homa_grant_recalc looped back\n", - m->grant_recalc_loops); - homa_append_metric(homa, - "grant_priority_bumps %15llu Number of times an RPC moved up in the grant priority order\n", - m->grant_priority_bumps); - homa_append_metric(homa, - 
"fifo_grants %15llu Grants issued using FIFO priority\n", - m->fifo_grants); - homa_append_metric(homa, - "fifo_grants_no_incoming %15llu FIFO grants to messages with no outstanding grants\n", - m->fifo_grants_no_incoming); - homa_append_metric(homa, - "disabled_reaps %15llu Reaper invocations that were disabled\n", - m->disabled_reaps); - homa_append_metric(homa, - "disabled_rpc_reaps %15llu Disabled RPCs skipped by reaper\n", - m->disabled_rpc_reaps); - homa_append_metric(homa, - "reaper_calls %15llu Reaper invocations that were not disabled\n", - m->reaper_calls); - homa_append_metric(homa, - "reaper_dead_skbs %15llu Sum of hsk->dead_skbs across all reaper alls\n", - m->reaper_dead_skbs); - homa_append_metric(homa, - "forced_reaps %15llu Reaps forced by accumulation of dead RPCs\n", - m->forced_reaps); - homa_append_metric(homa, - "throttle_list_adds %15llu Calls to homa_add_to_throttled\n", - m->throttle_list_adds); - homa_append_metric(homa, - "throttle_list_checks %15llu List elements checked in homa_add_to_throttled\n", - m->throttle_list_checks); - homa_append_metric(homa, - "ack_overflows %15llu Explicit ACKs sent because peer->acks was full\n", - m->ack_overflows); - homa_append_metric(homa, - "ignored_need_acks %15llu NEED_ACKs ignored because RPC result not yet received\n", - m->ignored_need_acks); - homa_append_metric(homa, - "bpage_reuses %15llu Buffer page could be reused because ref count was zero\n", - m->bpage_reuses); - homa_append_metric(homa, - "buffer_alloc_failures %15llu homa_pool_allocate didn't find enough buffer space for an RPC\n", - m->buffer_alloc_failures); - homa_append_metric(homa, - "linux_pkt_alloc_bytes %15llu Bytes allocated in new packets by NIC driver due to cache overflows\n", - m->linux_pkt_alloc_bytes); - homa_append_metric(homa, - "dropped_data_no_bufs %15llu Data bytes dropped because app buffers full\n", - m->dropped_data_no_bufs); - homa_append_metric(homa, - "gen3_handoffs %15llu GRO->SoftIRQ handoffs made by Gen3 balancer\n", - m->gen3_handoffs); - homa_append_metric(homa, - "gen3_alt_handoffs %15llu Gen3 handoffs to secondary core (primary was busy)\n", - m->gen3_alt_handoffs); - homa_append_metric(homa, - "gro_grant_bypasses %15llu Grant packets passed directly to homa_softirq by homa_gro_receive\n", - m->gro_grant_bypasses); - homa_append_metric(homa, - "gro_data_bypasses %15llu Data packets passed directly to homa_softirq by homa_gro_receive\n", - m->gro_data_bypasses); - for (i = 0; i < NUM_TEMP_METRICS; i++) - homa_append_metric(homa, - "temp%-2d %15llu Temporary use in testing\n", - i, m->temp[i]); - } - - return homa->metrics; -} - /** * homa_prios_changed() - This function is called whenever configuration * information related to priorities, such as @homa->unsched_cutoffs or diff --git a/homa_wire.h b/homa_wire.h index a6314db3..da9c41b4 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -5,6 +5,8 @@ #ifndef _HOMA_WIRE_H #define _HOMA_WIRE_H +#include + /** * enum homa_packet_type - Defines the possible types of Homa packets. 
* diff --git a/test/Makefile b/test/Makefile index 8571b6ba..bf03aa9b 100644 --- a/test/Makefile +++ b/test/Makefile @@ -40,6 +40,7 @@ CCFLAGS := -std=c++11 $(WARNS) -MD -g $(CCINCLUDES) $(DEFS) -fsanitize=address TEST_SRCS := unit_homa_grant.c \ unit_homa_incoming.c \ unit_homa_offload.c \ + unit_homa_metrics.c \ unit_homa_outgoing.c \ unit_homa_peertab.c \ unit_homa_pool.c \ @@ -53,6 +54,7 @@ TEST_OBJS := $(patsubst %.c,%.o,$(TEST_SRCS)) HOMA_SRCS := homa_grant.c \ homa_incoming.c \ + homa_metrics.c \ homa_offload.c \ homa_outgoing.c \ homa_peertab.c \ diff --git a/test/mock.c b/test/mock.c index 686e9024..88509416 100644 --- a/test/mock.c +++ b/test/mock.c @@ -1542,6 +1542,8 @@ void mock_teardown(void) mock_active_rcu_locks); mock_active_rcu_locks = 0; + memset(homa_metrics, 0, sizeof(homa_metrics)); + unit_hook_clear(); } diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index ce97e7d1..6e8d51ab 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -192,7 +192,7 @@ TEST_F(homa_grant, homa_grant_add_rpc__update_metrics) mock_cycles = 200; test_rpc(self, 100, self->server_ip, 100000); EXPECT_EQ(4, self->homa.num_grantable_rpcs); - EXPECT_EQ(300, core_metrics.grantable_rpcs_integral); + EXPECT_EQ(300, homa_metrics_per_cpu()->grantable_rpcs_integral); EXPECT_EQ(200, self->homa.last_grantable_change); } TEST_F(homa_grant, homa_grant_add_rpc__insert_in_peer_list) @@ -335,7 +335,7 @@ TEST_F(homa_grant, homa_grant_remove_rpc__update_metrics) homa_grant_remove_rpc(rpc); EXPECT_EQ(2, self->homa.num_grantable_rpcs); - EXPECT_EQ(300, core_metrics.grantable_rpcs_integral); + EXPECT_EQ(300, homa_metrics_per_cpu()->grantable_rpcs_integral); EXPECT_EQ(200, self->homa.last_grantable_change); } TEST_F(homa_grant, homa_grant_remove_rpc__not_first_in_peer_list) @@ -739,7 +739,7 @@ TEST_F(homa_grant, homa_grant_recalc__basics) EXPECT_EQ(2, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(-1, atomic_read(&rpc4->msgin.rank)); - EXPECT_NE(0, core_metrics.grant_recalc_cycles); + EXPECT_NE(0, homa_metrics_per_cpu()->grant_recalc_cycles); } TEST_F(homa_grant, homa_grant_recalc__already_locked) { @@ -763,7 +763,7 @@ TEST_F(homa_grant, homa_grant_recalc__skip_recalc) EXPECT_STREQ("", unit_log_get()); EXPECT_EQ(0, rpc->msgin.granted); EXPECT_EQ(2, atomic_read(&self->homa.grant_recalc_count)); - EXPECT_EQ(1, core_metrics.grant_recalc_skips); + EXPECT_EQ(1, homa_metrics_per_cpu()->grant_recalc_skips); } TEST_F(homa_grant, homa_grant_recalc__clear_existing_active_rpcs) { @@ -871,14 +871,14 @@ TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted_but_skip_recalc) unit_hook_register(grantable_spinlock_hook); hook_homa = &self->homa; mock_trylock_errors = 0xfe0; - EXPECT_EQ(0, core_metrics.grant_recalc_skips); + EXPECT_EQ(0, homa_metrics_per_cpu()->grant_recalc_skips); homa_grant_recalc(&self->homa, 0); EXPECT_EQ(10000, rpc1->msgin.granted); EXPECT_EQ(10000, rpc2->msgin.granted); EXPECT_EQ(0, rpc3->msgin.granted); EXPECT_EQ(0, rpc4->msgin.granted); - EXPECT_EQ(1, core_metrics.grant_recalc_skips); + EXPECT_EQ(1, homa_metrics_per_cpu()->grant_recalc_skips); } TEST_F(homa_grant, homa_grant_pick_rpcs__basics) @@ -1073,8 +1073,8 @@ TEST_F(homa_grant, homa_grantable_lock_slow__basics) EXPECT_EQ(1, homa_grantable_lock_slow(&self->homa, 0)); homa_grantable_unlock(&self->homa); - EXPECT_EQ(1, core_metrics.grantable_lock_misses); - EXPECT_EQ(500, core_metrics.grantable_lock_miss_cycles); + EXPECT_EQ(1, homa_metrics_per_cpu()->grantable_lock_misses); + EXPECT_EQ(500, 
homa_metrics_per_cpu()->grantable_lock_miss_cycles); } TEST_F(homa_grant, homa_grantable_lock_slow__recalc_count) { @@ -1086,12 +1086,12 @@ TEST_F(homa_grant, homa_grantable_lock_slow__recalc_count) EXPECT_EQ(0, homa_grantable_lock_slow(&self->homa, 1)); hook_homa = NULL; - EXPECT_EQ(1, core_metrics.grantable_lock_misses); - EXPECT_EQ(500, core_metrics.grantable_lock_miss_cycles); + EXPECT_EQ(1, homa_metrics_per_cpu()->grantable_lock_misses); + EXPECT_EQ(500, homa_metrics_per_cpu()->grantable_lock_miss_cycles); /* Make sure the check only occurs if the recalc argument is set. */ mock_trylock_errors = 0xff; EXPECT_EQ(1, homa_grantable_lock_slow(&self->homa, 0)); - EXPECT_EQ(2, core_metrics.grantable_lock_misses); + EXPECT_EQ(2, homa_metrics_per_cpu()->grantable_lock_misses); homa_grantable_unlock(&self->homa); } \ No newline at end of file diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 4b7f306a..dbfc6f02 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -222,11 +222,11 @@ TEST_F(homa_incoming, homa_message_in_init__update_metrics) EXPECT_EQ(0, homa_message_in_init(crpc, 0x3000, 0)); EXPECT_EQ(0, homa_message_in_init(crpc, 1000000, 0)); EXPECT_EQ(0, homa_message_in_init(crpc, 900000, 0)); - EXPECT_EQ(270, core_metrics.small_msg_bytes[2]); - EXPECT_EQ(0xfff, core_metrics.small_msg_bytes[63]); - EXPECT_EQ(0x3000, core_metrics.medium_msg_bytes[11]); - EXPECT_EQ(0, core_metrics.medium_msg_bytes[15]); - EXPECT_EQ(1900000, core_metrics.large_msg_bytes); + EXPECT_EQ(270, homa_metrics_per_cpu()->small_msg_bytes[2]); + EXPECT_EQ(0xfff, homa_metrics_per_cpu()->small_msg_bytes[63]); + EXPECT_EQ(0x3000, homa_metrics_per_cpu()->medium_msg_bytes[11]); + EXPECT_EQ(0, homa_metrics_per_cpu()->medium_msg_bytes[15]); + EXPECT_EQ(1900000, homa_metrics_per_cpu()->large_msg_bytes); } TEST_F(homa_incoming, homa_gap_retry) @@ -567,21 +567,21 @@ TEST_F(homa_incoming, homa_add_packet__metrics) homa_add_packet(crpc, mock_skb_new(self->client_ip, &self->data.common, 1400, 0)); EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); - EXPECT_EQ(0, core_metrics.resent_discards); - EXPECT_EQ(1, core_metrics.packet_discards); + EXPECT_EQ(0, homa_metrics_per_cpu()->resent_discards); + EXPECT_EQ(1, homa_metrics_per_cpu()->packet_discards); self->data.retransmit = 1; homa_add_packet(crpc, mock_skb_new(self->client_ip, &self->data.common, 1400, 0)); EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); - EXPECT_EQ(1, core_metrics.resent_discards); - EXPECT_EQ(1, core_metrics.packet_discards); + EXPECT_EQ(1, homa_metrics_per_cpu()->resent_discards); + EXPECT_EQ(1, homa_metrics_per_cpu()->packet_discards); self->data.seg.offset = htonl(4200); homa_add_packet(crpc, mock_skb_new(self->client_ip, &self->data.common, 1400, 4200)); EXPECT_EQ(1, skb_queue_len(&crpc->msgin.packets)); - EXPECT_EQ(1, core_metrics.resent_packets_used); + EXPECT_EQ(1, homa_metrics_per_cpu()->resent_packets_used); } TEST_F(homa_incoming, homa_copy_to_user__basics) @@ -859,7 +859,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__cant_create_server_rpc) 1400, 0), &self->homa); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); EXPECT_EQ(0, mock_skb_count()); - EXPECT_EQ(1, core_metrics.server_cant_create_rpcs); + EXPECT_EQ(1, homa_metrics_per_cpu()->server_cant_create_rpcs); } TEST_F(homa_incoming, homa_dispatch_pkts__existing_server_rpc) { @@ -922,7 +922,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_client_rpc) mock_xmit_log_verbose = 1; homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), 
&self->homa); - EXPECT_EQ(1, core_metrics.unknown_rpcs); + EXPECT_EQ(1, homa_metrics_per_cpu()->unknown_rpcs); } TEST_F(homa_incoming, homa_dispatch_pkts__unknown_server_rpc) { @@ -933,7 +933,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_server_rpc) mock_xmit_log_verbose = 1; homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), &self->homa); - EXPECT_EQ(0, core_metrics.unknown_rpcs); + EXPECT_EQ(0, homa_metrics_per_cpu()->unknown_rpcs); } TEST_F(homa_incoming, homa_dispatch_pkts__cutoffs_for_unknown_client_rpc) { @@ -1009,7 +1009,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_type) .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = 99}; homa_dispatch_pkts(mock_skb_new(self->client_ip, &h, 0, 0), &self->homa); - EXPECT_EQ(1, core_metrics.unknown_packet_types); + EXPECT_EQ(1, homa_metrics_per_cpu()->unknown_packet_types); } TEST_F(homa_incoming, homa_dispatch_pkts__handle_ack) { @@ -1077,7 +1077,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__forced_reap) homa_dispatch_pkts(mock_skb_new(self->client_ip, &self->data.common, 1400, 0), &self->homa); EXPECT_EQ(31, self->hsk.dead_skbs); - EXPECT_EQ(0, core_metrics.data_pkt_reap_cycles); + EXPECT_EQ(0, homa_metrics_per_cpu()->data_pkt_reap_cycles); /* Second packet: must reap. */ self->homa.dead_buffs_limit = 15; @@ -1085,7 +1085,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__forced_reap) homa_dispatch_pkts(mock_skb_new(self->client_ip, &self->data.common, 1400, 0), &self->homa); EXPECT_EQ(21, self->hsk.dead_skbs); - EXPECT_NE(0, core_metrics.data_pkt_reap_cycles); + EXPECT_NE(0, homa_metrics_per_cpu()->data_pkt_reap_cycles); } TEST_F(homa_incoming, homa_data_pkt__basics) @@ -1104,7 +1104,7 @@ TEST_F(homa_incoming, homa_data_pkt__basics) EXPECT_EQ(200, crpc->msgin.bytes_remaining); EXPECT_EQ(1, skb_queue_len(&crpc->msgin.packets)); EXPECT_EQ(1600, crpc->msgin.granted); - EXPECT_EQ(1, core_metrics.responses_received); + EXPECT_EQ(1, homa_metrics_per_cpu()->responses_received); } TEST_F(homa_incoming, homa_data_pkt__wrong_client_rpc_state) { @@ -1169,7 +1169,7 @@ TEST_F(homa_incoming, homa_data_pkt__no_buffers) atomic_set(&self->hsk.buffer_pool.free_bpages, 0); homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, 1400, 0), crpc); - EXPECT_EQ(1400, core_metrics.dropped_data_no_bufs); + EXPECT_EQ(1400, homa_metrics_per_cpu()->dropped_data_no_bufs); EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); } TEST_F(homa_incoming, homa_data_pkt__update_delta) @@ -1598,7 +1598,7 @@ TEST_F(homa_incoming, homa_cutoffs__cant_find_peer) struct sk_buff *skb = mock_skb_new(self->server_ip, &h.common, 0, 0); mock_kmalloc_errors = 1; homa_cutoffs_pkt(skb, &self->hsk); - EXPECT_EQ(1, core_metrics.peer_kmalloc_errors); + EXPECT_EQ(1, homa_metrics_per_cpu()->peer_kmalloc_errors); peer = homa_peer_find(&self->homa.peers, self->server_ip, &self->hsk.inet); ASSERT_FALSE(IS_ERR(peer)); @@ -1622,7 +1622,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_fully_received) &self->homa); EXPECT_STREQ("xmit ACK from 0.0.0.0:32768, dport 99, id 1234, acks", unit_log_get()); - EXPECT_EQ(1, core_metrics.packets_received[ + EXPECT_EQ(1, homa_metrics_per_cpu()->packets_received[ NEED_ACK - DATA]); } TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_not_fully_received) @@ -1641,7 +1641,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_not_fully_received) homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), &self->homa); EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(1, 
core_metrics.packets_received[ + EXPECT_EQ(1, homa_metrics_per_cpu()->packets_received[ NEED_ACK - DATA]); } TEST_F(homa_incoming, homa_need_ack_pkt__rpc_not_incoming) @@ -1660,7 +1660,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_not_incoming) homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), &self->homa); EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(1, core_metrics.packets_received[ + EXPECT_EQ(1, homa_metrics_per_cpu()->packets_received[ NEED_ACK - DATA]); } TEST_F(homa_incoming, homa_need_ack_pkt__rpc_doesnt_exist) @@ -1701,7 +1701,7 @@ TEST_F(homa_incoming, homa_ack_pkt__target_rpc_exists) homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_EQ(0, unit_list_length(&self->hsk2.active_rpcs)); - EXPECT_EQ(1, core_metrics.packets_received[ACK - DATA]); + EXPECT_EQ(1, homa_metrics_per_cpu()->packets_received[ACK - DATA]); } TEST_F(homa_incoming, homa_ack_pkt__target_rpc_doesnt_exist) { diff --git a/test/unit_homa_metrics.c b/test/unit_homa_metrics.c new file mode 100644 index 00000000..dc5d5044 --- /dev/null +++ b/test/unit_homa_metrics.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: BSD-2-Clause + +#include "homa_impl.h" +#define KSELFTEST_NOT_MAIN 1 +#include "kselftest_harness.h" +#include "ccutils.h" +#include "mock.h" +#include "utils.h" + +extern struct homa *homa; + +FIXTURE(homa_metrics) { + struct homa homa; +}; +FIXTURE_SETUP(homa_metrics) +{ + homa_init(&self->homa); + homa = &self->homa; +} +FIXTURE_TEARDOWN(homa_metrics) +{ + homa = NULL; + homa_destroy(&self->homa); + unit_teardown(); +} + +TEST_F(homa_metrics, homa_metric_append) +{ + self->homa.metrics_length = 0; + homa_metric_append(&self->homa, "x: %d, y: %d", 10, 20); + EXPECT_EQ(12, self->homa.metrics_length); + EXPECT_STREQ("x: 10, y: 20", self->homa.metrics); + + homa_metric_append(&self->homa, ", z: %d", 12345); + EXPECT_EQ(22, self->homa.metrics_length); + EXPECT_STREQ("x: 10, y: 20, z: 12345", self->homa.metrics); + EXPECT_EQ(30, self->homa.metrics_capacity); + + homa_metric_append(&self->homa, ", q: %050d", 88); + EXPECT_EQ(77, self->homa.metrics_length); + EXPECT_STREQ("x: 10, y: 20, z: 12345, " + "q: 00000000000000000000000000000000000000000000000088", + self->homa.metrics); + EXPECT_EQ(120, self->homa.metrics_capacity); +} +TEST_F(homa_metrics, homa_metrics_open) +{ + EXPECT_EQ(0, homa_metrics_open(NULL, NULL)); + EXPECT_NE(NULL, self->homa.metrics); + + strcpy(self->homa.metrics, "12345"); + EXPECT_EQ(0, homa_metrics_open(NULL, NULL)); + EXPECT_EQ(5, strlen(self->homa.metrics)); + EXPECT_EQ(2, self->homa.metrics_active_opens); +} +TEST_F(homa_metrics, homa_metrics_read__basics) +{ + char buffer[1000]; + loff_t offset = 10; + self->homa.metrics = kmalloc(100, GFP_KERNEL); + self->homa.metrics_capacity = 100; + strcpy(self->homa.metrics, "0123456789abcdefghijklmnop"); + self->homa.metrics_length = 26; + EXPECT_EQ(5, homa_metrics_read(NULL, buffer, 5, &offset)); + EXPECT_SUBSTR("_copy_to_user copied 5 bytes", unit_log_get()); + EXPECT_EQ(15, offset); + + unit_log_clear(); + EXPECT_EQ(11, homa_metrics_read(NULL, buffer, 1000, &offset)); + EXPECT_SUBSTR("_copy_to_user copied 11 bytes", unit_log_get()); + EXPECT_EQ(26, offset); + + unit_log_clear(); + EXPECT_EQ(0, homa_metrics_read(NULL, buffer, 1000, &offset)); + EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(26, offset); +} +TEST_F(homa_metrics, homa_metrics_read__error_copying_to_user) +{ + char buffer[1000]; + loff_t offset = 10; + self->homa.metrics = kmalloc(100, GFP_KERNEL); + 
self->homa.metrics_capacity = 100; + strcpy(self->homa.metrics, "0123456789abcdefghijklmnop"); + self->homa.metrics_length = 26; + mock_copy_to_user_errors = 1; + EXPECT_EQ(EFAULT, -homa_metrics_read(NULL, buffer, 5, &offset)); +} + +TEST_F(homa_metrics, homa_metrics_release) +{ + self->homa.metrics_active_opens = 2; + EXPECT_EQ(0, homa_metrics_release(NULL, NULL)); + EXPECT_EQ(1, self->homa.metrics_active_opens); + + EXPECT_EQ(0, homa_metrics_release(NULL, NULL)); + EXPECT_EQ(0, self->homa.metrics_active_opens); +} \ No newline at end of file diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index 59deb723..3b90788e 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -267,7 +267,7 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS) skb = mock_skb_new(&self->ip, &h.common, 1400, 2000); struct sk_buff *result = homa_gro_receive(&self->empty_list, skb); EXPECT_EQ(0, -PTR_ERR(result)); - EXPECT_EQ(0, core_metrics.gro_data_bypasses); + EXPECT_EQ(0, homa_metrics_per_cpu()->gro_data_bypasses); /* Second attempt: HOMA_GRO_SHORT_BYPASS enabled but message longer * than one packet. @@ -277,7 +277,7 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS) skb2 = mock_skb_new(&self->ip, &h.common, 1400, 2000); result = homa_gro_receive(&self->empty_list, skb2); EXPECT_EQ(0, -PTR_ERR(result)); - EXPECT_EQ(0, core_metrics.gro_data_bypasses); + EXPECT_EQ(0, homa_metrics_per_cpu()->gro_data_bypasses); /* Third attempt: bypass should happen. */ h.message_length = htonl(1400); @@ -286,14 +286,14 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS) skb3 = mock_skb_new(&self->ip, &h.common, 1400, 4000); result = homa_gro_receive(&self->empty_list, skb3); EXPECT_EQ(EINPROGRESS, -PTR_ERR(result)); - EXPECT_EQ(1, core_metrics.gro_data_bypasses); + EXPECT_EQ(1, homa_metrics_per_cpu()->gro_data_bypasses); /* Third attempt: no bypass because core busy. */ cur_core->last_gro = 600; skb4 = mock_skb_new(&self->ip, &h.common, 1400, 4000); result = homa_gro_receive(&self->empty_list, skb3); EXPECT_EQ(0, -PTR_ERR(result)); - EXPECT_EQ(1, core_metrics.gro_data_bypasses); + EXPECT_EQ(1, homa_metrics_per_cpu()->gro_data_bypasses); kfree_skb(skb); kfree_skb(skb2); @@ -326,7 +326,7 @@ TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization) struct sk_buff *skb = mock_skb_new(&client_ip, &h.common, 0, 0); struct sk_buff *result = homa_gro_receive(&self->empty_list, skb); EXPECT_EQ(0, -PTR_ERR(result)); - EXPECT_EQ(0, core_metrics.gro_grant_bypasses); + EXPECT_EQ(0, homa_metrics_per_cpu()->gro_grant_bypasses); EXPECT_STREQ("", unit_log_get()); /* Second attempt: HOMA_FAST_GRANTS is enabled. */ @@ -335,7 +335,7 @@ TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization) struct sk_buff *skb2 = mock_skb_new(&client_ip, &h.common, 0, 0); result = homa_gro_receive(&self->empty_list, skb2); EXPECT_EQ(EINPROGRESS, -PTR_ERR(result)); - EXPECT_EQ(1, core_metrics.gro_grant_bypasses); + EXPECT_EQ(1, homa_metrics_per_cpu()->gro_grant_bypasses); EXPECT_SUBSTR("xmit DATA 1400@10000", unit_log_get()); /* Third attempt: core is too busy for fast grants. 
*/ @@ -343,7 +343,7 @@ TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization) struct sk_buff *skb3 = mock_skb_new(&client_ip, &h.common, 0, 0); result = homa_gro_receive(&self->empty_list, skb3); EXPECT_EQ(0, -PTR_ERR(result)); - EXPECT_EQ(1, core_metrics.gro_grant_bypasses); + EXPECT_EQ(1, homa_metrics_per_cpu()->gro_grant_bypasses); kfree_skb(skb); kfree_skb(skb3); } diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 83002cbe..8ff9143f 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -559,7 +559,7 @@ TEST_F(homa_outgoing, __homa_xmit_control__ipv4_error) mock_ip_queue_xmit_errors = 1; EXPECT_EQ(ENETDOWN, -homa_xmit_control(GRANT, &h, sizeof(h), srpc)); EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(1, core_metrics.control_xmit_errors); + EXPECT_EQ(1, homa_metrics_per_cpu()->control_xmit_errors); } TEST_F(homa_outgoing, __homa_xmit_control__ipv6_error) { @@ -583,7 +583,7 @@ TEST_F(homa_outgoing, __homa_xmit_control__ipv6_error) mock_ip6_xmit_errors = 1; EXPECT_EQ(ENETDOWN, -homa_xmit_control(GRANT, &h, sizeof(h), srpc)); EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(1, core_metrics.control_xmit_errors); + EXPECT_EQ(1, homa_metrics_per_cpu()->control_xmit_errors); } TEST_F(homa_outgoing, homa_xmit_unknown) @@ -763,7 +763,7 @@ TEST_F(homa_outgoing, __homa_xmit_data__ipv4_transmit_error) mock_ip_queue_xmit_errors = 1; skb_get(crpc->msgout.packets); __homa_xmit_data(crpc->msgout.packets, crpc, 5); - EXPECT_EQ(1, core_metrics.data_xmit_errors); + EXPECT_EQ(1, homa_metrics_per_cpu()->data_xmit_errors); } TEST_F(homa_outgoing, __homa_xmit_data__ipv6_transmit_error) { @@ -779,7 +779,7 @@ TEST_F(homa_outgoing, __homa_xmit_data__ipv6_transmit_error) mock_ip6_xmit_errors = 1; skb_get(crpc->msgout.packets); __homa_xmit_data(crpc->msgout.packets, crpc, 5); - EXPECT_EQ(1, core_metrics.data_xmit_errors); + EXPECT_EQ(1, homa_metrics_per_cpu()->data_xmit_errors); } TEST_F(homa_outgoing, homa_resend_data__basics) @@ -984,8 +984,8 @@ TEST_F(homa_outgoing, homa_check_nic_queue__pacer_metrics) EXPECT_EQ(1, homa_check_nic_queue(&self->homa, crpc->msgout.packets, true)); EXPECT_EQ(10500, atomic64_read(&self->homa.link_idle_time)); - EXPECT_EQ(500, core_metrics.pacer_bytes); - EXPECT_EQ(200, core_metrics.pacer_lost_cycles); + EXPECT_EQ(500, homa_metrics_per_cpu()->pacer_bytes); + EXPECT_EQ(200, homa_metrics_per_cpu()->pacer_lost_cycles); } TEST_F(homa_outgoing, homa_check_nic_queue__queue_empty) { @@ -1141,7 +1141,7 @@ TEST_F(homa_outgoing, homa_pacer_xmit__rpc_locked) mock_trylock_errors = ~1; homa_pacer_xmit(&self->homa); EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(1, core_metrics.pacer_skipped_rpcs); + EXPECT_EQ(1, homa_metrics_per_cpu()->pacer_skipped_rpcs); unit_log_clear(); mock_trylock_errors = 0; homa_pacer_xmit(&self->homa); @@ -1234,16 +1234,16 @@ TEST_F(homa_outgoing, homa_add_to_throttled__inc_metrics) self->server_port, self->client_id+4, 15000, 1000); homa_add_to_throttled(crpc1); - EXPECT_EQ(1, core_metrics.throttle_list_adds); - EXPECT_EQ(0, core_metrics.throttle_list_checks); + EXPECT_EQ(1, homa_metrics_per_cpu()->throttle_list_adds); + EXPECT_EQ(0, homa_metrics_per_cpu()->throttle_list_checks); homa_add_to_throttled(crpc2); - EXPECT_EQ(2, core_metrics.throttle_list_adds); - EXPECT_EQ(1, core_metrics.throttle_list_checks); + EXPECT_EQ(2, homa_metrics_per_cpu()->throttle_list_adds); + EXPECT_EQ(1, homa_metrics_per_cpu()->throttle_list_checks); homa_add_to_throttled(crpc3); - EXPECT_EQ(3, core_metrics.throttle_list_adds); - EXPECT_EQ(3, 
core_metrics.throttle_list_checks); + EXPECT_EQ(3, homa_metrics_per_cpu()->throttle_list_adds); + EXPECT_EQ(3, homa_metrics_per_cpu()->throttle_list_checks); } TEST_F(homa_outgoing, homa_remove_from_throttled) diff --git a/test/unit_homa_peertab.c b/test/unit_homa_peertab.c index d1645cab..92c352ea 100644 --- a/test/unit_homa_peertab.c +++ b/test/unit_homa_peertab.c @@ -72,7 +72,7 @@ TEST_F(homa_peertab, homa_peer_find__basics) peer2 = homa_peer_find(&self->peertab, ip2222, &self->hsk.inet); EXPECT_NE(peer, peer2); - EXPECT_EQ(2, core_metrics.peer_new_entries); + EXPECT_EQ(2, homa_metrics_per_cpu()->peer_new_entries); } static struct _test_data_homa_peertab *test_data; @@ -191,7 +191,7 @@ TEST_F(homa_peertab, homa_peer_find__kmalloc_error) peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); EXPECT_EQ(ENOMEM, -PTR_ERR(peer)); - EXPECT_EQ(1, core_metrics.peer_kmalloc_errors); + EXPECT_EQ(1, homa_metrics_per_cpu()->peer_kmalloc_errors); } TEST_F(homa_peertab, homa_peer_find__route_error) { @@ -201,7 +201,7 @@ TEST_F(homa_peertab, homa_peer_find__route_error) peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(peer)); - EXPECT_EQ(1, core_metrics.peer_route_errors); + EXPECT_EQ(1, homa_metrics_per_cpu()->peer_route_errors); } TEST_F(homa_peertab, homa_dst_refresh__basics) @@ -229,7 +229,7 @@ TEST_F(homa_peertab, homa_dst_refresh__routing_error) mock_route_errors = 1; homa_dst_refresh(&self->homa.peers, peer, &self->hsk); EXPECT_EQ(old_dst, peer->dst); - EXPECT_EQ(1, core_metrics.peer_route_errors); + EXPECT_EQ(1, homa_metrics_per_cpu()->peer_route_errors); EXPECT_EQ(0, dead_count(&self->homa.peers)); } TEST_F(homa_peertab, homa_dst_refresh__malloc_error) @@ -324,15 +324,15 @@ TEST_F(homa_peertab, homa_peer_lock_slow) ASSERT_NE(NULL, peer); homa_peer_lock(peer); - EXPECT_EQ(0, core_metrics.peer_ack_lock_misses); - EXPECT_EQ(0, core_metrics.peer_ack_lock_miss_cycles); + EXPECT_EQ(0, homa_metrics_per_cpu()->peer_ack_lock_misses); + EXPECT_EQ(0, homa_metrics_per_cpu()->peer_ack_lock_miss_cycles); homa_peer_unlock(peer); mock_trylock_errors = 1; unit_hook_register(peer_spinlock_hook); homa_peer_lock(peer); - EXPECT_EQ(1, core_metrics.peer_ack_lock_misses); - EXPECT_EQ(1000, core_metrics.peer_ack_lock_miss_cycles); + EXPECT_EQ(1, homa_metrics_per_cpu()->peer_ack_lock_misses); + EXPECT_EQ(1000, homa_metrics_per_cpu()->peer_ack_lock_miss_cycles); homa_peer_unlock(peer); } diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 6dbf40ed..ad47ca1c 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -268,7 +268,7 @@ TEST_F(homa_plumbing, homa_set_sock_opt__success) sizeof(struct homa_set_buf_args))); EXPECT_EQ(args.start, self->hsk.buffer_pool.region); EXPECT_EQ(64, self->hsk.buffer_pool.num_bpages); - EXPECT_EQ(1, core_metrics.so_set_buf_calls); + EXPECT_EQ(1, homa_metrics_per_cpu()->so_set_buf_calls); } TEST_F(homa_plumbing, homa_sendmsg__args_not_in_user_space) @@ -686,7 +686,7 @@ TEST_F(homa_plumbing, homa_softirq__packet_too_short) skb->len -= 1; homa_softirq(skb); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); - EXPECT_EQ(1, core_metrics.short_packets); + EXPECT_EQ(1, homa_metrics_per_cpu()->short_packets); } TEST_F(homa_plumbing, homa_softirq__bogus_packet_type) { @@ -695,7 +695,7 @@ TEST_F(homa_plumbing, homa_softirq__bogus_packet_type) skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); homa_softirq(skb); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); - EXPECT_EQ(1, 
core_metrics.short_packets); + EXPECT_EQ(1, homa_metrics_per_cpu()->short_packets); } TEST_F(homa_plumbing, homa_softirq__process_short_messages_first) { @@ -824,57 +824,3 @@ TEST_F(homa_plumbing, homa_softirq__per_rpc_batching) "sk->sk_data_ready invoked", unit_log_get()); } - -TEST_F(homa_plumbing, homa_metrics_open) -{ - EXPECT_EQ(0, homa_metrics_open(NULL, NULL)); - EXPECT_NE(NULL, self->homa.metrics); - - strcpy(self->homa.metrics, "12345"); - EXPECT_EQ(0, homa_metrics_open(NULL, NULL)); - EXPECT_EQ(5, strlen(self->homa.metrics)); - EXPECT_EQ(2, self->homa.metrics_active_opens); -} -TEST_F(homa_plumbing, homa_metrics_read__basics) -{ - char buffer[1000]; - loff_t offset = 10; - self->homa.metrics = kmalloc(100, GFP_KERNEL); - self->homa.metrics_capacity = 100; - strcpy(self->homa.metrics, "0123456789abcdefghijklmnop"); - self->homa.metrics_length = 26; - EXPECT_EQ(5, homa_metrics_read(NULL, buffer, 5, &offset)); - EXPECT_SUBSTR("_copy_to_user copied 5 bytes", unit_log_get()); - EXPECT_EQ(15, offset); - - unit_log_clear(); - EXPECT_EQ(11, homa_metrics_read(NULL, buffer, 1000, &offset)); - EXPECT_SUBSTR("_copy_to_user copied 11 bytes", unit_log_get()); - EXPECT_EQ(26, offset); - - unit_log_clear(); - EXPECT_EQ(0, homa_metrics_read(NULL, buffer, 1000, &offset)); - EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(26, offset); -} -TEST_F(homa_plumbing, homa_metrics_read__error_copying_to_user) -{ - char buffer[1000]; - loff_t offset = 10; - self->homa.metrics = kmalloc(100, GFP_KERNEL); - self->homa.metrics_capacity = 100; - strcpy(self->homa.metrics, "0123456789abcdefghijklmnop"); - self->homa.metrics_length = 26; - mock_copy_to_user_errors = 1; - EXPECT_EQ(EFAULT, -homa_metrics_read(NULL, buffer, 5, &offset)); -} - -TEST_F(homa_plumbing, homa_metrics_release) -{ - self->homa.metrics_active_opens = 2; - EXPECT_EQ(0, homa_metrics_release(NULL, NULL)); - EXPECT_EQ(1, self->homa.metrics_active_opens); - - EXPECT_EQ(0, homa_metrics_release(NULL, NULL)); - EXPECT_EQ(0, self->homa.metrics_active_opens); -} diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index 43ea267f..96496713 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -320,7 +320,7 @@ TEST_F(homa_pool, homa_pool_allocate__page_wrap_around) EXPECT_EQ(2*HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[0]); EXPECT_EQ(2000, pool->cores[raw_smp_processor_id()].allocated); EXPECT_EQ(raw_smp_processor_id(), pool->descriptors[2].owner); - EXPECT_EQ(1, core_metrics.bpage_reuses); + EXPECT_EQ(1, homa_metrics_per_cpu()->bpage_reuses); } TEST_F(homa_pool, homa_pool_allocate__owned_page_overflow) { @@ -405,7 +405,7 @@ TEST_F(homa_pool, homa_pool_allocate__out_of_space) rpc = list_next_entry(rpc, buf_links); EXPECT_EQ(100, rpc->id); EXPECT_TRUE(list_is_last(&rpc->buf_links, &self->hsk.waiting_for_bufs)); - EXPECT_EQ(3, core_metrics.buffer_alloc_failures); + EXPECT_EQ(3, homa_metrics_per_cpu()->buffer_alloc_failures); EXPECT_EQ(1, pool->bpages_needed); } diff --git a/test/unit_homa_skb.c b/test/unit_homa_skb.c index 6de0ef12..19959130 100644 --- a/test/unit_homa_skb.c +++ b/test/unit_homa_skb.c @@ -304,8 +304,8 @@ TEST_F(homa_skb, homa_skb_page_alloc__new_large_page) EXPECT_TRUE(homa_skb_page_alloc(&self->homa, core)); EXPECT_NE(NULL, core->skb_page); EXPECT_EQ(HOMA_SKB_PAGE_SIZE, core->page_size); - EXPECT_EQ(1, core_metrics.skb_page_allocs); - EXPECT_NE(0, core_metrics.skb_page_alloc_cycles); + EXPECT_EQ(1, homa_metrics_per_cpu()->skb_page_allocs); + EXPECT_NE(0, homa_metrics_per_cpu()->skb_page_alloc_cycles); } TEST_F(homa_skb, 
homa_skb_page_alloc__high_order_page_not_available) { @@ -317,8 +317,8 @@ TEST_F(homa_skb, homa_skb_page_alloc__high_order_page_not_available) EXPECT_NE(NULL, core->skb_page); EXPECT_EQ(PAGE_SIZE, core->page_size); EXPECT_EQ(0, core->page_inuse); - EXPECT_EQ(1, core_metrics.skb_page_allocs); - EXPECT_NE(0, core_metrics.skb_page_alloc_cycles); + EXPECT_EQ(1, homa_metrics_per_cpu()->skb_page_allocs); + EXPECT_NE(0, homa_metrics_per_cpu()->skb_page_alloc_cycles); } TEST_F(homa_skb, homa_skb_page_alloc__no_pages_available) { diff --git a/test/unit_homa_socktab.c b/test/unit_homa_socktab.c index 646182a7..ec55140a 100644 --- a/test/unit_homa_socktab.c +++ b/test/unit_homa_socktab.c @@ -294,13 +294,13 @@ TEST_F(homa_socktab, homa_sock_lock_slow) mock_cycles = ~0; homa_sock_lock(&self->hsk, "unit test"); - EXPECT_EQ(0, core_metrics.socket_lock_misses); - EXPECT_EQ(0, core_metrics.socket_lock_miss_cycles); + EXPECT_EQ(0, homa_metrics_per_cpu()->socket_lock_misses); + EXPECT_EQ(0, homa_metrics_per_cpu()->socket_lock_miss_cycles); homa_sock_unlock(&self->hsk); mock_trylock_errors = 1; homa_sock_lock(&self->hsk, "unit test"); - EXPECT_EQ(1, core_metrics.socket_lock_misses); - EXPECT_NE(0, core_metrics.socket_lock_miss_cycles); + EXPECT_EQ(1, homa_metrics_per_cpu()->socket_lock_misses); + EXPECT_NE(0, homa_metrics_per_cpu()->socket_lock_miss_cycles); homa_sock_unlock(&self->hsk); } \ No newline at end of file diff --git a/test/unit_homa_timer.c b/test/unit_homa_timer.c index 33f00a8f..c8167984 100644 --- a/test/unit_homa_timer.c +++ b/test/unit_homa_timer.c @@ -136,11 +136,11 @@ TEST_F(homa_timer, homa_check_rpc__timeout) unit_log_clear(); crpc->silent_ticks = self->homa.timeout_ticks-1; homa_check_rpc(crpc); - EXPECT_EQ(0, core_metrics.rpc_timeouts); + EXPECT_EQ(0, homa_metrics_per_cpu()->rpc_timeouts); EXPECT_EQ(0, crpc->error); crpc->silent_ticks = self->homa.timeout_ticks; homa_check_rpc(crpc); - EXPECT_EQ(1, core_metrics.rpc_timeouts); + EXPECT_EQ(1, homa_metrics_per_cpu()->rpc_timeouts); EXPECT_EQ(ETIMEDOUT, -crpc->error); } TEST_F(homa_timer, homa_check_rpc__issue_resend) @@ -250,7 +250,7 @@ TEST_F(homa_timer, homa_timer__basics) unit_log_clear(); crpc->peer->outstanding_resends = self->homa.timeout_resends; homa_timer(&self->homa); - EXPECT_EQ(1, core_metrics.rpc_timeouts); + EXPECT_EQ(1, homa_metrics_per_cpu()->rpc_timeouts); EXPECT_EQ(ETIMEDOUT, -crpc->error); } TEST_F(homa_timer, homa_timer__reap_dead_rpcs) diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index aa03f90f..dfd9b9ef 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -280,18 +280,18 @@ TEST_F(homa_utils, homa_bucket_lock_slow) ASSERT_FALSE(IS_ERR(srpc)); homa_rpc_unlock(srpc); - EXPECT_EQ(0, core_metrics.client_lock_misses); - EXPECT_EQ(0, core_metrics.client_lock_miss_cycles); + EXPECT_EQ(0, homa_metrics_per_cpu()->client_lock_misses); + EXPECT_EQ(0, homa_metrics_per_cpu()->client_lock_miss_cycles); homa_bucket_lock_slow(crpc->bucket, crpc->id); homa_rpc_unlock(crpc); - EXPECT_EQ(1, core_metrics.client_lock_misses); - EXPECT_NE(0, core_metrics.client_lock_miss_cycles); - EXPECT_EQ(0, core_metrics.server_lock_misses); - EXPECT_EQ(0, core_metrics.server_lock_miss_cycles); + EXPECT_EQ(1, homa_metrics_per_cpu()->client_lock_misses); + EXPECT_NE(0, homa_metrics_per_cpu()->client_lock_miss_cycles); + EXPECT_EQ(0, homa_metrics_per_cpu()->server_lock_misses); + EXPECT_EQ(0, homa_metrics_per_cpu()->server_lock_miss_cycles); homa_bucket_lock_slow(srpc->bucket, srpc->id); homa_rpc_unlock(srpc); - EXPECT_EQ(1, 
core_metrics.server_lock_misses); - EXPECT_NE(0, core_metrics.server_lock_miss_cycles); + EXPECT_EQ(1, homa_metrics_per_cpu()->server_lock_misses); + EXPECT_NE(0, homa_metrics_per_cpu()->server_lock_miss_cycles); } TEST_F(homa_utils, homa_rpc_acked__basics) @@ -741,26 +741,6 @@ TEST_F(homa_utils, homa_snprintf) buffer); } -TEST_F(homa_utils, homa_append_metric) -{ - self->homa.metrics_length = 0; - homa_append_metric(&self->homa, "x: %d, y: %d", 10, 20); - EXPECT_EQ(12, self->homa.metrics_length); - EXPECT_STREQ("x: 10, y: 20", self->homa.metrics); - - homa_append_metric(&self->homa, ", z: %d", 12345); - EXPECT_EQ(22, self->homa.metrics_length); - EXPECT_STREQ("x: 10, y: 20, z: 12345", self->homa.metrics); - EXPECT_EQ(30, self->homa.metrics_capacity); - - homa_append_metric(&self->homa, ", q: %050d", 88); - EXPECT_EQ(77, self->homa.metrics_length); - EXPECT_STREQ("x: 10, y: 20, z: 12345, " - "q: 00000000000000000000000000000000000000000000000088", - self->homa.metrics); - EXPECT_EQ(120, self->homa.metrics_capacity); -} - TEST_F(homa_utils, homa_prios_changed__basics) { set_cutoffs(&self->homa, 90, 80, HOMA_MAX_MESSAGE_LENGTH*2, 60, 50, diff --git a/test/utils.h b/test/utils.h index f782266f..988c7785 100644 --- a/test/utils.h +++ b/test/utils.h @@ -30,8 +30,6 @@ enum unit_rpc_state { UNIT_IN_SERVICE = 24, }; -#define core_metrics homa_cores[raw_smp_processor_id()]->metrics - #define cur_core homa_cores[raw_smp_processor_id()] extern char *unit_ack_string(struct homa_ack *ack); diff --git a/timetrace.c b/timetrace.c index 7c44cdd2..fdac6291 100644 --- a/timetrace.c +++ b/timetrace.c @@ -845,8 +845,7 @@ void tt_inc_metric(int metric, __u64 count) offsetof(struct homa_metrics, linux_softirq_cycles), offsetof(struct homa_metrics, linux_pkt_alloc_bytes), }; - __u64 *metric_addr = (__u64 *)(((char *) - &homa_cores[raw_smp_processor_id()]->metrics) + __u64 *metric_addr = (__u64 *)(((char *) homa_metrics_per_cpu()) + offsets[metric]); *metric_addr += count; } From 44c2279a03831eda2e120ad140c7d7560bf3b3c0 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 27 Sep 2024 14:42:46 -0700 Subject: [PATCH 025/625] Fix bug in homa_rpc_reap Extraneous code caused corruption of hsk->dead_skbs. --- homa_utils.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/homa_utils.c b/homa_utils.c index afecba53..e1d17b4f 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -701,8 +701,6 @@ int homa_rpc_reap(struct homa_sock *hsk, int count) rpc->msgin.num_bpages, rpc->msgin.bpage_offsets); if (rpc->msgin.length >= 0) { - rpc->hsk->dead_skbs += skb_queue_len( - &rpc->msgin.packets); while (1) { struct homa_gap *gap = list_first_entry_or_null( &rpc->msgin.gaps, From 88391c4da719b252637239150abbfc13417243d1 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 27 Sep 2024 14:50:58 -0700 Subject: [PATCH 026/625] Fix bug in homa_wait_for_message Could return a freed RPC if the RPC was freed by homa_copy_to_user. Resolves #63 --- homa_incoming.c | 14 +++--- test/unit_homa_incoming.c | 93 ++++++++++++++++++--------------------- 2 files changed, 53 insertions(+), 54 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index 3221b617..6a44c9f1 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -221,8 +221,14 @@ int homa_copy_to_user(struct homa_rpc *rpc) * copy them, and reacquire the lock. 
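
This change moves the RPC_DEAD check to the top of the loop, before any packet is dequeued. A minimal sketch of the invariant being enforced (hypothetical demo_* names, not the Homa code): whenever a lock is dropped and reacquired, the object's liveness must be rechecked before its queues are touched, because another thread may have freed it in the window.

    #include <linux/skbuff.h>
    #include <linux/spinlock.h>

    #define DEMO_DEAD 1

    struct demo_rpc {
            spinlock_t lock;
            int state;                  /* DEMO_DEAD once freed */
            struct sk_buff_head packets;
    };

    static int demo_drain(struct demo_rpc *rpc)
    {
            struct sk_buff *skb;

            for (;;) {
                    spin_lock_bh(&rpc->lock);
                    if (rpc->state == DEMO_DEAD) {
                            spin_unlock_bh(&rpc->lock);
                            return -EINVAL;
                    }
                    skb = __skb_dequeue(&rpc->packets);
                    spin_unlock_bh(&rpc->lock);
                    if (!skb)
                            return 0;
                    /* Copying to user space happens with the lock
                     * dropped; that is the window this check closes.
                     */
                    kfree_skb(skb);
            }
    }
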
*/ while (true) { - struct sk_buff *skb = __skb_dequeue(&rpc->msgin.packets); + struct sk_buff *skb; + if (rpc->state == RPC_DEAD) { + error = -EINVAL; + break; + } + + skb = __skb_dequeue(&rpc->msgin.packets); if (skb != NULL) { skbs[n] = skb; n++; @@ -308,8 +314,6 @@ int homa_copy_to_user(struct homa_rpc *rpc) atomic_or(APP_NEEDS_LOCK, &rpc->flags); homa_rpc_lock(rpc, "homa_copy_to_user"); atomic_andnot(APP_NEEDS_LOCK|RPC_COPYING_TO_USER, &rpc->flags); - if (rpc->state == RPC_DEAD) - error = -EINVAL; if (error) break; } @@ -1318,12 +1322,12 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, &rpc->flags); } else atomic_andnot(RPC_HANDING_OFF, &rpc->flags); + if (!rpc->error) + rpc->error = homa_copy_to_user(rpc); if (rpc->state == RPC_DEAD) { homa_rpc_unlock(rpc); continue; } - if (!rpc->error) - rpc->error = homa_copy_to_user(rpc); if (rpc->error) goto done; atomic_andnot(RPC_PKTS_READY, &rpc->flags); diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index dbfc6f02..8443cd80 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -88,15 +88,13 @@ void lock_delete_hook(char *id) if (strcmp(id, "spin_lock") != 0) return; if (lock_delete_count == 0) - { homa_rpc_free(hook_rpc); - } lock_delete_count--; } -/* The following function is used via unit_hook to delete an RPC after it +/* The following function is used via unit_hook to free an RPC after it * has been matched in homa_wait_for_message. */ -void match_delete_hook(char *id) +void match_free_hook(char *id) { if (strcmp(id, "found_rpc") == 0) homa_rpc_free(hook_rpc); @@ -615,6 +613,24 @@ TEST_F(homa_incoming, homa_copy_to_user__basics) unit_log_get()); EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); } +TEST_F(homa_incoming, homa_copy_to_user__rpc_freed) +{ + struct homa_rpc *crpc; + + mock_bpage_size = 2048; + mock_bpage_shift = 11; + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 1000, 4000); + ASSERT_NE(NULL, crpc); + homa_rpc_free(crpc); + + unit_log_clear(); + mock_copy_to_user_dont_copy = -1; + EXPECT_EQ(EINVAL, -homa_copy_to_user(crpc)); + EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(1, skb_queue_len(&crpc->msgin.packets)); +} TEST_F(homa_incoming, homa_copy_to_user__multiple_batches) { struct homa_rpc *crpc; @@ -738,27 +754,6 @@ TEST_F(homa_incoming, homa_copy_to_user__error_in_skb_copy_datagram_iter) EXPECT_STREQ("", unit_log_get()); EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); } -TEST_F(homa_incoming, homa_copy_to_user__rpc_freed) -{ - struct homa_rpc *crpc; - - crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->server_port, self->client_id, - 1000, 4000); - ASSERT_NE(NULL, crpc); - self->data.message_length = htonl(4000); - self->data.seg.offset = htonl(1400); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, - 1400, 101000), crpc); - - unit_log_clear(); - mock_copy_to_user_dont_copy = -1; - unit_hook_register(lock_delete_hook); - hook_rpc = crpc; - EXPECT_EQ(EINVAL, -homa_copy_to_user(crpc)); - EXPECT_EQ(RPC_DEAD, crpc->state); - EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); -} TEST_F(homa_incoming, homa_copy_to_user__timetrace_info) { struct homa_rpc *crpc; @@ -2265,30 +2260,6 @@ TEST_F(homa_incoming, homa_wait_for_message__explicit_rpc_deleted_while_sleeping self->client_id); EXPECT_EQ(EINVAL, -PTR_ERR(rpc)); } -TEST_F(homa_incoming, homa_wait_for_message__rpc_deleted_after_matching) -{ - /* Arrange for 2 RPCs 
to be ready, but delete the first one after - * it has matched; this should cause the second one to be matched. - */ - struct homa_rpc *rpc; - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc1); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, 20000, 1600); - ASSERT_NE(NULL, crpc2); - unit_log_clear(); - - hook_rpc = crpc1; - unit_hook_register(match_delete_hook); - rpc = homa_wait_for_message(&self->hsk, - HOMA_RECVMSG_RESPONSE|HOMA_RECVMSG_NONBLOCKING, 0); - EXPECT_EQ(RPC_DEAD, crpc1->state); - EXPECT_EQ(crpc2, rpc); - homa_rpc_unlock(rpc); -} TEST_F(homa_incoming, homa_wait_for_message__socket_shutdown_while_sleeping) { struct homa_rpc *rpc; @@ -2321,6 +2292,30 @@ TEST_F(homa_incoming, homa_wait_for_message__copy_to_user) EXPECT_EQ(0, atomic_read(&crpc->flags) & (RPC_PKTS_READY|RPC_COPYING_TO_USER)); } +TEST_F(homa_incoming, homa_wait_for_message__rpc_freed_after_matching) +{ + /* Arrange for 2 RPCs to be ready, but delete the first one after + * it has matched; this should cause the second one to be matched. + */ + struct homa_rpc *rpc; + struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, + self->server_port, self->client_id, 20000, 1600); + ASSERT_NE(NULL, crpc1); + struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, + self->server_port, self->client_id+2, 20000, 1600); + ASSERT_NE(NULL, crpc2); + unit_log_clear(); + + hook_rpc = crpc1; + unit_hook_register(match_free_hook); + rpc = homa_wait_for_message(&self->hsk, + HOMA_RECVMSG_RESPONSE|HOMA_RECVMSG_NONBLOCKING, 0); + EXPECT_EQ(RPC_DEAD, crpc1->state); + EXPECT_EQ(crpc2, rpc); + homa_rpc_unlock(rpc); +} TEST_F(homa_incoming, homa_wait_for_message__copy_to_user_fails) { struct homa_rpc *rpc; From 9f65a69abeda1e224a11a450baba800e077daa36 Mon Sep 17 00:00:00 2001 From: zcusanza Date: Sat, 28 Sep 2024 21:39:14 -0700 Subject: [PATCH 027/625] Add missing definition for SPLIT_64 Resolves #64 --- timetrace.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/timetrace.h b/timetrace.h index 3c186a57..7547e381 100644 --- a/timetrace.h +++ b/timetrace.h @@ -204,5 +204,7 @@ static inline __u32 tt_lo(void *p) return ((__u64) p) & 0xffffffff; } +#define SPLIT_64(value) (int) (((__u64) (value)) >> 32), (int) (((__u64) (value)) & 0xffffffff) + #endif // HOMA_TIMETRACE_H From 58f6572bc3b6573d081dc0b8c0767caed56cd02b Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 30 Sep 2024 15:42:38 -0700 Subject: [PATCH 028/625] Extract homa_rpc.h and homa_sock.h from homa_impl.h * Extract homa_rpc.c from homa_utils.c * Rename homa_socktab.c -> homa_sock.c --- Makefile | 3 +- homa_impl.h | 866 +----------------- homa_metrics.c | 2 +- homa_plumbing.c | 2 +- homa_pool.c | 6 +- homa_rpc.c | 771 ++++++++++++++++ homa_rpc.h | 513 +++++++++++ homa_socktab.c => homa_sock.c | 36 +- homa_sock.h | 378 ++++++++ homa_utils.c | 798 +--------------- test/Makefile | 6 +- test/mock.c | 5 + test/unit_homa_incoming.c | 8 +- test/unit_homa_plumbing.c | 24 +- test/unit_homa_pool.c | 72 +- test/unit_homa_rpc.c | 666 ++++++++++++++ .../{unit_homa_socktab.c => unit_homa_sock.c} | 38 +- test/unit_homa_utils.c | 648 ------------- 18 files changed, 2450 insertions(+), 2392 deletions(-) create mode 100644 homa_rpc.c create mode 100644 homa_rpc.h rename 
homa_socktab.c => homa_sock.c (89%) create mode 100644 homa_sock.h create mode 100644 test/unit_homa_rpc.c rename test/{unit_homa_socktab.c => unit_homa_sock.c} (91%) diff --git a/Makefile b/Makefile index fb277f17..a76e3a55 100644 --- a/Makefile +++ b/Makefile @@ -11,8 +11,9 @@ homa-y = homa_grant.o \ homa_peertab.o \ homa_pool.o \ homa_plumbing.o \ + homa_rpc.o \ homa_skb.o \ - homa_socktab.o \ + homa_sock.o \ homa_timer.o \ homa_utils.o \ timetrace.o diff --git a/homa_impl.h b/homa_impl.h index 856279f3..8bd2a21f 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -141,14 +141,13 @@ struct homa_peer; #include "homa.h" #include "timetrace.h" +#include "homa_rpc.h" #include "homa_wire.h" #include "homa_metrics.h" /* Declarations used in this file, so they can't be made at the end. */ -extern void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, __u64 id); extern int homa_grantable_lock_slow(struct homa *homa, int recalc); extern void homa_peer_lock_slow(struct homa_peer *peer); -extern void homa_sock_lock_slow(struct homa_sock *hsk); extern void homa_throttle_lock_slow(struct homa *homa); extern struct homa_core *homa_cores[]; @@ -186,183 +185,6 @@ struct homa_cache_line { char bytes[64]; }; -/** - * struct homa_message_out - Describes a message (either request or response) - * for which this machine is the sender. - */ -struct homa_message_out { - /** - * @length: Total bytes in message (excluding headers). A value - * less than 0 means this structure is uninitialized and therefore - * not in use (all other fields will be zero in this case). - */ - int length; - - /** @num_skbs: Total number of buffers currently in @packets. */ - int num_skbs; - - /** - * @copied_from_user: Number of bytes of the message that have - * been copied from user space into skbs in @packets. - */ - int copied_from_user; - - /** - * @packets: Singly-linked list of all packets in message, linked - * using homa_next_skb. The list is in order of offset in the message - * (offset 0 first); each sk_buff can potentially contain multiple - * data_segments, which will be split into separate packets by GSO. - * This list grows gradually as data is copied in from user space, - * so it may not be complete. - */ - struct sk_buff *packets; - - /** - * @next_xmit: Pointer to pointer to next packet to transmit (will - * either refer to @packets or homa_next_skb(skb) for some skb - * in @packets). - */ - struct sk_buff **next_xmit; - - /** - * @next_xmit_offset: All bytes in the message, up to but not - * including this one, have been transmitted. - */ - int next_xmit_offset; - - /** - * @active_xmits: The number of threads that are currently - * transmitting data packets for this RPC; can't reap the RPC - * until this count becomes zero. - */ - atomic_t active_xmits; - - /** - * @unscheduled: Initial bytes of message that we'll send - * without waiting for grants. - */ - int unscheduled; - - /** - * @granted: Total number of bytes we are currently permitted to - * send, including unscheduled bytes; must wait for grants before - * sending bytes at or beyond this position. Never larger than - * @length. - */ - int granted; - - /** @priority: Priority level to use for future scheduled packets. */ - __u8 sched_priority; - - /** - * @init_cycles: Time in get_cycles units when this structure was - * initialized. Used to find the oldest outgoing message. - */ - __u64 init_cycles; -}; - -/** - * struct homa_gap - Represents a range of bytes within a message that have - * not yet been received. 
- */ -struct homa_gap { - /** @start: offset of first byte in this gap. */ - int start; - - /** @end: offset of byte just after last one in this gap. */ - int end; - - /** - * @time: time (in get_cycles units) when the gap was first detected. - * As of 7/2024 this isn't used for anything. - */ - __u64 time; - - /** @links: for linking into list in homa_message_in. */ - struct list_head links; -}; - -/** - * struct homa_message_in - Holds the state of a message received by - * this machine; used for both requests and responses. - */ -struct homa_message_in { - /** - * @length: Payload size in bytes. A value less than 0 means this - * structure is uninitialized and therefore not in use. - */ - int length; - - /** - * @packets: DATA packets for this message that have been received but - * not yet copied to user space (no particular order). - */ - struct sk_buff_head packets; - - /** - * @recv_end: Offset of the byte just after the highest one that - * has been received so far. - */ - int recv_end; - - /** - * @gaps: List of homa_gaps describing all of the bytes with - * offsets less than @recv_end that have not yet been received. - */ - struct list_head gaps; - - /** - * @bytes_remaining: Amount of data for this message that has - * not yet been received; will determine the message's priority. - */ - int bytes_remaining; - - /** - * @granted: Total # of bytes (starting from offset 0) that the sender - * may transmit without additional grants, includes unscheduled bytes. - * Never larger than @length. Note: once initialized, this - * may not be modified without holding @homa->grantable_lock. - */ - int granted; - - /** - * @rec_incoming: Number of bytes in homa->total_incoming currently - * contributed ("recorded") from this RPC. - */ - int rec_incoming; - - /** - * @rank: The index of this RPC in homa->active_rpcs and - * homa->active_remaining, or -1 if this RPC is not in those arrays. - * Set by homa_grant, read-only to the RPC. - */ - atomic_t rank; - - /** @priority: Priority level to include in future GRANTS. */ - int priority; - - /** @resend_all: if nonzero, set resend_all in the next grant packet. */ - __u8 resend_all; - - /** - * @birth: get_cycles time when this RPC was added to the grantable - * list. Invalid if RPC isn't in the grantable list. - */ - __u64 birth; - - /** - * @num_bpages: The number of entries in @bpage_offsets used for this - * message (0 means buffers not allocated yet). - */ - __u32 num_bpages; - - /** @bpage_offsets: Describes buffer space allocated for this message. - * Each entry is an offset from the start of the buffer region. - * All but the last pointer refer to areas of size HOMA_BPAGE_SIZE. - */ - __u32 bpage_offsets[HOMA_MAX_BPAGES]; -}; - /** * struct homa_interest - Contains various information used while waiting * for incoming messages (indicates what kinds of messages a particular @@ -431,334 +253,6 @@ static inline void homa_interest_init(struct homa_interest *interest) interest->response_links.next = LIST_POISON1; } -/** - * struct homa_rpc - One of these structures exists for each active - * RPC. The same structure is used to manage both outgoing RPCs on - * clients and incoming RPCs on servers. - */ -struct homa_rpc { - /** @hsk: Socket that owns the RPC. */ - struct homa_sock *hsk; - - /** @bucket: Pointer to the bucket in hsk->client_rpc_buckets or - * hsk->server_rpc_buckets where this RPC is linked. Used primarily - * for locking the RPC (which is done by locking its bucket). 
- */ - struct homa_rpc_bucket *bucket; - - /** - * @state: The current state of this RPC: - * - * @RPC_OUTGOING: The RPC is waiting for @msgout to be transmitted - * to the peer. - * @RPC_INCOMING: The RPC is waiting for data @msgin to be received - * from the peer; at least one packet has already - * been received. - * @RPC_IN_SERVICE: Used only for server RPCs: the request message - * has been read from the socket, but the response - * message has not yet been presented to the kernel. - * @RPC_DEAD: RPC has been deleted and is waiting to be - * reaped. In some cases, information in the RPC - * structure may be accessed in this state. - * - * Client RPCs pass through states in the following order: - * RPC_OUTGOING, RPC_INCOMING, RPC_DEAD. - * - * Server RPCs pass through states in the following order: - * RPC_INCOMING, RPC_IN_SERVICE, RPC_OUTGOING, RPC_DEAD. - */ - enum { - RPC_OUTGOING = 5, - RPC_INCOMING = 6, - RPC_IN_SERVICE = 8, - RPC_DEAD = 9 - } state; - - /** - * @flags: Additional state information: an OR'ed combination of - * various single-bit flags. See below for definitions. Must be - * manipulated with atomic operations because some of the manipulations - * occur without holding the RPC lock. - */ - atomic_t flags; - - /* Valid bits for @flags: - * RPC_PKTS_READY - The RPC has input packets ready to be - * copied to user space. - * RPC_COPYING_FROM_USER - Data is being copied from user space into - * the RPC; the RPC must not be reaped. - * RPC_COPYING_TO_USER - Data is being copied from this RPC to - * user space; the RPC must not be reaped. - * RPC_HANDING_OFF - This RPC is in the process of being - * handed off to a waiting thread; it must - * not be reaped. - * APP_NEEDS_LOCK - Means that code in the application thread - * needs the RPC lock (e.g. so it can start - * copying data to user space) so others - * (e.g. SoftIRQ processing) should relinquish - * the lock ASAP. Without this, SoftIRQ can - * lock out the application for a long time, - * preventing data copies to user space from - * starting (and they limit throughput at - * high network speeds). - */ -#define RPC_PKTS_READY 1 -#define RPC_COPYING_FROM_USER 2 -#define RPC_COPYING_TO_USER 4 -#define RPC_HANDING_OFF 8 -#define APP_NEEDS_LOCK 16 - -#define RPC_CANT_REAP (RPC_COPYING_FROM_USER | RPC_COPYING_TO_USER \ - | RPC_HANDING_OFF) - - /** - * @grants_in_progress: Count of active grant sends for this RPC; - * it's not safe to reap the RPC unless this value is zero. - * This variable is needed so that grantable_lock can be released - * while sending grants, to reduce contention. - */ - atomic_t grants_in_progress; - - /** - * @peer: Information about the other machine (the server, if - * this is a client RPC, or the client, if this is a server RPC). - */ - struct homa_peer *peer; - - /** @dport: Port number on @peer that will handle packets. */ - __u16 dport; - - /** - * @id: Unique identifier for the RPC among all those issued - * from its port. The low-order bit indicates whether we are - * server (1) or client (0) for this RPC. - */ - __u64 id; - - /** - * @completion_cookie: Only used on clients. Contains identifying - * information about the RPC provided by the application; returned to - * the application with the RPC's result. - */ - __u64 completion_cookie; - - /** - * @error: Only used on clients. If nonzero, then the RPC has - * failed and the value is a negative errno that describes the - * problem. 
- */ - int error; - - /** - * @msgin: Information about the message we receive for this RPC - * (for server RPCs this is the request, for client RPCs this is the - * response). - */ - struct homa_message_in msgin; - - /** - * @msgout: Information about the message we send for this RPC - * (for client RPCs this is the request, for server RPCs this is the - * response). - */ - struct homa_message_out msgout; - - /** - * @hash_links: Used to link this object into a hash bucket for - * either @hsk->client_rpc_buckets (for a client RPC), or - * @hsk->server_rpc_buckets (for a server RPC). - */ - struct hlist_node hash_links; - - /** - * @ready_links: Used to link this object into - * @hsk->ready_requests or @hsk->ready_responses. - */ - struct list_head ready_links; - - /** - * @buf_links: Used to link this RPC into @hsk->waiting_for_bufs. - * If the RPC isn't on @hsk->waiting_for_bufs, this is an empty - * list pointing to itself. - */ - struct list_head buf_links; - - /** - * @active_links: For linking this object into @hsk->active_rpcs. - * The next field will be LIST_POISON1 if this RPC hasn't yet been - * linked into @hsk->active_rpcs. Access with RCU. - */ - struct list_head active_links; - - /** @dead_links: For linking this object into @hsk->dead_rpcs. */ - struct list_head dead_links; - - /** - * @interest: Describes a thread that wants to be notified when - * msgin is complete, or NULL if none. - */ - struct homa_interest *interest; - - /** - * @grantable_links: Used to link this RPC into peer->grantable_rpcs. - * If this RPC isn't in peer->grantable_rpcs, this is an empty - * list pointing to itself. - */ - struct list_head grantable_links; - - /** - * @throttled_links: Used to link this RPC into homa->throttled_rpcs. - * If this RPC isn't in homa->throttled_rpcs, this is an empty - * list pointing to itself. - */ - struct list_head throttled_links; - - /** - * @silent_ticks: Number of times homa_timer has been invoked - * since the last time a packet indicating progress was received - * for this RPC, so we don't need to send a resend for a while. - */ - int silent_ticks; - - /** - * @resend_timer_ticks: Value of homa->timer_ticks the last time - * we sent a RESEND for this RPC. - */ - __u32 resend_timer_ticks; - - /** - * @done_timer_ticks: The value of homa->timer_ticks the first - * time we noticed that this (server) RPC is done (all response - * packets have been transmitted), so we're ready for an ack. - * Zero means we haven't reached that point yet. - */ - __u32 done_timer_ticks; - - /** - * @magic: when the RPC is alive, this holds a distinct value that - * is unlikely to occur naturally. The value is cleared when the - * RPC is reaped, so we can detect accidental use of an RPC after - * it has been reaped. - */ -#define HOMA_RPC_MAGIC 0xdeadbeef - int magic; - - /** - * @start_cycles: time (from get_cycles()) when this RPC was created. - * Used (sometimes) for testing. - */ - uint64_t start_cycles; -}; - -/** - * homa_rpc_validate() - Check to see if an RPC has been reaped (which - * would mean it is no longer valid); if so, crash the kernel with a stack - * trace. - * @rpc: RPC to validate. - */ -static inline void homa_rpc_validate(struct homa_rpc *rpc) -{ - if (rpc->magic == HOMA_RPC_MAGIC) - return; - pr_err("Accessing reaped Homa RPC!\n"); - BUG(); -} - -/** - * define HOMA_SOCKTAB_BUCKETS - Number of hash buckets in a homa_socktab. - * Must be a power of 2. 
- */ -#define HOMA_SOCKTAB_BUCKETS 1024 - -/** - * struct homa_socktab - A hash table that maps from port numbers (either - * client or server) to homa_sock objects. - * - * This table is managed exclusively by homa_socktab.c, using RCU to - * minimize synchronization during lookups. - */ -struct homa_socktab { - /** - * @mutex: Controls all modifications to this object; not needed - * for socket lookups (RCU is used instead). Also used to - * synchronize port allocation. - */ - spinlock_t write_lock; - - /** - * @buckets: Heads of chains for hash table buckets. Chains - * consist of homa_socktab_link objects. - */ - struct hlist_head buckets[HOMA_SOCKTAB_BUCKETS]; -}; - -/** - * struct homa_socktab_links - Used to link homa_socks into the hash chains - * of a homa_socktab. - */ -struct homa_socktab_links { - /* Must be the first element of the struct! */ - struct hlist_node hash_links; - struct homa_sock *sock; -}; - -/** - * struct homa_socktab_scan - Records the state of an iteration over all - * the entries in a homa_socktab, in a way that permits RCU-safe deletion - * of entries. - */ -struct homa_socktab_scan { - /** @socktab: The table that is being scanned. */ - struct homa_socktab *socktab; - - /** - * @current_bucket: the index of the bucket in socktab->buckets - * currently being scanned. If >= HOMA_SOCKTAB_BUCKETS, the scan - * is complete. - */ - int current_bucket; - - /** - * @next: the next socket to return from homa_socktab_next (this - * socket has not yet been returned). NULL means there are no - * more sockets in the current bucket. - */ - struct homa_socktab_links *next; -}; - -/** - * define HOMA_CLIENT_RPC_BUCKETS - Number of buckets in hash tables for - * client RPCs. Must be a power of 2. - */ -#define HOMA_CLIENT_RPC_BUCKETS 1024 - -/** - * define HOMA_SERVER_RPC_BUCKETS - Number of buckets in hash tables for - * server RPCs. Must be a power of 2. - */ -#define HOMA_SERVER_RPC_BUCKETS 1024 - -struct homa_rpc_bucket { - /** - * @lock: serves as a lock both for this bucket (e.g., when - * adding and removing RPCs) and also for all of the RPCs in - * the bucket. Must be held whenever manipulating an RPC in - * this bucket. This dual purpose permits clean and safe - * deletion and garbage collection of RPCs. - */ - spinlock_t lock; - - /** @rpcs: list of RPCs that hash to this bucket. */ - struct hlist_head rpcs; - - /** @id: identifier for this bucket, used in error messages etc. - * It's the index of the bucket within its hash table bucket - * array, with an additional offset to separate server and - * client RPCs. - */ - int id; -}; - /** * struct homa_bpage - Contains information about a single page in * a buffer pool. @@ -892,147 +386,6 @@ struct homa_pool { int check_waiting_invoked; }; -/** - * struct homa_sock - Information about an open socket. - */ -struct homa_sock { - /* Info for other network layers. Note: IPv6 info (struct ipv6_pinfo - * comes at the very end of the struct, *after* Homa's data, if this - * socket uses IPv6). - */ - union { - /** @sock: generic socket data; must be the first field. */ - struct sock sock; - - /** - * @inet: generic Internet socket data; must also be the - first field (contains sock as its first member). - */ - struct inet_sock inet; - }; - - /** - * @lock: Must be held when modifying fields such as interests - * and lists of RPCs. This lock is used in place of sk->sk_lock - * because it's used differently (it's always used as a simple - * spin lock). See sync.txt for more on Homa's synchronization - * strategy. 
- */ - spinlock_t lock; - - /** - * @last_locker: identifies the code that most recently acquired - * @lock successfully. Occasionally used for debugging. - */ - char *last_locker; - - /** - * @protect_count: counts the number of calls to homa_protect_rpcs - * for which there have not yet been calls to homa_unprotect_rpcs. - * See sync.txt for more info. - */ - atomic_t protect_count; - - /** - * @homa: Overall state about the Homa implementation. NULL - * means this socket has been deleted. - */ - struct homa *homa; - - /** @shutdown: True means the socket is no longer usable. */ - bool shutdown; - - /** - * @port: Port number: identifies this socket uniquely among all - * those on this node. - */ - __u16 port; - - /** - * @ip_header_length: Length of IP headers for this socket (depends - * on IPv4 vs. IPv6). - */ - int ip_header_length; - - /** - * @client_socktab_links: Links this socket into the homa_socktab - * based on @port. - */ - struct homa_socktab_links socktab_links; - - /** - * @active_rpcs: List of all existing RPCs related to this socket, - * including both client and server RPCs. This list isn't strictly - * needed, since RPCs are already in one of the hash tables below, - * but it's more efficient for homa_timer to have this list - * (so it doesn't have to scan large numbers of hash buckets). - * The list is sorted, with the oldest RPC first. Manipulate with - * RCU so timer can access without locking. - */ - struct list_head active_rpcs; - - /** - * @dead_rpcs: Contains RPCs for which homa_rpc_free has been - * called, but their packet buffers haven't yet been freed. - */ - struct list_head dead_rpcs; - - /** @dead_skbs: Total number of socket buffers in RPCs on dead_rpcs. */ - int dead_skbs; - - /** - * @waiting_for_bufs: Contains RPCs that are blocked because there - * wasn't enough space in the buffer pool region for their incoming - * messages. Sorted in increasing order of message length. - */ - struct list_head waiting_for_bufs; - - /** - * @ready_requests: Contains server RPCs whose request message is - * in a state requiring attention from a user process. The head is - * oldest, i.e. next to return. - */ - struct list_head ready_requests; - - /** - * @ready_responses: Contains client RPCs whose response message is - * in a state requiring attention from a user process. The head is - * oldest, i.e. next to return. - */ - struct list_head ready_responses; - - /** - * @request_interests: List of threads that want to receive incoming - * request messages. - */ - struct list_head request_interests; - - /** - * @response_interests: List of threads that want to receive incoming - * response messages. - */ - struct list_head response_interests; - - /** - * @client_rpc_buckets: Hash table for fast lookup of client RPCs. - * Modifications are synchronized with bucket locks, not - * the socket lock. - */ - struct homa_rpc_bucket client_rpc_buckets[HOMA_CLIENT_RPC_BUCKETS]; - - /** - * @server_rpc_buckets: Hash table for fast lookup of server RPCs. - * Modifications are synchronized with bucket locks, not - * the socket lock. - */ - struct homa_rpc_bucket server_rpc_buckets[HOMA_SERVER_RPC_BUCKETS]; - - /** - * @buffer_pool: used to allocate buffer space for incoming messages. 
- */ - struct homa_pool buffer_pool; -}; - /** * struct homa_dead_dst - Used to retain dst_entries that are no longer * needed, until it is safe to delete them (I'm not confident that the RCU @@ -1970,12 +1323,6 @@ struct homa_core { */ __u64 syscall_end_time; - /** - * @rpcs_locked: The total number of RPCs currently locked on this - * core; better not ever be more than 1! - */ - int rpcs_locked; - /** * @skb_page: a page of data available being used for skb frags. * This pointer is included in the page's reference count. @@ -2085,103 +1432,6 @@ static inline __u64 homa_local_id(__be64 sender_id) return be64_to_cpu(sender_id) ^ 1; } -/** - * homa_bucket_lock() - Acquire the lock for an RPC hash table bucket. - * @bucket: Bucket to lock - * @id: ID of the RPC that is requesting the lock. Normally ignored, - * but used occasionally for diagnostics and debugging. - * @locker: Static string identifying the locking code. Normally ignored, - * but used occasionally for diagnostics and debugging. - */ -static inline void homa_bucket_lock(struct homa_rpc_bucket *bucket, - __u64 id, const char *locker) -{ - int core = raw_smp_processor_id(); - - if (!spin_trylock_bh(&bucket->lock)) - homa_bucket_lock_slow(bucket, id); - homa_cores[core]->rpcs_locked++; - BUG_ON(homa_cores[core]->rpcs_locked > 1); -} - -/** - * homa_bucket_try_lock() - Acquire the lock for an RPC hash table bucket if - * it is available. - * @bucket: Bucket to lock - * @id: ID of the RPC that is requesting the lock. - * @locker: Static string identifying the locking code. Normally ignored, - * but used when debugging deadlocks. - * Return: Nonzero if lock was successfully acquired, zero if it is - * currently owned by someone else. - */ -static inline int homa_bucket_try_lock(struct homa_rpc_bucket *bucket, - __u64 id, const char *locker) -{ - int core = raw_smp_processor_id(); - - if (!spin_trylock_bh(&bucket->lock)) - return 0; - homa_cores[core]->rpcs_locked++; - BUG_ON(homa_cores[core]->rpcs_locked > 1); - return 1; -} - -/** - * homa_bucket_unlock() - Release the lock for an RPC hash table bucket. - * @bucket: Bucket to unlock. - * @id: ID of the RPC that was using the lock. - */ -static inline void homa_bucket_unlock(struct homa_rpc_bucket *bucket, __u64 id) -{ - homa_cores[raw_smp_processor_id()]->rpcs_locked--; - spin_unlock_bh(&bucket->lock); -} - -/** - * homa_rpc_lock() - Acquire the lock for an RPC. - * @rpc: RPC to lock. Note: this function is only safe under - * limited conditions. The caller must ensure that the RPC - * cannot be reaped before the lock is acquired. It cannot - * do that by acquiring the socket lock, since that violates - * lock ordering constraints. One approach is to use - * homa_protect_rpcs. Don't use this function unless you - * are very sure what you are doing! See sync.txt for more - * info on locking. - * @locker: Static string identifying the locking code. Normally ignored, - * but used occasionally for diagnostics and debugging. - */ -static inline void homa_rpc_lock(struct homa_rpc *rpc, const char *locker) -{ - homa_bucket_lock(rpc->bucket, rpc->id, locker); -} - -/** - * homa_rpc_unlock() - Release the lock for an RPC. - * @rpc: RPC to unlock. - */ -static inline void homa_rpc_unlock(struct homa_rpc *rpc) -{ - homa_bucket_unlock(rpc->bucket, rpc->id); -} - -/** - * homa_client_rpc_bucket() - Find the bucket containing a given - * client RPC. - * @hsk: Socket associated with the RPC. - * @id: Id of the desired RPC. 
- * - * Return: The bucket in which this RPC will appear, if the RPC exists. - */ -static inline struct homa_rpc_bucket *homa_client_rpc_bucket( - struct homa_sock *hsk, __u64 id) -{ - /* We can use a really simple hash function here because RPC ids - * are allocated sequentially. - */ - return &hsk->client_rpc_buckets[(id >> 1) - & (HOMA_CLIENT_RPC_BUCKETS - 1)]; -} - /** * homa_next_skb() - Compute address of Homa's private link field in @skb. * @skb: Socket buffer containing private link field. @@ -2213,25 +1463,6 @@ static inline int homa_port_hash(__u16 port) return port & (HOMA_SOCKTAB_BUCKETS - 1); } -/** - * homa_server_rpc_bucket() - Find the bucket containing a given - * server RPC. - * @hsk: Socket associated with the RPC. - * @id: Id of the desired RPC. - * - * Return: The bucket in which this RPC will appear, if the RPC exists. - */ -static inline struct homa_rpc_bucket *homa_server_rpc_bucket( - struct homa_sock *hsk, __u64 id) -{ - /* Each client allocates RPC ids sequentially, so they will - * naturally distribute themselves across the hash space. - * Thus we can use the id directly as hash. - */ - return &hsk->server_rpc_buckets[(id >> 1) - & (HOMA_SERVER_RPC_BUCKETS - 1)]; -} - /** * homa_set_doff() - Fills in the doff TCP header field for a Homa packet. * @h: Packet header whose doff field is to be set. @@ -2250,32 +1481,6 @@ static inline struct homa_sock *homa_sk(const struct sock *sk) return (struct homa_sock *)sk; } -/** - * homa_sock_lock() - Acquire the lock for a socket. If the socket - * isn't immediately available, record stats on the waiting time. - * @hsk: Socket to lock. - * @locker: Static string identifying where the socket was locked; - * used to track down deadlocks. - */ -static inline void homa_sock_lock(struct homa_sock *hsk, const char *locker) -{ - if (!spin_trylock_bh(&hsk->lock)) { -// printk(KERN_NOTICE "Slow path for socket %d, last locker %s", -// hsk->client_port, hsk->last_locker); - homa_sock_lock_slow(hsk); - } -// hsk->last_locker = locker; -} - -/** - * homa_sock_unlock() - Release the lock for a socket. - * @hsk: Socket to lock. - */ -static inline void homa_sock_unlock(struct homa_sock *hsk) -{ - spin_unlock_bh(&hsk->lock); -} - /** * homa_peer_lock() - Acquire the lock for a peer's @unacked_lock. If the lock * isn't immediately available, record stats on the waiting time. @@ -2296,39 +1501,6 @@ static inline void homa_peer_unlock(struct homa_peer *peer) spin_unlock_bh(&peer->ack_lock); } -/** - * homa_protect_rpcs() - Ensures that no RPCs will be reaped for a given - * socket until homa_sock_unprotect is called. Typically used by functions - * that want to scan the active RPCs for a socket without holding the socket - * lock. Multiple calls to this function may be in effect at once. - * @hsk: Socket whose RPCs should be protected. Must not be locked - * by the caller; will be locked here. - * - * Return: 1 for success, 0 if the socket has been shutdown, in which - * case its RPCs cannot be protected. - */ -static inline int homa_protect_rpcs(struct homa_sock *hsk) -{ - int result; - - homa_sock_lock(hsk, __func__); - result = !hsk->shutdown; - if (result) - atomic_inc(&hsk->protect_count); - homa_sock_unlock(hsk); - return result; -} - -/** - * homa_unprotect_rpcs() - Cancel the effect of a previous call to - * homa_sock_protect(), so that RPCs can once again be reaped. - * @hsk: Socket whose RPCs should be unprotected. 
- */ -static inline void homa_unprotect_rpcs(struct homa_sock *hsk) -{ - atomic_dec(&hsk->protect_count); -} - /** * homa_grantable_lock() - Acquire the grantable lock. If the lock * isn't immediately available, record stats on the waiting time. @@ -2501,8 +1673,6 @@ extern void homa_add_to_throttled(struct homa_rpc *rpc); extern int homa_backlog_rcv(struct sock *sk, struct sk_buff *skb); extern int homa_bind(struct socket *sk, struct sockaddr *addr, int addr_len); -extern void homa_bucket_unlock(struct homa_rpc_bucket *bucket, __u64 id); -extern void homa_check_rpc(struct homa_rpc *rpc); extern int homa_check_nic_queue(struct homa *homa, struct sk_buff *skb, bool force); extern struct homa_rpc @@ -2530,11 +1700,6 @@ extern int homa_err_handler_v6(struct sk_buff *skb, __be32 info); extern int homa_fill_data_interleaved(struct homa_rpc *rpc, struct sk_buff *skb, struct iov_iter *iter); -extern struct homa_rpc - *homa_find_client_rpc(struct homa_sock *hsk, __u64 id); -extern struct homa_rpc - *homa_find_server_rpc(struct homa_sock *hsk, - const struct in6_addr *saddr, __u16 sport, __u64 id); extern void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, char *format); extern void homa_freeze_peers(struct homa *homa); @@ -2645,20 +1810,7 @@ extern void homa_rpc_abort(struct homa_rpc *crpc, int error); extern void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, struct homa_ack *ack); extern void homa_rpc_free(struct homa_rpc *rpc); -extern void homa_rpc_free_rcu(struct rcu_head *rcu_head); extern void homa_rpc_handoff(struct homa_rpc *rpc); -extern void homa_rpc_log(struct homa_rpc *rpc); -extern void homa_rpc_log_tt(struct homa_rpc *rpc); -extern void homa_rpc_log_active(struct homa *homa, uint64_t id); -extern void homa_rpc_log_active_tt(struct homa *homa, int freeze_count); -extern struct homa_rpc - *homa_rpc_new_client(struct homa_sock *hsk, - const union sockaddr_in_union *dest); -extern struct homa_rpc - *homa_rpc_new_server(struct homa_sock *hsk, - const struct in6_addr *source, struct data_header *h, - int *created); -extern int homa_rpc_reap(struct homa_sock *hsk, int count); extern void homa_send_ipis(void); extern int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); extern int homa_setsockopt(struct sock *sk, int level, int optname, @@ -2689,24 +1841,8 @@ extern void homa_skb_release_pages(struct homa *homa); extern void homa_skb_stash_pages(struct homa *homa, int length); extern int homa_snprintf(char *buffer, int size, int used, const char *format, ...) 
__printf(4, 5); -extern int homa_sock_bind(struct homa_socktab *socktab, - struct homa_sock *hsk, __u16 port); -extern void homa_sock_destroy(struct homa_sock *hsk); -extern struct homa_sock * - homa_sock_find(struct homa_socktab *socktab, __u16 port); -extern void homa_sock_init(struct homa_sock *hsk, struct homa *homa); -extern void homa_sock_shutdown(struct homa_sock *hsk); -extern int homa_socket(struct sock *sk); -extern void homa_socktab_destroy(struct homa_socktab *socktab); -extern void homa_socktab_init(struct homa_socktab *socktab); -extern struct homa_sock - *homa_socktab_next(struct homa_socktab_scan *scan); -extern struct homa_sock - *homa_socktab_start_scan(struct homa_socktab *socktab, - struct homa_socktab_scan *scan); extern int homa_softirq(struct sk_buff *skb); extern void homa_spin(int ns); -extern char *homa_symbol_for_state(struct homa_rpc *rpc); extern char *homa_symbol_for_type(uint8_t type); extern int homa_sysctl_softirq_cores(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/homa_metrics.c b/homa_metrics.c index 42f3b7bf..879efe5d 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -309,7 +309,7 @@ char *homa_metrics_print(struct homa *homa) m->disabled_rpc_reaps); M("reaper_calls %15llu Reaper invocations that were not disabled\n", m->reaper_calls); - M("reaper_dead_skbs %15llu Sum of hsk->dead_skbs across all reaper alls\n", + M("reaper_dead_skbs %15llu Sum of hsk->dead_skbs across all reaper calls\n", m->reaper_dead_skbs); M("forced_reaps %15llu Reaps forced by accumulation of dead RPCs\n", m->forced_reaps); diff --git a/homa_plumbing.c b/homa_plumbing.c index a4d38eb0..52337aa1 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1050,7 +1050,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, result = -EINVAL; goto done; } - homa_pool_release_buffers(&hsk->buffer_pool, control.num_bpages, + homa_pool_release_buffers(hsk->buffer_pool, control.num_bpages, control.bpage_offsets); control.num_bpages = 0; diff --git a/homa_pool.c b/homa_pool.c index 5ea9ab47..33946639 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -47,7 +47,7 @@ static inline void set_bpages_needed(struct homa_pool *pool) int homa_pool_init(struct homa_sock *hsk, void *region, __u64 region_size) { int i, result; - struct homa_pool *pool = &hsk->buffer_pool; + struct homa_pool *pool = hsk->buffer_pool; if (((__u64) region) & ~PAGE_MASK) return -EINVAL; @@ -231,7 +231,7 @@ int homa_pool_get_pages(struct homa_pool *pool, int num_pages, __u32 *pages, */ int homa_pool_allocate(struct homa_rpc *rpc) { - struct homa_pool *pool = &rpc->hsk->buffer_pool; + struct homa_pool *pool = rpc->hsk->buffer_pool; int full_pages, partial, i, core_id; __u32 pages[HOMA_MAX_BPAGES]; struct homa_pool_core *core; @@ -361,7 +361,7 @@ void *homa_pool_get_buffer(struct homa_rpc *rpc, int offset, int *available) *available = (bpage_index < (rpc->msgin.num_bpages-1)) ? HOMA_BPAGE_SIZE - bpage_offset : rpc->msgin.length - offset; - return rpc->hsk->buffer_pool.region + rpc->msgin.bpage_offsets[bpage_index] + return rpc->hsk->buffer_pool->region + rpc->msgin.bpage_offsets[bpage_index] + bpage_offset; } diff --git a/homa_rpc.c b/homa_rpc.c new file mode 100644 index 00000000..0dc2dee6 --- /dev/null +++ b/homa_rpc.c @@ -0,0 +1,771 @@ +// SPDX-License-Identifier: BSD-2-Clause + +/* This file contains functions for managing homa_rpc structs. 
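+ *
+ * A note on locking, consistent with the functions below and with
+ * sync.txt: an RPC is locked by locking its hash bucket, and the
+ * socket lock may be acquired while holding an RPC lock, but never
+ * the reverse.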
+ */

+#include "homa_impl.h"

+/**
+ * homa_rpc_new_client() - Allocate and construct a client RPC (one that is used
+ * to issue an outgoing request). Doesn't send any packets. Invoked with no
+ * locks held.
+ * @hsk: Socket to which the RPC belongs.
+ * @dest: Address of host (ip and port) to which the RPC will be sent.
+ *
+ * Return: A pointer to the newly allocated object, or a negative
+ * errno if an error occurred. The RPC will be locked; the
+ * caller must eventually unlock it.
+ */
+struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk,
+ const union sockaddr_in_union *dest)
+{
+ int err;
+ struct homa_rpc *crpc;
+ struct homa_rpc_bucket *bucket;
+ struct in6_addr dest_addr_as_ipv6 = canonical_ipv6_addr(dest);
+
+ crpc = kmalloc(sizeof(*crpc), GFP_KERNEL);
+ if (unlikely(!crpc))
+ return ERR_PTR(-ENOMEM);
+
+ /* Initialize fields that don't require the socket lock. */
+ crpc->hsk = hsk;
+ crpc->id = atomic64_fetch_add(2, &hsk->homa->next_outgoing_id);
+ bucket = homa_client_rpc_bucket(hsk, crpc->id);
+ crpc->bucket = bucket;
+ crpc->state = RPC_OUTGOING;
+ atomic_set(&crpc->flags, 0);
+ atomic_set(&crpc->grants_in_progress, 0);
+ crpc->peer = homa_peer_find(&hsk->homa->peers, &dest_addr_as_ipv6,
+ &hsk->inet);
+ if (IS_ERR(crpc->peer)) {
+ tt_record("error in homa_peer_find");
+ err = PTR_ERR(crpc->peer);
+ goto error;
+ }
+ crpc->dport = ntohs(dest->in6.sin6_port);
+ crpc->completion_cookie = 0;
+ crpc->error = 0;
+ crpc->msgin.length = -1;
+ crpc->msgin.num_bpages = 0;
+ memset(&crpc->msgout, 0, sizeof(crpc->msgout));
+ crpc->msgout.length = -1;
+ INIT_LIST_HEAD(&crpc->ready_links);
+ INIT_LIST_HEAD(&crpc->buf_links);
+ INIT_LIST_HEAD(&crpc->dead_links);
+ crpc->interest = NULL;
+ INIT_LIST_HEAD(&crpc->grantable_links);
+ INIT_LIST_HEAD(&crpc->throttled_links);
+ crpc->silent_ticks = 0;
+ crpc->resend_timer_ticks = hsk->homa->timer_ticks;
+ crpc->done_timer_ticks = 0;
+ crpc->magic = HOMA_RPC_MAGIC;
+ crpc->start_cycles = get_cycles();
+
+ /* Initialize fields that require locking. This allows the most
+ * expensive work, such as copying in the message from user space,
+ * to be performed without holding locks. Also, can't hold spin
+ * locks while doing things that could block, such as memory allocation.
+ */
+ homa_bucket_lock(bucket, crpc->id, "homa_rpc_new_client");
+ homa_sock_lock(hsk, "homa_rpc_new_client");
+ if (hsk->shutdown) {
+ homa_sock_unlock(hsk);
+ homa_rpc_unlock(crpc);
+ err = -ESHUTDOWN;
+ goto error;
+ }
+ hlist_add_head(&crpc->hash_links, &bucket->rpcs);
+ list_add_tail_rcu(&crpc->active_links, &hsk->active_rpcs);
+ homa_sock_unlock(hsk);
+
+ return crpc;
+
+error:
+ kfree(crpc);
+ return ERR_PTR(err);
+}
+
+/**
+ * homa_rpc_new_server() - Allocate and construct a server RPC (one that is
+ * used to manage an incoming request). If appropriate, the RPC will also
+ * be handed off (we do it here, while we have the socket locked, to avoid
+ * acquiring the socket lock a second time later for the handoff).
+ * @hsk: Socket that owns this RPC.
+ * @source: IP address (network byte order) of the RPC's client.
+ * @h: Header for the first data packet received for this RPC; used
+ * to initialize the RPC.
+ * @created: Will be set to 1 if a new RPC was created and 0 if an
+ * existing RPC was found.
+ *
+ * Return: A pointer to a new RPC, which is locked, or a negative errno
+ * if an error occurred. If there is already an RPC corresponding
+ * to h, then it is returned instead of creating a new RPC.
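+ *
+ * A minimal caller sketch (the surrounding names are illustrative,
+ * not taken from this file):
+ *
+ * int created;
+ * struct homa_rpc *srpc;
+ *
+ * srpc = homa_rpc_new_server(hsk, &saddr, h, &created);
+ * if (IS_ERR(srpc))
+ * return PTR_ERR(srpc);
+ * ... handle the packet with srpc locked ...
+ * homa_rpc_unlock(srpc);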
+ */
+struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk,
+ const struct in6_addr *source, struct data_header *h,
+ int *created)
+{
+ int err;
+ struct homa_rpc *srpc = NULL;
+ __u64 id = homa_local_id(h->common.sender_id);
+ struct homa_rpc_bucket *bucket = homa_server_rpc_bucket(hsk, id);
+
+ /* Lock the bucket, and make sure no-one else has already created
+ * the desired RPC.
+ */
+ homa_bucket_lock(bucket, id, "homa_rpc_new_server");
+ hlist_for_each_entry_rcu(srpc, &bucket->rpcs, hash_links) {
+ if ((srpc->id == id) &&
+ (srpc->dport == ntohs(h->common.sport)) &&
+ ipv6_addr_equal(&srpc->peer->addr, source)) {
+ /* RPC already exists; just return it instead
+ * of creating a new RPC.
+ */
+ *created = 0;
+ return srpc;
+ }
+ }
+
+ /* Initialize fields that don't require the socket lock. */
+ srpc = kmalloc(sizeof(*srpc), GFP_KERNEL);
+ if (!srpc) {
+ err = -ENOMEM;
+ goto error;
+ }
+ srpc->hsk = hsk;
+ srpc->bucket = bucket;
+ srpc->state = RPC_INCOMING;
+ atomic_set(&srpc->flags, 0);
+ atomic_set(&srpc->grants_in_progress, 0);
+ srpc->peer = homa_peer_find(&hsk->homa->peers, source, &hsk->inet);
+ if (IS_ERR(srpc->peer)) {
+ err = PTR_ERR(srpc->peer);
+ goto error;
+ }
+ srpc->dport = ntohs(h->common.sport);
+ srpc->id = id;
+ srpc->completion_cookie = 0;
+ srpc->error = 0;
+ srpc->msgin.length = -1;
+ srpc->msgin.num_bpages = 0;
+ memset(&srpc->msgout, 0, sizeof(srpc->msgout));
+ srpc->msgout.length = -1;
+ INIT_LIST_HEAD(&srpc->ready_links);
+ INIT_LIST_HEAD(&srpc->buf_links);
+ INIT_LIST_HEAD(&srpc->dead_links);
+ srpc->interest = NULL;
+ INIT_LIST_HEAD(&srpc->grantable_links);
+ INIT_LIST_HEAD(&srpc->throttled_links);
+ srpc->silent_ticks = 0;
+ srpc->resend_timer_ticks = hsk->homa->timer_ticks;
+ srpc->done_timer_ticks = 0;
+ srpc->magic = HOMA_RPC_MAGIC;
+ srpc->start_cycles = get_cycles();
+ tt_record2("Incoming message for id %d has %d unscheduled bytes",
+ srpc->id, ntohl(h->incoming));
+ err = homa_message_in_init(srpc, ntohl(h->message_length),
+ ntohl(h->incoming));
+ if (err != 0)
+ goto error;
+
+ /* Initialize fields that require socket to be locked. */
+ homa_sock_lock(hsk, "homa_rpc_new_server");
+ if (hsk->shutdown) {
+ homa_sock_unlock(hsk);
+ err = -ESHUTDOWN;
+ goto error;
+ }
+ hlist_add_head(&srpc->hash_links, &bucket->rpcs);
+ list_add_tail_rcu(&srpc->active_links, &hsk->active_rpcs);
+ if ((ntohl(h->seg.offset) == 0) && (srpc->msgin.num_bpages > 0)) {
+ atomic_or(RPC_PKTS_READY, &srpc->flags);
+ homa_rpc_handoff(srpc);
+ }
+ homa_sock_unlock(hsk);
+ INC_METRIC(requests_received, 1);
+ *created = 1;
+ return srpc;
+
+error:
+ homa_bucket_unlock(bucket, id);
+ kfree(srpc);
+ return ERR_PTR(err);
+}
+
+/**
+ * homa_rpc_acked() - This function is invoked when an ack is received
+ * for an RPC; if the RPC still exists, it is freed.
+ * @hsk: Socket on which the ack was received. May or may not correspond
+ * to the RPC, but can sometimes be used to avoid a socket lookup.
+ * @saddr: Source address from which the ack was received (the client
+ * node for the RPC).
+ * @ack: Information about an RPC from @saddr that may now be deleted safely.
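+ *
+ * Note that the ack identifies its RPC by the client's port and id plus
+ * the server port, so when @hsk's port differs from the ack's server
+ * port the correct socket must first be looked up under RCU, as in the
+ * body below.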
+ */
+void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr,
+ struct homa_ack *ack)
+{
+ struct homa_rpc *rpc;
+ struct homa_sock *hsk2 = hsk;
+ __u64 id = homa_local_id(ack->client_id);
+ __u16 client_port = ntohs(ack->client_port);
+ __u16 server_port = ntohs(ack->server_port);
+
+ UNIT_LOG("; ", "ack %llu", id);
+ if (hsk2->port != server_port) {
+ /* Without RCU, sockets other than hsk can be deleted
+ * out from under us.
+ */
+ rcu_read_lock();
+ hsk2 = homa_sock_find(&hsk->homa->port_map, server_port);
+ if (!hsk2)
+ goto done;
+ }
+ rpc = homa_find_server_rpc(hsk2, saddr, client_port, id);
+ if (rpc) {
+ tt_record1("homa_rpc_acked freeing id %d", rpc->id);
+ homa_rpc_free(rpc);
+ homa_rpc_unlock(rpc);
+ }
+
+done:
+ if (hsk->port != server_port)
+ rcu_read_unlock();
+}
+
+/**
+ * homa_rpc_free() - Destructor for homa_rpc; will arrange for all resources
+ * associated with the RPC to be released (eventually).
+ * @rpc: Structure to clean up, or NULL. Must be locked. Its socket must
+ * not be locked.
+ */
+void homa_rpc_free(struct homa_rpc *rpc)
+{
+ /* The goal for this function is to make the RPC inaccessible,
+ * so that no other code will ever access it again. However, don't
+ * actually release resources; leave that to homa_rpc_reap, which
+ * runs later. There are two reasons for this. First, releasing
+ * resources may be expensive, so we don't want to keep the caller
+ * waiting; homa_rpc_reap will run in situations where there is time
+ * to spare. Second, there may be other code that currently has
+ * pointers to this RPC but temporarily released the lock (e.g. to
+ * copy data to/from user space). It isn't safe to clean up until
+ * that code has finished its work and released any pointers to the
+ * RPC (homa_rpc_reap will ensure that this has happened). So, this
+ * function should only make changes needed to make the RPC
+ * inaccessible.
+ */
+ if (!rpc || (rpc->state == RPC_DEAD))
+ return;
+ UNIT_LOG("; ", "homa_rpc_free invoked");
+ tt_record1("homa_rpc_free invoked for id %d", rpc->id);
+ rpc->state = RPC_DEAD;
+
+ /* The following line must occur before the socket is locked or
+ * RPC is added to dead_rpcs. This is necessary because
+ * homa_grant_free_rpc releases the RPC lock and reacquires it
+ * (see comment in homa_grant_free_rpc for more info).
+ */
+ homa_grant_free_rpc(rpc);
+
+ /* Unlink from all lists, so no-one will ever find this RPC again. */
+ homa_sock_lock(rpc->hsk, "homa_rpc_free");
+ __hlist_del(&rpc->hash_links);
+ list_del_rcu(&rpc->active_links);
+ list_add_tail_rcu(&rpc->dead_links, &rpc->hsk->dead_rpcs);
+ __list_del_entry(&rpc->ready_links);
+ __list_del_entry(&rpc->buf_links);
+ if (rpc->interest != NULL) {
+ rpc->interest->reg_rpc = NULL;
+ wake_up_process(rpc->interest->thread);
+ rpc->interest = NULL;
+ }
+// tt_record3("Freeing rpc id %d, socket %d, dead_skbs %d", rpc->id,
+// rpc->hsk->client_port,
+// rpc->hsk->dead_skbs);
+
+ if (rpc->msgin.length >= 0) {
+ rpc->hsk->dead_skbs += skb_queue_len(&rpc->msgin.packets);
+ while (1) {
+ struct homa_gap *gap = list_first_entry_or_null(
+ &rpc->msgin.gaps, struct homa_gap, links);
+ if (gap == NULL)
+ break;
+ list_del(&gap->links);
+ kfree(gap);
+ }
+ }
+ rpc->hsk->dead_skbs += rpc->msgout.num_skbs;
+ if (rpc->hsk->dead_skbs > rpc->hsk->homa->max_dead_buffs)
+ /* This update isn't thread-safe; it's just a
+ * statistic so it's OK if updates occasionally get
+ * missed.
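+ * (A racing update can only leave max_dead_buffs
+ * slightly low; it never affects correctness.)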
+ */ + rpc->hsk->homa->max_dead_buffs = rpc->hsk->dead_skbs; + + homa_sock_unlock(rpc->hsk); + homa_remove_from_throttled(rpc); +} + +/** + * homa_rpc_reap() - Invoked to release resources associated with dead + * RPCs for a given socket. For a large RPC, it can take a long time to + * free all of its packet buffers, so we try to perform this work + * off the critical path where it won't delay applications. Each call to + * this function does a small chunk of work. See the file reap.txt for + * more information. + * @hsk: Homa socket that may contain dead RPCs. Must not be locked by the + * caller; this function will lock and release. + * @count: Number of buffers to free during this call. + * + * Return: A return value of 0 means that we ran out of work to do; calling + * again will do no work (there could be unreaped RPCs, but if so, + * reaping has been disabled for them). A value greater than + * zero means there is still more reaping work to be done. + */ +int homa_rpc_reap(struct homa_sock *hsk, int count) +{ +#ifdef __UNIT_TEST__ +#define BATCH_MAX 3 +#else +#define BATCH_MAX 20 +#endif + struct sk_buff *skbs[BATCH_MAX]; + struct homa_rpc *rpcs[BATCH_MAX]; + int num_skbs, num_rpcs; + struct homa_rpc *rpc; + int i, batch_size; + int rx_frees = 0; + int result; + + INC_METRIC(reaper_calls, 1); + INC_METRIC(reaper_dead_skbs, hsk->dead_skbs); + + /* Each iteration through the following loop will reap + * BATCH_MAX skbs. + */ + while (count > 0) { + batch_size = count; + if (batch_size > BATCH_MAX) + batch_size = BATCH_MAX; + count -= batch_size; + num_skbs = num_rpcs = 0; + + homa_sock_lock(hsk, "homa_rpc_reap"); + if (atomic_read(&hsk->protect_count)) { + INC_METRIC(disabled_reaps, 1); + tt_record2("homa_rpc_reap returning: protect_count %d, dead_skbs %d", + atomic_read(&hsk->protect_count), + hsk->dead_skbs); + homa_sock_unlock(hsk); + return 0; + } + + /* Collect buffers and freeable RPCs. */ + list_for_each_entry_rcu(rpc, &hsk->dead_rpcs, dead_links) { + if ((atomic_read(&rpc->flags) & RPC_CANT_REAP) + || (atomic_read(&rpc->grants_in_progress) + != 0) + || (atomic_read(&rpc->msgout.active_xmits) + != 0)) { + INC_METRIC(disabled_rpc_reaps, 1); + continue; + } + rpc->magic = 0; + + /* For Tx sk_buffs, collect them here but defer + * freeing until after releasing the socket lock. + */ + if (rpc->msgout.length >= 0) { + while (rpc->msgout.packets) { + skbs[num_skbs] = rpc->msgout.packets; + rpc->msgout.packets = homa_get_skb_info( + rpc->msgout.packets) + ->next_skb; + num_skbs++; + rpc->msgout.num_skbs--; + if (num_skbs >= batch_size) + goto release; + } + } + + /* In the normal case rx sk_buffs will already have been + * freed before we got here. Thus it's OK to free + * immediately in rare situations where there are + * buffers left. + */ + if (rpc->msgin.length >= 0) { + while (1) { + struct sk_buff *skb; + + skb = skb_dequeue(&rpc->msgin.packets); + if (!skb) + break; + kfree_skb(skb); + rx_frees++; + } + } + + /* If we get here, it means all packets have been + * removed from the RPC. + */ + rpcs[num_rpcs] = rpc; + num_rpcs++; + list_del_rcu(&rpc->dead_links); + if (num_rpcs >= batch_size) + goto release; + } + + /* Free all of the collected resources; release the socket + * lock while doing this. 
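+ * Freeing skbs is the expensive part of reaping, so only
+ * the collection above runs under the socket lock; the
+ * actual frees below happen after homa_sock_unlock.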
+ */ +release: + hsk->dead_skbs -= num_skbs + rx_frees; + result = !list_empty(&hsk->dead_rpcs) + && ((num_skbs + num_rpcs) != 0); + homa_sock_unlock(hsk); + homa_skb_free_many_tx(hsk->homa, skbs, num_skbs); + for (i = 0; i < num_rpcs; i++) { + rpc = rpcs[i]; + UNIT_LOG("; ", "reaped %llu", rpc->id); + /* Lock and unlock the RPC before freeing it. This + * is needed to deal with races where the code + * that invoked homa_rpc_free hasn't unlocked the + * RPC yet. + */ + homa_rpc_lock(rpc, "homa_rpc_reap"); + homa_rpc_unlock(rpc); + + if (unlikely(rpc->msgin.num_bpages)) + homa_pool_release_buffers( + rpc->hsk->buffer_pool, + rpc->msgin.num_bpages, + rpc->msgin.bpage_offsets); + if (rpc->msgin.length >= 0) { + while (1) { + struct homa_gap *gap = list_first_entry_or_null( + &rpc->msgin.gaps, + struct homa_gap, links); + if (gap == NULL) + break; + list_del(&gap->links); + kfree(gap); + } + } + tt_record1("homa_rpc_reap finished reaping id %d", + rpc->id); + rpc->state = 0; + kfree(rpc); + } + tt_record4("reaped %d skbs, %d rpcs; %d skbs remain for port %d", + num_skbs + rx_frees, num_rpcs, hsk->dead_skbs, + hsk->port); + if (!result) + break; + } + homa_pool_check_waiting(hsk->buffer_pool); + return result; +} + +/** + * homa_find_client_rpc() - Locate client-side information about the RPC that + * a packet belongs to, if there is any. Thread-safe without socket lock. + * @hsk: Socket via which packet was received. + * @id: Unique identifier for the RPC. + * + * Return: A pointer to the homa_rpc for this id, or NULL if none. + * The RPC will be locked; the caller must eventually unlock it + * by invoking homa_rpc_unlock. + */ +struct homa_rpc *homa_find_client_rpc(struct homa_sock *hsk, __u64 id) +{ + struct homa_rpc *crpc; + struct homa_rpc_bucket *bucket = homa_client_rpc_bucket(hsk, id); + + homa_bucket_lock(bucket, id, __func__); + hlist_for_each_entry_rcu(crpc, &bucket->rpcs, hash_links) { + if (crpc->id == id) + return crpc; + } + homa_bucket_unlock(bucket, id); + return NULL; +} + +/** + * homa_find_server_rpc() - Locate server-side information about the RPC that + * a packet belongs to, if there is any. Thread-safe without socket lock. + * @hsk: Socket via which packet was received. + * @saddr: Address from which the packet was sent. + * @sport: Port at @saddr from which the packet was sent. + * @id: Unique identifier for the RPC (must have server bit set). + * + * Return: A pointer to the homa_rpc matching the arguments, or NULL + * if none. The RPC will be locked; the caller must eventually + * unlock it by invoking homa_rpc_unlock. + */ +struct homa_rpc *homa_find_server_rpc(struct homa_sock *hsk, + const struct in6_addr *saddr, __u16 sport, __u64 id) +{ + struct homa_rpc *srpc; + struct homa_rpc_bucket *bucket = homa_server_rpc_bucket(hsk, id); + + homa_bucket_lock(bucket, id, __func__); + hlist_for_each_entry_rcu(srpc, &bucket->rpcs, hash_links) { + if ((srpc->id == id) && (srpc->dport == sport) && + ipv6_addr_equal(&srpc->peer->addr, saddr)) + return srpc; + } + homa_bucket_unlock(bucket, id); + return NULL; +} + +/** + * homa_rpc_log() - Log info about a particular RPC; this is functionality + * pulled out of homa_rpc_log_active because its indentation got too deep. + * @rpc: RPC for which key info should be written to the system log. + */ +void homa_rpc_log(struct homa_rpc *rpc) +{ + char *type = homa_is_client(rpc->id) ? 
"Client" : "Server"; + char *peer = homa_print_ipv6_addr(&rpc->peer->addr); + + if (rpc->state == RPC_INCOMING) + pr_notice("%s RPC INCOMING, id %llu, peer %s:%d, %d/%d bytes received, incoming %d\n", + type, rpc->id, peer, rpc->dport, + rpc->msgin.length + - rpc->msgin.bytes_remaining, + rpc->msgin.length, rpc->msgin.granted); + else if (rpc->state == RPC_OUTGOING) { + pr_notice("%s RPC OUTGOING, id %llu, peer %s:%d, out length %d, left %d, granted %d, in left %d, resend_ticks %u, silent_ticks %d\n", + type, rpc->id, peer, rpc->dport, + rpc->msgout.length, + rpc->msgout.length - rpc->msgout.next_xmit_offset, + rpc->msgout.granted, + rpc->msgin.bytes_remaining, + rpc->resend_timer_ticks, + rpc->silent_ticks); + } else { + pr_notice("%s RPC %s, id %llu, peer %s:%d, incoming length %d, outgoing length %d\n", + type, homa_symbol_for_state(rpc), + rpc->id, peer, rpc->dport, + rpc->msgin.length, rpc->msgout.length); + } +} + +/** + * homa_rpc_log_active() - Print information to the system log about all + * active RPCs. Intended primarily for debugging. + * @homa: Overall data about the Homa protocol implementation. + * @id: An RPC id: if nonzero, then only RPCs with this id will be + * logged. + */ +void homa_rpc_log_active(struct homa *homa, uint64_t id) +{ + struct homa_socktab_scan scan; + struct homa_sock *hsk; + struct homa_rpc *rpc; + int count = 0; + + pr_notice("Logging active Homa RPCs:\n"); + rcu_read_lock(); + for (hsk = homa_socktab_start_scan(&homa->port_map, &scan); + hsk != NULL; hsk = homa_socktab_next(&scan)) { + if (list_empty(&hsk->active_rpcs) || hsk->shutdown) + continue; + + if (!homa_protect_rpcs(hsk)) + continue; + list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { + count++; + if ((id != 0) && (id != rpc->id)) + continue; + homa_rpc_log(rpc); + } + homa_unprotect_rpcs(hsk); + } + rcu_read_unlock(); + pr_notice("Finished logging active Homa RPCs: %d active RPCs\n", count); +} + +/** + * homa_rpc_log_tt() - Log info about a particular RPC using timetraces. + * @rpc: RPC for which key info should be written to the system log. + */ +void homa_rpc_log_tt(struct homa_rpc *rpc) +{ + if (rpc->state == RPC_INCOMING) { + int received = rpc->msgin.length + - rpc->msgin.bytes_remaining; + tt_record4("Incoming RPC id %d, peer 0x%x, %d/%d bytes received", + rpc->id, tt_addr(rpc->peer->addr), + received, rpc->msgin.length); + if (1) + tt_record4("RPC id %d has incoming %d, granted %d, prio %d", rpc->id, + rpc->msgin.granted - received, + rpc->msgin.granted, rpc->msgin.priority); + tt_record4("RPC id %d: length %d, remaining %d, rank %d", + rpc->id, rpc->msgin.length, + rpc->msgin.bytes_remaining, + atomic_read(&rpc->msgin.rank)); + if (rpc->msgin.num_bpages == 0) + tt_record1("RPC id %d is blocked waiting for buffers", + rpc->id); + else + tt_record2("RPC id %d has %d bpages allocated", + rpc->id, rpc->msgin.num_bpages); + } else if (rpc->state == RPC_OUTGOING) { + tt_record4("Outgoing RPC id %d, peer 0x%x, %d/%d bytes sent", + rpc->id, tt_addr(rpc->peer->addr), + rpc->msgout.next_xmit_offset, + rpc->msgout.length); + if (rpc->msgout.granted > rpc->msgout.next_xmit_offset) + tt_record3("RPC id %d has %d unsent grants (granted %d)", + rpc->id, rpc->msgout.granted + - rpc->msgout.next_xmit_offset, + rpc->msgout.granted); + } else { + tt_record2("RPC id %d is in state %d", rpc->id, rpc->state); + } +} + +/** + * homa_rpc_log_active_tt() - Log information about all active RPCs using + * timetraces. + * @homa: Overall data about the Homa protocol implementation. 
+ * @freeze_count: If nonzero, FREEZE requests will be sent for this many
+ * incoming RPCs with outstanding grants.
+ */
+void homa_rpc_log_active_tt(struct homa *homa, int freeze_count)
+{
+ struct homa_socktab_scan scan;
+ struct homa_sock *hsk;
+ struct homa_rpc *rpc;
+ int count = 0;
+
+ homa_grant_log_tt(homa);
+ tt_record("Logging active Homa RPCs:");
+ rcu_read_lock();
+ for (hsk = homa_socktab_start_scan(&homa->port_map, &scan);
+ hsk != NULL; hsk = homa_socktab_next(&scan)) {
+ if (list_empty(&hsk->active_rpcs) || hsk->shutdown)
+ continue;
+
+ if (!homa_protect_rpcs(hsk))
+ continue;
+ list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) {
+ struct freeze_header freeze;
+
+ count++;
+ homa_rpc_log_tt(rpc);
+ if (freeze_count == 0)
+ continue;
+ if (rpc->state != RPC_INCOMING)
+ continue;
+ if (rpc->msgin.granted <= (rpc->msgin.length
+ - rpc->msgin.bytes_remaining))
+ continue;
+ freeze_count--;
+ pr_notice("Emitting FREEZE in %s\n", __func__);
+ homa_xmit_control(FREEZE, &freeze, sizeof(freeze), rpc);
+ }
+ homa_unprotect_rpcs(hsk);
+ }
+ rcu_read_unlock();
+ tt_record1("Finished logging (%d active Homa RPCs)", count);
+}
+
+/**
+ * homa_validate_incoming() - Scan all of the active RPCs to compute what
+ * homa->total_incoming should be, and see if it actually matches.
+ * @homa: Overall data about the Homa protocol implementation.
+ * @verbose: Print incoming info for each individual RPC.
+ * @link_errors: Set to 1 if one or more grantable RPCs don't seem to
+ * be linked into the grantable lists.
+ * Return: The difference between the actual value of homa->total_incoming
+ * and the expected value computed from the individual RPCs (positive
+ * means homa->total_incoming is higher than expected).
+ */
+int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors)
+{
+ struct homa_socktab_scan scan;
+ struct homa_sock *hsk;
+ struct homa_rpc *rpc;
+ int total_incoming = 0;
+ int actual;
+
+ tt_record1("homa_validate_incoming starting, total_incoming %d",
+ atomic_read(&homa->total_incoming));
+ *link_errors = 0;
+ rcu_read_lock();
+ for (hsk = homa_socktab_start_scan(&homa->port_map, &scan);
+ hsk != NULL; hsk = homa_socktab_next(&scan)) {
+ if (list_empty(&hsk->active_rpcs) || hsk->shutdown)
+ continue;
+
+ if (!homa_protect_rpcs(hsk))
+ continue;
+ list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) {
+ int incoming;
+
+ if (rpc->state != RPC_INCOMING)
+ continue;
+ incoming = rpc->msgin.granted -
+ (rpc->msgin.length
+ - rpc->msgin.bytes_remaining);
+ if (incoming < 0)
+ incoming = 0;
+ if (rpc->msgin.rec_incoming == 0)
+ continue;
+ total_incoming += rpc->msgin.rec_incoming;
+ if (verbose)
+ tt_record3("homa_validate_incoming: RPC id %d, incoming %d, rec_incoming %d",
+ rpc->id, incoming,
+ rpc->msgin.rec_incoming);
+ if (rpc->msgin.granted >= rpc->msgin.length)
+ continue;
+ if (list_empty(&rpc->grantable_links)) {
+ tt_record1("homa_validate_incoming: RPC id %d not linked in grantable list",
+ rpc->id);
+ *link_errors = 1;
+ }
+ if (list_empty(&rpc->peer->grantable_links)) {
+ tt_record1("homa_validate_incoming: RPC id %d peer not linked in grantable list",
+ rpc->id);
+ *link_errors = 1;
+ }
+ }
+ homa_unprotect_rpcs(hsk);
+ }
+ rcu_read_unlock();
+ actual = atomic_read(&homa->total_incoming);
+ tt_record3("homa_validate_incoming diff %d (expected %d, got %d)",
+ actual - total_incoming, total_incoming, actual);
+ return actual - total_incoming;
+}
+
+/**
+ * homa_symbol_for_state() - Returns a printable string describing an
+ * RPC state.
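+ * (e.g. "OUTGOING" or "DEAD"); unrecognized values are formatted
+ * into a static buffer, so results for them are not reentrant.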
+ * @rpc: RPC whose state should be returned in printable form. + * + * Return: A static string holding the current state of @rpc. + */ +char *homa_symbol_for_state(struct homa_rpc *rpc) +{ + static char buffer[20]; + + switch (rpc->state) { + case RPC_OUTGOING: + return "OUTGOING"; + case RPC_INCOMING: + return "INCOMING"; + case RPC_IN_SERVICE: + return "IN_SERVICE"; + case RPC_DEAD: + return "DEAD"; + } + + /* See safety comment in homa_symbol_for_type. */ + snprintf(buffer, sizeof(buffer)-1, "unknown(%u)", rpc->state); + buffer[sizeof(buffer)-1] = 0; + return buffer; +} diff --git a/homa_rpc.h b/homa_rpc.h new file mode 100644 index 00000000..629a08f4 --- /dev/null +++ b/homa_rpc.h @@ -0,0 +1,513 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ + +/* This file defines homa_rpc and related structs. */ + +#ifndef _HOMA_RPC_H +#define _HOMA_RPC_H + +#include +#include +#include + +#include "homa_sock.h" +#include "homa_wire.h" + +/* Forward references. */ +struct homa_ack; + +/** + * struct homa_message_out - Describes a message (either request or response) + * for which this machine is the sender. + */ +struct homa_message_out { + /** + * @length: Total bytes in message (excluding headers). A value + * less than 0 means this structure is uninitialized and therefore + * not in use (all other fields will be zero in this case). + */ + int length; + + /** @num_skbs: Total number of buffers currently in @packets. */ + int num_skbs; + + /** + * @copied_from_user: Number of bytes of the message that have + * been copied from user space into skbs in @packets. + */ + int copied_from_user; + + /** + * @packets: Singly-linked list of all packets in message, linked + * using homa_next_skb. The list is in order of offset in the message + * (offset 0 first); each sk_buff can potentially contain multiple + * data_segments, which will be split into separate packets by GSO. + * This list grows gradually as data is copied in from user space, + * so it may not be complete. + */ + struct sk_buff *packets; + + /** + * @next_xmit: Pointer to pointer to next packet to transmit (will + * either refer to @packets or homa_next_skb(skb) for some skb + * in @packets). + */ + struct sk_buff **next_xmit; + + /** + * @next_xmit_offset: All bytes in the message, up to but not + * including this one, have been transmitted. + */ + int next_xmit_offset; + + /** + * @active_xmits: The number of threads that are currently + * transmitting data packets for this RPC; can't reap the RPC + * until this count becomes zero. + */ + atomic_t active_xmits; + + /** + * @unscheduled: Initial bytes of message that we'll send + * without waiting for grants. + */ + int unscheduled; + + /** + * @granted: Total number of bytes we are currently permitted to + * send, including unscheduled bytes; must wait for grants before + * sending bytes at or beyond this position. Never larger than + * @length. + */ + int granted; + + /** @priority: Priority level to use for future scheduled packets. */ + __u8 sched_priority; + + /** + * @init_cycles: Time in get_cycles units when this structure was + * initialized. Used to find the oldest outgoing message. + */ + __u64 init_cycles; +}; + +/** + * struct homa_gap - Represents a range of bytes within a message that have + * not yet been received. + */ +struct homa_gap { + /** @start: offset of first byte in this gap. */ + int start; + + /** @end: offset of byte just after last one in this gap. */ + int end; + + /** + * @time: time (in get_cycles units) when the gap was first detected. 
+ * As of 7/2024 this isn't used for anything. + */ + __u64 time; + + /** @links: for linking into list in homa_message_in. */ + struct list_head links; +}; + +/** + * struct homa_message_in - Holds the state of a message received by + * this machine; used for both requests and responses. + */ +struct homa_message_in { + /** + * @length: Payload size in bytes. A value less than 0 means this + * structure is uninitialized and therefore not in use. + */ + int length; + + /** + * @packets: DATA packets for this message that have been received but + * not yet copied to user space (no particular order). + */ + struct sk_buff_head packets; + + /** + * @recv_end: Offset of the byte just after the highest one that + * has been received so far. + */ + int recv_end; + + /** + * @gaps: List of homa_gaps describing all of the bytes with + * offsets less than @recv_end that have not yet been received. + */ + struct list_head gaps; + + /** + * @bytes_remaining: Amount of data for this message that has + * not yet been received; will determine the message's priority. + */ + int bytes_remaining; + + /** + * @granted: Total # of bytes (starting from offset 0) that the sender + * may transmit without additional grants, includes unscheduled bytes. + * Never larger than @length. Note: once initialized, this + * may not be modified without holding @homa->grantable_lock. + */ + int granted; + + /** + * @rec_incoming: Number of bytes in homa->total_incoming currently + * contributed ("recorded") from this RPC. + */ + int rec_incoming; + + /** + * @rank: The index of this RPC in homa->active_rpcs and + * homa->active_remaining, or -1 if this RPC is not in those arrays. + * Set by homa_grant, read-only to the RPC. + */ + atomic_t rank; + + /** @priority: Priority level to include in future GRANTS. */ + int priority; + + /** @resend_all: if nonzero, set resend_all in the next grant packet. */ + __u8 resend_all; + + /** + * @birth: get_cycles time when this RPC was added to the grantable + * list. Invalid if RPC isn't in the grantable list. + */ + __u64 birth; + + /** + * @num_bpages: The number of entries in @bpage_offsets used for this + * message (0 means buffers not allocated yet). + */ + __u32 num_bpages; + + /** @bpage_offsets: Describes buffer space allocated for this message. + * Each entry is an offset from the start of the buffer region. + * All but the last pointer refer to areas of size HOMA_BPAGE_SIZE. + */ + __u32 bpage_offsets[HOMA_MAX_BPAGES]; +}; + +/** + * struct homa_rpc - One of these structures exists for each active + * RPC. The same structure is used to manage both outgoing RPCs on + * clients and incoming RPCs on servers. + */ +struct homa_rpc { + /** @hsk: Socket that owns the RPC. */ + struct homa_sock *hsk; + + /** @bucket: Pointer to the bucket in hsk->client_rpc_buckets or + * hsk->server_rpc_buckets where this RPC is linked. Used primarily + * for locking the RPC (which is done by locking its bucket). + */ + struct homa_rpc_bucket *bucket; + + /** + * @state: The current state of this RPC: + * + * @RPC_OUTGOING: The RPC is waiting for @msgout to be transmitted + * to the peer. + * @RPC_INCOMING: The RPC is waiting for data @msgin to be received + * from the peer; at least one packet has already + * been received. + * @RPC_IN_SERVICE: Used only for server RPCs: the request message + * has been read from the socket, but the response + * message has not yet been presented to the kernel. + * @RPC_DEAD: RPC has been deleted and is waiting to be + * reaped. 
In some cases, information in the RPC + * structure may be accessed in this state. + * + * Client RPCs pass through states in the following order: + * RPC_OUTGOING, RPC_INCOMING, RPC_DEAD. + * + * Server RPCs pass through states in the following order: + * RPC_INCOMING, RPC_IN_SERVICE, RPC_OUTGOING, RPC_DEAD. + */ + enum { + RPC_OUTGOING = 5, + RPC_INCOMING = 6, + RPC_IN_SERVICE = 8, + RPC_DEAD = 9 + } state; + + /** + * @flags: Additional state information: an OR'ed combination of + * various single-bit flags. See below for definitions. Must be + * manipulated with atomic operations because some of the manipulations + * occur without holding the RPC lock. + */ + atomic_t flags; + + /* Valid bits for @flags: + * RPC_PKTS_READY - The RPC has input packets ready to be + * copied to user space. + * RPC_COPYING_FROM_USER - Data is being copied from user space into + * the RPC; the RPC must not be reaped. + * RPC_COPYING_TO_USER - Data is being copied from this RPC to + * user space; the RPC must not be reaped. + * RPC_HANDING_OFF - This RPC is in the process of being + * handed off to a waiting thread; it must + * not be reaped. + * APP_NEEDS_LOCK - Means that code in the application thread + * needs the RPC lock (e.g. so it can start + * copying data to user space) so others + * (e.g. SoftIRQ processing) should relinquish + * the lock ASAP. Without this, SoftIRQ can + * lock out the application for a long time, + * preventing data copies to user space from + * starting (and they limit throughput at + * high network speeds). + */ +#define RPC_PKTS_READY 1 +#define RPC_COPYING_FROM_USER 2 +#define RPC_COPYING_TO_USER 4 +#define RPC_HANDING_OFF 8 +#define APP_NEEDS_LOCK 16 + +#define RPC_CANT_REAP (RPC_COPYING_FROM_USER | RPC_COPYING_TO_USER \ + | RPC_HANDING_OFF) + + /** + * @grants_in_progress: Count of active grant sends for this RPC; + * it's not safe to reap the RPC unless this value is zero. + * This variable is needed so that grantable_lock can be released + * while sending grants, to reduce contention. + */ + atomic_t grants_in_progress; + + /** + * @peer: Information about the other machine (the server, if + * this is a client RPC, or the client, if this is a server RPC). + */ + struct homa_peer *peer; + + /** @dport: Port number on @peer that will handle packets. */ + __u16 dport; + + /** + * @id: Unique identifier for the RPC among all those issued + * from its port. The low-order bit indicates whether we are + * server (1) or client (0) for this RPC. + */ + __u64 id; + + /** + * @completion_cookie: Only used on clients. Contains identifying + * information about the RPC provided by the application; returned to + * the application with the RPC's result. + */ + __u64 completion_cookie; + + /** + * @error: Only used on clients. If nonzero, then the RPC has + * failed and the value is a negative errno that describes the + * problem. + */ + int error; + + /** + * @msgin: Information about the message we receive for this RPC + * (for server RPCs this is the request, for client RPCs this is the + * response). + */ + struct homa_message_in msgin; + + /** + * @msgout: Information about the message we send for this RPC + * (for client RPCs this is the request, for server RPCs this is the + * response). + */ + struct homa_message_out msgout; + + /** + * @hash_links: Used to link this object into a hash bucket for + * either @hsk->client_rpc_buckets (for a client RPC), or + * @hsk->server_rpc_buckets (for a server RPC). 
+ */ + struct hlist_node hash_links; + + /** + * @ready_links: Used to link this object into + * @hsk->ready_requests or @hsk->ready_responses. + */ + struct list_head ready_links; + + /** + * @buf_links: Used to link this RPC into @hsk->waiting_for_bufs. + * If the RPC isn't on @hsk->waiting_for_bufs, this is an empty + * list pointing to itself. + */ + struct list_head buf_links; + + /** + * @active_links: For linking this object into @hsk->active_rpcs. + * The next field will be LIST_POISON1 if this RPC hasn't yet been + * linked into @hsk->active_rpcs. Access with RCU. + */ + struct list_head active_links; + + /** @dead_links: For linking this object into @hsk->dead_rpcs. */ + struct list_head dead_links; + + /** + * @interest: Describes a thread that wants to be notified when + * msgin is complete, or NULL if none. + */ + struct homa_interest *interest; + + /** + * @grantable_links: Used to link this RPC into peer->grantable_rpcs. + * If this RPC isn't in peer->grantable_rpcs, this is an empty + * list pointing to itself. + */ + struct list_head grantable_links; + + /** + * @throttled_links: Used to link this RPC into homa->throttled_rpcs. + * If this RPC isn't in homa->throttled_rpcs, this is an empty + * list pointing to itself. + */ + struct list_head throttled_links; + + /** + * @silent_ticks: Number of times homa_timer has been invoked + * since the last time a packet indicating progress was received + * for this RPC, so we don't need to send a resend for a while. + */ + int silent_ticks; + + /** + * @resend_timer_ticks: Value of homa->timer_ticks the last time + * we sent a RESEND for this RPC. + */ + __u32 resend_timer_ticks; + + /** + * @done_timer_ticks: The value of homa->timer_ticks the first + * time we noticed that this (server) RPC is done (all response + * packets have been transmitted), so we're ready for an ack. + * Zero means we haven't reached that point yet. + */ + __u32 done_timer_ticks; + + /** + * @magic: when the RPC is alive, this holds a distinct value that + * is unlikely to occur naturally. The value is cleared when the + * RPC is reaped, so we can detect accidental use of an RPC after + * it has been reaped. + */ +#define HOMA_RPC_MAGIC 0xdeadbeef + int magic; + + /** + * @start_cycles: time (from get_cycles()) when this RPC was created. + * Used (sometimes) for testing. 
+	 */
+	uint64_t start_cycles;
+};
+
+extern void homa_check_rpc(struct homa_rpc *rpc);
+extern struct homa_rpc
+	*homa_find_client_rpc(struct homa_sock *hsk, __u64 id);
+extern struct homa_rpc
+	*homa_find_server_rpc(struct homa_sock *hsk,
+		const struct in6_addr *saddr, __u16 sport, __u64 id);
+extern void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr,
+		struct homa_ack *ack);
+extern void homa_rpc_free(struct homa_rpc *rpc);
+extern void homa_rpc_log(struct homa_rpc *rpc);
+extern void homa_rpc_log_active(struct homa *homa, uint64_t id);
+extern void homa_rpc_log_active_tt(struct homa *homa, int freeze_count);
+extern void homa_rpc_log_tt(struct homa_rpc *rpc);
+extern struct homa_rpc
+	*homa_rpc_new_client(struct homa_sock *hsk,
+		const union sockaddr_in_union *dest);
+extern struct homa_rpc
+	*homa_rpc_new_server(struct homa_sock *hsk,
+		const struct in6_addr *source, struct data_header *h,
+		int *created);
+extern int homa_rpc_reap(struct homa_sock *hsk, int count);
+extern char *homa_symbol_for_state(struct homa_rpc *rpc);
+extern int homa_validate_incoming(struct homa *homa, int verbose,
+		int *link_errors);
+
+/**
+ * homa_rpc_lock() - Acquire the lock for an RPC.
+ * @rpc:      RPC to lock. Note: this function is only safe under
+ *            limited conditions (in most cases homa_bucket_lock should be
+ *            used). The caller must ensure that the RPC cannot be reaped
+ *            before the lock is acquired. It cannot do that by acquiring
+ *            the socket lock, since that violates lock ordering constraints.
+ *            One approach is to use homa_protect_rpcs. Don't use this function
+ *            unless you are very sure what you are doing! See sync.txt for
+ *            more info on locking.
+ * @locker:   Static string identifying the locking code. Normally ignored,
+ *            but used occasionally for diagnostics and debugging.
+ */
+static inline void homa_rpc_lock(struct homa_rpc *rpc, const char *locker)
+{
+	homa_bucket_lock(rpc->bucket, rpc->id, locker);
+}
+
+/**
+ * homa_rpc_unlock() - Release the lock for an RPC.
+ * @rpc:   RPC to unlock.
+ */
+static inline void homa_rpc_unlock(struct homa_rpc *rpc)
+{
+	homa_bucket_unlock(rpc->bucket, rpc->id);
+}
+
+/**
+ * homa_protect_rpcs() - Ensures that no RPCs will be reaped for a given
+ * socket until homa_unprotect_rpcs is called. Typically used by functions
+ * that want to scan the active RPCs for a socket without holding the socket
+ * lock. Multiple calls to this function may be in effect at once.
+ * @hsk:    Socket whose RPCs should be protected. Must not be locked
+ *          by the caller; will be locked here.
+ *
+ * Return: 1 for success, 0 if the socket has been shutdown, in which
+ *         case its RPCs cannot be protected.
+ */
+static inline int homa_protect_rpcs(struct homa_sock *hsk)
+{
+	int result;
+
+	homa_sock_lock(hsk, __func__);
+	result = !hsk->shutdown;
+	if (result)
+		atomic_inc(&hsk->protect_count);
+	homa_sock_unlock(hsk);
+	return result;
+}
+
+/**
+ * homa_unprotect_rpcs() - Cancel the effect of a previous call to
+ * homa_protect_rpcs(), so that RPCs can once again be reaped.
+ * @hsk:    Socket whose RPCs should be unprotected.
+ */
+static inline void homa_unprotect_rpcs(struct homa_sock *hsk)
+{
+	atomic_dec(&hsk->protect_count);
+}
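These helpers combine into the scanning idiom used by homa_rpc_log_active and
homa_validate_incoming earlier in this patch: protect the socket's RPCs from
reaping, walk active_rpcs under RCU, and lock each RPC only while touching it.
A condensed sketch (ours; it assumes the caller already holds rcu_read_lock):

	static void scan_active_rpcs(struct homa_sock *hsk)
	{
		struct homa_rpc *rpc;

		if (!homa_protect_rpcs(hsk))
			return;        /* Socket shut down; nothing to scan. */
		list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) {
			homa_rpc_lock(rpc, "scan_active_rpcs");
			/* ... examine or update *rpc here ... */
			homa_rpc_unlock(rpc);
		}
		homa_unprotect_rpcs(hsk);
	}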
+
+/**
+ * homa_rpc_validate() - Check to see if an RPC has been reaped (which
+ * would mean it is no longer valid); if so, crash the kernel with a stack
+ * trace.
+ * @rpc:    RPC to validate.
+ */
+static inline void homa_rpc_validate(struct homa_rpc *rpc)
+{
+	if (rpc->magic == HOMA_RPC_MAGIC)
+		return;
+	pr_err("Accessing reaped Homa RPC!\n");
+	BUG();
+}
+
+#endif /* _HOMA_RPC_H */
diff --git a/homa_socktab.c b/homa_sock.c
similarity index 89%
rename from homa_socktab.c
rename to homa_sock.c
index aa01bf12..fe898263 100644
--- a/homa_socktab.c
+++ b/homa_sock.c
@@ -1,8 +1,6 @@
 // SPDX-License-Identifier: BSD-2-Clause
 
-/* This file manages homa_socktab objects; it also implements several
- * operations on homa_sock objects, such as construction and destruction.
- */
+/* This file manages homa_sock and homa_socktab objects. */
 
 #include "homa_impl.h"
 
@@ -152,7 +150,7 @@ void homa_sock_init(struct homa_sock *hsk, struct homa *homa)
 		INIT_HLIST_HEAD(&bucket->rpcs);
 		bucket->id = i + 1000000;
 	}
-	memset(&hsk->buffer_pool, 0, sizeof(hsk->buffer_pool));
+	hsk->buffer_pool = kzalloc(sizeof(*hsk->buffer_pool), GFP_KERNEL);
 	if (homa->hijack_tcp)
 		hsk->sock.sk_protocol = IPPROTO_TCP;
 	spin_unlock_bh(&socktab->write_lock);
@@ -207,7 +205,8 @@ void homa_sock_shutdown(struct homa_sock *hsk)
 		wake_up_process(interest->thread);
 	homa_sock_unlock(hsk);
 
-	homa_pool_destroy(&hsk->buffer_pool);
+	homa_pool_destroy(hsk->buffer_pool);
+	kfree(hsk->buffer_pool);
 
 	i = 0;
 	while (!list_empty(&hsk->dead_rpcs)) {
@@ -321,3 +320,30 @@ void homa_sock_lock_slow(struct homa_sock *hsk)
 	INC_METRIC(socket_lock_misses, 1);
 	INC_METRIC(socket_lock_miss_cycles, get_cycles() - start);
 }
+
+/**
+ * homa_bucket_lock_slow() - This function implements the slow path for
+ * locking a bucket in one of the hash tables of RPCs. It is invoked when a
+ * lock isn't immediately available. It waits for the lock, but also records
+ * statistics about the waiting time.
+ * @bucket:    The hash table bucket to lock.
+ * @id:        ID of the particular RPC being locked (multiple RPCs may
+ *             share a single bucket lock).
+ */
+void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, __u64 id)
+{
+	__u64 start = get_cycles();
+
+	tt_record2("beginning wait for rpc lock, id %d (bucket %d)",
+			id, bucket->id);
+	spin_lock_bh(&bucket->lock);
+	tt_record2("ending wait for bucket lock, id %d (bucket %d)",
+			id, bucket->id);
+	if (homa_is_client(id)) {
+		INC_METRIC(client_lock_misses, 1);
+		INC_METRIC(client_lock_miss_cycles, get_cycles() - start);
+	} else {
+		INC_METRIC(server_lock_misses, 1);
+		INC_METRIC(server_lock_miss_cycles, get_cycles() - start);
+	}
+}
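The homa_socktab_scan machinery declared in homa_sock.h below exists so that a
caller can visit every socket without holding the table's write_lock; the
logging and validation functions earlier in this patch use it exactly this
way. The shape of such an iteration, as a sketch:

	static void visit_all_sockets(struct homa *homa)
	{
		struct homa_socktab_scan scan;
		struct homa_sock *hsk;

		rcu_read_lock();    /* Keeps sockets alive during the scan. */
		for (hsk = homa_socktab_start_scan(&homa->port_map, &scan);
				hsk != NULL; hsk = homa_socktab_next(&scan)) {
			/* ... process *hsk ... */
		}
		rcu_read_unlock();
	}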
diff --git a/homa_sock.h b/homa_sock.h
new file mode 100644
index 00000000..bbf6047f
--- /dev/null
+++ b/homa_sock.h
@@ -0,0 +1,378 @@
+/* SPDX-License-Identifier: BSD-2-Clause */
+
+/* This file defines structs and other things related to Homa sockets. */
+
+#ifndef _HOMA_SOCK_H
+#define _HOMA_SOCK_H
+
+/* Forward declarations. */
+struct homa;
+struct homa_pool;
+
+extern void homa_sock_lock_slow(struct homa_sock *hsk);
+
+/**
+ * define HOMA_SOCKTAB_BUCKETS - Number of hash buckets in a homa_socktab.
+ * Must be a power of 2.
+ */
+#define HOMA_SOCKTAB_BUCKETS 1024
+
+/**
+ * struct homa_socktab - A hash table that maps from port numbers (either
+ * client or server) to homa_sock objects.
+ *
+ * This table is managed exclusively by homa_sock.c, using RCU to
+ * minimize synchronization during lookups.
+ */
+struct homa_socktab {
+	/**
+	 * @write_lock: Controls all modifications to this object; not needed
+	 * for socket lookups (RCU is used instead). Also used to
+	 * synchronize port allocation.
+	 */
+	spinlock_t write_lock;
+
+	/**
+	 * @buckets: Heads of chains for hash table buckets. Chains
+	 * consist of homa_socktab_links objects.
+	 */
+	struct hlist_head buckets[HOMA_SOCKTAB_BUCKETS];
+};
+
+/**
+ * struct homa_socktab_links - Used to link homa_socks into the hash chains
+ * of a homa_socktab.
+ */
+struct homa_socktab_links {
+	/* Must be the first element of the struct! */
+	struct hlist_node hash_links;
+	struct homa_sock *sock;
+};
+
+/**
+ * struct homa_socktab_scan - Records the state of an iteration over all
+ * the entries in a homa_socktab, in a way that permits RCU-safe deletion
+ * of entries.
+ */
+struct homa_socktab_scan {
+	/** @socktab: The table that is being scanned. */
+	struct homa_socktab *socktab;
+
+	/**
+	 * @current_bucket: the index of the bucket in socktab->buckets
+	 * currently being scanned. If >= HOMA_SOCKTAB_BUCKETS, the scan
+	 * is complete.
+	 */
+	int current_bucket;
+
+	/**
+	 * @next: the next socket to return from homa_socktab_next (this
+	 * socket has not yet been returned). NULL means there are no
+	 * more sockets in the current bucket.
+	 */
+	struct homa_socktab_links *next;
+};
+
+/**
+ * struct homa_rpc_bucket - One bucket in a hash table of RPCs.
+ */
+struct homa_rpc_bucket {
+	/**
+	 * @lock: serves as a lock both for this bucket (e.g., when
+	 * adding and removing RPCs) and also for all of the RPCs in
+	 * the bucket. Must be held whenever manipulating an RPC in
+	 * this bucket. This dual purpose permits clean and safe
+	 * deletion and garbage collection of RPCs.
+	 */
+	spinlock_t lock;
+
+	/** @rpcs: list of RPCs that hash to this bucket. */
+	struct hlist_head rpcs;
+
+	/** @id: identifier for this bucket, used in error messages etc.
+	 * It's the index of the bucket within its hash table bucket
+	 * array, with an additional offset to separate server and
+	 * client RPCs.
+	 */
+	int id;
+};
+
+/**
+ * define HOMA_CLIENT_RPC_BUCKETS - Number of buckets in hash tables for
+ * client RPCs. Must be a power of 2.
+ */
+#define HOMA_CLIENT_RPC_BUCKETS 1024
+
+/**
+ * define HOMA_SERVER_RPC_BUCKETS - Number of buckets in hash tables for
+ * server RPCs. Must be a power of 2.
+ */
+#define HOMA_SERVER_RPC_BUCKETS 1024
+
+/**
+ * struct homa_sock - Information about an open socket.
+ */
+struct homa_sock {
+	/* Info for other network layers. Note: IPv6 info (struct ipv6_pinfo
+	 * comes at the very end of the struct, *after* Homa's data, if this
+	 * socket uses IPv6).
+	 */
+	union {
+		/** @sock: generic socket data; must be the first field. */
+		struct sock sock;
+
+		/**
+		 * @inet: generic Internet socket data; must also be the
+		 * first field (contains sock as its first member).
+		 */
+		struct inet_sock inet;
+	};
+
+	/**
+	 * @lock: Must be held when modifying fields such as interests
+	 * and lists of RPCs. This lock is used in place of sk->sk_lock
+	 * because it's used differently (it's always used as a simple
+	 * spin lock). See sync.txt for more on Homa's synchronization
+	 * strategy.
+	 */
+	spinlock_t lock;
+
+	/**
+	 * @last_locker: identifies the code that most recently acquired
+	 * @lock successfully. Occasionally used for debugging.
+	 */
+	char *last_locker;
+
+	/**
+	 * @protect_count: counts the number of calls to homa_protect_rpcs
+	 * for which there have not yet been calls to homa_unprotect_rpcs.
+	 * See sync.txt for more info.
+	 */
+	atomic_t protect_count;
+
+	/**
+	 * @homa: Overall state about the Homa implementation. NULL
+	 * means this socket has been deleted.
+	 */
+	struct homa *homa;
+
+	/** @shutdown: True means the socket is no longer usable. */
+	bool shutdown;
+
+	/**
+	 * @port: Port number: identifies this socket uniquely among all
+	 * those on this node.
+	 */
+	__u16 port;
+
+	/**
+	 * @ip_header_length: Length of IP headers for this socket (depends
+	 * on IPv4 vs. IPv6).
+	 */
+	int ip_header_length;
+
+	/**
+	 * @socktab_links: Links this socket into the homa_socktab
+	 * based on @port.
+	 */
+	struct homa_socktab_links socktab_links;
+
+	/**
+	 * @active_rpcs: List of all existing RPCs related to this socket,
+	 * including both client and server RPCs. This list isn't strictly
+	 * needed, since RPCs are already in one of the hash tables below,
+	 * but it's more efficient for homa_timer to have this list
+	 * (so it doesn't have to scan large numbers of hash buckets).
+	 * The list is sorted, with the oldest RPC first. Manipulate with
+	 * RCU so timer can access without locking.
+	 */
+	struct list_head active_rpcs;
+
+	/**
+	 * @dead_rpcs: Contains RPCs for which homa_rpc_free has been
+	 * called, but their packet buffers haven't yet been freed.
+	 */
+	struct list_head dead_rpcs;
+
+	/** @dead_skbs: Total number of socket buffers in RPCs on dead_rpcs. */
+	int dead_skbs;
+
+	/**
+	 * @waiting_for_bufs: Contains RPCs that are blocked because there
+	 * wasn't enough space in the buffer pool region for their incoming
+	 * messages. Sorted in increasing order of message length.
+	 */
+	struct list_head waiting_for_bufs;
+
+	/**
+	 * @ready_requests: Contains server RPCs whose request message is
+	 * in a state requiring attention from a user process. The head is
+	 * oldest, i.e. next to return.
+	 */
+	struct list_head ready_requests;
+
+	/**
+	 * @ready_responses: Contains client RPCs whose response message is
+	 * in a state requiring attention from a user process. The head is
+	 * oldest, i.e. next to return.
+	 */
+	struct list_head ready_responses;
+
+	/**
+	 * @request_interests: List of threads that want to receive incoming
+	 * request messages.
+	 */
+	struct list_head request_interests;
+
+	/**
+	 * @response_interests: List of threads that want to receive incoming
+	 * response messages.
+	 */
+	struct list_head response_interests;
+
+	/**
+	 * @client_rpc_buckets: Hash table for fast lookup of client RPCs.
+	 * Modifications are synchronized with bucket locks, not
+	 * the socket lock.
+	 */
+	struct homa_rpc_bucket client_rpc_buckets[HOMA_CLIENT_RPC_BUCKETS];
+
+	/**
+	 * @server_rpc_buckets: Hash table for fast lookup of server RPCs.
+	 * Modifications are synchronized with bucket locks, not
+	 * the socket lock.
+	 */
+	struct homa_rpc_bucket server_rpc_buckets[HOMA_SERVER_RPC_BUCKETS];
+
+	/**
+	 * @buffer_pool: used to allocate buffer space for incoming messages.
+	 * Storage is dynamically allocated.
+	 */
+	struct homa_pool *buffer_pool;
+};
+
+extern void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, __u64 id);
+extern int homa_sock_bind(struct homa_socktab *socktab,
+		struct homa_sock *hsk, __u16 port);
+extern void homa_sock_destroy(struct homa_sock *hsk);
+extern struct homa_sock *
+		homa_sock_find(struct homa_socktab *socktab, __u16 port);
+extern void homa_sock_init(struct homa_sock *hsk, struct homa *homa);
+extern void homa_sock_shutdown(struct homa_sock *hsk);
+extern int homa_socket(struct sock *sk);
+extern void homa_socktab_destroy(struct homa_socktab *socktab);
+extern void homa_socktab_init(struct homa_socktab *socktab);
+extern struct homa_sock
+		*homa_socktab_next(struct homa_socktab_scan *scan);
+extern struct homa_sock
+		*homa_socktab_start_scan(struct homa_socktab *socktab,
+		struct homa_socktab_scan *scan);
+
+/**
+ * homa_sock_lock() - Acquire the lock for a socket. If the socket
+ * isn't immediately available, record stats on the waiting time.
+ * @hsk:     Socket to lock.
+ * @locker:  Static string identifying where the socket was locked;
+ *           used to track down deadlocks.
+ */
+static inline void homa_sock_lock(struct homa_sock *hsk, const char *locker)
+{
+	if (!spin_trylock_bh(&hsk->lock)) {
+//		printk(KERN_NOTICE "Slow path for socket %d, last locker %s",
+//				hsk->client_port, hsk->last_locker);
+		homa_sock_lock_slow(hsk);
+	}
+//	hsk->last_locker = locker;
+}
+
+/**
+ * homa_sock_unlock() - Release the lock for a socket.
+ * @hsk:   Socket to unlock.
+ */
+static inline void homa_sock_unlock(struct homa_sock *hsk)
+{
+	spin_unlock_bh(&hsk->lock);
+}
+
+/**
+ * homa_client_rpc_bucket() - Find the bucket containing a given
+ * client RPC.
+ * @hsk:      Socket associated with the RPC.
+ * @id:       Id of the desired RPC.
+ *
+ * Return:    The bucket in which this RPC will appear, if the RPC exists.
+ */
+static inline struct homa_rpc_bucket *homa_client_rpc_bucket(
+		struct homa_sock *hsk, __u64 id)
+{
+	/* We can use a really simple hash function here because RPC ids
+	 * are allocated sequentially.
+	 */
+	return &hsk->client_rpc_buckets[(id >> 1)
+			& (HOMA_CLIENT_RPC_BUCKETS - 1)];
+}
+
+/**
+ * homa_server_rpc_bucket() - Find the bucket containing a given
+ * server RPC.
+ * @hsk:      Socket associated with the RPC.
+ * @id:       Id of the desired RPC.
+ *
+ * Return:    The bucket in which this RPC will appear, if the RPC exists.
+ */
+static inline struct homa_rpc_bucket *homa_server_rpc_bucket(
+		struct homa_sock *hsk, __u64 id)
+{
+	/* Each client allocates RPC ids sequentially, so they will
+	 * naturally distribute themselves across the hash space.
+	 * Thus we can use the id directly as hash.
+	 */
+	return &hsk->server_rpc_buckets[(id >> 1)
+			& (HOMA_SERVER_RPC_BUCKETS - 1)];
+}
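Since the low-order bit of an id only encodes the client/server role and ids
otherwise advance sequentially, dropping that bit before masking spreads
consecutive RPCs across consecutive buckets. A worked example of the
arithmetic above (values ours):

	/* Client ids 100 and 102 map to adjacent buckets:
	 *
	 *   (100 >> 1) & (HOMA_CLIENT_RPC_BUCKETS - 1) == 50
	 *   (102 >> 1) & (HOMA_CLIENT_RPC_BUCKETS - 1) == 51
	 *
	 * Ids that are 2 * HOMA_CLIENT_RPC_BUCKETS = 2048 apart collide:
	 * ids 100 and 2148 both map to bucket 50, so those two RPCs share
	 * a single homa_rpc_bucket lock.
	 */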
+/**
+ * homa_bucket_lock() - Acquire the lock for an RPC hash table bucket.
+ * @bucket:   Bucket to lock
+ * @id:       ID of the RPC that is requesting the lock. Normally ignored,
+ *            but used occasionally for diagnostics and debugging.
+ * @locker:   Static string identifying the locking code. Normally ignored,
+ *            but used occasionally for diagnostics and debugging.
+ */
+static inline void homa_bucket_lock(struct homa_rpc_bucket *bucket,
+		__u64 id, const char *locker)
+{
+	if (!spin_trylock_bh(&bucket->lock))
+		homa_bucket_lock_slow(bucket, id);
+}
+
+/**
+ * homa_bucket_try_lock() - Acquire the lock for an RPC hash table bucket if
+ * it is available.
+ * @bucket:   Bucket to lock
+ * @id:       ID of the RPC that is requesting the lock.
+ * @locker:   Static string identifying the locking code. Normally ignored,
+ *            but used when debugging deadlocks.
+ *
+ * Return: Nonzero if lock was successfully acquired, zero if it is
+ *         currently owned by someone else.
+ */
+static inline int homa_bucket_try_lock(struct homa_rpc_bucket *bucket,
+		__u64 id, const char *locker)
+{
+	if (!spin_trylock_bh(&bucket->lock))
+		return 0;
+	return 1;
+}
+
+/**
+ * homa_bucket_unlock() - Release the lock for an RPC hash table bucket.
+ * @bucket:   Bucket to unlock.
+ * @id:       ID of the RPC that was using the lock.
+ */
+static inline void homa_bucket_unlock(struct homa_rpc_bucket *bucket, __u64 id)
+{
+	spin_unlock_bh(&bucket->lock);
+}
+
+#endif /* _HOMA_SOCK_H */
\ No newline at end of file
diff --git a/homa_utils.c b/homa_utils.c
index e1d17b4f..2aa7f666 100644
--- a/homa_utils.c
+++ b/homa_utils.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: BSD-2-Clause
 
-/* This file contains miscellaneous utility functions for the Homa protocol. */
+/* This file contains miscellaneous utility functions for Homa, such
+ * as initializing and destroying homa structs.
+ */
 
 #include "homa_impl.h"
 
@@ -83,7 +85,6 @@ int homa_init(struct homa *homa)
 	core->last_app_active = 0;
 	core->held_skb = NULL;
 	core->held_bucket = 0;
-	core->rpcs_locked = 0;
 	core->skb_page = NULL;
 	core->page_inuse = 0;
 	core->page_size = 0;
@@ -237,771 +238,6 @@ void homa_destroy(struct homa *homa)
 	kfree(homa->metrics);
 }
 
-/**
- * homa_rpc_new_client() - Allocate and construct a client RPC (one that is used
- * to issue an outgoing request). Doesn't send any packets. Invoked with no
- * locks held.
- * @hsk:      Socket to which the RPC belongs.
- * @dest:     Address of host (ip and port) to which the RPC will be sent.
- *
- * Return:    A printer to the newly allocated object, or a negative
- *            errno if an error occurred. The RPC will be locked; the
- *            caller must eventually unlock it.
- */
-struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk,
-		const union sockaddr_in_union *dest)
-{
-	int err;
-	struct homa_rpc *crpc;
-	struct homa_rpc_bucket *bucket;
-	struct in6_addr dest_addr_as_ipv6 = canonical_ipv6_addr(dest);
-
-	crpc = kmalloc(sizeof(*crpc), GFP_KERNEL);
-	if (unlikely(!crpc))
-		return ERR_PTR(-ENOMEM);
-
-	/* Initialize fields that don't require the socket lock. */
-	crpc->hsk = hsk;
-	crpc->id = atomic64_fetch_add(2, &hsk->homa->next_outgoing_id);
-	bucket = homa_client_rpc_bucket(hsk, crpc->id);
-	crpc->bucket = bucket;
-	crpc->state = RPC_OUTGOING;
-	atomic_set(&crpc->flags, 0);
-	atomic_set(&crpc->grants_in_progress, 0);
-	crpc->peer = homa_peer_find(&hsk->homa->peers, &dest_addr_as_ipv6,
-			&hsk->inet);
-	if (IS_ERR(crpc->peer)) {
-		tt_record("error in homa_peer_find");
-		err = PTR_ERR(crpc->peer);
-		goto error;
-	}
-	crpc->dport = ntohs(dest->in6.sin6_port);
-	crpc->completion_cookie = 0;
-	crpc->error = 0;
-	crpc->msgin.length = -1;
-	crpc->msgin.num_bpages = 0;
-	memset(&crpc->msgout, 0, sizeof(crpc->msgout));
-	crpc->msgout.length = -1;
-	INIT_LIST_HEAD(&crpc->ready_links);
-	INIT_LIST_HEAD(&crpc->buf_links);
-	INIT_LIST_HEAD(&crpc->dead_links);
-	crpc->interest = NULL;
-	INIT_LIST_HEAD(&crpc->grantable_links);
-	INIT_LIST_HEAD(&crpc->throttled_links);
-	crpc->silent_ticks = 0;
-	crpc->resend_timer_ticks = hsk->homa->timer_ticks;
-	crpc->done_timer_ticks = 0;
-	crpc->magic = HOMA_RPC_MAGIC;
-	crpc->start_cycles = get_cycles();
-
-	/* Initialize fields that require locking. This allows the most
-	 * expensive work, such as copying in the message from user space,
-	 * to be performed without holding locks.
Also, can't hold spin - * locks while doing things that could block, such as memory allocation. - */ - homa_bucket_lock(bucket, crpc->id, "homa_rpc_new_client"); - homa_sock_lock(hsk, "homa_rpc_new_client"); - if (hsk->shutdown) { - homa_sock_unlock(hsk); - homa_rpc_unlock(crpc); - err = -ESHUTDOWN; - goto error; - } - hlist_add_head(&crpc->hash_links, &bucket->rpcs); - list_add_tail_rcu(&crpc->active_links, &hsk->active_rpcs); - homa_sock_unlock(hsk); - - return crpc; - -error: - kfree(crpc); - return ERR_PTR(err); -} - -/** - * homa_rpc_new_server() - Allocate and construct a server RPC (one that is - * used to manage an incoming request). If appropriate, the RPC will also - * be handed off (we do it here, while we have the socket locked, to avoid - * acquiring the socket lock a second time later for the handoff). - * @hsk: Socket that owns this RPC. - * @source: IP address (network byte order) of the RPC's client. - * @h: Header for the first data packet received for this RPC; used - * to initialize the RPC. - * @created: Will be set to 1 if a new RPC was created and 0 if an - * existing RPC was found. - * - * Return: A pointer to a new RPC, which is locked, or a negative errno - * if an error occurred. If there is already an RPC corresponding - * to h, then it is returned instead of creating a new RPC. - */ -struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, - const struct in6_addr *source, struct data_header *h, - int *created) -{ - int err; - struct homa_rpc *srpc = NULL; - __u64 id = homa_local_id(h->common.sender_id); - struct homa_rpc_bucket *bucket = homa_server_rpc_bucket(hsk, id); - - /* Lock the bucket, and make sure no-one else has already created - * the desired RPC. - */ - homa_bucket_lock(bucket, id, "homa_rpc_new_server"); - hlist_for_each_entry_rcu(srpc, &bucket->rpcs, hash_links) { - if ((srpc->id == id) && - (srpc->dport == ntohs(h->common.sport)) && - ipv6_addr_equal(&srpc->peer->addr, source)) { - /* RPC already exists; just return it instead - * of creating a new RPC. - */ - *created = 0; - return srpc; - } - } - - /* Initialize fields that don't require the socket lock. */ - srpc = kmalloc(sizeof(*srpc), GFP_KERNEL); - if (!srpc) { - err = -ENOMEM; - goto error; - } - srpc->hsk = hsk; - srpc->bucket = bucket; - srpc->state = RPC_INCOMING; - atomic_set(&srpc->flags, 0); - atomic_set(&srpc->grants_in_progress, 0); - srpc->peer = homa_peer_find(&hsk->homa->peers, source, &hsk->inet); - if (IS_ERR(srpc->peer)) { - err = PTR_ERR(srpc->peer); - goto error; - } - srpc->dport = ntohs(h->common.sport); - srpc->id = id; - srpc->completion_cookie = 0; - srpc->error = 0; - srpc->msgin.length = -1; - srpc->msgin.num_bpages = 0; - memset(&srpc->msgout, 0, sizeof(srpc->msgout)); - srpc->msgout.length = -1; - INIT_LIST_HEAD(&srpc->ready_links); - INIT_LIST_HEAD(&srpc->buf_links); - INIT_LIST_HEAD(&srpc->dead_links); - srpc->interest = NULL; - INIT_LIST_HEAD(&srpc->grantable_links); - INIT_LIST_HEAD(&srpc->throttled_links); - srpc->silent_ticks = 0; - srpc->resend_timer_ticks = hsk->homa->timer_ticks; - srpc->done_timer_ticks = 0; - srpc->magic = HOMA_RPC_MAGIC; - srpc->start_cycles = get_cycles(); - tt_record2("Incoming message for id %d has %d unscheduled bytes", - srpc->id, ntohl(h->incoming)); - err = homa_message_in_init(srpc, ntohl(h->message_length), - ntohl(h->incoming)); - if (err != 0) - goto error; - - /* Initialize fields that require socket to be locked. 
*/ - homa_sock_lock(hsk, "homa_rpc_new_server"); - if (hsk->shutdown) { - homa_sock_unlock(hsk); - err = -ESHUTDOWN; - goto error; - } - hlist_add_head(&srpc->hash_links, &bucket->rpcs); - list_add_tail_rcu(&srpc->active_links, &hsk->active_rpcs); - if ((ntohl(h->seg.offset) == 0) && (srpc->msgin.num_bpages > 0)) { - atomic_or(RPC_PKTS_READY, &srpc->flags); - homa_rpc_handoff(srpc); - } - homa_sock_unlock(hsk); - INC_METRIC(requests_received, 1); - *created = 1; - return srpc; - -error: - homa_bucket_unlock(bucket, id); - kfree(srpc); - return ERR_PTR(err); -} - -/** - * homa_bucket_lock_slow() - This function implements the slow path for - * locking a bucket in one of the hash tables of RPCs. It is invoked when a - * lock isn't immediately available. It waits for the lock, but also records - * statistics about the waiting time. - * @bucket: The hash table bucket to lock. - * @id: ID of the particular RPC being locked (multiple RPCs may - * share a single bucket lock). - */ -void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, __u64 id) -{ - __u64 start = get_cycles(); - - tt_record2("beginning wait for rpc lock, id %d (bucket %d)", - id, bucket->id); - spin_lock_bh(&bucket->lock); - tt_record2("ending wait for bucket lock, id %d (bucket %d)", - id, bucket->id); - if (homa_is_client(id)) { - INC_METRIC(client_lock_misses, 1); - INC_METRIC(client_lock_miss_cycles, get_cycles() - start); - } else { - INC_METRIC(server_lock_misses, 1); - INC_METRIC(server_lock_miss_cycles, get_cycles() - start); - } -} - -/** - * homa_rpc_acked() - This function is invoked when an ack is received - * for an RPC; if the RPC still exists, is freed. - * @hsk: Socket on which the ack was received. May or may not correspond - * to the RPC, but can sometimes be used to avoid a socket lookup. - * @saddr: Source address from which the act was received (the client - * note for the RPC) - * @ack: Information about an RPC from @saddr that may now be deleted safely. - */ -void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, - struct homa_ack *ack) -{ - struct homa_rpc *rpc; - struct homa_sock *hsk2 = hsk; - __u64 id = homa_local_id(ack->client_id); - __u16 client_port = ntohs(ack->client_port); - __u16 server_port = ntohs(ack->server_port); - - UNIT_LOG("; ", "ack %llu", id); - if (hsk2->port != server_port) { - /* Without RCU, sockets other than hsk can be deleted - * out from under us. - */ - rcu_read_lock(); - hsk2 = homa_sock_find(&hsk->homa->port_map, server_port); - if (!hsk2) - goto done; - } - rpc = homa_find_server_rpc(hsk2, saddr, client_port, id); - if (rpc) { - tt_record1("homa_rpc_acked freeing id %d", rpc->id); - homa_rpc_free(rpc); - homa_rpc_unlock(rpc); - } - -done: - if (hsk->port != server_port) - rcu_read_unlock(); -} - -/** - * homa_rpc_free() - Destructor for homa_rpc; will arrange for all resources - * associated with the RPC to be released (eventually). - * @rpc: Structure to clean up, or NULL. Must be locked. Its socket must - * not be locked. - */ -void homa_rpc_free(struct homa_rpc *rpc) -{ - /* The goal for this function is to make the RPC inaccessible, - * so that no other code will ever access it again. However, don't - * actually release resources; leave that to homa_rpc_reap, which - * runs later. There are two reasons for this. First, releasing - * resources may be expensive, so we don't want to keep the caller - * waiting; homa_rpc_reap will run in situations where there is time - * to spare. 
Second, there may be other code that currently has - * pointers to this RPC but temporarily released the lock (e.g. to - * copy data to/from user space). It isn't safe to clean up until - * that code has finished its work and released any pointers to the - * RPC (homa_rpc_reap will ensure that this has happened). So, this - * function should only make changes needed to make the RPC - * inaccessible. - */ - if (!rpc || (rpc->state == RPC_DEAD)) - return; - UNIT_LOG("; ", "homa_rpc_free invoked"); - tt_record1("homa_rpc_free invoked for id %d", rpc->id); - rpc->state = RPC_DEAD; - - /* The following line must occur before the socket is locked or - * RPC is added to dead_rpcs. This is necessary because homa_grant_free - * releases the RPC lock and reacquires it (see comment in - * homa_grant_free for more info). - */ - homa_grant_free_rpc(rpc); - - /* Unlink from all lists, so no-one will ever find this RPC again. */ - homa_sock_lock(rpc->hsk, "homa_rpc_free"); - __hlist_del(&rpc->hash_links); - list_del_rcu(&rpc->active_links); - list_add_tail_rcu(&rpc->dead_links, &rpc->hsk->dead_rpcs); - __list_del_entry(&rpc->ready_links); - __list_del_entry(&rpc->buf_links); - if (rpc->interest != NULL) { - rpc->interest->reg_rpc = NULL; - wake_up_process(rpc->interest->thread); - rpc->interest = NULL; - } -// tt_record3("Freeing rpc id %d, socket %d, dead_skbs %d", rpc->id, -// rpc->hsk->client_port, -// rpc->hsk->dead_skbs); - - if (rpc->msgin.length >= 0) { - rpc->hsk->dead_skbs += skb_queue_len(&rpc->msgin.packets); - while (1) { - struct homa_gap *gap = list_first_entry_or_null( - &rpc->msgin.gaps, struct homa_gap, links); - if (gap == NULL) - break; - list_del(&gap->links); - kfree(gap); - } - } - rpc->hsk->dead_skbs += rpc->msgout.num_skbs; - if (rpc->hsk->dead_skbs > rpc->hsk->homa->max_dead_buffs) - /* This update isn't thread-safe; it's just a - * statistic so it's OK if updates occasionally get - * missed. - */ - rpc->hsk->homa->max_dead_buffs = rpc->hsk->dead_skbs; - - homa_sock_unlock(rpc->hsk); - homa_remove_from_throttled(rpc); -} - -/** - * homa_rpc_reap() - Invoked to release resources associated with dead - * RPCs for a given socket. For a large RPC, it can take a long time to - * free all of its packet buffers, so we try to perform this work - * off the critical path where it won't delay applications. Each call to - * this function does a small chunk of work. See the file reap.txt for - * more information. - * @hsk: Homa socket that may contain dead RPCs. Must not be locked by the - * caller; this function will lock and release. - * @count: Number of buffers to free during this call. - * - * Return: A return value of 0 means that we ran out of work to do; calling - * again will do no work (there could be unreaped RPCs, but if so, - * reaping has been disabled for them). A value greater than - * zero means there is still more reaping work to be done. - */ -int homa_rpc_reap(struct homa_sock *hsk, int count) -{ -#ifdef __UNIT_TEST__ -#define BATCH_MAX 3 -#else -#define BATCH_MAX 20 -#endif - struct sk_buff *skbs[BATCH_MAX]; - struct homa_rpc *rpcs[BATCH_MAX]; - int num_skbs, num_rpcs; - struct homa_rpc *rpc; - int i, batch_size; - int rx_frees = 0; - int result; - - INC_METRIC(reaper_calls, 1); - INC_METRIC(reaper_dead_skbs, hsk->dead_skbs); - - /* Each iteration through the following loop will reap - * BATCH_MAX skbs. 
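Before reading the loop itself, it may help to see its skeleton:
homa_rpc_reap frees resources in bounded batches so that the socket lock is
never held for long. Roughly (a paraphrase of the control flow, not the code
below):

	while (count > 0) {
		batch_size = min(count, BATCH_MAX);
		count -= batch_size;

		homa_sock_lock(hsk, "homa_rpc_reap");
		/* Collect up to batch_size tx skbs and reapable RPCs
		 * from hsk->dead_rpcs.
		 */
		homa_sock_unlock(hsk);

		/* Free the collected skbs and RPCs with no lock held. */
	}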
- */ - while (count > 0) { - batch_size = count; - if (batch_size > BATCH_MAX) - batch_size = BATCH_MAX; - count -= batch_size; - num_skbs = num_rpcs = 0; - - homa_sock_lock(hsk, "homa_rpc_reap"); - if (atomic_read(&hsk->protect_count)) { - INC_METRIC(disabled_reaps, 1); - tt_record2("homa_rpc_reap returning: protect_count %d, dead_skbs %d", - atomic_read(&hsk->protect_count), - hsk->dead_skbs); - homa_sock_unlock(hsk); - return 0; - } - - /* Collect buffers and freeable RPCs. */ - list_for_each_entry_rcu(rpc, &hsk->dead_rpcs, dead_links) { - if ((atomic_read(&rpc->flags) & RPC_CANT_REAP) - || (atomic_read(&rpc->grants_in_progress) - != 0) - || (atomic_read(&rpc->msgout.active_xmits) - != 0)) { - INC_METRIC(disabled_rpc_reaps, 1); - continue; - } - rpc->magic = 0; - - /* For Tx sk_buffs, collect them here but defer - * freeing until after releasing the socket lock. - */ - if (rpc->msgout.length >= 0) { - while (rpc->msgout.packets) { - skbs[num_skbs] = rpc->msgout.packets; - rpc->msgout.packets = homa_get_skb_info( - rpc->msgout.packets) - ->next_skb; - num_skbs++; - rpc->msgout.num_skbs--; - if (num_skbs >= batch_size) - goto release; - } - } - - /* In the normal case rx sk_buffs will already have been - * freed before we got here. Thus it's OK to free - * immediately in rare situations where there are - * buffers left. - */ - if (rpc->msgin.length >= 0) { - while (1) { - struct sk_buff *skb; - - skb = skb_dequeue(&rpc->msgin.packets); - if (!skb) - break; - kfree_skb(skb); - rx_frees++; - } - } - - /* If we get here, it means all packets have been - * removed from the RPC. - */ - rpcs[num_rpcs] = rpc; - num_rpcs++; - list_del_rcu(&rpc->dead_links); - if (num_rpcs >= batch_size) - goto release; - } - - /* Free all of the collected resources; release the socket - * lock while doing this. - */ -release: - hsk->dead_skbs -= num_skbs + rx_frees; - result = !list_empty(&hsk->dead_rpcs) - && ((num_skbs + num_rpcs) != 0); - homa_sock_unlock(hsk); - homa_skb_free_many_tx(hsk->homa, skbs, num_skbs); - for (i = 0; i < num_rpcs; i++) { - rpc = rpcs[i]; - UNIT_LOG("; ", "reaped %llu", rpc->id); - /* Lock and unlock the RPC before freeing it. This - * is needed to deal with races where the code - * that invoked homa_rpc_free hasn't unlocked the - * RPC yet. - */ - homa_rpc_lock(rpc, "homa_rpc_reap"); - homa_rpc_unlock(rpc); - - if (unlikely(rpc->msgin.num_bpages)) - homa_pool_release_buffers( - &rpc->hsk->buffer_pool, - rpc->msgin.num_bpages, - rpc->msgin.bpage_offsets); - if (rpc->msgin.length >= 0) { - while (1) { - struct homa_gap *gap = list_first_entry_or_null( - &rpc->msgin.gaps, - struct homa_gap, links); - if (gap == NULL) - break; - list_del(&gap->links); - kfree(gap); - } - } - tt_record1("homa_rpc_reap finished reaping id %d", - rpc->id); - rpc->state = 0; - kfree(rpc); - } - tt_record4("reaped %d skbs, %d rpcs; %d skbs remain for port %d", - num_skbs + rx_frees, num_rpcs, hsk->dead_skbs, - hsk->port); - if (!result) - break; - } - homa_pool_check_waiting(&hsk->buffer_pool); - return result; -} - -/** - * homa_find_client_rpc() - Locate client-side information about the RPC that - * a packet belongs to, if there is any. Thread-safe without socket lock. - * @hsk: Socket via which packet was received. - * @id: Unique identifier for the RPC. - * - * Return: A pointer to the homa_rpc for this id, or NULL if none. - * The RPC will be locked; the caller must eventually unlock it - * by invoking homa_rpc_unlock. 
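Note the locking contract here: a successful lookup returns with the bucket
lock held, so every caller must pair the call with homa_rpc_unlock
(homa_rpc_acked above is one example). In miniature (hypothetical caller):

	static void lookup_example(struct homa_sock *hsk, __u64 id)
	{
		struct homa_rpc *rpc = homa_find_client_rpc(hsk, id);

		if (rpc) {
			/* rpc is locked here; safe to use. */
			homa_rpc_unlock(rpc);
		}
		/* On NULL, no such RPC exists and no lock is held. */
	}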
- */ -struct homa_rpc *homa_find_client_rpc(struct homa_sock *hsk, __u64 id) -{ - struct homa_rpc *crpc; - struct homa_rpc_bucket *bucket = homa_client_rpc_bucket(hsk, id); - - homa_bucket_lock(bucket, id, __func__); - hlist_for_each_entry_rcu(crpc, &bucket->rpcs, hash_links) { - if (crpc->id == id) - return crpc; - } - homa_bucket_unlock(bucket, id); - return NULL; -} - -/** - * homa_find_server_rpc() - Locate server-side information about the RPC that - * a packet belongs to, if there is any. Thread-safe without socket lock. - * @hsk: Socket via which packet was received. - * @saddr: Address from which the packet was sent. - * @sport: Port at @saddr from which the packet was sent. - * @id: Unique identifier for the RPC (must have server bit set). - * - * Return: A pointer to the homa_rpc matching the arguments, or NULL - * if none. The RPC will be locked; the caller must eventually - * unlock it by invoking homa_rpc_unlock. - */ -struct homa_rpc *homa_find_server_rpc(struct homa_sock *hsk, - const struct in6_addr *saddr, __u16 sport, __u64 id) -{ - struct homa_rpc *srpc; - struct homa_rpc_bucket *bucket = homa_server_rpc_bucket(hsk, id); - - homa_bucket_lock(bucket, id, __func__); - hlist_for_each_entry_rcu(srpc, &bucket->rpcs, hash_links) { - if ((srpc->id == id) && (srpc->dport == sport) && - ipv6_addr_equal(&srpc->peer->addr, saddr)) - return srpc; - } - homa_bucket_unlock(bucket, id); - return NULL; -} - -/** - * homa_rpc_log() - Log info about a particular RPC; this is functionality - * pulled out of homa_rpc_log_active because its indentation got too deep. - * @rpc: RPC for which key info should be written to the system log. - */ -void homa_rpc_log(struct homa_rpc *rpc) -{ - char *type = homa_is_client(rpc->id) ? "Client" : "Server"; - char *peer = homa_print_ipv6_addr(&rpc->peer->addr); - - if (rpc->state == RPC_INCOMING) - pr_notice("%s RPC INCOMING, id %llu, peer %s:%d, %d/%d bytes received, incoming %d\n", - type, rpc->id, peer, rpc->dport, - rpc->msgin.length - - rpc->msgin.bytes_remaining, - rpc->msgin.length, rpc->msgin.granted); - else if (rpc->state == RPC_OUTGOING) { - pr_notice("%s RPC OUTGOING, id %llu, peer %s:%d, out length %d, left %d, granted %d, in left %d, resend_ticks %u, silent_ticks %d\n", - type, rpc->id, peer, rpc->dport, - rpc->msgout.length, - rpc->msgout.length - rpc->msgout.next_xmit_offset, - rpc->msgout.granted, - rpc->msgin.bytes_remaining, - rpc->resend_timer_ticks, - rpc->silent_ticks); - } else { - pr_notice("%s RPC %s, id %llu, peer %s:%d, incoming length %d, outgoing length %d\n", - type, homa_symbol_for_state(rpc), - rpc->id, peer, rpc->dport, - rpc->msgin.length, rpc->msgout.length); - } -} - -/** - * homa_rpc_log_active() - Print information to the system log about all - * active RPCs. Intended primarily for debugging. - * @homa: Overall data about the Homa protocol implementation. - * @id: An RPC id: if nonzero, then only RPCs with this id will be - * logged. 
- */ -void homa_rpc_log_active(struct homa *homa, uint64_t id) -{ - struct homa_socktab_scan scan; - struct homa_sock *hsk; - struct homa_rpc *rpc; - int count = 0; - - pr_notice("Logging active Homa RPCs:\n"); - rcu_read_lock(); - for (hsk = homa_socktab_start_scan(&homa->port_map, &scan); - hsk != NULL; hsk = homa_socktab_next(&scan)) { - if (list_empty(&hsk->active_rpcs) || hsk->shutdown) - continue; - - if (!homa_protect_rpcs(hsk)) - continue; - list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { - count++; - if ((id != 0) && (id != rpc->id)) - continue; - homa_rpc_log(rpc); - } - homa_unprotect_rpcs(hsk); - } - rcu_read_unlock(); - pr_notice("Finished logging active Homa RPCs: %d active RPCs\n", count); -} - -/** - * homa_rpc_log_tt() - Log info about a particular RPC using timetraces. - * @rpc: RPC for which key info should be written to the system log. - */ -void homa_rpc_log_tt(struct homa_rpc *rpc) -{ - if (rpc->state == RPC_INCOMING) { - int received = rpc->msgin.length - - rpc->msgin.bytes_remaining; - tt_record4("Incoming RPC id %d, peer 0x%x, %d/%d bytes received", - rpc->id, tt_addr(rpc->peer->addr), - received, rpc->msgin.length); - if (1) - tt_record4("RPC id %d has incoming %d, granted %d, prio %d", rpc->id, - rpc->msgin.granted - received, - rpc->msgin.granted, rpc->msgin.priority); - tt_record4("RPC id %d: length %d, remaining %d, rank %d", - rpc->id, rpc->msgin.length, - rpc->msgin.bytes_remaining, - atomic_read(&rpc->msgin.rank)); - if (rpc->msgin.num_bpages == 0) - tt_record1("RPC id %d is blocked waiting for buffers", - rpc->id); - else - tt_record2("RPC id %d has %d bpages allocated", - rpc->id, rpc->msgin.num_bpages); - } else if (rpc->state == RPC_OUTGOING) { - tt_record4("Outgoing RPC id %d, peer 0x%x, %d/%d bytes sent", - rpc->id, tt_addr(rpc->peer->addr), - rpc->msgout.next_xmit_offset, - rpc->msgout.length); - if (rpc->msgout.granted > rpc->msgout.next_xmit_offset) - tt_record3("RPC id %d has %d unsent grants (granted %d)", - rpc->id, rpc->msgout.granted - - rpc->msgout.next_xmit_offset, - rpc->msgout.granted); - } else { - tt_record2("RPC id %d is in state %d", rpc->id, rpc->state); - } -} - -/** - * homa_rpc_log_active_tt() - Log information about all active RPCs using - * timetraces. - * @homa: Overall data about the Homa protocol implementation. 
- * @freeze_count: If nonzero, FREEZE requests will be sent for this many - * incoming RPCs with outstanding grants - */ -void homa_rpc_log_active_tt(struct homa *homa, int freeze_count) -{ - struct homa_socktab_scan scan; - struct homa_sock *hsk; - struct homa_rpc *rpc; - int count = 0; - - homa_grant_log_tt(homa); - tt_record("Logging active Homa RPCs:"); - rcu_read_lock(); - for (hsk = homa_socktab_start_scan(&homa->port_map, &scan); - hsk != NULL; hsk = homa_socktab_next(&scan)) { - if (list_empty(&hsk->active_rpcs) || hsk->shutdown) - continue; - - if (!homa_protect_rpcs(hsk)) - continue; - list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { - struct freeze_header freeze; - - count++; - homa_rpc_log_tt(rpc); - if (freeze_count == 0) - continue; - if (rpc->state != RPC_INCOMING) - continue; - if (rpc->msgin.granted <= (rpc->msgin.length - - rpc->msgin.bytes_remaining)) - continue; - freeze_count--; - pr_notice("Emitting FREEZE in %s\n", __func__); - homa_xmit_control(FREEZE, &freeze, sizeof(freeze), rpc); - } - homa_unprotect_rpcs(hsk); - } - rcu_read_unlock(); - tt_record1("Finished logging (%d active Homa RPCs)", count); -} - -/** - * homa_validate_incoming() - Scan all of the active RPCs to compute what - * homa_total_incoming should be, and see if it actually matches. - * @homa: Overall data about the Homa protocol implementation. - * @verbose: Print incoming info for each individual RPC. - * @link_errors: Set to 1 if one or more grantable RPCs don't seem to - * be linked into the grantable lists. - * Return: The difference between the actual value of homa->total_incoming - * and the expected value computed from the individual RPCs (positive - * means homa->total_incoming is higher than expected). - */ -int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors) -{ - struct homa_socktab_scan scan; - struct homa_sock *hsk; - struct homa_rpc *rpc; - int total_incoming = 0; - int actual; - - tt_record1("homa_validate_incoming starting, total_incoming %d", - atomic_read(&homa->total_incoming)); - *link_errors = 0; - rcu_read_lock(); - for (hsk = homa_socktab_start_scan(&homa->port_map, &scan); - hsk != NULL; hsk = homa_socktab_next(&scan)) { - if (list_empty(&hsk->active_rpcs) || hsk->shutdown) - continue; - - if (!homa_protect_rpcs(hsk)) - continue; - list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { - int incoming; - - if (rpc->state != RPC_INCOMING) - continue; - incoming = rpc->msgin.granted - - (rpc->msgin.length - - rpc->msgin.bytes_remaining); - if (incoming < 0) - incoming = 0; - if (rpc->msgin.rec_incoming == 0) - continue; - total_incoming += rpc->msgin.rec_incoming; - if (verbose) - tt_record3("homa_validate_incoming: RPC id %d, ncoming %d, rec_incoming %d", - rpc->id, incoming, - rpc->msgin.rec_incoming); - if (rpc->msgin.granted >= rpc->msgin.length) - continue; - if (list_empty(&rpc->grantable_links)) { - tt_record1("homa_validate_incoming: RPC id %d not linked in grantable list", - rpc->id); - *link_errors = 1; - } - if (list_empty(&rpc->grantable_links)) { - tt_record1("homa_validate_incoming: RPC id %d peer not linked in grantable list", - rpc->id); - *link_errors = 1; - } - } - homa_unprotect_rpcs(hsk); - } - rcu_read_unlock(); - actual = atomic_read(&homa->total_incoming); - tt_record3("homa_validate_incoming diff %d (expected %d, got %d)", - actual - total_incoming, total_incoming, actual); - return actual - total_incoming; -} - /** * homa_print_ipv4_addr() - Convert an IPV4 address to the standard string * representation. 
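For reference, the invariant checked by homa_validate_incoming (removed here
and re-added earlier in this patch) is that homa->total_incoming matches the
per-RPC sums of granted-but-unreceived bytes. With hypothetical numbers:

	/* For one RPC with:
	 *   msgin.length          = 100000
	 *   msgin.bytes_remaining =  60000   (40000 bytes received so far)
	 *   msgin.granted         =  70000
	 * the outstanding bytes are
	 *   incoming = granted - (length - bytes_remaining)
	 *            = 70000 - (100000 - 60000) = 30000
	 * and the RPC contributes msgin.rec_incoming to the total.
	 */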
@@ -1404,34 +640,6 @@ int homa_snprintf(char *buffer, int size, int used, const char *format, ...) return used + new_chars; } -/** - * homa_symbol_for_state() - Returns a printable string describing an - * RPC state. - * @rpc: RPC whose state should be returned in printable form. - * - * Return: A static string holding the current state of @rpc. - */ -char *homa_symbol_for_state(struct homa_rpc *rpc) -{ - static char buffer[20]; - - switch (rpc->state) { - case RPC_OUTGOING: - return "OUTGOING"; - case RPC_INCOMING: - return "INCOMING"; - case RPC_IN_SERVICE: - return "IN_SERVICE"; - case RPC_DEAD: - return "DEAD"; - } - - /* See safety comment in homa_symbol_for_type. */ - snprintf(buffer, sizeof(buffer)-1, "unknown(%u)", rpc->state); - buffer[sizeof(buffer)-1] = 0; - return buffer; -} - /** * homa_symbol_for_type() - Returns a printable string describing a packet type. * @type: A value from those defined by &homa_packet_type. diff --git a/test/Makefile b/test/Makefile index bf03aa9b..c5273d70 100644 --- a/test/Makefile +++ b/test/Makefile @@ -45,8 +45,9 @@ TEST_SRCS := unit_homa_grant.c \ unit_homa_peertab.c \ unit_homa_pool.c \ unit_homa_plumbing.c \ + unit_homa_rpc.c \ unit_homa_skb.c \ - unit_homa_socktab.c \ + unit_homa_sock.c \ unit_homa_timer.c \ unit_homa_utils.c \ unit_timetrace.c @@ -60,8 +61,9 @@ HOMA_SRCS := homa_grant.c \ homa_peertab.c \ homa_pool.c \ homa_plumbing.c \ + homa_rpc.c \ homa_skb.c \ - homa_socktab.c \ + homa_sock.c \ homa_timer.c \ homa_utils.c \ timetrace.c diff --git a/test/mock.c b/test/mock.c index 88509416..013ecae6 100644 --- a/test/mock.c +++ b/test/mock.c @@ -746,6 +746,11 @@ void *mock_kmalloc(size_t size, gfp_t flags) return block; } +void *__kmalloc_noprof(size_t size, gfp_t flags) +{ + return mock_kmalloc(size, flags); +} + struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 8443cd80..635ac5a8 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -194,7 +194,7 @@ TEST_F(homa_incoming, homa_message_in_init__pool_doesnt_exist) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); - homa_pool_destroy(&self->hsk.buffer_pool); + homa_pool_destroy(self->hsk.buffer_pool); EXPECT_EQ(ENOMEM, -homa_message_in_init(crpc, HOMA_BPAGE_SIZE*2, 0)); EXPECT_EQ(0, crpc->msgin.num_bpages); } @@ -203,7 +203,7 @@ TEST_F(homa_incoming, homa_message_in_init__no_buffers_available) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); - atomic_set(&self->hsk.buffer_pool.free_bpages, 0); + atomic_set(&self->hsk.buffer_pool->free_bpages, 0); EXPECT_EQ(0, homa_message_in_init(crpc, HOMA_BPAGE_SIZE*2, 10000)); EXPECT_EQ(0, crpc->msgin.num_bpages); EXPECT_EQ(0, crpc->msgin.granted); @@ -1135,7 +1135,7 @@ TEST_F(homa_incoming, homa_data_pkt__no_buffer_pool) UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 1000, 1600); ASSERT_NE(NULL, crpc); - homa_pool_destroy(&self->hsk.buffer_pool); + homa_pool_destroy(self->hsk.buffer_pool); unit_log_clear(); homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, 1400, 0), crpc); @@ -1161,7 +1161,7 @@ TEST_F(homa_incoming, homa_data_pkt__no_buffers) EXPECT_NE(NULL, crpc); unit_log_clear(); - atomic_set(&self->hsk.buffer_pool.free_bpages, 0); + 
atomic_set(&self->hsk.buffer_pool->free_bpages, 0); homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, 1400, 0), crpc); EXPECT_EQ(1400, homa_metrics_per_cpu()->dropped_data_no_bufs); diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index ad47ca1c..3bc4e227 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -262,12 +262,12 @@ TEST_F(homa_plumbing, homa_set_sock_opt__success) & ~(PAGE_SIZE - 1)); args.length = 64*HOMA_BPAGE_SIZE; self->optval.user = &args; - homa_pool_destroy(&self->hsk.buffer_pool); + homa_pool_destroy(self->hsk.buffer_pool); EXPECT_EQ(0, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, SO_HOMA_SET_BUF, self->optval, sizeof(struct homa_set_buf_args))); - EXPECT_EQ(args.start, self->hsk.buffer_pool.region); - EXPECT_EQ(64, self->hsk.buffer_pool.num_bpages); + EXPECT_EQ(args.start, self->hsk.buffer_pool->region); + EXPECT_EQ(64, self->hsk.buffer_pool->num_bpages); EXPECT_EQ(1, homa_metrics_per_cpu()->so_set_buf_calls); } @@ -474,18 +474,18 @@ TEST_F(homa_plumbing, homa_recvmsg__bogus_flags) } TEST_F(homa_plumbing, homa_recvmsg__release_buffers) { - EXPECT_EQ(0, -homa_pool_get_pages(&self->hsk.buffer_pool, 2, + EXPECT_EQ(0, -homa_pool_get_pages(self->hsk.buffer_pool, 2, self->recvmsg_args.bpage_offsets, 0)); - EXPECT_EQ(1, atomic_read(&self->hsk.buffer_pool.descriptors[0].refs)); - EXPECT_EQ(1, atomic_read(&self->hsk.buffer_pool.descriptors[1].refs)); + EXPECT_EQ(1, atomic_read(&self->hsk.buffer_pool->descriptors[0].refs)); + EXPECT_EQ(1, atomic_read(&self->hsk.buffer_pool->descriptors[1].refs)); self->recvmsg_args.num_bpages = 2; self->recvmsg_args.bpage_offsets[0] = 0; self->recvmsg_args.bpage_offsets[1] = HOMA_BPAGE_SIZE; EXPECT_EQ(EAGAIN, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, 0, 0, &self->recvmsg_hdr.msg_namelen)); - EXPECT_EQ(0, atomic_read(&self->hsk.buffer_pool.descriptors[0].refs)); - EXPECT_EQ(0, atomic_read(&self->hsk.buffer_pool.descriptors[1].refs)); + EXPECT_EQ(0, atomic_read(&self->hsk.buffer_pool->descriptors[0].refs)); + EXPECT_EQ(0, atomic_read(&self->hsk.buffer_pool->descriptors[1].refs)); } TEST_F(homa_plumbing, homa_recvmsg__error_in_homa_wait_for_message) { @@ -514,7 +514,7 @@ TEST_F(homa_plumbing, homa_recvmsg__normal_completion_ipv4) mock_sock_init(&self->hsk, &self->homa, 0); __u32 pages[2]; - EXPECT_EQ(0, -homa_pool_get_pages(&self->hsk.buffer_pool, 2, pages, 0)); + EXPECT_EQ(0, -homa_pool_get_pages(self->hsk.buffer_pool, 2, pages, 0)); struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 100, 2000); @@ -641,10 +641,10 @@ TEST_F(homa_plumbing, homa_recvmsg__error_copying_out_args) } TEST_F(homa_plumbing, homa_recvmsg__copy_back_args_even_after_error) { - EXPECT_EQ(0, -homa_pool_get_pages(&self->hsk.buffer_pool, 2, + EXPECT_EQ(0, -homa_pool_get_pages(self->hsk.buffer_pool, 2, self->recvmsg_args.bpage_offsets, 0)); - EXPECT_EQ(1, atomic_read(&self->hsk.buffer_pool.descriptors[0].refs)); - EXPECT_EQ(1, atomic_read(&self->hsk.buffer_pool.descriptors[1].refs)); + EXPECT_EQ(1, atomic_read(&self->hsk.buffer_pool->descriptors[0].refs)); + EXPECT_EQ(1, atomic_read(&self->hsk.buffer_pool->descriptors[1].refs)); self->recvmsg_args.num_bpages = 2; self->recvmsg_args.bpage_offsets[0] = 0; self->recvmsg_args.bpage_offsets[1] = HOMA_BPAGE_SIZE; diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index 96496713..af8ed0c0 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -24,7 +24,7 @@ 
FIXTURE_SETUP(homa_pool) mock_sock_init(&self->hsk, &self->homa, 0); self->client_ip = unit_get_in_addr("196.168.0.1"); self->server_ip = unit_get_in_addr("1.2.3.4"); - cur_pool = &self->hsk.buffer_pool; + cur_pool = self->hsk.buffer_pool; } FIXTURE_TEARDOWN(homa_pool) { @@ -67,7 +67,7 @@ static void change_owner_hook(char *id) TEST_F(homa_pool, homa_pool_set_bpages_needed) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; atomic_set(&pool->free_bpages, 0); unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, 4000, 98, 1000, 2*HOMA_BPAGE_SIZE+1); @@ -80,33 +80,33 @@ TEST_F(homa_pool, homa_pool_set_bpages_needed) TEST_F(homa_pool, homa_pool_init__basics) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; EXPECT_EQ(100, pool->num_bpages); EXPECT_EQ(-1, pool->descriptors[98].owner); } TEST_F(homa_pool, homa_pool_init__region_not_page_aligned) { - homa_pool_destroy(&self->hsk.buffer_pool); + homa_pool_destroy(self->hsk.buffer_pool); EXPECT_EQ(EINVAL, -homa_pool_init(&self->hsk, ((char *) 0x1000000) + 10, 100*HOMA_BPAGE_SIZE)); } TEST_F(homa_pool, homa_pool_init__region_too_small) { - homa_pool_destroy(&self->hsk.buffer_pool); + homa_pool_destroy(self->hsk.buffer_pool); EXPECT_EQ(EINVAL, -homa_pool_init(&self->hsk, (void *) 0x1000000, HOMA_BPAGE_SIZE)); } TEST_F(homa_pool, homa_pool_init__cant_allocate_descriptors) { mock_kmalloc_errors = 1; - homa_pool_destroy(&self->hsk.buffer_pool); + homa_pool_destroy(self->hsk.buffer_pool); EXPECT_EQ(ENOMEM, -homa_pool_init(&self->hsk, (void *) 0x100000, 100*HOMA_BPAGE_SIZE)); } TEST_F(homa_pool, homa_pool_init__cant_allocate_core_info) { - homa_pool_destroy(&self->hsk.buffer_pool); + homa_pool_destroy(self->hsk.buffer_pool); mock_kmalloc_errors = 2; EXPECT_EQ(ENOMEM, -homa_pool_init(&self->hsk, (void *) 0x100000, 100*HOMA_BPAGE_SIZE)); @@ -114,13 +114,13 @@ TEST_F(homa_pool, homa_pool_init__cant_allocate_core_info) TEST_F(homa_pool, homa_pool_destroy__idempotent) { - homa_pool_destroy(&self->hsk.buffer_pool); - homa_pool_destroy(&self->hsk.buffer_pool); + homa_pool_destroy(self->hsk.buffer_pool); + homa_pool_destroy(self->hsk.buffer_pool); } TEST_F(homa_pool, homa_pool_get_pages__basics) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; __u32 pages[10]; EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); EXPECT_EQ(0, pages[0]); @@ -132,7 +132,7 @@ TEST_F(homa_pool, homa_pool_get_pages__basics) } TEST_F(homa_pool, homa_pool_get_pages__not_enough_space) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; __u32 pages[10]; atomic_set(&pool->free_bpages, 1); EXPECT_EQ(-1, homa_pool_get_pages(pool, 2, pages, 0)); @@ -141,7 +141,7 @@ TEST_F(homa_pool, homa_pool_get_pages__not_enough_space) } TEST_F(homa_pool, homa_pool_get_pages__set_limit) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; __u32 pages[10]; atomic_set(&pool->free_bpages, 62); pool->cores[raw_smp_processor_id()].next_candidate = 49; @@ -151,7 +151,7 @@ TEST_F(homa_pool, homa_pool_get_pages__set_limit) } TEST_F(homa_pool, homa_pool_get_pages__set_limit_with_MIN_EXTRA) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; __u32 pages[10]; atomic_set(&pool->free_bpages, 92); pool->cores[raw_smp_processor_id()].next_candidate = 13; @@ -161,7 +161,7 @@ 
TEST_F(homa_pool, homa_pool_get_pages__set_limit_with_MIN_EXTRA) } TEST_F(homa_pool, homa_pool_get_pages__skip_unusable_bpages) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; __u32 pages[10]; mock_cycles = 1000; atomic_set(&pool->descriptors[0].refs, 2); @@ -178,7 +178,7 @@ TEST_F(homa_pool, homa_pool_get_pages__skip_unusable_bpages) } TEST_F(homa_pool, homa_pool_get_pages__cant_lock_pages) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; __u32 pages[10]; mock_cycles = 1000; mock_trylock_errors = 3; @@ -188,7 +188,7 @@ TEST_F(homa_pool, homa_pool_get_pages__cant_lock_pages) } TEST_F(homa_pool, homa_pool_get_pages__state_changes_while_locking) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; __u32 pages[10]; mock_cycles = 1000; unit_hook_register(steal_bpages_hook); @@ -198,7 +198,7 @@ TEST_F(homa_pool, homa_pool_get_pages__state_changes_while_locking) } TEST_F(homa_pool, homa_pool_get_pages__steal_expired_page) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; __u32 pages[10]; pool->descriptors[0].owner = 5; mock_cycles = 5000; @@ -212,7 +212,7 @@ TEST_F(homa_pool, homa_pool_get_pages__steal_expired_page) } TEST_F(homa_pool, homa_pool_get_pages__set_owner) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; __u32 pages[10]; self->homa.bpage_lease_cycles = 1000; mock_cycles = 5000; @@ -225,7 +225,7 @@ TEST_F(homa_pool, homa_pool_get_pages__set_owner) TEST_F(homa_pool, homa_pool_allocate__basics) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, 4000, 98, 1000, 150000); @@ -241,7 +241,7 @@ TEST_F(homa_pool, homa_pool_allocate__basics) } TEST_F(homa_pool, homa_pool_no_buffer_pool) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, 4000, 98, 1000, 150000); @@ -251,7 +251,7 @@ TEST_F(homa_pool, homa_pool_no_buffer_pool) } TEST_F(homa_pool, homa_pool_allocate__cant_allocate_full_bpages) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; atomic_set(&pool->free_bpages, 1); struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, @@ -264,7 +264,7 @@ TEST_F(homa_pool, homa_pool_allocate__cant_allocate_full_bpages) } TEST_F(homa_pool, homa_pool_allocate__no_partial_page) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; atomic_set(&pool->free_bpages, 2); struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, @@ -278,7 +278,7 @@ TEST_F(homa_pool, homa_pool_allocate__no_partial_page) } TEST_F(homa_pool, homa_pool_allocate__owned_page_locked_and_page_stolen) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; pool->cores[raw_smp_processor_id()].next_candidate = 2; atomic_set(&pool->free_bpages, 40); struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -305,7 +305,7 @@ TEST_F(homa_pool, homa_pool_allocate__owned_page_locked_and_page_stolen) } TEST_F(homa_pool, 
homa_pool_allocate__page_wrap_around) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; pool->cores[raw_smp_processor_id()].page_hint = 2; pool->cores[raw_smp_processor_id()].allocated = HOMA_BPAGE_SIZE-1900; atomic_set(&pool->descriptors[2].refs, 1); @@ -324,7 +324,7 @@ TEST_F(homa_pool, homa_pool_allocate__page_wrap_around) } TEST_F(homa_pool, homa_pool_allocate__owned_page_overflow) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; pool->cores[raw_smp_processor_id()].next_candidate = 2; atomic_set(&pool->free_bpages, 50); struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -347,7 +347,7 @@ TEST_F(homa_pool, homa_pool_allocate__owned_page_overflow) } TEST_F(homa_pool, homa_pool_allocate__reuse_owned_page) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; pool->cores[raw_smp_processor_id()].next_candidate = 2; struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, @@ -368,7 +368,7 @@ TEST_F(homa_pool, homa_pool_allocate__reuse_owned_page) } TEST_F(homa_pool, homa_pool_allocate__cant_allocate_partial_bpage) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; atomic_set(&pool->free_bpages, 5); struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, @@ -384,7 +384,7 @@ TEST_F(homa_pool, homa_pool_allocate__cant_allocate_partial_bpage) TEST_F(homa_pool, homa_pool_allocate__out_of_space) { /* Queue up several RPCs to make sure they are properly sorted. */ - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; atomic_set(&pool->free_bpages, 0); unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, 4000, 98, 1000, 2000); @@ -411,7 +411,7 @@ TEST_F(homa_pool, homa_pool_allocate__out_of_space) TEST_F(homa_pool, homa_pool_get_buffer) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; int available; void *buffer; @@ -429,7 +429,7 @@ TEST_F(homa_pool, homa_pool_get_buffer) TEST_F(homa_pool, homa_pool_release_buffers__basics) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; char *saved_region; struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, @@ -464,7 +464,7 @@ TEST_F(homa_pool, homa_pool_release_buffers__basics) TEST_F(homa_pool, homa_pool_check_waiting__basics) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; /* Queue up 2 RPCs that together need a total of 5 bpages. 
*/ atomic_set(&pool->free_bpages, 0); @@ -498,7 +498,7 @@ TEST_F(homa_pool, homa_pool_check_waiting__basics) } TEST_F(homa_pool, homa_pool_check_waiting__bpages_needed_but_no_queued_rpcs) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; pool->bpages_needed = 1; homa_pool_check_waiting(pool); EXPECT_EQ(100, atomic_read(&pool->free_bpages)); @@ -506,7 +506,7 @@ TEST_F(homa_pool, homa_pool_check_waiting__bpages_needed_but_no_queued_rpcs) } TEST_F(homa_pool, homa_pool_check_waiting__rpc_initially_locked) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; atomic_set(&pool->free_bpages, 0); struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -527,7 +527,7 @@ TEST_F(homa_pool, homa_pool_check_waiting__rpc_initially_locked) } TEST_F(homa_pool, homa_pool_check_waiting__reset_bpages_needed) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; atomic_set(&pool->free_bpages, 0); struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, @@ -552,7 +552,7 @@ TEST_F(homa_pool, homa_pool_check_waiting__reset_bpages_needed) } TEST_F(homa_pool, homa_pool_check_waiting__wake_up_waiting_rpc) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; /* Queue up an RPC that needs 2 bpages. */ atomic_set(&pool->free_bpages, 0); @@ -572,7 +572,7 @@ TEST_F(homa_pool, homa_pool_check_waiting__wake_up_waiting_rpc) } TEST_F(homa_pool, homa_pool_check_waiting__reallocation_fails) { - struct homa_pool *pool = &self->hsk.buffer_pool; + struct homa_pool *pool = self->hsk.buffer_pool; /* Queue up an RPC that needs 4 bpages. */ atomic_set(&pool->free_bpages, 0); diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c new file mode 100644 index 00000000..48594aac --- /dev/null +++ b/test/unit_homa_rpc.c @@ -0,0 +1,666 @@ +/* Copyright (c) 2019-2023 Homa Developers + * SPDX-License-Identifier: BSD-1-Clause + */ + +#include "homa_impl.h" +#define KSELFTEST_NOT_MAIN 1 +#include "kselftest_harness.h" +#include "ccutils.h" +#include "mock.h" +#include "utils.h" + +#define n(x) htons(x) +#define N(x) htonl(x) + +FIXTURE(homa_rpc) { + struct in6_addr client_ip[1]; + int client_port; + struct in6_addr server_ip[1]; + int server_port; + __u64 client_id; + __u64 server_id; + struct homa homa; + struct homa_sock hsk; + union sockaddr_in_union server_addr; + struct data_header data; + struct homa_rpc *crpc; + struct iovec iovec; + struct iov_iter iter; +}; +FIXTURE_SETUP(homa_rpc) +{ + self->client_ip[0] = unit_get_in_addr("196.168.0.1"); + self->client_port = 40000; + self->server_ip[0] = unit_get_in_addr("1.2.3.4"); + self->server_port = 99; + self->client_id = 1234; + self->server_id = 1235; + self->server_addr.in6.sin6_family = AF_INET; + self->server_addr.in6.sin6_addr = *self->server_ip; + self->server_addr.in6.sin6_port = htons(self->server_port); + homa_init(&self->homa); + mock_sock_init(&self->hsk, &self->homa, 0); + self->data = (struct data_header){.common = { + .sport = htons(self->client_port), + .dport = htons(self->server_port), + .type = DATA, + .sender_id = self->client_id}, + .message_length = htonl(10000), + .incoming = htonl(10000), .cutoff_version = 0, + .ack = {0, 0, 0}, + .retransmit = 0, + .seg = {.offset = 0}}; + self->iovec.iov_base = (void *) 2000; + self->iovec.iov_len = 10000; + iov_iter_init(&self->iter, WRITE, &self->iovec, 1, self->iovec.iov_len); + unit_log_clear(); +} +FIXTURE_TEARDOWN(homa_rpc) +{ + 
homa_destroy(&self->homa); + unit_teardown(); +} + +/** + * dead_rpcs() - Logs the ids for all of the RPCS in hsk->dead_rpcs. + * @hsk: Homa socket to check for dead RPCs. + * + * Return: the contents of the unit test log. + */ +static const char *dead_rpcs(struct homa_sock *hsk) +{ + struct homa_rpc *rpc; + list_for_each_entry_rcu(rpc, &hsk->dead_rpcs, dead_links) + UNIT_LOG(" ", "%llu", rpc->id); + return unit_log_get(); +} + +TEST_F(homa_rpc, homa_rpc_new_client__normal) +{ + struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + &self->server_addr); + ASSERT_FALSE(IS_ERR(crpc)); + homa_rpc_free(crpc); + homa_rpc_unlock(crpc); +} +TEST_F(homa_rpc, homa_rpc_new_client__malloc_error) +{ + mock_kmalloc_errors = 1; + struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + &self->server_addr); + EXPECT_TRUE(IS_ERR(crpc)); + EXPECT_EQ(ENOMEM, -PTR_ERR(crpc)); +} +TEST_F(homa_rpc, homa_rpc_new_client__route_error) +{ + mock_route_errors = 1; + struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + &self->server_addr); + EXPECT_TRUE(IS_ERR(crpc)); + EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(crpc)); +} +TEST_F(homa_rpc, homa_rpc_new_client__socket_shutdown) +{ + self->hsk.shutdown = 1; + struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + &self->server_addr); + EXPECT_TRUE(IS_ERR(crpc)); + EXPECT_EQ(ESHUTDOWN, -PTR_ERR(crpc)); + self->hsk.shutdown = 0; +} + +TEST_F(homa_rpc, homa_rpc_new_server__normal) +{ + int created; + struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, + self->client_ip, &self->data, &created); + ASSERT_FALSE(IS_ERR(srpc)); + homa_rpc_unlock(srpc); + self->data.message_length = N(1600); + homa_data_pkt(mock_skb_new(self->client_ip, &self->data.common, + 1400, 0), srpc); + EXPECT_EQ(RPC_INCOMING, srpc->state); + EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); + EXPECT_EQ(1, created); + homa_rpc_free(srpc); +} +TEST_F(homa_rpc, homa_rpc_new_server__already_exists) +{ + int created; + struct homa_rpc *srpc1 = homa_rpc_new_server(&self->hsk, + self->client_ip, &self->data, &created); + ASSERT_FALSE(IS_ERR(srpc1)); + homa_rpc_unlock(srpc1); + self->data.common.sender_id = cpu_to_be64( + be64_to_cpu(self->data.common.sender_id) + + 2*HOMA_SERVER_RPC_BUCKETS); + struct homa_rpc *srpc2 = homa_rpc_new_server(&self->hsk, + self->client_ip, &self->data, &created); + ASSERT_FALSE(IS_ERR(srpc2)); + EXPECT_EQ(1, created); + homa_rpc_unlock(srpc2); + EXPECT_NE(srpc2, srpc1); + self->data.common.sender_id = cpu_to_be64( + be64_to_cpu(self->data.common.sender_id) + - 2*HOMA_SERVER_RPC_BUCKETS); + struct homa_rpc *srpc3 = homa_rpc_new_server(&self->hsk, + self->client_ip, &self->data, &created); + ASSERT_FALSE(IS_ERR(srpc3)); + EXPECT_EQ(0, created); + homa_rpc_unlock(srpc3); + EXPECT_EQ(srpc3, srpc1); +} +TEST_F(homa_rpc, homa_rpc_new_server__malloc_error) +{ + int created; + mock_kmalloc_errors = 1; + struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, + self->client_ip, &self->data, &created); + EXPECT_TRUE(IS_ERR(srpc)); + EXPECT_EQ(ENOMEM, -PTR_ERR(srpc)); +} +TEST_F(homa_rpc, homa_rpc_new_server__addr_error) +{ + int created; + mock_route_errors = 1; + struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, + self->client_ip, &self->data, &created); + EXPECT_TRUE(IS_ERR(srpc)); + EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(srpc)); +} +TEST_F(homa_rpc, homa_rpc_new_server__socket_shutdown) +{ + int created; + self->hsk.shutdown = 1; + struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, + self->client_ip, &self->data, &created); + EXPECT_TRUE(IS_ERR(srpc)); + 
EXPECT_EQ(ESHUTDOWN, -PTR_ERR(srpc));
+	EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs));
+	self->hsk.shutdown = 0;
+}
+TEST_F(homa_rpc, homa_rpc_new_server__allocate_buffers)
+{
+	int created;
+	self->data.message_length = N(3*HOMA_BPAGE_SIZE);
+	struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk,
+			self->client_ip, &self->data, &created);
+	ASSERT_FALSE(IS_ERR(srpc));
+	homa_rpc_unlock(srpc);
+	EXPECT_EQ(3, srpc->msgin.num_bpages);
+	homa_rpc_free(srpc);
+}
+TEST_F(homa_rpc, homa_rpc_new_server__no_buffer_pool)
+{
+	int created;
+	self->data.message_length = N(1400);
+	homa_pool_destroy(self->hsk.buffer_pool);
+	struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk,
+			self->client_ip, &self->data, &created);
+	ASSERT_TRUE(IS_ERR(srpc));
+	EXPECT_EQ(ENOMEM, -PTR_ERR(srpc));
+}
+TEST_F(homa_rpc, homa_rpc_new_server__handoff_rpc)
+{
+	int created;
+	self->data.message_length = N(1400);
+	struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk,
+			self->client_ip, &self->data, &created);
+	ASSERT_FALSE(IS_ERR(srpc));
+	homa_rpc_unlock(srpc);
+	EXPECT_EQ(RPC_INCOMING, srpc->state);
+	EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs));
+	EXPECT_EQ(1, unit_list_length(&self->hsk.ready_requests));
+	homa_rpc_free(srpc);
+}
+TEST_F(homa_rpc, homa_rpc_new_server__dont_handoff_no_buffers)
+{
+	int created;
+	self->data.message_length = N(1400);
+	atomic_set(&self->hsk.buffer_pool->free_bpages, 0);
+	struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk,
+			self->client_ip, &self->data, &created);
+	ASSERT_FALSE(IS_ERR(srpc));
+	homa_rpc_unlock(srpc);
+	EXPECT_EQ(0, unit_list_length(&self->hsk.ready_requests));
+	homa_rpc_free(srpc);
+}
+TEST_F(homa_rpc, homa_rpc_new_server__dont_handoff_rpc)
+{
+	int created;
+	self->data.message_length = N(2800);
+	self->data.seg.offset = N(1400);
+	struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk,
+			self->client_ip, &self->data, &created);
+	ASSERT_FALSE(IS_ERR(srpc));
+	homa_rpc_unlock(srpc);
+	EXPECT_EQ(RPC_INCOMING, srpc->state);
+	EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs));
+	EXPECT_EQ(0, unit_list_length(&self->hsk.ready_requests));
+	homa_rpc_free(srpc);
+}
+
+TEST_F(homa_rpc, homa_bucket_lock_slow)
+{
+	int created;
+	mock_cycles = ~0;
+	struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk,
+			&self->server_addr);
+	ASSERT_FALSE(IS_ERR(crpc));
+	homa_rpc_free(crpc);
+	homa_rpc_unlock(crpc);
+	struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk,
+			self->client_ip, &self->data, &created);
+	ASSERT_FALSE(IS_ERR(srpc));
+	homa_rpc_unlock(srpc);
+
+	EXPECT_EQ(0, homa_metrics_per_cpu()->client_lock_misses);
+	EXPECT_EQ(0, homa_metrics_per_cpu()->client_lock_miss_cycles);
+	homa_bucket_lock_slow(crpc->bucket, crpc->id);
+	homa_rpc_unlock(crpc);
+	EXPECT_EQ(1, homa_metrics_per_cpu()->client_lock_misses);
+	EXPECT_NE(0, homa_metrics_per_cpu()->client_lock_miss_cycles);
+	EXPECT_EQ(0, homa_metrics_per_cpu()->server_lock_misses);
+	EXPECT_EQ(0, homa_metrics_per_cpu()->server_lock_miss_cycles);
+	homa_bucket_lock_slow(srpc->bucket, srpc->id);
+	homa_rpc_unlock(srpc);
+	EXPECT_EQ(1, homa_metrics_per_cpu()->server_lock_misses);
+	EXPECT_NE(0, homa_metrics_per_cpu()->server_lock_miss_cycles);
+}
+
+TEST_F(homa_rpc, homa_rpc_acked__basics)
+{
+	struct homa_sock hsk;
+	mock_sock_init(&hsk, &self->homa, self->server_port);
+	struct homa_rpc *srpc = unit_server_rpc(&hsk, UNIT_OUTGOING,
+			self->client_ip, self->server_ip, self->client_port,
+			self->server_id, 100, 3000);
+	ASSERT_NE(NULL, srpc);
+	struct homa_ack ack = {.client_port = 
htons(self->client_port), + .server_port = htons(self->server_port), + .client_id = cpu_to_be64(self->client_id)}; + homa_rpc_acked(&hsk, self->client_ip, &ack); + EXPECT_EQ(0, unit_list_length(&hsk.active_rpcs)); + EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc)); + homa_sock_destroy(&hsk); +} +TEST_F(homa_rpc, homa_rpc_acked__lookup_socket) +{ + struct homa_sock hsk; + mock_sock_init(&hsk, &self->homa, self->server_port); + struct homa_rpc *srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 100, 3000); + ASSERT_NE(NULL, srpc); + struct homa_ack ack = {.client_port = htons(self->client_port), + .server_port = htons(self->server_port), + .client_id = cpu_to_be64(self->client_id)}; + homa_rpc_acked(&self->hsk, self->client_ip, &ack); + EXPECT_EQ(0, unit_list_length(&hsk.active_rpcs)); + EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc)); + homa_sock_destroy(&hsk); +} +TEST_F(homa_rpc, homa_rpc_acked__no_such_socket) +{ + struct homa_sock hsk; + mock_sock_init(&hsk, &self->homa, self->server_port); + struct homa_rpc *srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 100, 3000); + ASSERT_NE(NULL, srpc); + struct homa_ack ack = {.client_port = htons(self->client_port), + .server_port = htons(self->server_port+1), + .client_id = cpu_to_be64(self->client_id)}; + homa_rpc_acked(&hsk, self->client_ip, &ack); + EXPECT_EQ(1, unit_list_length(&hsk.active_rpcs)); + EXPECT_STREQ("OUTGOING", homa_symbol_for_state(srpc)); + homa_sock_destroy(&hsk); +} +TEST_F(homa_rpc, homa_rpc_acked__no_such_rpc) +{ + struct homa_sock hsk; + mock_sock_init(&hsk, &self->homa, self->server_port); + struct homa_rpc *srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 100, 3000); + ASSERT_NE(NULL, srpc); + struct homa_ack ack = {.client_port = htons(self->client_port), + .server_port = htons(self->server_port), + .client_id = cpu_to_be64(self->client_id+10)}; + homa_rpc_acked(&hsk, self->client_ip, &ack); + EXPECT_EQ(1, unit_list_length(&hsk.active_rpcs)); + EXPECT_STREQ("OUTGOING", homa_symbol_for_state(srpc)); + homa_sock_destroy(&hsk); +} + +TEST_F(homa_rpc, homa_rpc_free__basics) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 1000, 20000); + EXPECT_EQ(1, self->homa.num_grantable_rpcs); + ASSERT_NE(NULL, crpc); + unit_log_clear(); + mock_log_rcu_sched = 1; + homa_rpc_free(crpc); + EXPECT_EQ(0, self->homa.num_grantable_rpcs); + EXPECT_EQ(NULL, homa_find_client_rpc(&self->hsk, crpc->id)); + EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); + EXPECT_EQ(1, unit_list_length(&self->hsk.dead_rpcs)); +} +TEST_F(homa_rpc, homa_rpc_free__already_dead) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, + self->server_port, self->client_id, 1000, 100); + ASSERT_NE(NULL, crpc); + unit_log_clear(); + homa_rpc_free(crpc); + EXPECT_STREQ("homa_rpc_free invoked", + unit_log_get()); + unit_log_clear(); + homa_rpc_free(crpc); + EXPECT_STREQ("", unit_log_get()); +} +TEST_F(homa_rpc, homa_rpc_free__state_ready) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, + self->server_port, self->client_id, 1000, 100); + ASSERT_NE(NULL, crpc); + EXPECT_EQ(1, unit_list_length(&self->hsk.ready_responses)); + homa_rpc_free(crpc); + 
EXPECT_EQ(0, unit_list_length(&self->hsk.ready_responses)); +} +TEST_F(homa_rpc, homa_rpc_free__wakeup_interest) +{ + struct homa_interest interest = {}; + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 1000, 100); + ASSERT_NE(NULL, crpc); + atomic_long_set(&interest.ready_rpc, 0); + interest.reg_rpc = crpc; + crpc->interest = &interest; + unit_log_clear(); + homa_rpc_free(crpc); + EXPECT_EQ(NULL, interest.reg_rpc); + EXPECT_STREQ("homa_rpc_free invoked; " + "wake_up_process pid -1", unit_log_get()); +} +TEST_F(homa_rpc, homa_rpc_free__free_gaps) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, 99, 1000, 1000); + homa_message_in_init(crpc, 10000, 0); + unit_log_clear(); + self->data.seg.offset = htonl(1400); + homa_add_packet(crpc, mock_skb_new(self->client_ip, + &self->data.common, 1400, 1400)); + + self->data.seg.offset = htonl(4200); + homa_add_packet(crpc, mock_skb_new(self->client_ip, + &self->data.common, 1400, 4200)); + EXPECT_STREQ("start 0, end 1400; start 2800, end 4200", + unit_print_gaps(crpc)); + + homa_rpc_free(crpc); + /* (Test infrastructure will complain if gaps aren't freed) */ +} +TEST_F(homa_rpc, homa_rpc_free__dead_buffs) +{ + struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, + self->server_port, self->client_id, 10000, 1000); + ASSERT_NE(NULL, crpc1); + homa_rpc_free(crpc1); + EXPECT_EQ(9, self->homa.max_dead_buffs); + EXPECT_EQ(9, self->hsk.dead_skbs); + struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, + self->server_port, self->client_id+2, 5000, 1000); + ASSERT_NE(NULL, crpc2); + homa_rpc_free(crpc2); + EXPECT_EQ(14, self->homa.max_dead_buffs); + EXPECT_EQ(14, self->hsk.dead_skbs); +} +TEST_F(homa_rpc, homa_rpc_free__remove_from_throttled_list) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 10000, 1000); + homa_add_to_throttled(crpc); + EXPECT_EQ(1, unit_list_length(&self->homa.throttled_rpcs)); + unit_log_clear(); + homa_rpc_free(crpc); + EXPECT_EQ(0, unit_list_length(&self->homa.throttled_rpcs)); +} + +TEST_F(homa_rpc, homa_rpc_reap__basics) +{ + struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 5000, 2000); + struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id+2, 5000, 100); + struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id+4, 2000, 100); + ASSERT_NE(NULL, crpc1); + ASSERT_NE(NULL, crpc2); + ASSERT_NE(NULL, crpc3); + homa_rpc_free(crpc1); + homa_rpc_free(crpc2); + homa_rpc_free(crpc3); + unit_log_clear(); + EXPECT_STREQ("1234 1236 1238", dead_rpcs(&self->hsk)); + EXPECT_EQ(11, self->hsk.dead_skbs); + unit_log_clear(); + EXPECT_EQ(1, homa_rpc_reap(&self->hsk, 7)); + EXPECT_STREQ("reaped 1234", unit_log_get()); + unit_log_clear(); + EXPECT_STREQ("1236 1238", dead_rpcs(&self->hsk)); + EXPECT_EQ(2, self->hsk.dead_skbs); +} +TEST_F(homa_rpc, homa_rpc_reap__protected) +{ + struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 
5000, 2000); + ASSERT_NE(NULL, crpc1); + homa_rpc_free(crpc1); + unit_log_clear(); + homa_protect_rpcs(&self->hsk); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 10)); + homa_unprotect_rpcs(&self->hsk); + EXPECT_STREQ("", unit_log_get()); +} +TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_flags) +{ + struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 1000, 2000); + struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id+2, 1000, 2000); + ASSERT_NE(NULL, crpc1); + ASSERT_NE(NULL, crpc2); + homa_rpc_free(crpc1); + homa_rpc_free(crpc2); + unit_log_clear(); + atomic_or(RPC_COPYING_TO_USER, &crpc1->flags); + EXPECT_EQ(1, homa_rpc_reap(&self->hsk, 3)); + EXPECT_STREQ("reaped 1236", unit_log_get()); + unit_log_clear(); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 3)); + EXPECT_STREQ("", unit_log_get()); + atomic_andnot(RPC_COPYING_TO_USER, &crpc1->flags); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 3)); + EXPECT_STREQ("reaped 1234", unit_log_get()); +} +TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_active_xmits) +{ + struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 1000, 2000); + struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id+2, 1000, 2000); + ASSERT_NE(NULL, crpc1); + ASSERT_NE(NULL, crpc2); + homa_rpc_free(crpc1); + homa_rpc_free(crpc2); + unit_log_clear(); + atomic_inc(&crpc1->msgout.active_xmits); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 100)); + EXPECT_STREQ("reaped 1236", unit_log_get()); + unit_log_clear(); + atomic_dec(&crpc1->msgout.active_xmits); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 100)); + EXPECT_STREQ("reaped 1234", unit_log_get()); +} +TEST_F(homa_rpc, homa_rpc_reap__grant_in_progress) +{ + struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 1000, 2000); + struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id+2, 1000, 2000); + ASSERT_NE(NULL, crpc1); + ASSERT_NE(NULL, crpc2); + homa_rpc_free(crpc1); + homa_rpc_free(crpc2); + unit_log_clear(); + atomic_inc(&crpc1->grants_in_progress); + EXPECT_EQ(1, homa_rpc_reap(&self->hsk, 3)); + EXPECT_STREQ("reaped 1236", unit_log_get()); + unit_log_clear(); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 3)); + EXPECT_STREQ("", unit_log_get()); + atomic_dec(&crpc1->grants_in_progress); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 3)); + EXPECT_STREQ("reaped 1234", unit_log_get()); +} +TEST_F(homa_rpc, homa_rpc_reap__hit_limit_in_msgout_packets) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, + self->server_port, self->client_id, 10000, 100); + ASSERT_NE(NULL, crpc); + homa_rpc_free(crpc); + EXPECT_EQ(9, self->hsk.dead_skbs); + unit_log_clear(); + homa_rpc_reap(&self->hsk, 5); + EXPECT_STREQ("1234", dead_rpcs(&self->hsk)); + EXPECT_EQ(4, self->hsk.dead_skbs); +} +TEST_F(homa_rpc, homa_rpc_reap__release_buffers) +{ + struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + 4000, 98, 1000, 150000); + ASSERT_NE(NULL, crpc); + + EXPECT_EQ(1, 
atomic_read(&pool->descriptors[1].refs)); + homa_rpc_free(crpc); + EXPECT_EQ(1, atomic_read(&pool->descriptors[1].refs)); + self->hsk.buffer_pool->check_waiting_invoked = 0; + homa_rpc_reap(&self->hsk, 5); + EXPECT_EQ(0, atomic_read(&pool->descriptors[1].refs)); + EXPECT_EQ(1, self->hsk.buffer_pool->check_waiting_invoked); +} +TEST_F(homa_rpc, homa_rpc_reap__free_gaps) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + 4000, 98, 1000, 150000); + ASSERT_NE(NULL, crpc); + homa_gap_new(&crpc->msgin.gaps, 1000, 2000); + mock_cycles = 1000; + homa_gap_new(&crpc->msgin.gaps, 5000, 6000); + + EXPECT_STREQ("start 1000, end 2000; start 5000, end 6000, time 1000", + unit_print_gaps(crpc)); + homa_rpc_free(crpc); + homa_rpc_reap(&self->hsk, 5); + // Test framework will complain if memory not freed. +} +TEST_F(homa_rpc, homa_rpc_reap__nothing_to_reap) +{ + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 10)); +} + +TEST_F(homa_rpc, homa_find_client_rpc) +{ + atomic64_set(&self->homa.next_outgoing_id, 3); + struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 10000, 1000); + atomic64_set(&self->homa.next_outgoing_id, 3 + 3*HOMA_CLIENT_RPC_BUCKETS); + struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id+2, 10000, 1000); + atomic64_set(&self->homa.next_outgoing_id, + 3 + 10*HOMA_CLIENT_RPC_BUCKETS); + struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id+4, 10000, 1000); + atomic64_set(&self->homa.next_outgoing_id, 40); + struct homa_rpc *crpc4 = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id+6, 10000, 1000); + + EXPECT_EQ(crpc1, homa_find_client_rpc(&self->hsk, crpc1->id)); + homa_rpc_unlock(crpc1); + EXPECT_EQ(crpc2, homa_find_client_rpc(&self->hsk, crpc2->id)); + homa_rpc_unlock(crpc2); + EXPECT_EQ(crpc3, homa_find_client_rpc(&self->hsk, crpc3->id)); + homa_rpc_unlock(crpc3); + EXPECT_EQ(crpc4, homa_find_client_rpc(&self->hsk, crpc4->id)); + homa_rpc_unlock(crpc4); + EXPECT_EQ(NULL, homa_find_client_rpc(&self->hsk, 15)); + homa_rpc_free(crpc1); + homa_rpc_free(crpc2); + homa_rpc_free(crpc3); + homa_rpc_free(crpc4); +} + +TEST_F(homa_rpc, homa_find_server_rpc) +{ + struct homa_rpc *srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 10000, 100); + ASSERT_NE(NULL, srpc1); + struct homa_rpc *srpc2 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, + self->client_ip, self->server_ip, self->client_port, + self->server_id + 30*HOMA_SERVER_RPC_BUCKETS, + 10000, 100); + ASSERT_NE(NULL, srpc2); + struct homa_rpc *srpc3 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, + self->client_ip, self->server_ip, self->client_port+1, + self->server_id + 10*HOMA_SERVER_RPC_BUCKETS, + 10000, 100); + ASSERT_NE(NULL, srpc3); + struct homa_rpc *srpc4 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, + self->client_ip, self->server_ip, self->client_port+1, + self->server_id + 4, 10000, 100); + ASSERT_NE(NULL, srpc4); + EXPECT_EQ(srpc1, homa_find_server_rpc(&self->hsk, self->client_ip, + self->client_port, srpc1->id)); + homa_rpc_unlock(srpc1); + EXPECT_EQ(srpc2, homa_find_server_rpc(&self->hsk, self->client_ip, + self->client_port, srpc2->id)); + homa_rpc_unlock(srpc2); + 
EXPECT_EQ(srpc3, homa_find_server_rpc(&self->hsk, self->client_ip, + self->client_port+1, srpc3->id)); + homa_rpc_unlock(srpc3); + EXPECT_EQ(srpc4, homa_find_server_rpc(&self->hsk, self->client_ip, + self->client_port+1, srpc4->id)); + homa_rpc_unlock(srpc4); + EXPECT_EQ(NULL, homa_find_server_rpc(&self->hsk, self->client_ip, + self->client_port, 3)); +} diff --git a/test/unit_homa_socktab.c b/test/unit_homa_sock.c similarity index 91% rename from test/unit_homa_socktab.c rename to test/unit_homa_sock.c index ec55140a..5f3be0b5 100644 --- a/test/unit_homa_socktab.c +++ b/test/unit_homa_sock.c @@ -12,7 +12,7 @@ #define n(x) htons(x) #define N(x) htonl(x) -FIXTURE(homa_socktab) { +FIXTURE(homa_sock) { struct homa homa; struct homa_sock hsk; struct in6_addr client_ip[1]; @@ -21,7 +21,7 @@ FIXTURE(homa_socktab) { int server_port; __u64 client_id; }; -FIXTURE_SETUP(homa_socktab) +FIXTURE_SETUP(homa_sock) { homa_init(&self->homa); mock_sock_init(&self->hsk, &self->homa, 0); @@ -31,20 +31,20 @@ FIXTURE_SETUP(homa_socktab) self->server_port = 99; self->client_id = 1234; } -FIXTURE_TEARDOWN(homa_socktab) +FIXTURE_TEARDOWN(homa_sock) { homa_destroy(&self->homa); unit_teardown(); } -TEST_F(homa_socktab, homa_port_hash) +TEST_F(homa_sock, homa_port_hash) { EXPECT_EQ(1023, homa_port_hash(0xffff)); EXPECT_EQ(18, homa_port_hash(0x6012)); EXPECT_EQ(99, homa_port_hash(99)); } -TEST_F(homa_socktab, homa_socktab_start_scan) +TEST_F(homa_sock, homa_socktab_start_scan) { struct homa_socktab_scan scan; homa_destroy(&self->homa); @@ -55,7 +55,7 @@ TEST_F(homa_socktab, homa_socktab_start_scan) EXPECT_EQ(100, scan.current_bucket); } -TEST_F(homa_socktab, homa_socktab_next__basics) +TEST_F(homa_sock, homa_socktab_next__basics) { struct homa_sock hsk1, hsk2, hsk3, hsk4, *hsk; struct homa_socktab_scan scan; @@ -81,7 +81,7 @@ TEST_F(homa_socktab, homa_socktab_next__basics) homa_sock_destroy(&hsk3); homa_sock_destroy(&hsk4); } -TEST_F(homa_socktab, homa_socktab_next__deleted_socket) +TEST_F(homa_sock, homa_socktab_next__deleted_socket) { struct homa_sock hsk1, hsk2, hsk3, *hsk; struct homa_socktab_scan scan; @@ -105,7 +105,7 @@ TEST_F(homa_socktab, homa_socktab_next__deleted_socket) homa_sock_destroy(&hsk3); } -TEST_F(homa_socktab, homa_sock_init__skip_port_in_use) +TEST_F(homa_sock, homa_sock_init__skip_port_in_use) { struct homa_sock hsk2, hsk3; self->homa.next_client_port = 0xffff; @@ -116,7 +116,7 @@ TEST_F(homa_socktab, homa_sock_init__skip_port_in_use) homa_sock_destroy(&hsk2); homa_sock_destroy(&hsk3); } -TEST_F(homa_socktab, homa_sock_init__ip_header_length) +TEST_F(homa_sock, homa_sock_init__ip_header_length) { struct homa_sock hsk_v4, hsk_v6; mock_ipv6 = false; @@ -128,7 +128,7 @@ TEST_F(homa_socktab, homa_sock_init__ip_header_length) homa_sock_destroy(&hsk_v4); homa_sock_destroy(&hsk_v6); } -TEST_F(homa_socktab, homa_sock_init__hijack_tcp) +TEST_F(homa_sock, homa_sock_init__hijack_tcp) { struct homa_sock hijack, no_hijack; self->homa.hijack_tcp = 0; @@ -141,7 +141,7 @@ TEST_F(homa_socktab, homa_sock_init__hijack_tcp) homa_sock_destroy(&no_hijack); } -TEST_F(homa_socktab, homa_sock_shutdown__basics) +TEST_F(homa_sock, homa_sock_shutdown__basics) { int client2, client3; struct homa_sock hsk2, hsk3; @@ -167,7 +167,7 @@ TEST_F(homa_socktab, homa_sock_shutdown__basics) EXPECT_EQ(NULL, homa_sock_find(&self->homa.port_map, 100)); EXPECT_EQ(NULL, homa_sock_find(&self->homa.port_map, client3)); } -TEST_F(homa_socktab, homa_sock_shutdown__already_shutdown) +TEST_F(homa_sock, homa_sock_shutdown__already_shutdown) { 
unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, @@ -181,7 +181,7 @@ TEST_F(homa_socktab, homa_sock_shutdown__already_shutdown) EXPECT_EQ(2 ,unit_list_length(&self->hsk.active_rpcs)); self->hsk.shutdown = 0; } -TEST_F(homa_socktab, homa_sock_shutdown__delete_rpcs) +TEST_F(homa_sock, homa_sock_shutdown__delete_rpcs) { unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, @@ -193,7 +193,7 @@ TEST_F(homa_socktab, homa_sock_shutdown__delete_rpcs) EXPECT_TRUE(self->hsk.shutdown); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } -TEST_F(homa_socktab, homa_sock_shutdown__wakeup_interests) +TEST_F(homa_sock, homa_sock_shutdown__wakeup_interests) { struct homa_interest interest1, interest2, interest3; struct task_struct task1, task2, task3; @@ -214,7 +214,7 @@ TEST_F(homa_socktab, homa_sock_shutdown__wakeup_interests) unit_log_get()); } -TEST_F(homa_socktab, homa_sock_bind) +TEST_F(homa_sock, homa_sock_bind) { struct homa_sock hsk2; mock_sock_init(&hsk2, &self->homa, 0); @@ -240,14 +240,14 @@ TEST_F(homa_socktab, homa_sock_bind) EXPECT_EQ(&self->hsk, homa_sock_find(&self->homa.port_map, 120)); homa_sock_destroy(&hsk2); } -TEST_F(homa_socktab, homa_sock_bind__socket_shutdown) +TEST_F(homa_sock, homa_sock_bind__socket_shutdown) { homa_sock_shutdown(&self->hsk); EXPECT_EQ(ESHUTDOWN, -homa_sock_bind(&self->homa.port_map, &self->hsk, 100)); } -TEST_F(homa_socktab, homa_sock_find__basics) +TEST_F(homa_sock, homa_sock_find__basics) { struct homa_sock hsk2; mock_sock_init(&hsk2, &self->homa, 0); @@ -261,7 +261,7 @@ TEST_F(homa_socktab, homa_sock_find__basics) homa_sock_destroy(&hsk2); } -TEST_F(homa_socktab, homa_sock_find__long_hash_chain) +TEST_F(homa_sock, homa_sock_find__long_hash_chain) { struct homa_sock hsk2, hsk3, hsk4; EXPECT_EQ(0, homa_sock_bind(&self->homa.port_map, &self->hsk, 13)); @@ -289,7 +289,7 @@ TEST_F(homa_socktab, homa_sock_find__long_hash_chain) homa_sock_destroy(&hsk4); } -TEST_F(homa_socktab, homa_sock_lock_slow) +TEST_F(homa_sock, homa_sock_lock_slow) { mock_cycles = ~0; diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index dfd9b9ef..66f9bd95 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -13,46 +13,11 @@ #define N(x) htonl(x) FIXTURE(homa_utils) { - struct in6_addr client_ip[1]; - int client_port; - struct in6_addr server_ip[1]; - int server_port; - __u64 client_id; - __u64 server_id; struct homa homa; - struct homa_sock hsk; - union sockaddr_in_union server_addr; - struct data_header data; - struct homa_rpc *crpc; - struct iovec iovec; - struct iov_iter iter; }; FIXTURE_SETUP(homa_utils) { - self->client_ip[0] = unit_get_in_addr("196.168.0.1"); - self->client_port = 40000; - self->server_ip[0] = unit_get_in_addr("1.2.3.4"); - self->server_port = 99; - self->client_id = 1234; - self->server_id = 1235; - self->server_addr.in6.sin6_family = AF_INET; - self->server_addr.in6.sin6_addr = *self->server_ip; - self->server_addr.in6.sin6_port = htons(self->server_port); homa_init(&self->homa); - mock_sock_init(&self->hsk, &self->homa, 0); - self->data = (struct data_header){.common = { - .sport = htons(self->client_port), - .dport = htons(self->server_port), - .type = DATA, - .sender_id = self->client_id}, - .message_length = htonl(10000), - .incoming = htonl(10000), .cutoff_version = 0, - .ack = {0, 0, 0}, - .retransmit = 0, - .seg = {.offset = 0}}; - self->iovec.iov_base = (void *) 2000; - self->iovec.iov_len = 
10000; - iov_iter_init(&self->iter, WRITE, &self->iovec, 1, self->iovec.iov_len); unit_log_clear(); } FIXTURE_TEARDOWN(homa_utils) @@ -87,619 +52,6 @@ static void set_cutoffs(struct homa *homa, int c0, int c1, int c2, homa->unsched_cutoffs[7] = c7; } -/** - * dead_rpcs() - Logs the ids for all of the RPCS in hsk->dead_rpcs. - * @hsk: Homa socket to check for dead RPCs. - * - * Return: the contents of the unit test log. - */ -static const char *dead_rpcs(struct homa_sock *hsk) -{ - struct homa_rpc *rpc; - list_for_each_entry_rcu(rpc, &hsk->dead_rpcs, dead_links) - UNIT_LOG(" ", "%llu", rpc->id); - return unit_log_get(); -} - -TEST_F(homa_utils, homa_rpc_new_client__normal) -{ - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, - &self->server_addr); - ASSERT_FALSE(IS_ERR(crpc)); - homa_rpc_free(crpc); - homa_rpc_unlock(crpc); -} -TEST_F(homa_utils, homa_rpc_new_client__malloc_error) -{ - mock_kmalloc_errors = 1; - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, - &self->server_addr); - EXPECT_TRUE(IS_ERR(crpc)); - EXPECT_EQ(ENOMEM, -PTR_ERR(crpc)); -} -TEST_F(homa_utils, homa_rpc_new_client__route_error) -{ - mock_route_errors = 1; - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, - &self->server_addr); - EXPECT_TRUE(IS_ERR(crpc)); - EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(crpc)); -} -TEST_F(homa_utils, homa_rpc_new_client__socket_shutdown) -{ - self->hsk.shutdown = 1; - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, - &self->server_addr); - EXPECT_TRUE(IS_ERR(crpc)); - EXPECT_EQ(ESHUTDOWN, -PTR_ERR(crpc)); - self->hsk.shutdown = 0; -} - -TEST_F(homa_utils, homa_rpc_new_server__normal) -{ - int created; - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); - ASSERT_FALSE(IS_ERR(srpc)); - homa_rpc_unlock(srpc); - self->data.message_length = N(1600); - homa_data_pkt(mock_skb_new(self->client_ip, &self->data.common, - 1400, 0), srpc); - EXPECT_EQ(RPC_INCOMING, srpc->state); - EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); - EXPECT_EQ(1, created); - homa_rpc_free(srpc); -} -TEST_F(homa_utils, homa_rpc_new_server__already_exists) -{ - int created; - struct homa_rpc *srpc1 = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); - ASSERT_FALSE(IS_ERR(srpc1)); - homa_rpc_unlock(srpc1); - self->data.common.sender_id = cpu_to_be64( - be64_to_cpu(self->data.common.sender_id) - + 2*HOMA_SERVER_RPC_BUCKETS); - struct homa_rpc *srpc2 = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); - ASSERT_FALSE(IS_ERR(srpc2)); - EXPECT_EQ(1, created); - homa_rpc_unlock(srpc2); - EXPECT_NE(srpc2, srpc1); - self->data.common.sender_id = cpu_to_be64( - be64_to_cpu(self->data.common.sender_id) - - 2*HOMA_SERVER_RPC_BUCKETS); - struct homa_rpc *srpc3 = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); - ASSERT_FALSE(IS_ERR(srpc3)); - EXPECT_EQ(0, created); - homa_rpc_unlock(srpc3); - EXPECT_EQ(srpc3, srpc1); -} -TEST_F(homa_utils, homa_rpc_new_server__malloc_error) -{ - int created; - mock_kmalloc_errors = 1; - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); - EXPECT_TRUE(IS_ERR(srpc)); - EXPECT_EQ(ENOMEM, -PTR_ERR(srpc)); -} -TEST_F(homa_utils, homa_rpc_new_server__addr_error) -{ - int created; - mock_route_errors = 1; - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); - EXPECT_TRUE(IS_ERR(srpc)); - EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(srpc)); -} -TEST_F(homa_utils, 
homa_rpc_new_server__socket_shutdown) -{ - int created; - self->hsk.shutdown = 1; - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); - EXPECT_TRUE(IS_ERR(srpc)); - EXPECT_EQ(ESHUTDOWN, -PTR_ERR(srpc)); - EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); - self->hsk.shutdown = 0; -} -TEST_F(homa_utils, homa_rpc_new_server__allocate_buffers) -{ - int created; - self->data.message_length = N(3*HOMA_BPAGE_SIZE); - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); - ASSERT_FALSE(IS_ERR(srpc)); - homa_rpc_unlock(srpc); - EXPECT_EQ(3, srpc->msgin.num_bpages); - homa_rpc_free(srpc); -} -TEST_F(homa_utils, homa_rpc_new_server__no_buffer_pool) -{ - int created; - self->data.message_length = N(1400); - homa_pool_destroy(&self->hsk.buffer_pool); - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); - ASSERT_TRUE(IS_ERR(srpc)); - EXPECT_EQ(ENOMEM, -PTR_ERR(srpc)); -} -TEST_F(homa_utils, homa_rpc_new_server__handoff_rpc) -{ - int created; - self->data.message_length = N(1400); - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); - ASSERT_FALSE(IS_ERR(srpc)); - homa_rpc_unlock(srpc); - EXPECT_EQ(RPC_INCOMING, srpc->state); - EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); - EXPECT_EQ(1, unit_list_length(&self->hsk.ready_requests)); - homa_rpc_free(srpc); -} -TEST_F(homa_utils, homa_rpc_new_server__dont_handoff_no_buffers) -{ - int created; - self->data.message_length = N(1400); - atomic_set(&self->hsk.buffer_pool.free_bpages,0 ); - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); - ASSERT_FALSE(IS_ERR(srpc)); - homa_rpc_unlock(srpc); - EXPECT_EQ(0, unit_list_length(&self->hsk.ready_requests)); - homa_rpc_free(srpc); -} -TEST_F(homa_utils, homa_rpc_new_server__dont_handoff_rpc) -{ - int created; - self->data.message_length = N(2800); - self->data.seg.offset = N(1400); - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); - ASSERT_FALSE(IS_ERR(srpc)); - homa_rpc_unlock(srpc); - EXPECT_EQ(RPC_INCOMING, srpc->state); - EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); - EXPECT_EQ(0, unit_list_length(&self->hsk.ready_requests)); - homa_rpc_free(srpc); -} - -TEST_F(homa_utils, homa_bucket_lock_slow) -{ - int created; - mock_cycles = ~0; - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, - &self->server_addr); - ASSERT_FALSE(IS_ERR(crpc)); - homa_rpc_free(crpc); - homa_rpc_unlock(crpc); - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); - ASSERT_FALSE(IS_ERR(srpc)); - homa_rpc_unlock(srpc); - - EXPECT_EQ(0, homa_metrics_per_cpu()->client_lock_misses); - EXPECT_EQ(0, homa_metrics_per_cpu()->client_lock_miss_cycles); - homa_bucket_lock_slow(crpc->bucket, crpc->id); - homa_rpc_unlock(crpc); - EXPECT_EQ(1, homa_metrics_per_cpu()->client_lock_misses); - EXPECT_NE(0, homa_metrics_per_cpu()->client_lock_miss_cycles); - EXPECT_EQ(0, homa_metrics_per_cpu()->server_lock_misses); - EXPECT_EQ(0, homa_metrics_per_cpu()->server_lock_miss_cycles); - homa_bucket_lock_slow(srpc->bucket, srpc->id); - homa_rpc_unlock(srpc); - EXPECT_EQ(1, homa_metrics_per_cpu()->server_lock_misses); - EXPECT_NE(0, homa_metrics_per_cpu()->server_lock_miss_cycles); -} - -TEST_F(homa_utils, homa_rpc_acked__basics) -{ - struct homa_sock hsk; - mock_sock_init(&hsk, &self->homa, 
self->server_port); - struct homa_rpc *srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 3000); - ASSERT_NE(NULL, srpc); - struct homa_ack ack = {.client_port = htons(self->client_port), - .server_port = htons(self->server_port), - .client_id = cpu_to_be64(self->client_id)}; - homa_rpc_acked(&hsk, self->client_ip, &ack); - EXPECT_EQ(0, unit_list_length(&hsk.active_rpcs)); - EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc)); - homa_sock_destroy(&hsk); -} -TEST_F(homa_utils, homa_rpc_acked__lookup_socket) -{ - struct homa_sock hsk; - mock_sock_init(&hsk, &self->homa, self->server_port); - struct homa_rpc *srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 3000); - ASSERT_NE(NULL, srpc); - struct homa_ack ack = {.client_port = htons(self->client_port), - .server_port = htons(self->server_port), - .client_id = cpu_to_be64(self->client_id)}; - homa_rpc_acked(&self->hsk, self->client_ip, &ack); - EXPECT_EQ(0, unit_list_length(&hsk.active_rpcs)); - EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc)); - homa_sock_destroy(&hsk); -} -TEST_F(homa_utils, homa_rpc_acked__no_such_socket) -{ - struct homa_sock hsk; - mock_sock_init(&hsk, &self->homa, self->server_port); - struct homa_rpc *srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 3000); - ASSERT_NE(NULL, srpc); - struct homa_ack ack = {.client_port = htons(self->client_port), - .server_port = htons(self->server_port+1), - .client_id = cpu_to_be64(self->client_id)}; - homa_rpc_acked(&hsk, self->client_ip, &ack); - EXPECT_EQ(1, unit_list_length(&hsk.active_rpcs)); - EXPECT_STREQ("OUTGOING", homa_symbol_for_state(srpc)); - homa_sock_destroy(&hsk); -} -TEST_F(homa_utils, homa_rpc_acked__no_such_rpc) -{ - struct homa_sock hsk; - mock_sock_init(&hsk, &self->homa, self->server_port); - struct homa_rpc *srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 3000); - ASSERT_NE(NULL, srpc); - struct homa_ack ack = {.client_port = htons(self->client_port), - .server_port = htons(self->server_port), - .client_id = cpu_to_be64(self->client_id+10)}; - homa_rpc_acked(&hsk, self->client_ip, &ack); - EXPECT_EQ(1, unit_list_length(&hsk.active_rpcs)); - EXPECT_STREQ("OUTGOING", homa_symbol_for_state(srpc)); - homa_sock_destroy(&hsk); -} - -TEST_F(homa_utils, homa_rpc_free__basics) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 20000); - EXPECT_EQ(1, self->homa.num_grantable_rpcs); - ASSERT_NE(NULL, crpc); - unit_log_clear(); - mock_log_rcu_sched = 1; - homa_rpc_free(crpc); - EXPECT_EQ(0, self->homa.num_grantable_rpcs); - EXPECT_EQ(NULL, homa_find_client_rpc(&self->hsk, crpc->id)); - EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); - EXPECT_EQ(1, unit_list_length(&self->hsk.dead_rpcs)); -} -TEST_F(homa_utils, homa_rpc_free__already_dead) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 100); - ASSERT_NE(NULL, crpc); - unit_log_clear(); - homa_rpc_free(crpc); - EXPECT_STREQ("homa_rpc_free invoked", - unit_log_get()); - unit_log_clear(); - homa_rpc_free(crpc); - EXPECT_STREQ("", unit_log_get()); -} -TEST_F(homa_utils, homa_rpc_free__state_ready) -{ - struct homa_rpc *crpc = 
unit_client_rpc(&self->hsk, - UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 100); - ASSERT_NE(NULL, crpc); - EXPECT_EQ(1, unit_list_length(&self->hsk.ready_responses)); - homa_rpc_free(crpc); - EXPECT_EQ(0, unit_list_length(&self->hsk.ready_responses)); -} -TEST_F(homa_utils, homa_rpc_free__wakeup_interest) -{ - struct homa_interest interest = {}; - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 100); - ASSERT_NE(NULL, crpc); - atomic_long_set(&interest.ready_rpc, 0); - interest.reg_rpc = crpc; - crpc->interest = &interest; - unit_log_clear(); - homa_rpc_free(crpc); - EXPECT_EQ(NULL, interest.reg_rpc); - EXPECT_STREQ("homa_rpc_free invoked; " - "wake_up_process pid -1", unit_log_get()); -} -TEST_F(homa_utils, homa_rpc_free__free_gaps) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, 99, 1000, 1000); - homa_message_in_init(crpc, 10000, 0); - unit_log_clear(); - self->data.seg.offset = htonl(1400); - homa_add_packet(crpc, mock_skb_new(self->client_ip, - &self->data.common, 1400, 1400)); - - self->data.seg.offset = htonl(4200); - homa_add_packet(crpc, mock_skb_new(self->client_ip, - &self->data.common, 1400, 4200)); - EXPECT_STREQ("start 0, end 1400; start 2800, end 4200", - unit_print_gaps(crpc)); - - homa_rpc_free(crpc); - /* (Test infrastructure will complain if gaps aren't freed) */ -} -TEST_F(homa_utils, homa_rpc_free__dead_buffs) -{ - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id, 10000, 1000); - ASSERT_NE(NULL, crpc1); - homa_rpc_free(crpc1); - EXPECT_EQ(9, self->homa.max_dead_buffs); - EXPECT_EQ(9, self->hsk.dead_skbs); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, 5000, 1000); - ASSERT_NE(NULL, crpc2); - homa_rpc_free(crpc2); - EXPECT_EQ(14, self->homa.max_dead_buffs); - EXPECT_EQ(14, self->hsk.dead_skbs); -} -TEST_F(homa_utils, homa_rpc_free__remove_from_throttled_list) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 10000, 1000); - homa_add_to_throttled(crpc); - EXPECT_EQ(1, unit_list_length(&self->homa.throttled_rpcs)); - unit_log_clear(); - homa_rpc_free(crpc); - EXPECT_EQ(0, unit_list_length(&self->homa.throttled_rpcs)); -} - -TEST_F(homa_utils, homa_rpc_free_rcu) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 20000); - homa_rpc_free(crpc); - EXPECT_EQ(RPC_DEAD, crpc->state); -} - -TEST_F(homa_utils, homa_rpc_reap__basics) -{ - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id, 5000, 2000); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, 5000, 100); - struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id+4, 2000, 100); - ASSERT_NE(NULL, crpc1); - ASSERT_NE(NULL, crpc2); - ASSERT_NE(NULL, crpc3); - homa_rpc_free(crpc1); - homa_rpc_free(crpc2); - homa_rpc_free(crpc3); - unit_log_clear(); - 
EXPECT_STREQ("1234 1236 1238", dead_rpcs(&self->hsk)); - EXPECT_EQ(11, self->hsk.dead_skbs); - unit_log_clear(); - EXPECT_EQ(1, homa_rpc_reap(&self->hsk, 7)); - EXPECT_STREQ("reaped 1234", unit_log_get()); - unit_log_clear(); - EXPECT_STREQ("1236 1238", dead_rpcs(&self->hsk)); - EXPECT_EQ(2, self->hsk.dead_skbs); -} -TEST_F(homa_utils, homa_rpc_reap__protected) -{ - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id, 5000, 2000); - ASSERT_NE(NULL, crpc1); - homa_rpc_free(crpc1); - unit_log_clear(); - homa_protect_rpcs(&self->hsk); - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 10)); - homa_unprotect_rpcs(&self->hsk); - EXPECT_STREQ("", unit_log_get()); -} -TEST_F(homa_utils, homa_rpc_reap__skip_rpc_because_of_flags) -{ - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 2000); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, 1000, 2000); - ASSERT_NE(NULL, crpc1); - ASSERT_NE(NULL, crpc2); - homa_rpc_free(crpc1); - homa_rpc_free(crpc2); - unit_log_clear(); - atomic_or(RPC_COPYING_TO_USER, &crpc1->flags); - EXPECT_EQ(1, homa_rpc_reap(&self->hsk, 3)); - EXPECT_STREQ("reaped 1236", unit_log_get()); - unit_log_clear(); - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 3)); - EXPECT_STREQ("", unit_log_get()); - atomic_andnot(RPC_COPYING_TO_USER, &crpc1->flags); - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 3)); - EXPECT_STREQ("reaped 1234", unit_log_get()); -} -TEST_F(homa_utils, homa_rpc_reap__skip_rpc_because_of_active_xmits) -{ - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 2000); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, 1000, 2000); - ASSERT_NE(NULL, crpc1); - ASSERT_NE(NULL, crpc2); - homa_rpc_free(crpc1); - homa_rpc_free(crpc2); - unit_log_clear(); - atomic_inc(&crpc1->msgout.active_xmits); - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 100)); - EXPECT_STREQ("reaped 1236", unit_log_get()); - unit_log_clear(); - atomic_dec(&crpc1->msgout.active_xmits); - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 100)); - EXPECT_STREQ("reaped 1234", unit_log_get()); -} -TEST_F(homa_utils, homa_rpc_reap__grant_in_progress) -{ - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 2000); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, 1000, 2000); - ASSERT_NE(NULL, crpc1); - ASSERT_NE(NULL, crpc2); - homa_rpc_free(crpc1); - homa_rpc_free(crpc2); - unit_log_clear(); - atomic_inc(&crpc1->grants_in_progress); - EXPECT_EQ(1, homa_rpc_reap(&self->hsk, 3)); - EXPECT_STREQ("reaped 1236", unit_log_get()); - unit_log_clear(); - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 3)); - EXPECT_STREQ("", unit_log_get()); - atomic_dec(&crpc1->grants_in_progress); - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 3)); - EXPECT_STREQ("reaped 1234", unit_log_get()); -} -TEST_F(homa_utils, homa_rpc_reap__hit_limit_in_msgout_packets) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id, 
10000, 100); - ASSERT_NE(NULL, crpc); - homa_rpc_free(crpc); - EXPECT_EQ(9, self->hsk.dead_skbs); - unit_log_clear(); - homa_rpc_reap(&self->hsk, 5); - EXPECT_STREQ("1234", dead_rpcs(&self->hsk)); - EXPECT_EQ(4, self->hsk.dead_skbs); -} -TEST_F(homa_utils, homa_rpc_reap__release_buffers) -{ - struct homa_pool *pool = &self->hsk.buffer_pool; - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - 4000, 98, 1000, 150000); - ASSERT_NE(NULL, crpc); - - EXPECT_EQ(1, atomic_read(&pool->descriptors[1].refs)); - homa_rpc_free(crpc); - EXPECT_EQ(1, atomic_read(&pool->descriptors[1].refs)); - self->hsk.buffer_pool.check_waiting_invoked = 0; - homa_rpc_reap(&self->hsk, 5); - EXPECT_EQ(0, atomic_read(&pool->descriptors[1].refs)); - EXPECT_EQ(1, self->hsk.buffer_pool.check_waiting_invoked); -} -TEST_F(homa_utils, homa_rpc_reap__free_gaps) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - 4000, 98, 1000, 150000); - ASSERT_NE(NULL, crpc); - homa_gap_new(&crpc->msgin.gaps, 1000, 2000); - mock_cycles = 1000; - homa_gap_new(&crpc->msgin.gaps, 5000, 6000); - - EXPECT_STREQ("start 1000, end 2000; start 5000, end 6000, time 1000", - unit_print_gaps(crpc)); - homa_rpc_free(crpc); - homa_rpc_reap(&self->hsk, 5); - // Test framework will complain if memory not freed. -} -TEST_F(homa_utils, homa_rpc_reap__nothing_to_reap) -{ - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 10)); -} - -TEST_F(homa_utils, homa_find_client_rpc) -{ - atomic64_set(&self->homa.next_outgoing_id, 3); - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 10000, 1000); - atomic64_set(&self->homa.next_outgoing_id, 3 + 3*HOMA_CLIENT_RPC_BUCKETS); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, 10000, 1000); - atomic64_set(&self->homa.next_outgoing_id, - 3 + 10*HOMA_CLIENT_RPC_BUCKETS); - struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id+4, 10000, 1000); - atomic64_set(&self->homa.next_outgoing_id, 40); - struct homa_rpc *crpc4 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id+6, 10000, 1000); - - EXPECT_EQ(crpc1, homa_find_client_rpc(&self->hsk, crpc1->id)); - homa_rpc_unlock(crpc1); - EXPECT_EQ(crpc2, homa_find_client_rpc(&self->hsk, crpc2->id)); - homa_rpc_unlock(crpc2); - EXPECT_EQ(crpc3, homa_find_client_rpc(&self->hsk, crpc3->id)); - homa_rpc_unlock(crpc3); - EXPECT_EQ(crpc4, homa_find_client_rpc(&self->hsk, crpc4->id)); - homa_rpc_unlock(crpc4); - EXPECT_EQ(NULL, homa_find_client_rpc(&self->hsk, 15)); - homa_rpc_free(crpc1); - homa_rpc_free(crpc2); - homa_rpc_free(crpc3); - homa_rpc_free(crpc4); -} - -TEST_F(homa_utils, homa_find_server_rpc) -{ - struct homa_rpc *srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 10000, 100); - ASSERT_NE(NULL, srpc1); - struct homa_rpc *srpc2 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip, self->server_ip, self->client_port, - self->server_id + 30*HOMA_SERVER_RPC_BUCKETS, - 10000, 100); - ASSERT_NE(NULL, srpc2); - struct homa_rpc *srpc3 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip, self->server_ip, self->client_port+1, - self->server_id 
+ 10*HOMA_SERVER_RPC_BUCKETS, - 10000, 100); - ASSERT_NE(NULL, srpc3); - struct homa_rpc *srpc4 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, - self->client_ip, self->server_ip, self->client_port+1, - self->server_id + 4, 10000, 100); - ASSERT_NE(NULL, srpc4); - EXPECT_EQ(srpc1, homa_find_server_rpc(&self->hsk, self->client_ip, - self->client_port, srpc1->id)); - homa_rpc_unlock(srpc1); - EXPECT_EQ(srpc2, homa_find_server_rpc(&self->hsk, self->client_ip, - self->client_port, srpc2->id)); - homa_rpc_unlock(srpc2); - EXPECT_EQ(srpc3, homa_find_server_rpc(&self->hsk, self->client_ip, - self->client_port+1, srpc3->id)); - homa_rpc_unlock(srpc3); - EXPECT_EQ(srpc4, homa_find_server_rpc(&self->hsk, self->client_ip, - self->client_port+1, srpc4->id)); - homa_rpc_unlock(srpc4); - EXPECT_EQ(NULL, homa_find_server_rpc(&self->hsk, self->client_ip, - self->client_port, 3)); -} - TEST_F(homa_utils, homa_print_ipv4_addr) { char *p1, *p2; From 205c48db4d3f354dc2288b92bb2c6e018abb9d74 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 30 Sep 2024 16:43:28 -0700 Subject: [PATCH 029/625] Extract homa_peer.h from homa_impl.h Also rename homa_peertab* -> homa_peer* --- Makefile | 2 +- homa_grant.c | 1 + homa_impl.h | 241 +---------------- homa_incoming.c | 5 +- homa_outgoing.c | 3 +- homa_peertab.c => homa_peer.c | 5 +- homa_peer.h | 252 ++++++++++++++++++ homa_plumbing.c | 1 + homa_rpc.c | 5 +- homa_sock.c | 1 + homa_timer.c | 1 + homa_utils.c | 9 +- test/Makefile | 4 +- test/unit_homa_incoming.c | 7 +- test/unit_homa_outgoing.c | 3 +- .../{unit_homa_peertab.c => unit_homa_peer.c} | 73 ++--- test/unit_homa_plumbing.c | 1 + test/unit_homa_rpc.c | 1 + test/unit_homa_timer.c | 1 + test/utils.c | 1 + 20 files changed, 325 insertions(+), 292 deletions(-) rename homa_peertab.c => homa_peer.c (99%) create mode 100644 homa_peer.h rename test/{unit_homa_peertab.c => unit_homa_peer.c} (86%) diff --git a/Makefile b/Makefile index a76e3a55..467a0ef7 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ homa-y = homa_grant.o \ homa_metrics.o \ homa_offload.o \ homa_outgoing.o \ - homa_peertab.o \ + homa_peer.o \ homa_pool.o \ homa_plumbing.o \ homa_rpc.o \ diff --git a/homa_grant.c b/homa_grant.c index 224b526a..a0104089 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -5,6 +5,7 @@ */ #include "homa_impl.h" +#include "homa_peer.h" #include "homa_wire.h" /** diff --git a/homa_impl.h b/homa_impl.h index 8bd2a21f..2ca0a07e 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -147,7 +147,6 @@ struct homa_peer; /* Declarations used in this file, so they can't be made at the end. */ extern int homa_grantable_lock_slow(struct homa *homa, int recalc); -extern void homa_peer_lock_slow(struct homa_peer *peer); extern void homa_throttle_lock_slow(struct homa *homa); extern struct homa_core *homa_cores[]; @@ -386,189 +385,6 @@ struct homa_pool { int check_waiting_invoked; }; -/** - * struct homa_dead_dst - Used to retain dst_entries that are no longer - * needed, until it is safe to delete them (I'm not confident that the RCU - * mechanism will be safe for these: the reference count could get incremented - * after it's on the RCU list?). - */ -struct homa_dead_dst { - /** @dst: Entry that is no longer used by a struct homa_peer. */ - struct dst_entry *dst; - - /** - * @gc_time: Time (in units of get_cycles) when it is safe - * to free @dst. - */ - __u64 gc_time; - - /** @dst_links: Used to link together entries in peertab->dead_dsts. 
*/ - struct list_head dst_links; -}; - -/** - * define HOMA_PEERTAB_BUCKETS - Number of bits in the bucket index for a - * homa_peertab. Should be large enough to hold an entry for every server - * in a datacenter without long hash chains. - */ -#define HOMA_PEERTAB_BUCKET_BITS 16 - -/** define HOME_PEERTAB_BUCKETS - Number of buckets in a homa_peertab. */ -#define HOMA_PEERTAB_BUCKETS (1 << HOMA_PEERTAB_BUCKET_BITS) - -/** - * struct homa_peertab - A hash table that maps from IPv6 addresses - * to homa_peer objects. IPv4 entries are encapsulated as IPv6 addresses. - * Entries are gradually added to this table, but they are never removed - * except when the entire table is deleted. We can't safely delete because - * results returned by homa_peer_find may be retained indefinitely. - * - * This table is managed exclusively by homa_peertab.c, using RCU to - * permit efficient lookups. - */ -struct homa_peertab { - /** - * @write_lock: Synchronizes addition of new entries; not needed - * for lookups (RCU is used instead). - */ - spinlock_t write_lock; - - /** - * @dead_dsts: List of dst_entries that are waiting to be deleted. - * Hold @write_lock when manipulating. - */ - struct list_head dead_dsts; - - /** - * @buckets: Pointer to heads of chains of homa_peers for each bucket. - * Malloc-ed, and must eventually be freed. NULL means this structure - * has not been initialized. - */ - struct hlist_head *buckets; -}; - -/** - * struct homa_peer - One of these objects exists for each machine that we - * have communicated with (either as client or server). - */ -struct homa_peer { - /** - * @addr: IPv6 address for the machine (IPv4 addresses are stored - * as IPv4-mapped IPv6 addresses). - */ - struct in6_addr addr; - - /** @flow: Addressing info needed to send packets. */ - struct flowi flow; - - /** - * @dst: Used to route packets to this peer; we own a reference - * to this, which we must eventually release. - */ - struct dst_entry *dst; - - /** - * @unsched_cutoffs: priorities to use for unscheduled packets - * sent to this host, as specified in the most recent CUTOFFS - * packet from that host. See documentation for @homa.unsched_cutoffs - * for the meanings of these values. - */ - int unsched_cutoffs[HOMA_MAX_PRIORITIES]; - - /** - * @cutoff_version: value of cutoff_version in the most recent - * CUTOFFS packet received from this peer. 0 means we haven't - * yet received a CUTOFFS packet from the host. Note that this is - * stored in network byte order. - */ - __be16 cutoff_version; - - /** - * last_update_jiffies: time in jiffies when we sent the most - * recent CUTOFFS packet to this peer. - */ - unsigned long last_update_jiffies; - - /** - * grantable_rpcs: Contains all homa_rpcs (both requests and - * responses) involving this peer whose msgins require (or required - * them in the past) and have not been fully received. The list is - * sorted in priority order (head has fewest bytes_remaining). - * Locked with homa->grantable_lock. - */ - struct list_head grantable_rpcs; - - /** - * @grantable_links: Used to link this peer into homa->grantable_peers. - * If this RPC is not linked into homa->grantable_peers, this is an - * empty list pointing to itself. - */ - struct list_head grantable_links; - - /** - * @peertab_links: Links this object into a bucket of its - * homa_peertab. 
- */ - struct hlist_node peertab_links; - - /** - * @outstanding_resends: the number of resend requests we have - * sent to this server (spaced @homa.resend_interval apart) since - * we received a packet from this peer. - */ - int outstanding_resends; - - /** - * @most_recent_resend: @homa->timer_ticks when the most recent - * resend was sent to this peer. - */ - int most_recent_resend; - - /** - * @least_recent_rpc: of all the RPCs for this peer scanned at - * @current_ticks, this is the RPC whose @resend_timer_ticks - * is farthest in the past. - */ - struct homa_rpc *least_recent_rpc; - - /** - * @least_recent_ticks: the @resend_timer_ticks value for - * @least_recent_rpc. - */ - __u32 least_recent_ticks; - - /** - * @current_ticks: the value of @homa->timer_ticks the last time - * that @least_recent_rpc and @least_recent_ticks were computed. - * Used to detect the start of a new homa_timer pass. - */ - __u32 current_ticks; - - /** - * @resend_rpc: the value of @least_recent_rpc computed in the - * previous homa_timer pass. This RPC will be issued a RESEND - * in the current pass, if it still needs one. - */ - struct homa_rpc *resend_rpc; - - /** - * @num_acks: the number of (initial) entries in @acks that - * currently hold valid information. - */ - int num_acks; - - /** - * @acks: info about client RPCs whose results have been completely - * received. - */ - struct homa_ack acks[HOMA_MAX_ACKS_PER_PKT]; - - /** - * @ack_lock: used to synchronize access to @num_acks and @acks. - */ - spinlock_t ack_lock; -}; - /** * enum homa_freeze_type - The @type argument to homa_freeze must be * one of these values. @@ -813,8 +629,9 @@ struct homa { /** * @peertab: Info about all the other hosts we have communicated with. + * Dynamically allocated; must be kfreed. */ - struct homa_peertab peers; + struct homa_peertab *peers; /** * @page_pool_mutex: Synchronizes access to any/all of the page_pools @@ -1481,26 +1298,6 @@ static inline struct homa_sock *homa_sk(const struct sock *sk) return (struct homa_sock *)sk; } -/** - * homa_peer_lock() - Acquire the lock for a peer's @unacked_lock. If the lock - * isn't immediately available, record stats on the waiting time. - * @peer: Peer to lock. - */ -static inline void homa_peer_lock(struct homa_peer *peer) -{ - if (!spin_trylock_bh(&peer->ack_lock)) - homa_peer_lock_slow(peer); -} - -/** - * homa_peer_unlock() - Release the lock for a peer's @unacked_lock. - * @peer: Peer to lock. - */ -static inline void homa_peer_unlock(struct homa_peer *peer) -{ - spin_unlock_bh(&peer->ack_lock); -} - /** * homa_grantable_lock() - Acquire the grantable lock. If the lock * isn't immediately available, record stats on the waiting time. 
@@ -1692,8 +1489,6 @@ extern int homa_disconnect(struct sock *sk, int flags); extern void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa); extern int homa_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -extern void homa_dst_refresh(struct homa_peertab *peertab, - struct homa_peer *peer, struct homa_sock *hsk); extern int homa_err_handler_v4(struct sk_buff *skb, u32 info); extern int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, @@ -1760,23 +1555,6 @@ extern void homa_outgoing_sysctl_changed(struct homa *homa); extern int homa_pacer_main(void *transportInfo); extern void homa_pacer_stop(struct homa *homa); extern void homa_pacer_xmit(struct homa *homa); -extern void homa_peertab_destroy(struct homa_peertab *peertab); -extern struct homa_peer ** - homa_peertab_get_peers(struct homa_peertab *peertab, - int *num_peers); -extern int homa_peertab_init(struct homa_peertab *peertab); -extern void homa_peer_add_ack(struct homa_rpc *rpc); -extern struct homa_peer - *homa_peer_find(struct homa_peertab *peertab, - const struct in6_addr *addr, struct inet_sock *inet); -extern int homa_peer_get_acks(struct homa_peer *peer, int count, - struct homa_ack *dst); -extern struct dst_entry - *homa_peer_get_dst(struct homa_peer *peer, - struct inet_sock *inet); -extern void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, - int c2, int c3, int c4, int c5, int c6, int c7); -extern void homa_peertab_gc_dsts(struct homa_peertab *peertab, __u64 now); extern __poll_t homa_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); extern int homa_pool_allocate(struct homa_rpc *rpc); @@ -1898,20 +1676,5 @@ static inline void homa_check_pacer(struct homa *homa, int softirq) INC_METRIC(pacer_needed_help, 1); } -/** - * homa_get_dst() - Returns destination information associated with a peer, - * updating it if the cached information is stale. - * @peer: Peer whose destination information is desired. - * @hsk: Homa socket; needed by lower-level code to recreate the dst. - * Return Up-to-date destination for peer. - */ -static inline struct dst_entry *homa_get_dst(struct homa_peer *peer, - struct homa_sock *hsk) -{ - if (unlikely(peer->dst->obsolete > 0)) - homa_dst_refresh(&hsk->homa->peers, peer, hsk); - return peer->dst; -} - extern struct completion homa_pacer_kthread_done; #endif /* _HOMA_IMPL_H */ diff --git a/homa_incoming.c b/homa_incoming.c index 6a44c9f1..692ef639 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -5,6 +5,7 @@ */ #include "homa_impl.h" +#include "homa_peer.h" /** * homa_message_in_init() - Constructor for homa_message_in. 
@@ -759,7 +760,7 @@ void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk)
 	const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb);
 	int i;
 	struct cutoffs_header *h = (struct cutoffs_header *) skb->data;
-	struct homa_peer *peer = homa_peer_find(&hsk->homa->peers,
+	struct homa_peer *peer = homa_peer_find(hsk->homa->peers,
 			&saddr, &hsk->inet);
 
 	if (!IS_ERR(peer)) {
@@ -802,7 +803,7 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk,
 				"Freezing because NEED_ACK received before message complete, id %d, peer 0x%x");
 		goto done;
 	} else {
-		peer = homa_peer_find(&hsk->homa->peers, &saddr, &hsk->inet);
+		peer = homa_peer_find(hsk->homa->peers, &saddr, &hsk->inet);
 		if (IS_ERR(peer))
 			goto done;
 	}
diff --git a/homa_outgoing.c b/homa_outgoing.c
index bbc4c19e..31affa15 100644
--- a/homa_outgoing.c
+++ b/homa_outgoing.c
@@ -5,6 +5,7 @@
  */
 
 #include "homa_impl.h"
+#include "homa_peer.h"
 #include "homa_wire.h"
 
 /**
@@ -463,7 +464,7 @@ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk)
 	unknown.common.flags = HOMA_TCP_FLAGS;
 	unknown.common.urgent = htons(HOMA_TCP_URGENT);
 	unknown.common.sender_id = cpu_to_be64(homa_local_id(h->sender_id));
-	peer = homa_peer_find(&hsk->homa->peers, &saddr, &hsk->inet);
+	peer = homa_peer_find(hsk->homa->peers, &saddr, &hsk->inet);
 	if (!IS_ERR(peer))
 		__homa_xmit_control(&unknown, sizeof(unknown), peer, hsk);
 }
diff --git a/homa_peertab.c b/homa_peer.c
similarity index 99%
rename from homa_peertab.c
rename to homa_peer.c
index b616441d..8138f412 100644
--- a/homa_peertab.c
+++ b/homa_peer.c
@@ -1,10 +1,11 @@
 // SPDX-License-Identifier: BSD-2-Clause
 
-/* This file manages homa_peertab objects and is responsible for creating
- * and deleting homa_peer objects.
+/* This file provides functions related to homa_peer and homa_peertab
+ * objects.
  */
 
 #include "homa_impl.h"
+#include "homa_peer.h"
 
 /**
  * homa_peertab_init() - Constructor for homa_peertabs.
diff --git a/homa_peer.h b/homa_peer.h
new file mode 100644
index 00000000..688a6487
--- /dev/null
+++ b/homa_peer.h
@@ -0,0 +1,252 @@
+/* SPDX-License-Identifier: BSD-2-Clause */
+
+/* This file contains definitions related to managing peers (homa_peer
+ * and homa_peertab).
+ */
+
+#ifndef _HOMA_PEER_H
+#define _HOMA_PEER_H
+
+#include "homa_wire.h"
+#include "homa_sock.h"
+
+/**
+ * struct homa_dead_dst - Used to retain dst_entries that are no longer
+ * needed, until it is safe to delete them (I'm not confident that the RCU
+ * mechanism will be safe for these: the reference count could get incremented
+ * after it's on the RCU list?).
+ */
+struct homa_dead_dst {
+	/** @dst: Entry that is no longer used by a struct homa_peer. */
+	struct dst_entry *dst;
+
+	/**
+	 * @gc_time: Time (in units of get_cycles) when it is safe
+	 * to free @dst.
+	 */
+	__u64 gc_time;
+
+	/** @dst_links: Used to link together entries in peertab->dead_dsts. */
+	struct list_head dst_links;
+};
+
+/**
+ * define HOMA_PEERTAB_BUCKET_BITS - Number of bits in the bucket index for a
+ * homa_peertab. Should be large enough to hold an entry for every server
+ * in a datacenter without long hash chains.
+ */
+#define HOMA_PEERTAB_BUCKET_BITS 16
+
+/** define HOMA_PEERTAB_BUCKETS - Number of buckets in a homa_peertab. */
+#define HOMA_PEERTAB_BUCKETS (1 << HOMA_PEERTAB_BUCKET_BITS)
+
+/**
+ * struct homa_peertab - A hash table that maps from IPv6 addresses
+ * to homa_peer objects. IPv4 entries are encapsulated as IPv6 addresses.
+ * Entries are gradually added to this table, but they are never removed
+ * except when the entire table is deleted. We can't safely delete because
+ * results returned by homa_peer_find may be retained indefinitely.
+ *
+ * This table is managed exclusively by homa_peer.c, using RCU to
+ * permit efficient lookups.
+ */
+struct homa_peertab {
+	/**
+	 * @write_lock: Synchronizes addition of new entries; not needed
+	 * for lookups (RCU is used instead).
+	 */
+	spinlock_t write_lock;
+
+	/**
+	 * @dead_dsts: List of dst_entries that are waiting to be deleted.
+	 * Hold @write_lock when manipulating.
+	 */
+	struct list_head dead_dsts;
+
+	/**
+	 * @buckets: Pointer to heads of chains of homa_peers for each bucket.
+	 * Malloc-ed, and must eventually be freed. NULL means this structure
+	 * has not been initialized.
+	 */
+	struct hlist_head *buckets;
+};
+
+/**
+ * struct homa_peer - One of these objects exists for each machine that we
+ * have communicated with (either as client or server).
+ */
+struct homa_peer {
+	/**
+	 * @addr: IPv6 address for the machine (IPv4 addresses are stored
+	 * as IPv4-mapped IPv6 addresses).
+	 */
+	struct in6_addr addr;
+
+	/** @flow: Addressing info needed to send packets. */
+	struct flowi flow;
+
+	/**
+	 * @dst: Used to route packets to this peer; we own a reference
+	 * to this, which we must eventually release.
+	 */
+	struct dst_entry *dst;
+
+	/**
+	 * @unsched_cutoffs: priorities to use for unscheduled packets
+	 * sent to this host, as specified in the most recent CUTOFFS
+	 * packet from that host. See documentation for @homa.unsched_cutoffs
+	 * for the meanings of these values.
+	 */
+	int unsched_cutoffs[HOMA_MAX_PRIORITIES];
+
+	/**
+	 * @cutoff_version: value of cutoff_version in the most recent
+	 * CUTOFFS packet received from this peer. 0 means we haven't
+	 * yet received a CUTOFFS packet from the host. Note that this is
+	 * stored in network byte order.
+	 */
+	__be16 cutoff_version;
+
+	/**
+	 * @last_update_jiffies: time in jiffies when we sent the most
+	 * recent CUTOFFS packet to this peer.
+	 */
+	unsigned long last_update_jiffies;
+
+	/**
+	 * @grantable_rpcs: Contains all homa_rpcs (both requests and
+	 * responses) involving this peer whose msgins require grants (or
+	 * required them in the past) and have not been fully received. The
+	 * list is sorted in priority order (head has fewest bytes_remaining).
+	 * Locked with homa->grantable_lock.
+	 */
+	struct list_head grantable_rpcs;
+
+	/**
+	 * @grantable_links: Used to link this peer into homa->grantable_peers.
+	 * If this peer is not linked into homa->grantable_peers, this is an
+	 * empty list pointing to itself.
+	 */
+	struct list_head grantable_links;
+
+	/**
+	 * @peertab_links: Links this object into a bucket of its
+	 * homa_peertab.
+	 */
+	struct hlist_node peertab_links;
+
+	/**
+	 * @outstanding_resends: the number of resend requests we have
+	 * sent to this server (spaced @homa.resend_interval apart) since
+	 * we received a packet from this peer.
+	 */
+	int outstanding_resends;
+
+	/**
+	 * @most_recent_resend: @homa->timer_ticks when the most recent
+	 * resend was sent to this peer.
+	 */
+	int most_recent_resend;
+
+	/**
+	 * @least_recent_rpc: of all the RPCs for this peer scanned at
+	 * @current_ticks, this is the RPC whose @resend_timer_ticks
+	 * is farthest in the past.
+	 */
+	struct homa_rpc *least_recent_rpc;
+
+	/**
+	 * @least_recent_ticks: the @resend_timer_ticks value for
+	 * @least_recent_rpc.
+	 */
+	__u32 least_recent_ticks;
+
+	/**
+	 * @current_ticks: the value of @homa->timer_ticks the last time
+	 * that @least_recent_rpc and @least_recent_ticks were computed.
+	 * Used to detect the start of a new homa_timer pass.
+	 */
+	__u32 current_ticks;
+
+	/**
+	 * @resend_rpc: the value of @least_recent_rpc computed in the
+	 * previous homa_timer pass. This RPC will be issued a RESEND
+	 * in the current pass, if it still needs one.
+	 */
+	struct homa_rpc *resend_rpc;
+
+	/**
+	 * @num_acks: the number of (initial) entries in @acks that
+	 * currently hold valid information.
+	 */
+	int num_acks;
+
+	/**
+	 * @acks: info about client RPCs whose results have been completely
+	 * received.
+	 */
+	struct homa_ack acks[HOMA_MAX_ACKS_PER_PKT];
+
+	/**
+	 * @ack_lock: used to synchronize access to @num_acks and @acks.
+	 */
+	spinlock_t ack_lock;
+};
+
+extern void homa_dst_refresh(struct homa_peertab *peertab,
+		struct homa_peer *peer, struct homa_sock *hsk);
+extern void homa_peertab_destroy(struct homa_peertab *peertab);
+extern struct homa_peer **
+		homa_peertab_get_peers(struct homa_peertab *peertab,
+			int *num_peers);
+extern int homa_peertab_init(struct homa_peertab *peertab);
+extern void homa_peer_add_ack(struct homa_rpc *rpc);
+extern struct homa_peer
+		*homa_peer_find(struct homa_peertab *peertab,
+			const struct in6_addr *addr, struct inet_sock *inet);
+extern int homa_peer_get_acks(struct homa_peer *peer, int count,
+		struct homa_ack *dst);
+extern struct dst_entry
+		*homa_peer_get_dst(struct homa_peer *peer,
+			struct inet_sock *inet);
+extern void homa_peer_lock_slow(struct homa_peer *peer);
+extern void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1,
+		int c2, int c3, int c4, int c5, int c6, int c7);
+extern void homa_peertab_gc_dsts(struct homa_peertab *peertab, __u64 now);
+
+/**
+ * homa_peer_lock() - Acquire a peer's @ack_lock. If the lock
+ * isn't immediately available, record stats on the waiting time.
+ * @peer: Peer to lock.
+ */
+static inline void homa_peer_lock(struct homa_peer *peer)
+{
+	if (!spin_trylock_bh(&peer->ack_lock))
+		homa_peer_lock_slow(peer);
+}
+
+/**
+ * homa_peer_unlock() - Release a peer's @ack_lock.
+ * @peer: Peer to unlock.
+ */
+static inline void homa_peer_unlock(struct homa_peer *peer)
+{
+	spin_unlock_bh(&peer->ack_lock);
+}
+
+/**
+ * homa_get_dst() - Returns destination information associated with a peer,
+ * updating it if the cached information is stale.
+ * @peer:   Peer whose destination information is desired.
+ * @hsk:    Homa socket; needed by lower-level code to recreate the dst.
+ * Return:  Up-to-date destination for peer.
+ */
+static inline struct dst_entry *homa_get_dst(struct homa_peer *peer,
+		struct homa_sock *hsk)
+{
+	if (unlikely(peer->dst->obsolete > 0))
+		homa_dst_refresh(hsk->homa->peers, peer, hsk);
+	return peer->dst;
+}
+
+#endif /* _HOMA_PEER_H */
diff --git a/homa_plumbing.c b/homa_plumbing.c
index 52337aa1..83496343 100644
--- a/homa_plumbing.c
+++ b/homa_plumbing.c
@@ -5,6 +5,7 @@
  */
 
 #include "homa_impl.h"
+#include "homa_peer.h"
 
 #ifndef __UNIT_TEST__
 MODULE_LICENSE("Dual MIT/GPL");
diff --git a/homa_rpc.c b/homa_rpc.c
index 0dc2dee6..af1068f8 100644
--- a/homa_rpc.c
+++ b/homa_rpc.c
@@ -3,6 +3,7 @@
 /* This file contains functions for managing homa_rpc structs.
*/ #include "homa_impl.h" +#include "homa_peer.h" /** * homa_rpc_new_client() - Allocate and construct a client RPC (one that is used @@ -35,7 +36,7 @@ struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk, crpc->state = RPC_OUTGOING; atomic_set(&crpc->flags, 0); atomic_set(&crpc->grants_in_progress, 0); - crpc->peer = homa_peer_find(&hsk->homa->peers, &dest_addr_as_ipv6, + crpc->peer = homa_peer_find(hsk->homa->peers, &dest_addr_as_ipv6, &hsk->inet); if (IS_ERR(crpc->peer)) { tt_record("error in homa_peer_find"); @@ -137,7 +138,7 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, srpc->state = RPC_INCOMING; atomic_set(&srpc->flags, 0); atomic_set(&srpc->grants_in_progress, 0); - srpc->peer = homa_peer_find(&hsk->homa->peers, source, &hsk->inet); + srpc->peer = homa_peer_find(hsk->homa->peers, source, &hsk->inet); if (IS_ERR(srpc->peer)) { err = PTR_ERR(srpc->peer); goto error; diff --git a/homa_sock.c b/homa_sock.c index fe898263..575d72d2 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -3,6 +3,7 @@ /* This file manages homa_sock and homa_socktab objects. */ #include "homa_impl.h" +#include "homa_peer.h" /** * homa_socktab_init() - Constructor for homa_socktabs. diff --git a/homa_timer.c b/homa_timer.c index aefd6588..b026e052 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -5,6 +5,7 @@ */ #include "homa_impl.h" +#include "homa_peer.h" /** * homa_check_rpc() - Invoked for each RPC during each timer pass; does diff --git a/homa_utils.c b/homa_utils.c index 2aa7f666..d657ee43 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -5,6 +5,7 @@ */ #include "homa_impl.h" +#include "homa_peer.h" /* Core-specific information. NR_CPUS is an overestimate of the actual * number, but allows us to allocate the array statically. @@ -123,7 +124,8 @@ int homa_init(struct homa *homa) atomic_set(&homa->total_incoming, 0); homa->next_client_port = HOMA_MIN_DEFAULT_PORT; homa_socktab_init(&homa->port_map); - err = homa_peertab_init(&homa->peers); + homa->peers = kmalloc(sizeof *homa->peers, GFP_KERNEL); + err = homa_peertab_init(homa->peers); if (err) { pr_err("Couldn't initialize peer table (errno %d)\n", -err); return err; @@ -218,7 +220,8 @@ void homa_destroy(struct homa *homa) /* The order of the following 2 statements matters! 
*/ homa_socktab_destroy(&homa->port_map); - homa_peertab_destroy(&homa->peers); + homa_peertab_destroy(homa->peers); + kfree(homa->peers); homa_skb_cleanup(homa); for (i = 0; i < MAX_NUMNODES; i++) { @@ -583,7 +586,7 @@ void homa_freeze_peers(struct homa *homa) return; } - peers = homa_peertab_get_peers(&homa->peers, &num_peers); + peers = homa_peertab_get_peers(homa->peers, &num_peers); if (peers == NULL) { tt_record("homa_freeze_peers couldn't find peers to freeze"); return; diff --git a/test/Makefile b/test/Makefile index c5273d70..e2191b52 100644 --- a/test/Makefile +++ b/test/Makefile @@ -42,7 +42,7 @@ TEST_SRCS := unit_homa_grant.c \ unit_homa_offload.c \ unit_homa_metrics.c \ unit_homa_outgoing.c \ - unit_homa_peertab.c \ + unit_homa_peer.c \ unit_homa_pool.c \ unit_homa_plumbing.c \ unit_homa_rpc.c \ @@ -58,7 +58,7 @@ HOMA_SRCS := homa_grant.c \ homa_metrics.c \ homa_offload.c \ homa_outgoing.c \ - homa_peertab.c \ + homa_peer.c \ homa_pool.c \ homa_plumbing.c \ homa_rpc.c \ diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 635ac5a8..0e2eec98 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -3,6 +3,7 @@ */ #include "homa_impl.h" +#include "homa_peer.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" @@ -943,7 +944,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__cutoffs_for_unknown_client_rpc) .cutoff_version = 400}; homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), &self->homa); - peer = homa_peer_find(&self->homa.peers, self->server_ip, + peer = homa_peer_find(self->homa.peers, self->server_ip, &self->hsk.inet); ASSERT_FALSE(IS_ERR(peer)); EXPECT_EQ(400, peer->cutoff_version); @@ -1594,7 +1595,7 @@ TEST_F(homa_incoming, homa_cutoffs__cant_find_peer) mock_kmalloc_errors = 1; homa_cutoffs_pkt(skb, &self->hsk); EXPECT_EQ(1, homa_metrics_per_cpu()->peer_kmalloc_errors); - peer = homa_peer_find(&self->homa.peers, self->server_ip, + peer = homa_peer_find(self->homa.peers, self->server_ip, &self->hsk.inet); ASSERT_FALSE(IS_ERR(peer)); EXPECT_EQ(0, peer->cutoff_version); @@ -1660,7 +1661,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_not_incoming) } TEST_F(homa_incoming, homa_need_ack_pkt__rpc_doesnt_exist) { - struct homa_peer *peer = homa_peer_find(&self->homa.peers, + struct homa_peer *peer = homa_peer_find(self->homa.peers, self->server_ip, &self->hsk.inet); peer->acks[0].client_port = htons(self->client_port); peer->acks[0].server_port = htons(self->server_port); diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 8ff9143f..e7327040 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -3,6 +3,7 @@ */ #include "homa_impl.h" +#include "homa_peer.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" @@ -58,7 +59,7 @@ FIXTURE_SETUP(homa_outgoing) self->server_addr.in6.sin6_family = AF_INET; self->server_addr.in6.sin6_addr = self->server_ip[0]; self->server_addr.in6.sin6_port = htons(self->server_port); - self->peer = homa_peer_find(&self->homa.peers, + self->peer = homa_peer_find(self->homa.peers, &self->server_addr.in6.sin6_addr, &self->hsk.inet); unit_log_clear(); } diff --git a/test/unit_homa_peertab.c b/test/unit_homa_peer.c similarity index 86% rename from test/unit_homa_peertab.c rename to test/unit_homa_peer.c index 92c352ea..5af0d545 100644 --- a/test/unit_homa_peertab.c +++ b/test/unit_homa_peer.c @@ -3,6 +3,7 @@ */ #include "homa_impl.h" +#include "homa_peer.h" #define KSELFTEST_NOT_MAIN 1 #include 
"kselftest_harness.h" #include "ccutils.h" @@ -13,7 +14,7 @@ struct in6_addr ip1111[1]; struct in6_addr ip2222[1]; struct in6_addr ip3333[1]; -FIXTURE(homa_peertab) { +FIXTURE(homa_peer) { struct homa homa; struct homa_sock hsk; struct homa_peertab peertab; @@ -21,7 +22,7 @@ FIXTURE(homa_peertab) { struct in6_addr server_ip[1]; int server_port; }; -FIXTURE_SETUP(homa_peertab) +FIXTURE_SETUP(homa_peer) { homa_init(&self->homa); mock_sock_init(&self->hsk, &self->homa, 0); @@ -33,7 +34,7 @@ FIXTURE_SETUP(homa_peertab) ip3333[0] = unit_get_in_addr("3::3:3:3"); self->server_port = 99; } -FIXTURE_TEARDOWN(homa_peertab) +FIXTURE_TEARDOWN(homa_peer) { homa_peertab_destroy(&self->peertab); homa_destroy(&self->homa); @@ -56,7 +57,7 @@ static void peer_spinlock_hook(char *id) mock_cycles += 1000; } -TEST_F(homa_peertab, homa_peer_find__basics) +TEST_F(homa_peer, homa_peer_find__basics) { struct homa_peer *peer, *peer2; @@ -75,7 +76,7 @@ TEST_F(homa_peertab, homa_peer_find__basics) EXPECT_EQ(2, homa_metrics_per_cpu()->peer_new_entries); } -static struct _test_data_homa_peertab *test_data; +static struct _test_data_homa_peer *test_data; static struct homa_peer *conflicting_peer = NULL; static int peer_lock_hook_invocations = 0; static void peer_lock_hook(char *id) { @@ -90,7 +91,7 @@ static void peer_lock_hook(char *id) { &test_data->hsk.inet); } -TEST_F(homa_peertab, homa_peertab_init__vmalloc_failed) +TEST_F(homa_peer, homa_peertab_init__vmalloc_failed) { struct homa_peertab table; mock_vmalloc_errors = 1; @@ -100,7 +101,7 @@ TEST_F(homa_peertab, homa_peertab_init__vmalloc_failed) homa_peertab_destroy(&table); } -TEST_F(homa_peertab, homa_peertab_gc_dsts) +TEST_F(homa_peer, homa_peertab_gc_dsts) { struct homa_peer *peer; peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); @@ -118,7 +119,7 @@ TEST_F(homa_peertab, homa_peertab_gc_dsts) EXPECT_EQ(0, dead_count(&self->peertab)); } -TEST_F(homa_peertab, homa_peertab_get_peers__not_init) +TEST_F(homa_peer, homa_peertab_get_peers__not_init) { struct homa_peertab peertab; int num_peers = 45; @@ -126,13 +127,13 @@ TEST_F(homa_peertab, homa_peertab_get_peers__not_init) EXPECT_EQ(NULL, homa_peertab_get_peers(&peertab, &num_peers)); EXPECT_EQ(0, num_peers); } -TEST_F(homa_peertab, homa_peertab_get_peers__table_empty) +TEST_F(homa_peer, homa_peertab_get_peers__table_empty) { int num_peers = 45; EXPECT_EQ(NULL, homa_peertab_get_peers(&self->peertab, &num_peers)); EXPECT_EQ(0, num_peers); } -TEST_F(homa_peertab, homa_peertab_get_peers__kmalloc_fails) +TEST_F(homa_peer, homa_peertab_get_peers__kmalloc_fails) { int num_peers = 45; mock_kmalloc_errors = 1; @@ -140,7 +141,7 @@ TEST_F(homa_peertab, homa_peertab_get_peers__kmalloc_fails) EXPECT_EQ(NULL, homa_peertab_get_peers(&self->peertab, &num_peers)); EXPECT_EQ(0, num_peers); } -TEST_F(homa_peertab, homa_peertab_get_peers__one_peer) +TEST_F(homa_peer, homa_peertab_get_peers__one_peer) { struct homa_peer **peers; struct homa_peer *peer; @@ -152,7 +153,7 @@ TEST_F(homa_peertab, homa_peertab_get_peers__one_peer) EXPECT_EQ(peer, peers[0]); kfree(peers); } -TEST_F(homa_peertab, homa_peertab_get_peers__multiple_peers) +TEST_F(homa_peer, homa_peertab_get_peers__multiple_peers) { struct homa_peer **peers; struct homa_peer *peer1, *peer2, *peer3; @@ -172,7 +173,7 @@ TEST_F(homa_peertab, homa_peertab_get_peers__multiple_peers) kfree(peers); } -TEST_F(homa_peertab, homa_peer_find__conflicting_creates) +TEST_F(homa_peer, homa_peer_find__conflicting_creates) { struct homa_peer *peer; @@ -183,7 +184,7 @@ 
TEST_F(homa_peertab, homa_peer_find__conflicting_creates) EXPECT_NE(NULL, conflicting_peer); EXPECT_EQ(conflicting_peer, peer); } -TEST_F(homa_peertab, homa_peer_find__kmalloc_error) +TEST_F(homa_peer, homa_peer_find__kmalloc_error) { struct homa_peer *peer; @@ -193,7 +194,7 @@ TEST_F(homa_peertab, homa_peer_find__kmalloc_error) EXPECT_EQ(1, homa_metrics_per_cpu()->peer_kmalloc_errors); } -TEST_F(homa_peertab, homa_peer_find__route_error) +TEST_F(homa_peer, homa_peer_find__route_error) { struct homa_peer *peer; @@ -204,7 +205,7 @@ TEST_F(homa_peertab, homa_peer_find__route_error) EXPECT_EQ(1, homa_metrics_per_cpu()->peer_route_errors); } -TEST_F(homa_peertab, homa_dst_refresh__basics) +TEST_F(homa_peer, homa_dst_refresh__basics) { struct homa_peer *peer; struct dst_entry *old_dst; @@ -213,11 +214,11 @@ TEST_F(homa_peertab, homa_dst_refresh__basics) EXPECT_EQ_IP(*ip1111, peer->addr); old_dst = homa_get_dst(peer, &self->hsk); - homa_dst_refresh(&self->homa.peers, peer, &self->hsk); + homa_dst_refresh(self->homa.peers, peer, &self->hsk); EXPECT_NE(old_dst, peer->dst); - EXPECT_EQ(1, dead_count(&self->homa.peers)); + EXPECT_EQ(1, dead_count(self->homa.peers)); } -TEST_F(homa_peertab, homa_dst_refresh__routing_error) +TEST_F(homa_peer, homa_dst_refresh__routing_error) { struct homa_peer *peer; struct dst_entry *old_dst; @@ -227,12 +228,12 @@ TEST_F(homa_peertab, homa_dst_refresh__routing_error) old_dst = homa_get_dst(peer, &self->hsk); mock_route_errors = 1; - homa_dst_refresh(&self->homa.peers, peer, &self->hsk); + homa_dst_refresh(self->homa.peers, peer, &self->hsk); EXPECT_EQ(old_dst, peer->dst); EXPECT_EQ(1, homa_metrics_per_cpu()->peer_route_errors); - EXPECT_EQ(0, dead_count(&self->homa.peers)); + EXPECT_EQ(0, dead_count(self->homa.peers)); } -TEST_F(homa_peertab, homa_dst_refresh__malloc_error) +TEST_F(homa_peer, homa_dst_refresh__malloc_error) { struct homa_peer *peer; struct dst_entry *old_dst; @@ -242,11 +243,11 @@ TEST_F(homa_peertab, homa_dst_refresh__malloc_error) old_dst = homa_get_dst(peer, &self->hsk); mock_kmalloc_errors = 1; - homa_dst_refresh(&self->homa.peers, peer, &self->hsk); + homa_dst_refresh(self->homa.peers, peer, &self->hsk); EXPECT_NE(old_dst, peer->dst); - EXPECT_EQ(0, dead_count(&self->homa.peers)); + EXPECT_EQ(0, dead_count(self->homa.peers)); } -TEST_F(homa_peertab, homa_dst_refresh__free_old_dsts) +TEST_F(homa_peer, homa_dst_refresh__free_old_dsts) { struct homa_peer *peer; peer = homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); @@ -254,15 +255,15 @@ TEST_F(homa_peertab, homa_dst_refresh__free_old_dsts) EXPECT_EQ_IP(*ip1111, peer->addr); mock_cycles = 0; - homa_dst_refresh(&self->homa.peers, peer, &self->hsk); - homa_dst_refresh(&self->homa.peers, peer, &self->hsk); - EXPECT_EQ(2, dead_count(&self->homa.peers)); + homa_dst_refresh(self->homa.peers, peer, &self->hsk); + homa_dst_refresh(self->homa.peers, peer, &self->hsk); + EXPECT_EQ(2, dead_count(self->homa.peers)); mock_cycles = 500000000; - homa_dst_refresh(&self->homa.peers, peer, &self->hsk); - EXPECT_EQ(1, dead_count(&self->homa.peers)); + homa_dst_refresh(self->homa.peers, peer, &self->hsk); + EXPECT_EQ(1, dead_count(self->homa.peers)); } -TEST_F(homa_peertab, homa_unsched_priority) +TEST_F(homa_peer, homa_unsched_priority) { struct homa_peer peer; homa_peer_set_cutoffs(&peer, INT_MAX, 0, 0, INT_MAX, 200, 100, 0, 0); @@ -272,7 +273,7 @@ TEST_F(homa_peertab, homa_unsched_priority) EXPECT_EQ(3, homa_unsched_priority(&self->homa, &peer, 201)); } -TEST_F(homa_peertab, homa_peer_get_dst_ipv4) 
+TEST_F(homa_peer, homa_peer_get_dst_ipv4) { struct dst_entry *dst; @@ -291,7 +292,7 @@ TEST_F(homa_peertab, homa_peer_get_dst_ipv4) EXPECT_STREQ("196.168.0.1", homa_print_ipv4_addr(peer->flow.u.ip4.daddr)); } -TEST_F(homa_peertab, homa_peer_get_dst_ipv6) +TEST_F(homa_peer, homa_peer_get_dst_ipv6) { char buffer[30]; __u32 addr; @@ -316,7 +317,7 @@ TEST_F(homa_peertab, homa_peer_get_dst_ipv6) homa_print_ipv6_addr(&peer->flow.u.ip6.daddr)); } -TEST_F(homa_peertab, homa_peer_lock_slow) +TEST_F(homa_peer, homa_peer_lock_slow) { mock_cycles = 10000; struct homa_peer *peer = homa_peer_find(&self->peertab, ip3333, @@ -336,7 +337,7 @@ TEST_F(homa_peertab, homa_peer_lock_slow) homa_peer_unlock(peer); } -TEST_F(homa_peertab, homa_peer_add_ack) +TEST_F(homa_peer, homa_peer_add_ack) { struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, @@ -389,7 +390,7 @@ TEST_F(homa_peertab, homa_peer_add_ack) unit_log_get()); } -TEST_F(homa_peertab, homa_peer_get_acks) +TEST_F(homa_peer, homa_peer_get_acks) { struct homa_peer *peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 3bc4e227..9cbda739 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -3,6 +3,7 @@ */ #include "homa_impl.h" +#include "homa_peer.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 48594aac..59bf413d 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -3,6 +3,7 @@ */ #include "homa_impl.h" +#include "homa_peer.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" diff --git a/test/unit_homa_timer.c b/test/unit_homa_timer.c index c8167984..216ff473 100644 --- a/test/unit_homa_timer.c +++ b/test/unit_homa_timer.c @@ -3,6 +3,7 @@ */ #include "homa_impl.h" +#include "homa_peer.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" diff --git a/test/utils.c b/test/utils.c index 8b4e3e6c..a684d4cb 100644 --- a/test/utils.c +++ b/test/utils.c @@ -7,6 +7,7 @@ */ #include "homa_impl.h" +#include "homa_peer.h" #include "ccutils.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" From 79261a026a768d0ad537cab0f75b4a0bc0d07afe Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 30 Sep 2024 17:27:17 -0700 Subject: [PATCH 030/625] Extract homa_pool.h from homa_impl.h --- homa_impl.h | 147 ------------------------------------ homa_incoming.c | 1 + homa_plumbing.c | 1 + homa_pool.c | 3 + homa_pool.h | 154 ++++++++++++++++++++++++++++++++++++++ homa_rpc.c | 1 + homa_sock.c | 1 + test/mock.c | 3 +- test/unit_homa_incoming.c | 1 + test/unit_homa_plumbing.c | 1 + test/unit_homa_pool.c | 2 +- test/unit_homa_rpc.c | 1 + 12 files changed, 167 insertions(+), 149 deletions(-) create mode 100644 homa_pool.h diff --git a/homa_impl.h b/homa_impl.h index 2ca0a07e..07061343 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -134,10 +134,7 @@ extern void *mock_vmalloc(size_t size); /* Forward declarations. */ struct homa_sock; -struct homa_rpc; -struct homa_rpc_bucket; struct homa; -struct homa_peer; #include "homa.h" #include "timetrace.h" @@ -252,139 +249,6 @@ static inline void homa_interest_init(struct homa_interest *interest) interest->response_links.next = LIST_POISON1; } -/** - * struct homa_bpage - Contains information about a single page in - * a buffer pool. 
- */ -struct homa_bpage { - union { - /** - * @cache_line: Ensures that each homa_bpage object - * is exactly one cache line long. - */ - struct homa_cache_line cache_line; - struct { - /** @lock: to synchronize shared access. */ - spinlock_t lock; - - /** - * @refs: Counts number of distinct uses of this - * bpage (1 tick for each message that is using - * this page, plus an additional tick if the @owner - * field is set). - */ - atomic_t refs; - - /** - * @owner: kernel core that currently owns this page - * (< 0 if none). - */ - int owner; - - /** - * @expiration: time (in get_cycles units) after - * which it's OK to steal this page from its current - * owner (if @refs is 1). - */ - __u64 expiration; - }; - }; -}; -_Static_assert(sizeof(struct homa_bpage) == sizeof(struct homa_cache_line), - "homa_bpage overflowed a cache line"); - -/** - * struct homa_pool_core - Holds core-specific data for a homa_pool (a bpage - * out of which that core is allocating small chunks). - */ -struct homa_pool_core { - union { - /** - * @cache_line: Ensures that each object is exactly one - * cache line long. - */ - struct homa_cache_line cache_line; - struct { - /** - * @page_hint: Index of bpage in pool->descriptors, - * which may be owned by this core. If so, we'll use it - * for allocating partial pages. - */ - int page_hint; - - /** - * @allocated: if the page given by @page_hint is - * owned by this core, this variable gives the number of - * (initial) bytes that have already been allocated - * from the page. - */ - int allocated; - - /** - * @next_candidate: when searching for free bpages, - * check this index next. - */ - int next_candidate; - }; - }; -}; -_Static_assert(sizeof(struct homa_pool_core) == sizeof(struct homa_cache_line), - "homa_pool_core overflowed a cache line"); - -/** - * struct homa_pool - Describes a pool of buffer space for incoming - * messages for a particular socket; managed by homa_pool.c. The pool is - * divided up into "bpages", which are a multiple of the hardware page size. - * A bpage may be owned by a particular core so that it can more efficiently - * allocate space for small messages. - */ -struct homa_pool { - /** - * @hsk: the socket that this pool belongs to. - */ - struct homa_sock *hsk; - - /** - * @region: beginning of the pool's region (in the app's virtual - * memory). Divided into bpages. 0 means the pool hasn't yet been - * initialized. - */ - char *region; - - /** @num_bpages: total number of bpages in the pool. */ - int num_bpages; - - /** @descriptors: kmalloced area containing one entry for each bpage. */ - struct homa_bpage *descriptors; - - /** - * @free_bpages: the number of pages still available for allocation - * by homa_pool_get pages. This equals the number of pages with zero - * reference counts, minus the number of pages that have been claimed - * by homa_get_pool_pages but not yet allocated. - */ - atomic_t free_bpages; - - /** - * The number of free bpages required to satisfy the needs of the - * first RPC on @hsk->waiting_for_bufs, or INT_MAX if that queue - * is empty. - */ - int bpages_needed; - - /** @cores: core-specific info; dynamically allocated. */ - struct homa_pool_core *cores; - - /** @num_cores: number of elements in @cores. */ - int num_cores; - - /** - * @check_waiting_invoked: incremented during unit tests when - * homa_pool_check_waiting is invoked. - */ - int check_waiting_invoked; -}; - /** * enum homa_freeze_type - The @type argument to homa_freeze must be * one of these values. 
@@ -1557,17 +1421,6 @@ extern void homa_pacer_stop(struct homa *homa); extern void homa_pacer_xmit(struct homa *homa); extern __poll_t homa_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); -extern int homa_pool_allocate(struct homa_rpc *rpc); -extern void homa_pool_check_waiting(struct homa_pool *pool); -extern void homa_pool_destroy(struct homa_pool *pool); -extern void *homa_pool_get_buffer(struct homa_rpc *rpc, int offset, - int *available); -extern int homa_pool_get_pages(struct homa_pool *pool, int num_pages, - __u32 *pages, int leave_locked); -extern int homa_pool_init(struct homa_sock *hsk, void *buf_region, - __u64 region_size); -extern void homa_pool_release_buffers(struct homa_pool *pool, - int num_buffers, __u32 *buffers); extern char *homa_print_ipv4_addr(__be32 addr); extern char *homa_print_ipv6_addr(const struct in6_addr *addr); extern char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len); diff --git a/homa_incoming.c b/homa_incoming.c index 692ef639..2ba42f0f 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -6,6 +6,7 @@ #include "homa_impl.h" #include "homa_peer.h" +#include "homa_pool.h" /** * homa_message_in_init() - Constructor for homa_message_in. diff --git a/homa_plumbing.c b/homa_plumbing.c index 83496343..9ead714e 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -6,6 +6,7 @@ #include "homa_impl.h" #include "homa_peer.h" +#include "homa_pool.h" #ifndef __UNIT_TEST__ MODULE_LICENSE("Dual MIT/GPL"); diff --git a/homa_pool.c b/homa_pool.c index 33946639..f79e9289 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -1,6 +1,9 @@ // SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" +#include "homa_pool.h" + +/* This file contains functions that manage user-space buffer pools. */ /* Pools must always have at least this many bpages (no particular * reasoning behind this value). diff --git a/homa_pool.h b/homa_pool.h new file mode 100644 index 00000000..b5ffdfe6 --- /dev/null +++ b/homa_pool.h @@ -0,0 +1,154 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ + +/* This file contains definitions used to manage user-space buffer pools. + */ + +#ifndef _HOMA_POOL_H +#define _HOMA_POOL_H + +/** + * struct homa_bpage - Contains information about a single page in + * a buffer pool. + */ +struct homa_bpage { + union { + /** + * @cache_line: Ensures that each homa_bpage object + * is exactly one cache line long. + */ + struct homa_cache_line cache_line; + struct { + /** @lock: to synchronize shared access. */ + spinlock_t lock; + + /** + * @refs: Counts number of distinct uses of this + * bpage (1 tick for each message that is using + * this page, plus an additional tick if the @owner + * field is set). + */ + atomic_t refs; + + /** + * @owner: kernel core that currently owns this page + * (< 0 if none). + */ + int owner; + + /** + * @expiration: time (in get_cycles units) after + * which it's OK to steal this page from its current + * owner (if @refs is 1). + */ + __u64 expiration; + }; + }; +}; +_Static_assert(sizeof(struct homa_bpage) == sizeof(struct homa_cache_line), + "homa_bpage overflowed a cache line"); + +/** + * struct homa_pool_core - Holds core-specific data for a homa_pool (a bpage + * out of which that core is allocating small chunks). + */ +struct homa_pool_core { + union { + /** + * @cache_line: Ensures that each object is exactly one + * cache line long. + */ + struct homa_cache_line cache_line; + struct { + /** + * @page_hint: Index of bpage in pool->descriptors, + * which may be owned by this core. 
If so, we'll use it
+			 * for allocating partial pages.
+			 */
+			int page_hint;
+
+			/**
+			 * @allocated: if the page given by @page_hint is
+			 * owned by this core, this variable gives the number of
+			 * (initial) bytes that have already been allocated
+			 * from the page.
+			 */
+			int allocated;
+
+			/**
+			 * @next_candidate: when searching for free bpages,
+			 * check this index next.
+			 */
+			int next_candidate;
+		};
+	};
+};
+_Static_assert(sizeof(struct homa_pool_core) == L1_CACHE_BYTES,
+	       "homa_pool_core overflowed a cache line");
+
+/**
+ * struct homa_pool - Describes a pool of buffer space for incoming
+ * messages for a particular socket; managed by homa_pool.c. The pool is
+ * divided up into "bpages", which are a multiple of the hardware page size.
+ * A bpage may be owned by a particular core so that it can more efficiently
+ * allocate space for small messages.
+ */
+struct homa_pool {
+	/**
+	 * @hsk: the socket that this pool belongs to.
+	 */
+	struct homa_sock *hsk;
+
+	/**
+	 * @region: beginning of the pool's region (in the app's virtual
+	 * memory). Divided into bpages. 0 means the pool hasn't yet been
+	 * initialized.
+	 */
+	char *region;
+
+	/** @num_bpages: total number of bpages in the pool. */
+	int num_bpages;
+
+	/** @descriptors: kmalloced area containing one entry for each bpage. */
+	struct homa_bpage *descriptors;
+
+	/**
+	 * @free_bpages: the number of pages still available for allocation
+	 * by homa_pool_get_pages. This equals the number of pages with zero
+	 * reference counts, minus the number of pages that have been claimed
+	 * by homa_pool_get_pages but not yet allocated.
+	 */
+	atomic_t free_bpages;
+
+	/**
+	 * @bpages_needed: The number of free bpages required to satisfy the
+	 * needs of the first RPC on @hsk->waiting_for_bufs, or INT_MAX if
+	 * that queue is empty.
+	 */
+	int bpages_needed;
+
+	/** @cores: core-specific info; dynamically allocated. */
+	struct homa_pool_core *cores;
+
+	/** @num_cores: number of elements in @cores. */
+	int num_cores;
+
+	/**
+	 * @check_waiting_invoked: incremented during unit tests when
+	 * homa_pool_check_waiting is invoked.
+	 */
+	int check_waiting_invoked;
+};
+
+extern int homa_pool_allocate(struct homa_rpc *rpc);
+extern void homa_pool_check_waiting(struct homa_pool *pool);
+extern void homa_pool_destroy(struct homa_pool *pool);
+extern void *homa_pool_get_buffer(struct homa_rpc *rpc, int offset,
+		int *available);
+extern int homa_pool_get_pages(struct homa_pool *pool, int num_pages,
+		__u32 *pages, int leave_locked);
+extern int homa_pool_init(struct homa_sock *hsk, void *buf_region,
+		__u64 region_size);
+extern void homa_pool_release_buffers(struct homa_pool *pool,
+		int num_buffers, __u32 *buffers);
+
+#endif /* _HOMA_POOL_H */
diff --git a/homa_rpc.c b/homa_rpc.c
index af1068f8..aeaaea15 100644
--- a/homa_rpc.c
+++ b/homa_rpc.c
@@ -4,6 +4,7 @@
 
 #include "homa_impl.h"
 #include "homa_peer.h"
+#include "homa_pool.h"
 
 /**
  * homa_rpc_new_client() - Allocate and construct a client RPC (one that is used
diff --git a/homa_sock.c b/homa_sock.c
index 575d72d2..90a29547 100644
--- a/homa_sock.c
+++ b/homa_sock.c
@@ -4,6 +4,7 @@
 
 #include "homa_impl.h"
 #include "homa_peer.h"
+#include "homa_pool.h"
 
 /**
  * homa_socktab_init() - Constructor for homa_socktabs.
diff --git a/test/mock.c b/test/mock.c index 013ecae6..23e2f9c2 100644 --- a/test/mock.c +++ b/test/mock.c @@ -3,11 +3,12 @@ */ /* This file provides simplified substitutes for many Linux variables and - * functions, in order to allow Homa unit tests to be run outside a Linux + * functions in order to allow Homa unit tests to be run outside a Linux * kernel. */ #include "homa_impl.h" +#include "homa_pool.h" #include "ccutils.h" #include "mock.h" #include "utils.h" diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 0e2eec98..ccdfd313 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -4,6 +4,7 @@ #include "homa_impl.h" #include "homa_peer.h" +#include "homa_pool.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 9cbda739..11f89b12 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -4,6 +4,7 @@ #include "homa_impl.h" #include "homa_peer.h" +#include "homa_pool.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index af8ed0c0..e4dd08e7 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -3,7 +3,7 @@ */ #include "homa_impl.h" -#include "homa_impl.h" +#include "homa_pool.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 59bf413d..93020fdd 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -4,6 +4,7 @@ #include "homa_impl.h" #include "homa_peer.h" +#include "homa_pool.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" From 2373e1a9442b0c1188a87af5f6a3c84d71ae24d5 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 30 Sep 2024 22:20:40 -0700 Subject: [PATCH 031/625] Miscellaneous refactoring to pull more stuff out of homa_impl.h --- homa_grant.c | 1 + homa_impl.h | 58 +++---------------------------- homa_incoming.c | 4 +-- homa_outgoing.c | 1 + homa_peer.c | 1 + homa_peer.h | 2 ++ homa_plumbing.c | 2 +- homa_pool.h | 2 ++ homa_rpc.c | 8 ++--- homa_rpc.h | 10 ++++++ homa_sock.c | 6 ++-- homa_sock.h | 21 +++++++++++ homa_timer.c | 3 +- homa_utils.c | 9 +++-- homa_wire.h | 14 ++++++++ notes.txt | 50 ++++++++++++++++++--------- test/mock.c | 2 +- test/unit_homa_grant.c | 1 + test/unit_homa_offload.c | 1 + test/unit_homa_outgoing.c | 3 +- test/unit_homa_peer.c | 1 + test/unit_homa_plumbing.c | 2 +- test/unit_homa_sock.c | 73 ++++++++++++++++++++------------------- test/unit_homa_timer.c | 1 + test/utils.c | 1 + test/utils.h | 2 ++ 26 files changed, 155 insertions(+), 124 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index a0104089..7ae086b7 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -6,6 +6,7 @@ #include "homa_impl.h" #include "homa_peer.h" +#include "homa_rpc.h" #include "homa_wire.h" /** diff --git a/homa_impl.h b/homa_impl.h index 07061343..c343aa69 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -7,9 +7,6 @@ #ifndef _HOMA_IMPL_H #define _HOMA_IMPL_H -#pragma GCC diagnostic ignored "-Wpointer-sign" -#pragma GCC diagnostic ignored "-Wunused-variable" - #include #ifdef __UNIT_TEST__ #undef WARN @@ -45,8 +42,6 @@ #include #include #include -#pragma GCC diagnostic warning "-Wpointer-sign" -#pragma GCC diagnostic warning "-Wunused-variable" #ifdef __UNIT_TEST__ #undef alloc_pages @@ -133,13 +128,12 @@ extern void *mock_vmalloc(size_t size); #endif /* Forward declarations. 
*/ +struct homa_peer; struct homa_sock; struct homa; #include "homa.h" #include "timetrace.h" -#include "homa_rpc.h" -#include "homa_wire.h" #include "homa_metrics.h" /* Declarations used in this file, so they can't be made at the end. */ @@ -487,9 +481,10 @@ struct homa { __u16 next_client_port __aligned(CACHE_LINE_SIZE); /** - * @port_map: Information about all open sockets. + * @port_map: Information about all open sockets. Dynamically + * allocated; must be kfreed. */ - struct homa_socktab port_map __aligned(CACHE_LINE_SIZE); + struct homa_socktab *port_map __aligned(CACHE_LINE_SIZE); /** * @peertab: Info about all the other hosts we have communicated with. @@ -1089,30 +1084,6 @@ static inline struct homa_skb_info *homa_get_skb_info(struct sk_buff *skb) - sizeof(struct homa_skb_info)); } -/** - * homa_is_client(): returns true if we are the client for a particular RPC, - * false if we are the server. - * @id: Id of the RPC in question. - */ -static inline bool homa_is_client(__u64 id) -{ - return (id & 1) == 0; -} - -/** - * homa_local_id(): given an RPC identifier from an input packet (which - * is network-encoded), return the decoded id we should use for that - * RPC on this machine. - * @sender_id: RPC id from an incoming packet, such as h->common.sender_id - */ -static inline __u64 homa_local_id(__be64 sender_id) -{ - /* If the client bit was set on the sender side, it needs to be - * removed here, and conversely. - */ - return be64_to_cpu(sender_id) ^ 1; -} - /** * homa_next_skb() - Compute address of Homa's private link field in @skb. * @skb: Socket buffer containing private link field. @@ -1128,22 +1099,6 @@ static inline struct sk_buff **homa_next_skb(struct sk_buff *skb) return (struct sk_buff **) (skb_end_pointer(skb) - sizeof(char *)); } -/** - * port_hash() - Hash function for port numbers. - * @port: Port number being looked up. - * - * Return: The index of the bucket in which this port will be found (if - * it exists. - */ -static inline int homa_port_hash(__u16 port) -{ - /* We can use a really simple hash function here because client - * port numbers are allocated sequentially and server port numbers - * are unpredictable. - */ - return port & (HOMA_SOCKTAB_BUCKETS - 1); -} - /** * homa_set_doff() - Fills in the doff TCP header field for a Homa packet. * @h: Packet header whose doff field is to be set. @@ -1157,11 +1112,6 @@ static inline void homa_set_doff(struct data_header *h, int size) h->common.doff = size << 2; } -static inline struct homa_sock *homa_sk(const struct sock *sk) -{ - return (struct homa_sock *)sk; -} - /** * homa_grantable_lock() - Acquire the grantable lock. If the lock * isn't immediately available, record stats on the waiting time. diff --git a/homa_incoming.c b/homa_incoming.c index 2ba42f0f..67ada889 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -356,7 +356,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) int num_acks = 0; /* Find the appropriate socket.*/ - hsk = homa_sock_find(&homa->port_map, dport); + hsk = homa_sock_find(homa->port_map, dport); if (!hsk) { if (skb_is_ipv6(skb)) icmp6_send(skb, ICMPV6_DEST_UNREACH, @@ -985,7 +985,7 @@ void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, struct homa_rpc *rpc, *tmp; rcu_read_lock(); - for (hsk = homa_socktab_start_scan(&homa->port_map, &scan); + for (hsk = homa_socktab_start_scan(homa->port_map, &scan); hsk != NULL; hsk = homa_socktab_next(&scan)) { /* Skip the (expensive) lock acquisition if there's no * work to do. 
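(Aside, for readers following the port_map conversion in this patch: every socket scan touched here uses the same RCU-protected idiom. Below is a minimal illustrative sketch of that idiom, built only from names that appear in the surrounding hunks; example_scan itself is hypothetical and the per-socket work is elided.

	/* Illustrative only, not part of the patch: visit every open Homa
	 * socket. rcu_read_lock() prevents sockets from being deleted
	 * while the scan is underway.
	 */
	static void example_scan(struct homa *homa)
	{
		struct homa_socktab_scan scan;
		struct homa_sock *hsk;

		rcu_read_lock();
		for (hsk = homa_socktab_start_scan(homa->port_map, &scan);
		     hsk != NULL; hsk = homa_socktab_next(&scan)) {
			/* ... per-socket work ... */
		}
		rcu_read_unlock();
	}

Note that with port_map now a pointer, callers pass homa->port_map where they previously passed &homa->port_map.)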
diff --git a/homa_outgoing.c b/homa_outgoing.c index 31affa15..61a3a112 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -6,6 +6,7 @@ #include "homa_impl.h" #include "homa_peer.h" +#include "homa_rpc.h" #include "homa_wire.h" /** diff --git a/homa_peer.c b/homa_peer.c index 8138f412..77362599 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -6,6 +6,7 @@ #include "homa_impl.h" #include "homa_peer.h" +#include "homa_rpc.h" /** * homa_peertab_init() - Constructor for homa_peertabs. diff --git a/homa_peer.h b/homa_peer.h index 688a6487..88955cb9 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -10,6 +10,8 @@ #include "homa_wire.h" #include "homa_sock.h" +struct homa_rpc; + /** * struct homa_dead_dst - Used to retain dst_entries that are no longer * needed, until it is safe to delete them (I'm not confident that the RCU diff --git a/homa_plumbing.c b/homa_plumbing.c index 9ead714e..e5b1673c 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -681,7 +681,7 @@ int homa_bind(struct socket *sock, struct sockaddr *addr, int addr_len) return -EINVAL; port = ntohs(addr_in->in6.sin6_port); } - return homa_sock_bind(&homa->port_map, hsk, port); + return homa_sock_bind(homa->port_map, hsk, port); } /** diff --git a/homa_pool.h b/homa_pool.h index b5ffdfe6..1b04b4fd 100644 --- a/homa_pool.h +++ b/homa_pool.h @@ -6,6 +6,8 @@ #ifndef _HOMA_POOL_H #define _HOMA_POOL_H +#include "homa_rpc.h" + /** * struct homa_bpage - Contains information about a single page in * a buffer pool. diff --git a/homa_rpc.c b/homa_rpc.c index aeaaea15..493a0133 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -218,7 +218,7 @@ void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, * out from under us. */ rcu_read_lock(); - hsk2 = homa_sock_find(&hsk->homa->port_map, server_port); + hsk2 = homa_sock_find(hsk->homa->port_map, server_port); if (!hsk2) goto done; } @@ -569,7 +569,7 @@ void homa_rpc_log_active(struct homa *homa, uint64_t id) pr_notice("Logging active Homa RPCs:\n"); rcu_read_lock(); - for (hsk = homa_socktab_start_scan(&homa->port_map, &scan); + for (hsk = homa_socktab_start_scan(homa->port_map, &scan); hsk != NULL; hsk = homa_socktab_next(&scan)) { if (list_empty(&hsk->active_rpcs) || hsk->shutdown) continue; @@ -646,7 +646,7 @@ void homa_rpc_log_active_tt(struct homa *homa, int freeze_count) homa_grant_log_tt(homa); tt_record("Logging active Homa RPCs:"); rcu_read_lock(); - for (hsk = homa_socktab_start_scan(&homa->port_map, &scan); + for (hsk = homa_socktab_start_scan(homa->port_map, &scan); hsk != NULL; hsk = homa_socktab_next(&scan)) { if (list_empty(&hsk->active_rpcs) || hsk->shutdown) continue; @@ -698,7 +698,7 @@ int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors) atomic_read(&homa->total_incoming)); *link_errors = 0; rcu_read_lock(); - for (hsk = homa_socktab_start_scan(&homa->port_map, &scan); + for (hsk = homa_socktab_start_scan(homa->port_map, &scan); hsk != NULL; hsk = homa_socktab_next(&scan)) { if (list_empty(&hsk->active_rpcs) || hsk->shutdown) continue; diff --git a/homa_rpc.h b/homa_rpc.h index 629a08f4..ce8ac920 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -510,4 +510,14 @@ static inline void homa_rpc_validate(struct homa_rpc *rpc) BUG(); } +/** + * homa_is_client(): returns true if we are the client for a particular RPC, + * false if we are the server. + * @id: Id of the RPC in question. 
+ */
+static inline bool homa_is_client(__u64 id)
+{
+	return (id & 1) == 0;
+}
+
 #endif /* _HOMA_RPC_H */
diff --git a/homa_sock.c b/homa_sock.c
index 90a29547..7f3dfeae 100644
--- a/homa_sock.c
+++ b/homa_sock.c
@@ -104,7 +104,7 @@ struct homa_sock *homa_socktab_next(struct homa_socktab_scan *scan)
  */
 void homa_sock_init(struct homa_sock *hsk, struct homa *homa)
 {
-	struct homa_socktab *socktab = &homa->port_map;
+	struct homa_socktab *socktab = homa->port_map;
 	int i;
 
 	spin_lock_bh(&socktab->write_lock);
@@ -189,9 +189,9 @@ void homa_sock_shutdown(struct homa_sock *hsk)
 	 * See sync.txt for additional information about locking.
 	 */
 	hsk->shutdown = true;
-	spin_lock_bh(&hsk->homa->port_map.write_lock);
+	spin_lock_bh(&hsk->homa->port_map->write_lock);
 	hlist_del_rcu(&hsk->socktab_links.hash_links);
-	spin_unlock_bh(&hsk->homa->port_map.write_lock);
+	spin_unlock_bh(&hsk->homa->port_map->write_lock);
 	homa_sock_unlock(hsk);
 
 	list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) {
diff --git a/homa_sock.h b/homa_sock.h
index bbf6047f..5704ec53 100644
--- a/homa_sock.h
+++ b/homa_sock.h
@@ -295,6 +295,22 @@ static inline void homa_sock_unlock(struct homa_sock *hsk)
 	spin_unlock_bh(&hsk->lock);
 }
 
+/**
+ * homa_port_hash() - Hash function for port numbers.
+ * @port: Port number being looked up.
+ *
+ * Return: The index of the bucket in which this port will be found (if
+ * it exists).
+ */
+static inline int homa_port_hash(__u16 port)
+{
+	/* We can use a really simple hash function here because client
+	 * port numbers are allocated sequentially and server port numbers
+	 * are unpredictable.
+	 */
+	return port & (HOMA_SOCKTAB_BUCKETS - 1);
+}
+
 /**
  * homa_client_rpc_bucket() - Find the bucket containing a given
  * client RPC.
@@ -375,4 +391,9 @@ static inline void homa_bucket_unlock(struct homa_rpc_bucket *bucket, __u64 id)
 	spin_unlock_bh(&bucket->lock);
 }
 
+static inline struct homa_sock *homa_sk(const struct sock *sk)
+{
+	return (struct homa_sock *)sk;
+}
+
 #endif /* _HOMA_SOCK_H */
\ No newline at end of file
diff --git a/homa_timer.c b/homa_timer.c
index b026e052..457053e4 100644
--- a/homa_timer.c
+++ b/homa_timer.c
@@ -6,6 +6,7 @@
 
 #include "homa_impl.h"
 #include "homa_peer.h"
+#include "homa_rpc.h"
 
 /**
  * homa_check_rpc() - Invoked for each RPC during each timer pass; does
@@ -198,7 +199,7 @@ void homa_timer(struct homa *homa)
 	 * below prevents sockets from being deleted during the scan.
 	 */
 	rcu_read_lock();
-	for (hsk = homa_socktab_start_scan(&homa->port_map, &scan);
+	for (hsk = homa_socktab_start_scan(homa->port_map, &scan);
 	     hsk != NULL; hsk = homa_socktab_next(&scan)) {
 		while (hsk->dead_skbs >= homa->dead_buffs_limit) {
 			/* If we get here, it means that homa_wait_for_message
diff --git a/homa_utils.c b/homa_utils.c
index d657ee43..7d233a28 100644
--- a/homa_utils.c
+++ b/homa_utils.c
@@ -6,6 +6,7 @@
 
 #include "homa_impl.h"
 #include "homa_peer.h"
+#include "homa_rpc.h"
 
 /* Core-specific information. NR_CPUS is an overestimate of the actual
 * number, but allows us to allocate the array statically.
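(The homa_utils.c hunk below completes the conversion by allocating port_map dynamically in homa_init() and freeing it in homa_destroy(). A minimal sketch of the resulting pairing follows; the NULL check and -ENOMEM return are illustrative additions that do not appear in the patch itself.

	/* In homa_init(): allocate the socket table, then initialize it. */
	homa->port_map = kmalloc(sizeof(*homa->port_map), GFP_KERNEL);
	if (!homa->port_map)
		return -ENOMEM;	/* hypothetical error path, not in the patch */
	homa_socktab_init(homa->port_map);

	/* In homa_destroy(): tear down contents before freeing the table. */
	homa_socktab_destroy(homa->port_map);
	kfree(homa->port_map);
)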
@@ -123,7 +124,8 @@ int homa_init(struct homa *homa) homa->throttle_min_bytes = 200; atomic_set(&homa->total_incoming, 0); homa->next_client_port = HOMA_MIN_DEFAULT_PORT; - homa_socktab_init(&homa->port_map); + homa->port_map = kmalloc(sizeof *homa->port_map, GFP_KERNEL); + homa_socktab_init(homa->port_map); homa->peers = kmalloc(sizeof *homa->peers, GFP_KERNEL); err = homa_peertab_init(homa->peers); if (err) { @@ -219,7 +221,8 @@ void homa_destroy(struct homa *homa) } /* The order of the following 2 statements matters! */ - homa_socktab_destroy(&homa->port_map); + homa_socktab_destroy(homa->port_map); + kfree(homa->port_map); homa_peertab_destroy(homa->peers); kfree(homa->peers); homa_skb_cleanup(homa); @@ -580,7 +583,7 @@ void homa_freeze_peers(struct homa *homa) struct homa_socktab_scan scan; /* Find a socket to use (any will do). */ - hsk = homa_socktab_start_scan(&homa->port_map, &scan); + hsk = homa_socktab_start_scan(homa->port_map, &scan); if (hsk == NULL) { tt_record("homa_freeze_peers couldn't find a socket"); return; diff --git a/homa_wire.h b/homa_wire.h index da9c41b4..3cc38417 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -476,4 +476,18 @@ struct ack_header { _Static_assert(sizeof(struct ack_header) <= HOMA_MAX_HEADER, "ack_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); +/** + * homa_local_id(): given an RPC identifier from an input packet (which + * is network-encoded), return the decoded id we should use for that + * RPC on this machine. + * @sender_id: RPC id from an incoming packet, such as h->common.sender_id + */ +static inline __u64 homa_local_id(__be64 sender_id) +{ + /* If the client bit was set on the sender side, it needs to be + * removed here, and conversely. + */ + return be64_to_cpu(sender_id) ^ 1; +} + #endif /* _HOMA_WIRE_H */ diff --git a/notes.txt b/notes.txt index 10c3d069..4109b406 100755 --- a/notes.txt +++ b/notes.txt @@ -1,25 +1,39 @@ Notes for Homa implementation in Linux: --------------------------------------- -* Ideas for making TCP and Homa play well together: +* Notes on splitting homa_impl.h: + * Remove homa_wire.h include from homa_impl.h? + * Where should homa_rpc_bucket be declared (currently in homa_impl.h)? + * Move homa_message_in_init to homa_rpc.c? + +* Thoughts on making TCP and Homa play better together: * Goals: - * Balance queue lengths for the protocols? + * Keep the NIC tx queue from growing long. + * Share bandwidth fairly between TCP and Homa * If one protocol is using a lot less bandwidth, give it preference for transmission? - * Keep track of recent bandwidth consumed by each protocol; when there - is overload, restrict each protocol to its fraction of recent bandwidth. - * "Consumed" has to be measured in terms of bytes offered, not bytes actually - transmitted (otherwise a protocol could get "stuck" at a low transmittion - rate?). - * Maybe use a mechanism like fair-share scheduling? Keep track of - recent usage by each protocol, give priority to the protocol that - used least bandwidth recently? - * Could this be implemented with a mechanism like a token bucket? - * Use a token bucket for each protocol with 50% of available bandwidth - (or maybe less?). Split any extra available bandwidth among the - protocols. Maybe adjust rates for the token buckets based on recent - traffic? - * Also consider the amount of data that has been "stuck" in the NIC? + * Balance queue lengths for the protocols? 
+  * Approach #1: analogous to "fair share" CPU scheduling
+    * Keep separate tx queues for Homa and TCP
+    * Try to equalize the queue lengths while pacing packets at
+      network speed?
+    * Keep track of lengths for each of many queues
+    * Each queue paces itself based on relative lengths
+    * Do all pacing centrally, call back to individual queues for output?
+  * Approach #2:
+    * Keep track of recent bandwidth consumed by each protocol; when there
+      is overload, restrict each protocol to its fraction of recent bandwidth.
+    * "Consumed" has to be measured in terms of bytes offered, not bytes
+      actually transmitted (otherwise a protocol could get "stuck" at a low
+      transmission rate?).
+  * Approach #3: token bucket
+    * Use a token bucket for each protocol with 50% of available bandwidth
+      (or maybe less?). Split any extra available bandwidth among the
+      protocols. Maybe adjust rates for the token buckets based on recent
+      traffic?
+    * Also consider the amount of data that is "stuck" in the NIC?
+
+* Notes on Linux qdiscs:
 
 * Remedies to consider for the performance problems at 100 Gbps, where
   one tx channel gets very backed up:
@@ -280,7 +294,6 @@ Notes for Homa implementation in Linux:
 * If there is an error in ip_queue_xmit, does it free the packet?
   * The answer appears to be "yes", and Homa contains code to check this
     and log if not.
- * How to compute the *real* number of CPUS (<< NR_CPUS?)
 
 * Is there a better way to compute packet hashes than Homa's approach
   in gro_complete?
@@ -395,3 +408,6 @@ Notes for Homa implementation in Linux:
     ip_input.c:     ip_rcv_finish
     ip_input.c:     dst_input
     homa_plumbing.c: homa_softirq
+
+
+ gcc -g -Wp,-MMD,/users/ouster/homaModule/.homa_offload.o.d -nostdinc -I./arch/x86/include -I./arch/x86/include/generated -I./include -I./arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I./include/uapi -I./include/generated/uapi -include ./include/linux/compiler-version.h -include ./include/linux/kconfig.h -include ./include/linux/compiler_types.h -D__KERNEL__ -fmacro-prefix-map=./= -std=gnu11 -fshort-wchar -funsigned-char -fno-common -fno-PIE -fno-strict-aliasing -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -fcf-protection=none -m64 -falign-jumps=1 -falign-loops=1 -mno-80387 -mno-fp-ret-in-387 -mpreferred-stack-boundary=3 -mskip-rax-setup -mtune=generic -mno-red-zone -mcmodel=kernel -Wno-sign-compare -fno-asynchronous-unwind-tables -mindirect-branch=thunk-extern -mindirect-branch-register -mindirect-branch-cs-prefix -mfunction-return=thunk-extern -fno-jump-tables -fpatchable-function-entry=16,16 -fno-delete-null-pointer-checks -O2 -fno-allow-store-data-races -fstack-protector-strong -fno-omit-frame-pointer -fno-optimize-sibling-calls -fno-stack-clash-protection -pg -mrecord-mcount -mfentry -DCC_USING_FENTRY -falign-functions=16 -fno-strict-overflow -fno-stack-check -fconserve-stack -Wall -Wundef -Werror=implicit-function-declaration -Werror=implicit-int -Werror=return-type -Werror=strict-prototypes -Wno-format-security -Wno-trigraphs -Wno-frame-address -Wno-address-of-packed-member -Wmissing-declarations -Wmissing-prototypes -Wframe-larger-than=2048 -Wno-main -Wvla -Wno-pointer-sign -Wcast-function-type -Wno-stringop-overflow -Wno-array-bounds -Wno-alloc-size-larger-than -Wimplicit-fallthrough=5 -Werror=date-time -Werror=incompatible-pointer-types -Werror=designated-init -Wenum-conversion -Wextra -Wunused -Wno-unused-but-set-variable -Wno-unused-const-variable -Wno-packed-not-aligned -Wno-format-overflow -Wno-format-truncation -Wno-stringop-truncation
-Wno-override-init -Wno-missing-field-initializers -Wno-type-limits -Wno-shift-negative-value -Wno-maybe-uninitialized -Wno-sign-compare -Wno-unused-parameter -g -gdwarf-4 -g -DMODULE -DKBUILD_BASENAME='"homa_offload"' -DKBUILD_MODNAME='"homa"' -D__KBUILD_MODNAME=kmod_homa -E -o /users/ouster/homaModule/homa_offload.e /users/ouster/homaModule/homa_offload.c ; ./tools/objtool/objtool --hacks=jump_label --hacks=noinstr --hacks=skylake --retpoline --rethunk --stackval --static-call --uaccess --prefix=16 --module /users/ouster/homaModule/homa_offload.o diff --git a/test/mock.c b/test/mock.c index 23e2f9c2..ce375e8a 100644 --- a/test/mock.c +++ b/test/mock.c @@ -1438,7 +1438,7 @@ void mock_sock_init(struct homa_sock *hsk, struct homa *homa, int port) if (port != 0) homa->next_client_port = saved_port; if (port < HOMA_MIN_DEFAULT_PORT) - homa_sock_bind(&homa->port_map, hsk, port); + homa_sock_bind(homa->port_map, hsk, port); hsk->inet.pinet6 = &hsk_pinfo; mock_mtu = UNIT_TEST_DATA_PER_PACKET + hsk->ip_header_length + sizeof(struct data_header); diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 6e8d51ab..59b910d8 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -3,6 +3,7 @@ */ #include "homa_impl.h" +#include "homa_rpc.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index 3b90788e..3531fb34 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -3,6 +3,7 @@ */ #include "homa_impl.h" +#include "homa_rpc.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index e7327040..6122853f 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -4,6 +4,7 @@ #include "homa_impl.h" #include "homa_peer.h" +#include "homa_rpc.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" @@ -464,7 +465,7 @@ TEST_F(homa_outgoing, homa_xmit_control__server_request) struct homa_rpc *srpc; struct grant_header h; - homa_sock_bind(&self->homa.port_map, &self->hsk, self->server_port); + homa_sock_bind(self->homa.port_map, &self->hsk, self->server_port); srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, self->server_id, 10000, 10000); diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index 5af0d545..7bd5cfdd 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -4,6 +4,7 @@ #include "homa_impl.h" #include "homa_peer.h" +#include "homa_rpc.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 11f89b12..abaf781d 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -67,7 +67,7 @@ FIXTURE_SETUP(homa_plumbing) self->server_addr.in4.sin_addr.s_addr = ipv6_to_ipv4(self->server_addr.in6.sin6_addr); } - homa_sock_bind(&self->homa.port_map, &self->hsk, self->server_port); + homa_sock_bind(self->homa.port_map, &self->hsk, self->server_port); self->data = (struct data_header){.common = { .sport = htons(self->client_port), .dport = htons(self->server_port), diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c index 5f3be0b5..e18a37fa 100644 --- a/test/unit_homa_sock.c +++ b/test/unit_homa_sock.c @@ -3,6 +3,7 @@ */ #include "homa_impl.h" +#include "homa_sock.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" @@ 
-50,7 +51,7 @@ TEST_F(homa_sock, homa_socktab_start_scan) homa_destroy(&self->homa); homa_init(&self->homa); mock_sock_init(&self->hsk, &self->homa, HOMA_MIN_DEFAULT_PORT+100); - EXPECT_EQ(&self->hsk, homa_socktab_start_scan(&self->homa.port_map, + EXPECT_EQ(&self->hsk, homa_socktab_start_scan(self->homa.port_map, &scan)); EXPECT_EQ(100, scan.current_bucket); } @@ -66,7 +67,7 @@ TEST_F(homa_sock, homa_socktab_next__basics) mock_sock_init(&hsk2, &self->homa, first_port+HOMA_SOCKTAB_BUCKETS); mock_sock_init(&hsk3, &self->homa, first_port+2*HOMA_SOCKTAB_BUCKETS); mock_sock_init(&hsk4, &self->homa, first_port+5); - hsk = homa_socktab_start_scan(&self->homa.port_map, &scan); + hsk = homa_socktab_start_scan(self->homa.port_map, &scan); EXPECT_EQ(first_port+2*HOMA_SOCKTAB_BUCKETS, hsk->port); hsk = homa_socktab_next(&scan); EXPECT_EQ(first_port+HOMA_SOCKTAB_BUCKETS, hsk->port); @@ -91,7 +92,7 @@ TEST_F(homa_sock, homa_socktab_next__deleted_socket) mock_sock_init(&hsk1, &self->homa, first_port); mock_sock_init(&hsk2, &self->homa, first_port+HOMA_SOCKTAB_BUCKETS); mock_sock_init(&hsk3, &self->homa, first_port+2*HOMA_SOCKTAB_BUCKETS); - hsk = homa_socktab_start_scan(&self->homa.port_map, &scan); + hsk = homa_socktab_start_scan(self->homa.port_map, &scan); EXPECT_EQ(first_port+2*HOMA_SOCKTAB_BUCKETS, hsk->port); homa_sock_destroy(&hsk2); hsk = homa_socktab_next(&scan); @@ -146,26 +147,26 @@ TEST_F(homa_sock, homa_sock_shutdown__basics) int client2, client3; struct homa_sock hsk2, hsk3; mock_sock_init(&hsk2, &self->homa, 0); - EXPECT_EQ(0, homa_sock_bind(&self->homa.port_map, &hsk2, 100)); + EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk2, 100)); client2 = hsk2.port; mock_sock_init(&hsk3, &self->homa, 0); client3 = hsk3.port; - EXPECT_EQ(&hsk2, homa_sock_find(&self->homa.port_map, client2)); - EXPECT_EQ(&hsk2, homa_sock_find(&self->homa.port_map, 100)); - EXPECT_EQ(&hsk3, homa_sock_find(&self->homa.port_map, client3)); + EXPECT_EQ(&hsk2, homa_sock_find(self->homa.port_map, client2)); + EXPECT_EQ(&hsk2, homa_sock_find(self->homa.port_map, 100)); + EXPECT_EQ(&hsk3, homa_sock_find(self->homa.port_map, client3)); homa_sock_shutdown(&hsk2); - EXPECT_EQ(NULL, homa_sock_find(&self->homa.port_map, client2)); - EXPECT_EQ(NULL, homa_sock_find(&self->homa.port_map, 100)); - EXPECT_EQ(&hsk3, homa_sock_find(&self->homa.port_map, client3)); + EXPECT_EQ(NULL, homa_sock_find(self->homa.port_map, client2)); + EXPECT_EQ(NULL, homa_sock_find(self->homa.port_map, 100)); + EXPECT_EQ(&hsk3, homa_sock_find(self->homa.port_map, client3)); homa_sock_shutdown(&hsk3); - EXPECT_EQ(NULL, homa_sock_find(&self->homa.port_map, client2)); - EXPECT_EQ(NULL, homa_sock_find(&self->homa.port_map, 100)); - EXPECT_EQ(NULL, homa_sock_find(&self->homa.port_map, client3)); + EXPECT_EQ(NULL, homa_sock_find(self->homa.port_map, client2)); + EXPECT_EQ(NULL, homa_sock_find(self->homa.port_map, 100)); + EXPECT_EQ(NULL, homa_sock_find(self->homa.port_map, client3)); } TEST_F(homa_sock, homa_sock_shutdown__already_shutdown) { @@ -218,32 +219,32 @@ TEST_F(homa_sock, homa_sock_bind) { struct homa_sock hsk2; mock_sock_init(&hsk2, &self->homa, 0); - EXPECT_EQ(0, homa_sock_bind(&self->homa.port_map, &hsk2, 100)); + EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk2, 100)); - EXPECT_EQ(0, -homa_sock_bind(&self->homa.port_map, &self->hsk, 0)); + EXPECT_EQ(0, -homa_sock_bind(self->homa.port_map, &self->hsk, 0)); EXPECT_EQ(HOMA_MIN_DEFAULT_PORT, self->hsk.port); - EXPECT_EQ(EINVAL, -homa_sock_bind(&self->homa.port_map, &self->hsk, + 
EXPECT_EQ(EINVAL, -homa_sock_bind(self->homa.port_map, &self->hsk, HOMA_MIN_DEFAULT_PORT + 100)); - EXPECT_EQ(EADDRINUSE, -homa_sock_bind(&self->homa.port_map, &self->hsk, + EXPECT_EQ(EADDRINUSE, -homa_sock_bind(self->homa.port_map, &self->hsk, 100)); - EXPECT_EQ(0, -homa_sock_bind(&self->homa.port_map, &hsk2, + EXPECT_EQ(0, -homa_sock_bind(self->homa.port_map, &hsk2, 100)); - EXPECT_EQ(0, -homa_sock_bind(&self->homa.port_map, &self->hsk, + EXPECT_EQ(0, -homa_sock_bind(self->homa.port_map, &self->hsk, 110)); - EXPECT_EQ(&self->hsk, homa_sock_find(&self->homa.port_map, 110)); - EXPECT_EQ(0, -homa_sock_bind(&self->homa.port_map, &self->hsk, + EXPECT_EQ(&self->hsk, homa_sock_find(self->homa.port_map, 110)); + EXPECT_EQ(0, -homa_sock_bind(self->homa.port_map, &self->hsk, 120)); - EXPECT_EQ(NULL, homa_sock_find(&self->homa.port_map, 110)); - EXPECT_EQ(&self->hsk, homa_sock_find(&self->homa.port_map, 120)); + EXPECT_EQ(NULL, homa_sock_find(self->homa.port_map, 110)); + EXPECT_EQ(&self->hsk, homa_sock_find(self->homa.port_map, 120)); homa_sock_destroy(&hsk2); } TEST_F(homa_sock, homa_sock_bind__socket_shutdown) { homa_sock_shutdown(&self->hsk); - EXPECT_EQ(ESHUTDOWN, -homa_sock_bind(&self->homa.port_map, &self->hsk, + EXPECT_EQ(ESHUTDOWN, -homa_sock_bind(self->homa.port_map, &self->hsk, 100)); } @@ -251,12 +252,12 @@ TEST_F(homa_sock, homa_sock_find__basics) { struct homa_sock hsk2; mock_sock_init(&hsk2, &self->homa, 0); - EXPECT_EQ(0, homa_sock_bind(&self->homa.port_map, &hsk2, 100)); - EXPECT_EQ(&self->hsk, homa_sock_find(&self->homa.port_map, + EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk2, 100)); + EXPECT_EQ(&self->hsk, homa_sock_find(self->homa.port_map, self->hsk.port)); - EXPECT_EQ(&hsk2, homa_sock_find(&self->homa.port_map, + EXPECT_EQ(&hsk2, homa_sock_find(self->homa.port_map, hsk2.port)); - EXPECT_EQ(NULL, homa_sock_find(&self->homa.port_map, + EXPECT_EQ(NULL, homa_sock_find(self->homa.port_map, hsk2.port + 1)); homa_sock_destroy(&hsk2); } @@ -264,24 +265,24 @@ TEST_F(homa_sock, homa_sock_find__basics) TEST_F(homa_sock, homa_sock_find__long_hash_chain) { struct homa_sock hsk2, hsk3, hsk4; - EXPECT_EQ(0, homa_sock_bind(&self->homa.port_map, &self->hsk, 13)); + EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &self->hsk, 13)); mock_sock_init(&hsk2, &self->homa, 0); - EXPECT_EQ(0, homa_sock_bind(&self->homa.port_map, &hsk2, + EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk2, 2*HOMA_SOCKTAB_BUCKETS + 13)); mock_sock_init(&hsk3, &self->homa, 0); - EXPECT_EQ(0, homa_sock_bind(&self->homa.port_map, &hsk3, + EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk3, 3*HOMA_SOCKTAB_BUCKETS + 13)); mock_sock_init(&hsk4, &self->homa, 0); - EXPECT_EQ(0, homa_sock_bind(&self->homa.port_map, &hsk4, + EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk4, 5*HOMA_SOCKTAB_BUCKETS + 13)); - EXPECT_EQ(&self->hsk, homa_sock_find(&self->homa.port_map, + EXPECT_EQ(&self->hsk, homa_sock_find(self->homa.port_map, 13)); - EXPECT_EQ(&hsk2, homa_sock_find(&self->homa.port_map, + EXPECT_EQ(&hsk2, homa_sock_find(self->homa.port_map, 2*HOMA_SOCKTAB_BUCKETS + 13)); - EXPECT_EQ(&hsk3, homa_sock_find(&self->homa.port_map, + EXPECT_EQ(&hsk3, homa_sock_find(self->homa.port_map, 3*HOMA_SOCKTAB_BUCKETS + 13)); - EXPECT_EQ(&hsk4, homa_sock_find(&self->homa.port_map, + EXPECT_EQ(&hsk4, homa_sock_find(self->homa.port_map, 5*HOMA_SOCKTAB_BUCKETS + 13)); homa_sock_destroy(&hsk2); diff --git a/test/unit_homa_timer.c b/test/unit_homa_timer.c index 216ff473..090754aa 100644 --- a/test/unit_homa_timer.c +++ 
b/test/unit_homa_timer.c @@ -4,6 +4,7 @@ #include "homa_impl.h" #include "homa_peer.h" +#include "homa_rpc.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" diff --git a/test/utils.c b/test/utils.c index a684d4cb..3a17958d 100644 --- a/test/utils.c +++ b/test/utils.c @@ -8,6 +8,7 @@ #include "homa_impl.h" #include "homa_peer.h" +#include "homa_rpc.h" #include "ccutils.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" diff --git a/test/utils.h b/test/utils.h index 988c7785..5825bbc7 100644 --- a/test/utils.h +++ b/test/utils.h @@ -4,6 +4,8 @@ /* Utility functions for unit tests, implemented in C. */ +struct homa_message_out; +struct homa_rpc; struct unit_hash; /** From 42452fda917f2d412c437def81376a8618d27fce Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 2 Oct 2024 16:51:37 -0700 Subject: [PATCH 032/625] Extract homa_skb.h from homa_impl.h Also, create separate homa_skb_core struct for core-specific data related to sk_buff management. --- homa_impl.h | 127 +++----------------- homa_outgoing.c | 1 + homa_rpc.c | 1 + homa_skb.c | 183 ++++++++++++++++++----------- homa_skb.h | 124 ++++++++++++++++++++ homa_timer.c | 1 + homa_utils.c | 47 +------- test/mock.c | 1 + test/unit_homa_outgoing.c | 1 + test/unit_homa_skb.c | 240 +++++++++++++++++++++----------------- 10 files changed, 389 insertions(+), 337 deletions(-) create mode 100644 homa_skb.h diff --git a/homa_impl.h b/homa_impl.h index c343aa69..951a2756 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -141,8 +141,6 @@ extern int homa_grantable_lock_slow(struct homa *homa, int recalc); extern void homa_throttle_lock_slow(struct homa *homa); extern struct homa_core *homa_cores[]; -extern struct homa_numa *homa_numas[]; -extern int homa_num_numas; #define sizeof32(type) ((int) (sizeof(type))) @@ -156,18 +154,6 @@ extern int homa_num_numas; */ #define HOMA_MAX_GRANTS 10 -/** - * define HOMA_PAGE_ORDER: power-of-two exponent determining how - * many pages to allocate in a high-order page for skb pages (e.g., - * 2 means allocate in units of 4 pages). - */ -#define HOMA_SKB_PAGE_ORDER 4 - -/** - * define HOMA_PAGE_SIZE: number of bytes corresponding to HOMA_PAGE_ORDER. - */ -#define HOMA_SKB_PAGE_SIZE (PAGE_SIZE << HOMA_SKB_PAGE_ORDER) - /** * struct homa_cache_line - An object whose size equals that of a cache line. */ @@ -256,31 +242,6 @@ enum homa_freeze_type { NEED_ACK_MISSING_DATA = 6, }; -/** - * struct homa_page_pool - A cache of free pages available for use in tx skbs. - * Each page is of size HOMA_SKB_PAGE_SIZE, and a pool is dedicated for - * use by a single NUMA node. Access to these objects is synchronized with - * @homa->page_pool_mutex. - */ -struct homa_page_pool { - /** @avail: Number of free pages currently in the pool. */ - int avail; - - /** - * @low_mark: Low water mark: smallest value of avail since the - * last time homa_skb_release_pages reset it. - */ - int low_mark; - -#define HOMA_PAGE_POOL_SIZE 1000 - - /** - * @pages: Pointers to pages that are currently free; the ref count - * is 1 in each of these pages. - */ - struct page *pages[HOMA_PAGE_POOL_SIZE]; -}; - /** * struct homa - Overall information about the Homa protocol implementation. * @@ -498,6 +459,15 @@ struct homa { */ spinlock_t page_pool_mutex __aligned(CACHE_LINE_SIZE); + /** + * @page_pools: One page pool for each NUMA node on the machine. + * If there are no cores for node, then this value is NULL. 
+ */ + struct homa_page_pool *page_pools[MAX_NUMNODES]; + + /** @max_numa: Highest NUMA node id in use by any core. */ + int max_numa; + /** * @skb_page_frees_per_sec: Rate at which to return pages from sk_buff * page pools back to Linux. This is the total rate across all pools. @@ -908,24 +878,13 @@ struct homa { int temp[4]; }; - -/** - * struct homa_numa - Homa allocates one of these structures for each - * NUMA node, for information that needs to be kept separately for each - * NUMA node. - */ -struct homa_numa { - /** Used to speed up allocation of tx skbs for cores in this node. */ - struct homa_page_pool page_pool; -}; - /** * struct homa_core - Homa allocates one of these structures for each * core, to hold information that needs to be kept on a per-core basis. */ struct homa_core { - /** Information about the NUMA node to which this node belongs. */ - struct homa_numa *numa; + /** NUMA-specific page pool from which to allocate skb pages. */ + struct homa_page_pool *pool; /** * @last_active: the last time (in get_cycle() units) that @@ -998,49 +957,12 @@ struct homa_core { * is NULL. */ __u64 syscall_end_time; - - /** - * @skb_page: a page of data available being used for skb frags. - * This pointer is included in the page's reference count. - */ - struct page *skb_page; - - /** - * @page_inuse: offset of first byte in @skb_page that hasn't already - * been allocated. - */ - int page_inuse; - - /** @page_size: total number of bytes available in @skb_page. */ - int page_size; - - /** - * define HOMA_MAX_STASHED: maximum number of stashed pages that - * can be consumed by a message of a given size (assumes page_inuse - * is 0). This is a rough guess, since it doesn't consider all of - * the data_segments that will be needed for the packets. - */ -#define HOMA_MAX_STASHED(size) (((size - 1) / HOMA_SKB_PAGE_SIZE) + 1) - - /** - * @num_stashed_pages: number of pages currently available in - * stashed_pages. - */ - int num_stashed_pages; - - /** - * @stashed_pages: use to prefetch from the cache all of the pages a - * message will need with a single operation, to avoid having to - * synchronize separately for each page. Note: these pages are all - * HOMA_SKB_PAGE_SIZE in length. - */ - struct page *stashed_pages[HOMA_MAX_STASHED(HOMA_MAX_MESSAGE_LENGTH)]; }; /** * struct homa_skb_info - Additional information needed by Homa for each - * DATA packet. Space is allocated for this at the very end of the linear - * part of the skb. + * outbound DATA packet. Space is allocated for this at the very end of the + * linear part of the skb. 
*/ struct homa_skb_info { /** @@ -1397,29 +1319,6 @@ extern int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); extern int homa_setsockopt(struct sock *sk, int level, int optname, sockptr_t __user optval, unsigned int optlen); extern int homa_shutdown(struct socket *sock, int how); -extern int homa_skb_append_from_iter(struct homa *homa, - struct sk_buff *skb, struct iov_iter *iter, int length); -extern int homa_skb_append_from_skb(struct homa *homa, - struct sk_buff *dst_skb, struct sk_buff *src_skb, - int offset, int length); -extern int homa_skb_append_to_frag(struct homa *homa, struct sk_buff *skb, - void *buf, int length); -extern void homa_skb_cache_pages(struct homa *homa, struct page **pages, - int count); -extern void homa_skb_cleanup(struct homa *homa); -extern void *homa_skb_extend_frags(struct homa *homa, struct sk_buff *skb, - int *length); -extern void homa_skb_free_tx(struct homa *homa, struct sk_buff *skb); -extern void homa_skb_free_many_tx(struct homa *homa, struct sk_buff **skbs, - int count); -extern void homa_skb_get(struct sk_buff *skb, void *dest, int offset, - int length); -extern struct sk_buff - *homa_skb_new_tx(int length); -extern bool homa_skb_page_alloc(struct homa *homa, struct homa_core *core); -extern void homa_skb_page_pool_init(struct homa_page_pool *pool); -extern void homa_skb_release_pages(struct homa *homa); -extern void homa_skb_stash_pages(struct homa *homa, int length); extern int homa_snprintf(char *buffer, int size, int used, const char *format, ...) __printf(4, 5); extern int homa_softirq(struct sk_buff *skb); diff --git a/homa_outgoing.c b/homa_outgoing.c index 61a3a112..686c27a1 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -7,6 +7,7 @@ #include "homa_impl.h" #include "homa_peer.h" #include "homa_rpc.h" +#include "homa_skb.h" #include "homa_wire.h" /** diff --git a/homa_rpc.c b/homa_rpc.c index 493a0133..dea4d769 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -5,6 +5,7 @@ #include "homa_impl.h" #include "homa_peer.h" #include "homa_pool.h" +#include "homa_skb.h" /** * homa_rpc_new_client() - Allocate and construct a client RPC (one that is used diff --git a/homa_skb.c b/homa_skb.c index d3a40f1e..2bc9dc1c 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -1,8 +1,14 @@ // SPDX-License-Identifier: BSD-2-Clause -/* This file contains functions for allocating and freeing sk_buffs. */ +/* This file contains functions for allocating and freeing sk_buffs. In + * particular, this file implements efficient management of the memory used + * by sk_buffs. + */ #include "homa_impl.h" +#include "homa_skb.h" + +DEFINE_PER_CPU(struct homa_skb_core, homa_skb_core); #ifdef __UNIT_TEST__ extern int mock_max_skb_frags; @@ -17,15 +23,42 @@ static inline void frag_page_set(skb_frag_t *frag, struct page *page) } /** - * homa_skb_page_pool_init() - Invoked when a struct homa is created to - * initialize a page pool. - * @pool: Pool to initialize. + * homa_skb_init() - Invoked when a struct homa is created to initialize + * information related to sk_buff management. 
  */
-void homa_skb_page_pool_init(struct homa_page_pool *pool)
+void homa_skb_init(struct homa *homa)
 {
-	pool->avail = 0;
-	pool->low_mark = 0;
-	memset(pool->pages, 0, sizeof(pool->pages));
+	int i;
+
+	spin_lock_init(&homa->page_pool_mutex);
+	memset(homa->page_pools, 0, sizeof(homa->page_pools));
+	homa->skb_page_frees_per_sec = 1000;
+	homa->skb_pages_to_free = NULL;
+	homa->pages_to_free_slots = 0;
+	homa->skb_page_free_time = 0;
+	homa->skb_page_pool_min_kb = (3*HOMA_MAX_MESSAGE_LENGTH)/1000;
+
+	/* Initialize NUMA-specific page pools. */
+	homa->max_numa = -1;
+	for (i = 0; i < nr_cpu_ids; i++) {
+		struct homa_skb_core *skb_core = &per_cpu(homa_skb_core, i);
+		int numa = cpu_to_node(i);
+		BUG_ON(numa >= MAX_NUMNODES);
+
+		if (numa > homa->max_numa)
+			homa->max_numa = numa;
+		if (homa->page_pools[numa] == NULL) {
+			struct homa_page_pool *pool;
+
+			pool = kmalloc(sizeof(*pool), GFP_KERNEL);
+			pool->avail = 0;
+			pool->low_mark = 0;
+			memset(pool->pages, 0, sizeof(pool->pages));
+			homa->page_pools[numa] = pool;
+		}
+		skb_core->pool = homa->page_pools[numa];
+	}
+	pr_notice("homa_skb_init found max NUMA node %d\n", homa->max_numa);
 }
 
 /**
@@ -38,27 +71,30 @@ void homa_skb_cleanup(struct homa *homa)
 	int i, j;
 
 	for (i = 0; i < nr_cpu_ids; i++) {
-		struct homa_core *core = homa_cores[i];
+		struct homa_skb_core *skb_core = &per_cpu(homa_skb_core, i);
 
-		if (core->skb_page != NULL) {
-			put_page(core->skb_page);
-			core->skb_page = NULL;
-			core->page_size = 0;
-			core->page_inuse = 0;
+		if (skb_core->skb_page != NULL) {
+			put_page(skb_core->skb_page);
+			skb_core->skb_page = NULL;
+			skb_core->page_size = 0;
+			skb_core->page_inuse = 0;
 		}
-		for (j = 0; j < core->num_stashed_pages; j++)
-			put_page(core->stashed_pages[j]);
-		core->num_stashed_pages = 0;
+		for (j = 0; j < skb_core->num_stashed_pages; j++)
+			put_page(skb_core->stashed_pages[j]);
+		skb_core->pool = NULL;
+		skb_core->num_stashed_pages = 0;
 	}
 
 	for (i = 0; i < MAX_NUMNODES; i++) {
-		struct homa_numa *numa = homa_numas[i];
+		struct homa_page_pool *pool = homa->page_pools[i];
 
-		if (!numa)
+		if (pool == NULL)
 			continue;
-		for (j = numa->page_pool.avail - 1; j >= 0; j--)
-			put_page(numa->page_pool.pages[j]);
-		numa->page_pool.avail = 0;
+		for (j = pool->avail - 1; j >= 0; j--)
+			put_page(pool->pages[j]);
+		pool->avail = 0;
+		kfree(pool);
+		homa->page_pools[i] = NULL;
 	}
 
 	if (homa->skb_pages_to_free != NULL) {
@@ -112,20 +148,21 @@ struct sk_buff *homa_skb_new_tx(int length)
  */
 void homa_skb_stash_pages(struct homa *homa, int length)
 {
-	struct homa_core *core = homa_cores[raw_smp_processor_id()];
-	struct homa_page_pool *pool = &core->numa->page_pool;
+	struct homa_skb_core *skb_core = &per_cpu(homa_skb_core,
+			raw_smp_processor_id());
+	struct homa_page_pool *pool = skb_core->pool;
 	int pages_needed = HOMA_MAX_STASHED(length);
 
-	if ((pages_needed < 2) || (core->num_stashed_pages >= pages_needed))
+	if ((pages_needed < 2) || (skb_core->num_stashed_pages >= pages_needed))
 		return;
 	spin_lock_bh(&homa->page_pool_mutex);
-	while (pool->avail && (core->num_stashed_pages < pages_needed)) {
+	while (pool->avail && (skb_core->num_stashed_pages < pages_needed)) {
 		pool->avail--;
 		if (pool->avail < pool->low_mark)
 			pool->low_mark = pool->avail;
-		core->stashed_pages[core->num_stashed_pages] =
+		skb_core->stashed_pages[skb_core->num_stashed_pages] =
 			pool->pages[pool->avail];
-		core->num_stashed_pages++;
+		skb_core->num_stashed_pages++;
 	}
 	spin_unlock_bh(&homa->page_pool_mutex);
 }
@@ -144,43 +181,44 @@ void homa_skb_stash_pages(struct homa *homa, int length)
 void 
*homa_skb_extend_frags(struct homa *homa, struct sk_buff *skb, int *length) { struct skb_shared_info *shinfo = skb_shinfo(skb); - struct homa_core *core = homa_cores[raw_smp_processor_id()]; + struct homa_skb_core *skb_core = &per_cpu(homa_skb_core, + raw_smp_processor_id()); skb_frag_t *frag = &shinfo->frags[shinfo->nr_frags - 1]; char *result; int actual_size = *length; /* Can we just extend the skb's last fragment? */ - if ((shinfo->nr_frags > 0) && (skb_frag_page(frag) == core->skb_page) - && (core->page_inuse < core->page_size) + if ((shinfo->nr_frags > 0) && (skb_frag_page(frag) == skb_core->skb_page) + && (skb_core->page_inuse < skb_core->page_size) && ((frag->offset + skb_frag_size(frag)) - == core->page_inuse)) { - if ((core->page_size - core->page_inuse) < actual_size) - actual_size = core->page_size - core->page_inuse; + == skb_core->page_inuse)) { + if ((skb_core->page_size - skb_core->page_inuse) < actual_size) + actual_size = skb_core->page_size - skb_core->page_inuse; *length = actual_size; skb_frag_size_add(frag, actual_size); - result = page_address(skb_frag_page(frag)) + core->page_inuse; - core->page_inuse += actual_size; + result = page_address(skb_frag_page(frag)) + skb_core->page_inuse; + skb_core->page_inuse += actual_size; skb_len_add(skb, actual_size); return result; } /* Need to add a new fragment to the skb. */ - core->page_inuse = ALIGN(core->page_inuse, SMP_CACHE_BYTES); - if (core->page_inuse >= core->page_size) { - if (!homa_skb_page_alloc(homa, core)) + skb_core->page_inuse = ALIGN(skb_core->page_inuse, SMP_CACHE_BYTES); + if (skb_core->page_inuse >= skb_core->page_size) { + if (!homa_skb_page_alloc(homa, skb_core)) return NULL; } - if ((core->page_size - core->page_inuse) < actual_size) - actual_size = core->page_size - core->page_inuse; + if ((skb_core->page_size - skb_core->page_inuse) < actual_size) + actual_size = skb_core->page_size - skb_core->page_inuse; frag = &shinfo->frags[shinfo->nr_frags]; shinfo->nr_frags++; - frag_page_set(frag, core->skb_page); - get_page(core->skb_page); - frag->offset = core->page_inuse; + frag_page_set(frag, skb_core->skb_page); + get_page(skb_core->skb_page); + frag->offset = skb_core->page_inuse; *length = actual_size; skb_frag_size_set(frag, actual_size); - result = page_address(skb_frag_page(frag)) + core->page_inuse; - core->page_inuse += actual_size; + result = page_address(skb_frag_page(frag)) + skb_core->page_inuse; + skb_core->page_inuse += actual_size; skb_len_add(skb, actual_size); return result; } @@ -189,45 +227,46 @@ void *homa_skb_extend_frags(struct homa *homa, struct sk_buff *skb, int *length) * homa_skb_page_alloc() - Allocate a new page for skb allocation for a * given core. Any existing page is released. * @homa: Overall data about the Homa protocol implementation. - * @core: Allocate page in this core. + * @skb_core: Core-specific info; the page will be allocated in this core. * Return: True if successful, false if memory not available. */ -bool homa_skb_page_alloc(struct homa *homa, struct homa_core *core) +bool homa_skb_page_alloc(struct homa *homa, struct homa_skb_core *skb_core) { struct homa_page_pool *pool; __u64 start; - if (core->skb_page) { - if (page_ref_count(core->skb_page) == 1) { + if (skb_core->skb_page) { + if (page_ref_count(skb_core->skb_page) == 1) { /* The existing page is no longer in use, so we can * reuse it. 
			 */
-			core->page_inuse = 0;
+			skb_core->page_inuse = 0;
 			goto success;
 		}
-		put_page(core->skb_page);
+		put_page(skb_core->skb_page);
 	}
 
 	/* Step 1: does this core have a stashed page? */
-	core->page_size = HOMA_SKB_PAGE_SIZE;
-	core->page_inuse = 0;
-	if (core->num_stashed_pages > 0) {
-		core->num_stashed_pages--;
-		core->skb_page = core->stashed_pages[core->num_stashed_pages];
+	skb_core->page_size = HOMA_SKB_PAGE_SIZE;
+	skb_core->page_inuse = 0;
+	if (skb_core->num_stashed_pages > 0) {
+		skb_core->num_stashed_pages--;
+		skb_core->skb_page = skb_core->stashed_pages[
+			skb_core->num_stashed_pages];
 		goto success;
 	}
 
	/* Step 2: can we retrieve a page from the pool for this NUMA node? */
-	pool = &core->numa->page_pool;
+	pool = skb_core->pool;
 	if (pool->avail) {
-		struct homa_page_pool *pool = &core->numa->page_pool;
-
 		spin_lock_bh(&homa->page_pool_mutex);
+
+		/* Must recheck: the pool could have changed before we
+		 * acquired the lock.
+		 */
 		if (pool->avail) {
 			pool->avail--;
 			if (pool->avail < pool->low_mark)
 				pool->low_mark = pool->avail;
-			core->skb_page = pool->pages[pool->avail];
+			skb_core->skb_page = pool->pages[pool->avail];
 			spin_unlock_bh(&homa->page_pool_mutex);
 			goto success;
 		}
@@ -237,21 +276,21 @@ bool homa_skb_page_alloc(struct homa *homa, struct homa_core *core)
 	/* Step 3: can we allocate a new big page? */
 	INC_METRIC(skb_page_allocs, 1);
 	start = get_cycles();
-	core->skb_page = alloc_pages((GFP_KERNEL & ~__GFP_RECLAIM) | __GFP_COMP
+	skb_core->skb_page = alloc_pages((GFP_KERNEL & ~__GFP_RECLAIM) | __GFP_COMP
 			| __GFP_NOWARN | __GFP_NORETRY, HOMA_SKB_PAGE_ORDER);
-	if (likely(core->skb_page)) {
+	if (likely(skb_core->skb_page)) {
 		INC_METRIC(skb_page_alloc_cycles, get_cycles() - start);
 		goto success;
 	}
 
 	/* Step 4: can we allocate a normal page? */
-	core->skb_page = alloc_page(GFP_KERNEL);
+	skb_core->skb_page = alloc_page(GFP_KERNEL);
 	INC_METRIC(skb_page_alloc_cycles, get_cycles() - start);
-	if (likely(core->skb_page)) {
-		core->page_size = PAGE_SIZE;
+	if (likely(skb_core->skb_page)) {
+		skb_core->page_size = PAGE_SIZE;
 		goto success;
 	}
-	core->page_size = core->page_inuse = 0;
+	skb_core->page_size = skb_core->page_inuse = 0;
 	return false;
 
 success:
@@ -465,8 +504,8 @@ void homa_skb_cache_pages(struct homa *homa, struct page **pages, int count)
 	spin_lock_bh(&homa->page_pool_mutex);
 	for (i = 0; i < count; i++) {
 		struct page *page = pages[i];
-		struct homa_page_pool *pool =
-			&homa_numas[page_to_nid(page)]->page_pool;
+		struct homa_page_pool *pool = homa->page_pools[
+				page_to_nid(page)];
 
 		if (pool->avail < LIMIT) {
 			pool->pages[pool->avail] = page;
 			pool->avail++;
@@ -554,9 +593,11 @@ void homa_skb_release_pages(struct homa *homa)
 	/* Find the pool with the largest low-water mark. */
 	max_low_mark = -1;
 	spin_lock_bh(&homa->page_pool_mutex);
-	for (i = 0; i < homa_num_numas; i++) {
-		struct homa_page_pool *pool = &homa_numas[i]->page_pool;
+	for (i = 0; i <= homa->max_numa; i++) {
+		struct homa_page_pool *pool = homa->page_pools[i];
 
+		if (pool == NULL)
+			continue;
 		if (pool->low_mark > max_low_mark) {
 			max_low_mark = pool->low_mark;
 			max_pool = pool;
diff --git a/homa_skb.h b/homa_skb.h
new file mode 100644
index 00000000..c613c717
--- /dev/null
+++ b/homa_skb.h
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: BSD-2-Clause */
+
+/* This file contains definitions related to efficient management of
+ * memory associated with transmit sk_buffs.
+ */
+
+#ifndef _HOMA_SKB_H
+#define _HOMA_SKB_H
+
+#include 
+
+/**
+ * define HOMA_SKB_PAGE_ORDER: power-of-two exponent determining how
+ * many pages to allocate in a high-order page for skb pages (e.g.,
+ * 2 means allocate in units of 4 pages).
+ */
+#define HOMA_SKB_PAGE_ORDER 4
+
+/**
+ * define HOMA_SKB_PAGE_SIZE: number of bytes corresponding to
+ * HOMA_SKB_PAGE_ORDER.
+ */
+#define HOMA_SKB_PAGE_SIZE (PAGE_SIZE << HOMA_SKB_PAGE_ORDER)
+
+/**
+ * struct homa_page_pool - A cache of free pages available for use in tx skbs.
+ * Each page is of size HOMA_SKB_PAGE_SIZE, and a pool is dedicated for
+ * use by a single NUMA node. Access to these objects is synchronized with
+ * @homa->page_pool_mutex.
+ */
+struct homa_page_pool {
+	/** @avail: Number of free pages currently in the pool. */
+	int avail;
+
+	/**
+	 * @low_mark: Low water mark: smallest value of avail since the
+	 * last time homa_skb_release_pages reset it.
+	 */
+	int low_mark;
+
+#define HOMA_PAGE_POOL_SIZE 1000
+
+	/**
+	 * @pages: Pointers to pages that are currently free; the ref count
+	 * is 1 in each of these pages.
+	 */
+	struct page *pages[HOMA_PAGE_POOL_SIZE];
+};
+
+/**
+ * struct homa_skb_core - Stores core-specific information related to
+ * sk_buff allocation. All values are assumed to be zero initially.
+ */
+struct homa_skb_core {
+	/**
+	 * @pool: NUMA-specific page pool from which to allocate skb pages
+	 * for this core.
+	 */
+	struct homa_page_pool *pool;
+
+	/**
+	 * @skb_page: a page of data currently being used for skb frags.
+	 * This pointer is included in the page's reference count.
+	 */
+	struct page *skb_page;
+
+	/**
+	 * @page_inuse: offset of first byte in @skb_page that hasn't already
+	 * been allocated.
+	 */
+	int page_inuse;
+
+	/** @page_size: total number of bytes available in @skb_page. */
+	int page_size;
+
+	/**
+	 * define HOMA_MAX_STASHED: maximum number of stashed pages that
+	 * can be consumed by a message of a given size (assumes page_inuse
+	 * is 0). This is a rough guess, since it doesn't consider all of
+	 * the data_segments that will be needed for the packets.
+	 */
+#define HOMA_MAX_STASHED(size) (((size - 1) / HOMA_SKB_PAGE_SIZE) + 1)
+
+	/**
+	 * @num_stashed_pages: number of pages currently available in
+	 * stashed_pages.
+	 */
+	int num_stashed_pages;
+
+	/**
+	 * @stashed_pages: used to prefetch from the cache all of the pages a
+	 * message will need with a single operation, to avoid having to
+	 * synchronize separately for each page. Note: these pages are all
+	 * HOMA_SKB_PAGE_SIZE in length.
+ */ + struct page *stashed_pages[HOMA_MAX_STASHED(HOMA_MAX_MESSAGE_LENGTH)]; +}; +DECLARE_PER_CPU(struct homa_skb_core, homa_skb_core); + +extern int homa_skb_append_from_iter(struct homa *homa, + struct sk_buff *skb, struct iov_iter *iter, int length); +extern int homa_skb_append_from_skb(struct homa *homa, + struct sk_buff *dst_skb, struct sk_buff *src_skb, + int offset, int length); +extern int homa_skb_append_to_frag(struct homa *homa, struct sk_buff *skb, + void *buf, int length); +extern void homa_skb_cache_pages(struct homa *homa, struct page **pages, + int count); +extern void homa_skb_cleanup(struct homa *homa); +extern void *homa_skb_extend_frags(struct homa *homa, struct sk_buff *skb, + int *length); +extern void homa_skb_free_tx(struct homa *homa, struct sk_buff *skb); +extern void homa_skb_free_many_tx(struct homa *homa, struct sk_buff **skbs, + int count); +extern void homa_skb_get(struct sk_buff *skb, void *dest, int offset, + int length); +extern void homa_skb_init(struct homa *homa); +extern struct sk_buff + *homa_skb_new_tx(int length); +extern bool homa_skb_page_alloc(struct homa *homa, + struct homa_skb_core *core); +extern void homa_skb_release_pages(struct homa *homa); +extern void homa_skb_stash_pages(struct homa *homa, int length); + +#endif /* _HOMA_SKB_H */ \ No newline at end of file diff --git a/homa_timer.c b/homa_timer.c index 457053e4..168863e0 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -7,6 +7,7 @@ #include "homa_impl.h" #include "homa_peer.h" #include "homa_rpc.h" +#include "homa_skb.h" /** * homa_check_rpc() - Invoked for each RPC during each timer pass; does diff --git a/homa_utils.c b/homa_utils.c index 7d233a28..953dfe4d 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -7,18 +7,13 @@ #include "homa_impl.h" #include "homa_peer.h" #include "homa_rpc.h" +#include "homa_skb.h" /* Core-specific information. NR_CPUS is an overestimate of the actual * number, but allows us to allocate the array statically. */ struct homa_core *homa_cores[NR_CPUS]; -/* Information specific to individual NUMA nodes. */ -struct homa_numa *homa_numas[MAX_NUMNODES]; - -/* Total number of NUMA nodes actually defined in homa_numas. */ -int homa_num_numas; - /* Points to block of memory holding all homa_cores; used to free it. */ char *core_memory; @@ -36,29 +31,11 @@ int homa_init(struct homa *homa) { size_t aligned_size; char *first; - int i, err, num_numas; + int i, err; _Static_assert(HOMA_MAX_PRIORITIES >= 8, "homa_init assumes at least 8 priority levels"); - /* Initialize data specific to NUMA nodes. */ - memset(homa_numas, 0, sizeof(homa_numas)); - num_numas = 0; - for (i = 0; i < nr_cpu_ids; i++) { - struct homa_numa *numa; - int n = cpu_to_node(i); - - if (homa_numas[n]) - continue; - numa = kmalloc(sizeof(struct homa_numa), GFP_KERNEL); - homa_numas[n] = numa; - homa_skb_page_pool_init(&numa->page_pool); - if (n >= homa_num_numas) - homa_num_numas = n+1; - num_numas++; - } - pr_notice("Homa initialized %d homa_numas, highest number %d\n", num_numas, homa_num_numas-1); - /* Initialize core-specific info (if no-one else has already done it), * making sure that each core has private cache lines. 
*/ @@ -76,7 +53,6 @@ int homa_init(struct homa *homa) core = (struct homa_core *) (first + i*aligned_size); homa_cores[i] = core; - core->numa = homa_numas[cpu_to_node(i)]; core->last_active = 0; core->last_gro = 0; atomic_set(&core->softirq_backlog, 0); @@ -87,10 +63,6 @@ int homa_init(struct homa *homa) core->last_app_active = 0; core->held_skb = NULL; core->held_bucket = 0; - core->skb_page = NULL; - core->page_inuse = 0; - core->page_size = 0; - core->num_stashed_pages = 0; } } @@ -132,12 +104,7 @@ int homa_init(struct homa *homa) pr_err("Couldn't initialize peer table (errno %d)\n", -err); return err; } - spin_lock_init(&homa->page_pool_mutex); - homa->skb_page_frees_per_sec = 1000; - homa->skb_pages_to_free = NULL; - homa->pages_to_free_slots = 0; - homa->skb_page_free_time = 0; - homa->skb_page_pool_min_kb = (3*HOMA_MAX_MESSAGE_LENGTH)/1000; + homa_skb_init(homa); /* Wild guesses to initialize configuration values... */ homa->unsched_bytes = 10000; @@ -227,14 +194,6 @@ void homa_destroy(struct homa *homa) kfree(homa->peers); homa_skb_cleanup(homa); - for (i = 0; i < MAX_NUMNODES; i++) { - struct homa_numa *numa = homa_numas[i]; - - if (numa != NULL) { - kfree(numa); - homa_numas[i] = NULL; - } - } if (core_memory) { vfree(core_memory); core_memory = NULL; diff --git a/test/mock.c b/test/mock.c index ce375e8a..ba009f77 100644 --- a/test/mock.c +++ b/test/mock.c @@ -9,6 +9,7 @@ #include "homa_impl.h" #include "homa_pool.h" +#include "homa_skb.h" #include "ccutils.h" #include "mock.h" #include "utils.h" diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 6122853f..1867828f 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -5,6 +5,7 @@ #include "homa_impl.h" #include "homa_peer.h" #include "homa_rpc.h" +#include "homa_skb.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" diff --git a/test/unit_homa_skb.c b/test/unit_homa_skb.c index 19959130..4cfc242a 100644 --- a/test/unit_homa_skb.c +++ b/test/unit_homa_skb.c @@ -3,20 +3,25 @@ */ #include "homa_impl.h" -#include "homa_impl.h" +#include "homa_skb.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" #include "mock.h" #include "utils.h" +static inline struct homa_skb_core *get_skb_core(int core) +{ + return &per_cpu(homa_skb_core, core); +} + /* Create an skb with 100 bytes of data in the header and frags of * 200, 300, and 400 bytes. */ static struct sk_buff *test_skb(struct homa *homa) { struct sk_buff *skb = homa_skb_new_tx(100); - struct homa_core *core = homa_cores[raw_smp_processor_id()]; + struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); int32_t data[1000]; char *src; @@ -30,13 +35,13 @@ static struct sk_buff *test_skb(struct homa *homa) /* Make sure that the first skb fragment will have a nonzero offset * within its page. */ - homa_skb_page_alloc(homa, core); - core->page_inuse = 100; + homa_skb_page_alloc(homa, skb_core); + skb_core->page_inuse = 100; homa_skb_append_to_frag(homa, skb, src + 100, 200); - core->page_inuse = core->page_size; + skb_core->page_inuse = skb_core->page_size; homa_skb_append_to_frag(homa, skb, src + 300, 300); - core->page_inuse = core->page_size; + skb_core->page_inuse = skb_core->page_size; homa_skb_append_to_frag(homa, skb, src + 600, 400); /* Add some data before the transport header, just to make sure @@ -49,7 +54,7 @@ static struct sk_buff *test_skb(struct homa *homa) /* Add a given number of pages to the page pool for a given core. 
*/ static void add_to_pool(struct homa *homa, int num_pages, int core) { - struct homa_page_pool *pool = &homa_cores[core]->numa->page_pool; + struct homa_page_pool *pool = get_skb_core(core)->pool; int i; for (i = 0; i < num_pages; i++) { pool->pages[pool->avail] = alloc_pages(GFP_KERNEL, @@ -89,54 +94,73 @@ FIXTURE_TEARDOWN(homa_skb) unit_teardown(); } +TEST_F(homa_skb, homa_skb_init) +{ + homa_skb_cleanup(&self->homa); + EXPECT_EQ(NULL, self->homa.page_pools[0]); + mock_numa_mask = 0x83; + homa_skb_init(&self->homa); + EXPECT_NE(NULL, self->homa.page_pools[0]); + EXPECT_NE(NULL, self->homa.page_pools[1]); + EXPECT_EQ(NULL, self->homa.page_pools[2]); + EXPECT_EQ(self->homa.page_pools[1], get_skb_core(0)->pool); + EXPECT_EQ(self->homa.page_pools[1], get_skb_core(1)->pool); + EXPECT_EQ(self->homa.page_pools[0], get_skb_core(2)->pool); + EXPECT_EQ(self->homa.page_pools[0], get_skb_core(6)->pool); + EXPECT_EQ(self->homa.page_pools[1], get_skb_core(7)->pool); + EXPECT_EQ(1, self->homa.max_numa); +} + TEST_F(homa_skb, homa_skb_cleanup) { - struct homa_core *core = homa_cores[2]; - core->skb_page = alloc_pages(GFP_KERNEL, 2); + struct homa_skb_core *skb_core = get_skb_core(2); + skb_core->skb_page = alloc_pages(GFP_KERNEL, 2); add_to_pool(&self->homa, 5, 2); add_to_pool(&self->homa, 4, 3); mock_set_core(3); homa_skb_stash_pages(&self->homa, 2 * HOMA_SKB_PAGE_SIZE - 100); - EXPECT_EQ(5, homa_cores[2]->numa->page_pool.avail); - EXPECT_EQ(2, homa_cores[3]->numa->page_pool.avail); - EXPECT_EQ(2, homa_cores[3]->num_stashed_pages); + EXPECT_EQ(5, get_skb_core(2)->pool->avail); + EXPECT_EQ(2, get_skb_core(3)->pool->avail); + EXPECT_EQ(2, get_skb_core(3)->num_stashed_pages); homa_skb_cleanup(&self->homa); - EXPECT_EQ(NULL, core->skb_page); - EXPECT_EQ(0, homa_cores[2]->numa->page_pool.avail); - EXPECT_EQ(0, homa_cores[3]->numa->page_pool.avail); - EXPECT_EQ(0, homa_cores[3]->num_stashed_pages); + EXPECT_EQ(NULL, skb_core->pool); + EXPECT_EQ(NULL, skb_core->skb_page); + EXPECT_EQ(0, get_skb_core(3)->num_stashed_pages); + + skb_core = get_skb_core(nr_cpu_ids-1); + EXPECT_EQ(NULL, skb_core->pool); } TEST_F(homa_skb, homa_skb_stash_pages) { int id = raw_smp_processor_id(); - struct homa_core *core = homa_cores[id]; + struct homa_skb_core *skb_core = get_skb_core(id); add_to_pool(&self->homa, 5, id); - EXPECT_EQ(5, core->numa->page_pool.avail); - EXPECT_EQ(0, core->num_stashed_pages); + EXPECT_EQ(5, skb_core->pool->avail); + EXPECT_EQ(0, skb_core->num_stashed_pages); /* First attempt: message too small. */ homa_skb_stash_pages(&self->homa, 10000); - EXPECT_EQ(0, core->num_stashed_pages); + EXPECT_EQ(0, skb_core->num_stashed_pages); /* Second attempt: stash pages. */ homa_skb_stash_pages(&self->homa, 3*HOMA_SKB_PAGE_SIZE - 100); - EXPECT_EQ(3, core->num_stashed_pages); - EXPECT_EQ(2, core->numa->page_pool.avail); + EXPECT_EQ(3, skb_core->num_stashed_pages); + EXPECT_EQ(2, skb_core->pool->avail); - /* Third attempt: existing stash adequage. */ + /* Third attempt: existing stash adequate. */ homa_skb_stash_pages(&self->homa, 3 * HOMA_SKB_PAGE_SIZE - 100); - EXPECT_EQ(3, core->num_stashed_pages); + EXPECT_EQ(3, skb_core->num_stashed_pages); /* Fourth attempt: not enough pages in pool. 
*/ homa_skb_stash_pages(&self->homa, 8 * HOMA_SKB_PAGE_SIZE - 100); - EXPECT_EQ(5, core->num_stashed_pages); + EXPECT_EQ(5, skb_core->num_stashed_pages); } TEST_F(homa_skb, homa_skb_extend_frags__basics) { - struct homa_core *core = homa_cores[raw_smp_processor_id()]; + struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); int length = 100; char *p1 = homa_skb_extend_frags(&self->homa, self->skb, &length); EXPECT_EQ(100, length); @@ -152,28 +176,28 @@ TEST_F(homa_skb, homa_skb_extend_frags__basics) EXPECT_EQ(300, length); EXPECT_EQ(p2 + 200, p3); - EXPECT_EQ(600, core->page_inuse); + EXPECT_EQ(600, skb_core->page_inuse); EXPECT_EQ(600, self->skb->len); } TEST_F(homa_skb, homa_skb_extend_frags__merge_but_reduce_length) { - struct homa_core *core = homa_cores[raw_smp_processor_id()]; + struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); int length = 1000; char *p1 = homa_skb_extend_frags(&self->homa, self->skb, &length); EXPECT_EQ(1000, length); EXPECT_NE(NULL, p1); - core->page_size = 2048; + skb_core->page_size = 2048; length = 2000; char *p2 = homa_skb_extend_frags(&self->homa, self->skb, &length); EXPECT_EQ(1048, length); EXPECT_EQ(p1 + 1000, p2); - EXPECT_EQ(2048, core->page_inuse); + EXPECT_EQ(2048, skb_core->page_inuse); } TEST_F(homa_skb, homa_skb_extend_frags__cant_merge_allocate_new_page) { - struct homa_core *core = homa_cores[raw_smp_processor_id()]; + struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); struct sk_buff *skb2 = alloc_skb_fclone(200, GFP_KERNEL); ASSERT_NE(NULL, skb2); @@ -183,7 +207,7 @@ TEST_F(homa_skb, homa_skb_extend_frags__cant_merge_allocate_new_page) EXPECT_NE(NULL, p1); EXPECT_EQ(1000, self->skb->len); - core->page_size = 2048; + skb_core->page_size = 2048; length = 1000; char *p2 = homa_skb_extend_frags(&self->homa, skb2, &length); EXPECT_EQ(1000, length); @@ -198,12 +222,12 @@ TEST_F(homa_skb, homa_skb_extend_frags__cant_merge_allocate_new_page) EXPECT_EQ(0, skb_shinfo(self->skb)->frags[1].offset); EXPECT_EQ(2000, self->skb->len); - EXPECT_EQ(1000, core->page_inuse); + EXPECT_EQ(1000, skb_core->page_inuse); kfree_skb(skb2); } TEST_F(homa_skb, homa_skb_extend_frags__cant_merge_use_same_page_reduce_length) { - struct homa_core *core = homa_cores[raw_smp_processor_id()]; + struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); struct sk_buff *skb2 = alloc_skb_fclone(200, GFP_KERNEL); ASSERT_NE(NULL, skb2); @@ -212,7 +236,7 @@ TEST_F(homa_skb, homa_skb_extend_frags__cant_merge_use_same_page_reduce_length) EXPECT_EQ(1000, length); EXPECT_NE(NULL, p1); - core->page_size = 2048; + skb_core->page_size = 2048; length = 500; char *p2 = homa_skb_extend_frags(&self->homa, skb2, &length); EXPECT_EQ(500, length); @@ -225,119 +249,119 @@ TEST_F(homa_skb, homa_skb_extend_frags__cant_merge_use_same_page_reduce_length) EXPECT_EQ(2, skb_shinfo(self->skb)->nr_frags); EXPECT_EQ(1536, skb_shinfo(self->skb)->frags[1].offset); - EXPECT_EQ(2048, core->page_inuse); + EXPECT_EQ(2048, skb_core->page_inuse); kfree_skb(skb2); } TEST_F(homa_skb, homa_skb_page_alloc__free_previous_page) { - struct homa_core *core = homa_cores[2]; + struct homa_skb_core *skb_core = get_skb_core(2); struct page *old_page; - EXPECT_TRUE(homa_skb_page_alloc(&self->homa, core)); - EXPECT_NE(NULL, core->skb_page); - old_page = core->skb_page; + EXPECT_TRUE(homa_skb_page_alloc(&self->homa, skb_core)); + EXPECT_NE(NULL, skb_core->skb_page); + old_page = skb_core->skb_page; get_page(old_page); EXPECT_EQ(2, mock_page_refs(old_page)); - 
EXPECT_TRUE(homa_skb_page_alloc(&self->homa, core)); - EXPECT_NE(NULL, core->skb_page); - EXPECT_NE(old_page, core->skb_page); + EXPECT_TRUE(homa_skb_page_alloc(&self->homa, skb_core)); + EXPECT_NE(NULL, skb_core->skb_page); + EXPECT_NE(old_page, skb_core->skb_page); EXPECT_EQ(1, mock_page_refs(old_page)); put_page(old_page); } TEST_F(homa_skb, homa_skb_page_alloc__reuse_existing_page) { - struct homa_core *core = cur_core; + struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); struct sk_buff *skb = homa_skb_new_tx(100); struct page *page; int length = 100; homa_skb_extend_frags(&self->homa, skb, &length); - EXPECT_EQ(100, core->page_inuse); - page = core->skb_page; + EXPECT_EQ(100, skb_core->page_inuse); + page = skb_core->skb_page; homa_skb_free_tx(&self->homa, skb); - EXPECT_EQ(1, page_ref_count(core->skb_page)); - EXPECT_TRUE(homa_skb_page_alloc(&self->homa, core)); - EXPECT_EQ(page, core->skb_page); - EXPECT_EQ(0, core->page_inuse); + EXPECT_EQ(1, page_ref_count(skb_core->skb_page)); + EXPECT_TRUE(homa_skb_page_alloc(&self->homa, skb_core)); + EXPECT_EQ(page, skb_core->skb_page); + EXPECT_EQ(0, skb_core->page_inuse); } TEST_F(homa_skb, homa_skb_page_alloc__from_stash) { - struct homa_core *core = cur_core; + struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); add_to_pool(&self->homa, 5, raw_smp_processor_id()); homa_skb_stash_pages(&self->homa, 3*HOMA_SKB_PAGE_SIZE - 100); - EXPECT_TRUE(homa_skb_page_alloc(&self->homa, core)); - EXPECT_NE(NULL, core->skb_page); - EXPECT_EQ(HOMA_SKB_PAGE_SIZE, core->page_size); - EXPECT_EQ(0, core->page_inuse); - EXPECT_EQ(2, core->num_stashed_pages); + EXPECT_TRUE(homa_skb_page_alloc(&self->homa, skb_core)); + EXPECT_NE(NULL, skb_core->skb_page); + EXPECT_EQ(HOMA_SKB_PAGE_SIZE, skb_core->page_size); + EXPECT_EQ(0, skb_core->page_inuse); + EXPECT_EQ(2, skb_core->num_stashed_pages); } TEST_F(homa_skb, homa_skb_page_alloc__from_pool) { - struct homa_core *core = cur_core; + struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); add_to_pool(&self->homa, 5, raw_smp_processor_id()); - EXPECT_EQ(5, core->numa->page_pool.avail); - EXPECT_EQ(0, core->num_stashed_pages); - EXPECT_TRUE(homa_skb_page_alloc(&self->homa, core)); - EXPECT_NE(NULL, core->skb_page); - EXPECT_EQ(4, core->numa->page_pool.avail); + EXPECT_EQ(5, skb_core->pool->avail); + EXPECT_EQ(0, skb_core->num_stashed_pages); + EXPECT_TRUE(homa_skb_page_alloc(&self->homa, skb_core)); + EXPECT_NE(NULL, skb_core->skb_page); + EXPECT_EQ(4, skb_core->pool->avail); } TEST_F(homa_skb, homa_skb_page_alloc__pool_page_taken_while_locking) { - struct homa_core *core = cur_core; + struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); add_to_pool(&self->homa, 1, raw_smp_processor_id()); - EXPECT_EQ(1, core->numa->page_pool.avail); - EXPECT_EQ(0, core->num_stashed_pages); - hook_pool = &core->numa->page_pool; + EXPECT_EQ(1, skb_core->pool->avail); + EXPECT_EQ(0, skb_core->num_stashed_pages); + hook_pool = skb_core->pool; unit_hook_register(spinlock_hook); mock_alloc_page_errors = 3; - EXPECT_FALSE(homa_skb_page_alloc(&self->homa, core)); - EXPECT_EQ(NULL, core->skb_page); - EXPECT_EQ(0, core->numa->page_pool.avail); + EXPECT_FALSE(homa_skb_page_alloc(&self->homa, skb_core)); + EXPECT_EQ(NULL, skb_core->skb_page); + EXPECT_EQ(0, skb_core->pool->avail); } TEST_F(homa_skb, homa_skb_page_alloc__new_large_page) { - struct homa_core *core = cur_core; + struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); mock_cycles = ~0; - 
EXPECT_EQ(0, core->numa->page_pool.avail); - EXPECT_EQ(0, core->num_stashed_pages); - EXPECT_TRUE(homa_skb_page_alloc(&self->homa, core)); - EXPECT_NE(NULL, core->skb_page); - EXPECT_EQ(HOMA_SKB_PAGE_SIZE, core->page_size); + EXPECT_EQ(0, skb_core->pool->avail); + EXPECT_EQ(0, skb_core->num_stashed_pages); + EXPECT_TRUE(homa_skb_page_alloc(&self->homa, skb_core)); + EXPECT_NE(NULL, skb_core->skb_page); + EXPECT_EQ(HOMA_SKB_PAGE_SIZE, skb_core->page_size); EXPECT_EQ(1, homa_metrics_per_cpu()->skb_page_allocs); EXPECT_NE(0, homa_metrics_per_cpu()->skb_page_alloc_cycles); } TEST_F(homa_skb, homa_skb_page_alloc__high_order_page_not_available) { - struct homa_core *core = homa_cores[2]; + struct homa_skb_core *skb_core = get_skb_core(2); mock_cycles = ~0; mock_alloc_page_errors = 1; - EXPECT_TRUE(homa_skb_page_alloc(&self->homa, core)); - EXPECT_NE(NULL, core->skb_page); - EXPECT_NE(NULL, core->skb_page); - EXPECT_EQ(PAGE_SIZE, core->page_size); - EXPECT_EQ(0, core->page_inuse); + EXPECT_TRUE(homa_skb_page_alloc(&self->homa, skb_core)); + EXPECT_NE(NULL, skb_core->skb_page); + EXPECT_NE(NULL, skb_core->skb_page); + EXPECT_EQ(PAGE_SIZE, skb_core->page_size); + EXPECT_EQ(0, skb_core->page_inuse); EXPECT_EQ(1, homa_metrics_per_cpu()->skb_page_allocs); EXPECT_NE(0, homa_metrics_per_cpu()->skb_page_alloc_cycles); } TEST_F(homa_skb, homa_skb_page_alloc__no_pages_available) { - struct homa_core *core = homa_cores[2]; + struct homa_skb_core *skb_core = get_skb_core(2); mock_alloc_page_errors = 3; - EXPECT_FALSE(homa_skb_page_alloc(&self->homa, core)); - EXPECT_EQ(NULL, core->skb_page); + EXPECT_FALSE(homa_skb_page_alloc(&self->homa, skb_core)); + EXPECT_EQ(NULL, skb_core->skb_page); } TEST_F(homa_skb, homa_skb_append_to_frag__basics) { - struct homa_core *core = homa_cores[raw_smp_processor_id()]; + struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); struct skb_shared_info *shinfo = skb_shinfo(self->skb); /* First append fits in a single block. */ EXPECT_EQ(0, homa_skb_append_to_frag(&self->homa, self->skb, "abcd", 4)); /* Second append spills into a new frag. */ - core->page_size = 10; + skb_core->page_size = 10; EXPECT_EQ(0, homa_skb_append_to_frag(&self->homa, self->skb, "0123456789ABCDEFGHIJ", 21)); @@ -362,7 +386,7 @@ TEST_F(homa_skb, homa_skb_append_to_frag__no_memory) TEST_F(homa_skb, homa_skb_append_from_iter__basics) { - struct homa_core *core = homa_cores[raw_smp_processor_id()]; + struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); struct skb_shared_info *shinfo = skb_shinfo(self->skb); struct iov_iter *iter = unit_iov_iter((void *) 1000, 5000); @@ -374,7 +398,7 @@ TEST_F(homa_skb, homa_skb_append_from_iter__basics) unit_log_get()); /* Second append spills into a new frag. 
*/ - core->page_size = 4096; + skb_core->page_size = 4096; unit_log_clear(); EXPECT_EQ(0, homa_skb_append_from_iter(&self->homa, self->skb, iter, 3000)); @@ -414,10 +438,10 @@ TEST_F(homa_skb, homa_skb_append_from_skb__error_copying_header) { struct sk_buff *src_skb = test_skb(&self->homa); struct sk_buff *dst_skb = homa_skb_new_tx(100); - struct homa_core *core = homa_cores[raw_smp_processor_id()]; + struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); mock_alloc_page_errors = -1; - core->page_inuse = core->page_size; + skb_core->page_inuse = skb_core->page_size; EXPECT_EQ(ENOMEM, -homa_skb_append_from_skb(&self->homa, dst_skb, src_skb, 20, 60)); @@ -501,7 +525,7 @@ TEST_F(homa_skb, homa_skb_free_many_tx__basics) homa_skb_extend_frags(&self->homa, skbs[1], &length); homa_skb_free_many_tx(&self->homa, skbs, 2); - EXPECT_EQ(3, homa_numas[0]->page_pool.avail); + EXPECT_EQ(3, self->homa.page_pools[0]->avail); } TEST_F(homa_skb, homa_skb_free_many_tx__skb_ref_count_not_one) { @@ -539,8 +563,8 @@ TEST_F(homa_skb, homa_skb_free_many_tx__check_page_order) mock_compound_order_mask = 3; homa_skb_free_many_tx(&self->homa, &skb, 1); - EXPECT_EQ(1, homa_numas[0]->page_pool.avail); - EXPECT_EQ(page, homa_numas[0]->page_pool.pages[0]); + EXPECT_EQ(1, self->homa.page_pools[0]->avail); + EXPECT_EQ(page, self->homa.page_pools[0]->pages[0]); } TEST_F(homa_skb, homa_skb_cache_pages__different_numa_nodes) @@ -551,10 +575,10 @@ TEST_F(homa_skb, homa_skb_cache_pages__different_numa_nodes) pages[i] = alloc_pages(GFP_KERNEL, HOMA_SKB_PAGE_ORDER); mock_page_nid_mask = 7; homa_skb_cache_pages(&self->homa, pages, 4); - EXPECT_EQ(1, homa_numas[0]->page_pool.avail); - EXPECT_EQ(3, homa_numas[1]->page_pool.avail); - EXPECT_EQ(pages[3], homa_numas[0]->page_pool.pages[0]); - EXPECT_EQ(pages[1], homa_numas[1]->page_pool.pages[1]); + EXPECT_EQ(1, self->homa.page_pools[0]->avail); + EXPECT_EQ(3, self->homa.page_pools[1]->avail); + EXPECT_EQ(pages[3], self->homa.page_pools[0]->pages[0]); + EXPECT_EQ(pages[1], self->homa.page_pools[1]->pages[1]); } TEST_F(homa_skb, homa_skb_cache_pages__pool_size_exceeded) { @@ -563,7 +587,7 @@ TEST_F(homa_skb, homa_skb_cache_pages__pool_size_exceeded) for (i = 0; i < 6; i++) pages[i] = alloc_pages(GFP_KERNEL, HOMA_SKB_PAGE_ORDER); homa_skb_cache_pages(&self->homa, pages, 4); - EXPECT_EQ(4, homa_numas[0]->page_pool.avail); + EXPECT_EQ(4, self->homa.page_pools[0]->avail); put_page(pages[4]); put_page(pages[5]); } @@ -618,13 +642,13 @@ TEST_F(homa_skb, homa_skb_release_pages__basics) self->homa.skb_page_frees_per_sec = 10; self->homa.skb_page_pool_min_kb = 0; add_to_pool(&self->homa, 10, 0); - homa_cores[0]->numa->page_pool.low_mark = 7; + get_skb_core(0)->pool->low_mark = 7; add_to_pool(&self->homa, 3, 1); - homa_cores[1]->numa->page_pool.low_mark = 2; + get_skb_core(1)->pool->low_mark = 2; homa_skb_release_pages(&self->homa); - EXPECT_EQ(5, homa_cores[0]->numa->page_pool.avail); - EXPECT_EQ(3, homa_cores[1]->numa->page_pool.avail); + EXPECT_EQ(5, get_skb_core(0)->pool->avail); + EXPECT_EQ(3, get_skb_core(1)->pool->avail); EXPECT_EQ(501000000UL, self->homa.skb_page_free_time); } TEST_F(homa_skb, homa_skb_release_pages__not_time_to_free) @@ -635,9 +659,9 @@ TEST_F(homa_skb, homa_skb_release_pages__not_time_to_free) self->homa.skb_page_frees_per_sec = 10; self->homa.skb_page_pool_min_kb = 0; add_to_pool(&self->homa, 10, 0); - homa_cores[0]->numa->page_pool.low_mark = 7; + get_skb_core(0)->pool->low_mark = 7; homa_skb_release_pages(&self->homa); - EXPECT_EQ(10, 
homa_cores[0]->numa->page_pool.avail);
+	EXPECT_EQ(10, get_skb_core(0)->pool->avail);
 }
 TEST_F(homa_skb, homa_skb_release_pages__allocate_skb_pages_to_free)
 {
@@ -664,10 +688,10 @@
 	self->homa.skb_page_frees_per_sec = 20;
 	self->homa.skb_page_pool_min_kb = (5 * HOMA_SKB_PAGE_SIZE) / 1000;
 	add_to_pool(&self->homa, 10, 0);
-	homa_cores[0]->numa->page_pool.low_mark = 9;
+	get_skb_core(0)->pool->low_mark = 9;
 
 	homa_skb_release_pages(&self->homa);
-	EXPECT_EQ(6, homa_cores[0]->numa->page_pool.avail);
+	EXPECT_EQ(6, get_skb_core(0)->pool->avail);
 }
 TEST_F(homa_skb, homa_skb_release_pages__empty_pool)
 {
@@ -677,8 +701,8 @@
 	self->homa.skb_page_frees_per_sec = 1000;
 	self->homa.skb_page_pool_min_kb = 0;
 	add_to_pool(&self->homa, 5, 0);
-	homa_cores[0]->numa->page_pool.low_mark = 5;
+	get_skb_core(0)->pool->low_mark = 5;
 
 	homa_skb_release_pages(&self->homa);
-	EXPECT_EQ(0, homa_cores[0]->numa->page_pool.avail);
+	EXPECT_EQ(0, get_skb_core(0)->pool->avail);
 }
\ No newline at end of file

From 327c3f394eb097f7b0deee4b2e3949f0b8b0e508 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Wed, 2 Oct 2024 17:00:36 -0700
Subject: [PATCH 033/625] Extract homa_grant.h from homa_impl.h

---
 homa_grant.c           |  1 +
 homa_grant.h           | 59 ++++++++++++++++++++++++++++++++++++++++++
 homa_impl.h            | 50 -----------------------------------
 homa_incoming.c        |  1 +
 homa_pool.c            |  1 +
 homa_rpc.c             |  1 +
 test/unit_homa_grant.c |  1 +
 7 files changed, 64 insertions(+), 50 deletions(-)
 create mode 100644 homa_grant.h

diff --git a/homa_grant.c b/homa_grant.c
index 7ae086b7..e152f49c 100644
--- a/homa_grant.c
+++ b/homa_grant.c
@@ -5,6 +5,7 @@
  */
 
 #include "homa_impl.h"
+#include "homa_grant.h"
 #include "homa_peer.h"
 #include "homa_rpc.h"
 #include "homa_wire.h"
diff --git a/homa_grant.h b/homa_grant.h
new file mode 100644
index 00000000..411f40c6
--- /dev/null
+++ b/homa_grant.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: BSD-2-Clause */
+
+/* This file contains definitions that relate to generating grants. */
+
+#ifndef _HOMA_GRANT_H
+#define _HOMA_GRANT_H
+
+extern int homa_grantable_lock_slow(struct homa *homa, int recalc);
+extern void homa_grant_add_rpc(struct homa_rpc *rpc);
+extern void homa_grant_check_rpc(struct homa_rpc *rpc);
+extern void homa_grant_find_oldest(struct homa *homa);
+extern void homa_grant_free_rpc(struct homa_rpc *rpc);
+extern void homa_grant_log_tt(struct homa *homa);
+extern int homa_grant_outranks(struct homa_rpc *rpc1,
+		struct homa_rpc *rpc2);
+extern int homa_grant_pick_rpcs(struct homa *homa, struct homa_rpc **rpcs,
+		int max_rpcs);
+extern void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc);
+extern void homa_grant_recalc(struct homa *homa, int locked);
+extern void homa_grant_remove_rpc(struct homa_rpc *rpc);
+extern int homa_grant_send(struct homa_rpc *rpc, struct homa *homa);
+extern int homa_grant_update_incoming(struct homa_rpc *rpc,
+		struct homa *homa);
+
+/**
+ * homa_grantable_lock() - Acquire the grantable lock. If the lock
+ * isn't immediately available, record stats on the waiting time.
+ * @homa: Overall data about the Homa protocol implementation.
+ * @recalc: Nonzero means the caller is homa_grant_recalc; if another thread
+ * is already recalculating, can return without waiting for the lock.
+ * Return: Nonzero means this thread now owns the grantable lock.
Zero + * means the lock was not acquired and there is no need for this + * thread to do the work of homa_grant_recalc because some other + * thread started a fresh calculation after this method was invoked. + */ +static inline int homa_grantable_lock(struct homa *homa, int recalc) +{ + int result; + + if (spin_trylock_bh(&homa->grantable_lock)) + result = 1; + else + result = homa_grantable_lock_slow(homa, recalc); + homa->grantable_lock_time = get_cycles(); + return result; +} + +/** + * homa_grantable_unlock() - Release the grantable lock. + * @homa: Overall data about the Homa protocol implementation. + */ +static inline void homa_grantable_unlock(struct homa *homa) +{ + INC_METRIC(grantable_lock_cycles, get_cycles() + - homa->grantable_lock_time); + spin_unlock_bh(&homa->grantable_lock); +} + +#endif /* _HOMA_GRANT_H */ \ No newline at end of file diff --git a/homa_impl.h b/homa_impl.h index 951a2756..43717e73 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -137,7 +137,6 @@ struct homa; #include "homa_metrics.h" /* Declarations used in this file, so they can't be made at the end. */ -extern int homa_grantable_lock_slow(struct homa *homa, int recalc); extern void homa_throttle_lock_slow(struct homa *homa); extern struct homa_core *homa_cores[]; @@ -1034,40 +1033,6 @@ static inline void homa_set_doff(struct data_header *h, int size) h->common.doff = size << 2; } -/** - * homa_grantable_lock() - Acquire the grantable lock. If the lock - * isn't immediately available, record stats on the waiting time. - * @homa: Overall data about the Homa protocol implementation. - * @recalc: Nonzero means the caller is homa_grant_recalc; if another thread - * is already recalculating, can return without waiting for the lock. - * Return: Nonzero means this thread now owns the grantable lock. Zero - * means the lock was not acquired and there is no need for this - * thread to do the work of homa_grant_recalc because some other - * thread started a fresh calculation after this method was invoked. - */ -static inline int homa_grantable_lock(struct homa *homa, int recalc) -{ - int result; - - if (spin_trylock_bh(&homa->grantable_lock)) - result = 1; - else - result = homa_grantable_lock_slow(homa, recalc); - homa->grantable_lock_time = get_cycles(); - return result; -} - -/** - * homa_grantable_unlock() - Release the grantable lock. - * @homa: Overall data about the Homa protocol implementation. - */ -static inline void homa_grantable_unlock(struct homa *homa) -{ - INC_METRIC(grantable_lock_cycles, get_cycles() - - homa->grantable_lock_time); - spin_unlock_bh(&homa->grantable_lock); -} - /** * homa_throttle_lock() - Acquire the throttle lock. If the lock * isn't immediately available, record stats on the waiting time. 
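
The homa_grantable_lock()/homa_grantable_unlock() pair moved into homa_grant.h above is a trylock fast path backed by an out-of-line slow path that records contention statistics. As an illustration of the documented Return semantics (this sketch is editorial, not part of the patch; example_recalc is a hypothetical stand-in for a caller such as homa_grant_recalc):

	/* Minimal caller sketch, assuming a valid struct homa pointer.
	 * Passing recalc=1 lets this thread skip its work entirely if
	 * another thread has already begun a fresh recalculation.
	 */
	static void example_recalc(struct homa *homa)
	{
		if (!homa_grantable_lock(homa, 1))
			/* Zero return: some other thread started a fresh
			 * calculation after we were invoked, so its
			 * results cover this request.
			 */
			return;
		/* ... examine grantable RPCs and send out new grants ... */
		homa_grantable_unlock(homa);
	}
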
@@ -1240,21 +1205,6 @@ extern void homa_gap_retry(struct homa_rpc *rpc); extern int homa_get_port(struct sock *sk, unsigned short snum); extern int homa_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *option); -extern void homa_grant_add_rpc(struct homa_rpc *rpc); -extern void homa_grant_check_rpc(struct homa_rpc *rpc); -extern void homa_grant_find_oldest(struct homa *homa); -extern void homa_grant_free_rpc(struct homa_rpc *rpc); -extern void homa_grant_log_tt(struct homa *homa); -extern int homa_grant_outranks(struct homa_rpc *rpc1, - struct homa_rpc *rpc2); -extern int homa_grant_pick_rpcs(struct homa *homa, struct homa_rpc **rpcs, - int max_rpcs); -extern void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc); -extern void homa_grant_recalc(struct homa *homa, int locked); -extern void homa_grant_remove_rpc(struct homa_rpc *rpc); -extern int homa_grant_send(struct homa_rpc *rpc, struct homa *homa); -extern int homa_grant_update_incoming(struct homa_rpc *rpc, - struct homa *homa); extern int homa_gro_complete(struct sk_buff *skb, int thoff); extern void homa_gro_gen2(struct sk_buff *skb); extern void homa_gro_gen3(struct sk_buff *skb); diff --git a/homa_incoming.c b/homa_incoming.c index 67ada889..09b65fc4 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -5,6 +5,7 @@ */ #include "homa_impl.h" +#include "homa_grant.h" #include "homa_peer.h" #include "homa_pool.h" diff --git a/homa_pool.c b/homa_pool.c index f79e9289..e2700326 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" +#include "homa_grant.h" #include "homa_pool.h" /* This file contains functions that manage user-space buffer pools. */ diff --git a/homa_rpc.c b/homa_rpc.c index dea4d769..c6eb5ebc 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -5,6 +5,7 @@ #include "homa_impl.h" #include "homa_peer.h" #include "homa_pool.h" +#include "homa_grant.h" #include "homa_skb.h" /** diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 59b910d8..f3cfee14 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -3,6 +3,7 @@ */ #include "homa_impl.h" +#include "homa_grant.h" #include "homa_rpc.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" From 2ff453e3e59cf0f38fbf44458afb318d5e152055 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 3 Oct 2024 11:28:11 -0700 Subject: [PATCH 034/625] Extract homa_offload.h from homa_impl.h Also created new per-core struct homa_offload_core. Delete the homa_core struct: it's no longer needed. --- homa_impl.h | 101 ---------------------- homa_incoming.c | 8 +- homa_offload.c | 82 +++++++++++------- homa_offload.h | 94 +++++++++++++++++++++ homa_plumbing.c | 22 ++--- homa_utils.c | 49 ----------- test/unit_homa_incoming.c | 17 ++-- test/unit_homa_offload.c | 172 ++++++++++++++++++++------------------ test/utils.h | 2 - 9 files changed, 264 insertions(+), 283 deletions(-) create mode 100644 homa_offload.h diff --git a/homa_impl.h b/homa_impl.h index 43717e73..199b0193 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -139,8 +139,6 @@ struct homa; /* Declarations used in this file, so they can't be made at the end. */ extern void homa_throttle_lock_slow(struct homa *homa); -extern struct homa_core *homa_cores[]; - #define sizeof32(type) ((int) (sizeof(type))) /** define CACHE_LINE_SIZE - The number of bytes in a cache line. 
*/ @@ -876,88 +874,6 @@ struct homa { */ int temp[4]; }; - -/** - * struct homa_core - Homa allocates one of these structures for each - * core, to hold information that needs to be kept on a per-core basis. - */ -struct homa_core { - /** NUMA-specific page pool from which to allocate skb pages. */ - struct homa_page_pool *pool; - - /** - * @last_active: the last time (in get_cycle() units) that - * there was system activity, such NAPI or SoftIRQ, on this - * core. Used for load balancing. - */ - __u64 last_active; - - /** - * @last_gro: the last time (in get_cycle() units) that - * homa_gro_receive returned on this core. Used to determine - * whether GRO is keeping a core busy. - */ - __u64 last_gro; - - /** - * @softirq_backlog: the number of batches of packets that have - * been queued for SoftIRQ processing on this core but haven't - * yet been processed. - */ - atomic_t softirq_backlog; - - /** - * @softirq_offset: used when rotating SoftIRQ assignment among - * the next cores; contains an offset to add to the current core - * to produce the core for SoftIRQ. - */ - int softirq_offset; - - /** - * @gen3_softirq_cores: when the Gen3 load balancer is in use, - * GRO will arrange for SoftIRQ processing to occur on one of - * these cores; -1 values are ignored (see balance.txt for more - * on lewd balancing). This information is filled in via sysctl. - */ -#define NUM_GEN3_SOFTIRQ_CORES 3 - int gen3_softirq_cores[NUM_GEN3_SOFTIRQ_CORES]; - - /** - * @last_app_active: the most recent time (get_cycles() units) - * when an application was actively using Homa on this core (e.g., - * by sending or receiving messages). Used for load balancing - * (see balance.txt). - */ - __u64 last_app_active; - - /** - * held_skb: last packet buffer known to be available for - * merging other packets into on this core (note: may not still - * be available), or NULL if none. - */ - struct sk_buff *held_skb; - - /** - * @held_bucket: the index, within napi->gro_hash, of the list - * containing @held_skb; undefined if @held_skb is NULL. Used to - * verify that @held_skb is still available. - */ - int held_bucket; - - /** - * @thread: the most recent thread to invoke a Homa system call - * on this core, or NULL if none. - */ - struct task_struct *thread; - - /** - * @syscall_end_time: the time, in get_cycle() units, when the last - * Homa system call completed on this core. Meaningless if thread - * is NULL. - */ - __u64 syscall_end_time; -}; - /** * struct homa_skb_info - Additional information needed by Homa for each * outbound DATA packet. 
Space is allocated for this at the very end of the @@ -1205,17 +1121,6 @@ extern void homa_gap_retry(struct homa_rpc *rpc); extern int homa_get_port(struct sock *sk, unsigned short snum); extern int homa_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *option); -extern int homa_gro_complete(struct sk_buff *skb, int thoff); -extern void homa_gro_gen2(struct sk_buff *skb); -extern void homa_gro_gen3(struct sk_buff *skb); -extern void homa_gro_hook_tcp(void); -extern void homa_gro_unhook_tcp(void); -extern struct sk_buff - *homa_gro_receive(struct list_head *gro_list, - struct sk_buff *skb); -extern struct sk_buff - *homa_gso_segment(struct sk_buff *skb, - netdev_features_t features); extern int homa_hash(struct sock *sk); extern enum hrtimer_restart homa_hrtimer(struct hrtimer *timer); @@ -1235,8 +1140,6 @@ extern struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, struct iov_iter *iter, int offset, int length, int max_seg_data); -extern int homa_offload_end(void); -extern int homa_offload_init(void); extern void homa_outgoing_sysctl_changed(struct homa *homa); extern int homa_pacer_main(void *transportInfo); extern void homa_pacer_stop(struct homa *homa); @@ -1264,7 +1167,6 @@ extern void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, struct homa_ack *ack); extern void homa_rpc_free(struct homa_rpc *rpc); extern void homa_rpc_handoff(struct homa_rpc *rpc); -extern void homa_send_ipis(void); extern int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); extern int homa_setsockopt(struct sock *sk, int level, int optname, sockptr_t __user optval, unsigned int optlen); @@ -1276,9 +1178,6 @@ extern void homa_spin(int ns); extern char *homa_symbol_for_type(uint8_t type); extern int homa_sysctl_softirq_cores(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -extern struct sk_buff - *homa_tcp_gro_receive(struct list_head *held_list, - struct sk_buff *skb); extern void homa_timer(struct homa *homa); extern int homa_timer_main(void *transportInfo); extern void homa_unhash(struct sock *sk); diff --git a/homa_incoming.c b/homa_incoming.c index 09b65fc4..18e511d0 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -6,6 +6,7 @@ #include "homa_impl.h" #include "homa_grant.h" +#include "homa_offload.h" #include "homa_peer.h" #include "homa_pool.h" @@ -1267,7 +1268,7 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, INC_METRIC(poll_cycles, now - poll_start); /* Now it's time to sleep. */ - homa_cores[interest.core]->last_app_active = now; + per_cpu(homa_offload_core, interest.core).last_app_active = now; set_current_state(TASK_INTERRUPTIBLE); rpc = (struct homa_rpc *) atomic_long_read(&interest.ready_rpc); if (!rpc && !hsk->shutdown) { @@ -1381,7 +1382,8 @@ struct homa_interest *homa_choose_interest(struct homa *homa, list_for_each(pos, head) { interest = (struct homa_interest *) (((char *) pos) - offset); - if (homa_cores[interest->core]->last_active < busy_time) { + if (per_cpu(homa_offload_core, interest->core).last_active + < busy_time) { if (backup != NULL) INC_METRIC(handoffs_alt_thread, 1); return interest; @@ -1463,7 +1465,7 @@ void homa_rpc_handoff(struct homa_rpc *rpc) /* Update the last_app_active time for the thread's core, so Homa * will try to avoid doing any work there. */ - homa_cores[interest->core]->last_app_active = get_cycles(); + per_cpu(homa_offload_core, interest->core).last_app_active = get_cycles(); /* Clear the interest. 
This serves two purposes. First, it saves * the waking thread from acquiring the socket lock again, which diff --git a/homa_offload.c b/homa_offload.c index eb3353c8..db813f7a 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -5,6 +5,9 @@ */ #include "homa_impl.h" +#include "homa_offload.h" + +DEFINE_PER_CPU(struct homa_offload_core, homa_offload_core); #define CORES_TO_CHECK 4 @@ -38,6 +41,25 @@ static struct net_offload hook_tcp6_net_offload; */ int homa_offload_init(void) { + int i; + + for (i = 0; i < nr_cpu_ids; i++) { + struct homa_offload_core *offload_core; + int j; + + offload_core = &per_cpu(homa_offload_core, i); + offload_core->last_active = 0; + offload_core->last_gro = 0; + atomic_set(&offload_core->softirq_backlog, 0); + offload_core->softirq_offset = 0; + offload_core->gen3_softirq_cores[0] = i^1; + for (j = 1; j < NUM_GEN3_SOFTIRQ_CORES; j++) + offload_core->gen3_softirq_cores[j] = -1; + offload_core->last_app_active = 0; + offload_core->held_skb = NULL; + offload_core->held_bucket = 0; + } + int res1 = inet_add_offload(&homa_offload, IPPROTO_HOMA); int res2 = inet6_add_offload(&homa_offload, IPPROTO_HOMA); @@ -257,9 +279,10 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, */ struct sk_buff *held_skb; struct sk_buff *result = NULL; - struct homa_core *core = homa_cores[raw_smp_processor_id()]; + struct homa_offload_core *offload_core = &per_cpu(homa_offload_core, + raw_smp_processor_id()); __u64 now = get_cycles(); - int busy = (now - core->last_gro) < homa->gro_busy_cycles; + int busy = (now - offload_core->last_gro) < homa->gro_busy_cycles; __u32 hash; __u64 saved_softirq_metric, softirq_cycles; __u64 *softirq_cycles_metric; @@ -268,7 +291,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, int priority; __u32 saddr; - core->last_active = now; + offload_core->last_active = now; if (skb_is_ipv6(skb)) { priority = ipv6_hdr(skb)->priority; saddr = ntohl(ipv6_hdr(skb)->saddr.in6_u.u6_addr32[3]); @@ -324,7 +347,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, * core added a Homa packet (if there is such a list). */ hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); - if (core->held_skb) { + if (offload_core->held_skb) { /* Reverse-engineer the location of the napi_struct, so we * can verify that held_skb is still valid. */ @@ -333,18 +356,19 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, struct napi_struct *napi = container_of(gro_list, struct napi_struct, gro_hash[hash]); - /* Must verify that core->held_skb points to a packet on + /* Must verify that offload_core->held_skb points to a packet on * the list, and that the packet is a Homa packet. * homa_gro_complete isn't always invoked before removing - * packets from the list, so core->held_skb could be a + * packets from the list, so offload_core->held_skb could be a * dangling pointer (or the skb could have been reused for * some other protocol). 
*/ list_for_each_entry(held_skb, - &napi->gro_hash[core->held_bucket].list, list) { + &napi->gro_hash[offload_core->held_bucket].list, + list) { int protocol; - if (held_skb != core->held_skb) + if (held_skb != offload_core->held_skb) continue; if (skb_is_ipv6(held_skb)) protocol = ipv6_hdr(held_skb)->nexthdr; @@ -382,9 +406,9 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, homa_gro_complete(held_skb, 0); netif_receive_skb(held_skb); homa_send_ipis(); - napi->gro_hash[core->held_bucket].count--; - if (napi->gro_hash[core->held_bucket].count == 0) - __clear_bit(core->held_bucket, + napi->gro_hash[offload_core->held_bucket].count--; + if (napi->gro_hash[offload_core->held_bucket].count == 0) + __clear_bit(offload_core->held_bucket, &napi->gro_bitmask); result = ERR_PTR(-EINPROGRESS); } @@ -400,14 +424,14 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, * means we aren't heavily loaded; if batching does occur, * homa_gro_complete will pick a different core). */ - core->held_skb = skb; - core->held_bucket = hash; + offload_core->held_skb = skb; + offload_core->held_bucket = hash; if (likely(homa->gro_policy & HOMA_GRO_SAME_CORE)) homa_set_softirq_cpu(skb, raw_smp_processor_id()); done: homa_check_pacer(homa, 1); - core->last_gro = get_cycles(); + offload_core->last_gro = get_cycles(); return result; bypass: @@ -420,7 +444,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, softirq_cycles = *softirq_cycles_metric - saved_softirq_metric; *softirq_cycles_metric = saved_softirq_metric; INC_METRIC(bypass_softirq_cycles, softirq_cycles); - core->last_gro = get_cycles(); + offload_core->last_gro = get_cycles(); /* This return value indicates that we have freed skb. */ return ERR_PTR(-EINPROGRESS); @@ -448,16 +472,16 @@ void homa_gro_gen2(struct sk_buff *skb) int this_core = raw_smp_processor_id(); int candidate = this_core; __u64 now = get_cycles(); - struct homa_core *core; + struct homa_offload_core *offload_core; for (i = CORES_TO_CHECK; i > 0; i--) { candidate++; if (unlikely(candidate >= nr_cpu_ids)) candidate = 0; - core = homa_cores[candidate]; - if (atomic_read(&core->softirq_backlog) > 0) + offload_core = &per_cpu(homa_offload_core, candidate); + if (atomic_read(&offload_core->softirq_backlog) > 0) continue; - if ((core->last_gro + homa->busy_cycles) > now) + if ((offload_core->last_gro + homa->busy_cycles) > now) continue; tt_record3("homa_gro_gen2 chose core %d for id %d offset %d", candidate, homa_local_id(h->common.sender_id), @@ -468,12 +492,12 @@ void homa_gro_gen2(struct sk_buff *skb) /* All of the candidates appear to be busy; just * rotate among them. 
 */
-		int offset = homa_cores[this_core]->softirq_offset;
+		int offset = per_cpu(homa_offload_core, this_core).softirq_offset;
 
 		offset += 1;
 		if (offset > CORES_TO_CHECK)
 			offset = 1;
-		homa_cores[this_core]->softirq_offset = offset;
+		per_cpu(homa_offload_core, this_core).softirq_offset = offset;
 		candidate = this_core + offset;
 		while (candidate >= nr_cpu_ids)
 			candidate -= nr_cpu_ids;
@@ -481,7 +505,7 @@ void homa_gro_gen2(struct sk_buff *skb)
 			candidate, homa_local_id(h->common.sender_id),
 			ntohl(h->seg.offset));
 	}
-	atomic_inc(&homa_cores[candidate]->softirq_backlog);
+	atomic_inc(&per_cpu(homa_offload_core, candidate).softirq_backlog);
 	homa_set_softirq_cpu(skb, candidate);
 }
 
@@ -501,7 +525,8 @@ void homa_gro_gen3(struct sk_buff *skb)
 	struct data_header *h = (struct data_header *)
 			skb_transport_header(skb);
 	int i, core;
 	__u64 now, busy_time;
-	int *candidates = homa_cores[raw_smp_processor_id()]->gen3_softirq_cores;
+	int *candidates = per_cpu(homa_offload_core, raw_smp_processor_id())
+			.gen3_softirq_cores;
 
 	now = get_cycles();
 	busy_time = now - homa->busy_cycles;
@@ -512,17 +537,18 @@ void homa_gro_gen3(struct sk_buff *skb)
 		if (candidate < 0)
 			break;
 
-		if (homa_cores[candidate]->last_app_active < busy_time) {
+		if (per_cpu(homa_offload_core, candidate).last_app_active
+				< busy_time) {
 			core = candidate;
 			break;
 		}
 	}
 	homa_set_softirq_cpu(skb, core);
-	homa_cores[core]->last_active = now;
+	per_cpu(homa_offload_core, core).last_active = now;
 	tt_record4("homa_gro_gen3 chose core %d for id %d, offset %d, delta %d",
 			core, homa_local_id(h->common.sender_id),
 			ntohl(h->seg.offset),
-			now - homa_cores[core]->last_app_active);
+			now - per_cpu(homa_offload_core, core).last_app_active);
 	INC_METRIC(gen3_handoffs, 1);
 	if (core != candidates[0])
 		INC_METRIC(gen3_alt_handoffs, 1);
@@ -546,7 +572,7 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset)
 //			ntohl(h->seg.offset),
 //			NAPI_GRO_CB(skb)->count);
 
-	homa_cores[raw_smp_processor_id()]->held_skb = NULL;
+	per_cpu(homa_offload_core, raw_smp_processor_id()).held_skb = NULL;
 	if (homa->gro_policy & HOMA_GRO_GEN3) {
 		homa_gro_gen3(skb);
 	} else if (homa->gro_policy & HOMA_GRO_GEN2) {
@@ -568,7 +594,7 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset)
 			core++;
 			if (unlikely(core >= nr_cpu_ids))
 				core = 0;
-			last_active = homa_cores[core]->last_active;
+			last_active = per_cpu(homa_offload_core, core).last_active;
 			if (last_active < best_time) {
 				best_time = last_active;
 				best = core;
diff --git a/homa_offload.h b/homa_offload.h
new file mode 100644
index 00000000..c0f3c9bb
--- /dev/null
+++ b/homa_offload.h
@@ -0,0 +1,94 @@
+/* SPDX-License-Identifier: BSD-2-Clause */
+
+/* This file contains definitions related to homa_offload.c. */
+
+#ifndef _HOMA_OFFLOAD_H
+#define _HOMA_OFFLOAD_H
+
+#include <linux/types.h>
+
+/**
+ * struct homa_offload_core - Stores core-specific information used during
+ * GRO operations.
+ */
+struct homa_offload_core {
+	/**
+	 * @last_active: the last time (in get_cycle() units) that
+	 * there was system activity, such as NAPI or SoftIRQ, on this
+	 * core. Used for load balancing.
+	 */
+	__u64 last_active;
+
+	/**
+	 * @last_gro: the last time (in get_cycle() units) that
+	 * homa_gro_receive returned on this core. Used to determine
+	 * whether GRO is keeping a core busy.
+	 */
+	__u64 last_gro;
+
+	/**
+	 * @softirq_backlog: the number of batches of packets that have
+	 * been queued for SoftIRQ processing on this core but haven't
+	 * yet been processed.
+	 */
+	atomic_t softirq_backlog;
+
+	/**
+	 * @softirq_offset: used when rotating SoftIRQ assignment among
+	 * the next cores; contains an offset to add to the current core
+	 * to produce the core for SoftIRQ.
+	 */
+	int softirq_offset;
+
+	/**
+	 * @gen3_softirq_cores: when the Gen3 load balancer is in use,
+	 * GRO will arrange for SoftIRQ processing to occur on one of
+	 * these cores; -1 values are ignored (see balance.txt for more
+	 * on load balancing). This information is filled in via sysctl.
+	 */
+#define NUM_GEN3_SOFTIRQ_CORES 3
+	int gen3_softirq_cores[NUM_GEN3_SOFTIRQ_CORES];
+
+	/**
+	 * @last_app_active: the most recent time (get_cycles() units)
+	 * when an application was actively using Homa on this core (e.g.,
+	 * by sending or receiving messages). Used for load balancing
+	 * (see balance.txt).
+	 */
+	__u64 last_app_active;
+
+	/**
+	 * @held_skb: last packet buffer known to be available for
+	 * merging other packets into on this core (note: may not still
+	 * be available), or NULL if none.
+	 */
+	struct sk_buff *held_skb;
+
+	/**
+	 * @held_bucket: the index, within napi->gro_hash, of the list
+	 * containing @held_skb; undefined if @held_skb is NULL. Used to
+	 * verify that @held_skb is still available.
+	 */
+	int held_bucket;
+};
+DECLARE_PER_CPU(struct homa_offload_core, homa_offload_core);
+
+extern int homa_gro_complete(struct sk_buff *skb, int thoff);
+extern void homa_gro_gen2(struct sk_buff *skb);
+extern void homa_gro_gen3(struct sk_buff *skb);
+extern void homa_gro_hook_tcp(void);
+extern void homa_gro_unhook_tcp(void);
+extern struct sk_buff
+		*homa_gro_receive(struct list_head *gro_list,
+				struct sk_buff *skb);
+extern struct sk_buff
+		*homa_gso_segment(struct sk_buff *skb,
+				netdev_features_t features);
+extern int homa_offload_end(void);
+extern int homa_offload_init(void);
+extern void homa_send_ipis(void);
+extern struct sk_buff
+		*homa_tcp_gro_receive(struct list_head *held_list,
+				struct sk_buff *skb);
+
+#endif /* _HOMA_OFFLOAD_H */
diff --git a/homa_plumbing.c b/homa_plumbing.c
index e5b1673c..bed24531 100644
--- a/homa_plumbing.c
+++ b/homa_plumbing.c
@@ -5,6 +5,7 @@
  */
 
 #include "homa_impl.h"
+#include "homa_offload.h"
 #include "homa_peer.h"
 #include "homa_pool.h"
 
@@ -889,14 +890,13 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length)
 	struct homa_rpc *rpc = NULL;
 	union sockaddr_in_union *addr = (union sockaddr_in_union *)
 			msg->msg_name;
 
-	homa_cores[raw_smp_processor_id()]->last_app_active = start;
+	per_cpu(homa_offload_core, raw_smp_processor_id()).last_app_active = start;
 	if (unlikely(!msg->msg_control_is_user)) {
 		tt_record("homa_sendmsg error: !msg->msg_control_is_user");
 		result = -EINVAL;
 		goto error;
 	}
-	if (unlikely(copy_from_user(&args, msg->msg_control,
-			sizeof(args)))) {
+	if (unlikely(copy_from_user(&args, msg->msg_control, sizeof(args)))) {
 		result = -EFAULT;
 		goto error;
 	}
@@ -1023,7 +1023,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
 	int result;
 
 	INC_METRIC(recv_calls, 1);
-	homa_cores[raw_smp_processor_id()]->last_app_active = start;
+	per_cpu(homa_offload_core, raw_smp_processor_id()).last_app_active = start;
 	if (unlikely(!msg->msg_control)) {
 		/* This test isn't strictly necessary, but it provides a
 		 * hook for testing kernel call times.
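
Every hunk in this patch applies the same mechanical substitution: an access through the old homa_cores[] array becomes an access to the homa_offload_core per-CPU variable declared with DECLARE_PER_CPU above and defined with DEFINE_PER_CPU in homa_offload.c. A minimal sketch of the kernel per-CPU idiom being adopted (illustrative only; example_mark_active is a hypothetical helper):

	#include <linux/percpu.h>
	#include <linux/smp.h>

	/* One instance of the struct exists for each possible CPU. */
	DEFINE_PER_CPU(struct homa_offload_core, homa_offload_core);

	static void example_mark_active(__u64 now)
	{
		/* Fetch this core's instance. As in homa_sendmsg and
		 * homa_recvmsg above, raw_smp_processor_id() suffices
		 * because an occasionally stale core id is harmless
		 * for these load-balancing statistics.
		 */
		struct homa_offload_core *oc =
				&per_cpu(homa_offload_core,
					raw_smp_processor_id());

		oc->last_active = now;
	}

A side benefit, visible in the homa_utils.c hunks below: the per-CPU allocator replaces the hand-rolled vmalloc of core_memory and the manual 64-byte alignment that homa_init previously used to keep each core's data on private cache lines.
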
@@ -1245,7 +1245,7 @@ int homa_softirq(struct sk_buff *skb) start = get_cycles(); INC_METRIC(softirq_calls, 1); - homa_cores[raw_smp_processor_id()]->last_active = start; + per_cpu(homa_offload_core, raw_smp_processor_id()).last_active = start; if ((start - last) > 1000000) { int scaled_ms = (int) (10*(start-last)/cpu_khz); @@ -1393,7 +1393,7 @@ int homa_softirq(struct sk_buff *skb) packets = other_pkts; } - atomic_dec(&homa_cores[raw_smp_processor_id()]->softirq_backlog); + atomic_dec(&per_cpu(homa_offload_core, raw_smp_processor_id()).softirq_backlog); INC_METRIC(softirq_cycles, get_cycles() - start); return 0; } @@ -1621,7 +1621,7 @@ int homa_sysctl_softirq_cores(struct ctl_table *table, int write, { int result, i; struct ctl_table table_copy; - struct homa_core *core; + struct homa_offload_core *offload_core; int max_values, *values; max_values = (NUM_GEN3_SOFTIRQ_CORES + 1) * nr_cpu_ids; @@ -1647,9 +1647,9 @@ int homa_sysctl_softirq_cores(struct ctl_table *table, int write, if (values[i] < 0) break; - core = homa_cores[values[i]]; + offload_core = &per_cpu(homa_offload_core, values[i]); for (j = 0; j < NUM_GEN3_SOFTIRQ_CORES; j++) - core->gen3_softirq_cores[j] = values[i+j+1]; + offload_core->gen3_softirq_cores[j] = values[i+j+1]; } } else { /* Read: return values from all of the cores. */ @@ -1663,9 +1663,9 @@ int homa_sysctl_softirq_cores(struct ctl_table *table, int write, *dst = i; dst++; table_copy.maxlen += sizeof(int); - core = homa_cores[i]; + offload_core = &per_cpu(homa_offload_core, i); for (j = 0; j < NUM_GEN3_SOFTIRQ_CORES; j++) { - *dst = core->gen3_softirq_cores[j]; + *dst = offload_core->gen3_softirq_cores[j]; dst++; table_copy.maxlen += sizeof(int); } diff --git a/homa_utils.c b/homa_utils.c index 953dfe4d..377411cc 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -9,14 +9,6 @@ #include "homa_rpc.h" #include "homa_skb.h" -/* Core-specific information. NR_CPUS is an overestimate of the actual - * number, but allows us to allocate the array statically. - */ -struct homa_core *homa_cores[NR_CPUS]; - -/* Points to block of memory holding all homa_cores; used to free it. */ -char *core_memory; - struct completion homa_pacer_kthread_done; /** @@ -29,43 +21,11 @@ struct completion homa_pacer_kthread_done; */ int homa_init(struct homa *homa) { - size_t aligned_size; - char *first; int i, err; _Static_assert(HOMA_MAX_PRIORITIES >= 8, "homa_init assumes at least 8 priority levels"); - /* Initialize core-specific info (if no-one else has already done it), - * making sure that each core has private cache lines. 
- */ - if (!core_memory) { - aligned_size = (sizeof(struct homa_core) + 0x3f) & ~0x3f; - core_memory = vmalloc(0x3f + (nr_cpu_ids*aligned_size)); - if (!core_memory) { - pr_err("Homa couldn't allocate memory for core-specific data\n"); - return -ENOMEM; - } - first = (char *) (((__u64) core_memory + 0x3f) & ~0x3f); - for (i = 0; i < nr_cpu_ids; i++) { - struct homa_core *core; - int j; - - core = (struct homa_core *) (first + i*aligned_size); - homa_cores[i] = core; - core->last_active = 0; - core->last_gro = 0; - atomic_set(&core->softirq_backlog, 0); - core->softirq_offset = 0; - core->gen3_softirq_cores[0] = i^1; - for (j = 1; j < NUM_GEN3_SOFTIRQ_CORES; j++) - core->gen3_softirq_cores[j] = -1; - core->last_app_active = 0; - core->held_skb = NULL; - core->held_bucket = 0; - } - } - homa->pacer_kthread = NULL; init_completion(&homa_pacer_kthread_done); atomic64_set(&homa->next_outgoing_id, 2); @@ -180,8 +140,6 @@ int homa_init(struct homa *homa) */ void homa_destroy(struct homa *homa) { - int i; - if (homa->pacer_kthread) { homa_pacer_stop(homa); wait_for_completion(&homa_pacer_kthread_done); @@ -193,13 +151,6 @@ void homa_destroy(struct homa *homa) homa_peertab_destroy(homa->peers); kfree(homa->peers); homa_skb_cleanup(homa); - - if (core_memory) { - vfree(core_memory); - core_memory = NULL; - for (i = 0; i < nr_cpu_ids; i++) - homa_cores[i] = NULL; - } kfree(homa->metrics); } diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index ccdfd313..22754cd0 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -3,6 +3,7 @@ */ #include "homa_impl.h" +#include "homa_offload.h" #include "homa_peer.h" #include "homa_pool.h" #define KSELFTEST_NOT_MAIN 1 @@ -2387,9 +2388,9 @@ TEST_F(homa_incoming, homa_choose_interest__find_idle_core) mock_cycles = 5000; self->homa.busy_cycles = 1000; - homa_cores[1]->last_active = 4100; - homa_cores[2]->last_active = 3500; - homa_cores[3]->last_active = 2000; + per_cpu(homa_offload_core, 1).last_active = 4100; + per_cpu(homa_offload_core, 2).last_active = 3500; + per_cpu(homa_offload_core, 3).last_active = 2000; struct homa_interest *result = homa_choose_interest(&self->homa, &self->hsk.request_interests, @@ -2413,9 +2414,9 @@ TEST_F(homa_incoming, homa_choose_interest__all_cores_busy) mock_cycles = 5000; self->homa.busy_cycles = 1000; - homa_cores[1]->last_active = 4100; - homa_cores[2]->last_active = 4001; - homa_cores[3]->last_active = 4800; + per_cpu(homa_offload_core, 1).last_active = 4100; + per_cpu(homa_offload_core, 2).last_active = 4001; + per_cpu(homa_offload_core, 3).last_active = 4800; struct homa_interest *result = homa_choose_interest(&self->homa, &self->hsk.request_interests, @@ -2607,10 +2608,10 @@ TEST_F(homa_incoming, homa_rpc_handoff__update_last_app_active) interest.core = 2; crpc->interest = &interest; mock_cycles = 10000; - homa_cores[2]->last_app_active = 444; + per_cpu(homa_offload_core, 2).last_app_active = 444; homa_rpc_handoff(crpc); EXPECT_STREQ("wake_up_process pid 0", unit_log_get()); - EXPECT_EQ(10000, homa_cores[2]->last_app_active); + EXPECT_EQ(10000, per_cpu(homa_offload_core, 2).last_app_active); atomic_andnot(RPC_HANDING_OFF, &crpc->flags); } diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index 3531fb34..9691bedf 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -3,6 +3,7 @@ */ #include "homa_impl.h" +#include "homa_offload.h" #include "homa_rpc.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" @@ -10,6 +11,8 @@ #include "mock.h" #include 
"utils.h" +#define cur_offload_core (&per_cpu(homa_offload_core, raw_smp_processor_id())) + extern struct homa *homa; static struct sk_buff *tcp_gro_receive(struct list_head *held_list, @@ -82,18 +85,20 @@ FIXTURE_SETUP(homa_offload) inet_offloads[IPPROTO_TCP] = &self->tcp_offloads; self->tcp6_offloads.callbacks.gro_receive = tcp6_gro_receive; inet6_offloads[IPPROTO_TCP] = &self->tcp6_offloads; + homa_offload_init(); unit_log_clear(); /* Configure so core isn't considered too busy for bypasses. */ mock_cycles = 1000; self->homa.gro_busy_cycles = 500; - cur_core->last_gro = 400; + cur_offload_core->last_gro = 400; } FIXTURE_TEARDOWN(homa_offload) { struct sk_buff *skb, *tmp; + homa_offload_end(); list_for_each_entry_safe(skb, tmp, &self->napi.gro_hash[2].list, list) kfree_skb(skb); homa_destroy(&self->homa); @@ -160,10 +165,10 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_homa_ipv6) h->flags = HOMA_TCP_FLAGS; h->urgent = htons(HOMA_TCP_URGENT); NAPI_GRO_CB(skb)->same_flow = 0; - cur_core->held_skb = NULL; - cur_core->held_bucket = 99; + cur_offload_core->held_skb = NULL; + cur_offload_core->held_bucket = 99; EXPECT_EQ(NULL, homa_tcp_gro_receive(&self->empty_list, skb)); - EXPECT_EQ(skb, cur_core->held_skb); + EXPECT_EQ(skb, cur_offload_core->held_skb); EXPECT_STREQ("", unit_log_get()); EXPECT_EQ(IPPROTO_HOMA, ipv6_hdr(skb)->nexthdr); kfree_skb(skb); @@ -182,10 +187,10 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_homa_ipv4) h->flags = HOMA_TCP_FLAGS; h->urgent = htons(HOMA_TCP_URGENT); NAPI_GRO_CB(skb)->same_flow = 0; - cur_core->held_skb = NULL; - cur_core->held_bucket = 99; + cur_offload_core->held_skb = NULL; + cur_offload_core->held_bucket = 99; EXPECT_EQ(NULL, homa_tcp_gro_receive(&self->empty_list, skb)); - EXPECT_EQ(skb, cur_core->held_skb); + EXPECT_EQ(skb, cur_offload_core->held_skb); EXPECT_STREQ("", unit_log_get()); EXPECT_EQ(IPPROTO_HOMA, ip_hdr(skb)->protocol); EXPECT_EQ(2303, ip_hdr(skb)->check); @@ -221,8 +226,8 @@ TEST_F(homa_offload, homa_gro_receive__update_offset_from_sequence) self->header.seg.offset = -1; skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); NAPI_GRO_CB(skb)->same_flow = 0; - cur_core->held_skb = NULL; - cur_core->held_bucket = 99; + cur_offload_core->held_skb = NULL; + cur_offload_core->held_bucket = 99; EXPECT_EQ(NULL, homa_gro_receive(&self->empty_list, skb)); h = (struct data_header *) skb_transport_header(skb); EXPECT_EQ(6000, htonl(h->seg.offset)); @@ -274,7 +279,7 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS) * than one packet. */ self->homa.gro_policy |= HOMA_GRO_SHORT_BYPASS; - cur_core->last_gro = 400; + cur_offload_core->last_gro = 400; skb2 = mock_skb_new(&self->ip, &h.common, 1400, 2000); result = homa_gro_receive(&self->empty_list, skb2); EXPECT_EQ(0, -PTR_ERR(result)); @@ -283,14 +288,14 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS) /* Third attempt: bypass should happen. */ h.message_length = htonl(1400); h.incoming = htonl(1400); - cur_core->last_gro = 400; + cur_offload_core->last_gro = 400; skb3 = mock_skb_new(&self->ip, &h.common, 1400, 4000); result = homa_gro_receive(&self->empty_list, skb3); EXPECT_EQ(EINPROGRESS, -PTR_ERR(result)); EXPECT_EQ(1, homa_metrics_per_cpu()->gro_data_bypasses); /* Third attempt: no bypass because core busy. 
*/ - cur_core->last_gro = 600; + cur_offload_core->last_gro = 600; skb4 = mock_skb_new(&self->ip, &h.common, 1400, 4000); result = homa_gro_receive(&self->empty_list, skb3); EXPECT_EQ(0, -PTR_ERR(result)); @@ -332,7 +337,7 @@ TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization) /* Second attempt: HOMA_FAST_GRANTS is enabled. */ self->homa.gro_policy = HOMA_GRO_FAST_GRANTS; - cur_core->last_gro = 400; + cur_offload_core->last_gro = 400; struct sk_buff *skb2 = mock_skb_new(&client_ip, &h.common, 0, 0); result = homa_gro_receive(&self->empty_list, skb2); EXPECT_EQ(EINPROGRESS, -PTR_ERR(result)); @@ -340,7 +345,7 @@ TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization) EXPECT_SUBSTR("xmit DATA 1400@10000", unit_log_get()); /* Third attempt: core is too busy for fast grants. */ - cur_core->last_gro = 600; + cur_offload_core->last_gro = 600; struct sk_buff *skb3 = mock_skb_new(&client_ip, &h.common, 0, 0); result = homa_gro_receive(&self->empty_list, skb3); EXPECT_EQ(0, -PTR_ERR(result)); @@ -356,13 +361,13 @@ TEST_F(homa_offload, homa_gro_receive__no_held_skb) skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); skb->hash = 2; NAPI_GRO_CB(skb)->same_flow = 0; - cur_core->held_skb = NULL; - cur_core->held_bucket = 2; + cur_offload_core->held_skb = NULL; + cur_offload_core->held_bucket = 2; EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro_hash[2].list, skb)); same_flow = NAPI_GRO_CB(skb)->same_flow; EXPECT_EQ(0, same_flow); - EXPECT_EQ(skb, cur_core->held_skb); - EXPECT_EQ(2, cur_core->held_bucket); + EXPECT_EQ(skb, cur_offload_core->held_skb); + EXPECT_EQ(2, cur_offload_core->held_bucket); kfree_skb(skb); } TEST_F(homa_offload, homa_gro_receive__empty_merge_list) @@ -373,13 +378,13 @@ TEST_F(homa_offload, homa_gro_receive__empty_merge_list) skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); skb->hash = 2; NAPI_GRO_CB(skb)->same_flow = 0; - cur_core->held_skb = self->skb; - cur_core->held_bucket = 3; + cur_offload_core->held_skb = self->skb; + cur_offload_core->held_bucket = 3; EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro_hash[2].list, skb)); same_flow = NAPI_GRO_CB(skb)->same_flow; EXPECT_EQ(0, same_flow); - EXPECT_EQ(skb, cur_core->held_skb); - EXPECT_EQ(2, cur_core->held_bucket); + EXPECT_EQ(skb, cur_offload_core->held_skb); + EXPECT_EQ(2, cur_offload_core->held_bucket); kfree_skb(skb); } TEST_F(homa_offload, homa_gro_receive__held_skb_not_in_merge_list) @@ -390,13 +395,13 @@ TEST_F(homa_offload, homa_gro_receive__held_skb_not_in_merge_list) skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); skb->hash = 3; NAPI_GRO_CB(skb)->same_flow = 0; - cur_core->held_skb = skb; - cur_core->held_bucket = 2; + cur_offload_core->held_skb = skb; + cur_offload_core->held_bucket = 2; EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro_hash[3].list, skb)); same_flow = NAPI_GRO_CB(skb)->same_flow; EXPECT_EQ(0, same_flow); - EXPECT_EQ(skb, cur_core->held_skb); - EXPECT_EQ(3, cur_core->held_bucket); + EXPECT_EQ(skb, cur_offload_core->held_skb); + EXPECT_EQ(3, cur_offload_core->held_bucket); kfree_skb(skb); } TEST_F(homa_offload, homa_gro_receive__held_skb__in_merge_list_but_wrong_proto) @@ -407,25 +412,25 @@ TEST_F(homa_offload, homa_gro_receive__held_skb__in_merge_list_but_wrong_proto) skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); skb->hash = 3; NAPI_GRO_CB(skb)->same_flow = 0; - cur_core->held_skb = self->skb; + cur_offload_core->held_skb = self->skb; if (skb_is_ipv6(self->skb)) ipv6_hdr(self->skb)->nexthdr = IPPROTO_TCP; else 
ip_hdr(self->skb)->protocol = IPPROTO_TCP; - cur_core->held_bucket = 2; + cur_offload_core->held_bucket = 2; EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro_hash[3].list, skb)); same_flow = NAPI_GRO_CB(skb)->same_flow; EXPECT_EQ(0, same_flow); - EXPECT_EQ(skb, cur_core->held_skb); - EXPECT_EQ(3, cur_core->held_bucket); + EXPECT_EQ(skb, cur_offload_core->held_skb); + EXPECT_EQ(3, cur_offload_core->held_bucket); kfree_skb(skb); } TEST_F(homa_offload, homa_gro_receive__merge) { struct sk_buff *skb, *skb2; int same_flow; - cur_core->held_skb = self->skb2; - cur_core->held_bucket = 2; + cur_offload_core->held_skb = self->skb2; + cur_offload_core->held_bucket = 2; self->header.seg.offset = htonl(6000); self->header.common.sender_id = cpu_to_be64(1002); @@ -460,8 +465,8 @@ TEST_F(homa_offload, homa_gro_receive__max_gro_skbs) // First packet: fits below the limit. homa->max_gro_skbs = 3; - cur_core->held_skb = self->skb2; - cur_core->held_bucket = 2; + cur_offload_core->held_skb = self->skb2; + cur_offload_core->held_bucket = 2; self->header.seg.offset = htonl(6000); skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); homa_gro_receive(&self->napi.gro_hash[3].list, skb); @@ -485,7 +490,7 @@ TEST_F(homa_offload, homa_gro_receive__max_gro_skbs) // Third packet also hits the limit for skb, causing the bucket // to become empty. homa->max_gro_skbs = 2; - cur_core->held_skb = self->skb; + cur_offload_core->held_skb = self->skb; skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); unit_log_clear(); EXPECT_EQ(EINPROGRESS, -PTR_ERR(homa_gro_receive( @@ -504,112 +509,117 @@ TEST_F(homa_offload, homa_gro_gen2) mock_cycles = 1000; homa->busy_cycles = 100; mock_set_core(5); - atomic_set(&homa_cores[6]->softirq_backlog, 1); - homa_cores[6]->last_gro = 0; - atomic_set(&homa_cores[7]->softirq_backlog, 0); - homa_cores[7]->last_gro = 901; - atomic_set(&homa_cores[0]->softirq_backlog, 2); - homa_cores[0]->last_gro = 0; - atomic_set(&homa_cores[1]->softirq_backlog, 0); - homa_cores[1]->last_gro = 899; - atomic_set(&homa_cores[2]->softirq_backlog, 0); - homa_cores[2]->last_gro = 0; + atomic_set(&per_cpu(homa_offload_core, 6).softirq_backlog, 1); + per_cpu(homa_offload_core, 6).last_gro = 0; + atomic_set(&per_cpu(homa_offload_core, 7).softirq_backlog, 0); + per_cpu(homa_offload_core, 7).last_gro = 901; + atomic_set(&per_cpu(homa_offload_core, 0).softirq_backlog, 2); + per_cpu(homa_offload_core, 0).last_gro = 0; + atomic_set(&per_cpu(homa_offload_core, 1).softirq_backlog, 0); + per_cpu(homa_offload_core, 1).last_gro = 899; + atomic_set(&per_cpu(homa_offload_core, 2).softirq_backlog, 0); + per_cpu(homa_offload_core, 2).last_gro = 0; // Avoid busy cores. homa_gro_complete(self->skb, 0); EXPECT_EQ(1, self->skb->hash - 32); - EXPECT_EQ(1, atomic_read(&homa_cores[1]->softirq_backlog)); + EXPECT_EQ(1, atomic_read(&per_cpu(homa_offload_core, 1).softirq_backlog)); // All cores busy; must rotate. 
homa_gro_complete(self->skb, 0); EXPECT_EQ(6, self->skb->hash - 32); homa_gro_complete(self->skb, 0); EXPECT_EQ(7, self->skb->hash - 32); - EXPECT_EQ(2, homa_cores[5]->softirq_offset); + EXPECT_EQ(2, per_cpu(homa_offload_core, 5).softirq_offset); homa_gro_complete(self->skb, 0); EXPECT_EQ(0, self->skb->hash - 32); homa_gro_complete(self->skb, 0); EXPECT_EQ(1, self->skb->hash - 32); homa_gro_complete(self->skb, 0); EXPECT_EQ(6, self->skb->hash - 32); - EXPECT_EQ(1, homa_cores[5]->softirq_offset); + EXPECT_EQ(1, per_cpu(homa_offload_core, 5).softirq_offset); } TEST_F(homa_offload, homa_gro_gen3__basics) { + struct homa_offload_core *offload_core = cur_offload_core; + struct homa_offload_core *offload3 = &per_cpu(homa_offload_core, 3); + struct homa_offload_core *offload5 = &per_cpu(homa_offload_core, 5); + struct homa_offload_core *offload7 = &per_cpu(homa_offload_core, 7); + homa->gro_policy = HOMA_GRO_GEN3; - struct homa_core *core = cur_core; - core->gen3_softirq_cores[0] = 3; - core->gen3_softirq_cores[1] = 7; - core->gen3_softirq_cores[2] = 5; - homa_cores[3]->last_app_active = 4100; - homa_cores[7]->last_app_active = 3900; - homa_cores[5]->last_app_active = 2000; + offload_core->gen3_softirq_cores[0] = 3; + offload_core->gen3_softirq_cores[1] = 7; + offload_core->gen3_softirq_cores[2] = 5; + offload3->last_app_active = 4100; + offload7->last_app_active = 3900; + offload5->last_app_active = 2000; mock_cycles = 5000; self->homa.busy_cycles = 1000; homa_gro_complete(self->skb, 0); EXPECT_EQ(7, self->skb->hash - 32); - EXPECT_EQ(0, homa_cores[3]->last_active); - EXPECT_EQ(5000, homa_cores[7]->last_active); + EXPECT_EQ(0, offload3->last_active); + EXPECT_EQ(5000, offload7->last_active); } TEST_F(homa_offload, homa_gro_gen3__stop_on_negative_core_id) { homa->gro_policy = HOMA_GRO_GEN3; - struct homa_core *core = cur_core; - core->gen3_softirq_cores[0] = 3; - core->gen3_softirq_cores[1] = -1; - core->gen3_softirq_cores[2] = 5; - homa_cores[3]->last_app_active = 4100; - homa_cores[5]->last_app_active = 2000; + struct homa_offload_core *offload_core = cur_offload_core; + offload_core->gen3_softirq_cores[0] = 3; + offload_core->gen3_softirq_cores[1] = -1; + offload_core->gen3_softirq_cores[2] = 5; + per_cpu(homa_offload_core, 3).last_app_active = 4100; + per_cpu(homa_offload_core, 5).last_app_active = 2000; mock_cycles = 5000; self->homa.busy_cycles = 1000; homa_gro_complete(self->skb, 0); EXPECT_EQ(3, self->skb->hash - 32); - EXPECT_EQ(5000, homa_cores[3]->last_active); + EXPECT_EQ(5000, per_cpu(homa_offload_core, 3).last_active); } TEST_F(homa_offload, homa_gro_gen3__all_cores_busy_so_pick_first) { homa->gro_policy = HOMA_GRO_GEN3; - struct homa_core *core = cur_core; - core->gen3_softirq_cores[0] = 3; - core->gen3_softirq_cores[1] = 7; - core->gen3_softirq_cores[2] = 5; - homa_cores[3]->last_app_active = 4100; - homa_cores[7]->last_app_active = 4001; - homa_cores[5]->last_app_active = 4500; + struct homa_offload_core *offload_core = cur_offload_core; + offload_core->gen3_softirq_cores[0] = 3; + offload_core->gen3_softirq_cores[1] = 7; + offload_core->gen3_softirq_cores[2] = 5; + per_cpu(homa_offload_core, 3).last_app_active = 4100; + per_cpu(homa_offload_core, 7).last_app_active = 4001; + per_cpu(homa_offload_core, 5).last_app_active = 4500; mock_cycles = 5000; self->homa.busy_cycles = 1000; homa_gro_complete(self->skb, 0); EXPECT_EQ(3, self->skb->hash - 32); - EXPECT_EQ(5000, homa_cores[3]->last_active); + EXPECT_EQ(5000, per_cpu(homa_offload_core, 3).last_active); } TEST_F(homa_offload, 
homa_gro_complete__clear_held_skb) { - struct homa_core *core = homa_cores[raw_smp_processor_id()]; + struct homa_offload_core *offload_core = &per_cpu(homa_offload_core, + raw_smp_processor_id()); - core->held_skb = self->skb2; + offload_core->held_skb = self->skb2; homa_gro_complete(self->skb, 0); - EXPECT_EQ(NULL, core->held_skb); + EXPECT_EQ(NULL, offload_core->held_skb); } TEST_F(homa_offload, homa_gro_complete__GRO_IDLE) { homa->gro_policy = HOMA_GRO_IDLE; - homa_cores[6]->last_active = 30; - homa_cores[7]->last_active = 25; - homa_cores[0]->last_active = 20; - homa_cores[1]->last_active = 15; - homa_cores[2]->last_active = 10; + per_cpu(homa_offload_core, 6).last_active = 30; + per_cpu(homa_offload_core, 7).last_active = 25; + per_cpu(homa_offload_core, 0).last_active = 20; + per_cpu(homa_offload_core, 1).last_active = 15; + per_cpu(homa_offload_core, 2).last_active = 10; mock_set_core(5); homa_gro_complete(self->skb, 0); EXPECT_EQ(1, self->skb->hash - 32); - homa_cores[6]->last_active = 5; + per_cpu(homa_offload_core, 6).last_active = 5; mock_set_core(5); homa_gro_complete(self->skb, 0); EXPECT_EQ(6, self->skb->hash - 32); diff --git a/test/utils.h b/test/utils.h index 5825bbc7..dd741f6b 100644 --- a/test/utils.h +++ b/test/utils.h @@ -32,8 +32,6 @@ enum unit_rpc_state { UNIT_IN_SERVICE = 24, }; -#define cur_core homa_cores[raw_smp_processor_id()] - extern char *unit_ack_string(struct homa_ack *ack); extern struct homa_rpc *unit_client_rpc(struct homa_sock *hsk, From 4887e2be1a69dee9e075174daf0fe5119d2aaa85 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Sun, 6 Oct 2024 10:29:03 -0700 Subject: [PATCH 035/625] Fix bug in commit 58f6572bc3 (was deleting hsk->buffer_pool before last usage) --- homa_sock.c | 9 ++++++--- homa_utils.c | 3 +++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/homa_sock.c b/homa_sock.c index 7f3dfeae..d6ce989a 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -186,6 +186,8 @@ void homa_sock_shutdown(struct homa_sock *hsk) * synchronize with any operations in progress. * 4. Perform other socket cleanup: at this point we know that * there will be no concurrent activities on individual RPCs. + * 5. Don't delete the buffer pool until after all of the RPCs + * have been reaped. * See sync.txt for additional information about locking. */ hsk->shutdown = true; @@ -207,9 +209,6 @@ void homa_sock_shutdown(struct homa_sock *hsk) wake_up_process(interest->thread); homa_sock_unlock(hsk); - homa_pool_destroy(hsk->buffer_pool); - kfree(hsk->buffer_pool); - i = 0; while (!list_empty(&hsk->dead_rpcs)) { homa_rpc_reap(hsk, 1000); @@ -219,6 +218,10 @@ void homa_sock_shutdown(struct homa_sock *hsk) tt_freeze(); } } + + homa_pool_destroy(hsk->buffer_pool); + kfree(hsk->buffer_pool); + hsk->buffer_pool = NULL; } /** diff --git a/homa_utils.c b/homa_utils.c index 377411cc..4da1ec79 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -148,10 +148,13 @@ void homa_destroy(struct homa *homa) /* The order of the following 2 statements matters! 
*/ homa_socktab_destroy(homa->port_map); kfree(homa->port_map); + homa->port_map = NULL; homa_peertab_destroy(homa->peers); kfree(homa->peers); + homa->peers = NULL; homa_skb_cleanup(homa); kfree(homa->metrics); + homa->metrics = NULL; } /** From 434197752779968fcc02ce3f491d87492999113a Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Sun, 6 Oct 2024 11:12:38 -0700 Subject: [PATCH 036/625] Remove/clean up unused protocol functions --- homa_impl.h | 4 ---- homa_plumbing.c | 56 ++----------------------------------------------- test/mock.c | 6 ------ 3 files changed, 2 insertions(+), 64 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 199b0193..8e6e0003 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -1101,7 +1101,6 @@ extern void homa_data_from_server(struct sk_buff *skb, struct homa_rpc *crpc); extern void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc); extern void homa_destroy(struct homa *homa); -extern int homa_diag_destroy(struct sock *sk, int err); extern int homa_disconnect(struct sock *sk, int flags); extern void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa); extern int homa_dointvec(struct ctl_table *table, int write, @@ -1156,7 +1155,6 @@ extern int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); extern int homa_register_interests(struct homa_interest *interest, struct homa_sock *hsk, int flags, __u64 id); -extern void homa_rehash(struct sock *sk); extern void homa_remove_from_throttled(struct homa_rpc *rpc); extern void homa_resend_data(struct homa_rpc *rpc, int start, int end, int priority); @@ -1184,8 +1182,6 @@ extern void homa_unhash(struct sock *sk); extern void homa_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc); extern int homa_unsched_priority(struct homa *homa, struct homa_peer *peer, int length); -extern int homa_v4_early_demux(struct sk_buff *skb); -extern int homa_v4_early_demux_handler(struct sk_buff *skb); extern int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors); extern struct homa_rpc diff --git a/homa_plumbing.c b/homa_plumbing.c index bed24531..27aeeeb6 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -108,16 +108,13 @@ struct proto homa_prot = { .sendmsg = homa_sendmsg, .recvmsg = homa_recvmsg, .backlog_rcv = homa_backlog_rcv, - .release_cb = ip4_datagram_release_cb, .hash = homa_hash, .unhash = homa_unhash, - .rehash = homa_rehash, .get_port = homa_get_port, .sysctl_mem = sysctl_homa_mem, .sysctl_wmem = &sysctl_homa_wmem_min, .sysctl_rmem = &sysctl_homa_rmem_min, .obj_size = sizeof(struct homa_sock), - .diag_destroy = homa_diag_destroy, .no_autobind = 1, }; @@ -135,10 +132,8 @@ struct proto homav6_prot = { .sendmsg = homa_sendmsg, .recvmsg = homa_recvmsg, .backlog_rcv = homa_backlog_rcv, - .release_cb = ip6_datagram_release_cb, .hash = homa_hash, .unhash = homa_unhash, - .rehash = homa_rehash, .get_port = homa_get_port, .sysctl_mem = sysctl_homa_mem, .sysctl_wmem = &sysctl_homa_wmem_min, @@ -148,7 +143,6 @@ struct proto homav6_prot = { * struct homa_sock. */ .obj_size = sizeof(struct homa_sock) + sizeof(struct ipv6_pinfo), - .diag_destroy = homa_diag_destroy, .no_autobind = 1, }; @@ -1148,32 +1142,21 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, } /** - * homa_hash() - ??. + * homa_hash() - Not needed for Homa. * @sk: Socket for the operation * Return: ?? */ int homa_hash(struct sock *sk) { - pr_warn("unimplemented hash invoked on Homa socket\n"); return 0; } /** - * homa_unhash() - ??. 
+ * homa_unhash() - Not needed for Homa.
  * @sk:    Socket for the operation
  */
 void homa_unhash(struct sock *sk)
 {
-	pr_warn("unimplemented unhash invoked on Homa socket\n");
-}
-
-/**
- * homa_rehash() - ??.
- * @sk:    Socket for the operation
- */
-void homa_rehash(struct sock *sk)
-{
-	pr_warn("unimplemented rehash invoked on Homa socket\n");
 }
 
 /**
@@ -1191,41 +1174,6 @@ int homa_get_port(struct sock *sk, unsigned short snum)
 	return 0;
 }
 
-/**
- * homa_diag_destroy() - ??.
- * @sk:    Socket for the operation
- * @err:   ??
- * Return: ??
- */
-int homa_diag_destroy(struct sock *sk, int err)
-{
-	pr_warn("unimplemented diag_destroy invoked on Homa socket\n");
-	return -EINVAL;
-
-}
-
-/**
- * homa_v4_early_demux() - Invoked by IP for ??.
- * @skb: Socket buffer.
- * Return: Always 0?
- */
-int homa_v4_early_demux(struct sk_buff *skb)
-{
-	pr_warn("unimplemented early_demux invoked on Homa socket\n");
-	return 0;
-}
-
-/**
- * homa_v4_early_demux_handler() - invoked by IP for ??.
- * @skb: Socket buffer.
- * @return: Always 0?
- */
-int homa_v4_early_demux_handler(struct sk_buff *skb)
-{
-	pr_warn("unimplemented early_demux_handler invoked on Homa socket\n");
-	return 0;
-}
-
 /**
  * homa_softirq() - This function is invoked at SoftIRQ level to handle
  * incoming packets.
diff --git a/test/mock.c b/test/mock.c
index ba009f77..c027f1ea 100644
--- a/test/mock.c
+++ b/test/mock.c
@@ -545,10 +545,6 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len)
 	return 0;
 }
 
-void ip6_datagram_release_cb(struct sock *sk)
-{
-}
-
 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk,
 		struct flowi6 *fl6, const struct in6_addr *final_dst)
 {
@@ -672,8 +668,6 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr,
 	return 0;
 }
 
-void ip4_datagram_release_cb(struct sock *sk) {}
-
 int filp_close(struct file *, fl_owner_t id)
 {
 	return 0;
From 407e3402899cc751ca9b643773344e32d1fc1954 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Sun, 6 Oct 2024 11:13:02 -0700
Subject: [PATCH 037/625] Update notes.txt

---
 notes.txt | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/notes.txt b/notes.txt
index 4109b406..4bcdc4a2 100755
--- a/notes.txt
+++ b/notes.txt
@@ -1,11 +1,6 @@
 Notes for Homa implementation in Linux:
 ---------------------------------------
 
-* Notes on splitting homa_impl.h:
-  * Remove homa_wire.h include from homa_impl.h?
-  * Where should homa_rpc_bucket be declared (currently in homa_impl.h)?
-  * Move homa_message_in_init to homa_rpc.c?
-
 * Thoughts on making TCP and Homa play better together:
   * Goals:
     * Keep the NIC tx queue from growing long.
From baa97c695eedbb628565b3e251ba8fee13dac908 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 7 Oct 2024 11:10:45 -0700
Subject: [PATCH 038/625] Update perf.txt

---
 perf.txt | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/perf.txt b/perf.txt
index 4b10fb0b..55458fe8 100644
--- a/perf.txt
+++ b/perf.txt
@@ -31,11 +31,17 @@ order.
 
 Overall observations:
 * Short messages:
-    * Homa: 10x increase for P99, not much changes for P50
-    * TCP: 25-60% increases for both P50 and P99
+    * Homa: 2x increase for P50, 10x increase for P99
+    * TCP: 25% increase for P50, 10% increase for P99
+    * The TCP degradation is caused by Homa using priorities. If the
+      experiment is run without priorities for Homa, TCP's short-message
+      latencies are significantly better than those of TCP by itself:
+      571 us for P50, 3835 us for P99.
* Long messages: - * TCP latency improves up to 2x as Homa traffic share increases (perhaps - because Homa throttles itself to link speed?) + * TCP P50 and P99 latency drop by up to 40% as Homa traffic share + increases (perhaps because Homa throttles itself to link speed?) + * Running Homa without priorities improves TCP even more (2x gain for TCP + P50 and P99 under even traffic split, relative to TCP alone) * Homa latency not much affected * Other workloads: * W5 similar to W4 From dbff820b07cb4483bfd491857e405de4acef2a21 Mon Sep 17 00:00:00 2001 From: breakertt Date: Mon, 7 Oct 2024 11:30:44 -0700 Subject: [PATCH 039/625] Cleanup Makefile indentation Resolves #65 --- Makefile | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index 467a0ef7..25b50c45 100644 --- a/Makefile +++ b/Makefile @@ -4,19 +4,19 @@ ifneq ($(KERNELRELEASE),) obj-m += homa.o homa-y = homa_grant.o \ - homa_incoming.o \ - homa_metrics.o \ - homa_offload.o \ - homa_outgoing.o \ - homa_peer.o \ - homa_pool.o \ - homa_plumbing.o \ - homa_rpc.o \ - homa_skb.o \ - homa_sock.o \ - homa_timer.o \ - homa_utils.o \ - timetrace.o + homa_incoming.o \ + homa_metrics.o \ + homa_offload.o \ + homa_outgoing.o \ + homa_peer.o \ + homa_pool.o \ + homa_plumbing.o \ + homa_rpc.o \ + homa_skb.o \ + homa_sock.o \ + homa_timer.o \ + homa_utils.o \ + timetrace.o MY_CFLAGS += -g ccflags-y += ${MY_CFLAGS} From 63c5732c943de338c3c417d246a07008fd3fda92 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 8 Oct 2024 12:02:54 -0700 Subject: [PATCH 040/625] Implement Reverse Christmas Tree format for declarations --- homa_api.c | 4 +- homa_grant.c | 21 ++- homa_impl.h | 1 + homa_incoming.c | 54 +++---- homa_offload.c | 34 +++-- homa_outgoing.c | 33 +++-- homa_peer.c | 10 +- homa_peer.h | 2 +- homa_plumbing.c | 20 +-- homa_pool.c | 9 +- homa_receiver.cc | 3 +- homa_receiver.h | 1 + homa_rpc.c | 27 ++-- homa_skb.c | 33 +++-- homa_sock.c | 4 +- homa_timer.c | 20 +-- homa_utils.c | 21 +-- test/ccutils.cc | 10 +- test/main.c | 3 +- test/mock.c | 27 +++- test/unit_homa_grant.c | 56 +++++-- test/unit_homa_incoming.c | 298 +++++++++++++++++++++++++------------- test/unit_homa_metrics.c | 6 +- test/unit_homa_offload.c | 71 +++++---- test/unit_homa_outgoing.c | 192 +++++++++++++++--------- test/unit_homa_peer.c | 30 +++- test/unit_homa_plumbing.c | 93 ++++++++---- test/unit_homa_pool.c | 140 ++++++++++-------- test/unit_homa_rpc.c | 207 ++++++++++++++++---------- test/unit_homa_skb.c | 70 ++++++--- test/unit_homa_sock.c | 13 +- test/unit_homa_timer.c | 18 ++- test/unit_homa_utils.c | 7 +- test/unit_timetrace.c | 11 +- timetrace.c | 29 ++-- 35 files changed, 1008 insertions(+), 570 deletions(-) diff --git a/homa_api.c b/homa_api.c index 4969faca..72b32fe2 100644 --- a/homa_api.c +++ b/homa_api.c @@ -38,8 +38,8 @@ ssize_t homa_reply(int sockfd, const void *message_buf, size_t length, const union sockaddr_in_union *dest_addr, uint64_t id) { struct homa_sendmsg_args args; - struct iovec vec; struct msghdr hdr; + struct iovec vec; int result; args.id = id; @@ -117,8 +117,8 @@ int homa_send(int sockfd, const void *message_buf, size_t length, uint64_t completion_cookie) { struct homa_sendmsg_args args; - struct iovec vec; struct msghdr hdr; + struct iovec vec; int result; args.id = 0; diff --git a/homa_grant.c b/homa_grant.c index e152f49c..de92507b 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -68,10 +68,10 @@ inline int homa_grant_update_incoming(struct homa_rpc *rpc, struct homa *homa) */ 
void homa_grant_add_rpc(struct homa_rpc *rpc) { - struct homa_rpc *candidate; + struct homa *homa = rpc->hsk->homa; struct homa_peer *peer = rpc->peer; struct homa_peer *peer_cand; - struct homa *homa = rpc->hsk->homa; + struct homa_rpc *candidate; /* Make sure this message is in the right place in the grantable_rpcs * list for its peer. @@ -158,11 +158,11 @@ void homa_grant_add_rpc(struct homa_rpc *rpc) */ void homa_grant_remove_rpc(struct homa_rpc *rpc) { - struct homa_rpc *head; + struct homa *homa = rpc->hsk->homa; struct homa_peer *peer = rpc->peer; struct homa_rpc *candidate; - struct homa *homa = rpc->hsk->homa; __u64 time = get_cycles(); + struct homa_rpc *head; if (list_empty(&rpc->grantable_links)) return; @@ -382,9 +382,6 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) */ void homa_grant_recalc(struct homa *homa, int locked) { - int i, active, try_again; - __u64 start; - /* The tricky part of this method is that we need to release * homa->grantable_lock before actually sending grants, because * (a) we need to hold the RPC lock while sending grants, and @@ -393,6 +390,8 @@ void homa_grant_recalc(struct homa *homa, int locked) * This array hold a copy of homa->active_rpcs. */ struct homa_rpc *active_rpcs[HOMA_MAX_GRANTS]; + int i, active, try_again; + __u64 start; tt_record("homa_grant_recalc starting"); INC_METRIC(grant_recalc_calls, 1); @@ -423,8 +422,8 @@ void homa_grant_recalc(struct homa *homa, int locked) homa->max_overcommit); homa->num_active_rpcs = active; for (i = 0; i < active; i++) { - int extra_levels; struct homa_rpc *rpc = homa->active_rpcs[i]; + int extra_levels; active_rpcs[i] = rpc; atomic_inc(&rpc->grants_in_progress); @@ -563,10 +562,10 @@ int homa_grant_pick_rpcs(struct homa *homa, struct homa_rpc **rpcs, */ void homa_grant_find_oldest(struct homa *homa) { + int max_incoming = homa->grant_window + 2*homa->fifo_grant_increment; struct homa_rpc *rpc, *oldest; struct homa_peer *peer; __u64 oldest_birth; - int max_incoming = homa->grant_window + 2*homa->fifo_grant_increment; oldest = NULL; oldest_birth = ~0; @@ -647,9 +646,9 @@ void homa_grant_free_rpc(struct homa_rpc *rpc) */ int homa_grantable_lock_slow(struct homa *homa, int recalc) { - int result = 0; - __u64 start = get_cycles(); int starting_count = atomic_read(&homa->grant_recalc_count); + __u64 start = get_cycles(); + int result = 0; tt_record("beginning wait for grantable lock"); while (1) { diff --git a/homa_impl.h b/homa_impl.h index 8e6e0003..daefbf11 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -874,6 +874,7 @@ struct homa { */ int temp[4]; }; + /** * struct homa_skb_info - Additional information needed by Homa for each * outbound DATA packet. 
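The Reverse Christmas Tree convention applied throughout this patch is purely
about declaration order: within each function, local variable declarations are
sorted from the longest line down to the shortest. A minimal before/after
sketch, using declarations taken from the homa_new_data_packet hunk later in
this patch (not a complete function):

	/* Before: arbitrary order. */
	struct data_header *h;
	struct sk_buff *skb;
	struct homa_skb_info *homa_info;
	int segs, err, gso_size;

	/* After: reverse Christmas tree, longest line first. */
	struct homa_skb_info *homa_info;
	int segs, err, gso_size;
	struct data_header *h;
	struct sk_buff *skb;

A related detail visible in several hunks: when an initializer would make a
declaration line unwieldy (e.g. skb_core in homa_skb_extend_frags), the
declaration is left bare and the assignment moves into the function body.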
Space is allocated for this at the very end of the diff --git a/homa_incoming.c b/homa_incoming.c index 18e511d0..41cc9daa 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -80,8 +80,8 @@ struct homa_gap *homa_gap_new(struct list_head *next, int start, int end) */ void homa_gap_retry(struct homa_rpc *rpc) { - struct homa_gap *gap; struct resend_header resend; + struct homa_gap *gap; list_for_each_entry(gap, &rpc->msgin.gaps, links) { resend.offset = htonl(gap->start); @@ -103,10 +103,10 @@ void homa_gap_retry(struct homa_rpc *rpc) void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) { struct data_header *h = (struct data_header *) skb->data; + struct homa_gap *gap, *dummy, *gap2; int start = ntohl(h->seg.offset); int length = homa_data_len(skb); int end = start + length; - struct homa_gap *gap, *dummy, *gap2; if ((start + length) > rpc->msgin.length) { tt_record3("Packet extended past message end; id %d, offset %d, length %d", @@ -210,12 +210,12 @@ int homa_copy_to_user(struct homa_rpc *rpc) #define MAX_SKBS 20 #endif struct sk_buff *skbs[MAX_SKBS]; - int n = 0; /* Number of filled entries in skbs. */ - int error = 0; int start_offset = 0; int end_offset = 0; - int i; + int error = 0; __u64 start; + int n = 0; /* Number of filled entries in skbs. */ + int i; /* Tricky note: we can't hold the RPC lock while we're actually * copying to user space, because (a) it's illegal to hold a spinlock @@ -256,12 +256,12 @@ int homa_copy_to_user(struct homa_rpc *rpc) for (i = 0; i < n; i++) { struct data_header *h = (struct data_header *) skbs[i]->data; - int offset = ntohl(h->seg.offset); int pkt_length = homa_data_len(skbs[i]); + int offset = ntohl(h->seg.offset); + int buf_bytes, chunk_size; + struct iov_iter iter; int copied = 0; char *dst; - struct iov_iter iter; - int buf_bytes, chunk_size; /* Each iteration of this loop copies to one * user buffer. @@ -335,19 +335,16 @@ int homa_copy_to_user(struct homa_rpc *rpc) */ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) { - const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); - struct data_header *h = (struct data_header *) skb->data; - __u64 id = homa_local_id(h->common.sender_id); - int dport = ntohs(h->common.dport); - struct homa_sock *hsk; - struct homa_rpc *rpc = NULL; - struct sk_buff *next; - #ifdef __UNIT_TEST__ #define MAX_ACKS 2 #else #define MAX_ACKS 10 #endif + const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); + struct data_header *h = (struct data_header *) skb->data; + __u64 id = homa_local_id(h->common.sender_id); + int dport = ntohs(h->common.dport); + /* Used to collect acks from data packets so we can process them * all at the end (can't process them inline because that may * require locking conflicting RPCs). If we run out of space just @@ -355,6 +352,9 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) * explicit mechanism. 
*/ struct homa_ack acks[MAX_ACKS]; + struct homa_rpc *rpc = NULL; + struct homa_sock *hsk; + struct sk_buff *next; int num_acks = 0; /* Find the appropriate socket.*/ @@ -536,8 +536,8 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) */ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) { - struct homa *homa = rpc->hsk->homa; struct data_header *h = (struct data_header *) skb->data; + struct homa *homa = rpc->hsk->homa; tt_record4("incoming data packet, id %d, peer 0x%x, offset %d/%d", homa_local_id(h->common.sender_id), @@ -760,12 +760,12 @@ void homa_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc) */ void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk) { + struct cutoffs_header *h = (struct cutoffs_header *) skb->data; const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); + struct homa_peer *peer; int i; - struct cutoffs_header *h = (struct cutoffs_header *) skb->data; - struct homa_peer *peer = homa_peer_find(hsk->homa->peers, - &saddr, &hsk->inet); + peer = homa_peer_find(hsk->homa->peers, &saddr, &hsk->inet); if (!IS_ERR(peer)) { peer->unsched_cutoffs[0] = INT_MAX; for (i = 1; i < HOMA_MAX_PRIORITIES; i++) @@ -789,8 +789,8 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, struct common_header *h = (struct common_header *) skb->data; const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); __u64 id = homa_local_id(h->sender_id); - struct ack_header ack; struct homa_peer *peer; + struct ack_header ack; tt_record1("Received NEED_ACK for id %d", id); @@ -843,8 +843,8 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, struct homa_rpc *rpc) { - struct ack_header *h = (struct ack_header *) skb->data; const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); + struct ack_header *h = (struct ack_header *) skb->data; int i, count; if (rpc != NULL) { @@ -983,8 +983,8 @@ void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, int port, int error) { struct homa_socktab_scan scan; - struct homa_sock *hsk; struct homa_rpc *rpc, *tmp; + struct homa_sock *hsk; rcu_read_lock(); for (hsk = homa_socktab_start_scan(homa->port_map, &scan); @@ -1181,11 +1181,11 @@ int homa_register_interests(struct homa_interest *interest, struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, __u64 id) { + int error, blocked = 0, polled = 0; struct homa_rpc *result = NULL; struct homa_interest interest; struct homa_rpc *rpc = NULL; uint64_t poll_start, now; - int error, blocked = 0, polled = 0; /* Each iteration of this loop finds an RPC, but it might not be * in a state where we can return it (e.g., there might be packets @@ -1375,10 +1375,10 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, struct homa_interest *homa_choose_interest(struct homa *homa, struct list_head *head, int offset) { + __u64 busy_time = get_cycles() - homa->busy_cycles; struct homa_interest *backup = NULL; - struct list_head *pos; struct homa_interest *interest; - __u64 busy_time = get_cycles() - homa->busy_cycles; + struct list_head *pos; list_for_each(pos, head) { interest = (struct homa_interest *) (((char *) pos) - offset); @@ -1405,8 +1405,8 @@ struct homa_interest *homa_choose_interest(struct homa *homa, */ void homa_rpc_handoff(struct homa_rpc *rpc) { - struct homa_interest *interest; struct homa_sock *hsk = rpc->hsk; + struct homa_interest *interest; if ((atomic_read(&rpc->flags) & RPC_HANDING_OFF) || 
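A note on the acks[MAX_ACKS] array whose declaration moved in
homa_dispatch_pkts above: it supports a two-phase pattern in which acks are
copied out of data packets while the current RPC's lock is held and processed
only after that lock is dropped, since handling an ack may require locking a
different (conflicting) RPC. A condensed sketch of the idea (field names
approximate; not the full dispatch loop):

	struct homa_ack acks[MAX_ACKS];
	int num_acks = 0;

	/* Pass 1 (RPC lock held): just stash the ack carried in each
	 * data packet.
	 */
	if (h->ack.client_id && num_acks < MAX_ACKS)
		acks[num_acks++] = h->ack;

	/* Pass 2 (lock released): now it is safe to lock other RPCs. */
	for (i = 0; i < num_acks; i++)
		homa_rpc_acked(hsk, &saddr, &acks[i]);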
!list_empty(&rpc->ready_links)) diff --git a/homa_offload.c b/homa_offload.c index db813f7a..9111c14d 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -74,8 +74,8 @@ int homa_offload_init(void) */ int homa_offload_end(void) { - int res1 = inet_del_offload(&homa_offload, IPPROTO_HOMA); int res2 = inet6_del_offload(&homa_offload, IPPROTO_HOMA); + int res1 = inet_del_offload(&homa_offload, IPPROTO_HOMA); return res1 ? res1 : res2; } @@ -131,6 +131,7 @@ struct sk_buff *homa_tcp_gro_receive(struct list_head *held_list, { struct common_header *h = (struct common_header *) skb_transport_header(skb); + // tt_record4("homa_tcp_gro_receive got type 0x%x, flags 0x%x, " // "urgent 0x%x, id %d", h->type, h->flags, // ntohs(h->urgent), homa_local_id(h->sender_id)); @@ -277,20 +278,21 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, * gro_list by the caller, so it will be considered for merges * in the future. */ - struct sk_buff *held_skb; - struct sk_buff *result = NULL; - struct homa_offload_core *offload_core = &per_cpu(homa_offload_core, - raw_smp_processor_id()); - __u64 now = get_cycles(); - int busy = (now - offload_core->last_gro) < homa->gro_busy_cycles; - __u32 hash; __u64 saved_softirq_metric, softirq_cycles; + struct homa_offload_core *offload_core; + struct sk_buff *result = NULL; __u64 *softirq_cycles_metric; - struct data_header *h_new = (struct data_header *) - skb_transport_header(skb); + struct data_header *h_new; + struct sk_buff *held_skb; + __u64 now = get_cycles(); int priority; __u32 saddr; + __u32 hash; + int busy; + h_new = (struct data_header *) skb_transport_header(skb); + offload_core = &per_cpu(homa_offload_core, raw_smp_processor_id()); + busy = (now - offload_core->last_gro) < homa->gro_busy_cycles; offload_core->last_active = now; if (skb_is_ipv6(skb)) { priority = ipv6_hdr(skb)->priority; @@ -468,11 +470,11 @@ void homa_gro_gen2(struct sk_buff *skb) * balancing. */ struct data_header *h = (struct data_header *) skb_transport_header(skb); - int i; int this_core = raw_smp_processor_id(); + struct homa_offload_core *offload_core; int candidate = this_core; __u64 now = get_cycles(); - struct homa_offload_core *offload_core; + int i; for (i = CORES_TO_CHECK; i > 0; i--) { candidate++; @@ -523,11 +525,12 @@ void homa_gro_gen3(struct sk_buff *skb) * load balancer. */ struct data_header *h = (struct data_header *) skb_transport_header(skb); - int i, core; __u64 now, busy_time; - int *candidates = per_cpu(homa_offload_core, raw_smp_processor_id()) - .gen3_softirq_cores; + int *candidates; + int i, core; + candidates = per_cpu(homa_offload_core, + raw_smp_processor_id()).gen3_softirq_cores; now = get_cycles(); busy_time = now - homa->busy_cycles; @@ -567,6 +570,7 @@ void homa_gro_gen3(struct sk_buff *skb) int homa_gro_complete(struct sk_buff *skb, int hoffset) { struct data_header *h = (struct data_header *) skb_transport_header(skb); + // tt_record4("homa_gro_complete type %d, id %d, offset %d, count %d", // h->common.type, homa_local_id(h->common.sender_id), // ntohl(h->seg.offset), diff --git a/homa_outgoing.c b/homa_outgoing.c index 686c27a1..f49b9c88 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -122,10 +122,10 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, struct iov_iter *iter, int offset, int length, int max_seg_data) { - struct data_header *h; - struct sk_buff *skb; struct homa_skb_info *homa_info; int segs, err, gso_size; + struct data_header *h; + struct sk_buff *skb; /* Initialize the overall skb. 
*/ skb = homa_skb_new_tx(sizeof32(struct data_header)); @@ -225,14 +225,15 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) */ int mtu, max_seg_data, max_gso_data; + int overlap_xmit, segs_per_gso; + struct sk_buff **last_link; + struct dst_entry *dst; + /* Bytes of the message that haven't yet been copied into skbs. */ int bytes_left; - int err; - struct sk_buff **last_link; - struct dst_entry *dst; - int overlap_xmit, segs_per_gso; int gso_size; + int err; homa_message_out_init(rpc, iter->count); if (unlikely((rpc->msgout.length > HOMA_MAX_MESSAGE_LENGTH) @@ -371,12 +372,12 @@ int homa_xmit_control(enum homa_packet_type type, void *contents, int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, struct homa_sock *hsk) { + struct netdev_queue *txq; struct common_header *h; - int extra_bytes; - int result, priority; struct dst_entry *dst; + int result, priority; struct sk_buff *skb; - struct netdev_queue *txq; + int extra_bytes; dst = homa_get_dst(peer, hsk); skb = homa_skb_new_tx(HOMA_MAX_HEADER); @@ -449,9 +450,9 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk) { struct common_header *h = (struct common_header *) skb->data; + struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); struct unknown_header unknown; struct homa_peer *peer; - struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); if (hsk->homa->verbose) pr_notice("sending UNKNOWN to peer %s:%d for id %llu", @@ -548,9 +549,9 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) */ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority) { - int err; struct homa_skb_info *homa_info = homa_get_skb_info(skb); struct dst_entry *dst; + int err; /* Update info that may have changed since the message was initially * created. 
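The size variables regrouped at the top of homa_message_out_fill above drive
how message data is packed into GSO skbs. The relationship is roughly the
following (a simplified sketch; hdr_bytes stands in for the function's actual
header-size computation):

	/* Message payload carried by one on-the-wire packet. */
	max_seg_data = mtu - hdr_bytes;

	/* Wire packets batched into a single GSO skb, and the total
	 * message bytes that one such skb can carry.
	 */
	segs_per_gso = gso_size / max_seg_data;
	max_gso_data = segs_per_gso * max_seg_data;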
@@ -607,8 +608,8 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority) void homa_resend_data(struct homa_rpc *rpc, int start, int end, int priority) { - struct sk_buff *skb; struct homa_skb_info *homa_info; + struct sk_buff *skb; if (end <= start) return; @@ -640,8 +641,8 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end, for ( ; data_left > 0; data_left -= seg_length, offset += seg_length, seg_offset += skb_shinfo(skb)->gso_size) { - struct sk_buff *new_skb; struct homa_skb_info *new_homa_info; + struct sk_buff *new_skb; int err; if (seg_length > data_left) @@ -740,8 +741,8 @@ void homa_outgoing_sysctl_changed(struct homa *homa) */ int homa_check_nic_queue(struct homa *homa, struct sk_buff *skb, bool force) { - __u64 idle, new_idle, clock; int cycles_for_packet, bytes; + __u64 idle, new_idle, clock; bytes = homa_get_skb_info(skb)->wire_bytes; cycles_for_packet = (bytes * homa->cycles_per_kbyte)/1000; @@ -874,8 +875,8 @@ void homa_pacer_xmit(struct homa *homa) homa_throttle_lock(homa); homa->pacer_fifo_count -= homa->pacer_fifo_fraction; if (homa->pacer_fifo_count <= 0) { - __u64 oldest = ~0; struct homa_rpc *cur; + __u64 oldest = ~0; homa->pacer_fifo_count += 1000; rpc = NULL; @@ -1031,8 +1032,8 @@ void homa_remove_from_throttled(struct homa_rpc *rpc) void homa_log_throttled(struct homa *homa) { struct homa_rpc *rpc; - int rpcs = 0; int64_t bytes = 0; + int rpcs = 0; pr_notice("Printing throttled list\n"); homa_throttle_lock(homa); diff --git a/homa_peer.c b/homa_peer.c index 77362599..60c0989a 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -42,9 +42,9 @@ int homa_peertab_init(struct homa_peertab *peertab) */ void homa_peertab_destroy(struct homa_peertab *peertab) { - int i; - struct homa_peer *peer; struct hlist_node *next; + struct homa_peer *peer; + int i; if (!peertab->buckets) return; @@ -72,10 +72,10 @@ void homa_peertab_destroy(struct homa_peertab *peertab) struct homa_peer **homa_peertab_get_peers(struct homa_peertab *peertab, int *num_peers) { - int i, count; - struct homa_peer *peer; - struct hlist_node *next; struct homa_peer **result; + struct hlist_node *next; + struct homa_peer *peer; + int i, count; *num_peers = 0; if (!peertab->buckets) diff --git a/homa_peer.h b/homa_peer.h index 88955cb9..b020a88d 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -199,7 +199,7 @@ extern void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, struct homa_sock *hsk); extern void homa_peertab_destroy(struct homa_peertab *peertab); extern struct homa_peer ** - homa_peertab_get_peers(struct homa_peertab *peertab, + homa_peertab_get_peers(struct homa_peertab *peertab, int *num_peers); extern int homa_peertab_init(struct homa_peertab *peertab); extern void homa_peer_add_ack(struct homa_rpc *rpc); diff --git a/homa_plumbing.c b/homa_plumbing.c index 27aeeeb6..ddaceb14 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1182,14 +1182,14 @@ int homa_get_port(struct sock *sk, unsigned short snum) */ int homa_softirq(struct sk_buff *skb) { - struct common_header *h; struct sk_buff *packets, *other_pkts, *next; struct sk_buff **prev_link, **other_link; + struct common_header *h; + int first_packet = 1; static __u64 last; - __u64 start; int header_offset; - int first_packet = 1; int pull_length; + __u64 start; start = get_cycles(); INC_METRIC(softirq_calls, 1); @@ -1372,14 +1372,14 @@ int homa_backlog_rcv(struct sock *sk, struct sk_buff *skb) */ int homa_err_handler_v4(struct sk_buff *skb, u32 info) { + const struct in6_addr saddr = 
skb_canonical_ipv6_saddr(skb); const struct iphdr *iph = ip_hdr(skb); int type = icmp_hdr(skb)->type; int code = icmp_hdr(skb)->code; - const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); if ((type == ICMP_DEST_UNREACH) && (code == ICMP_PORT_UNREACH)) { - struct common_header *h; char *icmp = (char *) icmp_hdr(skb); + struct common_header *h; iph = (struct iphdr *) (icmp + sizeof(struct icmphdr)); h = (struct common_header *) (icmp + sizeof(struct icmphdr) @@ -1420,8 +1420,8 @@ int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; if ((type == ICMPV6_DEST_UNREACH) && (code == ICMPV6_PORT_UNREACH)) { - struct common_header *h; char *icmp = (char *) icmp_hdr(skb); + struct common_header *h; iph = (struct ipv6hdr *) (icmp + sizeof(struct icmphdr)); h = (struct common_header *) (icmp + sizeof(struct icmphdr) @@ -1567,10 +1567,10 @@ int homa_dointvec(struct ctl_table *table, int write, int homa_sysctl_softirq_cores(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int result, i; - struct ctl_table table_copy; struct homa_offload_core *offload_core; + struct ctl_table table_copy; int max_values, *values; + int result, i; max_values = (NUM_GEN3_SOFTIRQ_CORES + 1) * nr_cpu_ids; values = kmalloc_array(max_values, sizeof(int), GFP_KERNEL); @@ -1647,9 +1647,9 @@ enum hrtimer_restart homa_hrtimer(struct hrtimer *timer) int homa_timer_main(void *transportInfo) { struct homa *homa = (struct homa *) transportInfo; - u64 nsec; - ktime_t tick_interval; struct hrtimer hrtimer; + ktime_t tick_interval; + u64 nsec; hrtimer_init(&hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer.function = &homa_hrtimer; diff --git a/homa_pool.c b/homa_pool.c index e2700326..3863fcff 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -50,8 +50,8 @@ static inline void set_bpages_needed(struct homa_pool *pool) */ int homa_pool_init(struct homa_sock *hsk, void *region, __u64 region_size) { - int i, result; struct homa_pool *pool = hsk->buffer_pool; + int i, result; if (((__u64) region) & ~PAGE_MASK) return -EINVAL; @@ -135,12 +135,13 @@ void homa_pool_destroy(struct homa_pool *pool) int homa_pool_get_pages(struct homa_pool *pool, int num_pages, __u32 *pages, int set_owner) { - int alloced = 0; + int core_num = raw_smp_processor_id(); + struct homa_pool_core *core; __u64 now = get_cycles(); + int alloced = 0; int limit = 0; - int core_num = raw_smp_processor_id(); - struct homa_pool_core *core = &pool->cores[core_num]; + core = &pool->cores[core_num]; if (atomic_sub_return(num_pages, &pool->free_bpages) < 0) { atomic_add(num_pages, &pool->free_bpages); return -1; diff --git a/homa_receiver.cc b/homa_receiver.cc index 470000f0..8aeb8418 100644 --- a/homa_receiver.cc +++ b/homa_receiver.cc @@ -52,13 +52,14 @@ homa::receiver::~receiver() */ void homa::receiver::copy_out(void *dest, size_t offset, size_t count) const { - ssize_t limit = offset + count; char *cdest = static_cast(dest); + ssize_t limit = offset + count; if (limit > msg_length) limit = msg_length; while (static_cast(offset) < limit) { size_t chunk_size = contiguous(offset); + memcpy(cdest, get(offset), chunk_size); offset += chunk_size; cdest += chunk_size; diff --git a/homa_receiver.h b/homa_receiver.h index 1554bc71..54805424 100644 --- a/homa_receiver.h +++ b/homa_receiver.h @@ -83,6 +83,7 @@ class receiver { template inline T* get(size_t offset, T* storage = nullptr) const { int buf_num = offset >> HOMA_BPAGE_SHIFT; + if (static_cast(offset 
+ sizeof(T)) > msg_length) return nullptr; if (contiguous(offset) >= sizeof(T)) diff --git a/homa_rpc.c b/homa_rpc.c index c6eb5ebc..5dcbc021 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -22,10 +22,10 @@ struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk, const union sockaddr_in_union *dest) { - int err; - struct homa_rpc *crpc; - struct homa_rpc_bucket *bucket; struct in6_addr dest_addr_as_ipv6 = canonical_ipv6_addr(dest); + struct homa_rpc_bucket *bucket; + struct homa_rpc *crpc; + int err; crpc = kmalloc(sizeof(*crpc), GFP_KERNEL); if (unlikely(!crpc)) @@ -109,14 +109,15 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, const struct in6_addr *source, struct data_header *h, int *created) { - int err; - struct homa_rpc *srpc = NULL; __u64 id = homa_local_id(h->common.sender_id); - struct homa_rpc_bucket *bucket = homa_server_rpc_bucket(hsk, id); + struct homa_rpc_bucket *bucket; + struct homa_rpc *srpc = NULL; + int err; /* Lock the bucket, and make sure no-one else has already created * the desired RPC. */ + bucket = homa_server_rpc_bucket(hsk, id); homa_bucket_lock(bucket, id, "homa_rpc_new_server"); hlist_for_each_entry_rcu(srpc, &bucket->rpcs, hash_links) { if ((srpc->id == id) && @@ -208,11 +209,11 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, struct homa_ack *ack) { - struct homa_rpc *rpc; - struct homa_sock *hsk2 = hsk; - __u64 id = homa_local_id(ack->client_id); __u16 client_port = ntohs(ack->client_port); __u16 server_port = ntohs(ack->server_port); + __u64 id = homa_local_id(ack->client_id); + struct homa_sock *hsk2 = hsk; + struct homa_rpc *rpc; UNIT_LOG("; ", "ack %llu", id); if (hsk2->port != server_port) { @@ -333,8 +334,8 @@ int homa_rpc_reap(struct homa_sock *hsk, int count) #else #define BATCH_MAX 20 #endif - struct sk_buff *skbs[BATCH_MAX]; struct homa_rpc *rpcs[BATCH_MAX]; + struct sk_buff *skbs[BATCH_MAX]; int num_skbs, num_rpcs; struct homa_rpc *rpc; int i, batch_size; @@ -482,8 +483,8 @@ int homa_rpc_reap(struct homa_sock *hsk, int count) */ struct homa_rpc *homa_find_client_rpc(struct homa_sock *hsk, __u64 id) { - struct homa_rpc *crpc; struct homa_rpc_bucket *bucket = homa_client_rpc_bucket(hsk, id); + struct homa_rpc *crpc; homa_bucket_lock(bucket, id, __func__); hlist_for_each_entry_rcu(crpc, &bucket->rpcs, hash_links) { @@ -509,8 +510,8 @@ struct homa_rpc *homa_find_client_rpc(struct homa_sock *hsk, __u64 id) struct homa_rpc *homa_find_server_rpc(struct homa_sock *hsk, const struct in6_addr *saddr, __u16 sport, __u64 id) { - struct homa_rpc *srpc; struct homa_rpc_bucket *bucket = homa_server_rpc_bucket(hsk, id); + struct homa_rpc *srpc; homa_bucket_lock(bucket, id, __func__); hlist_for_each_entry_rcu(srpc, &bucket->rpcs, hash_links) { @@ -691,9 +692,9 @@ void homa_rpc_log_active_tt(struct homa *homa, int freeze_count) int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors) { struct homa_socktab_scan scan; + int total_incoming = 0; struct homa_sock *hsk; struct homa_rpc *rpc; - int total_incoming = 0; int actual; tt_record1("homa_validate_incoming starting, total_incoming %d", diff --git a/homa_skb.c b/homa_skb.c index 2bc9dc1c..f4e0776e 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -43,8 +43,8 @@ void homa_skb_init(struct homa *homa) for (i = 0; i < nr_cpu_ids; i++) { struct homa_skb_core *skb_core = &per_cpu(homa_skb_core, i); int numa = cpu_to_node(i); - BUG_ON(numa >= MAX_NUMNODES); + BUG_ON(numa >= MAX_NUMNODES); if (numa > 
homa->max_numa) homa->max_numa = numa; if (homa->page_pools[numa] == NULL) { @@ -118,8 +118,8 @@ void homa_skb_cleanup(struct homa *homa) */ struct sk_buff *homa_skb_new_tx(int length) { - struct sk_buff *skb; __u64 start = get_cycles(); + struct sk_buff *skb; /* Note: allocate space for an IPv6 header, which is larger than * an IPv4 header. @@ -181,13 +181,14 @@ void homa_skb_stash_pages(struct homa *homa, int length) void *homa_skb_extend_frags(struct homa *homa, struct sk_buff *skb, int *length) { struct skb_shared_info *shinfo = skb_shinfo(skb); - struct homa_skb_core *skb_core = &per_cpu(homa_skb_core, - raw_smp_processor_id()); - skb_frag_t *frag = &shinfo->frags[shinfo->nr_frags - 1]; - char *result; + struct homa_skb_core *skb_core; int actual_size = *length; + skb_frag_t *frag; + char *result; /* Can we just extend the skb's last fragment? */ + skb_core = &per_cpu(homa_skb_core, raw_smp_processor_id()); + frag = &shinfo->frags[shinfo->nr_frags - 1]; if ((shinfo->nr_frags > 0) && (skb_frag_page(frag) == skb_core->skb_page) && (skb_core->page_inuse < skb_core->page_size) && ((frag->offset + skb_frag_size(frag)) @@ -310,8 +311,8 @@ bool homa_skb_page_alloc(struct homa *homa, struct homa_skb_core *skb_core) int homa_skb_append_to_frag(struct homa *homa, struct sk_buff *skb, void *buf, int length) { - int chunk_length; char *src = (char *) buf; + int chunk_length; char *dst; while (length > 0) { @@ -371,9 +372,9 @@ int homa_skb_append_from_iter(struct homa *homa, struct sk_buff *skb, int homa_skb_append_from_skb(struct homa *homa, struct sk_buff *dst_skb, struct sk_buff *src_skb, int offset, int length) { + int src_frag_offset, src_frags_left, chunk_size, err, head_len; struct skb_shared_info *src_shinfo = skb_shinfo(src_skb); struct skb_shared_info *dst_shinfo = skb_shinfo(dst_skb); - int src_frag_offset, src_frags_left, chunk_size, err, head_len; skb_frag_t *src_frag, *dst_frag; /* Copy bytes from the linear part of the source, if any. */ @@ -444,14 +445,15 @@ void homa_skb_free_many_tx(struct homa *homa, struct sk_buff **skbs, int count) #define MAX_PAGES_AT_ONCE 50 #endif struct page *pages_to_cache[MAX_PAGES_AT_ONCE]; - int num_pages = 0; __u64 start = get_cycles(); + int num_pages = 0; int i, j; for (i = 0; i < count; i++) { + struct skb_shared_info *shinfo; struct sk_buff *skb = skbs[i]; - struct skb_shared_info *shinfo = skb_shinfo(skb); + shinfo = skb_shinfo(skb); if (refcount_read(&skb->users) != 1) { /* This sk_buff is still in use somewhere, so can't * reclaim its pages. @@ -504,8 +506,9 @@ void homa_skb_cache_pages(struct homa *homa, struct page **pages, int count) spin_lock_bh(&homa->page_pool_mutex); for (i = 0; i < count; i++) { struct page *page = pages[i]; - struct homa_page_pool *pool = homa->page_pools[ - page_to_nid(page)]; + struct homa_page_pool *pool; + + pool = homa->page_pools[page_to_nid(page)]; if (pool->avail < LIMIT) { pool->pages[pool->avail] = page; pool->avail++; @@ -526,9 +529,9 @@ void homa_skb_cache_pages(struct homa *homa, struct page **pages, int count) */ void homa_skb_get(struct sk_buff *skb, void *dest, int offset, int length) { + int chunk_size, frags_left, frag_offset, head_len; struct skb_shared_info *shinfo = skb_shinfo(skb); char *dst = (char *) dest; - int chunk_size, frags_left, frag_offset, head_len; skb_frag_t *frag; /* Copy bytes from the linear part of the skb, if any. 
*/ @@ -570,10 +573,10 @@ void homa_skb_get(struct sk_buff *skb, void *dest, int offset, int length) */ void homa_skb_release_pages(struct homa *homa) { - __u64 now = get_cycles(); - __s64 interval; int i, max_low_mark, min_pages, release, release_max; struct homa_page_pool *max_pool; + __u64 now = get_cycles(); + __s64 interval; if (now < homa->skb_page_free_time) return; diff --git a/homa_sock.c b/homa_sock.c index d6ce989a..9cdf7b3c 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -74,8 +74,8 @@ struct homa_sock *homa_socktab_start_scan(struct homa_socktab *socktab, */ struct homa_sock *homa_socktab_next(struct homa_socktab_scan *scan) { - struct homa_sock *hsk; struct homa_socktab_links *links; + struct homa_sock *hsk; while (1) { while (scan->next == NULL) { @@ -249,8 +249,8 @@ void homa_sock_destroy(struct homa_sock *hsk) int homa_sock_bind(struct homa_socktab *socktab, struct homa_sock *hsk, __u16 port) { - int result = 0; struct homa_sock *owner; + int result = 0; if (port == 0) return result; diff --git a/homa_timer.c b/homa_timer.c index 168863e0..9969f532 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -19,9 +19,9 @@ */ void homa_check_rpc(struct homa_rpc *rpc) { - const char *us, *them; - struct resend_header resend; struct homa *homa = rpc->hsk->homa; + struct resend_header resend; + const char *us, *them; /* See if we need to request an ack for this RPC. */ if (!homa_is_client(rpc->id) && (rpc->state == RPC_OUTGOING) @@ -46,7 +46,7 @@ void homa_check_rpc(struct homa_rpc *rpc) if (rpc->state == RPC_INCOMING) { if ((rpc->msgin.length - rpc->msgin.bytes_remaining) - >= rpc->msgin.granted) { + >= rpc->msgin.granted) { /* We've received everything that we've granted, so we * shouldn't expect to hear anything until we grant more. */ @@ -154,18 +154,18 @@ void homa_check_rpc(struct homa_rpc *rpc) void homa_timer(struct homa *homa) { struct homa_socktab_scan scan; + static __u64 prev_grant_count; + int total_incoming_rpcs = 0; + int sum_incoming_rec = 0; struct homa_sock *hsk; + static int zero_count; struct homa_rpc *rpc; + int sum_incoming = 0; cycles_t start, end; - int rpc_count = 0; + __u64 total_grants; int total_rpcs = 0; - int total_incoming_rpcs = 0; - int sum_incoming = 0; - int sum_incoming_rec = 0; - static __u64 prev_grant_count; - static int zero_count; + int rpc_count = 0; int core; - __u64 total_grants; start = get_cycles(); homa->timer_ticks++; diff --git a/homa_utils.c b/homa_utils.c index 4da1ec79..8fc932b6 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -176,10 +176,11 @@ char *homa_print_ipv4_addr(__be32 addr) #define NUM_BUFS_IPV4 4 #define BUF_SIZE_IPV4 30 static char buffers[NUM_BUFS_IPV4][BUF_SIZE_IPV4]; - static int next_buf; __u32 a2 = ntohl(addr); - char *buffer = buffers[next_buf]; + static int next_buf; + char *buffer; + buffer = buffers[next_buf]; next_buf++; if (next_buf >= NUM_BUFS_IPV4) next_buf = 0; @@ -204,8 +205,9 @@ char *homa_print_ipv6_addr(const struct in6_addr *addr) #define BUF_SIZE 64 static char buffers[NUM_BUFS][BUF_SIZE]; static int next_buf; - char *buffer = buffers[next_buf]; + char *buffer; + buffer = buffers[next_buf]; next_buf++; if (next_buf >= NUM_BUFS) next_buf = 0; @@ -245,10 +247,10 @@ char *homa_print_ipv6_addr(const struct in6_addr *addr) */ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) { - int used = 0; struct common_header *common; - struct in6_addr saddr; char header[HOMA_MAX_HEADER]; + struct in6_addr saddr; + int used = 0; if (skb == NULL) { snprintf(buffer, buf_len, "skb is NULL!"); @@ -267,8 
+269,8 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) be64_to_cpu(common->sender_id)); switch (common->type) { case DATA: { - struct data_header *h = (struct data_header *) header; struct homa_skb_info *homa_info = homa_get_skb_info(skb); + struct data_header *h = (struct data_header *) header; int data_left, i, seg_length, pos, offset; if (skb_shinfo(skb)->gso_segs == 0) { @@ -399,8 +401,9 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len) { char header[HOMA_MAX_HEADER]; - struct common_header *common = (struct common_header *) header; + struct common_header *common; + common = (struct common_header *) header; homa_skb_get(skb, header, 0, HOMA_MAX_HEADER); switch (common->type) { case DATA: { @@ -489,11 +492,11 @@ char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len) */ void homa_freeze_peers(struct homa *homa) { + struct homa_socktab_scan scan; + struct freeze_header freeze; struct homa_peer **peers; int num_peers, i, err; - struct freeze_header freeze; struct homa_sock *hsk; - struct homa_socktab_scan scan; /* Find a socket to use (any will do). */ hsk = homa_socktab_start_scan(homa->port_map, &scan); diff --git a/test/ccutils.cc b/test/ccutils.cc index 0cdd201c..55d0f004 100644 --- a/test/ccutils.cc +++ b/test/ccutils.cc @@ -160,6 +160,7 @@ void unit_log_clear(void) void unit_fill_data(unsigned char *data, int length, int first_value) { int i; + for (i = 0; i <= length-4; i += 4) { *reinterpret_cast(data + i) = first_value + i; } @@ -193,6 +194,7 @@ void unit_log_add_separator(char *sep) void unit_log_data(const char *separator, unsigned char *data, int length) { int i, range_start, expected_next; + if (length == 0) { unit_log_printf(separator, "empty block"); return; @@ -204,6 +206,7 @@ void unit_log_data(const char *separator, unsigned char *data, int length) expected_next = range_start; for (i = 0; i <= length-4; i += 4) { int current = *reinterpret_cast(data + i); + if (current != expected_next) { unit_log_printf(separator, "%d-%d", range_start, expected_next-1); @@ -247,6 +250,7 @@ const char *unit_log_get(void) void unit_log_printf(const char *separator, const char* format, ...) { va_list ap; + va_start(ap, format); if (!unit_log.empty() && (separator != NULL)) @@ -257,10 +261,12 @@ void unit_log_printf(const char *separator, const char* format, ...) 
int buf_size = 1024; while (true) { char buf[buf_size]; - // vsnprintf trashes the va_list, so copy it first va_list aq; + int length; + + // vsnprintf trashes the va_list, so copy it first __va_copy(aq, ap); - int length = vsnprintf(buf, buf_size, format, aq); + length = vsnprintf(buf, buf_size, format, aq); assert(length >= 0); // old glibc versions returned -1 if (length < buf_size) { unit_log.append(buf, length); diff --git a/test/main.c b/test/main.c index afe7eefe..17a5e1d5 100644 --- a/test/main.c +++ b/test/main.c @@ -31,8 +31,9 @@ static char * helpMessage = "double-checking.\n"; int main(int argc, char **argv) { - int i; int verbose = 0; + int i; + mock_ipv6_default = true; for (i = 1; i < argc; i++) { if ((strcmp(argv[i], "-h") == 0) || diff --git a/test/mock.c b/test/mock.c index c027f1ea..bea2d829 100644 --- a/test/mock.c +++ b/test/mock.c @@ -199,6 +199,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, int node) { int shinfo_size; + if (mock_check_error(&mock_alloc_skb_errors)) return NULL; struct sk_buff *skb = malloc(sizeof(struct sk_buff)); @@ -241,6 +242,7 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user) {} size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *iter) { size_t bytes_left = bytes; + if (mock_check_error(&mock_copy_data_errors)) return false; if (bytes > iter->count) { @@ -306,6 +308,7 @@ unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n) { __u64 int_from = (__u64) from; + if (mock_check_error(&mock_copy_data_errors)) return 1; if (int_from > 200000) @@ -606,8 +609,9 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) { - char buffer[200]; const char *prefix = " "; + char buffer[200]; + if (mock_check_error(&mock_ip_queue_xmit_errors)) { /* Latest data (as of 1/2019) suggests that ip_queue_xmit * frees packets after errors. @@ -645,6 +649,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, const struct sock *sk) { struct rtable *route; + if (mock_check_error(&mock_route_errors)) return ERR_PTR(-EHOSTUNREACH); route = malloc(sizeof(struct rtable)); @@ -830,6 +835,7 @@ struct proc_dir_entry *proc_create(const char *name, umode_t mode, const struct proc_ops *proc_ops) { struct proc_dir_entry *entry = malloc(40); + if (!entry) { FAIL("malloc failed"); return ERR_PTR(-ENOMEM); @@ -956,6 +962,7 @@ int skb_copy_datagram_iter(const struct sk_buff *from, int offset, struct iovec *iov = (struct iovec *) iter_iov(iter); __u64 int_base = (__u64) iov->iov_base; size_t chunk_bytes = iov->iov_len; + if (chunk_bytes > bytes_left) chunk_bytes = bytes_left; unit_log_printf("; ", @@ -998,6 +1005,7 @@ void *skb_push(struct sk_buff *skb, unsigned int len) void *skb_put(struct sk_buff *skb, unsigned int len) { unsigned char *result = skb_tail_pointer(skb); + skb->tail += len; skb->len += len; return result; @@ -1006,9 +1014,9 @@ void *skb_put(struct sk_buff *skb, unsigned int len) struct sk_buff *skb_segment(struct sk_buff *head_skb, netdev_features_t features) { + struct sk_buff *skb1, *skb2; struct data_header h; int offset, length; - struct sk_buff *skb1, *skb2; /* Split the existing packet into two packets. 
*/ memcpy(&h, skb_transport_header(head_skb), sizeof(h)); @@ -1131,6 +1139,7 @@ int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, struct page *mock_alloc_pages(gfp_t gfp, unsigned order) { struct page *page; + if (mock_check_error(&mock_alloc_page_errors)) return NULL; page = (struct page *)malloc(PAGE_SIZE << order); @@ -1172,6 +1181,7 @@ void mock_clear_xmit_prios() unsigned int mock_compound_order(struct page *page) { unsigned int result; + if (mock_compound_order_mask & 1) result = 0; else @@ -1228,6 +1238,7 @@ unsigned int mock_get_mtu(const struct dst_entry *dst) void mock_get_page(struct page *page) { int64_t ref_count = (int64_t) unit_hash_get(pages_in_use, page); + if (ref_count == 0) FAIL(" unallocated page passed to mock_get_page"); else @@ -1249,6 +1260,7 @@ int mock_page_refs(struct page *page) int mock_page_to_nid(struct page *page) { int result; + if (mock_page_nid_mask & 1) result = 1; else @@ -1260,6 +1272,7 @@ int mock_page_to_nid(struct page *page) void mock_put_page(struct page *page) { int64_t ref_count = (int64_t) unit_hash_get(pages_in_use, page); + if (ref_count == 0) FAIL(" unallocated page passed to mock_put_page"); else { @@ -1331,6 +1344,7 @@ struct sk_buff *mock_skb_new(struct in6_addr *saddr, struct common_header *h, int extra_bytes, int first_value) { int header_size, ip_size, data_size, shinfo_size; + struct sk_buff *skb; unsigned char *p; switch (h->type) { @@ -1365,7 +1379,7 @@ struct sk_buff *mock_skb_new(struct in6_addr *saddr, struct common_header *h, header_size = sizeof(struct common_header); break; } - struct sk_buff *skb = malloc(sizeof(struct sk_buff)); + skb = malloc(sizeof(struct sk_buff)); memset(skb, 0, sizeof(*skb)); if (!skbs_in_use) skbs_in_use = unit_hash_new(); @@ -1421,9 +1435,10 @@ int mock_skb_count(void) */ void mock_sock_init(struct homa_sock *hsk, struct homa *homa, int port) { + int saved_port = homa->next_client_port; static struct ipv6_pinfo hsk_pinfo; struct sock *sk = &hsk->sock; - int saved_port = homa->next_client_port; + memset(hsk, 0, sizeof(*hsk)); sk->sk_data_ready = mock_data_ready; sk->sk_family = mock_ipv6 ? 
AF_INET6 : AF_INET; @@ -1460,6 +1475,8 @@ void mock_spin_unlock(spinlock_t *lock) */ void mock_teardown(void) { + int count; + pcpu_hot.cpu_number = 1; cpu_khz = 1000000; mock_alloc_page_errors = 0; @@ -1498,7 +1515,7 @@ void mock_teardown(void) memset(inet_offloads, 0, sizeof(inet_offloads)); memset(inet6_offloads, 0, sizeof(inet6_offloads)); - int count = unit_hash_size(skbs_in_use); + count = unit_hash_size(skbs_in_use); if (count > 0) FAIL(" %u sk_buff(s) still in use after test", count); unit_hash_free(skbs_in_use); diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index f3cfee14..54c0662a 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -105,6 +105,7 @@ static struct homa_rpc *test_rpc(FIXTURE_DATA(homa_grant) *self, struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, server_ip, self->server_port, id, 1000, size); + homa_message_in_init(rpc, size, 0); homa_grant_add_rpc(rpc); return rpc; @@ -215,9 +216,11 @@ TEST_F(homa_grant, homa_grant_add_rpc__insert_in_peer_list) } TEST_F(homa_grant, homa_grant_add_rpc__adjust_order_in_peer_list) { + struct homa_rpc *rpc3; + test_rpc(self, 200, self->server_ip, 20000); test_rpc(self, 300, self->server_ip, 30000); - struct homa_rpc *rpc3 = test_rpc(self, 400, self->server_ip, 40000); + rpc3 = test_rpc(self, 400, self->server_ip, 40000); test_rpc(self, 500, self->server_ip, 50000); unit_log_clear(); @@ -267,10 +270,13 @@ TEST_F(homa_grant, homa_grant_add_rpc__insert_peer_in_homa_list) } TEST_F(homa_grant, homa_grant_add_rpc__move_peer_in_homa_list) { + struct homa_rpc *rpc3; + struct homa_rpc *rpc4; + test_rpc(self, 200, self->server_ip, 20000); test_rpc(self, 300, self->server_ip+1, 30000); - struct homa_rpc *rpc3 = test_rpc(self, 400, self->server_ip+2, 40000); - struct homa_rpc *rpc4 = test_rpc(self, 500, self->server_ip+3, 50000); + rpc3 = test_rpc(self, 400, self->server_ip+2, 40000); + rpc4 = test_rpc(self, 500, self->server_ip+3, 50000); unit_log_clear(); unit_log_grantables(&self->homa); @@ -307,6 +313,7 @@ TEST_F(homa_grant, homa_grant_remove_rpc__skip_if_not_linked) struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 100, 1000, 2000); + unit_log_grantables(&self->homa); EXPECT_EQ(0, self->homa.num_grantable_rpcs); @@ -317,6 +324,7 @@ TEST_F(homa_grant, homa_grant_remove_rpc__clear_oldest_rpc) { struct homa_rpc *rpc1 = test_rpc(self, 200, self->server_ip, 20000); struct homa_rpc *rpc2 = test_rpc(self, 300, self->server_ip, 10000); + EXPECT_EQ(2, self->homa.num_grantable_rpcs); self->homa.oldest_rpc = rpc2; @@ -330,6 +338,7 @@ TEST_F(homa_grant, homa_grant_remove_rpc__clear_oldest_rpc) TEST_F(homa_grant, homa_grant_remove_rpc__update_metrics) { struct homa_rpc *rpc = test_rpc(self, 200, self->server_ip, 20000); + EXPECT_EQ(1, self->homa.num_grantable_rpcs); self->homa.last_grantable_change = 100; self->homa.num_grantable_rpcs = 3; @@ -342,8 +351,10 @@ TEST_F(homa_grant, homa_grant_remove_rpc__update_metrics) } TEST_F(homa_grant, homa_grant_remove_rpc__not_first_in_peer_list) { + struct homa_rpc *rpc2; + test_rpc(self, 200, self->server_ip, 20000); - struct homa_rpc *rpc2 = test_rpc(self, 300, self->server_ip, 30000); + rpc2 = test_rpc(self, 300, self->server_ip, 30000); test_rpc(self, 400, self->server_ip+1, 25000); unit_log_clear(); @@ -364,6 +375,7 @@ TEST_F(homa_grant, homa_grant_remove_rpc__not_first_in_peer_list) TEST_F(homa_grant, homa_grant_remove_rpc__only_entry_in_peer_list) { struct homa_rpc *rpc1 = 
test_rpc(self, 200, self->server_ip, 30000); + test_rpc(self, 300, self->server_ip+1, 40000); test_rpc(self, 400, self->server_ip+2, 20000); @@ -385,6 +397,7 @@ TEST_F(homa_grant, homa_grant_remove_rpc__only_entry_in_peer_list) TEST_F(homa_grant, homa_grant_remove_rpc__reposition_peer_in_homa_list) { struct homa_rpc *rpc1 = test_rpc(self, 200, self->server_ip, 20000); + test_rpc(self, 300, self->server_ip, 50000); test_rpc(self, 400, self->server_ip+1, 30000); test_rpc(self, 500, self->server_ip+2, 40000); @@ -410,8 +423,8 @@ TEST_F(homa_grant, homa_grant_remove_rpc__reposition_peer_in_homa_list) TEST_F(homa_grant, homa_grant_send__basics) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); - rpc->msgin.priority = 3; + rpc->msgin.priority = 3; unit_log_clear(); int granted = homa_grant_send(rpc, &self->homa); EXPECT_EQ(1, granted); @@ -421,6 +434,7 @@ TEST_F(homa_grant, homa_grant_send__basics) TEST_F(homa_grant, homa_grant_send__incoming_negative) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); + rpc->msgin.bytes_remaining = 5000; atomic_set(&self->homa.total_incoming, self->homa.max_incoming); @@ -433,8 +447,8 @@ TEST_F(homa_grant, homa_grant_send__incoming_negative) TEST_F(homa_grant, homa_grant_send__end_of_message) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); - rpc->msgin.bytes_remaining = 5000; + rpc->msgin.bytes_remaining = 5000; unit_log_clear(); int granted = homa_grant_send(rpc, &self->homa); EXPECT_EQ(1, granted); @@ -444,6 +458,7 @@ TEST_F(homa_grant, homa_grant_send__end_of_message) TEST_F(homa_grant, homa_grant_send__not_enough_available_bytes) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); + rpc->msgin.granted = 3000; rpc->msgin.rec_incoming = 4000; atomic_set(&self->homa.total_incoming, self->homa.max_incoming - 4000); @@ -457,8 +472,8 @@ TEST_F(homa_grant, homa_grant_send__not_enough_available_bytes) TEST_F(homa_grant, homa_grant_send__nothing_available) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); - atomic_set(&self->homa.total_incoming, self->homa.max_incoming); + atomic_set(&self->homa.total_incoming, self->homa.max_incoming); unit_log_clear(); int granted = homa_grant_send(rpc, &self->homa); EXPECT_EQ(0, granted); @@ -468,8 +483,8 @@ TEST_F(homa_grant, homa_grant_send__nothing_available) TEST_F(homa_grant, homa_grant_send__skip_because_of_silent_ticks) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); - rpc->silent_ticks = 2; + rpc->silent_ticks = 2; unit_log_clear(); int granted = homa_grant_send(rpc, &self->homa); EXPECT_EQ(0, granted); @@ -477,8 +492,8 @@ TEST_F(homa_grant, homa_grant_send__skip_because_of_silent_ticks) TEST_F(homa_grant, homa_grant_send__resend_all) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); - rpc->msgin.resend_all = 1; + rpc->msgin.resend_all = 1; unit_log_clear(); int granted = homa_grant_send(rpc, &self->homa); EXPECT_EQ(1, granted); @@ -492,6 +507,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__msgin_not_initialized) struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 100, 1000, 2000); + rpc->msgin.bytes_remaining = 500; rpc->msgin.granted = 2000; rpc->msgin.rec_incoming = 0; @@ -505,8 +521,8 @@ TEST_F(homa_grant, homa_grant_check_rpc__rpc_dead) struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 100, 1000, 2000); - homa_message_in_init(rpc, 2000, 0); + 
homa_message_in_init(rpc, 2000, 0); homa_rpc_lock(rpc, "test"); homa_grant_check_rpc(rpc); EXPECT_EQ(2000, rpc->msgin.rec_incoming); @@ -526,6 +542,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__message_doesnt_need_grants) struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 100, 1000, 2000); + homa_message_in_init(rpc, 2000, 0); rpc->msgin.granted = 2000; rpc->msgin.bytes_remaining = 500; @@ -546,6 +563,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__add_new_message_to_grantables) struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 100, 1000, 20000); + homa_message_in_init(rpc, 20000, 0); rpc->msgin.bytes_remaining = 12000; @@ -559,6 +577,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__add_new_message_to_grantables) TEST_F(homa_grant, homa_grant_check_rpc__new_message_bumps_existing) { struct homa_rpc *rpc1, *rpc2, *rpc3; + rpc1 = test_rpc(self, 100, self->server_ip, 20000); rpc2 = test_rpc(self, 102, self->server_ip, 30000); self->homa.max_overcommit = 2; @@ -580,6 +599,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__new_message_bumps_existing) TEST_F(homa_grant, homa_grant_check_rpc__new_message_cant_be_granted) { struct homa_rpc *rpc1, *rpc2, *rpc3; + rpc1 = test_rpc(self, 100, self->server_ip, 20000); rpc2 = test_rpc(self, 102, self->server_ip, 30000); self->homa.max_overcommit = 2; @@ -602,6 +622,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__new_message_cant_be_granted) TEST_F(homa_grant, homa_grant_check_rpc__upgrade_priority_from_negative_rank) { struct homa_rpc *rpc1, *rpc2, *rpc3; + rpc1 = test_rpc(self, 100, self->server_ip, 20000); rpc2 = test_rpc(self, 102, self->server_ip, 30000); rpc3 = test_rpc(self, 104, self->server_ip, 40000); @@ -624,6 +645,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__upgrade_priority_from_negative_rank) TEST_F(homa_grant, homa_grant_check_rpc__upgrade_priority_from_positive_rank) { struct homa_rpc *rpc1, *rpc2, *rpc3; + rpc1 = test_rpc(self, 100, self->server_ip, 20000); rpc2 = test_rpc(self, 102, self->server_ip, 30000); rpc3 = test_rpc(self, 104, self->server_ip, 40000); @@ -648,6 +670,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__upgrade_priority_from_positive_rank) TEST_F(homa_grant, homa_grant_check_rpc__send_new_grant) { struct homa_rpc *rpc; + rpc = test_rpc(self, 100, self->server_ip, 40000); homa_grant_recalc(&self->homa, 0); EXPECT_EQ(0, atomic_read(&rpc->msgin.rank)); @@ -666,6 +689,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__send_new_grant) TEST_F(homa_grant, homa_grant_check_rpc__remove_from_grantable) { struct homa_rpc *rpc; + rpc = test_rpc(self, 100, self->server_ip, 40000); homa_grant_recalc(&self->homa, 0); EXPECT_EQ(0, atomic_read(&rpc->msgin.rank)); @@ -689,6 +713,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__remove_from_grantable) TEST_F(homa_grant, homa_grant_check_rpc__recalc_because_of_headroom) { struct homa_rpc *rpc1, *rpc2; + rpc1 = test_rpc(self, 100, self->server_ip, 20000); rpc2 = test_rpc(self, 102, self->server_ip, 30000); self->homa.max_incoming = 15000; @@ -714,6 +739,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__recalc_because_of_headroom) TEST_F(homa_grant, homa_grant_recalc__basics) { struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; + rpc1 = test_rpc(self, 100, self->server_ip, 20000); rpc2 = test_rpc(self, 102, self->server_ip, 30000); rpc3 = test_rpc(self, 104, self->server_ip+1, 25000); @@ -756,6 +782,7 @@ TEST_F(homa_grant, homa_grant_recalc__already_locked) TEST_F(homa_grant, 
homa_grant_recalc__skip_recalc) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); + unit_hook_register(grantable_spinlock_hook); hook_homa = &self->homa; mock_trylock_errors = 0xff; @@ -770,6 +797,7 @@ TEST_F(homa_grant, homa_grant_recalc__skip_recalc) TEST_F(homa_grant, homa_grant_recalc__clear_existing_active_rpcs) { struct homa_rpc *rpc1; + rpc1 = test_rpc(self, 100, self->server_ip, 40000); test_rpc(self, 102, self->server_ip, 30000); test_rpc(self, 104, self->server_ip, 25000); @@ -788,6 +816,7 @@ TEST_F(homa_grant, homa_grant_recalc__clear_existing_active_rpcs) TEST_F(homa_grant, homa_grant_recalc__use_only_lowest_priorities) { struct homa_rpc *rpc1, *rpc2; + rpc1 = test_rpc(self, 100, self->server_ip, 20000); rpc2 = test_rpc(self, 102, self->server_ip, 30000); self->homa.max_incoming = 100000; @@ -802,6 +831,7 @@ TEST_F(homa_grant, homa_grant_recalc__use_only_lowest_priorities) TEST_F(homa_grant, homa_grant_recalc__share_lowest_priority_level) { struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; + rpc1 = test_rpc(self, 100, self->server_ip, 20000); rpc2 = test_rpc(self, 102, self->server_ip, 30000); rpc3 = test_rpc(self, 100, self->server_ip, 40000); @@ -823,6 +853,7 @@ TEST_F(homa_grant, homa_grant_recalc__share_lowest_priority_level) TEST_F(homa_grant, homa_grant_recalc__compute_window_size) { struct homa_rpc *rpc1, *rpc2, *rpc3; + rpc1 = test_rpc(self, 100, self->server_ip, 30000); rpc2 = test_rpc(self, 102, self->server_ip, 40000); rpc3 = test_rpc(self, 100, self->server_ip, 50000); @@ -848,6 +879,7 @@ TEST_F(homa_grant, homa_grant_recalc__compute_window_size) TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted) { struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; + rpc1 = test_rpc(self, 100, self->server_ip, 10000); rpc2 = test_rpc(self, 102, self->server_ip, 10000); rpc3 = test_rpc(self, 104, self->server_ip, 10000); @@ -864,6 +896,7 @@ TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted) TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted_but_skip_recalc) { struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; + rpc1 = test_rpc(self, 100, self->server_ip, 10000); rpc2 = test_rpc(self, 102, self->server_ip, 10000); rpc3 = test_rpc(self, 104, self->server_ip, 10000); @@ -993,6 +1026,7 @@ TEST_F(homa_grant, homa_grant_find_oldest__basics) TEST_F(homa_grant, homa_grant_find_oldest__fifo_grant_unused) { struct homa_rpc *srpc1, *srpc2; + mock_cycles = ~0; srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, 11, 400000, 100); @@ -1028,6 +1062,7 @@ TEST_F(homa_grant, homa_grant_rpc_free__rpc_not_grantable) TEST_F(homa_grant, homa_grant_free_rpc__in_active_list) { struct homa_rpc *rpc1, *rpc2, *rpc3; + rpc1 = test_rpc(self, 100, self->server_ip, 20000); rpc2 = test_rpc(self, 102, self->server_ip, 30000); rpc3 = test_rpc(self, 104, self->server_ip, 40000); @@ -1049,6 +1084,7 @@ TEST_F(homa_grant, homa_grant_free_rpc__in_active_list) TEST_F(homa_grant, homa_grant_free_rpc__not_in_active_list) { struct homa_rpc *rpc1, *rpc2, *rpc3; + rpc1 = test_rpc(self, 100, self->server_ip, 20000); rpc2 = test_rpc(self, 102, self->server_ip, 30000); rpc3 = test_rpc(self, 104, self->server_ip, 40000); diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 22754cd0..4d0be4e4 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -186,6 +186,7 @@ TEST_F(homa_incoming, homa_message_in_init__basics) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, 
self->server_port, 99, 1000, 1000); + EXPECT_EQ(0, homa_message_in_init(crpc, 127, 100)); EXPECT_EQ(100, crpc->msgin.granted); EXPECT_EQ(0, homa_message_in_init(crpc, 128, 500)); @@ -197,6 +198,7 @@ TEST_F(homa_incoming, homa_message_in_init__pool_doesnt_exist) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); + homa_pool_destroy(self->hsk.buffer_pool); EXPECT_EQ(ENOMEM, -homa_message_in_init(crpc, HOMA_BPAGE_SIZE*2, 0)); EXPECT_EQ(0, crpc->msgin.num_bpages); @@ -206,6 +208,7 @@ TEST_F(homa_incoming, homa_message_in_init__no_buffers_available) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); + atomic_set(&self->hsk.buffer_pool->free_bpages, 0); EXPECT_EQ(0, homa_message_in_init(crpc, HOMA_BPAGE_SIZE*2, 10000)); EXPECT_EQ(0, crpc->msgin.num_bpages); @@ -216,6 +219,7 @@ TEST_F(homa_incoming, homa_message_in_init__update_metrics) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); + EXPECT_EQ(0, homa_message_in_init(crpc, 140, 0)); EXPECT_EQ(0, homa_message_in_init(crpc, 130, 0)); EXPECT_EQ(0, homa_message_in_init(crpc, 0xfff, 0)); @@ -235,6 +239,7 @@ TEST_F(homa_incoming, homa_gap_retry) struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, self->server_id, 10000, 100); + homa_gap_new(&srpc->msgin.gaps, 1000, 2000); homa_gap_new(&srpc->msgin.gaps, 4000, 6000); homa_gap_new(&srpc->msgin.gaps, 7000, 8000); @@ -253,6 +258,7 @@ TEST_F(homa_incoming, homa_add_packet__basics) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); + homa_message_in_init(crpc, 10000, 0); unit_log_clear(); mock_cycles = 5000; @@ -290,6 +296,7 @@ TEST_F(homa_incoming, homa_add_packet__packet_overlaps_message_end) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); + homa_message_in_init(crpc, 10000, 0); unit_log_clear(); self->data.seg.offset = htonl(9000); @@ -302,6 +309,7 @@ TEST_F(homa_incoming, homa_add_packet__sequential_packets) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); + homa_message_in_init(crpc, 10000, 0); unit_log_clear(); homa_add_packet(crpc, mock_skb_new(self->client_ip, @@ -323,6 +331,7 @@ TEST_F(homa_incoming, homa_add_packet__new_gap) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); + homa_message_in_init(crpc, 10000, 0); unit_log_clear(); homa_add_packet(crpc, mock_skb_new(self->client_ip, @@ -340,6 +349,7 @@ TEST_F(homa_incoming, homa_add_packet__packet_before_gap) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); + homa_message_in_init(crpc, 10000, 0); unit_log_clear(); self->data.seg.offset = htonl(0); @@ -361,6 +371,7 @@ TEST_F(homa_incoming, homa_add_packet__packet_straddles_start_of_gap) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); + homa_message_in_init(crpc, 10000, 0); unit_log_clear(); self->data.seg.offset = htonl(0); @@ -382,6 +393,7 @@ 
TEST_F(homa_incoming, homa_add_packet__packet_extends_past_gap) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); + homa_message_in_init(crpc, 10000, 0); unit_log_clear(); self->data.seg.offset = htonl(0); @@ -403,6 +415,7 @@ TEST_F(homa_incoming, homa_add_packet__packet_at_start_of_gap) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); + homa_message_in_init(crpc, 10000, 0); unit_log_clear(); self->data.seg.offset = htonl(0); @@ -426,6 +439,7 @@ TEST_F(homa_incoming, homa_add_packet__packet_covers_entire_gap) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); + homa_message_in_init(crpc, 10000, 0); unit_log_clear(); self->data.seg.offset = htonl(0); @@ -448,6 +462,7 @@ TEST_F(homa_incoming, homa_add_packet__packet_beyond_end_of_gap) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); + homa_message_in_init(crpc, 10000, 0); unit_log_clear(); self->data.seg.offset = htonl(0); @@ -469,6 +484,7 @@ TEST_F(homa_incoming, homa_add_packet__packet_straddles_end_of_gap) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); + homa_message_in_init(crpc, 10000, 0); unit_log_clear(); self->data.seg.offset = htonl(0); @@ -490,6 +506,7 @@ TEST_F(homa_incoming, homa_add_packet__packet_at_end_of_gap) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); + homa_message_in_init(crpc, 10000, 0); unit_log_clear(); self->data.seg.offset = htonl(0); @@ -512,6 +529,7 @@ TEST_F(homa_incoming, homa_add_packet__packet_in_middle_of_gap) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); + homa_message_in_init(crpc, 10000, 0); unit_log_clear(); mock_cycles = 1000; @@ -539,6 +557,7 @@ TEST_F(homa_incoming, homa_add_packet__scan_multiple_gaps) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); + homa_message_in_init(crpc, 10000, 0); unit_log_clear(); self->data.seg.offset = htonl(1400); @@ -562,6 +581,7 @@ TEST_F(homa_incoming, homa_add_packet__metrics) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); + homa_message_in_init(crpc, 10000, 0); crpc->msgin.recv_end = 4200; self->data.seg.offset = htonl(0); @@ -760,8 +780,8 @@ TEST_F(homa_incoming, homa_copy_to_user__error_in_skb_copy_datagram_iter) TEST_F(homa_incoming, homa_copy_to_user__timetrace_info) { struct homa_rpc *crpc; - int offset; char traces[1000]; + int offset; crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, @@ -797,6 +817,7 @@ TEST_F(homa_incoming, homa_copy_to_user__timetrace_info) TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_ipv4) { struct sk_buff *skb; + self->data.common.dport = htons(100); // Make sure the test uses IPv4. 
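Most hunks in this file, including the unknown-socket tests on either side of this point, make a single mechanical change: local declarations are hoisted to the top of the block and separated from the first statement by one blank line, matching the kernel coding-style rule (strict C90 likewise rejects declarations after statements). A minimal sketch of the before/after shapes, with invented names rather than code from this patch:

	/* Before: the declaration of "total" follows a statement
	 * (legal C99, but a kernel style violation).
	 */
	static int style_before(int *counts)
	{
		counts[0]++;
		int total = counts[0] + counts[1];
		return total;
	}

	/* After: declarations first, then one blank line, then statements. */
	static int style_after(int *counts)
	{
		int total;

		counts[0]++;
		total = counts[0] + counts[1];
		return total;
	}

The hunks below apply the same shape to TEST_F bodies, which is why most of them only add a declaration line plus a blank line without changing any behavior.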
@@ -812,6 +833,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_ipv4) TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_ipv6) { struct sk_buff *skb; + self->data.common.dport = htons(100); // Make sure the test uses IPv6. @@ -827,6 +849,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_ipv6) TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_free_many_packets) { struct sk_buff *skb, *skb2, *skb3; + self->data.common.dport = htons(100); // Make sure the test uses IPv6. @@ -864,6 +887,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__existing_server_rpc) struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, self->server_id, 10000, 100); + ASSERT_NE(NULL, srpc); EXPECT_EQ(8600, srpc->msgin.bytes_remaining); self->data.seg.offset = htonl(1400); @@ -874,6 +898,9 @@ TEST_F(homa_incoming, homa_dispatch_pkts__existing_server_rpc) } TEST_F(homa_incoming, homa_dispatch_pkts__non_data_packet_for_existing_server_rpc) { + struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_IN_SERVICE, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 10000, 100); struct resend_header resend = {.common = { .sport = htons(self->client_port), .dport = htons(self->server_port), @@ -882,9 +909,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__non_data_packet_for_existing_server_rp .offset = 0, .length = 1000, .priority = 3}; - struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_IN_SERVICE, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 10000, 100); + ASSERT_NE(NULL, srpc); unit_log_clear(); homa_dispatch_pkts(mock_skb_new(self->client_ip, &resend.common, 0, 0), @@ -896,6 +921,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__existing_client_rpc) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + ASSERT_NE(NULL, crpc); EXPECT_EQ(10000, crpc->msgout.granted); unit_log_clear(); @@ -917,6 +943,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_client_rpc) .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(99991), .type = UNKNOWN}}; + mock_xmit_log_verbose = 1; homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), &self->homa); @@ -928,6 +955,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_server_rpc) .dport = htons(self->server_port), .sender_id = cpu_to_be64(99990), .type = GRANT}}; + mock_xmit_log_verbose = 1; homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), &self->homa); @@ -935,7 +963,6 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_server_rpc) } TEST_F(homa_incoming, homa_dispatch_pkts__cutoffs_for_unknown_client_rpc) { - struct homa_peer *peer; struct cutoffs_header h = {{.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(99991), @@ -944,6 +971,8 @@ TEST_F(homa_incoming, homa_dispatch_pkts__cutoffs_for_unknown_client_rpc) htonl(7), htonl(6), htonl(5), htonl(4), htonl(3)}, .cutoff_version = 400}; + struct homa_peer *peer; + homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), &self->homa); peer = homa_peer_find(self->homa.peers, self->server_ip, @@ -960,6 +989,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__resend_for_unknown_server_rpc) .sender_id = cpu_to_be64(99990), .type = RESEND}, .offset = 0, .length = 2000, .priority = 5}; + homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_STREQ("xmit UNKNOWN", unit_log_get()); 
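The reset_counters hunk that follows applies the same rule to an aggregate: the grant_header declaration keeps its designated initializer (an initializer is part of the declaration, so it may remain at the top of the block), while the ASSERT/EXPECT calls and plain assignments move below the blank line. A hedged, self-contained sketch with invented names; the field values echo the hunk:

	#include <assert.h>

	struct example_hdr {
		int offset;
		int priority;
	};

	static void example_reordered(void)
	{
		/* Declaration plus designated initializer stays up top. */
		struct example_hdr h = { .offset = 12600, .priority = 3 };

		/* Executable statements follow the blank line. */
		assert(h.offset == 12600);
		assert(h.priority == 3);
	}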
@@ -969,17 +999,17 @@ TEST_F(homa_incoming, homa_dispatch_pkts__reset_counters) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc); - EXPECT_EQ(10000, crpc->msgout.granted); - unit_log_clear(); - crpc->silent_ticks = 5; - crpc->peer->outstanding_resends = 2; - struct grant_header h = {.common = {.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = GRANT}, .offset = htonl(12600), .priority = 3, .resend_all = 0}; + + ASSERT_NE(NULL, crpc); + EXPECT_EQ(10000, crpc->msgout.granted); + unit_log_clear(); + crpc->silent_ticks = 5; + crpc->peer->outstanding_resends = 2; homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), &self->homa); EXPECT_EQ(0, crpc->silent_ticks); @@ -999,6 +1029,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_type) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + ASSERT_NE(NULL, crpc); EXPECT_EQ(10000, crpc->msgout.granted); unit_log_clear(); @@ -1014,6 +1045,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__handle_ack) struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, self->server_id, 100, 3000); + ASSERT_NE(NULL, srpc); self->data.ack = (struct homa_ack) { .client_port = htons(self->client_port), @@ -1029,6 +1061,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__handle_ack) TEST_F(homa_incoming, homa_dispatch_pkts__too_many_acks) { struct sk_buff *skb, *skb2, *skb3; + self->data.ack = (struct homa_ack) { .client_port = htons(self->client_port), .server_port = htons(self->server_port), @@ -1061,11 +1094,13 @@ TEST_F(homa_incoming, homa_dispatch_pkts__forced_reap) struct homa_rpc *dead = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 20000); + struct homa_rpc *srpc; + homa_rpc_free(dead); EXPECT_EQ(31, self->hsk.dead_skbs); - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 10000, 5000); + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->client_port, self->server_id, + 10000, 5000); ASSERT_NE(NULL, srpc); self->homa.dead_buffs_limit = 16; mock_cycles = ~0; @@ -1091,6 +1126,7 @@ TEST_F(homa_incoming, homa_data_pkt__basics) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 1000, 1600); + ASSERT_NE(NULL, crpc); unit_log_clear(); crpc->msgout.next_xmit_offset = crpc->msgout.length; @@ -1109,8 +1145,8 @@ TEST_F(homa_incoming, homa_data_pkt__wrong_client_rpc_state) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, 1000, 2000); - ASSERT_NE(NULL, crpc); + ASSERT_NE(NULL, crpc); crpc->state = RPC_DEAD; self->data.message_length = htonl(2000); self->data.seg.offset = htonl(1400); @@ -1125,6 +1161,7 @@ TEST_F(homa_incoming, homa_data_pkt__initialize_msgin) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 1000, 1600); + ASSERT_NE(NULL, crpc); self->data.message_length = htonl(1600); homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, @@ 
-1137,6 +1174,7 @@ TEST_F(homa_incoming, homa_data_pkt__no_buffer_pool) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 1000, 1600); + ASSERT_NE(NULL, crpc); homa_pool_destroy(self->hsk.buffer_pool); unit_log_clear(); @@ -1149,6 +1187,7 @@ TEST_F(homa_incoming, homa_data_pkt__wrong_server_rpc_state) struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, self->server_id, 1400, 5000); + ASSERT_NE(NULL, srpc); unit_log_clear(); homa_data_pkt(mock_skb_new(self->client_ip, &self->data.common, @@ -1161,6 +1200,7 @@ TEST_F(homa_incoming, homa_data_pkt__no_buffers) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 1000, 5000); + EXPECT_NE(NULL, crpc); unit_log_clear(); @@ -1175,6 +1215,7 @@ TEST_F(homa_incoming, homa_data_pkt__update_delta) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 1000, 5000); + EXPECT_NE(NULL, crpc); unit_log_clear(); @@ -1199,6 +1240,7 @@ TEST_F(homa_incoming, homa_data_pkt__handoff) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 1000, 3000); + ASSERT_NE(NULL, crpc); unit_log_clear(); crpc->msgout.next_xmit_offset = crpc->msgout.length; @@ -1266,10 +1308,6 @@ TEST_F(homa_incoming, homa_grant_pkt__basics) struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, self->server_id, 100, 20000); - ASSERT_NE(NULL, srpc); - homa_xmit_data(srpc, false); - unit_log_clear(); - struct grant_header h = {{.sport = htons(srpc->dport), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->client_id), @@ -1277,6 +1315,11 @@ TEST_F(homa_incoming, homa_grant_pkt__basics) .offset = htonl(11000), .priority = 3, .resend_all = 0}; + + ASSERT_NE(NULL, srpc); + homa_xmit_data(srpc, false); + unit_log_clear(); + homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_EQ(11000, srpc->msgout.granted); @@ -1307,12 +1350,6 @@ TEST_F(homa_incoming, homa_grant_pkt__reset) struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, self->server_id, 100, 20000); - ASSERT_NE(NULL, srpc); - homa_xmit_data(srpc, false); - unit_log_clear(); - EXPECT_EQ(10000, srpc->msgout.granted); - EXPECT_EQ(10000, srpc->msgout.next_xmit_offset); - struct grant_header h = {{.sport = htons(srpc->dport), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->client_id), @@ -1320,6 +1357,13 @@ TEST_F(homa_incoming, homa_grant_pkt__reset) .offset = htonl(3000), .priority = 2, .resend_all = 1}; + + ASSERT_NE(NULL, srpc); + homa_xmit_data(srpc, false); + unit_log_clear(); + EXPECT_EQ(10000, srpc->msgout.granted); + EXPECT_EQ(10000, srpc->msgout.next_xmit_offset); + homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_EQ(10000, srpc->msgout.granted); @@ -1338,15 +1382,15 @@ TEST_F(homa_incoming, homa_grant_pkt__grant_past_end_of_message) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc); - unit_log_clear(); - struct grant_header h = {{.sport = htons(self->server_port), .dport = 
htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = GRANT}, .offset = htonl(25000), .priority = 3}; + + ASSERT_NE(NULL, crpc); + unit_log_clear(); homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_EQ(20000, crpc->msgout.granted); @@ -1378,6 +1422,7 @@ TEST_F(homa_incoming, homa_resend_pkt__rpc_in_service_server_sends_busy) struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_IN_SERVICE, self->client_ip, self->server_ip, self->client_port, self->server_id, 2000, 20000); + ASSERT_NE(NULL, srpc); unit_log_clear(); @@ -1400,6 +1445,7 @@ TEST_F(homa_incoming, homa_resend_pkt__rpc_incoming_server_sends_busy) struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, self->server_id, 2000, 20000); + ASSERT_NE(NULL, srpc); srpc->msgin.granted = 1400; unit_log_clear(); @@ -1424,6 +1470,7 @@ TEST_F(homa_incoming, homa_resend_pkt__client_not_outgoing) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, 2000, 3000); + ASSERT_NE(NULL, crpc); unit_log_clear(); @@ -1443,6 +1490,7 @@ TEST_F(homa_incoming, homa_resend_pkt__send_busy_instead_of_data) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 2000, 100); + ASSERT_NE(NULL, crpc); unit_log_clear(); @@ -1462,6 +1510,7 @@ TEST_F(homa_incoming, homa_resend_pkt__client_send_data) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 2000, 100); + ASSERT_NE(NULL, crpc); homa_xmit_data(crpc, false); unit_log_clear(); @@ -1484,6 +1533,7 @@ TEST_F(homa_incoming, homa_resend_pkt__server_send_data) struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, self->server_id, 100, 20000); + ASSERT_NE(NULL, srpc); homa_xmit_data(srpc, false); unit_log_clear(); @@ -1505,6 +1555,7 @@ TEST_F(homa_incoming, homa_unknown_pkt__client_resend_all) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 2000, 2000); + ASSERT_NE(NULL, crpc); homa_xmit_data(crpc, false); unit_log_clear(); @@ -1530,6 +1581,7 @@ TEST_F(homa_incoming, homa_unknown_pkt__client_resend_part) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 2000, 2000); + ASSERT_NE(NULL, crpc); crpc->msgout.granted = 1400; homa_xmit_data(crpc, false); @@ -1553,6 +1605,7 @@ TEST_F(homa_incoming, homa_unknown_pkt__free_server_rpc) struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, self->server_id, 100, 20000); + ASSERT_NE(NULL, srpc); unit_log_clear(); @@ -1566,10 +1619,6 @@ TEST_F(homa_incoming, homa_cutoffs_pkt_basics) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc); - EXPECT_EQ(10000, crpc->msgout.granted); - unit_log_clear(); - struct cutoffs_header h = {{.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), @@ -1577,6 +1626,11 @@ TEST_F(homa_incoming, homa_cutoffs_pkt_basics) .unsched_cutoffs = {htonl(10), htonl(9), htonl(8), htonl(7), htonl(6), htonl(5), 
htonl(4), htonl(3)}, .cutoff_version = 400}; + + ASSERT_NE(NULL, crpc); + EXPECT_EQ(10000, crpc->msgout.granted); + unit_log_clear(); + homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), &self->homa); EXPECT_EQ(400, crpc->peer->cutoff_version); @@ -1585,7 +1639,6 @@ TEST_F(homa_incoming, homa_cutoffs_pkt_basics) } TEST_F(homa_incoming, homa_cutoffs__cant_find_peer) { - struct homa_peer *peer; struct cutoffs_header h = {{.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), @@ -1594,6 +1647,8 @@ TEST_F(homa_incoming, homa_cutoffs__cant_find_peer) htonl(7), htonl(6), htonl(5), htonl(4), htonl(3)}, .cutoff_version = 400}; struct sk_buff *skb = mock_skb_new(self->server_ip, &h.common, 0, 0); + struct homa_peer *peer; + mock_kmalloc_errors = 1; homa_cutoffs_pkt(skb, &self->hsk); EXPECT_EQ(1, homa_metrics_per_cpu()->peer_kmalloc_errors); @@ -1608,14 +1663,15 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_fully_received) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 100, 3000); - ASSERT_NE(NULL, crpc); - unit_log_clear(); - mock_xmit_log_verbose = 1; struct need_ack_header h = {.common = { .sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = NEED_ACK}}; + + ASSERT_NE(NULL, crpc); + unit_log_clear(); + mock_xmit_log_verbose = 1; homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), &self->homa); EXPECT_STREQ("xmit ACK from 0.0.0.0:32768, dport 99, id 1234, acks", @@ -1628,14 +1684,15 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_not_fully_received) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, 100, 3000); - ASSERT_NE(NULL, crpc); - unit_log_clear(); - mock_xmit_log_verbose = 1; struct need_ack_header h = {.common = { .sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = NEED_ACK}}; + + ASSERT_NE(NULL, crpc); + unit_log_clear(); + mock_xmit_log_verbose = 1; homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), &self->homa); EXPECT_STREQ("", unit_log_get()); @@ -1647,14 +1704,15 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_not_incoming) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 100, 3000); - ASSERT_NE(NULL, crpc); - unit_log_clear(); - mock_xmit_log_verbose = 1; struct need_ack_header h = {.common = { .sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = NEED_ACK}}; + + ASSERT_NE(NULL, crpc); + unit_log_clear(); + mock_xmit_log_verbose = 1; homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), &self->homa); EXPECT_STREQ("", unit_log_get()); @@ -1665,16 +1723,17 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_doesnt_exist) { struct homa_peer *peer = homa_peer_find(self->homa.peers, self->server_ip, &self->hsk.inet); - peer->acks[0].client_port = htons(self->client_port); - peer->acks[0].server_port = htons(self->server_port); - peer->acks[0].client_id = cpu_to_be64(self->client_id+2); - peer->num_acks = 1; - mock_xmit_log_verbose = 1; struct need_ack_header h = {.common = { .sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = NEED_ACK}}; + 
+ peer->acks[0].client_port = htons(self->client_port); + peer->acks[0].server_port = htons(self->server_port); + peer->acks[0].client_id = cpu_to_be64(self->client_id+2); + peer->num_acks = 1; + mock_xmit_log_verbose = 1; homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), &self->homa); EXPECT_STREQ("xmit ACK from 0.0.0.0:32768, dport 99, id 1234, " @@ -1686,16 +1745,17 @@ TEST_F(homa_incoming, homa_ack_pkt__target_rpc_exists) struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, self->server_id, 100, 5000); - ASSERT_NE(NULL, srpc); - EXPECT_EQ(1, unit_list_length(&self->hsk2.active_rpcs)); - unit_log_clear(); - mock_xmit_log_verbose = 1; struct ack_header h = {.common = { .sport = htons(self->client_port), .dport = htons(self->hsk2.port), .sender_id = cpu_to_be64(self->client_id), .type = ACK}, .num_acks = htons(0)}; + + ASSERT_NE(NULL, srpc); + EXPECT_EQ(1, unit_list_length(&self->hsk2.active_rpcs)); + unit_log_clear(); + mock_xmit_log_verbose = 1; homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_EQ(0, unit_list_length(&self->hsk2.active_rpcs)); @@ -1709,17 +1769,18 @@ TEST_F(homa_incoming, homa_ack_pkt__target_rpc_doesnt_exist) struct homa_rpc *srpc2 = unit_server_rpc(&self->hsk2, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, self->server_id+2, 100, 5000); - ASSERT_NE(NULL, srpc1); - ASSERT_NE(NULL, srpc2); - EXPECT_EQ(2, unit_list_length(&self->hsk2.active_rpcs)); - unit_log_clear(); - mock_xmit_log_verbose = 1; struct ack_header h = {.common = { .sport = htons(self->client_port + 1), .dport = htons(self->hsk2.port), .sender_id = cpu_to_be64(self->client_id), .type = ACK}, .num_acks = htons(2)}; + + ASSERT_NE(NULL, srpc1); + ASSERT_NE(NULL, srpc2); + EXPECT_EQ(2, unit_list_length(&self->hsk2.active_rpcs)); + unit_log_clear(); + mock_xmit_log_verbose = 1; h.acks[0] = (struct homa_ack) {.client_port = htons(self->client_port), .server_port = htons(self->server_port), .client_id = cpu_to_be64(self->server_id+5)}; @@ -1751,6 +1812,7 @@ TEST_F(homa_incoming, homa_rpc_abort__socket_shutdown) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + ASSERT_NE(NULL, crpc); unit_log_clear(); self->hsk.shutdown = 1; @@ -1771,6 +1833,7 @@ TEST_F(homa_incoming, homa_abort_rpcs__basics) struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip+1, self->server_port, self->client_id+4, 5000, 1600); + ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); ASSERT_NE(NULL, crpc3); @@ -1789,6 +1852,7 @@ TEST_F(homa_incoming, homa_abort_rpcs__multiple_sockets) UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 5000, 1600); struct homa_rpc *crpc2, *crpc3; + crpc2 = unit_client_rpc(&self->hsk2, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id+2, 5000, 1600); @@ -1820,6 +1884,7 @@ TEST_F(homa_incoming, homa_abort_rpcs__select_addr) struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip+2, self->server_port, self->client_id+4, 5000, 1600); + ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); ASSERT_NE(NULL, crpc3); @@ -1842,6 +1907,7 @@ TEST_F(homa_incoming, homa_abort_rpcs__select_port) struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id+4, 
5000, 1600); + ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); ASSERT_NE(NULL, crpc3); @@ -1866,6 +1932,7 @@ TEST_F(homa_incoming, homa_abort_rpcs__any_port) struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id+4, 5000, 1600); + ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); ASSERT_NE(NULL, crpc3); @@ -1880,6 +1947,7 @@ TEST_F(homa_incoming, homa_abort_rpcs__ignore_dead_rpcs) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 5000, 1600); + ASSERT_NE(NULL, crpc); homa_rpc_free(crpc); EXPECT_EQ(RPC_DEAD, crpc->state); @@ -1892,6 +1960,7 @@ TEST_F(homa_incoming, homa_abort_rpcs__free_server_rpc) struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->client_port, self->server_id, 20000, 100); + ASSERT_NE(NULL, srpc); unit_log_clear(); homa_abort_rpcs(&self->homa, self->client_ip, 0, 0); @@ -1909,6 +1978,7 @@ TEST_F(homa_incoming, homa_abort_sock_rpcs__basics) struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, self->server_id, 20000, 100); + ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); ASSERT_NE(NULL, srpc); @@ -1937,6 +2007,7 @@ TEST_F(homa_incoming, homa_abort_sock_rpcs__rpc_already_dead) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 5000, 1600); + ASSERT_NE(NULL, crpc); homa_rpc_free(crpc); EXPECT_EQ(RPC_DEAD, crpc->state); @@ -1952,6 +2023,7 @@ TEST_F(homa_incoming, homa_abort_sock_rpcs__free_rpcs) struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port+1, self->client_id+2, 5000, 1600); + ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); unit_log_clear(); @@ -1964,6 +2036,7 @@ TEST_F(homa_incoming, homa_abort_sock_rpcs__free_rpcs) TEST_F(homa_incoming, homa_register_interests__id_not_for_client_rpc) { int result; + result = homa_register_interests(&self->interest, &self->hsk, HOMA_RECVMSG_RESPONSE, 45); EXPECT_EQ(EINVAL, -result); @@ -1971,16 +2044,18 @@ TEST_F(homa_incoming, homa_register_interests__id_not_for_client_rpc) TEST_F(homa_incoming, homa_register_interests__no_rpc_for_id) { int result; + result = homa_register_interests(&self->interest, &self->hsk, HOMA_RECVMSG_RESPONSE, 44); EXPECT_EQ(EINVAL, -result); } TEST_F(homa_incoming, homa_register_interests__id_already_has_interest) { - struct homa_interest interest; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + struct homa_interest interest; + ASSERT_NE(NULL, crpc); crpc->interest = &interest; @@ -1994,9 +2069,11 @@ TEST_F(homa_incoming, homa_register_interests__return_response_by_id) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + int result; + ASSERT_NE(NULL, crpc); - int result = homa_register_interests(&self->interest, &self->hsk, + result = homa_register_interests(&self->interest, &self->hsk, 0, self->client_id); EXPECT_EQ(0, result); EXPECT_EQ(crpc, (struct homa_rpc *) @@ -2006,6 +2083,7 @@ TEST_F(homa_incoming, homa_register_interests__return_response_by_id) TEST_F(homa_incoming, homa_register_interests__socket_shutdown) { int result; + self->hsk.shutdown = 1; result = 
homa_register_interests(&self->interest, &self->hsk, HOMA_RECVMSG_RESPONSE, 0); @@ -2017,9 +2095,10 @@ TEST_F(homa_incoming, homa_register_interests__specified_id_has_packets) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc); + int result; - int result = homa_register_interests(&self->interest, &self->hsk, + ASSERT_NE(NULL, crpc); + result = homa_register_interests(&self->interest, &self->hsk, HOMA_RECVMSG_REQUEST, crpc->id); EXPECT_EQ(0, result); EXPECT_EQ(crpc, (struct homa_rpc *) @@ -2031,10 +2110,12 @@ TEST_F(homa_incoming, homa_register_interests__specified_id_has_error) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + int result; + ASSERT_NE(NULL, crpc); crpc->error = -EFAULT; - int result = homa_register_interests(&self->interest, &self->hsk, + result = homa_register_interests(&self->interest, &self->hsk, HOMA_RECVMSG_REQUEST|HOMA_RECVMSG_NONBLOCKING, crpc->id); EXPECT_EQ(0, result); EXPECT_EQ(crpc, (struct homa_rpc *) @@ -2046,9 +2127,10 @@ TEST_F(homa_incoming, homa_register_interests__specified_id_not_ready) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc); + int result; - int result = homa_register_interests(&self->interest, &self->hsk, + ASSERT_NE(NULL, crpc); + result = homa_register_interests(&self->interest, &self->hsk, HOMA_RECVMSG_REQUEST, crpc->id); EXPECT_EQ(0, result); EXPECT_EQ(NULL, (struct homa_rpc *) @@ -2059,9 +2141,10 @@ TEST_F(homa_incoming, homa_register_interests__return_queued_response) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc); + int result; - int result = homa_register_interests(&self->interest, &self->hsk, + ASSERT_NE(NULL, crpc); + result = homa_register_interests(&self->interest, &self->hsk, HOMA_RECVMSG_REQUEST|HOMA_RECVMSG_RESPONSE, 0); EXPECT_EQ(0, result); EXPECT_EQ(crpc, (struct homa_rpc *) @@ -2075,9 +2158,10 @@ TEST_F(homa_incoming, homa_register_interests__return_queued_request) struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->client_port, 1, 20000, 100); - ASSERT_NE(NULL, srpc); + int result; - int result = homa_register_interests(&self->interest, &self->hsk, + ASSERT_NE(NULL, srpc); + result = homa_register_interests(&self->interest, &self->hsk, HOMA_RECVMSG_REQUEST|HOMA_RECVMSG_RESPONSE, 0); EXPECT_EQ(0, result); EXPECT_EQ(srpc, (struct homa_rpc *) @@ -2094,10 +2178,11 @@ TEST_F(homa_incoming, homa_register_interests__call_sk_data_ready) struct homa_rpc *srpc2 = unit_server_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->client_port, self->server_id+2, 20000, 100); + int result; // First time should call sk_data_ready (for 2nd RPC). 
unit_log_clear(); - int result = homa_register_interests(&self->interest, &self->hsk, + result = homa_register_interests(&self->interest, &self->hsk, HOMA_RECVMSG_REQUEST|HOMA_RECVMSG_RESPONSE, 0); EXPECT_EQ(0, result); EXPECT_EQ(srpc1, (struct homa_rpc *) @@ -2122,9 +2207,10 @@ TEST_F(homa_incoming, homa_wait_for_message__rpc_from_register_interests) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc); + struct homa_rpc *rpc; - struct homa_rpc *rpc = homa_wait_for_message(&self->hsk, + ASSERT_NE(NULL, crpc); + rpc = homa_wait_for_message(&self->hsk, HOMA_RECVMSG_RESPONSE|HOMA_RECVMSG_NONBLOCKING, self->client_id); EXPECT_EQ(crpc, rpc); @@ -2135,10 +2221,11 @@ TEST_F(homa_incoming, homa_wait_for_message__error_from_register_interests) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc); + struct homa_rpc *rpc; + ASSERT_NE(NULL, crpc); self->hsk.shutdown = 1; - struct homa_rpc *rpc = homa_wait_for_message(&self->hsk, + rpc = homa_wait_for_message(&self->hsk, HOMA_RECVMSG_RESPONSE|HOMA_RECVMSG_NONBLOCKING, self->client_id); EXPECT_EQ(ESHUTDOWN, -PTR_ERR(rpc)); @@ -2146,12 +2233,12 @@ TEST_F(homa_incoming, homa_wait_for_message__error_from_register_interests) } TEST_F(homa_incoming, homa_wait_for_message__rpc_arrives_while_polling) { - struct homa_rpc *rpc; struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc1); + struct homa_rpc *rpc; + ASSERT_NE(NULL, crpc1); hook_rpc = crpc1; poll_count = 5; self->homa.poll_cycles = 1000000; @@ -2166,10 +2253,11 @@ TEST_F(homa_incoming, homa_wait_for_message__rpc_arrives_while_polling) } TEST_F(homa_incoming, homa_wait_for_message__nothing_ready_nonblocking) { - struct homa_rpc *rpc; struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + struct homa_rpc *rpc; + unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id+2, 20000, 1600); ASSERT_NE(NULL, crpc1); @@ -2180,10 +2268,11 @@ TEST_F(homa_incoming, homa_wait_for_message__nothing_ready_nonblocking) } TEST_F(homa_incoming, homa_wait_for_message__rpc_arrives_while_sleeping) { - struct homa_rpc *rpc; struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + struct homa_rpc *rpc; + ASSERT_NE(NULL, crpc1); /* Also, check to see that reaping occurs before sleeping. */ @@ -2208,10 +2297,11 @@ TEST_F(homa_incoming, homa_wait_for_message__rpc_arrives_while_sleeping) } TEST_F(homa_incoming, homa_wait_for_message__rpc_arrives_after_giving_up) { - struct homa_rpc *rpc; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + struct homa_rpc *rpc; + ASSERT_NE(NULL, crpc); hook_rpc = crpc; @@ -2228,10 +2318,11 @@ TEST_F(homa_incoming, homa_wait_for_message__handoff_rpc_then_delete_after_givin { // A key thing this test does it to ensure that RPC_HANDING_OFF // gets cleared even though the RPC has been deleted. 
- struct homa_rpc *rpc; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + struct homa_rpc *rpc; + ASSERT_NE(NULL, crpc); // Prevent the RPC from being reaped during the test. @@ -2250,13 +2341,13 @@ TEST_F(homa_incoming, homa_wait_for_message__handoff_rpc_then_delete_after_givin } TEST_F(homa_incoming, homa_wait_for_message__explicit_rpc_deleted_while_sleeping) { - struct homa_rpc *rpc; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + struct homa_rpc *rpc; + ASSERT_NE(NULL, crpc); unit_log_clear(); - hook_rpc = crpc; unit_hook_register(delete_hook); rpc = homa_wait_for_message(&self->hsk, HOMA_RECVMSG_RESPONSE, @@ -2265,13 +2356,13 @@ TEST_F(homa_incoming, homa_wait_for_message__explicit_rpc_deleted_while_sleeping } TEST_F(homa_incoming, homa_wait_for_message__socket_shutdown_while_sleeping) { - struct homa_rpc *rpc; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + struct homa_rpc *rpc; + ASSERT_NE(NULL, crpc); unit_log_clear(); - hook_hsk = &self->hsk; unit_hook_register(shutdown_hook); rpc = homa_wait_for_message(&self->hsk, @@ -2280,14 +2371,14 @@ TEST_F(homa_incoming, homa_wait_for_message__socket_shutdown_while_sleeping) } TEST_F(homa_incoming, homa_wait_for_message__copy_to_user) { - struct homa_rpc *rpc; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + struct homa_rpc *rpc; + ASSERT_NE(NULL, crpc); mock_copy_to_user_dont_copy = -1; unit_log_clear(); - hook_hsk = &self->hsk; rpc = homa_wait_for_message(&self->hsk, HOMA_RECVMSG_RESPONSE|HOMA_RECVMSG_NONBLOCKING, 0); @@ -2300,14 +2391,15 @@ TEST_F(homa_incoming, homa_wait_for_message__rpc_freed_after_matching) /* Arrange for 2 RPCs to be ready, but delete the first one after * it has matched; this should cause the second one to be matched. 
*/ - struct homa_rpc *rpc; struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - ASSERT_NE(NULL, crpc1); struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id+2, 20000, 1600); + struct homa_rpc *rpc; + + ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); unit_log_clear(); @@ -2321,10 +2413,11 @@ TEST_F(homa_incoming, homa_wait_for_message__rpc_freed_after_matching) } TEST_F(homa_incoming, homa_wait_for_message__copy_to_user_fails) { - struct homa_rpc *rpc; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + struct homa_rpc *rpc; + ASSERT_NE(NULL, crpc); unit_log_clear(); mock_copy_data_errors = 1; @@ -2340,10 +2433,11 @@ TEST_F(homa_incoming, homa_wait_for_message__copy_to_user_fails) } TEST_F(homa_incoming, homa_wait_for_message__message_complete) { - struct homa_rpc *rpc; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 2000); + struct homa_rpc *rpc; + ASSERT_NE(NULL, crpc); mock_copy_to_user_dont_copy = -1; unit_log_clear(); @@ -2371,11 +2465,13 @@ TEST_F(homa_incoming, homa_choose_interest__empty_list) struct homa_interest *result = homa_choose_interest(&self->homa, &self->hsk.request_interests, offsetof(struct homa_interest, request_links)); + EXPECT_EQ(NULL, result); } TEST_F(homa_incoming, homa_choose_interest__find_idle_core) { struct homa_interest interest1, interest2, interest3; + homa_interest_init(&interest1); interest1.core = 1; list_add_tail(&interest1.request_links, &self->hsk.request_interests); @@ -2402,6 +2498,7 @@ TEST_F(homa_incoming, homa_choose_interest__find_idle_core) TEST_F(homa_incoming, homa_choose_interest__all_cores_busy) { struct homa_interest interest1, interest2, interest3; + homa_interest_init(&interest1); interest1.core = 1; list_add_tail(&interest1.request_links, &self->hsk.request_interests); @@ -2428,10 +2525,11 @@ TEST_F(homa_incoming, homa_choose_interest__all_cores_busy) TEST_F(homa_incoming, homa_rpc_handoff__handoff_already_in_progress) { - struct homa_interest interest; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + struct homa_interest interest; + ASSERT_NE(NULL, crpc); EXPECT_EQ(NULL, crpc->interest); unit_log_clear(); @@ -2450,10 +2548,11 @@ TEST_F(homa_incoming, homa_rpc_handoff__handoff_already_in_progress) } TEST_F(homa_incoming, homa_rpc_handoff__rpc_already_enqueued) { - struct homa_interest interest; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + struct homa_interest interest; + ASSERT_NE(NULL, crpc); EXPECT_EQ(NULL, crpc->interest); unit_log_clear(); @@ -2479,10 +2578,11 @@ TEST_F(homa_incoming, homa_rpc_handoff__rpc_already_enqueued) } TEST_F(homa_incoming, homa_rpc_handoff__interest_on_rpc) { - struct homa_interest interest; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + struct homa_interest interest; + ASSERT_NE(NULL, crpc); EXPECT_EQ(NULL, crpc->interest); unit_log_clear(); @@ -2502,10 +2602,11 @@ TEST_F(homa_incoming, 
homa_rpc_handoff__interest_on_rpc) } TEST_F(homa_incoming, homa_rpc_handoff__response_interests) { - struct homa_interest interest; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + struct homa_interest interest; + ASSERT_NE(NULL, crpc); EXPECT_EQ(NULL, crpc->interest); unit_log_clear(); @@ -2525,22 +2626,22 @@ TEST_F(homa_incoming, homa_rpc_handoff__queue_on_ready_responses) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + ASSERT_NE(NULL, crpc); unit_log_clear(); - homa_rpc_handoff(crpc); EXPECT_STREQ("sk->sk_data_ready invoked", unit_log_get()); EXPECT_EQ(1, unit_list_length(&self->hsk.ready_responses)); } TEST_F(homa_incoming, homa_rpc_handoff__request_interests) { - struct homa_interest interest; struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, self->server_id, 20000, 100); + struct homa_interest interest; + ASSERT_NE(NULL, srpc); unit_log_clear(); - homa_interest_init(&interest); interest.thread = &mock_task; list_add_tail(&interest.request_links, &self->hsk.request_interests); @@ -2556,6 +2657,7 @@ TEST_F(homa_incoming, homa_rpc_handoff__queue_on_ready_requests) struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, 1, 20000, 100); + ASSERT_NE(NULL, srpc); unit_log_clear(); @@ -2565,10 +2667,11 @@ TEST_F(homa_incoming, homa_rpc_handoff__queue_on_ready_requests) } TEST_F(homa_incoming, homa_rpc_handoff__detach_interest) { - struct homa_interest interest; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + struct homa_interest interest; + ASSERT_NE(NULL, crpc); EXPECT_EQ(NULL, crpc->interest); unit_log_clear(); @@ -2594,10 +2697,11 @@ TEST_F(homa_incoming, homa_rpc_handoff__detach_interest) } TEST_F(homa_incoming, homa_rpc_handoff__update_last_app_active) { - struct homa_interest interest; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + struct homa_interest interest; + ASSERT_NE(NULL, crpc); EXPECT_EQ(NULL, crpc->interest); unit_log_clear(); diff --git a/test/unit_homa_metrics.c b/test/unit_homa_metrics.c index dc5d5044..4bcb73cb 100644 --- a/test/unit_homa_metrics.c +++ b/test/unit_homa_metrics.c @@ -55,8 +55,9 @@ TEST_F(homa_metrics, homa_metrics_open) } TEST_F(homa_metrics, homa_metrics_read__basics) { - char buffer[1000]; loff_t offset = 10; + char buffer[1000]; + self->homa.metrics = kmalloc(100, GFP_KERNEL); self->homa.metrics_capacity = 100; strcpy(self->homa.metrics, "0123456789abcdefghijklmnop"); @@ -77,8 +78,9 @@ TEST_F(homa_metrics, homa_metrics_read__basics) } TEST_F(homa_metrics, homa_metrics_read__error_copying_to_user) { - char buffer[1000]; loff_t offset = 10; + char buffer[1000]; + self->homa.metrics = kmalloc(100, GFP_KERNEL); self->homa.metrics_capacity = 100; strcpy(self->homa.metrics, "0123456789abcdefghijklmnop"); diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index 9691bedf..523ac1e9 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -43,6 +43,7 @@ static struct sk_buff *tcp6_gro_receive(struct list_head *held_list, FIXTURE_SETUP(homa_offload) { int i; + homa_init(&self->homa); 
self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; homa = &self->homa; @@ -133,8 +134,9 @@ TEST_F(homa_offload, homa_gro_hook_tcp) TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_tcp) { - struct sk_buff *skb; struct common_header *h; + struct sk_buff *skb; + homa_gro_hook_tcp(); self->header.seg.offset = htonl(6000); skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); @@ -155,8 +157,9 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_tcp) } TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_homa_ipv6) { - struct sk_buff *skb; struct common_header *h; + struct sk_buff *skb; + homa_gro_hook_tcp(); self->header.seg.offset = htonl(6000); skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); @@ -176,8 +179,9 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_homa_ipv6) } TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_homa_ipv4) { - struct sk_buff *skb; struct common_header *h; + struct sk_buff *skb; + mock_ipv6 = false; homa_gro_hook_tcp(); self->header.seg.offset = htonl(6000); @@ -200,9 +204,10 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_homa_ipv4) TEST_F(homa_offload, homa_gso_segment_set_ip_ids) { + struct sk_buff *skb; mock_ipv6 = false; - struct sk_buff *skb = mock_skb_new(&self->ip, &self->header.common, - 1400, 2000); + + skb = mock_skb_new(&self->ip, &self->header.common, 1400, 2000); int version = ip_hdr(skb)->version; EXPECT_EQ(4, version); struct sk_buff *segs = homa_gso_segment(skb, 0); @@ -248,22 +253,28 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS) { struct in6_addr client_ip = unit_get_in_addr("196.168.0.1"); struct in6_addr server_ip = unit_get_in_addr("1.2.3.4"); + struct sk_buff *skb, *skb2, *skb3, *skb4; int client_port = 40000; - int server_port = 99; __u64 client_id = 1234; __u64 server_id = 1235; - struct data_header h = {.common = { - .sport = htons(40000), .dport = htons(server_port), - .type = DATA, - .sender_id = cpu_to_be64(client_id)}, - .message_length = htonl(10000), - .incoming = htonl(10000), .cutoff_version = 0, - .ack = {0, 0, 0}, - .retransmit = 0, - .seg = {.offset = htonl(2000)}}; - struct sk_buff *skb, *skb2, *skb3, *skb4; - - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, + struct homa_rpc *srpc; + int server_port = 99; + struct data_header h; + + h.common.sport = htons(40000); + h.common.dport = htons(server_port); + h.common.type = DATA; + h.common.sender_id = cpu_to_be64(client_id); + h.message_length = htonl(10000); + h.incoming = htonl(10000); + h.cutoff_version = 0; + h.ack.client_id = 0; + h.ack.client_port = 0; + h.ack.server_port = 0; + h.retransmit = 0; + h.seg.offset = htonl(2000); + + srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &client_ip, &server_ip, client_port, server_id, 10000, 200); ASSERT_NE(NULL, srpc); @@ -312,20 +323,23 @@ TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization) int client_port = 40000; __u64 client_id = 1234; __u64 server_id = 1235; - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, + struct homa_rpc *srpc; + struct grant_header h; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &client_ip, &server_ip, client_port, server_id, 100, 20000); ASSERT_NE(NULL, srpc); homa_xmit_data(srpc, false); unit_log_clear(); - struct grant_header h = {{.sport = htons(srpc->dport), - .dport = htons(self->hsk.port), - .sender_id = cpu_to_be64(client_id), - .type = GRANT}, - .offset = htonl(11000), - .priority = 3, - .resend_all = 0}; + h.common.sport = htons(srpc->dport); + h.common.dport = htons(self->hsk.port); +
h.common.sender_id = cpu_to_be64(client_id); + h.common.type = GRANT; + h.offset = htonl(11000); + h.priority = 3; + h.resend_all = 0; /* First attempt: HOMA_GRO_FAST_GRANTS not enabled. */ self->homa.gro_policy = 0; @@ -357,6 +371,7 @@ TEST_F(homa_offload, homa_gro_receive__no_held_skb) { struct sk_buff *skb; int same_flow; + self->header.seg.offset = htonl(6000); skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); skb->hash = 2; @@ -374,6 +389,7 @@ TEST_F(homa_offload, homa_gro_receive__empty_merge_list) { struct sk_buff *skb; int same_flow; + self->header.seg.offset = htonl(6000); skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); skb->hash = 2; @@ -391,6 +407,7 @@ TEST_F(homa_offload, homa_gro_receive__held_skb_not_in_merge_list) { struct sk_buff *skb; int same_flow; + self->header.seg.offset = htonl(6000); skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); skb->hash = 3; @@ -408,6 +425,7 @@ TEST_F(homa_offload, homa_gro_receive__held_skb__in_merge_list_but_wrong_proto) { struct sk_buff *skb; int same_flow; + self->header.seg.offset = htonl(6000); skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); skb->hash = 3; @@ -429,6 +447,7 @@ TEST_F(homa_offload, homa_gro_receive__merge) { struct sk_buff *skb, *skb2; int same_flow; + cur_offload_core->held_skb = self->skb2; cur_offload_core->held_bucket = 2; diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 1867828f..1bf6af8e 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -93,6 +93,7 @@ TEST_F(homa_outgoing, homa_fill_data_interleaved) { struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); + homa_rpc_unlock(crpc); struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); homa_message_out_init(crpc, 10000); @@ -118,32 +119,34 @@ TEST_F(homa_outgoing, homa_fill_data_interleaved) TEST_F(homa_outgoing, homa_fill_data_interleaved__error_copying_data) { struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, - &self->server_addr); - homa_rpc_unlock(crpc); + &self->server_addr); struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); + struct sk_buff *skb; + + homa_rpc_unlock(crpc); homa_message_out_init(crpc, 10000); unit_log_clear(); mock_copy_data_errors = 1; - struct sk_buff *skb = homa_new_data_packet(crpc, iter, 10000, 5000, - 1500); + skb = homa_new_data_packet(crpc, iter, 10000, 5000, 1500); EXPECT_EQ(EFAULT, -PTR_ERR(skb)); } TEST_F(homa_outgoing, homa_new_data_packet__one_segment) { + struct iov_iter *iter = unit_iov_iter((void *) 1000, 5000); struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); + struct sk_buff *skb; + char buffer[1000]; + homa_rpc_unlock(crpc); - struct iov_iter *iter = unit_iov_iter((void *) 1000, 5000); homa_message_out_init(crpc, 500); unit_log_clear(); - struct sk_buff *skb = homa_new_data_packet(crpc, iter, 5000, 500, - 2000); + skb = homa_new_data_packet(crpc, iter, 5000, 500, 2000); EXPECT_STREQ("_copy_from_iter 500 bytes at 1000", unit_log_get()); - char buffer[1000]; EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, " "message_length 500, offset 5000, data_length 500, " "incoming 500", @@ -154,29 +157,32 @@ TEST_F(homa_outgoing, homa_new_data_packet__one_segment) } TEST_F(homa_outgoing, homa_new_data_packet__cant_allocate_skb) { + struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); + struct sk_buff *skb; + homa_rpc_unlock(crpc); - struct iov_iter *iter = unit_iov_iter((void *)1000, 
5000); homa_message_out_init(crpc, 500); unit_log_clear(); mock_alloc_skb_errors = 1; - struct sk_buff *skb = homa_new_data_packet(crpc, iter, 0, 500, 2000); + skb = homa_new_data_packet(crpc, iter, 0, 500, 2000); EXPECT_TRUE(IS_ERR(skb)); EXPECT_EQ(ENOMEM, -PTR_ERR(skb)); } TEST_F(homa_outgoing, homa_new_data_packet__multiple_segments_homa_fill_data_interleaved) { + struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); + struct sk_buff *skb; + homa_rpc_unlock(crpc); - struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); homa_message_out_init(crpc, 10000); unit_log_clear(); - struct sk_buff *skb = homa_new_data_packet(crpc, iter, 10000, 5000, - 1500); + skb = homa_new_data_packet(crpc, iter, 10000, 5000, 1500); EXPECT_STREQ("_copy_from_iter 1500 bytes at 1000; " "_copy_from_iter 1500 bytes at 2500; " "_copy_from_iter 1500 bytes at 4000; " @@ -197,10 +203,11 @@ TEST_F(homa_outgoing, homa_new_data_packet__multiple_segments_homa_fill_data_int } TEST_F(homa_outgoing, homa_new_data_packet__error_in_homa_fill_data_interleaved) { + struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, - &self->server_addr); + &self->server_addr); + homa_rpc_unlock(crpc); - struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); homa_message_out_init(crpc, 10000); unit_log_clear(); @@ -212,20 +219,22 @@ TEST_F(homa_outgoing, homa_new_data_packet__error_in_homa_fill_data_interleaved) } TEST_F(homa_outgoing, homa_new_data_packet__multiple_segments_tcp_hijacking) { - self->homa.hijack_tcp = 1; + struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); + struct homa_rpc *crpc; struct homa_sock hsk; + struct sk_buff *skb; + char buffer[1000]; + + self->homa.hijack_tcp = 1; mock_sock_init(&hsk, &self->homa, self->client_port+1); - struct homa_rpc *crpc = homa_rpc_new_client(&hsk, &self->server_addr); + crpc = homa_rpc_new_client(&hsk, &self->server_addr); homa_rpc_unlock(crpc); - struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); homa_message_out_init(crpc, 10000); unit_log_clear(); - struct sk_buff *skb = homa_new_data_packet(crpc, iter, 10000, 5000, - 1500); + skb = homa_new_data_packet(crpc, iter, 10000, 5000, 1500); EXPECT_STREQ("_copy_from_iter 5000 bytes at 1000", unit_log_get()); - char buffer[1000]; EXPECT_STREQ("DATA from 0.0.0.0:40001, dport 99, id 2, " "message_length 10000, offset 10000, data_length 1500, " "incoming 10000, extra segs 1500@11500 1500@13000 " @@ -236,30 +245,32 @@ TEST_F(homa_outgoing, homa_new_data_packet__multiple_segments_tcp_hijacking) } TEST_F(homa_outgoing, homa_new_data_packet__error_copying_data_hijacking_path) { + struct iov_iter *iter = unit_iov_iter((void *) 1000, 5000); struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); + struct sk_buff *skb; + homa_rpc_unlock(crpc); - struct iov_iter *iter = unit_iov_iter((void *) 1000, 5000); homa_message_out_init(crpc, 500); unit_log_clear(); mock_copy_data_errors = 1; - struct sk_buff *skb = homa_new_data_packet(crpc, iter, 5000, 500, - 2000); + skb = homa_new_data_packet(crpc, iter, 5000, 500, 2000); EXPECT_TRUE(IS_ERR(skb)); EXPECT_EQ(EFAULT, -PTR_ERR(skb)); } TEST_F(homa_outgoing, homa_new_data_packet__gso_information) { + struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, - &self->server_addr); + &self->server_addr); + struct sk_buff *skb; + homa_rpc_unlock(crpc); - struct iov_iter *iter = 
unit_iov_iter((void *)1000, 5000); homa_message_out_init(crpc, 10000); unit_log_clear(); - struct sk_buff *skb = homa_new_data_packet(crpc, iter, 10000, 5000, - 1500); + skb = homa_new_data_packet(crpc, iter, 10000, 5000, 1500); EXPECT_EQ(4, skb_shinfo(skb)->gso_segs); EXPECT_EQ(1500 + sizeof(struct seg_header), @@ -269,16 +280,17 @@ TEST_F(homa_outgoing, homa_new_data_packet__gso_information) } TEST_F(homa_outgoing, homa_new_data_packet__gso_force_software) { + struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, - &self->server_addr); + &self->server_addr); + struct sk_buff *skb; + homa_rpc_unlock(crpc); - struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); homa_message_out_init(crpc, 10000); self->homa.gso_force_software = 1; unit_log_clear(); - struct sk_buff *skb = homa_new_data_packet(crpc, iter, 10000, 5000, - 1500); + skb = homa_new_data_packet(crpc, iter, 10000, 5000, 1500); EXPECT_EQ(13, skb_shinfo(skb)->gso_type); kfree_skb(skb); } @@ -287,6 +299,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__basics) { struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); + ASSERT_FALSE(crpc == NULL); ASSERT_EQ(0, -homa_message_out_fill(crpc, unit_iov_iter((void *) 1000, 3000), 0)); @@ -316,6 +329,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__message_too_long) { struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); + ASSERT_FALSE(crpc == NULL); EXPECT_EQ(EINVAL, -homa_message_out_fill(crpc, unit_iov_iter((void *) 1000, HOMA_MAX_MESSAGE_LENGTH+1), @@ -326,6 +340,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__zero_length_message) { struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); + ASSERT_FALSE(crpc == NULL); EXPECT_EQ(EINVAL, -homa_message_out_fill(crpc, unit_iov_iter((void *) 1000, 0), 0)); @@ -335,6 +350,8 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_force_software) { struct homa_rpc *crpc1 = homa_rpc_new_client(&self->hsk, &self->server_addr); + struct homa_rpc *crpc2; + ASSERT_FALSE(crpc1 == NULL); homa_rpc_unlock(crpc1); mock_net_device.gso_max_size = 10000; @@ -347,8 +364,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_force_software) EXPECT_SUBSTR("xmit DATA", unit_log_get()); EXPECT_NOSUBSTR("TSO disabled", unit_log_get()); - struct homa_rpc *crpc2 = homa_rpc_new_client(&self->hsk, - &self->server_addr); + crpc2 = homa_rpc_new_client(&self->hsk, &self->server_addr); ASSERT_FALSE(crpc2 == NULL); homa_rpc_unlock(crpc2); self->homa.gso_force_software = 1; @@ -362,6 +378,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_limit_less_than_mtu) { struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); + ASSERT_FALSE(crpc == NULL); unit_log_clear(); mock_net_device.gso_max_size = 10000; @@ -375,6 +392,8 @@ TEST_F(homa_outgoing, homa_message_out_fill__include_acks) { struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); + struct data_header h; + ASSERT_FALSE(crpc == NULL); crpc->peer->acks[0] = (struct homa_ack) { .client_port = htons(100), @@ -384,7 +403,6 @@ TEST_F(homa_outgoing, homa_message_out_fill__include_acks) ASSERT_EQ(0, -homa_message_out_fill(crpc, unit_iov_iter((void *) 1000, 500), 0)); homa_rpc_unlock(crpc); - struct data_header h; homa_skb_get(crpc->msgout.packets, &h, 0, sizeof(h)); EXPECT_STREQ("client_port 100, server_port 200, client_id 1000", unit_ack_string(&h.ack)); @@ -393,6 +411,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__multiple_segs_per_skbuff) { struct 
homa_rpc *crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); + ASSERT_FALSE(crpc == NULL); mock_net_device.gso_max_size = 5000; unit_log_clear(); @@ -420,6 +439,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__rpc_freed_during_copy) { struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); + ASSERT_FALSE(crpc == NULL); unit_hook_register(unlock_hook); hook_rpc = crpc; @@ -433,6 +453,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__add_to_throttled) { struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); + ASSERT_FALSE(crpc == NULL); ASSERT_EQ(0, -homa_message_out_fill(crpc, unit_iov_iter((void *) 1000, 5000), 1)); @@ -451,6 +472,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__too_short_for_pipelining) { struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); + ASSERT_FALSE(crpc == NULL); ASSERT_EQ(0, -homa_message_out_fill(crpc, unit_iov_iter((void *) 1000, 1000), 1)); @@ -591,7 +613,6 @@ TEST_F(homa_outgoing, __homa_xmit_control__ipv6_error) TEST_F(homa_outgoing, homa_xmit_unknown) { - struct sk_buff *skb; struct grant_header h = {{.sport = htons(self->client_port), .dport = htons(self->server_port), .sender_id = cpu_to_be64(99990), @@ -599,6 +620,8 @@ TEST_F(homa_outgoing, homa_xmit_unknown) .offset = htonl(11200), .priority = 3, .resend_all = 0}; + struct sk_buff *skb; + mock_xmit_log_verbose = 1; skb = mock_skb_new(self->client_ip, &h.common, 0, 0); homa_xmit_unknown(skb, &self->hsk); @@ -612,6 +635,7 @@ TEST_F(homa_outgoing, homa_xmit_data__basics) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 6000, 1000); + crpc->msgout.sched_priority = 2; crpc->msgout.unscheduled = 2000; crpc->msgout.granted = 5000; @@ -635,8 +659,8 @@ TEST_F(homa_outgoing, homa_xmit_data__stop_because_no_more_granted) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 6000, 1000); - unit_log_clear(); + unit_log_clear(); crpc->msgout.granted = 1000; homa_xmit_data(crpc, false); EXPECT_STREQ("xmit DATA 1400@0", unit_log_get()); @@ -649,6 +673,7 @@ TEST_F(homa_outgoing, homa_xmit_data__below_throttle_min) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 200, 1000); + unit_log_clear(); atomic64_set(&self->homa.link_idle_time, 11000); self->homa.max_nic_queue_cycles = 500; @@ -693,6 +718,7 @@ TEST_F(homa_outgoing, homa_xmit_data__throttle) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 6000, 1000); + unit_log_clear(); atomic64_set(&self->homa.link_idle_time, 11000); self->homa.max_nic_queue_cycles = 3000; @@ -711,6 +737,7 @@ TEST_F(homa_outgoing, homa_xmit_data__rpc_freed) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 6000, 1000); + crpc->msgout.unscheduled = 2000; crpc->msgout.granted = 5000; @@ -728,6 +755,7 @@ TEST_F(homa_outgoing, __homa_xmit_data__update_cutoff_version) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 1000, 1000); + crpc->peer->cutoff_version = htons(123); mock_xmit_log_verbose = 1; unit_log_clear(); @@ -737,11 +765,12 @@ TEST_F(homa_outgoing, __homa_xmit_data__update_cutoff_version) } 
TEST_F(homa_outgoing, __homa_xmit_data__fill_dst) { - int old_refcount; - struct dst_entry *dst; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 1000, 1000); + struct dst_entry *dst; + int old_refcount; + unit_log_clear(); dst = crpc->peer->dst; old_refcount = atomic_read(&dst->__rcuref.refcnt); @@ -754,14 +783,16 @@ TEST_F(homa_outgoing, __homa_xmit_data__fill_dst) } TEST_F(homa_outgoing, __homa_xmit_data__ipv4_transmit_error) { + struct homa_rpc *crpc; + // Make sure the test uses IPv4. mock_ipv6 = false; homa_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, &self->homa, self->client_port); - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 1000); + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 1000, 1000); unit_log_clear(); mock_ip_queue_xmit_errors = 1; skb_get(crpc->msgout.packets); @@ -770,14 +801,16 @@ TEST_F(homa_outgoing, __homa_xmit_data__ipv4_transmit_error) } TEST_F(homa_outgoing, __homa_xmit_data__ipv6_transmit_error) { + struct homa_rpc *crpc; + // Make sure the test uses IPv6. mock_ipv6 = true; homa_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, &self->homa, self->client_port); - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 100, 1000); + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 100, 1000); unit_log_clear(); mock_ip6_xmit_errors = 1; skb_get(crpc->msgout.packets); @@ -787,10 +820,12 @@ TEST_F(homa_outgoing, __homa_xmit_data__ipv6_transmit_error) TEST_F(homa_outgoing, homa_resend_data__basics) { + struct homa_rpc *crpc; + mock_net_device.gso_max_size = 5000; - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 16000, 1000); + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 16000, 1000); unit_log_clear(); mock_clear_xmit_prios(); mock_xmit_log_verbose = 1; @@ -842,16 +877,20 @@ TEST_F(homa_outgoing, homa_resend_data__packet_doesnt_use_gso) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 1000, 2000); + unit_log_clear(); homa_resend_data(crpc, 500, 1500, 2); EXPECT_STREQ("xmit DATA retrans 1000@0", unit_log_get()); } TEST_F(homa_outgoing, homa_resend_data__cant_allocate_skb) { + struct homa_rpc *crpc; + mock_net_device.gso_max_size = 5000; - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 16000, 1000); + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 16000, 1000); + unit_log_clear(); mock_clear_xmit_prios(); mock_alloc_skb_errors = 1; @@ -860,10 +899,12 @@ TEST_F(homa_outgoing, homa_resend_data__cant_allocate_skb) } TEST_F(homa_outgoing, homa_resend_data__set_incoming) { + struct homa_rpc *crpc; + mock_net_device.gso_max_size = 5000; - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 16000, 1000); + crpc =
unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 16000, 1000); unit_log_clear(); mock_xmit_log_verbose = 1; EXPECT_EQ(10000, crpc->msgout.granted); @@ -880,10 +921,12 @@ TEST_F(homa_outgoing, homa_resend_data__set_incoming) } TEST_F(homa_outgoing, homa_resend_data__error_copying_data) { + struct homa_rpc *crpc; + mock_net_device.gso_max_size = 5000; - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, - self->server_port, self->client_id, 16000, 1000); + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 16000, 1000); unit_log_clear(); mock_clear_xmit_prios(); mock_max_skb_frags = 0; @@ -893,10 +936,12 @@ TEST_F(homa_outgoing, homa_resend_data__error_copying_data) } TEST_F(homa_outgoing, homa_resend_data__set_homa_info) { + struct homa_rpc *crpc; + mock_net_device.gso_max_size = 5000; - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 16000, 1000); + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 16000, 1000); unit_log_clear(); mock_xmit_log_homa_info = 1; homa_resend_data(crpc, 8400, 8800, 2); @@ -931,6 +976,7 @@ TEST_F(homa_outgoing, homa_check_nic_queue__basics) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 500, 1000); + homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; unit_log_clear(); atomic64_set(&self->homa.link_idle_time, 9000); @@ -946,6 +992,7 @@ TEST_F(homa_outgoing, homa_check_nic_queue__queue_full) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 500, 1000); + homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; unit_log_clear(); atomic64_set(&self->homa.link_idle_time, 9000); @@ -961,6 +1008,7 @@ TEST_F(homa_outgoing, homa_check_nic_queue__queue_full_but_force) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 500, 1000); + homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; unit_log_clear(); atomic64_set(&self->homa.link_idle_time, 9000); @@ -976,6 +1024,7 @@ TEST_F(homa_outgoing, homa_check_nic_queue__pacer_metrics) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 500, 1000); + homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; homa_add_to_throttled(crpc); unit_log_clear(); @@ -995,6 +1044,7 @@ TEST_F(homa_outgoing, homa_check_nic_queue__queue_empty) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 500, 1000); + homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; unit_log_clear(); atomic64_set(&self->homa.link_idle_time, 9000); @@ -1022,6 +1072,7 @@ TEST_F(homa_outgoing, homa_pacer_xmit__basics) UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id+4, 150000, 1000); + homa_add_to_throttled(crpc1); homa_add_to_throttled(crpc2); homa_add_to_throttled(crpc3); @@ -1039,18 +1090,17 @@ TEST_F(homa_outgoing, homa_pacer_xmit__basics) } TEST_F(homa_outgoing, homa_pacer_xmit__xmit_fifo) { + struct homa_rpc *crpc1, *crpc2, 
*crpc3; + mock_cycles = 10000; - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, 2, 20000, 1000); + crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, 2, 20000, 1000); mock_cycles = 11000; - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, 4, 10000, 1000); + crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, 4, 10000, 1000); mock_cycles = 12000; - struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, 6, 30000, 1000); + crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, 6, 30000, 1000); homa_add_to_throttled(crpc1); homa_add_to_throttled(crpc2); homa_add_to_throttled(crpc3); @@ -1093,6 +1143,7 @@ TEST_F(homa_outgoing, homa_pacer_xmit__pacer_busy) UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 10000, 1000); + homa_add_to_throttled(crpc); self->homa.max_nic_queue_cycles = 2000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; @@ -1119,6 +1170,7 @@ TEST_F(homa_outgoing, homa_pacer_xmit__nic_queue_fills) UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 10000, 1000); + homa_add_to_throttled(crpc); self->homa.max_nic_queue_cycles = 2001; mock_cycles = 10000; @@ -1137,6 +1189,7 @@ TEST_F(homa_outgoing, homa_pacer_xmit__rpc_locked) UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 5000, 1000); + homa_add_to_throttled(crpc); self->homa.max_nic_queue_cycles = 2000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; @@ -1161,6 +1214,7 @@ TEST_F(homa_outgoing, homa_pacer_xmit__remove_from_queue) UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 4, 10000, 1000); + homa_add_to_throttled(crpc1); homa_add_to_throttled(crpc2); self->homa.max_nic_queue_cycles = 2000; diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index 7bd5cfdd..aea66c4a 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -46,6 +46,7 @@ static int dead_count(struct homa_peertab *peertab) { struct list_head *pos; int count = 0; + list_for_each(pos, &peertab->dead_dsts) count++; return count; @@ -95,6 +96,7 @@ static void peer_lock_hook(char *id) { TEST_F(homa_peer, homa_peertab_init__vmalloc_failed) { struct homa_peertab table; + mock_vmalloc_errors = 1; EXPECT_EQ(ENOMEM, -homa_peertab_init(&table)); @@ -105,6 +107,7 @@ TEST_F(homa_peer, homa_peertab_init__vmalloc_failed) TEST_F(homa_peer, homa_peertab_gc_dsts) { struct homa_peer *peer; + peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); mock_cycles = 0; homa_dst_refresh(&self->peertab, peer, &self->hsk); @@ -124,6 +127,7 @@ TEST_F(homa_peer, homa_peertab_get_peers__not_init) { struct homa_peertab peertab; int num_peers = 45; + memset(&peertab, 0, sizeof(peertab)); EXPECT_EQ(NULL, homa_peertab_get_peers(&peertab, &num_peers)); EXPECT_EQ(0, num_peers); @@ -137,6 +141,7 @@ TEST_F(homa_peer, homa_peertab_get_peers__table_empty) TEST_F(homa_peer, homa_peertab_get_peers__kmalloc_fails) { int num_peers = 45; + mock_kmalloc_errors = 1; homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); EXPECT_EQ(NULL, homa_peertab_get_peers(&self->peertab, &num_peers)); @@ -147,6 +152,7 @@ TEST_F(homa_peer, homa_peertab_get_peers__one_peer) struct homa_peer **peers; struct 
homa_peer *peer; int num_peers = 45; + peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); peers = homa_peertab_get_peers(&self->peertab, &num_peers); ASSERT_NE(NULL, peers); @@ -156,9 +162,10 @@ TEST_F(homa_peer, homa_peertab_get_peers__one_peer) } TEST_F(homa_peer, homa_peertab_get_peers__multiple_peers) { - struct homa_peer **peers; struct homa_peer *peer1, *peer2, *peer3; + struct homa_peer **peers; int num_peers = 45; + peer1 = homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); peer2 = homa_peer_find(&self->peertab, ip2222, &self->hsk.inet); peer3 = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); @@ -208,8 +215,9 @@ TEST_F(homa_peer, homa_peer_find__route_error) TEST_F(homa_peer, homa_dst_refresh__basics) { - struct homa_peer *peer; struct dst_entry *old_dst; + struct homa_peer *peer; + peer = homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); ASSERT_NE(NULL, peer); EXPECT_EQ_IP(*ip1111, peer->addr); @@ -221,8 +229,9 @@ TEST_F(homa_peer, homa_dst_refresh__basics) } TEST_F(homa_peer, homa_dst_refresh__routing_error) { - struct homa_peer *peer; struct dst_entry *old_dst; + struct homa_peer *peer; + peer = homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); ASSERT_NE(NULL, peer); EXPECT_EQ_IP(*ip1111, peer->addr); @@ -236,8 +245,9 @@ TEST_F(homa_peer, homa_dst_refresh__routing_error) } TEST_F(homa_peer, homa_dst_refresh__malloc_error) { - struct homa_peer *peer; struct dst_entry *old_dst; + struct homa_peer *peer; + peer = homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); ASSERT_NE(NULL, peer); EXPECT_EQ_IP(*ip1111, peer->addr); @@ -251,6 +261,7 @@ TEST_F(homa_peer, homa_dst_refresh__malloc_error) TEST_F(homa_peer, homa_dst_refresh__free_old_dsts) { struct homa_peer *peer; + peer = homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); ASSERT_NE(NULL, peer); EXPECT_EQ_IP(*ip1111, peer->addr); @@ -267,6 +278,7 @@ TEST_F(homa_peer, homa_dst_refresh__free_old_dsts) TEST_F(homa_peer, homa_unsched_priority) { struct homa_peer peer; + homa_peer_set_cutoffs(&peer, INT_MAX, 0, 0, INT_MAX, 200, 100, 0, 0); EXPECT_EQ(5, homa_unsched_priority(&self->homa, &peer, 10)); @@ -295,9 +307,9 @@ TEST_F(homa_peer, homa_peer_get_dst_ipv4) } TEST_F(homa_peer, homa_peer_get_dst_ipv6) { + struct dst_entry *dst; char buffer[30]; __u32 addr; - struct dst_entry *dst; // Make sure the test uses IPv6. mock_ipv6 = true; @@ -320,11 +332,11 @@ TEST_F(homa_peer, homa_peer_get_dst_ipv6) TEST_F(homa_peer, homa_peer_lock_slow) { - mock_cycles = 10000; struct homa_peer *peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); - ASSERT_NE(NULL, peer); + ASSERT_NE(NULL, peer); + mock_cycles = 10000; homa_peer_lock(peer); EXPECT_EQ(0, homa_metrics_per_cpu()->peer_ack_lock_misses); EXPECT_EQ(0, homa_metrics_per_cpu()->peer_ack_lock_miss_cycles); @@ -350,6 +362,7 @@ TEST_F(homa_peer, homa_peer_add_ack) self->client_ip, self->server_ip, self->server_port, 103, 100, 100); struct homa_peer *peer = crpc1->peer; + EXPECT_EQ(0, peer->num_acks); /* Initialize 3 acks in the peer. */ @@ -395,11 +408,12 @@ TEST_F(homa_peer, homa_peer_get_acks) { struct homa_peer *peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); + struct homa_ack acks[2]; + ASSERT_NE(NULL, peer); EXPECT_EQ(0, peer->num_acks); // First call: nothing available. - struct homa_ack acks[2]; EXPECT_EQ(0, homa_peer_get_acks(peer, 2, acks)); // Second call: retrieve 2 out of 3. 
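The test-file hunks surrounding this point all apply the same mechanical cleanup: local variables are declared at the top of each function (roughly longest declaration first, in the kernel's reverse-Christmas-tree style), one blank line separates the declarations from the first statement, and any initializer that depends on a helper call is split into a plain declaration plus an assignment in the body. A minimal before/after sketch of that shape, using hypothetical helpers (example_setup() and example_rpc() are illustrations only, not functions in this tree):

extern void example_setup(struct homa_sock *hsk);
extern struct homa_rpc *example_rpc(struct homa_sock *hsk);

/* Before: a statement precedes a declaration, and the declaration's
 * initializer hides a function call.
 */
static int example_old(struct homa_sock *hsk)
{
	example_setup(hsk);
	struct homa_rpc *crpc = example_rpc(hsk);

	return crpc != NULL;
}

/* After: declarations first, then a blank line, then statements;
 * the call-dependent initialization moves into the body.
 */
static int example_new(struct homa_sock *hsk)
{
	struct homa_rpc *crpc;

	example_setup(hsk);
	crpc = example_rpc(hsk);
	return crpc != NULL;
}

Kernel code is conventionally built with -Wdeclaration-after-statement, so only the second form compiles warning-free there; the reordering in these hunks changes no behavior.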
diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index abaf781d..fa665816 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -111,85 +111,95 @@ FIXTURE_TEARDOWN(homa_plumbing) TEST_F(homa_plumbing, homa_bind__version_mismatch) { + struct sockaddr addr = {}; + struct socket sock = {}; + int result; + // Make sure the test uses IPv4. mock_ipv6 = false; homa_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, &self->homa, 0); - - struct sockaddr addr = {}; addr.sa_family = AF_INET6; - struct socket sock = {}; sock.sk = &self->hsk.inet.sk; - int result = homa_bind(&sock, &addr, sizeof(addr)); + result = homa_bind(&sock, &addr, sizeof(addr)); EXPECT_EQ(EAFNOSUPPORT, -result); } TEST_F(homa_plumbing, homa_bind__ipv6_address_too_short) { + union sockaddr_in_union addr = {}; + struct socket sock = {}; + int result; + // Make sure the test uses IPv6. mock_ipv6 = true; homa_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, &self->homa, 0); - union sockaddr_in_union addr = {}; addr.in6.sin6_family = AF_INET6; - struct socket sock = {}; sock.sk = &self->hsk.inet.sk; - int result = homa_bind(&sock, &addr.sa, sizeof(addr.in6)-1); + result = homa_bind(&sock, &addr.sa, sizeof(addr.in6)-1); EXPECT_EQ(EINVAL, -result); } TEST_F(homa_plumbing, homa_bind__ipv6_ok) { + union sockaddr_in_union addr = {}; + struct socket sock = {}; + int result; + // Make sure the test uses IPv6. mock_ipv6 = true; homa_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, &self->homa, 0); - union sockaddr_in_union addr = {}; addr.in6.sin6_family = AF_INET6; addr.in6.sin6_port = htons(123); - struct socket sock = {}; sock.sk = &self->hsk.inet.sk; - int result = homa_bind(&sock, &addr.sa, sizeof(addr.in6)); + result = homa_bind(&sock, &addr.sa, sizeof(addr.in6)); EXPECT_EQ(0, -result); EXPECT_EQ(123, self->hsk.port); } TEST_F(homa_plumbing, homa_bind__ipv4_address_too_short) { + union sockaddr_in_union addr = {}; + struct socket sock = {}; + int result; + // Make sure the test uses IPv4. mock_ipv6 = false; homa_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, &self->homa, 0); - union sockaddr_in_union addr = {}; addr.in4.sin_family = AF_INET; - struct socket sock = {}; sock.sk = &self->hsk.inet.sk; - int result = homa_bind(&sock, &addr.sa, sizeof(addr.in4)-1); + result = homa_bind(&sock, &addr.sa, sizeof(addr.in4)-1); EXPECT_EQ(EINVAL, -result); } TEST_F(homa_plumbing, homa_bind__ipv4_ok) { + union sockaddr_in_union addr = {}; + struct socket sock = {}; + int result; + // Make sure the test uses IPv4. 
mock_ipv6 = false; homa_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, &self->homa, 0); - union sockaddr_in_union addr = {}; addr.in4.sin_family = AF_INET; addr.in4.sin_port = htons(345); - struct socket sock = {}; sock.sk = &self->hsk.inet.sk; - int result = homa_bind(&sock, &addr.sa, sizeof(addr.in4)); + result = homa_bind(&sock, &addr.sa, sizeof(addr.in4)); EXPECT_EQ(0, -result); EXPECT_EQ(345, self->hsk.port); } TEST_F(homa_plumbing, homa_ioc_abort__basics) { - struct homa_abort_args args = {self->client_id, 0}; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 10000, 200); + struct homa_abort_args args = {self->client_id, 0}; + ASSERT_NE(NULL, crpc); EXPECT_EQ(0, homa_ioc_abort(&self->hsk.inet.sk, (int *) &args)); EXPECT_EQ(RPC_DEAD, crpc->state); @@ -198,18 +208,20 @@ TEST_F(homa_plumbing, homa_ioc_abort__basics) TEST_F(homa_plumbing, homa_ioc_abort__cant_read_user_args) { struct homa_abort_args args = {self->client_id, 0}; + mock_copy_data_errors = 1; EXPECT_EQ(EFAULT, -homa_ioc_abort(&self->hsk.inet.sk, (int *) &args)); } TEST_F(homa_plumbing, homa_ioc_abort__abort_multiple_rpcs) { - struct homa_abort_args args = {0, ECANCELED}; struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 10000, 200); struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 10000, 200); + struct homa_abort_args args = {0, ECANCELED}; + ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); EXPECT_EQ(0, homa_ioc_abort(&self->hsk.inet.sk, (int *) &args)); @@ -220,6 +232,7 @@ TEST_F(homa_plumbing, homa_ioc_abort__abort_multiple_rpcs) TEST_F(homa_plumbing, homa_ioc_abort__nonexistent_rpc) { struct homa_abort_args args = {99, 0}; + EXPECT_EQ(EINVAL, -homa_ioc_abort(&self->hsk.inet.sk, (int *) &args)); } @@ -249,6 +262,7 @@ TEST_F(homa_plumbing, homa_set_sock_opt__copy_from_sockptr_fails) TEST_F(homa_plumbing, homa_set_sock_opt__copy_to_user_fails) { struct homa_set_buf_args args = {(void *) 0x100000, 5*HOMA_BPAGE_SIZE}; + self->optval.user = &args; mock_copy_to_user_errors = 1; EXPECT_EQ(EFAULT, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, @@ -336,6 +350,7 @@ TEST_F(homa_plumbing, homa_sendmsg__cant_update_user_arguments) TEST_F(homa_plumbing, homa_sendmsg__request_sent_successfully) { struct homa_rpc *crpc; + atomic64_set(&self->homa.next_outgoing_id, 1234); self->sendmsg_args.completion_cookie = 88888; EXPECT_EQ(0, -homa_sendmsg(&self->hsk.inet.sk, @@ -353,6 +368,7 @@ TEST_F(homa_plumbing, homa_sendmsg__response_nonzero_completion_cookie) struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, self->server_ip, self->client_port, self->server_id, 2000, 100); + self->sendmsg_args.id = self->server_id; self->sendmsg_args.completion_cookie = 12345; EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, @@ -365,6 +381,7 @@ TEST_F(homa_plumbing, homa_sendmsg__response_cant_find_rpc) struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, self->server_ip, self->client_port, self->server_id, 2000, 100); + self->sendmsg_args.id = self->server_id + 1; EXPECT_EQ(0, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); @@ -376,6 +393,7 @@ TEST_F(homa_plumbing, homa_sendmsg__response_error_in_rpc) struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, 
self->client_ip, self->server_ip, self->client_port, self->server_id, 2000, 100); + self->sendmsg_args.id = srpc->id; srpc->error = -ENOMEM; EXPECT_EQ(ENOMEM, -homa_sendmsg(&self->hsk.inet.sk, @@ -388,6 +406,7 @@ TEST_F(homa_plumbing, homa_sendmsg__response_wrong_state) struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, self->server_id, 2000, 100); + self->sendmsg_args.id = self->server_id; EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); @@ -399,6 +418,7 @@ TEST_F(homa_plumbing, homa_sendmsg__homa_message_out_fill_returns_error) struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, self->server_ip, self->client_port, self->server_id, 2000, 100); + self->sendmsg_args.id = self->server_id; self->sendmsg_hdr.msg_iter.count = HOMA_MAX_MESSAGE_LENGTH + 1; EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, @@ -411,6 +431,7 @@ TEST_F(homa_plumbing, homa_sendmsg__rpc_freed_during_homa_message_out_fill) struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, self->server_ip, self->client_port, self->server_id, 2000, 100); + unit_hook_register(unlock_hook); hook_rpc = srpc; self->sendmsg_args.id = self->server_id; @@ -425,6 +446,7 @@ TEST_F(homa_plumbing, homa_sendmsg__response_succeeds) struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, self->server_ip, self->client_port, self->server_id, 2000, 100); + self->sendmsg_args.id = self->server_id; EXPECT_EQ(0, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); @@ -502,6 +524,7 @@ TEST_F(homa_plumbing, homa_recvmsg__MSG_DONT_WAIT) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 100, 2000); + EXPECT_NE(NULL, crpc); EXPECT_EQ(EAGAIN, -homa_recvmsg(&self->hsk.inet.sk, @@ -510,16 +533,18 @@ TEST_F(homa_plumbing, homa_recvmsg__MSG_DONT_WAIT) } TEST_F(homa_plumbing, homa_recvmsg__normal_completion_ipv4) { + struct homa_rpc *crpc; + __u32 pages[2]; + // Make sure the test uses IPv4. mock_ipv6 = false; homa_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, &self->homa, 0); - __u32 pages[2]; EXPECT_EQ(0, -homa_pool_get_pages(self->hsk.buffer_pool, 2, pages, 0)); - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, - self->client_ip, self->server_ip, self->server_port, - self->client_id, 100, 2000); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 100, 2000); EXPECT_NE(NULL, crpc); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); crpc->completion_cookie = 44444; @@ -541,15 +566,18 @@ TEST_F(homa_plumbing, homa_recvmsg__normal_completion_ipv4) } TEST_F(homa_plumbing, homa_recvmsg__normal_completion_ipv6) { + struct in6_addr server_ip6; + struct homa_rpc *crpc; + // Make sure the test uses IPv6. 
mock_ipv6 = true; homa_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, &self->homa, 0); - struct in6_addr server_ip6 = unit_get_in_addr("1::3:5:7"); + server_ip6 = unit_get_in_addr("1::3:5:7"); - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, - self->client_ip, &server_ip6, self->server_port, - self->client_id, 100, 2000); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, + &server_ip6, self->server_port, self->client_id, + 100, 2000); EXPECT_NE(NULL, crpc); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); crpc->completion_cookie = 44444; @@ -571,6 +599,7 @@ TEST_F(homa_plumbing, homa_recvmsg__rpc_has_error) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 100, 2000); + EXPECT_NE(NULL, crpc); crpc->completion_cookie = 44444; homa_rpc_abort(crpc, -ETIMEDOUT); @@ -591,6 +620,7 @@ TEST_F(homa_plumbing, homa_recvmsg__add_ack) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 100, 2000); + EXPECT_NE(NULL, crpc); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); crpc->completion_cookie = 44444; @@ -604,8 +634,8 @@ TEST_F(homa_plumbing, homa_recvmsg__server_normal_completion) struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->client_port, self->server_id, 100, 200); - EXPECT_NE(NULL, srpc); + EXPECT_NE(NULL, srpc); EXPECT_EQ(100, homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, 0, 0, &self->recvmsg_hdr.msg_namelen)); EXPECT_EQ(self->server_id, self->recvmsg_args.id); @@ -618,6 +648,7 @@ TEST_F(homa_plumbing, homa_recvmsg__delete_server_rpc_after_error) struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->client_port, self->server_id, 100, 200); + EXPECT_NE(NULL, srpc); srpc->error = -ENOMEM; @@ -632,6 +663,7 @@ TEST_F(homa_plumbing, homa_recvmsg__error_copying_out_args) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 100, 2000); + EXPECT_NE(NULL, crpc); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); mock_copy_to_user_errors = 1; @@ -659,6 +691,7 @@ TEST_F(homa_plumbing, homa_recvmsg__copy_back_args_even_after_error) TEST_F(homa_plumbing, homa_softirq__basics) { struct sk_buff *skb; + skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); homa_softirq(skb); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); @@ -666,6 +699,7 @@ TEST_F(homa_plumbing, homa_softirq__basics) TEST_F(homa_plumbing, homa_softirq__cant_pull_header) { struct sk_buff *skb; + skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); skb->data_len = skb->len - 20; homa_softirq(skb); @@ -674,6 +708,7 @@ TEST_F(homa_plumbing, homa_softirq__cant_pull_header) TEST_F(homa_plumbing, homa_softirq__remove_extra_headers) { struct sk_buff *skb; + skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); __skb_push(skb, 10); homa_softirq(skb); @@ -683,6 +718,7 @@ TEST_F(homa_plumbing, homa_softirq__packet_too_short) { struct sk_buff *skb; struct ack_header h; + h.common.type = ACK; skb = mock_skb_new(self->client_ip, &h.common, 0, 0); skb->len -= 1; @@ -693,6 +729,7 @@ TEST_F(homa_plumbing, homa_softirq__packet_too_short) TEST_F(homa_plumbing, homa_softirq__bogus_packet_type) { struct sk_buff *skb; + self->data.common.type = BOGUS; skb = 
mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); homa_softirq(skb); @@ -728,13 +765,13 @@ TEST_F(homa_plumbing, homa_softirq__process_short_messages_first) } TEST_F(homa_plumbing, homa_softirq__process_control_first) { - struct sk_buff *skb, *skb2; struct common_header unknown = { .sport = htons(self->client_port), .dport = htons(self->server_port), .type = UNKNOWN, .sender_id = cpu_to_be64(self->client_id) }; + struct sk_buff *skb, *skb2; self->data.common.sender_id = cpu_to_be64(2000); self->data.message_length = htonl(2000); diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index e4dd08e7..6ebdc7d9 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -68,6 +68,7 @@ static void change_owner_hook(char *id) TEST_F(homa_pool, homa_pool_set_bpages_needed) { struct homa_pool *pool = self->hsk.buffer_pool; + atomic_set(&pool->free_bpages, 0); unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, 4000, 98, 1000, 2*HOMA_BPAGE_SIZE+1); @@ -81,6 +82,7 @@ TEST_F(homa_pool, homa_pool_set_bpages_needed) TEST_F(homa_pool, homa_pool_init__basics) { struct homa_pool *pool = self->hsk.buffer_pool; + EXPECT_EQ(100, pool->num_bpages); EXPECT_EQ(-1, pool->descriptors[98].owner); } @@ -122,6 +124,7 @@ TEST_F(homa_pool, homa_pool_get_pages__basics) { struct homa_pool *pool = self->hsk.buffer_pool; __u32 pages[10]; + EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); EXPECT_EQ(0, pages[0]); EXPECT_EQ(1, pages[1]); @@ -134,6 +137,7 @@ TEST_F(homa_pool, homa_pool_get_pages__not_enough_space) { struct homa_pool *pool = self->hsk.buffer_pool; __u32 pages[10]; + atomic_set(&pool->free_bpages, 1); EXPECT_EQ(-1, homa_pool_get_pages(pool, 2, pages, 0)); atomic_set(&pool->free_bpages, 2); @@ -143,6 +147,7 @@ TEST_F(homa_pool, homa_pool_get_pages__set_limit) { struct homa_pool *pool = self->hsk.buffer_pool; __u32 pages[10]; + atomic_set(&pool->free_bpages, 62); pool->cores[raw_smp_processor_id()].next_candidate = 49; EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); @@ -153,6 +158,7 @@ TEST_F(homa_pool, homa_pool_get_pages__set_limit_with_MIN_EXTRA) { struct homa_pool *pool = self->hsk.buffer_pool; __u32 pages[10]; + atomic_set(&pool->free_bpages, 92); pool->cores[raw_smp_processor_id()].next_candidate = 13; EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); @@ -163,6 +169,7 @@ TEST_F(homa_pool, homa_pool_get_pages__skip_unusable_bpages) { struct homa_pool *pool = self->hsk.buffer_pool; __u32 pages[10]; + mock_cycles = 1000; atomic_set(&pool->descriptors[0].refs, 2); atomic_set(&pool->descriptors[1].refs, 1); @@ -180,6 +187,7 @@ TEST_F(homa_pool, homa_pool_get_pages__cant_lock_pages) { struct homa_pool *pool = self->hsk.buffer_pool; __u32 pages[10]; + mock_cycles = 1000; mock_trylock_errors = 3; EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); @@ -190,6 +198,7 @@ TEST_F(homa_pool, homa_pool_get_pages__state_changes_while_locking) { struct homa_pool *pool = self->hsk.buffer_pool; __u32 pages[10]; + mock_cycles = 1000; unit_hook_register(steal_bpages_hook); EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); @@ -200,6 +209,7 @@ TEST_F(homa_pool, homa_pool_get_pages__steal_expired_page) { struct homa_pool *pool = self->hsk.buffer_pool; __u32 pages[10]; + pool->descriptors[0].owner = 5; mock_cycles = 5000; pool->descriptors[0].expiration = mock_cycles - 1; @@ -214,6 +224,7 @@ TEST_F(homa_pool, homa_pool_get_pages__set_owner) { struct homa_pool *pool = self->hsk.buffer_pool; __u32 pages[10]; + self->homa.bpage_lease_cycles = 1000; mock_cycles = 
5000; EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 1)); @@ -225,12 +236,12 @@ TEST_F(homa_pool, homa_pool_get_pages__set_owner) TEST_F(homa_pool, homa_pool_allocate__basics) { - struct homa_pool *pool = self->hsk.buffer_pool; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, 4000, 98, 1000, 150000); - ASSERT_NE(NULL, crpc); + struct homa_pool *pool = self->hsk.buffer_pool; + ASSERT_NE(NULL, crpc); EXPECT_EQ(3, crpc->msgin.num_bpages); EXPECT_EQ(0, crpc->msgin.bpage_offsets[0]); EXPECT_EQ(-1, pool->descriptors[0].owner); @@ -241,10 +252,11 @@ TEST_F(homa_pool, homa_pool_allocate__basics) } TEST_F(homa_pool, homa_pool_no_buffer_pool) { - struct homa_pool *pool = self->hsk.buffer_pool; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, 4000, 98, 1000, 150000); + struct homa_pool *pool = self->hsk.buffer_pool; + ASSERT_NE(NULL, crpc); homa_pool_destroy(pool); EXPECT_EQ(ENOMEM, -homa_pool_allocate(crpc)); @@ -252,10 +264,11 @@ TEST_F(homa_pool, homa_pool_no_buffer_pool) TEST_F(homa_pool, homa_pool_allocate__cant_allocate_full_bpages) { struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc; + atomic_set(&pool->free_bpages, 1); - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 150000); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 150000); ASSERT_NE(NULL, crpc); EXPECT_EQ(0, crpc->msgin.num_bpages); @@ -265,10 +278,12 @@ TEST_F(homa_pool, homa_pool_allocate__cant_allocate_full_bpages) TEST_F(homa_pool, homa_pool_allocate__no_partial_page) { struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc; + atomic_set(&pool->free_bpages, 2); - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 2*HOMA_BPAGE_SIZE); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, + 2*HOMA_BPAGE_SIZE); ASSERT_NE(NULL, crpc); EXPECT_EQ(2, crpc->msgin.num_bpages); @@ -279,11 +294,12 @@ TEST_F(homa_pool, homa_pool_allocate__no_partial_page) TEST_F(homa_pool, homa_pool_allocate__owned_page_locked_and_page_stolen) { struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc; + pool->cores[raw_smp_processor_id()].next_candidate = 2; atomic_set(&pool->free_bpages, 40); - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 2000); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2000); ASSERT_NE(NULL, crpc); // First allocation just sets up a partially-allocated bpage. 
@@ -306,13 +322,14 @@ TEST_F(homa_pool, homa_pool_allocate__owned_page_locked_and_page_stolen) TEST_F(homa_pool, homa_pool_allocate__page_wrap_around) { struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc; + pool->cores[raw_smp_processor_id()].page_hint = 2; pool->cores[raw_smp_processor_id()].allocated = HOMA_BPAGE_SIZE-1900; atomic_set(&pool->descriptors[2].refs, 1); pool->descriptors[2].owner = raw_smp_processor_id(); - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 2000); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2000); ASSERT_NE(NULL, crpc); EXPECT_EQ(2, pool->cores[raw_smp_processor_id()].page_hint); @@ -325,11 +342,12 @@ TEST_F(homa_pool, homa_pool_allocate__page_wrap_around) TEST_F(homa_pool, homa_pool_allocate__owned_page_overflow) { struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc; + pool->cores[raw_smp_processor_id()].next_candidate = 2; atomic_set(&pool->free_bpages, 50); - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 2000); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2000); ASSERT_NE(NULL, crpc); EXPECT_EQ(2, pool->cores[raw_smp_processor_id()].page_hint); @@ -348,14 +366,14 @@ TEST_F(homa_pool, homa_pool_allocate__owned_page_overflow) TEST_F(homa_pool, homa_pool_allocate__reuse_owned_page) { struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc1, *crpc2; + pool->cores[raw_smp_processor_id()].next_candidate = 2; - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 2000); + crpc1 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2000); ASSERT_NE(NULL, crpc1); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 100, 1000, 3000); + crpc2 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 100, 1000, 3000); ASSERT_NE(NULL, crpc2); EXPECT_EQ(1, crpc1->msgin.num_bpages); @@ -369,10 +387,12 @@ TEST_F(homa_pool, homa_pool_allocate__reuse_owned_page) TEST_F(homa_pool, homa_pool_allocate__cant_allocate_partial_bpage) { struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc; + atomic_set(&pool->free_bpages, 5); - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 5*HOMA_BPAGE_SIZE + 100); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, + 5*HOMA_BPAGE_SIZE + 100); ASSERT_NE(NULL, crpc); EXPECT_EQ(0, crpc->msgin.num_bpages); @@ -383,8 +403,10 @@ TEST_F(homa_pool, homa_pool_allocate__cant_allocate_partial_bpage) } TEST_F(homa_pool, homa_pool_allocate__out_of_space) { - /* Queue up several RPCs to make sure they are properly sorted. */ struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *rpc; + + /* Queue up several RPCs to make sure they are properly sorted. 
*/ atomic_set(&pool->free_bpages, 0); unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, 4000, 98, 1000, 2000); @@ -395,8 +417,8 @@ TEST_F(homa_pool, homa_pool_allocate__out_of_space) ASSERT_EQ(0, atomic_read(&pool->free_bpages)); ASSERT_FALSE(list_empty(&self->hsk.waiting_for_bufs)); - struct homa_rpc *rpc = list_first_entry(&self->hsk.waiting_for_bufs, - struct homa_rpc, buf_links); + rpc = list_first_entry(&self->hsk.waiting_for_bufs, struct homa_rpc, + buf_links); EXPECT_EQ(98, rpc->id); ASSERT_FALSE(list_is_last(&rpc->buf_links, &self->hsk.waiting_for_bufs)); rpc = list_next_entry(rpc, buf_links); @@ -412,12 +434,12 @@ TEST_F(homa_pool, homa_pool_allocate__out_of_space) TEST_F(homa_pool, homa_pool_get_buffer) { struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc; int available; void *buffer; - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 150000); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 150000); ASSERT_NE(NULL, crpc); buffer = homa_pool_get_buffer(crpc, HOMA_BPAGE_SIZE + 1000, &available); EXPECT_EQ(HOMA_BPAGE_SIZE - 1000, available); @@ -430,15 +452,14 @@ TEST_F(homa_pool, homa_pool_get_buffer) TEST_F(homa_pool, homa_pool_release_buffers__basics) { struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc1, *crpc2; char *saved_region; - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 150000); + crpc1 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 150000); ASSERT_NE(NULL, crpc1); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 2000); + crpc2 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2000); ASSERT_NE(NULL, crpc2); EXPECT_EQ(1, atomic_read(&pool->descriptors[0].refs)); @@ -465,19 +486,18 @@ TEST_F(homa_pool, homa_pool_release_buffers__basics) TEST_F(homa_pool, homa_pool_check_waiting__basics) { struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc2, *crpc3; /* Queue up 2 RPCs that together need a total of 5 bpages. 
*/ atomic_set(&pool->free_bpages, 0); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 3*HOMA_BPAGE_SIZE); + crpc2 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 3*HOMA_BPAGE_SIZE); ASSERT_NE(NULL, crpc2); EXPECT_EQ(0, crpc2->msgin.num_bpages); EXPECT_EQ(3, pool->bpages_needed); - struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 2*HOMA_BPAGE_SIZE); + crpc3 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2*HOMA_BPAGE_SIZE); ASSERT_NE(NULL, crpc3); EXPECT_EQ(0, crpc3->msgin.num_bpages); EXPECT_EQ(2, pool->bpages_needed); @@ -499,6 +519,7 @@ TEST_F(homa_pool, homa_pool_check_waiting__basics) TEST_F(homa_pool, homa_pool_check_waiting__bpages_needed_but_no_queued_rpcs) { struct homa_pool *pool = self->hsk.buffer_pool; + pool->bpages_needed = 1; homa_pool_check_waiting(pool); EXPECT_EQ(100, atomic_read(&pool->free_bpages)); @@ -507,11 +528,11 @@ TEST_F(homa_pool, homa_pool_check_waiting__bpages_needed_but_no_queued_rpcs) TEST_F(homa_pool, homa_pool_check_waiting__rpc_initially_locked) { struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc; atomic_set(&pool->free_bpages, 0); - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 2000); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2000); ASSERT_NE(NULL, crpc); EXPECT_EQ(0, crpc->msgin.num_bpages); @@ -528,18 +549,17 @@ TEST_F(homa_pool, homa_pool_check_waiting__rpc_initially_locked) TEST_F(homa_pool, homa_pool_check_waiting__reset_bpages_needed) { struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc1, *crpc2; atomic_set(&pool->free_bpages, 0); - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 2000); + crpc1 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2000); ASSERT_NE(NULL, crpc1); EXPECT_EQ(0, crpc1->msgin.num_bpages); atomic_set(&pool->free_bpages, 0); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 2*HOMA_BPAGE_SIZE - 1); + crpc2 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2*HOMA_BPAGE_SIZE - 1); ASSERT_NE(NULL, crpc2); EXPECT_EQ(0, crpc2->msgin.num_bpages); EXPECT_EQ(1, pool->bpages_needed); @@ -553,12 +573,12 @@ TEST_F(homa_pool, homa_pool_check_waiting__reset_bpages_needed) TEST_F(homa_pool, homa_pool_check_waiting__wake_up_waiting_rpc) { struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc; /* Queue up an RPC that needs 2 bpages. 
*/ atomic_set(&pool->free_bpages, 0); - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 2*HOMA_BPAGE_SIZE); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2*HOMA_BPAGE_SIZE); ASSERT_NE(NULL, crpc); EXPECT_EQ(0, crpc->msgin.num_bpages); EXPECT_EQ(2, pool->bpages_needed); @@ -573,12 +593,12 @@ TEST_F(homa_pool, homa_pool_check_waiting__wake_up_waiting_rpc) TEST_F(homa_pool, homa_pool_check_waiting__reallocation_fails) { struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc; /* Queue up an RPC that needs 4 bpages. */ atomic_set(&pool->free_bpages, 0); - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, - 4000, 98, 1000, 4*HOMA_BPAGE_SIZE); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 4*HOMA_BPAGE_SIZE); ASSERT_NE(NULL, crpc); EXPECT_EQ(0, crpc->msgin.num_bpages); pool->bpages_needed = 2; diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 93020fdd..dfe662e7 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -72,6 +72,7 @@ FIXTURE_TEARDOWN(homa_rpc) static const char *dead_rpcs(struct homa_sock *hsk) { struct homa_rpc *rpc; + list_for_each_entry_rcu(rpc, &hsk->dead_rpcs, dead_links) UNIT_LOG(" ", "%llu", rpc->id); return unit_log_get(); @@ -81,31 +82,35 @@ TEST_F(homa_rpc, homa_rpc_new_client__normal) { struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); + ASSERT_FALSE(IS_ERR(crpc)); homa_rpc_free(crpc); homa_rpc_unlock(crpc); } TEST_F(homa_rpc, homa_rpc_new_client__malloc_error) { + struct homa_rpc *crpc; + mock_kmalloc_errors = 1; - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, - &self->server_addr); + crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); EXPECT_TRUE(IS_ERR(crpc)); EXPECT_EQ(ENOMEM, -PTR_ERR(crpc)); } TEST_F(homa_rpc, homa_rpc_new_client__route_error) { + struct homa_rpc *crpc; + mock_route_errors = 1; - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, - &self->server_addr); + crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); EXPECT_TRUE(IS_ERR(crpc)); EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(crpc)); } TEST_F(homa_rpc, homa_rpc_new_client__socket_shutdown) { + struct homa_rpc *crpc; + self->hsk.shutdown = 1; - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, - &self->server_addr); + crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); EXPECT_TRUE(IS_ERR(crpc)); EXPECT_EQ(ESHUTDOWN, -PTR_ERR(crpc)); self->hsk.shutdown = 0; @@ -113,9 +118,11 @@ TEST_F(homa_rpc, homa_rpc_new_client__socket_shutdown) TEST_F(homa_rpc, homa_rpc_new_server__normal) { + struct homa_rpc *srpc; int created; - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); + + srpc = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + &created); ASSERT_FALSE(IS_ERR(srpc)); homa_rpc_unlock(srpc); self->data.message_length = N(1600); @@ -128,16 +135,18 @@ TEST_F(homa_rpc, homa_rpc_new_server__normal) } TEST_F(homa_rpc, homa_rpc_new_server__already_exists) { + struct homa_rpc *srpc1, *srpc2, *srpc3; int created; - struct homa_rpc *srpc1 = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); + + srpc1 = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + &created); ASSERT_FALSE(IS_ERR(srpc1)); homa_rpc_unlock(srpc1); self->data.common.sender_id = cpu_to_be64( 
be64_to_cpu(self->data.common.sender_id) + 2*HOMA_SERVER_RPC_BUCKETS); - struct homa_rpc *srpc2 = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); + srpc2 = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + &created); ASSERT_FALSE(IS_ERR(srpc2)); EXPECT_EQ(1, created); homa_rpc_unlock(srpc2); @@ -145,8 +154,8 @@ TEST_F(homa_rpc, homa_rpc_new_server__already_exists) self->data.common.sender_id = cpu_to_be64( be64_to_cpu(self->data.common.sender_id) - 2*HOMA_SERVER_RPC_BUCKETS); - struct homa_rpc *srpc3 = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); + srpc3 = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + &created); ASSERT_FALSE(IS_ERR(srpc3)); EXPECT_EQ(0, created); homa_rpc_unlock(srpc3); @@ -154,28 +163,34 @@ TEST_F(homa_rpc, homa_rpc_new_server__already_exists) } TEST_F(homa_rpc, homa_rpc_new_server__malloc_error) { + struct homa_rpc *srpc; int created; + mock_kmalloc_errors = 1; - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); + srpc = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + &created); EXPECT_TRUE(IS_ERR(srpc)); EXPECT_EQ(ENOMEM, -PTR_ERR(srpc)); } TEST_F(homa_rpc, homa_rpc_new_server__addr_error) { + struct homa_rpc *srpc; int created; + mock_route_errors = 1; - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); + srpc = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + &created); EXPECT_TRUE(IS_ERR(srpc)); EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(srpc)); } TEST_F(homa_rpc, homa_rpc_new_server__socket_shutdown) { + struct homa_rpc *srpc; int created; + self->hsk.shutdown = 1; - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); + srpc = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + &created); EXPECT_TRUE(IS_ERR(srpc)); EXPECT_EQ(ESHUTDOWN, -PTR_ERR(srpc)); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); @@ -183,10 +198,12 @@ TEST_F(homa_rpc, homa_rpc_new_server__socket_shutdown) } TEST_F(homa_rpc, homa_rpc_new_server__allocate_buffers) { + struct homa_rpc *srpc; int created; + self->data.message_length = N(3*HOMA_BPAGE_SIZE); - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); + srpc = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + &created); ASSERT_FALSE(IS_ERR(srpc)); homa_rpc_unlock(srpc); EXPECT_EQ(3, srpc->msgin.num_bpages); @@ -194,20 +211,24 @@ TEST_F(homa_rpc, homa_rpc_new_server__allocate_buffers) } TEST_F(homa_rpc, homa_rpc_new_server__no_buffer_pool) { + struct homa_rpc *srpc; int created; + self->data.message_length = N(1400); homa_pool_destroy(self->hsk.buffer_pool); - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); + srpc = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + &created); ASSERT_TRUE(IS_ERR(srpc)); EXPECT_EQ(ENOMEM, -PTR_ERR(srpc)); } TEST_F(homa_rpc, homa_rpc_new_server__handoff_rpc) { + struct homa_rpc *srpc; int created; + self->data.message_length = N(1400); - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); + srpc = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + &created); ASSERT_FALSE(IS_ERR(srpc)); homa_rpc_unlock(srpc); EXPECT_EQ(RPC_INCOMING, srpc->state); @@ -217,11 +238,13 @@ TEST_F(homa_rpc, homa_rpc_new_server__handoff_rpc) } TEST_F(homa_rpc, 
homa_rpc_new_server__dont_handoff_no_buffers) { + struct homa_rpc *srpc; int created; + self->data.message_length = N(1400); atomic_set(&self->hsk.buffer_pool->free_bpages,0 ); - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); + srpc = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + &created); ASSERT_FALSE(IS_ERR(srpc)); homa_rpc_unlock(srpc); EXPECT_EQ(0, unit_list_length(&self->hsk.ready_requests)); @@ -229,11 +252,13 @@ TEST_F(homa_rpc, homa_rpc_new_server__dont_handoff_no_buffers) } TEST_F(homa_rpc, homa_rpc_new_server__dont_handoff_rpc) { + struct homa_rpc *srpc; int created; + self->data.message_length = N(2800); self->data.seg.offset = N(1400); - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); + srpc = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + &created); ASSERT_FALSE(IS_ERR(srpc)); homa_rpc_unlock(srpc); EXPECT_EQ(RPC_INCOMING, srpc->state); @@ -244,15 +269,16 @@ TEST_F(homa_rpc, homa_rpc_new_server__dont_handoff_rpc) TEST_F(homa_rpc, homa_bucket_lock_slow) { + struct homa_rpc *crpc, *srpc; int created; + mock_cycles = ~0; - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, - &self->server_addr); + crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); ASSERT_FALSE(IS_ERR(crpc)); homa_rpc_free(crpc); homa_rpc_unlock(crpc); - struct homa_rpc *srpc = homa_rpc_new_server(&self->hsk, - self->client_ip, &self->data, &created); + srpc = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + &created); ASSERT_FALSE(IS_ERR(srpc)); homa_rpc_unlock(srpc); @@ -272,15 +298,18 @@ TEST_F(homa_rpc, homa_bucket_lock_slow) TEST_F(homa_rpc, homa_rpc_acked__basics) { + struct homa_rpc *srpc; struct homa_sock hsk; + struct homa_ack ack = {}; + mock_sock_init(&hsk, &self->homa, self->server_port); - struct homa_rpc *srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 3000); + srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->client_port, self->server_id, + 100, 3000); ASSERT_NE(NULL, srpc); - struct homa_ack ack = {.client_port = htons(self->client_port), - .server_port = htons(self->server_port), - .client_id = cpu_to_be64(self->client_id)}; + ack.client_port = htons(self->client_port); + ack.server_port = htons(self->server_port); + ack.client_id = cpu_to_be64(self->client_id); homa_rpc_acked(&hsk, self->client_ip, &ack); EXPECT_EQ(0, unit_list_length(&hsk.active_rpcs)); EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc)); @@ -288,15 +317,18 @@ TEST_F(homa_rpc, homa_rpc_acked__basics) } TEST_F(homa_rpc, homa_rpc_acked__lookup_socket) { + struct homa_ack ack = {}; + struct homa_rpc *srpc; struct homa_sock hsk; + mock_sock_init(&hsk, &self->homa, self->server_port); - struct homa_rpc *srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 3000); + srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->client_port, self->server_id, + 100, 3000); ASSERT_NE(NULL, srpc); - struct homa_ack ack = {.client_port = htons(self->client_port), - .server_port = htons(self->server_port), - .client_id = cpu_to_be64(self->client_id)}; + ack.client_port = htons(self->client_port); + ack.server_port = htons(self->server_port); + ack.client_id = cpu_to_be64(self->client_id); homa_rpc_acked(&self->hsk, self->client_ip, &ack); EXPECT_EQ(0, 
unit_list_length(&hsk.active_rpcs)); EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc)); @@ -304,15 +336,18 @@ TEST_F(homa_rpc, homa_rpc_acked__lookup_socket) } TEST_F(homa_rpc, homa_rpc_acked__no_such_socket) { + struct homa_ack ack = {}; + struct homa_rpc *srpc; struct homa_sock hsk; + mock_sock_init(&hsk, &self->homa, self->server_port); - struct homa_rpc *srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 3000); + srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->client_port, self->server_id, + 100, 3000); ASSERT_NE(NULL, srpc); - struct homa_ack ack = {.client_port = htons(self->client_port), - .server_port = htons(self->server_port+1), - .client_id = cpu_to_be64(self->client_id)}; + ack.client_port = htons(self->client_port); + ack.server_port = htons(self->server_port+1); + ack.client_id = cpu_to_be64(self->client_id); homa_rpc_acked(&hsk, self->client_ip, &ack); EXPECT_EQ(1, unit_list_length(&hsk.active_rpcs)); EXPECT_STREQ("OUTGOING", homa_symbol_for_state(srpc)); @@ -320,15 +355,18 @@ TEST_F(homa_rpc, homa_rpc_acked__no_such_socket) } TEST_F(homa_rpc, homa_rpc_acked__no_such_rpc) { + struct homa_ack ack = {}; + struct homa_rpc *srpc; struct homa_sock hsk; + mock_sock_init(&hsk, &self->homa, self->server_port); - struct homa_rpc *srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 3000); + srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->client_port, self->server_id, + 100, 3000); ASSERT_NE(NULL, srpc); - struct homa_ack ack = {.client_port = htons(self->client_port), - .server_port = htons(self->server_port), - .client_id = cpu_to_be64(self->client_id+10)}; + ack.client_port = htons(self->client_port); + ack.server_port = htons(self->server_port); + ack.client_id = cpu_to_be64(self->client_id+10); homa_rpc_acked(&hsk, self->client_ip, &ack); EXPECT_EQ(1, unit_list_length(&hsk.active_rpcs)); EXPECT_STREQ("OUTGOING", homa_symbol_for_state(srpc)); @@ -340,6 +378,7 @@ TEST_F(homa_rpc, homa_rpc_free__basics) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, 1000, 20000); + EXPECT_EQ(1, self->homa.num_grantable_rpcs); ASSERT_NE(NULL, crpc); unit_log_clear(); @@ -355,6 +394,7 @@ TEST_F(homa_rpc, homa_rpc_free__already_dead) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 1000, 100); + ASSERT_NE(NULL, crpc); unit_log_clear(); homa_rpc_free(crpc); @@ -369,6 +409,7 @@ TEST_F(homa_rpc, homa_rpc_free__state_ready) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 1000, 100); + ASSERT_NE(NULL, crpc); EXPECT_EQ(1, unit_list_length(&self->hsk.ready_responses)); homa_rpc_free(crpc); @@ -376,10 +417,11 @@ TEST_F(homa_rpc, homa_rpc_free__state_ready) } TEST_F(homa_rpc, homa_rpc_free__wakeup_interest) { - struct homa_interest interest = {}; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 1000, 100); + struct homa_interest interest = {}; + ASSERT_NE(NULL, crpc); atomic_long_set(&interest.ready_rpc, 0); interest.reg_rpc = crpc; @@ -395,6 +437,7 @@ TEST_F(homa_rpc, homa_rpc_free__free_gaps) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, 
UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); + homa_message_in_init(crpc, 10000, 0); unit_log_clear(); self->data.seg.offset = htonl(1400); @@ -415,6 +458,7 @@ TEST_F(homa_rpc, homa_rpc_free__dead_buffs) struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 10000, 1000); + ASSERT_NE(NULL, crpc1); homa_rpc_free(crpc1); EXPECT_EQ(9, self->homa.max_dead_buffs); @@ -432,6 +476,7 @@ TEST_F(homa_rpc, homa_rpc_free__remove_from_throttled_list) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 10000, 1000); + homa_add_to_throttled(crpc); EXPECT_EQ(1, unit_list_length(&self->homa.throttled_rpcs)); unit_log_clear(); @@ -450,6 +495,7 @@ TEST_F(homa_rpc, homa_rpc_reap__basics) struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id+4, 2000, 100); + ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); ASSERT_NE(NULL, crpc3); @@ -471,6 +517,7 @@ TEST_F(homa_rpc, homa_rpc_reap__protected) struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, 5000, 2000); + ASSERT_NE(NULL, crpc1); homa_rpc_free(crpc1); unit_log_clear(); @@ -487,6 +534,7 @@ TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_flags) struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id+2, 1000, 2000); + ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); homa_rpc_free(crpc1); @@ -510,6 +558,7 @@ TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_active_xmits) struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id+2, 1000, 2000); + ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); homa_rpc_free(crpc1); @@ -531,6 +580,7 @@ TEST_F(homa_rpc, homa_rpc_reap__grant_in_progress) struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id+2, 1000, 2000); + ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); homa_rpc_free(crpc1); @@ -551,6 +601,7 @@ TEST_F(homa_rpc, homa_rpc_reap__hit_limit_in_msgout_packets) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 10000, 100); + ASSERT_NE(NULL, crpc); homa_rpc_free(crpc); EXPECT_EQ(9, self->hsk.dead_skbs); @@ -561,12 +612,12 @@ TEST_F(homa_rpc, homa_rpc_reap__hit_limit_in_msgout_packets) } TEST_F(homa_rpc, homa_rpc_reap__release_buffers) { - struct homa_pool *pool = self->hsk.buffer_pool; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, 4000, 98, 1000, 150000); - ASSERT_NE(NULL, crpc); + struct homa_pool *pool = self->hsk.buffer_pool; + ASSERT_NE(NULL, crpc); EXPECT_EQ(1, atomic_read(&pool->descriptors[1].refs)); homa_rpc_free(crpc); EXPECT_EQ(1, atomic_read(&pool->descriptors[1].refs)); @@ -580,6 +631,7 @@ TEST_F(homa_rpc, homa_rpc_reap__free_gaps) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, 4000, 98, 1000, 150000); + ASSERT_NE(NULL, crpc); homa_gap_new(&crpc->msgin.gaps, 1000, 2000); mock_cycles = 1000; @@ -598,23 +650,25 @@ TEST_F(homa_rpc, homa_rpc_reap__nothing_to_reap) TEST_F(homa_rpc, 
homa_find_client_rpc) { + struct homa_rpc *crpc1, *crpc2, *crpc3, *crpc4; + atomic64_set(&self->homa.next_outgoing_id, 3); - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 10000, 1000); + crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 10000, 1000); atomic64_set(&self->homa.next_outgoing_id, 3 + 3*HOMA_CLIENT_RPC_BUCKETS); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, 10000, 1000); + crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id+2, + 10000, 1000); atomic64_set(&self->homa.next_outgoing_id, 3 + 10*HOMA_CLIENT_RPC_BUCKETS); - struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id+4, 10000, 1000); + crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id+4, + 10000, 1000); atomic64_set(&self->homa.next_outgoing_id, 40); - struct homa_rpc *crpc4 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id+6, 10000, 1000); + crpc4 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id+6, + 10000, 1000); EXPECT_EQ(crpc1, homa_find_client_rpc(&self->hsk, crpc1->id)); homa_rpc_unlock(crpc1); @@ -636,20 +690,21 @@ TEST_F(homa_rpc, homa_find_server_rpc) struct homa_rpc *srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, self->server_id, 10000, 100); - ASSERT_NE(NULL, srpc1); struct homa_rpc *srpc2 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, self->server_id + 30*HOMA_SERVER_RPC_BUCKETS, 10000, 100); - ASSERT_NE(NULL, srpc2); struct homa_rpc *srpc3 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port+1, self->server_id + 10*HOMA_SERVER_RPC_BUCKETS, 10000, 100); - ASSERT_NE(NULL, srpc3); struct homa_rpc *srpc4 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port+1, self->server_id + 4, 10000, 100); + + ASSERT_NE(NULL, srpc1); + ASSERT_NE(NULL, srpc2); + ASSERT_NE(NULL, srpc3); ASSERT_NE(NULL, srpc4); EXPECT_EQ(srpc1, homa_find_server_rpc(&self->hsk, self->client_ip, self->client_port, srpc1->id)); diff --git a/test/unit_homa_skb.c b/test/unit_homa_skb.c index 4cfc242a..69bf2749 100644 --- a/test/unit_homa_skb.c +++ b/test/unit_homa_skb.c @@ -20,9 +20,8 @@ static inline struct homa_skb_core *get_skb_core(int core) */ static struct sk_buff *test_skb(struct homa *homa) { - struct sk_buff *skb = homa_skb_new_tx(100); struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); - + struct sk_buff *skb = homa_skb_new_tx(100); int32_t data[1000]; char *src; int i; @@ -56,6 +55,7 @@ static void add_to_pool(struct homa *homa, int num_pages, int core) { struct homa_page_pool *pool = get_skb_core(core)->pool; int i; + for (i = 0; i < num_pages; i++) { pool->pages[pool->avail] = alloc_pages(GFP_KERNEL, HOMA_SKB_PAGE_ORDER); @@ -114,6 +114,7 @@ TEST_F(homa_skb, homa_skb_init) TEST_F(homa_skb, homa_skb_cleanup) { struct homa_skb_core *skb_core = get_skb_core(2); + skb_core->skb_page = 
alloc_pages(GFP_KERNEL, 2); add_to_pool(&self->homa, 5, 2); add_to_pool(&self->homa, 4, 3); @@ -135,7 +136,9 @@ TEST_F(homa_skb, homa_skb_cleanup) TEST_F(homa_skb, homa_skb_stash_pages) { int id = raw_smp_processor_id(); - struct homa_skb_core *skb_core = get_skb_core(id); + struct homa_skb_core *skb_core; + + skb_core = get_skb_core(id); add_to_pool(&self->homa, 5, id); EXPECT_EQ(5, skb_core->pool->avail); EXPECT_EQ(0, skb_core->num_stashed_pages); @@ -161,18 +164,20 @@ TEST_F(homa_skb, homa_skb_stash_pages) TEST_F(homa_skb, homa_skb_extend_frags__basics) { struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); + char *p1, *p2, *p3; int length = 100; - char *p1 = homa_skb_extend_frags(&self->homa, self->skb, &length); + + p1 = homa_skb_extend_frags(&self->homa, self->skb, &length); EXPECT_EQ(100, length); EXPECT_NE(NULL, p1); length = 200; - char *p2 = homa_skb_extend_frags(&self->homa, self->skb, &length); + p2 = homa_skb_extend_frags(&self->homa, self->skb, &length); EXPECT_EQ(200, length); EXPECT_EQ(p1 + 100, p2); length = 300; - char *p3 = homa_skb_extend_frags(&self->homa, self->skb, &length); + p3 = homa_skb_extend_frags(&self->homa, self->skb, &length); EXPECT_EQ(300, length); EXPECT_EQ(p2 + 200, p3); @@ -183,13 +188,15 @@ TEST_F(homa_skb, homa_skb_extend_frags__merge_but_reduce_length) { struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); int length = 1000; - char *p1 = homa_skb_extend_frags(&self->homa, self->skb, &length); + char *p1, *p2; + + p1 = homa_skb_extend_frags(&self->homa, self->skb, &length); EXPECT_EQ(1000, length); EXPECT_NE(NULL, p1); skb_core->page_size = 2048; length = 2000; - char *p2 = homa_skb_extend_frags(&self->homa, self->skb, &length); + p2 = homa_skb_extend_frags(&self->homa, self->skb, &length); EXPECT_EQ(1048, length); EXPECT_EQ(p1 + 1000, p2); @@ -199,23 +206,24 @@ TEST_F(homa_skb, homa_skb_extend_frags__cant_merge_allocate_new_page) { struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); struct sk_buff *skb2 = alloc_skb_fclone(200, GFP_KERNEL); - ASSERT_NE(NULL, skb2); + char *p1, *p2, *p3; + ASSERT_NE(NULL, skb2); int length = 1000; - char *p1 = homa_skb_extend_frags(&self->homa, self->skb, &length); + p1 = homa_skb_extend_frags(&self->homa, self->skb, &length); EXPECT_EQ(1000, length); EXPECT_NE(NULL, p1); EXPECT_EQ(1000, self->skb->len); skb_core->page_size = 2048; length = 1000; - char *p2 = homa_skb_extend_frags(&self->homa, skb2, &length); + p2 = homa_skb_extend_frags(&self->homa, skb2, &length); EXPECT_EQ(1000, length); EXPECT_EQ(p1 + 1024, p2); EXPECT_EQ(1000, skb2->len); length = 1000; - char *p3 = homa_skb_extend_frags(&self->homa, self->skb, &length); + p3 = homa_skb_extend_frags(&self->homa, self->skb, &length); EXPECT_NE(NULL, p3); EXPECT_EQ(1000, length); EXPECT_EQ(2, skb_shinfo(self->skb)->nr_frags); @@ -229,21 +237,22 @@ TEST_F(homa_skb, homa_skb_extend_frags__cant_merge_use_same_page_reduce_length) { struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); struct sk_buff *skb2 = alloc_skb_fclone(200, GFP_KERNEL); - ASSERT_NE(NULL, skb2); + char *p1, *p2, *p3; + ASSERT_NE(NULL, skb2); int length = 1000; - char *p1 = homa_skb_extend_frags(&self->homa, self->skb, &length); + p1 = homa_skb_extend_frags(&self->homa, self->skb, &length); EXPECT_EQ(1000, length); EXPECT_NE(NULL, p1); skb_core->page_size = 2048; length = 500; - char *p2 = homa_skb_extend_frags(&self->homa, skb2, &length); + p2 = homa_skb_extend_frags(&self->homa, skb2, &length); EXPECT_EQ(500, length); 
EXPECT_EQ(p1 + 1024, p2); length = 2000; - char *p3 = homa_skb_extend_frags(&self->homa, self->skb, &length); + p3 = homa_skb_extend_frags(&self->homa, self->skb, &length); EXPECT_EQ(p2 + 512, p3); EXPECT_EQ(512, length); EXPECT_EQ(2, skb_shinfo(self->skb)->nr_frags); @@ -257,6 +266,7 @@ TEST_F(homa_skb, homa_skb_page_alloc__free_previous_page) { struct homa_skb_core *skb_core = get_skb_core(2); struct page *old_page; + EXPECT_TRUE(homa_skb_page_alloc(&self->homa, skb_core)); EXPECT_NE(NULL, skb_core->skb_page); old_page = skb_core->skb_page; @@ -274,6 +284,7 @@ TEST_F(homa_skb, homa_skb_page_alloc__reuse_existing_page) struct sk_buff *skb = homa_skb_new_tx(100); struct page *page; int length = 100; + homa_skb_extend_frags(&self->homa, skb, &length); EXPECT_EQ(100, skb_core->page_inuse); page = skb_core->skb_page; @@ -287,6 +298,7 @@ TEST_F(homa_skb, homa_skb_page_alloc__reuse_existing_page) TEST_F(homa_skb, homa_skb_page_alloc__from_stash) { struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); + add_to_pool(&self->homa, 5, raw_smp_processor_id()); homa_skb_stash_pages(&self->homa, 3*HOMA_SKB_PAGE_SIZE - 100); EXPECT_TRUE(homa_skb_page_alloc(&self->homa, skb_core)); @@ -298,6 +310,7 @@ TEST_F(homa_skb, homa_skb_page_alloc__from_stash) TEST_F(homa_skb, homa_skb_page_alloc__from_pool) { struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); + add_to_pool(&self->homa, 5, raw_smp_processor_id()); EXPECT_EQ(5, skb_core->pool->avail); EXPECT_EQ(0, skb_core->num_stashed_pages); @@ -308,6 +321,7 @@ TEST_F(homa_skb, homa_skb_page_alloc__from_pool) TEST_F(homa_skb, homa_skb_page_alloc__pool_page_taken_while_locking) { struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); + add_to_pool(&self->homa, 1, raw_smp_processor_id()); EXPECT_EQ(1, skb_core->pool->avail); EXPECT_EQ(0, skb_core->num_stashed_pages); @@ -322,6 +336,7 @@ TEST_F(homa_skb, homa_skb_page_alloc__pool_page_taken_while_locking) TEST_F(homa_skb, homa_skb_page_alloc__new_large_page) { struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); + mock_cycles = ~0; EXPECT_EQ(0, skb_core->pool->avail); EXPECT_EQ(0, skb_core->num_stashed_pages); @@ -334,6 +349,7 @@ TEST_F(homa_skb, homa_skb_page_alloc__new_large_page) TEST_F(homa_skb, homa_skb_page_alloc__high_order_page_not_available) { struct homa_skb_core *skb_core = get_skb_core(2); + mock_cycles = ~0; mock_alloc_page_errors = 1; EXPECT_TRUE(homa_skb_page_alloc(&self->homa, skb_core)); @@ -347,6 +363,7 @@ TEST_F(homa_skb, homa_skb_page_alloc__high_order_page_not_available) TEST_F(homa_skb, homa_skb_page_alloc__no_pages_available) { struct homa_skb_core *skb_core = get_skb_core(2); + mock_alloc_page_errors = 3; EXPECT_FALSE(homa_skb_page_alloc(&self->homa, skb_core)); EXPECT_EQ(NULL, skb_core->skb_page); @@ -356,6 +373,7 @@ TEST_F(homa_skb, homa_skb_append_to_frag__basics) { struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); struct skb_shared_info *shinfo = skb_shinfo(self->skb); + char *p; /* First append fits in a single block. 
*/ EXPECT_EQ(0, homa_skb_append_to_frag(&self->homa, self->skb, "abcd", 4)); @@ -367,7 +385,7 @@ TEST_F(homa_skb, homa_skb_append_to_frag__basics) EXPECT_EQ(2, shinfo->nr_frags); EXPECT_EQ(10, skb_frag_size(&shinfo->frags[0])); - char *p = ((char *) page_address(skb_frag_page(&shinfo->frags[0]))) + p = ((char *) page_address(skb_frag_page(&shinfo->frags[0]))) + shinfo->frags[0].offset; p[skb_frag_size(&shinfo->frags[0])] = 0; EXPECT_STREQ("abcd012345", p); @@ -387,8 +405,8 @@ TEST_F(homa_skb, homa_skb_append_to_frag__no_memory) TEST_F(homa_skb, homa_skb_append_from_iter__basics) { struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); - struct skb_shared_info *shinfo = skb_shinfo(self->skb); struct iov_iter *iter = unit_iov_iter((void *) 1000, 5000); + struct skb_shared_info *shinfo = skb_shinfo(self->skb); /* First append fits in a single block. */ unit_log_clear(); @@ -413,6 +431,7 @@ TEST_F(homa_skb, homa_skb_append_from_iter__basics) TEST_F(homa_skb, homa_skb_append_from_iter__no_memory) { struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); + mock_alloc_page_errors = 3; EXPECT_EQ(ENOMEM, -homa_skb_append_from_iter(&self->homa, self->skb, iter, 2000)); @@ -436,9 +455,9 @@ TEST_F(homa_skb, homa_skb_append_from_skb__header_only) } TEST_F(homa_skb, homa_skb_append_from_skb__error_copying_header) { + struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); struct sk_buff *src_skb = test_skb(&self->homa); struct sk_buff *dst_skb = homa_skb_new_tx(100); - struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); mock_alloc_page_errors = -1; skb_core->page_inuse = skb_core->page_size; @@ -452,9 +471,10 @@ TEST_F(homa_skb, homa_skb_append_from_skb__header_and_first_frag) { struct sk_buff *src_skb = test_skb(&self->homa); struct sk_buff *dst_skb = homa_skb_new_tx(100); - struct skb_shared_info *dst_shinfo = skb_shinfo(dst_skb); + struct skb_shared_info *dst_shinfo; int32_t data[500]; + dst_shinfo = skb_shinfo(dst_skb); EXPECT_EQ(0, homa_skb_append_from_skb(&self->homa, dst_skb, src_skb, 80, 100)); memset(data, 0, sizeof(data)); @@ -471,9 +491,10 @@ TEST_F(homa_skb, homa_skb_append_from_skb__multiple_frags) { struct sk_buff *src_skb = test_skb(&self->homa); struct sk_buff *dst_skb = homa_skb_new_tx(100); - struct skb_shared_info *dst_shinfo = skb_shinfo(dst_skb); + struct skb_shared_info *dst_shinfo; int32_t data[500]; + dst_shinfo = skb_shinfo(dst_skb); EXPECT_EQ(0, homa_skb_append_from_skb(&self->homa, dst_skb, src_skb, 320, 600)); memset(data, 0, sizeof(data)); @@ -490,9 +511,10 @@ TEST_F(homa_skb, homa_skb_append_from_skb__dst_runs_out_of_frags) { struct sk_buff *src_skb = test_skb(&self->homa); struct sk_buff *dst_skb = homa_skb_new_tx(100); - struct skb_shared_info *dst_shinfo = skb_shinfo(dst_skb); + struct skb_shared_info *dst_shinfo; int i, err; + dst_shinfo = skb_shinfo(dst_skb); mock_max_skb_frags = 4; for (i = 0; i < 10; i++) { err = homa_skb_append_from_skb(&self->homa, dst_skb, src_skb, @@ -530,8 +552,8 @@ TEST_F(homa_skb, homa_skb_free_many_tx__basics) TEST_F(homa_skb, homa_skb_free_many_tx__skb_ref_count_not_one) { struct sk_buff *skb; - int length; struct page *page; + int length; skb = homa_skb_new_tx(100); length = HOMA_SKB_PAGE_SIZE; @@ -571,6 +593,7 @@ TEST_F(homa_skb, homa_skb_cache_pages__different_numa_nodes) { struct page *pages[4]; int i; + for (i = 0; i < 4; i++) pages[i] = alloc_pages(GFP_KERNEL, HOMA_SKB_PAGE_ORDER); mock_page_nid_mask = 7; @@ -584,6 +607,7 @@ TEST_F(homa_skb, homa_skb_cache_pages__pool_size_exceeded) 
{ struct page *pages[6]; int i; + for (i = 0; i < 6; i++) pages[i] = alloc_pages(GFP_KERNEL, HOMA_SKB_PAGE_ORDER); homa_skb_cache_pages(&self->homa, pages, 4); diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c index e18a37fa..54cb5b70 100644 --- a/test/unit_homa_sock.c +++ b/test/unit_homa_sock.c @@ -48,6 +48,7 @@ TEST_F(homa_sock, homa_port_hash) TEST_F(homa_sock, homa_socktab_start_scan) { struct homa_socktab_scan scan; + homa_destroy(&self->homa); homa_init(&self->homa); mock_sock_init(&self->hsk, &self->homa, HOMA_MIN_DEFAULT_PORT+100); @@ -60,6 +61,7 @@ TEST_F(homa_sock, homa_socktab_next__basics) { struct homa_sock hsk1, hsk2, hsk3, hsk4, *hsk; struct homa_socktab_scan scan; + int first_port = 34000; homa_destroy(&self->homa); homa_init(&self->homa); @@ -87,6 +89,7 @@ TEST_F(homa_sock, homa_socktab_next__deleted_socket) struct homa_sock hsk1, hsk2, hsk3, *hsk; struct homa_socktab_scan scan; int first_port = 34000; + homa_destroy(&self->homa); homa_init(&self->homa); mock_sock_init(&hsk1, &self->homa, first_port); @@ -109,6 +112,7 @@ TEST_F(homa_sock, homa_socktab_next__deleted_socket) TEST_F(homa_sock, homa_sock_init__skip_port_in_use) { struct homa_sock hsk2, hsk3; + self->homa.next_client_port = 0xffff; mock_sock_init(&hsk2, &self->homa, 0); mock_sock_init(&hsk3, &self->homa, 0); @@ -120,6 +124,7 @@ TEST_F(homa_sock, homa_sock_init__skip_port_in_use) TEST_F(homa_sock, homa_sock_init__ip_header_length) { struct homa_sock hsk_v4, hsk_v6; + mock_ipv6 = false; mock_sock_init(&hsk_v4, &self->homa, 0); mock_ipv6 = true; @@ -132,6 +137,7 @@ TEST_F(homa_sock, homa_sock_init__ip_header_length) TEST_F(homa_sock, homa_sock_init__hijack_tcp) { struct homa_sock hijack, no_hijack; + self->homa.hijack_tcp = 0; mock_sock_init(&no_hijack, &self->homa, 0); self->homa.hijack_tcp = 1; @@ -144,8 +150,9 @@ TEST_F(homa_sock, homa_sock_init__hijack_tcp) TEST_F(homa_sock, homa_sock_shutdown__basics) { - int client2, client3; struct homa_sock hsk2, hsk3; + int client2, client3; + mock_sock_init(&hsk2, &self->homa, 0); EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk2, 100)); client2 = hsk2.port; @@ -198,6 +205,7 @@ TEST_F(homa_sock, homa_sock_shutdown__wakeup_interests) { struct homa_interest interest1, interest2, interest3; struct task_struct task1, task2, task3; + interest1.thread = &task1; task1.pid = 100; interest2.thread = &task2; @@ -218,6 +226,7 @@ TEST_F(homa_sock, homa_sock_shutdown__wakeup_interests) TEST_F(homa_sock, homa_sock_bind) { struct homa_sock hsk2; + mock_sock_init(&hsk2, &self->homa, 0); EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk2, 100)); @@ -251,6 +260,7 @@ TEST_F(homa_sock, homa_sock_bind__socket_shutdown) TEST_F(homa_sock, homa_sock_find__basics) { struct homa_sock hsk2; + mock_sock_init(&hsk2, &self->homa, 0); EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk2, 100)); EXPECT_EQ(&self->hsk, homa_sock_find(self->homa.port_map, @@ -265,6 +275,7 @@ TEST_F(homa_sock, homa_sock_find__basics) TEST_F(homa_sock, homa_sock_find__long_hash_chain) { struct homa_sock hsk2, hsk3, hsk4; + EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &self->hsk, 13)); mock_sock_init(&hsk2, &self->homa, 0); EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk2, diff --git a/test/unit_homa_timer.c b/test/unit_homa_timer.c index 090754aa..b552ce6d 100644 --- a/test/unit_homa_timer.c +++ b/test/unit_homa_timer.c @@ -51,6 +51,7 @@ TEST_F(homa_timer, homa_check_rpc__request_ack) struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, 
self->client_port, self->server_id, 100, 100); + ASSERT_NE(NULL, srpc); self->homa.request_ack_ticks = 2; @@ -84,6 +85,7 @@ TEST_F(homa_timer, homa_check_rpc__all_granted_bytes_received) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, 100, 5000); + ASSERT_NE(NULL, crpc); unit_log_clear(); crpc->msgin.granted = 1400; @@ -97,6 +99,7 @@ TEST_F(homa_timer, homa_check_rpc__no_buffer_space) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, 100, 5000); + ASSERT_NE(NULL, crpc); unit_log_clear(); crpc->msgin.num_bpages = 0; @@ -110,6 +113,7 @@ TEST_F(homa_timer, homa_check_rpc__server_has_received_request) struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, self->server_ip, self->client_port, self->server_id, 100, 100); + ASSERT_NE(NULL, srpc); unit_log_clear(); srpc->silent_ticks = 10; @@ -122,6 +126,7 @@ TEST_F(homa_timer, homa_check_rpc__granted_bytes_not_sent) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 5000, 200); + ASSERT_NE(NULL, crpc); unit_log_clear(); crpc->silent_ticks = 10; @@ -134,6 +139,7 @@ TEST_F(homa_timer, homa_check_rpc__timeout) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, 200, 10000); + ASSERT_NE(NULL, crpc); unit_log_clear(); crpc->silent_ticks = self->homa.timeout_ticks-1; @@ -150,6 +156,7 @@ TEST_F(homa_timer, homa_check_rpc__issue_resend) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, 200, 10000); + ASSERT_NE(NULL, crpc); crpc->msgin.granted = 5000; self->homa.resend_ticks = 3; @@ -185,6 +192,7 @@ TEST_F(homa_timer, homa_check_rpc__request_first_bytes_of_message) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 5000, 10000); + ASSERT_NE(NULL, crpc); crpc->msgout.granted = 5000; crpc->msgout.next_xmit_offset = 5000; @@ -207,6 +215,7 @@ TEST_F(homa_timer, homa_check_rpc__call_homa_gap_retry) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, 200, 20000); + ASSERT_NE(NULL, crpc); crpc->silent_ticks = 3; crpc->msgin.granted = 10000; @@ -223,13 +232,14 @@ TEST_F(homa_timer, homa_check_rpc__call_homa_gap_retry) TEST_F(homa_timer, homa_timer__basics) { - self->homa.timeout_ticks = 5; - self->homa.resend_ticks = 3; - self->homa.resend_interval = 2; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, 200, 5000); + ASSERT_NE(NULL, crpc); + self->homa.timeout_ticks = 5; + self->homa.resend_ticks = 3; + self->homa.resend_interval = 2; unit_log_clear(); crpc->silent_ticks = 1; homa_timer(&self->homa); @@ -260,6 +270,7 @@ TEST_F(homa_timer, homa_timer__reap_dead_rpcs) struct homa_rpc *dead = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 40000, 1000); + ASSERT_NE(NULL, dead); homa_rpc_free(dead); EXPECT_EQ(31, self->hsk.dead_skbs); @@ -279,6 +290,7 @@ TEST_F(homa_timer, homa_timer__rpc_in_service) struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, 
self->client_ip, self->server_ip, self->client_port, self->server_id, 5000, 5000); + ASSERT_NE(NULL, srpc); unit_log_clear(); homa_timer(&self->homa); diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index 66f9bd95..8bebb8b9 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -54,12 +54,12 @@ static void set_cutoffs(struct homa *homa, int c0, int c1, int c2, TEST_F(homa_utils, homa_print_ipv4_addr) { - char *p1, *p2; - int i; - struct in6_addr test_addr1 = unit_get_in_addr("192.168.0.1"); struct in6_addr test_addr2 = unit_get_in_addr("1.2.3.4"); struct in6_addr test_addr3 = unit_get_in_addr("5.6.7.8"); + char *p1, *p2; + int i; + p1 = homa_print_ipv6_addr(&test_addr1); p2 = homa_print_ipv6_addr(&test_addr2); EXPECT_STREQ("192.168.0.1", p1); @@ -75,6 +75,7 @@ TEST_F(homa_utils, homa_snprintf) { char buffer[50]; int used = 0; + used = homa_snprintf(buffer, sizeof32(buffer), used, "Test message with values: %d and %d", 100, 1000); EXPECT_EQ(38, used); diff --git a/test/unit_timetrace.c b/test/unit_timetrace.c index 650e96ef..4bb24faf 100644 --- a/test/unit_timetrace.c +++ b/test/unit_timetrace.c @@ -45,6 +45,7 @@ TEST_F(timetrace, tt_freeze) TEST_F(timetrace, tt_record__basics) { char buffer[1000]; + memset(buffer, 0, sizeof(buffer)); tt_record("Message with no args"); mock_cycles++; @@ -69,6 +70,7 @@ TEST_F(timetrace, tt_record__basics) TEST_F(timetrace, tt_record_buf__wraparound) { char buffer[100]; + memset(buffer, 0, sizeof(buffer)); tt_buffer_size = 4; tt_record("Message 1"); @@ -91,8 +93,8 @@ TEST_F(timetrace, tt_record_buf__wraparound) TEST_F(timetrace, tt_find_oldest) { int pos[nr_cpu_ids]; - tt_buffer_size = 4; + tt_buffer_size = 4; tt_record_buf(tt_buffers[0], 1500, "Buf0", 0, 0, 0, 0); tt_record_buf(tt_buffers[0], 1600, "Buf0", 0, 0, 0, 0); tt_record_buf(tt_buffers[0], 1700, "Buf0", 0, 0, 0, 0); @@ -138,6 +140,7 @@ TEST_F(timetrace, tt_proc_open__increment_frozen) TEST_F(timetrace, tt_proc_read__bogus_file) { struct tt_proc_file pf; + pf.file = NULL; int err = -tt_proc_read(&self->file, (char *) 1000, 100, 0); EXPECT_EQ(EINVAL, err); @@ -156,6 +159,7 @@ TEST_F(timetrace, tt_proc_read__uninitialized) TEST_F(timetrace, tt_proc_read__nothing_to_read) { char buffer[1000]; + memset(buffer, 0, sizeof(buffer)); buffer[0] = 0; tt_proc_open(NULL, &self->file); @@ -166,6 +170,7 @@ TEST_F(timetrace, tt_proc_read__nothing_to_read) TEST_F(timetrace, tt_proc_read__leftovers) { char buffer[1000]; + memset(buffer, 0, sizeof(buffer)); tt_pf_storage = 100; tt_record_buf(tt_buffers[0], 1000, @@ -190,6 +195,7 @@ TEST_F(timetrace, tt_proc_read__leftovers) TEST_F(timetrace, tt_proc_read__sort_events_by_time) { char buffer[1000]; + memset(buffer, 0, sizeof(buffer)); tt_record_buf(tt_buffers[0], 1000, "Buf0", 0, 0, 0, 0); tt_record_buf(tt_buffers[0], 1100, "Buf0", 0, 0, 0, 0); @@ -217,6 +223,7 @@ TEST_F(timetrace, tt_proc_read__sort_events_by_time) TEST_F(timetrace, tt_proc_read__event_barely_fits_in_buffer) { char buffer[1000]; + memset(buffer, 0, sizeof(buffer)); tt_pf_storage = 25; tt_record_buf(tt_buffers[0], 1000, @@ -235,6 +242,7 @@ TEST_F(timetrace, tt_proc_read__event_barely_fits_in_buffer) TEST_F(timetrace, tt_proc_read__single_entry_too_large) { char buffer[1000]; + memset(buffer, 0, sizeof(buffer)); tt_pf_storage = 20; tt_record_buf(tt_buffers[0], 1000, @@ -248,6 +256,7 @@ TEST_F(timetrace, tt_proc_read__single_entry_too_large) TEST_F(timetrace, tt_proc_release__bogus_file) { struct tt_proc_file pf; + pf.file = NULL; int err = -tt_proc_release(NULL, &self->file); 
EXPECT_EQ(EINVAL, err); diff --git a/timetrace.c b/timetrace.c index fdac6291..15f490ab 100644 --- a/timetrace.c +++ b/timetrace.c @@ -283,8 +283,8 @@ void tt_record_buf(struct tt_buffer *buffer, __u64 timestamp, void tt_find_oldest(int *pos) { struct tt_buffer *buffer; - int i; __u64 start_time = 0; + int i; for (i = 0; i < nr_cpu_ids; i++) { buffer = tt_buffers[i]; @@ -370,11 +370,12 @@ int tt_proc_open(struct inode *inode, struct file *file) ssize_t tt_proc_read(struct file *file, char __user *user_buf, size_t length, loff_t *offset) { + struct tt_proc_file *pf = file->private_data; + /* # bytes of data that have accumulated in pf->msg_storage but * haven't been copied to user space yet. */ int copied_to_user = 0; - struct tt_proc_file *pf = file->private_data; spin_lock(&tt_lock); if ((pf == NULL) || (pf->file != file)) { @@ -499,8 +500,8 @@ loff_t tt_proc_lseek(struct file *file, loff_t offset, int whence) */ int tt_proc_release(struct inode *inode, struct file *file) { - int i; struct tt_proc_file *pf = file->private_data; + int i; if ((pf == NULL) || (pf->file != file)) { pr_err("tt_metrics_release found damaged private_data: 0x%p\n", @@ -550,6 +551,10 @@ int tt_proc_release(struct inode *inode, struct file *file) */ void tt_print_file(char *path) { + /* Static buffer for accumulating output data. */ + static char buffer[10000]; + struct file *filp = NULL; + /* Index of the next entry to return from each tt_buffer. * This array is too large to allocate on the stack, and we don't * want to allocate space dynamically (this function could be @@ -559,13 +564,9 @@ void tt_print_file(char *path) */ static int pos[NR_CPUS]; static atomic_t active; - struct file *filp = NULL; - int err; - - /* Also use a static buffer for accumulating output data. */ - static char buffer[10000]; int bytes_used = 0; loff_t offset = 0; + int err; if (atomic_xchg(&active, 1)) { pr_err("concurrent call to %s aborting\n", __func__); @@ -591,10 +592,10 @@ void tt_print_file(char *path) /* Each iteration of this loop printk's one event. */ while (true) { + __u64 earliest_time = ~0; struct tt_event *event; - int i; int current_core = -1; - __u64 earliest_time = ~0; + int i; /* Check all the traces to find the earliest available event. */ for (i = 0; i < nr_cpu_ids; i++) { @@ -691,11 +692,11 @@ void tt_printk(void) /* Each iteration of this loop printk's one event. */ while (true) { + __u64 earliest_time = ~0; struct tt_event *event; - int i; int current_core = -1; - __u64 earliest_time = ~0; char msg[200]; + int i; /* Check all the traces to find the earliest available event. */ for (i = 0; i < nr_cpu_ids; i++) { @@ -752,10 +753,10 @@ void tt_get_messages(char *buffer, size_t length) /* Each iteration of this loop prints one event. */ while (true) { + __u64 earliest_time = ~0; struct tt_event *event; - int i, result; int current_core = -1; - __u64 earliest_time = ~0; + int i, result; /* Check all the traces to find the earliest available event. 
*/ for (i = 0; i < nr_cpu_ids; i++) { From 4e0c8d83f387e2bb876cb0b2891729f86536e977 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 8 Oct 2024 13:53:11 -0700 Subject: [PATCH 041/625] Use L1_CACHE_BYTES for cache line size --- homa_impl.h | 24 +++++++----------------- homa_pool.h | 6 +++--- 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index daefbf11..1ad745d2 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -141,9 +141,6 @@ extern void homa_throttle_lock_slow(struct homa *homa); #define sizeof32(type) ((int) (sizeof(type))) -/** define CACHE_LINE_SIZE - The number of bytes in a cache line. */ -#define CACHE_LINE_SIZE 64 - /** * define HOMA_MAX_GRANTS - Used to size various data structures for grant * management; the max_overcommit sysctl parameter must never be greater than @@ -151,13 +148,6 @@ extern void homa_throttle_lock_slow(struct homa *homa); */ #define HOMA_MAX_GRANTS 10 -/** - * struct homa_cache_line - An object whose size equals that of a cache line. - */ -struct homa_cache_line { - char bytes[64]; -}; - /** * struct homa_interest - Contains various information used while waiting * for incoming messages (indicates what kinds of messages a particular @@ -261,13 +251,13 @@ struct homa { * it could be a severe underestimate if there is competing traffic * from, say, TCP. Access only with atomic ops. */ - atomic64_t link_idle_time __aligned(CACHE_LINE_SIZE); + atomic64_t link_idle_time __aligned(L1_CACHE_BYTES); /** * @grantable_lock: Used to synchronize access to grant-related * fields below, from @grantable_peers to @last_grantable_change. */ - spinlock_t grantable_lock __aligned(CACHE_LINE_SIZE); + spinlock_t grantable_lock __aligned(L1_CACHE_BYTES); /** * @grantable_lock_time: get_cycles() time when grantable_lock @@ -366,7 +356,7 @@ struct homa { * @pacer_mutex: Ensures that only one instance of homa_pacer_xmit * runs at a time. Only used in "try" mode: never block on this. */ - spinlock_t pacer_mutex __aligned(CACHE_LINE_SIZE); + spinlock_t pacer_mutex __aligned(L1_CACHE_BYTES); /** * @pacer_fifo_fraction: The fraction of time (in thousandths) when @@ -428,7 +418,7 @@ struct homa { * a peer sends more bytes than granted (see synchronization note in * homa_send_grants for why we have to allow this possibility). */ - atomic_t total_incoming __aligned(CACHE_LINE_SIZE); + atomic_t total_incoming __aligned(L1_CACHE_BYTES); /** * @next_client_port: A client port number to consider for the @@ -436,13 +426,13 @@ struct homa { * be in the range allocated for servers; must check before using. * This port may also be in use already; must check. */ - __u16 next_client_port __aligned(CACHE_LINE_SIZE); + __u16 next_client_port __aligned(L1_CACHE_BYTES); /** * @port_map: Information about all open sockets. Dynamically * allocated; must be kfreed. */ - struct homa_socktab *port_map __aligned(CACHE_LINE_SIZE); + struct homa_socktab *port_map __aligned(L1_CACHE_BYTES); /** * @peertab: Info about all the other hosts we have communicated with. @@ -454,7 +444,7 @@ struct homa { * @page_pool_mutex: Synchronizes access to any/all of the page_pools * used for outgoing sk_buff data. */ - spinlock_t page_pool_mutex __aligned(CACHE_LINE_SIZE); + spinlock_t page_pool_mutex __aligned(L1_CACHE_BYTES); /** * @page_pools: One page pool for each NUMA node on the machine. 
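
The idiom this patch adopts is worth a brief illustration. Instead of hard-coding a 64-byte line (the old CACHE_LINE_SIZE macro and struct homa_cache_line), fields that see heavy cross-core traffic are aligned with the kernel's L1_CACHE_BYTES constant from <linux/cache.h>, which tracks the architecture the module is built for. The sketch below shows the two variants of the pattern visible in this patch; it is illustrative only, and every type and field name in it is hypothetical, not Homa code.

	/* Minimal sketch of the cache-line idiom, assuming only standard
	 * kernel definitions (L1_CACHE_BYTES, __aligned()).  All names
	 * here are made up for illustration.
	 */
	#include <linux/atomic.h>
	#include <linux/cache.h>
	#include <linux/spinlock.h>

	struct example_state {
		/* Frequently written by many cores; starting it on its
		 * own cache line keeps writers from invalidating the
		 * lines that hold neighboring fields (false sharing).
		 */
		atomic64_t hot_counter __aligned(L1_CACHE_BYTES);

		/* Likewise gets a fresh cache line. */
		spinlock_t lock __aligned(L1_CACHE_BYTES);
	};

	/* When a structure must occupy exactly one cache line (as
	 * homa_bpage in homa_pool.h below), overlay the fields on a
	 * padding array and verify the size at compile time.
	 */
	struct example_line {
		union {
			char pad[L1_CACHE_BYTES];
			struct {
				spinlock_t lock;
				atomic_t refs;
			};
		};
	};
	_Static_assert(sizeof(struct example_line) == L1_CACHE_BYTES,
		       "example_line overflowed a cache line");

A side effect of __aligned() on a member is that the containing structure's size is rounded up to a multiple of that alignment, so each such field costs a whole cache line of memory; that is the intended trade for less cross-core cache-line bouncing.
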
diff --git a/homa_pool.h b/homa_pool.h
index 1b04b4fd..bde038f2 100644
--- a/homa_pool.h
+++ b/homa_pool.h
@@ -18,7 +18,7 @@ struct homa_bpage {
	 * @cache_line: Ensures that each homa_bpage object
	 * is exactly one cache line long.
	 */
-	struct homa_cache_line cache_line;
+	char cache_line[L1_CACHE_BYTES];
	struct {
		/** @lock: to synchronize shared access. */
		spinlock_t lock;
@@ -46,7 +46,7 @@ struct homa_bpage {
		};
	};
 };
-_Static_assert(sizeof(struct homa_bpage) == sizeof(struct homa_cache_line),
+_Static_assert(sizeof(struct homa_bpage) == L1_CACHE_BYTES,
	"homa_bpage overflowed a cache line");
 
 /**
@@ -59,7 +59,7 @@ struct homa_pool_core {
	 * @cache_line: Ensures that each object is exactly one
	 * cache line long.
	 */
-	struct homa_cache_line cache_line;
+	char cache_line[L1_CACHE_BYTES];
	struct {
		/**
		 * @page_hint: Index of bpage in pool->descriptors,

From 8d88c723a86ab2812acf7d44d136bb32cae263f5 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Wed, 9 Oct 2024 10:45:24 -0700
Subject: [PATCH 042/625] Fix test subdirectory to pass checkpatch.pl

---
 test/main.c               |  37 +++---
 test/mock.c               | 224 +++++++++++++++++++------------------
 test/unit_homa_grant.c    |  46 ++++----
 test/unit_homa_incoming.c | 230 +++++++++++++++++---------------------
 test/unit_homa_metrics.c  |   5 +-
 test/unit_homa_offload.c  |  52 +++++----
 test/unit_homa_outgoing.c |  77 ++++---------
 test/unit_homa_peer.c     |  22 ++--
 test/unit_homa_plumbing.c |  28 +++--
 test/unit_homa_pool.c     |  14 +--
 test/unit_homa_rpc.c      |  12 +-
 test/unit_homa_skb.c      |  25 ++---
 test/unit_homa_sock.c     |  22 ++--
 test/unit_homa_timer.c    |   4 +-
 test/unit_homa_utils.c    |   4 +-
 test/unit_timetrace.c     |  24 ++--
 test/utils.c              |  76 +++++++------
 test/utils.h              |  33 +++---
 18 files changed, 436 insertions(+), 499 deletions(-)

diff --git a/test/main.c b/test/main.c
index 17a5e1d5..73ae9c30 100644
--- a/test/main.c
+++ b/test/main.c
@@ -1,6 +1,4 @@
-/* Copyright (c) 2019-2022 Homa Developers
- * SPDX-License-Identifier: BSD-1-Clause
- */
+// SPDX-License-Identifier: BSD-2-Clause
 
 /* Main program for running Homa unit tests. */
 
@@ -8,29 +6,23 @@
 #include "kselftest_harness.h"
 #include "mock.h"
 
-static char * helpMessage =
-	"This program runs unit tests written in the Linux kernel kselftest "
-	"style.\n"
+static char *helpMessage =
+	"This program runs unit tests written in the Linux kernel kselftest style.\n"
	"  Usage: %s options test_name test_name ...\n"
	"The following options are supported:\n"
	"    --help or -h      Print this message\n"
-	"    --ipv4            Simulate IPv4 for all packets (default: "
-	"use IPv6)\n"
-	"    --verbose or -v   Print the names of all tests as they run "
-	"(default:\n"
+	"    --ipv4            Simulate IPv4 for all packets (default: use IPv6)\n"
+	"    --verbose or -v   Print the names of all tests as they run (default:\n"
	"                      print only tests that fail)\n"
-	"If one or more test_name arguments are provided, then only those "
-	"tests are\n"
+	"If one or more test_name arguments are provided, then only those tests are\n"
	"run; if no test names are provided, then all tests are run.\n"
-	"\n"
-	"Note: the tests should provide complete coverage of both IPv4 and "
-	"IPv6 without\n"
-	"using the --ipv4 argument (code that depends on IPv4 vs. IPv6 "
-	"already has\n"
-	"special test cases for each); --ipv4 is provided for occasional "
-	"double-checking.\n";
+	"\n"
+	"Note: the tests should provide complete coverage of both IPv4 and IPv6 without\n"
+	"using the --ipv4 argument (code that depends on IPv4 vs. 
IPv6 already has\n" + "special test cases for each); --ipv4 is provided for occasional double-checking.\n"; -int main(int argc, char **argv) { +int main(int argc, char **argv) +{ int verbose = 0; int i; @@ -49,9 +41,8 @@ int main(int argc, char **argv) { printf("Unknown option %s; type '%s --help' for help\n", argv[i], argv[0]); return 1; - } else { + } else break; - } } test_harness_run(argc-i, argv+i, verbose); -} \ No newline at end of file +} diff --git a/test/mock.c b/test/mock.c index bea2d829..fe547ece 100644 --- a/test/mock.c +++ b/test/mock.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2022 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause /* This file provides simplified substitutes for many Linux variables and * functions in order to allow Homa unit tests to be run outside a Linux @@ -34,24 +32,24 @@ extern void *memcpy(void *dest, const void *src, size_t n); * the next call to the function will fail; bit 1 corresponds to the next * call after that, and so on. */ -int mock_alloc_page_errors = 0; -int mock_alloc_skb_errors = 0; -int mock_copy_data_errors = 0; -int mock_copy_to_iter_errors = 0; -int mock_copy_to_user_errors = 0; -int mock_cpu_idle = 0; -int mock_import_ubuf_errors = 0; -int mock_import_iovec_errors = 0; -int mock_ip6_xmit_errors = 0; -int mock_ip_queue_xmit_errors = 0; -int mock_kmalloc_errors = 0; -int mock_route_errors = 0; -int mock_spin_lock_held = 0; -int mock_trylock_errors = 0; -int mock_vmalloc_errors = 0; +int mock_alloc_page_errors; +int mock_alloc_skb_errors; +int mock_copy_data_errors; +int mock_copy_to_iter_errors; +int mock_copy_to_user_errors; +int mock_cpu_idle; +int mock_import_ubuf_errors; +int mock_import_iovec_errors; +int mock_ip6_xmit_errors; +int mock_ip_queue_xmit_errors; +int mock_kmalloc_errors; +int mock_route_errors; +int mock_spin_lock_held; +int mock_trylock_errors; +int mock_vmalloc_errors; /* The return value from calls to signal_pending(). */ -int mock_signal_pending = 0; +int mock_signal_pending; /* Used as current task during tests. */ struct task_struct mock_task; @@ -59,24 +57,24 @@ struct task_struct mock_task; /* If a test sets this variable to nonzero, ip_queue_xmit will log * outgoing packets using the long format rather than short. */ -int mock_xmit_log_verbose = 0; +int mock_xmit_log_verbose; /* If a test sets this variable to nonzero, ip_queue_xmit will log * the contents of the homa_info from packets. */ -int mock_xmit_log_homa_info = 0; +int mock_xmit_log_homa_info; /* If a test sets this variable to nonzero, call_rcu_sched will log * whenever it is invoked. */ -int mock_log_rcu_sched = 0; +int mock_log_rcu_sched; /* A zero value means that copy_to_user will actually copy bytes to * the destination address; if nonzero, then 0 bits determine which * copies actually occur (bit 0 for the first copy, etc., just like * error masks). */ -int mock_copy_to_user_dont_copy = 0; +int mock_copy_to_user_dont_copy; /* HOMA_BPAGE_SIZE will evaluate to this. */ int mock_bpage_size = 0x10000; @@ -87,48 +85,49 @@ int mock_bpage_shift = 16; /* Keeps track of all the blocks of memory that have been allocated by * kmalloc but not yet freed by kfree. Reset for each test. */ -static struct unit_hash *kmallocs_in_use = NULL; +static struct unit_hash *kmallocs_in_use; /* Keeps track of all the results returned by proc_create that have not * yet been closed by calling proc_remove. Reset for each test. 
*/ -static struct unit_hash *proc_files_in_use = NULL; +static struct unit_hash *proc_files_in_use; /* Keeps track of all the results returned by alloc_pages that have * not yet been released by calling put_page. The value of each entry is * a (char *) giving the reference count for the page. Reset for each test. */ -static struct unit_hash *pages_in_use = NULL; +static struct unit_hash *pages_in_use; /* Keeps track of all the results returned by ip_route_output_flow that - * have not yet been freed. Reset for each test. */ -static struct unit_hash *routes_in_use = NULL; + * have not yet been freed. Reset for each test. + */ +static struct unit_hash *routes_in_use; /* Keeps track of all sk_buffs that are alive in the current test. * Reset for each test. */ -static struct unit_hash *skbs_in_use = NULL; +static struct unit_hash *skbs_in_use; /* Keeps track of all the blocks of memory that have been allocated by * vmalloc but not yet freed by vfree. Reset for each test. */ -static struct unit_hash *vmallocs_in_use = NULL; +static struct unit_hash *vmallocs_in_use; /* The number of locks that have been acquired but not yet released. * Should be 0 at the end of each test. */ -static int mock_active_locks = 0; +static int mock_active_locks; /* The number of times rcu_read_lock has been called minus the number * of times rcu_read_unlock has been called. * Should be 0 at the end of each test. */ -static int mock_active_rcu_locks = 0; +static int mock_active_rcu_locks; /* Used as the return value for calls to get_cycles. A value of ~0 means * return actual clock time. */ -cycles_t mock_cycles = 0; +cycles_t mock_cycles; /* Indicates whether we should be simulation IPv6 or IPv4 in the * current test. Can be overridden by a test. @@ -140,14 +139,14 @@ bool mock_ipv6_default; /* List of priorities for all outbound packets. */ char mock_xmit_prios[1000]; -int mock_xmit_prios_offset = 0; +int mock_xmit_prios_offset; /* Maximum packet size allowed by "network" (see homa_message_out_fill; * chosen so that data packets will have UNIT_TEST_DATA_PER_PACKET bytes * of payload. The variable can be modified if useful in some tests. * Set by mock_sock_init. */ -int mock_mtu = 0; +int mock_mtu; /* Used instead of MAX_SKB_FRAGS when running some unit tests. */ int mock_max_skb_frags = MAX_SKB_FRAGS; @@ -158,12 +157,12 @@ int mock_numa_mask = 5; /* Bits determine the result of successive calls to compound order, starting * at the lowest bit. 0 means return HOMA_SKB_PAGE_ORDER, 1 means return 0. */ -int mock_compound_order_mask = 0; +int mock_compound_order_mask; /* Bits specify the NUMA node number that will be returned by the next * calls to mock_page_to_nid, starting with the low-order bit. 
*/ -int mock_page_nid_mask = 0; +int mock_page_nid_mask; struct dst_ops mock_dst_ops = {.mtu = mock_get_mtu}; struct netdev_queue mock_net_queue = {.state = 0}; @@ -177,14 +176,14 @@ const struct net_offload *inet6_offloads[MAX_INET_PROTOS]; static struct hrtimer_clock_base clock_base; unsigned int cpu_khz = 1000000; struct task_struct *current_task = &mock_task; -unsigned long ex_handler_refcount = 0; +unsigned long ex_handler_refcount; struct net init_net; unsigned long volatile jiffies = 1100; unsigned int nr_cpu_ids = 8; -unsigned long page_offset_base = 0; -unsigned long phys_base = 0; -unsigned long vmemmap_base = 0; -int __preempt_count = 0; +unsigned long page_offset_base; +unsigned long phys_base; +unsigned long vmemmap_base; +int __preempt_count; struct pcpu_hot pcpu_hot = {.cpu_number = 1}; char sock_flow_table[RPS_SOCK_FLOW_TABLE_SIZE(1024)]; struct net_hotdata net_hotdata = { @@ -193,18 +192,20 @@ struct net_hotdata net_hotdata = { }; extern void add_wait_queue(struct wait_queue_head *wq_head, - struct wait_queue_entry *wq_entry) {} + struct wait_queue_entry *wq_entry) +{} struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, int node) { + struct sk_buff *skb; int shinfo_size; if (mock_check_error(&mock_alloc_skb_errors)) return NULL; - struct sk_buff *skb = malloc(sizeof(struct sk_buff)); + skb = malloc(sizeof(struct sk_buff)); if (skb == NULL) - FAIL("skb malloc failed in __alloc_skb"); + FAIL("skb malloc failed in %s", __func__); memset(skb, 0, sizeof(*skb)); if (!skbs_in_use) skbs_in_use = unit_hash_new(); @@ -214,7 +215,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, skb->head = malloc(size + shinfo_size); memset(skb->head, 0, size + shinfo_size); if (skb->head == NULL) - FAIL("data malloc failed in __alloc_skb"); + FAIL("data malloc failed in %s", __func__); skb->data = skb->head; skb_reset_tail_pointer(skb); skb->end = skb->tail + size; @@ -230,13 +231,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, return skb; } -void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) -{ - if (mock_log_rcu_sched) - unit_log_printf("; ", "call_rcu_sched"); - func(head); -} - void __check_object_size(const void *ptr, unsigned long n, bool to_user) {} size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *iter) @@ -246,14 +240,15 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *iter) if (mock_check_error(&mock_copy_data_errors)) return false; if (bytes > iter->count) { - unit_log_printf("; ", "copy_from_iter needs %lu bytes, but " - "iov_iter has only %lu", bytes, iter->count); + unit_log_printf("; ", "copy_from_iter needs %lu bytes, but iov_iter has only %lu", bytes, + iter->count); return 0; } while (bytes_left > 0) { struct iovec *iov = (struct iovec *) iter_iov(iter); __u64 int_base = (__u64) iov->iov_base; size_t chunk_bytes = iov->iov_len; + if (chunk_bytes > bytes_left) chunk_bytes = bytes_left; unit_log_printf("; ", "_copy_from_iter %lu bytes at %llu", @@ -322,11 +317,6 @@ void __copy_overflow(int size, unsigned long count) abort(); } -void do_exit(long error_code) -{ - while(1) {} -} - void dst_release(struct dst_entry *dst) { if (!dst) @@ -335,7 +325,7 @@ void dst_release(struct dst_entry *dst) if (atomic_read(&dst->__rcuref.refcnt) > 0) return; if (!routes_in_use || unit_hash_get(routes_in_use, dst) == NULL) { - FAIL("dst_release on unknown route"); + FAIL("%s on unknown route", __func__); return; } unit_hash_erase(routes_in_use, dst); @@ -343,9 +333,10 @@ 
void dst_release(struct dst_entry *dst) } void finish_wait(struct wait_queue_head *wq_head, - struct wait_queue_entry *wq_entry) {} + struct wait_queue_entry *wq_entry) +{} -#if LINUX_VERSION_CODE < KERNEL_VERSION(5,18,0) +#if KERNEL_VERSION(5, 18, 0) > LINUX_VERSION_CODE void get_random_bytes(void *buf, int nbytes) #else void get_random_bytes(void *buf, size_t nbytes) @@ -378,7 +369,8 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, } void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - u64 range_ns, const enum hrtimer_mode mode) {} + u64 range_ns, const enum hrtimer_mode mode) +{} void __icmp_send(struct sk_buff *skb, int type, int code, __be32 info, const struct ip_options *opt) @@ -398,14 +390,14 @@ int idle_cpu(int cpu) return mock_check_error(&mock_cpu_idle); } -ssize_t import_iovec(int type, const struct iovec __user * uvector, - unsigned nr_segs, unsigned fast_segs, +ssize_t import_iovec(int type, const struct iovec __user *uvector, + unsigned int nr_segs, unsigned int fast_segs, struct iovec **iov, struct iov_iter *iter) { ssize_t size; - unsigned i; + unsigned int i; - *iov = (struct iovec *) kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); + *iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); if (mock_check_error(&mock_import_iovec_errors)) return -EINVAL; size = 0; @@ -509,7 +501,8 @@ int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, return 0; } -void inet_register_protosw(struct inet_protosw *p) {} +void inet_register_protosw(struct inet_protosw *p) +{} int inet_release(struct socket *sock) { @@ -521,10 +514,12 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) return 0; } -void inet_unregister_protosw(struct inet_protosw *p) {} +void inet_unregister_protosw(struct inet_protosw *p) +{} void __init_swait_queue_head(struct swait_queue_head *q, const char *name, - struct lock_class_key *key) {} + struct lock_class_key *key) +{} void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, @@ -551,10 +546,10 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len) struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst) { + struct rtable *route; + if (mock_check_error(&mock_route_errors)) return ERR_PTR(-EHOSTUNREACH); - - struct rtable *route; route = malloc(sizeof(struct rtable)); if (!route) { FAIL("malloc failed"); @@ -580,6 +575,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, { char buffer[200]; const char *prefix = " "; + if (mock_check_error(&mock_ip6_xmit_errors)) { kfree_skb(skb); return -ENETDOWN; @@ -597,11 +593,11 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, unit_log_printf("; ", "xmit %s", buffer); if (mock_xmit_log_homa_info) { struct homa_skb_info *homa_info; + homa_info = homa_get_skb_info(skb); - unit_log_printf("; ", "homa_info: wire_bytes %d, data_bytes %d, " - "seg_length %d, offset %d", homa_info->wire_bytes, - homa_info->data_bytes, homa_info->seg_length, - homa_info->offset); + unit_log_printf("; ", "homa_info: wire_bytes %d, data_bytes %d, seg_length %d, offset %d", + homa_info->wire_bytes, homa_info->data_bytes, + homa_info->seg_length, homa_info->offset); } kfree_skb(skb); return 0; @@ -632,6 +628,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) unit_log_printf("; ", "xmit %s", buffer); if (mock_xmit_log_homa_info) { struct 
homa_skb_info *homa_info; + homa_info = homa_get_skb_info(skb); unit_log_printf("; ", "homa_info: wire_bytes %d, data_bytes %d", homa_info->wire_bytes, homa_info->data_bytes); @@ -699,7 +696,7 @@ void kfree(const void *block) if (block == NULL) return; if (!kmallocs_in_use || unit_hash_get(kmallocs_in_use, block) == NULL) { - FAIL("kfree on unknown block"); + FAIL("%s on unknown block", __func__); return; } unit_hash_erase(kmallocs_in_use, block); @@ -722,21 +719,23 @@ void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) unit_hash_erase(skbs_in_use, skb); while (shinfo->frag_list) { struct sk_buff *next = shinfo->frag_list->next; + kfree_skb(shinfo->frag_list); shinfo->frag_list = next; } - for (i = 0; i < shinfo->nr_frags; i++) { + for (i = 0; i < shinfo->nr_frags; i++) put_page(skb_frag_page(&shinfo->frags[i])); - } free(skb->head); free(skb); } void *mock_kmalloc(size_t size, gfp_t flags) { + void *block; + if (mock_check_error(&mock_kmalloc_errors)) return NULL; - void *block = malloc(size); + block = malloc(size); if (!block) { FAIL("malloc failed"); return NULL; @@ -856,7 +855,7 @@ void proc_remove(struct proc_dir_entry *de) { if (!proc_files_in_use || unit_hash_get(proc_files_in_use, de) == NULL) { - FAIL("proc_remove on unknown dir_entry"); + FAIL("%s on unknown dir_entry", __func__); return; } unit_hash_erase(proc_files_in_use, de); @@ -925,7 +924,8 @@ void release_sock(struct sock *sk) } void remove_wait_queue(struct wait_queue_head *wq_head, - struct wait_queue_entry *wq_entry) {} + struct wait_queue_entry *wq_entry) +{} void schedule(void) { @@ -933,12 +933,15 @@ void schedule(void) } void security_sk_classify_flow(const struct sock *sk, - struct flowi_common *flic) {} + struct flowi_common *flic) +{} void __show_free_areas(unsigned int filter, nodemask_t *nodemask, - int max_zone_idx) {} + int max_zone_idx) +{} -void sk_common_release(struct sock *sk) {} +void sk_common_release(struct sock *sk) +{} int sk_set_peek_off(struct sock *sk, int val) { @@ -953,9 +956,8 @@ int skb_copy_datagram_iter(const struct sk_buff *from, int offset, if (mock_check_error(&mock_copy_data_errors)) return -EFAULT; if (bytes_left > iter->count) { - unit_log_printf("; ", "skb_copy_datagram_iter needs %lu bytes, " - "but iov_iter has only %lu", - bytes_left, iter->count); + unit_log_printf("; ", "%s needs %lu bytes, but iov_iter has only %lu", + __func__, bytes_left, iter->count); return 0; } while (bytes_left > 0) { @@ -966,7 +968,7 @@ int skb_copy_datagram_iter(const struct sk_buff *from, int offset, if (chunk_bytes > bytes_left) chunk_bytes = bytes_left; unit_log_printf("; ", - "skb_copy_datagram_iter: %lu bytes to 0x%llx: ", + "%s: %lu bytes to 0x%llx: ", __func__, chunk_bytes, int_base); unit_log_data(NULL, from->data + offset + size - bytes_left, chunk_bytes); @@ -988,7 +990,7 @@ struct sk_buff *skb_dequeue(struct sk_buff_head *list) void *skb_pull(struct sk_buff *skb, unsigned int len) { if ((skb_tail_pointer(skb) - skb->data) < len) - FAIL("sk_buff underflow during pull"); + FAIL("sk_buff underflow during %s", __func__); skb->len -= len; return skb->data += len; } @@ -998,7 +1000,7 @@ void *skb_push(struct sk_buff *skb, unsigned int len) skb->data -= len; skb->len += len; if (unlikely(skb->data < skb->head)) - FAIL("sk_buff underflow during skb_push"); + FAIL("sk_buff underflow during %s", __func__); return skb->data; } @@ -1078,21 +1080,23 @@ int sock_no_socketpair(struct socket *sock1, struct socket *sock2) return 0; } -void synchronize_sched(void) {} - -void 
__tasklet_hi_schedule(struct tasklet_struct *t) {} +void __tasklet_hi_schedule(struct tasklet_struct *t) +{} void tasklet_init(struct tasklet_struct *t, - void (*func)(unsigned long), unsigned long data) {} + void (*func)(unsigned long), unsigned long data) +{} -void tasklet_kill(struct tasklet_struct *t) {} +void tasklet_kill(struct tasklet_struct *t) +{} -void unregister_net_sysctl_table(struct ctl_table_header *header) {} +void unregister_net_sysctl_table(struct ctl_table_header *header) +{} void vfree(const void *block) { if (!vmallocs_in_use || unit_hash_get(vmallocs_in_use, block) == NULL) { - FAIL("vfree on unknown block"); + FAIL("%s on unknown block", __func__); return; } unit_hash_erase(vmallocs_in_use, block); @@ -1106,7 +1110,7 @@ int vfs_fsync(struct file *file, int datasync) void wait_for_completion(struct completion *x) {} -long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, +long wait_woken(struct wait_queue_entry *wq_entry, unsigned int mode, long timeout) { return 0; @@ -1126,7 +1130,7 @@ int wake_up_process(struct task_struct *tsk) void __warn_printk(const char *s, ...) {} -int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, +int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, int sync, void *key) { return 0; @@ -1136,7 +1140,7 @@ int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, * mock_alloc_pages() - Called instead of alloc_pages when Homa is compiled * for unit testing. */ -struct page *mock_alloc_pages(gfp_t gfp, unsigned order) +struct page *mock_alloc_pages(gfp_t gfp, unsigned int order) { struct page *page; @@ -1169,7 +1173,7 @@ int mock_check_error(int *errorMask) * mock_clear_xmit_prios() - Remove all information from the list of * transmit priorities. */ -void mock_clear_xmit_prios() +void mock_clear_xmit_prios(void) { mock_xmit_prios_offset = 0; mock_xmit_prios[0] = 0; @@ -1219,6 +1223,7 @@ cycles_t mock_get_cycles(void) { if (mock_cycles == ~0) { uint32_t lo, hi; + __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi)); return (((uint64_t)hi << 32) | lo); } @@ -1240,7 +1245,7 @@ void mock_get_page(struct page *page) int64_t ref_count = (int64_t) unit_hash_get(pages_in_use, page); if (ref_count == 0) - FAIL(" unallocated page passed to mock_get_page"); + FAIL("unallocated page passed to %s", __func__); else unit_hash_set(pages_in_use, page, (void *) (ref_count+1)); } @@ -1274,7 +1279,7 @@ void mock_put_page(struct page *page) int64_t ref_count = (int64_t) unit_hash_get(pages_in_use, page); if (ref_count == 0) - FAIL(" unallocated page passed to mock_put_page"); + FAIL("unallocated page passed to %s", __func__); else { ref_count--; if (ref_count == 0) { @@ -1459,7 +1464,7 @@ void mock_sock_init(struct homa_sock *hsk, struct homa *homa, int port) /** * mock_spin_unlock() - Called instead of spin_unlock when Homa is compiled * for unit testing. - * @lock: Lock to be be released (ignored). + * @lock: Lock to be released (ignored). 
*/ void mock_spin_unlock(spinlock_t *lock) { @@ -1572,11 +1577,12 @@ void mock_teardown(void) */ void *mock_vmalloc(size_t size) { + void *block; + if (mock_check_error(&mock_vmalloc_errors)) return NULL; - void *block = malloc(size); - if (!block) - { + block = malloc(size); + if (!block) { FAIL("malloc failed"); return NULL; } diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 54c0662a..3448ac6a 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" #include "homa_grant.h" @@ -82,13 +80,13 @@ FIXTURE_SETUP(homa_grant) self->server_addr.in6.sin6_port = htons(self->server_port); self->data = (struct data_header){.common = { .sport = htons(self->client_port), - .dport = htons(self->server_port), + .dport = htons(self->server_port), .type = DATA, .sender_id = cpu_to_be64(self->client_id)}, .message_length = htonl(10000), .incoming = htonl(10000), .cutoff_version = 0, .ack = {0, 0, 0}, - .retransmit = 0, + .retransmit = 0, .seg = {.offset = 0}}; unit_log_clear(); self->incoming_delta = 0; @@ -113,16 +111,16 @@ static struct homa_rpc *test_rpc(FIXTURE_DATA(homa_grant) *self, TEST_F(homa_grant, homa_grant_outranks) { - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk,UNIT_OUTGOING, + struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 100, 1000, 20000); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk,UNIT_OUTGOING, + struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 102, 1000, 30000); - struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk,UNIT_OUTGOING, + struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 104, 1000, 30000); - struct homa_rpc *crpc4 = unit_client_rpc(&self->hsk,UNIT_OUTGOING, + struct homa_rpc *crpc4 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 106, 1000, 30000); @@ -423,10 +421,11 @@ TEST_F(homa_grant, homa_grant_remove_rpc__reposition_peer_in_homa_list) TEST_F(homa_grant, homa_grant_send__basics) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); + int granted; rpc->msgin.priority = 3; unit_log_clear(); - int granted = homa_grant_send(rpc, &self->homa); + granted = homa_grant_send(rpc, &self->homa); EXPECT_EQ(1, granted); EXPECT_EQ(10000, rpc->msgin.granted); EXPECT_STREQ("xmit GRANT 10000@3", unit_log_get()); @@ -434,12 +433,13 @@ TEST_F(homa_grant, homa_grant_send__basics) TEST_F(homa_grant, homa_grant_send__incoming_negative) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); + int granted; rpc->msgin.bytes_remaining = 5000; atomic_set(&self->homa.total_incoming, self->homa.max_incoming); unit_log_clear(); - int granted = homa_grant_send(rpc, &self->homa); + granted = homa_grant_send(rpc, &self->homa); EXPECT_EQ(0, granted); EXPECT_EQ(15000, rpc->msgin.granted); EXPECT_STREQ("", unit_log_get()); @@ -447,10 +447,11 @@ TEST_F(homa_grant, homa_grant_send__incoming_negative) TEST_F(homa_grant, homa_grant_send__end_of_message) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); + int granted; rpc->msgin.bytes_remaining = 5000; unit_log_clear(); - int granted = homa_grant_send(rpc, &self->homa); + granted = homa_grant_send(rpc, &self->homa); EXPECT_EQ(1, granted); 
EXPECT_EQ(20000, rpc->msgin.granted); EXPECT_STREQ("xmit GRANT 20000@0", unit_log_get()); @@ -458,13 +459,14 @@ TEST_F(homa_grant, homa_grant_send__end_of_message) TEST_F(homa_grant, homa_grant_send__not_enough_available_bytes) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); + int granted; rpc->msgin.granted = 3000; rpc->msgin.rec_incoming = 4000; atomic_set(&self->homa.total_incoming, self->homa.max_incoming - 4000); unit_log_clear(); - int granted = homa_grant_send(rpc, &self->homa); + granted = homa_grant_send(rpc, &self->homa); EXPECT_EQ(1, granted); EXPECT_EQ(8000, rpc->msgin.granted); EXPECT_STREQ("xmit GRANT 8000@0", unit_log_get()); @@ -472,10 +474,11 @@ TEST_F(homa_grant, homa_grant_send__not_enough_available_bytes) TEST_F(homa_grant, homa_grant_send__nothing_available) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); + int granted; atomic_set(&self->homa.total_incoming, self->homa.max_incoming); unit_log_clear(); - int granted = homa_grant_send(rpc, &self->homa); + granted = homa_grant_send(rpc, &self->homa); EXPECT_EQ(0, granted); EXPECT_EQ(0, rpc->msgin.granted); EXPECT_STREQ("", unit_log_get()); @@ -483,19 +486,21 @@ TEST_F(homa_grant, homa_grant_send__nothing_available) TEST_F(homa_grant, homa_grant_send__skip_because_of_silent_ticks) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); + int granted; rpc->silent_ticks = 2; unit_log_clear(); - int granted = homa_grant_send(rpc, &self->homa); + granted = homa_grant_send(rpc, &self->homa); EXPECT_EQ(0, granted); } TEST_F(homa_grant, homa_grant_send__resend_all) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); + int granted; rpc->msgin.resend_all = 1; unit_log_clear(); - int granted = homa_grant_send(rpc, &self->homa); + granted = homa_grant_send(rpc, &self->homa); EXPECT_EQ(1, granted); EXPECT_EQ(10000, rpc->msgin.granted); EXPECT_EQ(0, rpc->msgin.resend_all); @@ -521,6 +526,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__rpc_dead) struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 100, 1000, 2000); + int old_state; homa_message_in_init(rpc, 2000, 0); homa_rpc_lock(rpc, "test"); @@ -528,7 +534,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__rpc_dead) EXPECT_EQ(2000, rpc->msgin.rec_incoming); EXPECT_EQ(2000, atomic_read(&self->homa.total_incoming)); - int old_state = rpc->state; + old_state = rpc->state; rpc->state = RPC_DEAD; rpc->msgin.bytes_remaining = 0; homa_rpc_lock(rpc, "test"); @@ -859,7 +865,7 @@ TEST_F(homa_grant, homa_grant_recalc__compute_window_size) rpc3 = test_rpc(self, 100, self->server_ip, 50000); self->homa.max_incoming = 100000; - /* First try: fixed window size. */ + /* First try: fixed window size. */ homa_grantable_lock(&self->homa, 0); self->homa.window_param = 5000; homa_grant_recalc(&self->homa, 1); @@ -868,7 +874,7 @@ TEST_F(homa_grant, homa_grant_recalc__compute_window_size) EXPECT_EQ(5000, rpc2->msgin.granted); EXPECT_EQ(5000, rpc3->msgin.granted); - /* Second try: dynamic window size. */ + /* Second try: dynamic window size. 
*/ self->homa.window_param = 0; homa_grant_recalc(&self->homa, 0); EXPECT_EQ(25000, self->homa.grant_window); @@ -1132,4 +1138,4 @@ TEST_F(homa_grant, homa_grantable_lock_slow__recalc_count) EXPECT_EQ(1, homa_grantable_lock_slow(&self->homa, 0)); EXPECT_EQ(2, homa_metrics_per_cpu()->grantable_lock_misses); homa_grantable_unlock(&self->homa); -} \ No newline at end of file +} diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 4d0be4e4..f6f5c4a7 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" #include "homa_offload.h" @@ -15,11 +13,11 @@ /* The following variable (and hook function) are used to mark an RPC * ready with an error (but only if thread is sleeping). */ -struct homa_rpc *hook_rpc = NULL; -struct homa_sock *hook_hsk = NULL; -int delete_count = 0; -int lock_delete_count = 0; -int hook_granted = 0; +struct homa_rpc *hook_rpc; +struct homa_sock *hook_hsk; +int delete_count; +int lock_delete_count; +int hook_granted; void handoff_hook(char *id) { if (strcmp(id, "schedule") != 0) @@ -29,8 +27,7 @@ void handoff_hook(char *id) hook_rpc->error = -EFAULT; homa_rpc_handoff(hook_rpc); unit_log_printf("; ", - "%d in ready_requests, %d in ready_responses, " - "%d in request_interests, %d in response_interests", + "%d in ready_requests, %d in ready_responses, %d in request_interests, %d in response_interests", unit_list_length(&hook_rpc->hsk->ready_requests), unit_list_length(&hook_rpc->hsk->ready_responses), unit_list_length(&hook_rpc->hsk->request_interests), @@ -38,7 +35,7 @@ void handoff_hook(char *id) } /* The following hook function marks an RPC ready after several calls. */ -int poll_count = 0; +int poll_count; void poll_hook(char *id) { if (strcmp(id, "schedule") != 0) @@ -63,7 +60,7 @@ void handoff_hook2(char *id) } /* The following hook function first hands off an RPC, then deletes it. */ -int hook3_count = 0; +int hook3_count; void handoff_hook3(char *id) { if (hook3_count || (strcmp(id, "found_rpc") != 0)) @@ -79,9 +76,8 @@ void delete_hook(char *id) { if (strcmp(id, "schedule") != 0) return; - if (delete_count == 0) { + if (delete_count == 0) homa_rpc_free(hook_rpc); - } delete_count--; } @@ -96,7 +92,8 @@ void lock_delete_hook(char *id) } /* The following function is used via unit_hook to free an RPC after it - * has been matched in homa_wait_for_message. */ + * has been matched in homa_wait_for_message. + */ void match_free_hook(char *id) { if (strcmp(id, "found_rpc") == 0) @@ -112,14 +109,13 @@ void shutdown_hook(char *id) } /* The following hook function updates hook_rpc->msgin.granted. 
*/ -int unlock_count = 0; +int unlock_count; void unlock_hook(char *id) { if (strcmp(id, "unlock") != 0) return; - if (unlock_count == 0) { + if (unlock_count == 0) hook_rpc->msgin.granted = hook_granted; - } unlock_count--; } @@ -163,13 +159,13 @@ FIXTURE_SETUP(homa_incoming) self->server_addr.in6.sin6_port = htons(self->server_port); self->data = (struct data_header){.common = { .sport = htons(self->client_port), - .dport = htons(self->server_port), + .dport = htons(self->server_port), .type = DATA, .sender_id = cpu_to_be64(self->client_id)}, .message_length = htonl(10000), .incoming = htonl(10000), .cutoff_version = 0, .ack = {0, 0, 0}, - .retransmit = 0, + .retransmit = 0, .seg = {.offset = 0}}; unit_log_clear(); delete_count = 0; @@ -250,7 +246,7 @@ TEST_F(homa_incoming, homa_gap_retry) EXPECT_STREQ("xmit RESEND 1000-1999@7; " "xmit RESEND 4000-5999@7; " "xmit RESEND 7000-7999@7", - unit_log_get()); + unit_log_get()); } TEST_F(homa_incoming, homa_add_packet__basics) @@ -269,9 +265,8 @@ TEST_F(homa_incoming, homa_add_packet__basics) self->data.seg.offset = htonl(4200); homa_add_packet(crpc, mock_skb_new(self->client_ip, &self->data.common, 800, 4200)); - EXPECT_STREQ("start 0, end 1400, time 5000; " - "start 2800, end 4200, time 5000", - unit_print_gaps(crpc)); + EXPECT_STREQ("start 0, end 1400, time 5000; start 2800, end 4200, time 5000", + unit_print_gaps(crpc)); unit_log_clear(); self->data.seg.offset = 0; @@ -287,8 +282,8 @@ TEST_F(homa_incoming, homa_add_packet__basics) EXPECT_STREQ("", unit_print_gaps(crpc)); unit_log_clear(); unit_log_skb_list(&crpc->msgin.packets, 0); - EXPECT_STREQ("DATA 1400@1400; DATA 800@4200; DATA 1400@0; " - "DATA 1400@2800", unit_log_get()); + EXPECT_STREQ("DATA 1400@1400; DATA 800@4200; DATA 1400@0; DATA 1400@2800", + unit_log_get()); EXPECT_EQ(4, skb_queue_len(&crpc->msgin.packets)); } TEST_F(homa_incoming, homa_add_packet__packet_overlaps_message_end) @@ -548,9 +543,8 @@ TEST_F(homa_incoming, homa_add_packet__packet_in_middle_of_gap) homa_add_packet(crpc, mock_skb_new(self->client_ip, &self->data.common, 1400, 2000)); EXPECT_EQ(3, skb_queue_len(&crpc->msgin.packets)); - EXPECT_STREQ("start 1400, end 2000, time 1000; " - "start 3400, end 4200, time 1000", - unit_print_gaps(crpc)); + EXPECT_STREQ("start 1400, end 2000, time 1000; start 3400, end 4200, time 1000", + unit_print_gaps(crpc)); } TEST_F(homa_incoming, homa_add_packet__scan_multiple_gaps) { @@ -627,12 +621,9 @@ TEST_F(homa_incoming, homa_copy_to_user__basics) mock_copy_to_user_dont_copy = -1; EXPECT_EQ(0, -homa_copy_to_user(crpc)); EXPECT_STREQ("skb_copy_datagram_iter: 1400 bytes to 0x1000000: 0-1399; " - "skb_copy_datagram_iter: 648 bytes to 0x1000578: " - "101000-101647; " - "skb_copy_datagram_iter: 752 bytes to 0x1000800: " - "101648-102399; " - "skb_copy_datagram_iter: 1200 bytes to 0x1000af0: " - "201800-202999", + "skb_copy_datagram_iter: 648 bytes to 0x1000578: 101000-101647; " + "skb_copy_datagram_iter: 752 bytes to 0x1000800: 101648-102399; " + "skb_copy_datagram_iter: 1200 bytes to 0x1000af0: 201800-202999", unit_log_get()); EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); } @@ -674,22 +665,14 @@ TEST_F(homa_incoming, homa_copy_to_user__multiple_batches) unit_log_clear(); mock_copy_to_user_dont_copy = -1; EXPECT_EQ(0, -homa_copy_to_user(crpc)); - EXPECT_STREQ("skb_copy_datagram_iter: 1400 bytes to 0x1000000: " - "0-1399; " - "skb_copy_datagram_iter: 1400 bytes to 0x1000578: " - "1400-2799; " - "skb_copy_datagram_iter: 1400 bytes to 0x1000af0: " - "2800-4199; " - 
"skb_copy_datagram_iter: 1400 bytes to 0x1001068: " - "4200-5599; " - "skb_copy_datagram_iter: 1400 bytes to 0x10015e0: " - "5600-6999; " - "skb_copy_datagram_iter: 1400 bytes to 0x1001b58: " - "7000-8399; " - "skb_copy_datagram_iter: 1400 bytes to 0x10020d0: " - "8400-9799; " - "skb_copy_datagram_iter: 1400 bytes to 0x1002648: " - "9800-11199", + EXPECT_STREQ("skb_copy_datagram_iter: 1400 bytes to 0x1000000: 0-1399; " + "skb_copy_datagram_iter: 1400 bytes to 0x1000578: 1400-2799; " + "skb_copy_datagram_iter: 1400 bytes to 0x1000af0: 2800-4199; " + "skb_copy_datagram_iter: 1400 bytes to 0x1001068: 4200-5599; " + "skb_copy_datagram_iter: 1400 bytes to 0x10015e0: 5600-6999; " + "skb_copy_datagram_iter: 1400 bytes to 0x1001b58: 7000-8399; " + "skb_copy_datagram_iter: 1400 bytes to 0x10020d0: 8400-9799; " + "skb_copy_datagram_iter: 1400 bytes to 0x1002648: 9800-11199", unit_log_get()); EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); } @@ -733,18 +716,12 @@ TEST_F(homa_incoming, homa_copy_to_user__many_chunks_for_one_skb) unit_log_clear(); mock_copy_to_user_dont_copy = -1; EXPECT_EQ(0, -homa_copy_to_user(crpc)); - EXPECT_STREQ("skb_copy_datagram_iter: 512 bytes to 0x1000000: " - "101000-101511; " - "skb_copy_datagram_iter: 512 bytes to 0x1000200: " - "101512-102023; " - "skb_copy_datagram_iter: 512 bytes to 0x1000400: " - "102024-102535; " - "skb_copy_datagram_iter: 512 bytes to 0x1000600: " - "102536-103047; " - "skb_copy_datagram_iter: 512 bytes to 0x1000800: " - "103048-103559; " - "skb_copy_datagram_iter: 440 bytes to 0x1000a00: " - "103560-103999", + EXPECT_STREQ("skb_copy_datagram_iter: 512 bytes to 0x1000000: 101000-101511; " + "skb_copy_datagram_iter: 512 bytes to 0x1000200: 101512-102023; " + "skb_copy_datagram_iter: 512 bytes to 0x1000400: 102024-102535; " + "skb_copy_datagram_iter: 512 bytes to 0x1000600: 102536-103047; " + "skb_copy_datagram_iter: 512 bytes to 0x1000800: 103048-103559; " + "skb_copy_datagram_iter: 440 bytes to 0x1000a00: 103560-103999", unit_log_get()); } TEST_F(homa_incoming, homa_copy_to_user__error_in_import_single_range) @@ -927,7 +904,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__existing_client_rpc) unit_log_clear(); struct grant_header h = {{.sport = htons(self->server_port), - .dport = htons(self->hsk.port), + .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = GRANT}, .offset = htonl(12600), @@ -940,7 +917,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__existing_client_rpc) TEST_F(homa_incoming, homa_dispatch_pkts__unknown_client_rpc) { struct grant_header h = {{.sport = htons(self->server_port), - .dport = htons(self->hsk.port), + .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(99991), .type = UNKNOWN}}; @@ -952,7 +929,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_client_rpc) TEST_F(homa_incoming, homa_dispatch_pkts__unknown_server_rpc) { struct resend_header h = {{.sport = htons(self->client_port), - .dport = htons(self->server_port), + .dport = htons(self->server_port), .sender_id = cpu_to_be64(99990), .type = GRANT}}; @@ -964,10 +941,10 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_server_rpc) TEST_F(homa_incoming, homa_dispatch_pkts__cutoffs_for_unknown_client_rpc) { struct cutoffs_header h = {{.sport = htons(self->server_port), - .dport = htons(self->hsk.port), + .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(99991), .type = CUTOFFS}, - .unsched_cutoffs = {htonl(10), htonl(9), htonl(8), + .unsched_cutoffs = {htonl(10), htonl(9), htonl(8), htonl(7), htonl(6), htonl(5), htonl(4), 
htonl(3)}, .cutoff_version = 400}; @@ -985,7 +962,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__cutoffs_for_unknown_client_rpc) TEST_F(homa_incoming, homa_dispatch_pkts__resend_for_unknown_server_rpc) { struct resend_header h = {{.sport = htons(self->client_port), - .dport = htons(self->hsk.port), + .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(99990), .type = RESEND}, .offset = 0, .length = 2000, .priority = 5}; @@ -1000,7 +977,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__reset_counters) UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); struct grant_header h = {.common = {.sport = htons(self->server_port), - .dport = htons(self->hsk.port), + .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = GRANT}, .offset = htonl(12600), .priority = 3, .resend_all = 0}; @@ -1035,7 +1012,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_type) unit_log_clear(); struct common_header h = {.sport = htons(self->server_port), - .dport = htons(self->hsk.port), + .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = 99}; homa_dispatch_pkts(mock_skb_new(self->client_ip, &h, 0, 0), &self->homa); EXPECT_EQ(1, homa_metrics_per_cpu()->unknown_packet_types); @@ -1309,10 +1286,10 @@ TEST_F(homa_incoming, homa_grant_pkt__basics) self->client_ip, self->server_ip, self->client_port, self->server_id, 100, 20000); struct grant_header h = {{.sport = htons(srpc->dport), - .dport = htons(self->hsk.port), + .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->client_id), .type = GRANT}, - .offset = htonl(11000), + .offset = htonl(11000), .priority = 3, .resend_all = 0}; @@ -1351,10 +1328,10 @@ TEST_F(homa_incoming, homa_grant_pkt__reset) self->client_ip, self->server_ip, self->client_port, self->server_id, 100, 20000); struct grant_header h = {{.sport = htons(srpc->dport), - .dport = htons(self->hsk.port), + .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->client_id), .type = GRANT}, - .offset = htonl(3000), + .offset = htonl(3000), .priority = 2, .resend_all = 1}; @@ -1383,10 +1360,10 @@ TEST_F(homa_incoming, homa_grant_pkt__grant_past_end_of_message) UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); struct grant_header h = {{.sport = htons(self->server_port), - .dport = htons(self->hsk.port), + .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = GRANT}, - .offset = htonl(25000), + .offset = htonl(25000), .priority = 3}; ASSERT_NE(NULL, crpc); @@ -1399,10 +1376,10 @@ TEST_F(homa_incoming, homa_grant_pkt__grant_past_end_of_message) TEST_F(homa_incoming, homa_resend_pkt__unknown_rpc) { struct resend_header h = {{.sport = htons(self->client_port), - .dport = htons(self->server_port), + .dport = htons(self->server_port), .sender_id = cpu_to_be64(self->client_id), .type = RESEND}, - .offset = htonl(100), + .offset = htonl(100), .length = htonl(200), .priority = 3}; @@ -1413,10 +1390,10 @@ TEST_F(homa_incoming, homa_resend_pkt__unknown_rpc) TEST_F(homa_incoming, homa_resend_pkt__rpc_in_service_server_sends_busy) { struct resend_header h = {{.sport = htons(self->client_port), - .dport = htons(self->server_port), + .dport = htons(self->server_port), .sender_id = cpu_to_be64(self->client_id), .type = RESEND}, - .offset = htonl(0), + .offset = htonl(0), .length = htonl(200), .priority = 3}; struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_IN_SERVICE, @@ -1436,10 +1413,10 @@ TEST_F(homa_incoming, 
homa_resend_pkt__rpc_incoming_server_sends_busy) * everything we have granted so far. */ struct resend_header h = {{.sport = htons(self->client_port), - .dport = htons(self->server_port), + .dport = htons(self->server_port), .sender_id = cpu_to_be64(self->client_id), .type = RESEND}, - .offset = htonl(1400), + .offset = htonl(1400), .length = htonl(200), .priority = 3}; struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_RCVD_ONE_PKT, @@ -1461,10 +1438,10 @@ TEST_F(homa_incoming, homa_resend_pkt__client_not_outgoing) * server must already have received everything. */ struct resend_header h = {{.sport = htons(self->server_port), - .dport = htons(self->hsk.port), + .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = RESEND}, - .offset = htonl(100), + .offset = htonl(100), .length = htonl(200), .priority = 3}; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -1481,10 +1458,10 @@ TEST_F(homa_incoming, homa_resend_pkt__client_not_outgoing) TEST_F(homa_incoming, homa_resend_pkt__send_busy_instead_of_data) { struct resend_header h = {{.sport = htons(self->server_port), - .dport = htons(self->hsk.port), + .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = RESEND}, - .offset = htonl(100), + .offset = htonl(100), .length = htonl(200), .priority = 3}; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -1501,10 +1478,10 @@ TEST_F(homa_incoming, homa_resend_pkt__send_busy_instead_of_data) TEST_F(homa_incoming, homa_resend_pkt__client_send_data) { struct resend_header h = {{.sport = htons(self->server_port), - .dport = htons(self->hsk.port), + .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = RESEND}, - .offset = htonl(100), + .offset = htonl(100), .length = htonl(200), .priority = 3}; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -1524,10 +1501,10 @@ TEST_F(homa_incoming, homa_resend_pkt__client_send_data) TEST_F(homa_incoming, homa_resend_pkt__server_send_data) { struct resend_header h = {{.sport = htons(self->client_port), - .dport = htons(self->hsk.port), + .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->client_id), .type = RESEND}, - .offset = htonl(100), + .offset = htonl(100), .length = htonl(2000), .priority = 4}; struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, @@ -1549,7 +1526,7 @@ TEST_F(homa_incoming, homa_resend_pkt__server_send_data) TEST_F(homa_incoming, homa_unknown_pkt__client_resend_all) { struct unknown_header h = {{.sport = htons(self->server_port), - .dport = htons(self->hsk.port), + .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = UNKNOWN}}; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -1563,19 +1540,15 @@ TEST_F(homa_incoming, homa_unknown_pkt__client_resend_all) mock_xmit_log_verbose = 1; homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), &self->homa); - EXPECT_SUBSTR("xmit DATA from 0.0.0.0:32768, dport 99, id 1234, " - "message_length 2000, offset 0, data_length 1400, " - "incoming 2000, RETRANSMIT; " - "xmit DATA from 0.0.0.0:32768, dport 99, id 1234, " - "message_length 2000, offset 1400, data_length 600, " - "incoming 2000, RETRANSMIT", + EXPECT_SUBSTR("xmit DATA from 0.0.0.0:32768, dport 99, id 1234, message_length 2000, offset 0, data_length 1400, incoming 2000, RETRANSMIT; " + "xmit DATA from 0.0.0.0:32768, dport 99, id 1234, message_length 2000, offset 1400, data_length 600, incoming 2000, RETRANSMIT", unit_log_get()); EXPECT_EQ(-1, crpc->msgin.length); } 
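
[Most of the string churn in these test hunks comes from a single checkpatch.pl rule: a quoted string should not be split across source lines, because the message as it appears in test output then never occurs intact in the sources and cannot be found with grep. The fix keeps each literal whole even when the line runs past 80 columns (checkpatch tolerates long lines for unbroken strings) and wraps only the argument list, as in this call from the mock.c hunk earlier in the commit:

/* Before: flagged by checkpatch.pl because the message is split
 * across two adjacent literals.
 */
unit_log_printf("; ", "copy_from_iter needs %lu bytes, but "
		"iov_iter has only %lu", bytes, iter->count);

/* After: one unbroken literal; only the arguments wrap. */
unit_log_printf("; ", "copy_from_iter needs %lu bytes, but iov_iter has only %lu", bytes,
		iter->count);
]
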
TEST_F(homa_incoming, homa_unknown_pkt__client_resend_part) { struct unknown_header h = {{.sport = htons(self->server_port), - .dport = htons(self->hsk.port), + .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = UNKNOWN}}; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -1590,16 +1563,14 @@ TEST_F(homa_incoming, homa_unknown_pkt__client_resend_part) mock_xmit_log_verbose = 1; homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), &self->homa); - EXPECT_SUBSTR("xmit DATA from 0.0.0.0:32768, dport 99, id 1234, " - "message_length 2000, offset 0, data_length 1400, " - "incoming 1400, RETRANSMIT", + EXPECT_SUBSTR("xmit DATA from 0.0.0.0:32768, dport 99, id 1234, message_length 2000, offset 0, data_length 1400, incoming 1400, RETRANSMIT", unit_log_get()); EXPECT_EQ(-1, crpc->msgin.length); } TEST_F(homa_incoming, homa_unknown_pkt__free_server_rpc) { struct unknown_header h = {{.sport = htons(self->client_port), - .dport = htons(self->hsk2.port), + .dport = htons(self->hsk2.port), .sender_id = cpu_to_be64(self->client_id), .type = UNKNOWN}}; struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_OUTGOING, @@ -1620,10 +1591,10 @@ TEST_F(homa_incoming, homa_cutoffs_pkt_basics) UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); struct cutoffs_header h = {{.sport = htons(self->server_port), - .dport = htons(self->hsk.port), + .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = CUTOFFS}, - .unsched_cutoffs = {htonl(10), htonl(9), htonl(8), + .unsched_cutoffs = {htonl(10), htonl(9), htonl(8), htonl(7), htonl(6), htonl(5), htonl(4), htonl(3)}, .cutoff_version = 400}; @@ -1640,10 +1611,10 @@ TEST_F(homa_incoming, homa_cutoffs_pkt_basics) TEST_F(homa_incoming, homa_cutoffs__cant_find_peer) { struct cutoffs_header h = {{.sport = htons(self->server_port), - .dport = htons(self->hsk.port), + .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = CUTOFFS}, - .unsched_cutoffs = {htonl(10), htonl(9), htonl(8), + .unsched_cutoffs = {htonl(10), htonl(9), htonl(8), htonl(7), htonl(6), htonl(5), htonl(4), htonl(3)}, .cutoff_version = 400}; struct sk_buff *skb = mock_skb_new(self->server_ip, &h.common, 0, 0); @@ -1665,7 +1636,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_fully_received) self->server_port, self->client_id, 100, 3000); struct need_ack_header h = {.common = { .sport = htons(self->server_port), - .dport = htons(self->hsk.port), + .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = NEED_ACK}}; @@ -1686,7 +1657,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_not_fully_received) self->server_port, self->client_id, 100, 3000); struct need_ack_header h = {.common = { .sport = htons(self->server_port), - .dport = htons(self->hsk.port), + .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = NEED_ACK}}; @@ -1706,7 +1677,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_not_incoming) self->server_port, self->client_id, 100, 3000); struct need_ack_header h = {.common = { .sport = htons(self->server_port), - .dport = htons(self->hsk.port), + .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = NEED_ACK}}; @@ -1725,7 +1696,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_doesnt_exist) self->server_ip, &self->hsk.inet); struct need_ack_header h = {.common = { .sport = htons(self->server_port), - .dport = htons(self->hsk.port), + .dport = 
htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = NEED_ACK}}; @@ -1736,8 +1707,8 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_doesnt_exist) mock_xmit_log_verbose = 1; homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), &self->homa); - EXPECT_STREQ("xmit ACK from 0.0.0.0:32768, dport 99, id 1234, " - "acks [cp 40000, sp 99, id 1236]", unit_log_get()); + EXPECT_STREQ("xmit ACK from 0.0.0.0:32768, dport 99, id 1234, acks [cp 40000, sp 99, id 1236]", + unit_log_get()); } TEST_F(homa_incoming, homa_ack_pkt__target_rpc_exists) @@ -1747,7 +1718,7 @@ TEST_F(homa_incoming, homa_ack_pkt__target_rpc_exists) self->server_id, 100, 5000); struct ack_header h = {.common = { .sport = htons(self->client_port), - .dport = htons(self->hsk2.port), + .dport = htons(self->hsk2.port), .sender_id = cpu_to_be64(self->client_id), .type = ACK}, .num_acks = htons(0)}; @@ -1771,7 +1742,7 @@ TEST_F(homa_incoming, homa_ack_pkt__target_rpc_doesnt_exist) self->server_id+2, 100, 5000); struct ack_header h = {.common = { .sport = htons(self->client_port + 1), - .dport = htons(self->hsk2.port), + .dport = htons(self->hsk2.port), .sender_id = cpu_to_be64(self->client_id), .type = ACK}, .num_acks = htons(2)}; @@ -1782,11 +1753,11 @@ TEST_F(homa_incoming, homa_ack_pkt__target_rpc_doesnt_exist) unit_log_clear(); mock_xmit_log_verbose = 1; h.acks[0] = (struct homa_ack) {.client_port = htons(self->client_port), - .server_port = htons(self->server_port), - .client_id = cpu_to_be64(self->server_id+5)}; + .server_port = htons(self->server_port), + .client_id = cpu_to_be64(self->server_id+5)}; h.acks[1] = (struct homa_ack) {.client_port = htons(self->client_port), - .server_port = htons(self->server_port), - .client_id = cpu_to_be64(self->server_id+1)}; + .server_port = htons(self->server_port), + .client_id = cpu_to_be64(self->server_id+1)}; homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_EQ(1, unit_list_length(&self->hsk2.active_rpcs)); @@ -1959,7 +1930,7 @@ TEST_F(homa_incoming, homa_abort_rpcs__free_server_rpc) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->client_port, - self->server_id, 20000, 100); + self->server_id, 20000, 100); ASSERT_NE(NULL, srpc); unit_log_clear(); @@ -1977,7 +1948,7 @@ TEST_F(homa_incoming, homa_abort_sock_rpcs__basics) self->server_port+1, self->client_id+2, 5000, 1600); struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, - self->server_id, 20000, 100); + self->server_id, 20000, 100); ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); @@ -2157,7 +2128,7 @@ TEST_F(homa_incoming, homa_register_interests__return_queued_request) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->client_port, - 1, 20000, 100); + 1, 20000, 100); int result; ASSERT_NE(NULL, srpc); @@ -2174,10 +2145,10 @@ TEST_F(homa_incoming, homa_register_interests__call_sk_data_ready) { struct homa_rpc *srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->client_port, - self->server_id, 20000, 100); + self->server_id, 20000, 100); struct homa_rpc *srpc2 = unit_server_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->client_port, - self->server_id+2, 20000, 100); + self->server_id+2, 20000, 100); int result; // First time should call sk_data_ready (for 2nd RPC). 
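
[Another transformation repeated throughout this commit is checkpatch.pl's declaration style: every local variable is declared at the top of its block, a single blank line separates the declarations from the first statement, and a declaration that used to be initialized mid-function becomes a bare declaration plus an assignment at the original spot. The homa_grant_send tests earlier in the commit are typical; schematically (a hedged reconstruction, not an exact hunk):

/* Before: a declaration appears after executable statements, which
 * checkpatch.pl rejects.
 */
rpc->msgin.priority = 3;
unit_log_clear();
int granted = homa_grant_send(rpc, &self->homa);
EXPECT_EQ(1, granted);

/* After: declare up front, leave a blank line, then assign where the
 * old declaration stood.
 */
int granted;

rpc->msgin.priority = 3;
unit_log_clear();
granted = homa_grant_send(rpc, &self->homa);
EXPECT_EQ(1, granted);
]
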
@@ -2275,7 +2246,7 @@ TEST_F(homa_incoming, homa_wait_for_message__rpc_arrives_while_sleeping) ASSERT_NE(NULL, crpc1); - /* Also, check to see that reaping occurs before sleeping. */ + /* Also, check to see that reaping occurs before sleeping. */ struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id+2, 20000, 20000); @@ -2289,9 +2260,8 @@ TEST_F(homa_incoming, homa_wait_for_message__rpc_arrives_while_sleeping) rpc = homa_wait_for_message(&self->hsk, 0, self->client_id); EXPECT_EQ(crpc1, rpc); EXPECT_EQ(NULL, crpc1->interest); - EXPECT_STREQ("reaped 1236; wake_up_process pid 0; 0 in ready_requests, " - "0 in ready_responses, 0 in request_interests, " - "0 in response_interests", unit_log_get()); + EXPECT_STREQ("reaped 1236; wake_up_process pid 0; 0 in ready_requests, 0 in ready_responses, 0 in request_interests, 0 in response_interests", + unit_log_get()); EXPECT_EQ(0, self->hsk.dead_skbs); homa_rpc_unlock(rpc); } @@ -2637,7 +2607,7 @@ TEST_F(homa_incoming, homa_rpc_handoff__request_interests) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, - self->server_id, 20000, 100); + self->server_id, 20000, 100); struct homa_interest interest; ASSERT_NE(NULL, srpc); @@ -2656,7 +2626,7 @@ TEST_F(homa_incoming, homa_rpc_handoff__queue_on_ready_requests) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, - 1, 20000, 100); + 1, 20000, 100); ASSERT_NE(NULL, srpc); unit_log_clear(); diff --git a/test/unit_homa_metrics.c b/test/unit_homa_metrics.c index 4bcb73cb..e4a7df0a 100644 --- a/test/unit_homa_metrics.c +++ b/test/unit_homa_metrics.c @@ -38,8 +38,7 @@ TEST_F(homa_metrics, homa_metric_append) homa_metric_append(&self->homa, ", q: %050d", 88); EXPECT_EQ(77, self->homa.metrics_length); - EXPECT_STREQ("x: 10, y: 20, z: 12345, " - "q: 00000000000000000000000000000000000000000000000088", + EXPECT_STREQ("x: 10, y: 20, z: 12345, q: 00000000000000000000000000000000000000000000000088", self->homa.metrics); EXPECT_EQ(120, self->homa.metrics_capacity); } @@ -97,4 +96,4 @@ TEST_F(homa_metrics, homa_metrics_release) EXPECT_EQ(0, homa_metrics_release(NULL, NULL)); EXPECT_EQ(0, self->homa.metrics_active_opens); -} \ No newline at end of file +} diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index 523ac1e9..4cf374ee 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" #include "homa_offload.h" @@ -28,7 +26,7 @@ static struct sk_buff *tcp6_gro_receive(struct list_head *held_list, return NULL; } - FIXTURE(homa_offload) +FIXTURE(homa_offload) { struct homa homa; struct homa_sock hsk; @@ -70,9 +68,9 @@ FIXTURE_SETUP(homa_offload) NAPI_GRO_CB(self->skb)->same_flow = 0; NAPI_GRO_CB(self->skb)->last = self->skb; NAPI_GRO_CB(self->skb)->count = 1; - self->header.seg.offset = htonl(4000); - self->header.common.dport = htons(88); - self->header.common.sender_id = cpu_to_be64(1002); + self->header.seg.offset = htonl(4000); + self->header.common.dport = htons(88); + self->header.common.sender_id = cpu_to_be64(1002); self->skb2 = mock_skb_new(&self->ip, &self->header.common, 1400, 0); NAPI_GRO_CB(self->skb2)->same_flow = 0; NAPI_GRO_CB(self->skb2)->last = self->skb2; @@ -97,7 +95,7 @@ 
FIXTURE_SETUP(homa_offload) } FIXTURE_TEARDOWN(homa_offload) { - struct sk_buff *skb, *tmp; + struct sk_buff *skb, *tmp; homa_offload_end(); list_for_each_entry_safe(skb, tmp, &self->napi.gro_hash[2].list, list) @@ -204,13 +202,14 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_homa_ipv4) TEST_F(homa_offload, homa_gso_segment_set_ip_ids) { - struct sk_buff *skb; - mock_ipv6 = false; + struct sk_buff *skb, *segs; + int version; + mock_ipv6 = false; skb = mock_skb_new(&self->ip, &self->header.common, 1400, 2000); - int version = ip_hdr(skb)->version; + version = ip_hdr(skb)->version; EXPECT_EQ(4, version); - struct sk_buff *segs = homa_gso_segment(skb, 0); + segs = homa_gso_segment(skb, 0); ASSERT_NE(NULL, segs); ASSERT_NE(NULL, segs->next); EXPECT_EQ(NULL, segs->next->next); @@ -253,7 +252,7 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS) { struct in6_addr client_ip = unit_get_in_addr("196.168.0.1"); struct in6_addr server_ip = unit_get_in_addr("1.2.3.4"); - struct sk_buff *skb, *skb2, *skb3, *skb4; + struct sk_buff *skb, *skb2, *skb3, *skb4, *result; int client_port = 40000; __u64 client_id = 1234; __u64 server_id = 1235; @@ -269,7 +268,7 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS) h.incoming = htonl(10000); h.cutoff_version = 0; h.ack.client_id = 0; - h.ack.client_port= 0; + h.ack.client_port = 0; h.ack.server_port = 0; h.retransmit = 0; h.seg.offset = htonl(2000); @@ -282,7 +281,7 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS) /* First attempt: HOMA_GRO_SHORT_BYPASS not enabled. */ skb = mock_skb_new(&self->ip, &h.common, 1400, 2000); - struct sk_buff *result = homa_gro_receive(&self->empty_list, skb); + result = homa_gro_receive(&self->empty_list, skb); EXPECT_EQ(0, -PTR_ERR(result)); EXPECT_EQ(0, homa_metrics_per_cpu()->gro_data_bypasses); @@ -320,6 +319,7 @@ TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization) { struct in6_addr client_ip = unit_get_in_addr("196.168.0.1"); struct in6_addr server_ip = unit_get_in_addr("1.2.3.4"); + struct sk_buff *skb, *skb2, *skb3, *result; int client_port = 40000; __u64 client_id = 1234; __u64 server_id = 1235; @@ -343,8 +343,8 @@ TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization) /* First attempt: HOMA_GRO_FAST_GRANTS not enabled. */ self->homa.gro_policy = 0; - struct sk_buff *skb = mock_skb_new(&client_ip, &h.common, 0, 0); - struct sk_buff *result = homa_gro_receive(&self->empty_list, skb); + skb = mock_skb_new(&client_ip, &h.common, 0, 0); + result = homa_gro_receive(&self->empty_list, skb); EXPECT_EQ(0, -PTR_ERR(result)); EXPECT_EQ(0, homa_metrics_per_cpu()->gro_grant_bypasses); EXPECT_STREQ("", unit_log_get()); @@ -352,7 +352,7 @@ TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization) /* Second attempt: HOMA_FAST_GRANTS is enabled. */ self->homa.gro_policy = HOMA_GRO_FAST_GRANTS; cur_offload_core->last_gro = 400; - struct sk_buff *skb2 = mock_skb_new(&client_ip, &h.common, 0, 0); + skb2 = mock_skb_new(&client_ip, &h.common, 0, 0); result = homa_gro_receive(&self->empty_list, skb2); EXPECT_EQ(EINPROGRESS, -PTR_ERR(result)); EXPECT_EQ(1, homa_metrics_per_cpu()->gro_grant_bypasses); @@ -360,7 +360,7 @@ TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization) /* Third attempt: core is too busy for fast grants. 
*/ cur_offload_core->last_gro = 600; - struct sk_buff *skb3 = mock_skb_new(&client_ip, &h.common, 0, 0); + skb3 = mock_skb_new(&client_ip, &h.common, 0, 0); result = homa_gro_receive(&self->empty_list, skb3); EXPECT_EQ(0, -PTR_ERR(result)); EXPECT_EQ(1, homa_metrics_per_cpu()->gro_grant_bypasses); @@ -470,12 +470,8 @@ TEST_F(homa_offload, homa_gro_receive__merge) EXPECT_EQ(3, NAPI_GRO_CB(self->skb2)->count); unit_log_frag_list(self->skb2, 1); - EXPECT_STREQ("DATA from 196.168.0.1:40000, dport 88, id 1002, " - "message_length 10000, offset 6000, " - "data_length 1400, incoming 10000; " - "DATA from 196.168.0.1:40000, dport 88, id 1004, " - "message_length 10000, offset 7000, " - "data_length 1400, incoming 10000", + EXPECT_STREQ("DATA from 196.168.0.1:40000, dport 88, id 1002, message_length 10000, offset 6000, data_length 1400, incoming 10000; " + "DATA from 196.168.0.1:40000, dport 88, id 1004, message_length 10000, offset 7000, data_length 1400, incoming 10000", unit_log_get()); } TEST_F(homa_offload, homa_gro_receive__max_gro_skbs) @@ -583,8 +579,9 @@ TEST_F(homa_offload, homa_gro_gen3__basics) } TEST_F(homa_offload, homa_gro_gen3__stop_on_negative_core_id) { - homa->gro_policy = HOMA_GRO_GEN3; struct homa_offload_core *offload_core = cur_offload_core; + + homa->gro_policy = HOMA_GRO_GEN3; offload_core->gen3_softirq_cores[0] = 3; offload_core->gen3_softirq_cores[1] = -1; offload_core->gen3_softirq_cores[2] = 5; @@ -599,8 +596,9 @@ TEST_F(homa_offload, homa_gro_gen3__stop_on_negative_core_id) } TEST_F(homa_offload, homa_gro_gen3__all_cores_busy_so_pick_first) { - homa->gro_policy = HOMA_GRO_GEN3; struct homa_offload_core *offload_core = cur_offload_core; + + homa->gro_policy = HOMA_GRO_GEN3; offload_core->gen3_softirq_cores[0] = 3; offload_core->gen3_softirq_cores[1] = 7; offload_core->gen3_softirq_cores[2] = 5; diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 1bf6af8e..4abc8403 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" #include "homa_peer.h" @@ -13,7 +11,7 @@ #include "utils.h" /* The following hook function frees hook_rpc. 
*/ -static struct homa_rpc *hook_rpc = NULL; +static struct homa_rpc *hook_rpc; static void unlock_hook(char *id) { if (strcmp(id, "unlock") != 0) @@ -93,9 +91,10 @@ TEST_F(homa_outgoing, homa_fill_data_interleaved) { struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); + struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); + char buffer[1000]; homa_rpc_unlock(crpc); - struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); homa_message_out_init(crpc, 10000); unit_log_clear(); @@ -106,12 +105,8 @@ TEST_F(homa_outgoing, homa_fill_data_interleaved) "_copy_from_iter 1500 bytes at 4000; " "_copy_from_iter 500 bytes at 5500", unit_log_get()); - char buffer[1000]; - EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, " - "message_length 10000, offset 10000, data_length 1500, " - "incoming 10000, extra segs 1500@11500 1500@13000 " - "500@14500", - homa_print_packet(skb, buffer, sizeof(buffer))); + EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, message_length 10000, offset 10000, data_length 1500, incoming 10000, extra segs 1500@11500 1500@13000 500@14500", + homa_print_packet(skb, buffer, sizeof(buffer))); EXPECT_EQ(5000 + sizeof32(struct data_header) + 3*sizeof32(struct seg_header), skb->len); kfree_skb(skb); @@ -147,9 +142,7 @@ TEST_F(homa_outgoing, homa_new_data_packet__one_segment) skb = homa_new_data_packet(crpc, iter, 5000, 500, 2000); EXPECT_STREQ("_copy_from_iter 500 bytes at 1000", unit_log_get()); - EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, " - "message_length 500, offset 5000, data_length 500, " - "incoming 500", + EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, message_length 500, offset 5000, data_length 500, incoming 500", homa_print_packet(skb, buffer, sizeof(buffer))); EXPECT_EQ(0, skb_shinfo(skb)->gso_segs); @@ -177,6 +170,7 @@ TEST_F(homa_outgoing, homa_new_data_packet__multiple_segments_homa_fill_data_int struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); struct sk_buff *skb; + char buffer[1000]; homa_rpc_unlock(crpc); homa_message_out_init(crpc, 10000); @@ -187,13 +181,8 @@ TEST_F(homa_outgoing, homa_new_data_packet__multiple_segments_homa_fill_data_int "_copy_from_iter 1500 bytes at 2500; " "_copy_from_iter 1500 bytes at 4000; " "_copy_from_iter 500 bytes at 5500", unit_log_get()); - - char buffer[1000]; - EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, " - "message_length 10000, offset 10000, data_length 1500, " - "incoming 10000, extra segs 1500@11500 1500@13000 " - "500@14500", - homa_print_packet(skb, buffer, sizeof(buffer))); + EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, message_length 10000, offset 10000, data_length 1500, incoming 10000, extra segs 1500@11500 1500@13000 500@14500", + homa_print_packet(skb, buffer, sizeof(buffer))); EXPECT_EQ(4*(sizeof(struct data_header) + crpc->hsk->ip_header_length + HOMA_ETH_OVERHEAD) + 5000, @@ -235,11 +224,8 @@ TEST_F(homa_outgoing, homa_new_data_packet__multiple_segments_tcp_hijacking) skb = homa_new_data_packet(crpc, iter, 10000, 5000, 1500); EXPECT_STREQ("_copy_from_iter 5000 bytes at 1000", unit_log_get()); - EXPECT_STREQ("DATA from 0.0.0.0:40001, dport 99, id 2, " - "message_length 10000, offset 10000, data_length 1500, " - "incoming 10000, extra segs 1500@11500 1500@13000 " - "500@14500", - homa_print_packet(skb, buffer, sizeof(buffer))); + EXPECT_STREQ("DATA from 0.0.0.0:40001, dport 99, id 2, message_length 10000, offset 10000, data_length 1500, incoming 10000, extra segs 1500@11500 1500@13000 500@14500", + 
homa_print_packet(skb, buffer, sizeof(buffer))); kfree_skb(skb); homa_sock_destroy(&hsk); } @@ -312,15 +298,9 @@ TEST_F(homa_outgoing, homa_message_out_fill__basics) "_copy_from_iter 200 bytes at 3800", unit_log_get()); unit_log_clear(); unit_log_message_out_packets(&crpc->msgout, 1); - EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, " - "message_length 3000, offset 0, data_length 1400, " - "incoming 3000; " - "DATA from 0.0.0.0:40000, dport 99, id 2, " - "message_length 3000, offset 1400, data_length 1400, " - "incoming 3000; " - "DATA from 0.0.0.0:40000, dport 99, id 2, " - "message_length 3000, offset 2800, data_length 200, " - "incoming 3000", + EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, message_length 3000, offset 0, data_length 1400, incoming 3000; " + "DATA from 0.0.0.0:40000, dport 99, id 2, message_length 3000, offset 1400, data_length 1400, incoming 3000; " + "DATA from 0.0.0.0:40000, dport 99, id 2, message_length 3000, offset 2800, data_length 200, incoming 3000", unit_log_get()); EXPECT_EQ(3, crpc->msgout.num_skbs); EXPECT_EQ(3000, crpc->msgout.copied_from_user); @@ -501,8 +481,7 @@ TEST_F(homa_outgoing, homa_xmit_control__server_request) h.common.sender_id = cpu_to_be64(self->client_id); mock_xmit_log_verbose = 1; EXPECT_EQ(0, homa_xmit_control(GRANT, &h, sizeof(h), srpc)); - EXPECT_STREQ("xmit GRANT from 0.0.0.0:99, dport 40000, id 1235, " - "offset 12345, grant_prio 4", + EXPECT_STREQ("xmit GRANT from 0.0.0.0:99, dport 40000, id 1235, offset 12345, grant_prio 4", unit_log_get()); EXPECT_STREQ("7", mock_xmit_prios); } @@ -522,8 +501,7 @@ TEST_F(homa_outgoing, homa_xmit_control__client_response) h.resend_all = 0; mock_xmit_log_verbose = 1; EXPECT_EQ(0, homa_xmit_control(GRANT, &h, sizeof(h), crpc)); - EXPECT_STREQ("xmit GRANT from 0.0.0.0:40000, dport 99, id 1234, " - "offset 12345, grant_prio 4", + EXPECT_STREQ("xmit GRANT from 0.0.0.0:40000, dport 99, id 1234, offset 12345, grant_prio 4", unit_log_get()); EXPECT_STREQ("7", mock_xmit_prios); } @@ -614,10 +592,10 @@ TEST_F(homa_outgoing, __homa_xmit_control__ipv6_error) TEST_F(homa_outgoing, homa_xmit_unknown) { struct grant_header h = {{.sport = htons(self->client_port), - .dport = htons(self->server_port), + .dport = htons(self->server_port), .sender_id = cpu_to_be64(99990), .type = GRANT}, - .offset = htonl(11200), + .offset = htonl(11200), .priority = 3, .resend_all = 0}; struct sk_buff *skb; @@ -790,7 +768,7 @@ TEST_F(homa_outgoing, __homa_xmit_data__ipv4_transmit_error) homa_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, &self->homa, self->client_port); - crpc = unit_client_rpc(&self->hsk,UNIT_OUTGOING, self->client_ip, + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 1000, 1000); unit_log_clear(); @@ -834,15 +812,9 @@ TEST_F(homa_outgoing, homa_resend_data__basics) skb_push(crpc->msgout.packets, 8); homa_resend_data(crpc, 7000, 10000, 2); - EXPECT_STREQ("xmit DATA from 0.0.0.0:40000, dport 99, id 1234, " - "message_length 16000, offset 7000, data_length 1400, " - "incoming 10000, RETRANSMIT; " - "xmit DATA from 0.0.0.0:40000, dport 99, id 1234, " - "message_length 16000, offset 8400, data_length 1400, " - "incoming 10000, RETRANSMIT; " - "xmit DATA from 0.0.0.0:40000, dport 99, id 1234, " - "message_length 16000, offset 9800, data_length 200, " - "incoming 10000, RETRANSMIT", + EXPECT_STREQ("xmit DATA from 0.0.0.0:40000, dport 99, id 1234, message_length 16000, offset 7000, data_length 1400, incoming 10000, RETRANSMIT; " + 
"xmit DATA from 0.0.0.0:40000, dport 99, id 1234, message_length 16000, offset 8400, data_length 1400, incoming 10000, RETRANSMIT; " + "xmit DATA from 0.0.0.0:40000, dport 99, id 1234, message_length 16000, offset 9800, data_length 200, incoming 10000, RETRANSMIT", unit_log_get()); EXPECT_STREQ("2 2 2", mock_xmit_prios); @@ -946,8 +918,7 @@ TEST_F(homa_outgoing, homa_resend_data__set_homa_info) mock_xmit_log_homa_info = 1; homa_resend_data(crpc, 8400, 8800, 2); EXPECT_STREQ("xmit DATA retrans 1400@8400; " - "homa_info: wire_bytes 1538, data_bytes 1400, " - "seg_length 1400, offset 8400", + "homa_info: wire_bytes 1538, data_bytes 1400, seg_length 1400, offset 8400", unit_log_get()); } diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index aea66c4a..c076be2d 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2022 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" #include "homa_peer.h" @@ -79,16 +77,18 @@ TEST_F(homa_peer, homa_peer_find__basics) } static struct _test_data_homa_peer *test_data; -static struct homa_peer *conflicting_peer = NULL; -static int peer_lock_hook_invocations = 0; -static void peer_lock_hook(char *id) { +static struct homa_peer *conflicting_peer; +static int peer_lock_hook_invocations; +static void peer_lock_hook(char *id) +{ if (strcmp(id, "spin_lock") != 0) return; if (peer_lock_hook_invocations > 0) return; - peer_lock_hook_invocations ++; + peer_lock_hook_invocations++; /* Creates a peer with the same address as the one being created - * by the main test function below. */ + * by the main test function below. + */ conflicting_peer = homa_peer_find(&test_data->peertab, ip3333, &test_data->hsk.inet); } @@ -135,6 +135,7 @@ TEST_F(homa_peer, homa_peertab_get_peers__not_init) TEST_F(homa_peer, homa_peertab_get_peers__table_empty) { int num_peers = 45; + EXPECT_EQ(NULL, homa_peertab_get_peers(&self->peertab, &num_peers)); EXPECT_EQ(0, num_peers); } @@ -397,10 +398,7 @@ TEST_F(homa_peer, homa_peer_add_ack) mock_xmit_log_verbose = 1; homa_peer_add_ack(crpc3); EXPECT_EQ(0, peer->num_acks); - EXPECT_STREQ("xmit ACK from 0.0.0.0:32768, dport 99, id 103, acks " - "[cp 1000, sp 99, id 90] [cp 1001, sp 99, id 91] " - "[cp 1002, sp 99, id 92] [cp 32768, sp 99, id 101] " - "[cp 32768, sp 99, id 102]", + EXPECT_STREQ("xmit ACK from 0.0.0.0:32768, dport 99, id 103, acks [cp 1000, sp 99, id 90] [cp 1001, sp 99, id 91] [cp 1002, sp 99, id 92] [cp 32768, sp 99, id 101] [cp 32768, sp 99, id 102]", unit_log_get()); } diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index fa665816..9f152b22 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" #include "homa_peer.h" @@ -14,7 +12,7 @@ extern struct homa *homa; /* The following hook function frees hook_rpc. 
*/ -static struct homa_rpc *hook_rpc = NULL; +static struct homa_rpc *hook_rpc; static void unlock_hook(char *id) { if (strcmp(id, "unlock") != 0) @@ -70,12 +68,12 @@ FIXTURE_SETUP(homa_plumbing) homa_sock_bind(self->homa.port_map, &self->hsk, self->server_port); self->data = (struct data_header){.common = { .sport = htons(self->client_port), - .dport = htons(self->server_port), + .dport = htons(self->server_port), .type = DATA, .sender_id = cpu_to_be64(self->client_id)}, .message_length = htonl(10000), .incoming = htonl(10000), .retransmit = 0, - .seg={.offset = 0}}; + .seg = {.offset = 0}}; self->recvmsg_args.id = 0; self->recvmsg_hdr.msg_name = &self->addr; self->recvmsg_hdr.msg_namelen = 0; @@ -367,7 +365,7 @@ TEST_F(homa_plumbing, homa_sendmsg__response_nonzero_completion_cookie) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, self->server_ip, self->client_port, - self->server_id, 2000, 100); + self->server_id, 2000, 100); self->sendmsg_args.id = self->server_id; self->sendmsg_args.completion_cookie = 12345; @@ -380,7 +378,7 @@ TEST_F(homa_plumbing, homa_sendmsg__response_cant_find_rpc) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, self->server_ip, self->client_port, - self->server_id, 2000, 100); + self->server_id, 2000, 100); self->sendmsg_args.id = self->server_id + 1; EXPECT_EQ(0, -homa_sendmsg(&self->hsk.inet.sk, @@ -392,7 +390,7 @@ TEST_F(homa_plumbing, homa_sendmsg__response_error_in_rpc) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, self->server_ip, self->client_port, - self->server_id, 2000, 100); + self->server_id, 2000, 100); self->sendmsg_args.id = srpc->id; srpc->error = -ENOMEM; @@ -405,7 +403,7 @@ TEST_F(homa_plumbing, homa_sendmsg__response_wrong_state) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, - self->server_id, 2000, 100); + self->server_id, 2000, 100); self->sendmsg_args.id = self->server_id; EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, @@ -417,7 +415,7 @@ TEST_F(homa_plumbing, homa_sendmsg__homa_message_out_fill_returns_error) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, self->server_ip, self->client_port, - self->server_id, 2000, 100); + self->server_id, 2000, 100); self->sendmsg_args.id = self->server_id; self->sendmsg_hdr.msg_iter.count = HOMA_MAX_MESSAGE_LENGTH + 1; @@ -430,7 +428,7 @@ TEST_F(homa_plumbing, homa_sendmsg__rpc_freed_during_homa_message_out_fill) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, self->server_ip, self->client_port, - self->server_id, 2000, 100); + self->server_id, 2000, 100); unit_hook_register(unlock_hook); hook_rpc = srpc; @@ -445,7 +443,7 @@ TEST_F(homa_plumbing, homa_sendmsg__response_succeeds) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, self->server_ip, self->client_port, - self->server_id, 2000, 100); + self->server_id, 2000, 100); self->sendmsg_args.id = self->server_id; EXPECT_EQ(0, -homa_sendmsg(&self->hsk.inet.sk, @@ -633,7 +631,7 @@ TEST_F(homa_plumbing, homa_recvmsg__server_normal_completion) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 200); + self->server_id, 100, 200); EXPECT_NE(NULL, srpc); EXPECT_EQ(100, homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, @@ -647,7 +645,7 @@ 
TEST_F(homa_plumbing, homa_recvmsg__delete_server_rpc_after_error) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 200); + self->server_id, 100, 200); EXPECT_NE(NULL, srpc); srpc->error = -ENOMEM; diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index 6ebdc7d9..1ad35fc5 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2022-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" #include "homa_pool.h" @@ -308,7 +306,7 @@ TEST_F(homa_pool, homa_pool_allocate__owned_page_locked_and_page_stolen) // Try a second allocation; the lock hook steals the partial bpage, // so a new one has to be allocated. crpc->msgin.num_bpages = 0; - mock_trylock_errors = 1; + mock_trylock_errors = 1; unit_hook_register(change_owner_hook); EXPECT_EQ(0, homa_pool_allocate(crpc)); EXPECT_EQ(1, crpc->msgin.num_bpages); @@ -488,7 +486,7 @@ TEST_F(homa_pool, homa_pool_check_waiting__basics) struct homa_pool *pool = self->hsk.buffer_pool; struct homa_rpc *crpc2, *crpc3; - /* Queue up 2 RPCs that together need a total of 5 bpages. */ + /* Queue up 2 RPCs that together need a total of 5 bpages. */ atomic_set(&pool->free_bpages, 0); crpc2 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, 4000, 98, 1000, 3*HOMA_BPAGE_SIZE); @@ -575,7 +573,7 @@ TEST_F(homa_pool, homa_pool_check_waiting__wake_up_waiting_rpc) struct homa_pool *pool = self->hsk.buffer_pool; struct homa_rpc *crpc; - /* Queue up an RPC that needs 2 bpages. */ + /* Queue up an RPC that needs 2 bpages. */ atomic_set(&pool->free_bpages, 0); crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, 4000, 98, 1000, 2*HOMA_BPAGE_SIZE); @@ -595,7 +593,7 @@ TEST_F(homa_pool, homa_pool_check_waiting__reallocation_fails) struct homa_pool *pool = self->hsk.buffer_pool; struct homa_rpc *crpc; - /* Queue up an RPC that needs 4 bpages. */ + /* Queue up an RPC that needs 4 bpages. 
*/ atomic_set(&pool->free_bpages, 0); crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, 4000, 98, 1000, 4*HOMA_BPAGE_SIZE); @@ -609,4 +607,4 @@ TEST_F(homa_pool, homa_pool_check_waiting__reallocation_fails) EXPECT_EQ(0, crpc->msgin.num_bpages); EXPECT_STREQ("", unit_log_get()); EXPECT_EQ(4, pool->bpages_needed); -} \ No newline at end of file +} diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index dfe662e7..23aa15aa 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" #include "homa_peer.h" @@ -44,13 +42,13 @@ FIXTURE_SETUP(homa_rpc) mock_sock_init(&self->hsk, &self->homa, 0); self->data = (struct data_header){.common = { .sport = htons(self->client_port), - .dport = htons(self->server_port), + .dport = htons(self->server_port), .type = DATA, .sender_id = self->client_id}, .message_length = htonl(10000), .incoming = htonl(10000), .cutoff_version = 0, .ack = {0, 0, 0}, - .retransmit = 0, + .retransmit = 0, .seg = {.offset = 0}}; self->iovec.iov_base = (void *) 2000; self->iovec.iov_len = 10000; @@ -242,7 +240,7 @@ TEST_F(homa_rpc, homa_rpc_new_server__dont_handoff_no_buffers) int created; self->data.message_length = N(1400); - atomic_set(&self->hsk.buffer_pool->free_bpages,0 ); + atomic_set(&self->hsk.buffer_pool->free_bpages, 0); srpc = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, &created); ASSERT_FALSE(IS_ERR(srpc)); @@ -451,7 +449,7 @@ TEST_F(homa_rpc, homa_rpc_free__free_gaps) unit_print_gaps(crpc)); homa_rpc_free(crpc); - /* (Test infrastructure will complain if gaps aren't freed) */ + /* (Test infrastructure will complain if gaps aren't freed) */ } TEST_F(homa_rpc, homa_rpc_free__dead_buffs) { diff --git a/test/unit_homa_skb.c b/test/unit_homa_skb.c index 69bf2749..4806dcbe 100644 --- a/test/unit_homa_skb.c +++ b/test/unit_homa_skb.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2022-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" #include "homa_skb.h" @@ -63,7 +61,7 @@ static void add_to_pool(struct homa *homa, int num_pages, int core) } } -static struct homa_page_pool *hook_pool = NULL; +static struct homa_page_pool *hook_pool; /* Used to remove a page from hook_pool when a lock is acquired. 
*/ static void spinlock_hook(char *id) @@ -99,7 +97,7 @@ TEST_F(homa_skb, homa_skb_init) homa_skb_cleanup(&self->homa); EXPECT_EQ(NULL, self->homa.page_pools[0]); mock_numa_mask = 0x83; - homa_skb_init(&self->homa); + homa_skb_init(&self->homa); EXPECT_NE(NULL, self->homa.page_pools[0]); EXPECT_NE(NULL, self->homa.page_pools[1]); EXPECT_EQ(NULL, self->homa.page_pools[2]); @@ -207,9 +205,10 @@ TEST_F(homa_skb, homa_skb_extend_frags__cant_merge_allocate_new_page) struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); struct sk_buff *skb2 = alloc_skb_fclone(200, GFP_KERNEL); char *p1, *p2, *p3; + int length; ASSERT_NE(NULL, skb2); - int length = 1000; + length = 1000; p1 = homa_skb_extend_frags(&self->homa, self->skb, &length); EXPECT_EQ(1000, length); EXPECT_NE(NULL, p1); @@ -238,9 +237,10 @@ TEST_F(homa_skb, homa_skb_extend_frags__cant_merge_use_same_page_reduce_length) struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); struct sk_buff *skb2 = alloc_skb_fclone(200, GFP_KERNEL); char *p1, *p2, *p3; + int length; ASSERT_NE(NULL, skb2); - int length = 1000; + length = 1000; p1 = homa_skb_extend_frags(&self->homa, self->skb, &length); EXPECT_EQ(1000, length); EXPECT_NE(NULL, p1); @@ -413,7 +413,7 @@ TEST_F(homa_skb, homa_skb_append_from_iter__basics) EXPECT_EQ(0, homa_skb_append_from_iter(&self->homa, self->skb, iter, 2000)); EXPECT_STREQ("_copy_from_iter 2000 bytes at 1000", - unit_log_get()); + unit_log_get()); /* Second append spills into a new frag. */ skb_core->page_size = 4096; @@ -422,7 +422,7 @@ TEST_F(homa_skb, homa_skb_append_from_iter__basics) 3000)); EXPECT_STREQ("_copy_from_iter 2096 bytes at 3000; " "_copy_from_iter 904 bytes at 5096", - unit_log_get()); + unit_log_get()); EXPECT_EQ(2, shinfo->nr_frags); EXPECT_EQ(4096, skb_frag_size(&shinfo->frags[0])); @@ -575,8 +575,7 @@ TEST_F(homa_skb, homa_skb_free_many_tx__check_page_order) int i, length; skb = homa_skb_new_tx(100); - for (i = 0; i < 4; i++) - { + for (i = 0; i < 4; i++) { length = 2 * HOMA_SKB_PAGE_SIZE; homa_skb_extend_frags(&self->homa, skb, &length); } @@ -621,7 +620,7 @@ TEST_F(homa_skb, homa_skb_get) struct sk_buff *skb = test_skb(&self->homa); int32_t data[500]; - /* Data is entirely in the head. */ + /* Data is entirely in the head. 
*/ memset(data, 0, sizeof(data)); homa_skb_get(skb, data, 20, 40); EXPECT_EQ(1000020, data[0]); @@ -729,4 +728,4 @@ TEST_F(homa_skb, homa_skb_release_pages__empty_pool) homa_skb_release_pages(&self->homa); EXPECT_EQ(0, get_skb_core(0)->pool->avail); -} \ No newline at end of file +} diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c index 54cb5b70..e2b7caec 100644 --- a/test/unit_homa_sock.c +++ b/test/unit_homa_sock.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2022 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" #include "homa_sock.h" @@ -61,8 +59,8 @@ TEST_F(homa_sock, homa_socktab_next__basics) { struct homa_sock hsk1, hsk2, hsk3, hsk4, *hsk; struct homa_socktab_scan scan; - int first_port = 34000; + homa_destroy(&self->homa); homa_init(&self->homa); mock_sock_init(&hsk1, &self->homa, first_port); @@ -138,13 +136,13 @@ TEST_F(homa_sock, homa_sock_init__hijack_tcp) { struct homa_sock hijack, no_hijack; - self->homa.hijack_tcp = 0; + self->homa.hijack_tcp = 0; mock_sock_init(&no_hijack, &self->homa, 0); - self->homa.hijack_tcp = 1; - mock_sock_init(&hijack, &self->homa, 0); - EXPECT_EQ(0, no_hijack.sock.sk_protocol); - EXPECT_EQ(IPPROTO_TCP, hijack.sock.sk_protocol); - homa_sock_destroy(&hijack); + self->homa.hijack_tcp = 1; + mock_sock_init(&hijack, &self->homa, 0); + EXPECT_EQ(0, no_hijack.sock.sk_protocol); + EXPECT_EQ(IPPROTO_TCP, hijack.sock.sk_protocol); + homa_sock_destroy(&hijack); homa_sock_destroy(&no_hijack); } @@ -186,7 +184,7 @@ TEST_F(homa_sock, homa_sock_shutdown__already_shutdown) self->hsk.shutdown = 1; homa_sock_shutdown(&self->hsk); EXPECT_TRUE(self->hsk.shutdown); - EXPECT_EQ(2 ,unit_list_length(&self->hsk.active_rpcs)); + EXPECT_EQ(2, unit_list_length(&self->hsk.active_rpcs)); self->hsk.shutdown = 0; } TEST_F(homa_sock, homa_sock_shutdown__delete_rpcs) @@ -315,4 +313,4 @@ TEST_F(homa_sock, homa_sock_lock_slow) EXPECT_EQ(1, homa_metrics_per_cpu()->socket_lock_misses); EXPECT_NE(0, homa_metrics_per_cpu()->socket_lock_miss_cycles); homa_sock_unlock(&self->hsk); -} \ No newline at end of file +} diff --git a/test/unit_homa_timer.c b/test/unit_homa_timer.c index b552ce6d..474a0336 100644 --- a/test/unit_homa_timer.c +++ b/test/unit_homa_timer.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" #include "homa_peer.h" diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index 8bebb8b9..ec04cb11 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" #define KSELFTEST_NOT_MAIN 1 diff --git a/test/unit_timetrace.c b/test/unit_timetrace.c index 4bb24faf..b13ddf63 100644 --- a/test/unit_timetrace.c +++ b/test/unit_timetrace.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2022 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" #define KSELFTEST_NOT_MAIN 1 @@ -121,14 +119,18 @@ TEST_F(timetrace, tt_find_oldest) TEST_F(timetrace, tt_proc_open__not_initialized) { + int err; + tt_destroy(); - int err = -tt_proc_open(NULL, &self->file); + err = -tt_proc_open(NULL, &self->file); EXPECT_EQ(EINVAL, err); } TEST_F(timetrace, tt_proc_open__no_memory) { + int err; + mock_kmalloc_errors = 1; - int err = -tt_proc_open(NULL, 
&self->file); + err = -tt_proc_open(NULL, &self->file); EXPECT_EQ(ENOMEM, err); } TEST_F(timetrace, tt_proc_open__increment_frozen) @@ -140,9 +142,10 @@ TEST_F(timetrace, tt_proc_open__increment_frozen) TEST_F(timetrace, tt_proc_read__bogus_file) { struct tt_proc_file pf; + int err; pf.file = NULL; - int err = -tt_proc_read(&self->file, (char *) 1000, 100, 0); + err = -tt_proc_read(&self->file, (char *) 1000, 100, 0); EXPECT_EQ(EINVAL, err); self->file.private_data = &pf; err = -tt_proc_read(&self->file, (char *) 1000, 100, 0); @@ -151,9 +154,11 @@ TEST_F(timetrace, tt_proc_read__bogus_file) } TEST_F(timetrace, tt_proc_read__uninitialized) { + int result; + tt_proc_open(NULL, &self->file); tt_destroy(); - int result = tt_proc_read(&self->file, (char *) 1000, 100, 0); + result = tt_proc_read(&self->file, (char *) 1000, 100, 0); EXPECT_EQ(0, result); } TEST_F(timetrace, tt_proc_read__nothing_to_read) @@ -256,9 +261,10 @@ TEST_F(timetrace, tt_proc_read__single_entry_too_large) TEST_F(timetrace, tt_proc_release__bogus_file) { struct tt_proc_file pf; + int err; pf.file = NULL; - int err = -tt_proc_release(NULL, &self->file); + err = -tt_proc_release(NULL, &self->file); EXPECT_EQ(EINVAL, err); self->file.private_data = &pf; err = -tt_proc_release(NULL, &self->file); @@ -295,4 +301,4 @@ TEST_F(timetrace, tt_proc_release__unfreeze) EXPECT_FALSE(tt_frozen); EXPECT_EQ(NULL, tt_buffers[1]->events[3].format); EXPECT_EQ(0, tt_buffers[1]->next_index); -} \ No newline at end of file +} diff --git a/test/utils.c b/test/utils.c index 3a17958d..b10457a6 100644 --- a/test/utils.c +++ b/test/utils.c @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +// SPDX-License-Identifier: BSD-2-Clause /* This file various utility functions for unit testing; this file * is implemented entirely in C, and accesses Homa and kernel internals. @@ -35,16 +33,17 @@ struct homa_rpc *unit_client_rpc(struct homa_sock *hsk, struct in6_addr *server_ip, int server_port, int id, int req_length, int resp_length) { - int bytes_received; - union sockaddr_in_union server_addr; int saved_id = atomic64_read(&hsk->homa->next_outgoing_id); + union sockaddr_in_union server_addr; + int bytes_received, this_size; + struct homa_rpc *crpc; server_addr.in6.sin6_family = AF_INET6; server_addr.in6.sin6_addr = *server_ip; server_addr.in6.sin6_port = htons(server_port); if (id != 0) atomic64_set(&hsk->homa->next_outgoing_id, id); - struct homa_rpc *crpc = homa_rpc_new_client(hsk, &server_addr); + crpc = homa_rpc_new_client(hsk, &server_addr); if (IS_ERR(crpc)) return NULL; if (homa_message_out_fill(crpc, unit_iov_iter(NULL, req_length), 0)) { @@ -62,7 +61,7 @@ struct homa_rpc *unit_client_rpc(struct homa_sock *hsk, struct data_header h = { .common = { .sport = htons(server_port), - .dport = htons(hsk->port), + .dport = htons(hsk->port), .type = DATA, .sender_id = cpu_to_be64(id ^ 1) }, @@ -74,7 +73,7 @@ struct homa_rpc *unit_client_rpc(struct homa_sock *hsk, .seg = {.offset = 0} }; - int this_size = (resp_length > UNIT_TEST_DATA_PER_PACKET) + this_size = (resp_length > UNIT_TEST_DATA_PER_PACKET) ? 
UNIT_TEST_DATA_PER_PACKET : resp_length; homa_dispatch_pkts(mock_skb_new(server_ip, &h.common, this_size, 0), hsk->homa); @@ -88,11 +87,11 @@ struct homa_rpc *unit_client_rpc(struct homa_sock *hsk, this_size = UNIT_TEST_DATA_PER_PACKET; h.seg.offset = htonl(bytes_received); homa_dispatch_pkts(mock_skb_new(server_ip, &h.common, - this_size , 0), hsk->homa); + this_size, 0), hsk->homa); } if (state == UNIT_RCVD_MSG) return crpc; - FAIL("unit_client_rpc received unexpected state %d", state); + FAIL("%s received unexpected state %d", __func__, state); homa_rpc_free(crpc); return NULL; } @@ -110,15 +109,15 @@ struct in6_addr unit_get_in_addr(char *s) { struct in6_addr ret = {}; unsigned int a, b, c, d; + if (sscanf(s, "%u.%u.%u.%u", &a, &b, &c, &d) == 4) { ret.s6_addr32[3] = htonl((a<<24) + (b<<16) + (c<<8) + d); ret.s6_addr32[2] = htonl(0x0000ffff); } else { - int inet_pton(int af, const char *src, void *dst); int res = inet_pton(AF_INET6, s, &ret); - if (res <= 0) { + + if (res <= 0) abort(); - } } return ret; } @@ -132,6 +131,7 @@ int unit_list_length(struct list_head *head) { struct list_head *pos; int count = 0; + list_for_each(pos, head) { count++; } @@ -146,6 +146,7 @@ int unit_list_length(struct list_head *head) void unit_log_active_ids(struct homa_sock *hsk) { struct homa_rpc *rpc; + list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) unit_log_printf(" ", "%llu", rpc->id); } @@ -157,8 +158,9 @@ void unit_log_active_ids(struct homa_sock *hsk) */ void unit_log_hashed_rpcs(struct homa_sock *hsk) { - int i; struct homa_rpc *rpc; + int i; + for (i = 0; i < HOMA_CLIENT_RPC_BUCKETS; i++) { hlist_for_each_entry_rcu(rpc, &hsk->client_rpc_buckets[i].rpcs, hash_links) { @@ -187,11 +189,10 @@ void unit_log_frag_list(struct sk_buff *skb, int verbose) for (frag = skb_shinfo(skb)->frag_list; frag != NULL; frag = frag->next) { - if (verbose) { + if (verbose) homa_print_packet(frag, buffer, sizeof(buffer)); - } else { + else homa_print_packet_short(frag, buffer, sizeof(buffer)); - } unit_log_printf("; ", "%s", buffer); } } @@ -205,15 +206,15 @@ void unit_log_grantables(struct homa *homa) { struct homa_peer *peer; struct homa_rpc *rpc; + list_for_each_entry(peer, &homa->grantable_peers, grantable_links) { list_for_each_entry(rpc, &peer->grantable_rpcs, grantable_links) { - unit_log_printf("; ", "%s from %s, id %lu, " - "remaining %d", + unit_log_printf("; ", "%s from %s, id %llu, remaining %d", homa_is_client(rpc->id) ? 
"response" : "request", homa_print_ipv6_addr(&peer->addr), - (long unsigned int) rpc->id, + rpc->id, rpc->msgin.bytes_remaining); } } @@ -237,11 +238,10 @@ void unit_log_message_out_packets(struct homa_message_out *message, int verbose) for (skb = message->packets; skb != NULL; skb = homa_get_skb_info(skb)->next_skb) { - if (verbose) { + if (verbose) homa_print_packet(skb, buffer, sizeof(buffer)); - } else { + else homa_print_packet_short(skb, buffer, sizeof(buffer)); - } unit_log_printf("; ", "%s", buffer); } } @@ -260,11 +260,10 @@ void unit_log_filled_skbs(struct sk_buff *skb, int verbose) char buffer[400]; while (skb != NULL) { - if (verbose) { + if (verbose) homa_print_packet(skb, buffer, sizeof(buffer)); - } else { + else homa_print_packet_short(skb, buffer, sizeof(buffer)); - } unit_log_printf("; ", "%s", buffer); skb = homa_get_skb_info(skb)->next_skb; } @@ -283,11 +282,10 @@ void unit_log_skb_list(struct sk_buff_head *packets, int verbose) char buffer[200]; skb_queue_walk(packets, skb) { - if (verbose) { + if (verbose) homa_print_packet(skb, buffer, sizeof(buffer)); - } else { + else homa_print_packet_short(skb, buffer, sizeof(buffer)); - } unit_log_printf("; ", "%s", buffer); } } @@ -300,11 +298,11 @@ void unit_log_skb_list(struct sk_buff_head *packets, int verbose) void unit_log_throttled(struct homa *homa) { struct homa_rpc *rpc; + list_for_each_entry_rcu(rpc, &homa->throttled_rpcs, throttled_links) { - unit_log_printf("; ", "%s id %lu, next_offset %d", + unit_log_printf("; ", "%s id %llu, next_offset %d", homa_is_client(rpc->id) ? "request" - : "response", - (long unsigned int) rpc->id, + : "response", rpc->id, rpc->msgout.next_xmit_offset); } } @@ -357,7 +355,7 @@ struct homa_rpc *unit_server_rpc(struct homa_sock *hsk, struct data_header h = { .common = { .sport = htons(client_port), - .dport = htons(hsk->port), + .dport = htons(hsk->port), .type = DATA, .sender_id = cpu_to_be64(id ^ 1) }, @@ -370,13 +368,14 @@ struct homa_rpc *unit_server_rpc(struct homa_sock *hsk, }; struct homa_rpc *srpc = homa_rpc_new_server(hsk, client_ip, &h, &created); + if (IS_ERR(srpc)) return NULL; EXPECT_EQ(srpc->completion_cookie, 0); homa_rpc_unlock(srpc); homa_dispatch_pkts(mock_skb_new(client_ip, &h.common, (req_length > UNIT_TEST_DATA_PER_PACKET) - ? UNIT_TEST_DATA_PER_PACKET : req_length , 0), + ? 
UNIT_TEST_DATA_PER_PACKET : req_length, 0), hsk->homa); if (state == UNIT_RCVD_ONE_PKT) return srpc; @@ -384,11 +383,12 @@ struct homa_rpc *unit_server_rpc(struct homa_sock *hsk, bytes_received < req_length; bytes_received += UNIT_TEST_DATA_PER_PACKET) { int this_size = req_length - bytes_received; + if (this_size > UNIT_TEST_DATA_PER_PACKET) this_size = UNIT_TEST_DATA_PER_PACKET; h.seg.offset = htonl(bytes_received); homa_dispatch_pkts(mock_skb_new(client_ip, &h.common, - this_size , 0), hsk->homa); + this_size, 0), hsk->homa); } if (state == UNIT_RCVD_MSG) return srpc; @@ -402,9 +402,9 @@ struct homa_rpc *unit_server_rpc(struct homa_sock *hsk, srpc->state = RPC_OUTGOING; if (state == UNIT_OUTGOING) return srpc; - FAIL("unit_server_rpc received unexpected state %d", state); + FAIL("%s received unexpected state %d", __func__, state); - error: +error: homa_rpc_free(srpc); return NULL; } @@ -429,6 +429,7 @@ struct iov_iter *unit_iov_iter(void *buffer, size_t length) { static struct iovec iovec; static struct iov_iter iter; + iovec.iov_base = buffer; iovec.iov_len = length; iov_iter_init(&iter, WRITE, &iovec, 1, length); @@ -443,6 +444,7 @@ struct iov_iter *unit_iov_iter(void *buffer, size_t length) char *unit_ack_string(struct homa_ack *ack) { static char buffer[1000]; + snprintf(buffer, sizeof(buffer), "client_port %d, server_port %d, client_id %llu", ntohs(ack->client_port), ntohs(ack->server_port), diff --git a/test/utils.h b/test/utils.h index dd741f6b..203b8acc 100644 --- a/test/utils.h +++ b/test/utils.h @@ -1,6 +1,4 @@ -/* Copyright (c) 2019-2022 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause - */ +/* SPDX-License-Identifier: BSD-2-Clause */ /* Utility functions for unit tests, implemented in C. */ @@ -34,14 +32,14 @@ enum unit_rpc_state { extern char *unit_ack_string(struct homa_ack *ack); extern struct homa_rpc - *unit_client_rpc(struct homa_sock *hsk, - enum unit_rpc_state state, struct in6_addr *client_ip, - struct in6_addr *server_ip, int server_port, int id, - int req_length, int resp_length); + *unit_client_rpc(struct homa_sock *hsk, + enum unit_rpc_state state, struct in6_addr *client_ip, + struct in6_addr *server_ip, int server_port, int id, + int req_length, int resp_length); extern struct in6_addr - unit_get_in_addr(char *s); + unit_get_in_addr(char *s); extern struct iov_iter - *unit_iov_iter(void *buffer, size_t length); + *unit_iov_iter(void *buffer, size_t length); extern int unit_list_length(struct list_head *head); extern void unit_log_active_ids(struct homa_sock *hsk); extern void unit_log_filled_skbs(struct sk_buff *skb, int verbose); @@ -49,14 +47,19 @@ extern void unit_log_frag_list(struct sk_buff *skb, int verbose); extern void unit_log_grantables(struct homa *homa); extern void unit_log_hashed_rpcs(struct homa_sock *hsk); extern void unit_log_message_out_packets( - struct homa_message_out *message, int verbose); + struct homa_message_out *message, int verbose); extern const char *unit_print_gaps(struct homa_rpc *rpc); extern struct homa_rpc - *unit_server_rpc(struct homa_sock *hsk, - enum unit_rpc_state state, struct in6_addr *server_ip, - struct in6_addr *client_ip, int client_port, int id, - int req_length, int resp_length); + *unit_server_rpc(struct homa_sock *hsk, + enum unit_rpc_state state, struct in6_addr *server_ip, + struct in6_addr *client_ip, int client_port, int id, + int req_length, int resp_length); extern void unit_log_skb_list(struct sk_buff_head *packets, - int verbose); + int verbose); extern void unit_log_throttled(struct homa 
*homa); extern void unit_teardown(void); + +/* Kludge to avoid including arpa/inet.h, which causes definition + * conflicts with kernel header files. + */ +extern int inet_pton(int af, const char *src, void *dst); From b0d51d5c760a5627d7967ab8a4411f55f79d7253 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 9 Oct 2024 11:54:57 -0700 Subject: [PATCH 043/625] Remove unneeded -include's in test/Makefile --- test/Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/Makefile b/test/Makefile index e2191b52..cb981d77 100644 --- a/test/Makefile +++ b/test/Makefile @@ -15,9 +15,7 @@ CINCLUDES := -I. \ -I$(KDIR)/arch/x86/include/generated/uapi \ -I$(KDIR)/include/uapi \ -I$(KDIR)/include/generated/uapi \ - -include $(KDIR)/include/linux/kconfig.h \ - -include $(KDIR)/include/linux/compiler-version.h \ - -include $(KDIR)/include/linux/compiler_types.h + -include $(KDIR)/include/linux/kconfig.h CCINCLUDES := -I. \ -I.. \ -I$(KDIR)/arch/x86/include \ From 7e059658c98102b6ff50b21f2499a1ad5b898da5 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 9 Oct 2024 14:00:10 -0700 Subject: [PATCH 044/625] More comments in kselftest_harness.h --- test/kselftest_harness.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/kselftest_harness.h b/test/kselftest_harness.h index 2d608d3b..c2e18a43 100644 --- a/test/kselftest_harness.h +++ b/test/kselftest_harness.h @@ -56,7 +56,10 @@ * and compiling them into a normal Linux executable along with the * unit tests). This creates potential problems with conflicts between * kernel header files and user-level header files. To avoid these conflicts, - * this file must be very careful about what headers it includes. + * this file must be very careful about what headers it includes. This file + * is based on a relatively old version of the official file; new versions + * generate even more header file conflicts, which appear very difficult + * to resolve. * This file also contains several other changes, such as: * - All tests run in a single process, rather than forking a child process * for each test. From 4eb0448aec99815c7c1a3d044eaf687c22ed93ee Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 9 Oct 2024 14:48:29 -0700 Subject: [PATCH 045/625] Remove 8021q stuff from update_linux (No longer needed) --- cloudlab/bin/update_linux | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cloudlab/bin/update_linux b/cloudlab/bin/update_linux index d4c8f60d..97fe00c0 100755 --- a/cloudlab/bin/update_linux +++ b/cloudlab/bin/update_linux @@ -32,9 +32,8 @@ for ((i = $first ; i <= $last; i++)); do node=node$i echo echo $node - ssh $node 'rm -rf tmp; mkdir -p tmp tmp/boot tmp/8021q' + ssh $node 'rm -rf tmp; mkdir -p tmp tmp/boot' rsync -rtv /boot/initrd.img-$v /boot/config-$v /boot/System.map-$v \ /boot/vmlinuz-$v $node:tmp/boot/ - rsync -rtv /lib/modules/$v/kernel/net/8021q/ $node:tmp/8021q/ - ssh $node "sudo cp -f tmp/boot/* /boot; sudo cp -f tmp/8021q/8021q.ko /lib/modules/$v/kernel/net/8021q; sudo reboot" + ssh $node "sudo cp -f tmp/boot/* /boot; sudo reboot" done From eed4a0cfc8ffd37bc5293af614d331786952b4c9 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 11 Oct 2024 16:47:07 -0700 Subject: [PATCH 046/625] Remove obsolete HOMAIOCREPLY definition --- homa.h | 1 - 1 file changed, 1 deletion(-) diff --git a/homa.h b/homa.h index c8ac00a2..437cb86d 100644 --- a/homa.h +++ b/homa.h @@ -207,7 +207,6 @@ struct homa_set_buf_args { * SIOCPROTOPRIVATE range of 0x89e0 through 0x89ef. 
*/ -#define HOMAIOCREPLY _IOWR(0x89, 0xe2, struct homa_reply_args) #define HOMAIOCABORT _IOWR(0x89, 0xe3, struct homa_abort_args) #define HOMAIOCFREEZE _IO(0x89, 0xef) From e891f868dd003172836678185f553ec510a18b74 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 16 Oct 2024 10:17:49 -0700 Subject: [PATCH 047/625] Minor cleanups Remove stale declaration, add const declarations --- homa_impl.h | 8 +++----- homa_plumbing.c | 4 ++-- test/mock.c | 2 +- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 1ad745d2..9b514608 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -1088,13 +1088,11 @@ extern struct homa_interest extern void homa_close(struct sock *sock, long timeout); extern int homa_copy_to_user(struct homa_rpc *rpc); extern void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk); -extern void homa_data_from_server(struct sk_buff *skb, - struct homa_rpc *crpc); extern void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc); extern void homa_destroy(struct homa *homa); extern int homa_disconnect(struct sock *sk, int flags); extern void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa); -extern int homa_dointvec(struct ctl_table *table, int write, +extern int homa_dointvec(const struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); extern int homa_err_handler_v4(struct sk_buff *skb, u32 info); extern int homa_err_handler_v6(struct sk_buff *skb, @@ -1165,8 +1163,8 @@ extern int homa_snprintf(char *buffer, int size, int used, extern int homa_softirq(struct sk_buff *skb); extern void homa_spin(int ns); extern char *homa_symbol_for_type(uint8_t type); -extern int homa_sysctl_softirq_cores(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); +extern int homa_sysctl_softirq_cores(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos); extern void homa_timer(struct homa *homa); extern int homa_timer_main(void *transportInfo); extern void homa_unhash(struct sock *sk); diff --git a/homa_plumbing.c b/homa_plumbing.c index ddaceb14..095f495b 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1488,7 +1488,7 @@ __poll_t homa_poll(struct file *file, struct socket *sock, * * Return: 0 for success, nonzero for error. */ -int homa_dointvec(struct ctl_table *table, int write, +int homa_dointvec(const struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int result; @@ -1564,7 +1564,7 @@ int homa_dointvec(struct ctl_table *table, int write, * * Return: 0 for success, nonzero for error. 
*/ -int homa_sysctl_softirq_cores(struct ctl_table *table, int write, +int homa_sysctl_softirq_cores(const struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct homa_offload_core *offload_core; diff --git a/test/mock.c b/test/mock.c index fe547ece..ec0bbfe3 100644 --- a/test/mock.c +++ b/test/mock.c @@ -845,7 +845,7 @@ struct proc_dir_entry *proc_create(const char *name, umode_t mode, return entry; } -int proc_dointvec(struct ctl_table *table, int write, +int proc_dointvec(const struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return 0; From 89b1e9963b4729e71450ae76b8ee1168550d4ce4 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 23 Oct 2024 10:18:00 -0700 Subject: [PATCH 048/625] Add stripping mechanism, plus checkpatch cleanups --- Makefile | 28 +++- cloudlab/bin/install_homa | 18 ++- homa.h | 48 +++--- homa_api.c | 32 ++-- homa_impl.h | 83 ++++++----- homa_incoming.c | 282 ++++++++++++++++++------------------ homa_outgoing.c | 197 ++++++++++++++----------- homa_peer.c | 41 +++--- homa_peer.h | 38 ++--- homa_plumbing.c | 195 +++++++++++++------------ homa_pool.c | 55 ++++--- homa_pool.h | 28 ++-- homa_receiver.h | 24 ++- homa_rpc.c | 82 +++++------ homa_rpc.h | 59 +++----- homa_sock.c | 30 ++-- homa_sock.h | 40 ++--- homa_stub.h | 80 ++++++++++ homa_timer.c | 52 +++---- homa_utils.c | 8 +- homa_wire.h | 19 ++- test/Makefile | 73 +++++++--- test/mock.c | 10 ++ test/unit_homa_incoming.c | 2 + timetrace.c | 167 +++++++++++---------- timetrace.h | 64 ++++---- util/strip.py | 298 ++++++++++++++++++++++++++++++++++++++ 27 files changed, 1274 insertions(+), 779 deletions(-) create mode 100644 homa_stub.h create mode 100755 util/strip.py diff --git a/Makefile b/Makefile index 25b50c45..fb0cc40f 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,6 @@ # Makefile to build Homa as a Linux module. -ifneq ($(KERNELRELEASE),) - -obj-m += homa.o -homa-y = homa_grant.o \ +HOMA_OBJS = homa_grant.o \ homa_incoming.o \ homa_metrics.o \ homa_offload.o \ @@ -18,6 +15,11 @@ homa-y = homa_grant.o \ homa_utils.o \ timetrace.o +ifneq ($(KERNELRELEASE),) + +obj-m += homa.o +homa-y = $(HOMA_OBJS) + MY_CFLAGS += -g ccflags-y += ${MY_CFLAGS} CC += ${MY_CFLAGS} @@ -41,6 +43,24 @@ install: check: ../homaLinux/scripts/kernel-doc -none *.c +# Copy stripped source files to a Linux source tree +LINUX_SRC_DIR ?= ../net-next +HOMA_TARGET ?= $(LINUX_SRC_DIR)/net/homa +CP_HDRS := homa_impl.h \ + homa_peer.h \ + homa_pool.h \ + homa_rpc.h \ + homa_sock.h \ + homa_stub.h \ + homa_wire.h +CP_SRCS := $(patsubst %.o,%.c,$(filter-out timetrace.o, $(HOMA_OBJS))) +CP_TARGETS := $(patsubst %,$(HOMA_TARGET)/%,$(CP_HDRS) $(CP_SRCS)) +net-next: $(CP_TARGETS) $(LINUX_SRC_DIR)/include/uapi/linux/homa.h +$(HOMA_TARGET)/%: % util/strip.py + util/strip.py $< > $@ +$(LINUX_SRC_DIR)/include/uapi/linux/homa.h: homa.h util/strip.py + util/strip.py $< > $@ + clean: $(MAKE) -C $(KDIR) M=$(shell pwd) clean diff --git a/cloudlab/bin/install_homa b/cloudlab/bin/install_homa index 71582537..181310e1 100755 --- a/cloudlab/bin/install_homa +++ b/cloudlab/bin/install_homa @@ -7,24 +7,32 @@ # or more target machines; it also loads the Homa kernel module. # # Usage: -# install_homa num_nodes [first] +# install_homa [--net-next] num_nodes [first] # # The "num_nodes" arguments indicates how many servers should be updated. # The "first" argument is optional; it is an integer identifying the # first node on which installation will occur (e.g. 
"install 4 2" means # node2 through node5 will be updated. "first" defaults to 0. -# This script assumes that Homa has been built in ~/homaModule on the -# current machine (this includes both homa.ko and all of the binaries in util). +# This script assumes that the Homa module binary (homa.ko) has already +# been built. If --net-next is specified, it will be in the kernel build +# directory (see code below for path), otherwise it will be in ~/homaModule. +# In addition, the utility programs in ~/homaModule/util must have been built. root=~/homaModule set -e + +homa_ko=$root/homa.ko +if [ $1 = "--net-next" ]; then + homa_ko=/netnext/net-next/net/homa/homa.ko + shift +fi if [ $# -eq 2 ]; then first=$2 elif [ $# -eq 1 ]; then first=0 else - echo "Usage: install_homa num_nodes [first]" + echo "Usage: install_homa [--net-next] num_nodes [first]" exit 1 fi last=`expr $first + $1 - 1` || true @@ -35,7 +43,7 @@ for ((i = $first ; i <= $last; i++)); do echo '*** Installing Homa on' $node '***' rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" -rtv ~/.bashrc ~/.bash_profile ~/.gdbinit $node: rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" -rtv --exclude __pycache__ ~/bin/ $node:bin/ - rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" -rtv $root/homa.ko $root/util/cp_node $root/util/homa_prio $root/util/*.py $node:bin/ + rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" -rtv $homa_ko $root/util/cp_node $root/util/homa_prio $root/util/*.py $node:bin/ ssh -4 $node 'sudo sysctl .kernel.printk="5 4 1 7"' ssh -4 $node 'echo $PATH' ssh -4 $node 'config default' diff --git a/homa.h b/homa.h index 437cb86d..f2920b99 100644 --- a/homa.h +++ b/homa.h @@ -4,8 +4,8 @@ * transport protocol. */ -#ifndef _HOMA_H -#define _HOMA_H +#ifndef _UAPI_LINUX_HOMA_H +#define _UAPI_LINUX_HOMA_H #include #ifndef __KERNEL__ @@ -81,11 +81,12 @@ struct homa_sendmsg_args { */ uint64_t completion_cookie; }; + #if !defined(__cplusplus) _Static_assert(sizeof(struct homa_sendmsg_args) >= 16, - "homa_sendmsg_args shrunk"); + "homa_sendmsg_args shrunk"); _Static_assert(sizeof(struct homa_sendmsg_args) <= 16, - "homa_sendmsg_args grew"); + "homa_sendmsg_args grew"); #endif /** @@ -93,7 +94,6 @@ _Static_assert(sizeof(struct homa_sendmsg_args) <= 16, * recvmsg; passed to recvmsg using the msg_control field. */ struct homa_recvmsg_args { - /** * @id: (in/out) Initially specifies the id of the desired RPC, or 0 * if any RPC is OK; returns the actual id received. 
@@ -143,11 +143,12 @@ struct homa_recvmsg_args { */ uint32_t bpage_offsets[HOMA_MAX_BPAGES]; }; + #if !defined(__cplusplus) _Static_assert(sizeof(struct homa_recvmsg_args) >= 120, - "homa_recvmsg_args shrunk"); + "homa_recvmsg_args shrunk"); _Static_assert(sizeof(struct homa_recvmsg_args) <= 120, - "homa_recvmsg_args grew"); + "homa_recvmsg_args grew"); #endif /* Flag bits for homa_recvmsg_args.flags (see man page for documentation): @@ -174,6 +175,7 @@ struct homa_abort_args { int _pad1; uint64_t _pad2[2]; }; + #if !defined(__cplusplus) _Static_assert(sizeof(struct homa_abort_args) >= 32, "homa_abort_args shrunk"); _Static_assert(sizeof(struct homa_abort_args) <= 32, "homa_abort_args grew"); @@ -210,24 +212,24 @@ struct homa_set_buf_args { #define HOMAIOCABORT _IOWR(0x89, 0xe3, struct homa_abort_args) #define HOMAIOCFREEZE _IO(0x89, 0xef) -extern int homa_abortp(int fd, struct homa_abort_args *args); - -extern int homa_send(int sockfd, const void *message_buf, - size_t length, const union sockaddr_in_union *dest_addr, - uint64_t *id, uint64_t completion_cookie); -extern int homa_sendv(int sockfd, const struct iovec *iov, - int iovcnt, const union sockaddr_in_union *dest_addr, - uint64_t *id, uint64_t completion_cookie); -extern ssize_t homa_reply(int sockfd, const void *message_buf, - size_t length, const union sockaddr_in_union *dest_addr, - uint64_t id); -extern ssize_t homa_replyv(int sockfd, const struct iovec *iov, - int iovcnt, const union sockaddr_in_union *dest_addr, - uint64_t id); -extern int homa_abort(int sockfd, uint64_t id, int error); +int homa_abortp(int fd, struct homa_abort_args *args); + +int homa_send(int sockfd, const void *message_buf, + size_t length, const union sockaddr_in_union *dest_addr, + uint64_t *id, uint64_t completion_cookie); +int homa_sendv(int sockfd, const struct iovec *iov, + int iovcnt, const union sockaddr_in_union *dest_addr, + uint64_t *id, uint64_t completion_cookie); +ssize_t homa_reply(int sockfd, const void *message_buf, + size_t length, const union sockaddr_in_union *dest_addr, + uint64_t id); +ssize_t homa_replyv(int sockfd, const struct iovec *iov, + int iovcnt, const union sockaddr_in_union *dest_addr, + uint64_t id); +int homa_abort(int sockfd, uint64_t id, int error); #ifdef __cplusplus } #endif -#endif /* _HOMA_H */ +#endif /* _UAPI_LINUX_HOMA_H */ diff --git a/homa_api.c b/homa_api.c index 72b32fe2..e351dda8 100644 --- a/homa_api.c +++ b/homa_api.c @@ -35,7 +35,7 @@ * error occurred, -1 is returned and errno is set appropriately. */ ssize_t homa_reply(int sockfd, const void *message_buf, size_t length, - const union sockaddr_in_union *dest_addr, uint64_t id) + const union sockaddr_in_union *dest_addr, uint64_t id) { struct homa_sendmsg_args args; struct msghdr hdr; @@ -45,10 +45,10 @@ ssize_t homa_reply(int sockfd, const void *message_buf, size_t length, args.id = id; args.completion_cookie = 0; - vec.iov_base = (void *) message_buf; + vec.iov_base = (void *)message_buf; vec.iov_len = length; - hdr.msg_name = (void *) dest_addr; + hdr.msg_name = (void *)dest_addr; hdr.msg_namelen = sizeof(*dest_addr); hdr.msg_iov = &vec; hdr.msg_iovlen = 1; @@ -78,7 +78,7 @@ ssize_t homa_reply(int sockfd, const void *message_buf, size_t length, * error occurred, -1 is returned and errno is set appropriately. 
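 *
 * For example, a response assembled from two pieces might be sent as
 * follows (a sketch only: fd is assumed to be an open Homa socket,
 * src_addr and id are assumed to identify a request returned by an
 * earlier recvmsg(), and header/payload with their lengths are
 * hypothetical):
 *
 *	struct iovec parts[2] = {
 *		{.iov_base = header, .iov_len = hdr_len},
 *		{.iov_base = payload, .iov_len = data_len},
 *	};
 *
 *	if (homa_replyv(fd, parts, 2, &src_addr, id) < 0)
 *		perror("homa_replyv");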
*/ ssize_t homa_replyv(int sockfd, const struct iovec *iov, int iovcnt, - const union sockaddr_in_union *dest_addr, uint64_t id) + const union sockaddr_in_union *dest_addr, uint64_t id) { struct homa_sendmsg_args args; struct msghdr hdr; @@ -87,9 +87,9 @@ ssize_t homa_replyv(int sockfd, const struct iovec *iov, int iovcnt, args.id = id; args.completion_cookie = 0; - hdr.msg_name = (void *) dest_addr; + hdr.msg_name = (void *)dest_addr; hdr.msg_namelen = sizeof(*dest_addr); - hdr.msg_iov = (struct iovec *) iov; + hdr.msg_iov = (struct iovec *)iov; hdr.msg_iovlen = iovcnt; hdr.msg_control = &args; hdr.msg_controllen = 0; @@ -113,8 +113,8 @@ ssize_t homa_replyv(int sockfd, const struct iovec *iov, int iovcnt, * error occurred, -1 is returned and errno is set appropriately. */ int homa_send(int sockfd, const void *message_buf, size_t length, - const union sockaddr_in_union *dest_addr, uint64_t *id, - uint64_t completion_cookie) + const union sockaddr_in_union *dest_addr, uint64_t *id, + uint64_t completion_cookie) { struct homa_sendmsg_args args; struct msghdr hdr; @@ -124,10 +124,10 @@ int homa_send(int sockfd, const void *message_buf, size_t length, args.id = 0; args.completion_cookie = completion_cookie; - vec.iov_base = (void *) message_buf; + vec.iov_base = (void *)message_buf; vec.iov_len = length; - hdr.msg_name = (void *) dest_addr; + hdr.msg_name = (void *)dest_addr; /* For some unknown reason, this change improves short-message P99 * latency by 20% in W3 under IPv4 (as of December 2022). */ @@ -139,7 +139,7 @@ int homa_send(int sockfd, const void *message_buf, size_t length, hdr.msg_control = &args; hdr.msg_controllen = 0; result = sendmsg(sockfd, &hdr, 0); - if ((result >= 0) && (id != NULL)) + if (result >= 0 && id) *id = args.id; return result; } @@ -162,8 +162,8 @@ int homa_send(int sockfd, const void *message_buf, size_t length, * error occurred, -1 is returned and errno is set appropriately. */ int homa_sendv(int sockfd, const struct iovec *iov, int iovcnt, - const union sockaddr_in_union *dest_addr, uint64_t *id, - uint64_t completion_cookie) + const union sockaddr_in_union *dest_addr, uint64_t *id, + uint64_t completion_cookie) { struct homa_sendmsg_args args; struct msghdr hdr; @@ -172,14 +172,14 @@ int homa_sendv(int sockfd, const struct iovec *iov, int iovcnt, args.id = 0; args.completion_cookie = completion_cookie; - hdr.msg_name = (void *) dest_addr; + hdr.msg_name = (void *)dest_addr; hdr.msg_namelen = sizeof(*dest_addr); - hdr.msg_iov = (struct iovec *) iov; + hdr.msg_iov = (struct iovec *)iov; hdr.msg_iovlen = iovcnt; hdr.msg_control = &args; hdr.msg_controllen = 0; result = sendmsg(sockfd, &hdr, 0); - if ((result >= 0) && (id != NULL)) + if (result >= 0 && id) *id = args.id; return result; } diff --git a/homa_impl.h b/homa_impl.h index 9b514608..b8f0118b 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -8,9 +8,10 @@ #define _HOMA_IMPL_H #include +#if 1 /* See strip.py --alt */ #ifdef __UNIT_TEST__ #undef WARN -#define WARN(condition, format...) +#define WARN(...) 
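/* For reference, the client-side wrappers in homa_api.c above are used
 * as follows (a sketch only: fd is assumed to be an open Homa socket
 * and dest a filled-in union sockaddr_in_union; error handling is
 * omitted):
 *
 *	uint64_t id;
 *	char request[100] = "hello";
 *
 *	if (homa_send(fd, request, sizeof(request), &dest, &id, 0) < 0)
 *		perror("homa_send");
 *
 * On success, id holds the identifier assigned to the new RPC; the
 * final argument (0 here) is the completion cookie to associate with
 * it. homa_sendv() is the iovec variant, parallel to the
 * homa_reply()/homa_replyv() pair.
 */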
#undef WARN_ON #define WARN_ON(condition) ({ \ @@ -21,6 +22,7 @@ #undef WARN_ON_ONCE #define WARN_ON_ONCE(condition) WARN_ON(condition) #endif +#endif /* See strip.py */ #include #include @@ -43,16 +45,24 @@ #include #include +#if 1 /* See strip.py */ +#include "homa.h" +#else /* See strip.py */ +#include +#endif /* See strip.py */ +#include "homa_wire.h" + +#if 1 /* See strip.py --alt */ #ifdef __UNIT_TEST__ #undef alloc_pages #define alloc_pages mock_alloc_pages -extern struct page *mock_alloc_pages(gfp_t gfp, unsigned int order); +struct page *mock_alloc_pages(gfp_t gfp, unsigned int order); #define compound_order mock_compound_order -extern unsigned int mock_compound_order(struct page *page); +unsigned int mock_compound_order(struct page *page); #define cpu_to_node mock_cpu_to_node -extern int mock_cpu_to_node(int cpu); +int mock_cpu_to_node(int cpu); #undef current #define current current_task @@ -60,60 +70,61 @@ extern struct task_struct *current_task; #undef get_cycles #define get_cycles mock_get_cycles -extern cycles_t mock_get_cycles(void); +cycles_t mock_get_cycles(void); #define get_page mock_get_page -extern void mock_get_page(struct page *page); +void mock_get_page(struct page *page); #undef kmalloc #define kmalloc mock_kmalloc -extern void *mock_kmalloc(size_t size, gfp_t flags); +void *mock_kmalloc(size_t size, gfp_t flags); #undef kmalloc_array -#define kmalloc_array(count, size, type) mock_kmalloc(count*size, type) +#define kmalloc_array(count, size, type) mock_kmalloc((count) * (size), type) -#define kthread_complete_and_exit(comp, code) +#define kthread_complete_and_exit(...) #ifdef page_address #undef page_address #endif -#define page_address(page) ((void *) page) +#define page_address(page) ((void *)page) #define page_ref_count mock_page_refs -extern int mock_page_refs(struct page *page); +int mock_page_refs(struct page *page); #define page_to_nid mock_page_to_nid -extern int mock_page_to_nid(struct page *page); +int mock_page_to_nid(struct page *page); #define put_page mock_put_page -extern void mock_put_page(struct page *page); +void mock_put_page(struct page *page); #define rcu_read_lock mock_rcu_read_lock -extern void mock_rcu_read_lock(void); +void mock_rcu_read_lock(void); #define rcu_read_unlock mock_rcu_read_unlock -extern void mock_rcu_read_unlock(void); +void mock_rcu_read_unlock(void); #undef register_net_sysctl #define register_net_sysctl mock_register_net_sysctl -extern struct ctl_table_header *mock_register_net_sysctl(struct net *net, - const char *path, struct ctl_table *table); +struct ctl_table_header *mock_register_net_sysctl(struct net *net, + const char *path, + struct ctl_table *table); -#define signal_pending(xxx) mock_signal_pending +#define signal_pending(...) mock_signal_pending extern int mock_signal_pending; #define spin_unlock mock_spin_unlock -extern void mock_spin_unlock(spinlock_t *lock); +void mock_spin_unlock(spinlock_t *lock); #undef vmalloc #define vmalloc mock_vmalloc -extern void *mock_vmalloc(size_t size); +void *mock_vmalloc(size_t size); #undef DECLARE_PER_CPU -#define DECLARE_PER_CPU(type, name) extern type name[10]; +#define DECLARE_PER_CPU(type, name) extern type name[10] #undef DEFINE_PER_CPU -#define DEFINE_PER_CPU(type, name) type name[10]; +#define DEFINE_PER_CPU(type, name) type name[10] #undef per_cpu #define per_cpu(name, core) (name[core]) @@ -126,20 +137,22 @@ extern void *mock_vmalloc(size_t size); #define BUG_ON(...) #define set_current_state(...) #endif +#endif /* See strip.py */ /* Forward declarations. 
*/ struct homa_peer; struct homa_sock; struct homa; -#include "homa.h" +#if 1 /* See strip.py */ #include "timetrace.h" +#endif /* See strip.py */ #include "homa_metrics.h" /* Declarations used in this file, so they can't be made at the end. */ -extern void homa_throttle_lock_slow(struct homa *homa); +void homa_throttle_lock_slow(struct homa *homa); -#define sizeof32(type) ((int) (sizeof(type))) +#define sizeof32(type) ((int)(sizeof(type))) /** * define HOMA_MAX_GRANTS - Used to size various data structures for grant @@ -764,8 +777,8 @@ struct homa { #define HOMA_GRO_FAST_GRANTS 0x20 #define HOMA_GRO_SHORT_BYPASS 0x40 #define HOMA_GRO_GEN3 0x80 - #define HOMA_GRO_NORMAL (HOMA_GRO_SAME_CORE|HOMA_GRO_GEN2 \ - |HOMA_GRO_SHORT_BYPASS|HOMA_GRO_FAST_GRANTS) + #define HOMA_GRO_NORMAL (HOMA_GRO_SAME_CORE | HOMA_GRO_GEN2 | \ + HOMA_GRO_SHORT_BYPASS | HOMA_GRO_FAST_GRANTS) /* * @busy_usecs: if there has been activity on a core within the @@ -908,7 +921,7 @@ struct homa_skb_info { */ static inline struct homa_skb_info *homa_get_skb_info(struct sk_buff *skb) { - return (struct homa_skb_info *) (skb_end_pointer(skb) + return (struct homa_skb_info *)(skb_end_pointer(skb) - sizeof(struct homa_skb_info)); } @@ -924,7 +937,7 @@ static inline struct homa_skb_info *homa_get_skb_info(struct sk_buff *skb) */ static inline struct sk_buff **homa_next_skb(struct sk_buff *skb) { - return (struct sk_buff **) (skb_end_pointer(skb) - sizeof(char *)); + return (struct sk_buff **)(skb_end_pointer(skb) - sizeof(char *)); } /** @@ -1020,8 +1033,8 @@ static inline struct in6_addr canonical_ipv6_addr(const union sockaddr_in_union */ static inline struct in6_addr skb_canonical_ipv6_saddr(struct sk_buff *skb) { - return skb_is_ipv6(skb) ? ipv6_hdr(skb)->saddr : ipv4_to_ipv6( - ip_hdr(skb)->saddr); + return skb_is_ipv6(skb) ? ipv6_hdr(skb)->saddr + : ipv4_to_ipv6(ip_hdr(skb)->saddr); } /** @@ -1058,14 +1071,14 @@ static inline __be32 tt_addr(const struct in6_addr x) } #ifdef __UNIT_TEST__ -extern void unit_log_printf(const char *separator, const char *format, ...) +void unit_log_printf(const char *separator, const char *format, ...) __printf(2, 3); #define UNIT_LOG unit_log_printf -extern void unit_hook(char *id); +void unit_hook(char *id); #define UNIT_HOOK(msg) unit_hook(msg) #else #define UNIT_LOG(...) -#define UNIT_HOOK(msg) +#define UNIT_HOOK(...) #endif extern void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, @@ -1204,7 +1217,7 @@ static inline void homa_check_pacer(struct homa *homa, int softirq) * to queue new packets; if the NIC queue becomes more than half * empty, then we will help out here. 
*/ - if ((get_cycles() + homa->max_nic_queue_cycles/2) < + if ((get_cycles() + homa->max_nic_queue_cycles / 2) < atomic64_read(&homa->link_idle_time)) return; tt_record("homa_check_pacer calling homa_pacer_xmit"); diff --git a/homa_incoming.c b/homa_incoming.c index 41cc9daa..62b3e316 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -65,7 +65,7 @@ struct homa_gap *homa_gap_new(struct list_head *next, int start, int end) { struct homa_gap *gap; - gap = kmalloc(sizeof(struct homa_gap), GFP_KERNEL); + gap = kmalloc(sizeof(*gap), GFP_KERNEL); gap->start = start; gap->end = end; gap->time = get_cycles(); @@ -102,7 +102,7 @@ void homa_gap_retry(struct homa_rpc *rpc) */ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) { - struct data_header *h = (struct data_header *) skb->data; + struct data_header *h = (struct data_header *)skb->data; struct homa_gap *gap, *dummy, *gap2; int start = ntohl(h->seg.offset); int length = homa_data_len(skb); @@ -110,7 +110,7 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) if ((start + length) > rpc->msgin.length) { tt_record3("Packet extended past message end; id %d, offset %d, length %d", - rpc->id, start, length); + rpc->id, start, length); goto discard; } @@ -137,12 +137,12 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) continue; if (start < gap->start) { tt_record4("Packet overlaps gap start: id %d, start %d, end %d, gap_start %d", - rpc->id, start, end, gap->start); + rpc->id, start, end, gap->start); goto discard; } if (end > gap->end) { tt_record4("Packet overlaps gap end: id %d, start %d, end %d, gap_end %d", - rpc->id, start, end, gap->start); + rpc->id, start, end, gap->start); goto discard; } gap->start = end; @@ -161,7 +161,7 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) continue; if (end > gap->end) { tt_record4("Packet overlaps gap end: id %d, start %d, end %d, gap_end %d", - rpc->id, start, end, gap->start); + rpc->id, start, end, gap->start); goto discard; } gap->end = start; @@ -181,7 +181,7 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) else INC_METRIC(packet_discards, 1); tt_record4("homa_add_packet discarding packet for id %d, offset %d, length %d, retransmit %d", - rpc->id, start, length, h->retransmit); + rpc->id, start, length, h->retransmit); kfree_skb(skb); return; @@ -210,8 +210,10 @@ int homa_copy_to_user(struct homa_rpc *rpc) #define MAX_SKBS 20 #endif struct sk_buff *skbs[MAX_SKBS]; +#if 1 /* See strip.py */ int start_offset = 0; int end_offset = 0; +#endif /* See strip.py */ int error = 0; __u64 start; int n = 0; /* Number of filled entries in skbs. */ @@ -233,7 +235,7 @@ int homa_copy_to_user(struct homa_rpc *rpc) } skb = __skb_dequeue(&rpc->msgin.packets); - if (skb != NULL) { + if (skb) { skbs[n] = skb; n++; if (n < MAX_SKBS) @@ -250,7 +252,7 @@ int homa_copy_to_user(struct homa_rpc *rpc) homa_rpc_unlock(rpc); tt_record1("starting copy to user space for id %d", - rpc->id); + rpc->id); /* Each iteration of this loop copies out one skb. 
*/ for (i = 0; i < n; i++) { @@ -269,7 +271,7 @@ int homa_copy_to_user(struct homa_rpc *rpc) while (copied < pkt_length) { chunk_size = pkt_length - copied; dst = homa_pool_get_buffer(rpc, offset + copied, - &buf_bytes); + &buf_bytes); if (buf_bytes < chunk_size) { if (buf_bytes == 0) { /* skb has data beyond message @@ -280,50 +282,54 @@ int homa_copy_to_user(struct homa_rpc *rpc) chunk_size = buf_bytes; } error = import_ubuf(READ, dst, chunk_size, - &iter); + &iter); if (error) goto free_skbs; error = skb_copy_datagram_iter(skbs[i], - sizeof(*h) + copied, &iter, - chunk_size); + sizeof(*h) + copied, + &iter, + chunk_size); if (error) goto free_skbs; copied += chunk_size; } +#if 1 /* See strip.py */ if (end_offset == 0) { start_offset = offset; } else if (end_offset != offset) { tt_record3("copied out bytes %d-%d for id %d", - start_offset, end_offset, - rpc->id); + start_offset, end_offset, rpc->id); start_offset = offset; } end_offset = offset + pkt_length; +#endif /* See strip.py */ } free_skbs: +#if 1 /* See strip.py */ if (end_offset != 0) { tt_record3("copied out bytes %d-%d for id %d", - start_offset, end_offset, rpc->id); + start_offset, end_offset, rpc->id); end_offset = 0; } +#endif /* See strip.py */ start = get_cycles(); for (i = 0; i < n; i++) kfree_skb(skbs[i]); INC_METRIC(skb_free_cycles, get_cycles() - start); INC_METRIC(skb_frees, n); tt_record2("finished freeing %d skbs for id %d", - n, rpc->id); + n, rpc->id); n = 0; atomic_or(APP_NEEDS_LOCK, &rpc->flags); homa_rpc_lock(rpc, "homa_copy_to_user"); - atomic_andnot(APP_NEEDS_LOCK|RPC_COPYING_TO_USER, &rpc->flags); + atomic_andnot(APP_NEEDS_LOCK | RPC_COPYING_TO_USER, &rpc->flags); if (error) break; } if (error) tt_record2("homa_copy_to_user returning error %d for id %d", - -error, rpc->id); + -error, rpc->id); return error; } @@ -341,7 +347,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) #define MAX_ACKS 10 #endif const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); - struct data_header *h = (struct data_header *) skb->data; + struct data_header *h = (struct data_header *)skb->data; __u64 id = homa_local_id(h->common.sender_id); int dport = ntohs(h->common.dport); @@ -362,15 +368,14 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) if (!hsk) { if (skb_is_ipv6(skb)) icmp6_send(skb, ICMPV6_DEST_UNREACH, - ICMPV6_PORT_UNREACH, 0, NULL, - IP6CB(skb)); + ICMPV6_PORT_UNREACH, 0, NULL, IP6CB(skb)); else icmp_send(skb, ICMP_DEST_UNREACH, - ICMP_PORT_UNREACH, 0); + ICMP_PORT_UNREACH, 0); tt_record3("Discarding packet(s) for unknown port %u, id %llu, type %d", - dport, homa_local_id(h->common.sender_id), - h->common.type); - while (skb != NULL) { + dport, homa_local_id(h->common.sender_id), + h->common.type); + while (skb) { next = skb->next; kfree_skb(skb); skb = next; @@ -379,14 +384,14 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) } /* Each iteration through the following loop processes one packet. */ - for (; skb != NULL; skb = next) { - h = (struct data_header *) skb->data; + for (; skb; skb = next) { + h = (struct data_header *)skb->data; next = skb->next; /* Relinquish the RPC lock temporarily if it's needed * elsewhere. */ - if (rpc != NULL) { + if (rpc) { int flags = atomic_read(&rpc->flags); if (flags & APP_NEEDS_LOCK) { @@ -398,7 +403,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) } /* Find and lock the RPC if we haven't already done so. 
*/ - if (rpc == NULL) { + if (!rpc) { if (!homa_is_client(id)) { /* We are the server for this RPC. */ if (h->common.type == DATA) { @@ -408,7 +413,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) * already exist. */ rpc = homa_rpc_new_server(hsk, &saddr, - h, &created); + h, &created); if (IS_ERR(rpc)) { pr_warn("homa_pkt_dispatch couldn't create server rpc: error %lu", -PTR_ERR(rpc)); @@ -416,10 +421,11 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) rpc = NULL; goto discard; } - } else + } else { rpc = homa_find_server_rpc(hsk, &saddr, - ntohs(h->common.sport), - id); + ntohs(h->common.sport), + id); + } } else { rpc = homa_find_client_rpc(hsk, id); } @@ -446,7 +452,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) switch (h->common.type) { case DATA: - if (h->ack.client_id != 0) { + if (h->ack.client_id) { /* Save the ack for processing later, when we * have released the RPC lock. */ @@ -473,7 +479,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) case BUSY: INC_METRIC(packets_received[BUSY - DATA], 1); tt_record2("received BUSY for id %d, peer 0x%x", - id, tt_addr(rpc->peer->addr)); + id, tt_addr(rpc->peer->addr)); /* Nothing to do for these packets except reset * silent_ticks, which happened above. */ @@ -494,7 +500,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) /* It isn't safe to process more packets once we've * released the RPC lock (this should never happen). */ - BUG_ON(next != NULL); + BUG_ON(next); break; default: INC_METRIC(unknown_packet_types, 1); @@ -505,7 +511,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) discard: kfree_skb(skb); } - if (rpc != NULL) + if (rpc) homa_grant_check_rpc(rpc); while (num_acks > 0) { @@ -513,7 +519,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) homa_rpc_acked(hsk, &saddr, &acks[num_acks]); } - if (hsk->dead_skbs >= 2*hsk->homa->dead_buffs_limit) { + if (hsk->dead_skbs >= 2 * hsk->homa->dead_buffs_limit) { /* We get here if neither homa_wait_for_message * nor homa_timer can keep up with reaping dead * RPCs. See reap.txt for details. @@ -540,11 +546,11 @@ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) struct homa *homa = rpc->hsk->homa; tt_record4("incoming data packet, id %d, peer 0x%x, offset %d/%d", - homa_local_id(h->common.sender_id), - tt_addr(rpc->peer->addr), ntohl(h->seg.offset), - ntohl(h->message_length)); + homa_local_id(h->common.sender_id), + tt_addr(rpc->peer->addr), ntohl(h->seg.offset), + ntohl(h->message_length)); - if ((rpc->state != RPC_INCOMING) && homa_is_client(rpc->id)) { + if (rpc->state != RPC_INCOMING && homa_is_client(rpc->id)) { if (unlikely(rpc->state != RPC_OUTGOING)) goto discard; INC_METRIC(responses_received, 1); @@ -552,7 +558,7 @@ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) tt_record2("Incoming message for id %d has %d unscheduled bytes", rpc->id, ntohl(h->incoming)); if (homa_message_in_init(rpc, ntohl(h->message_length), - ntohl(h->incoming)) != 0) + ntohl(h->incoming)) != 0) goto discard; } else if (rpc->state != RPC_INCOMING) { /* Must be server; note that homa_rpc_new_server already @@ -569,17 +575,16 @@ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) * performance. 
*/ tt_record4("Dropping packet because no buffer space available: id %d, offset %d, length %d, old incoming %d", - rpc->id, ntohl(h->seg.offset), - homa_data_len(skb), - rpc->msgin.granted); + rpc->id, ntohl(h->seg.offset), homa_data_len(skb), + rpc->msgin.granted); INC_METRIC(dropped_data_no_bufs, homa_data_len(skb)); goto discard; } homa_add_packet(rpc, skb); - if ((skb_queue_len(&rpc->msgin.packets) != 0) - && !(atomic_read(&rpc->flags) & RPC_PKTS_READY)) { + if (skb_queue_len(&rpc->msgin.packets) != 0 && + !(atomic_read(&rpc->flags) & RPC_PKTS_READY)) { atomic_or(RPC_PKTS_READY, &rpc->flags); homa_sock_lock(rpc->hsk, "homa_data_pkt"); homa_rpc_handoff(rpc); @@ -655,30 +660,30 @@ void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc) * @hsk: Socket on which the packet was received. */ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, - struct homa_sock *hsk) + struct homa_sock *hsk) { - struct resend_header *h = (struct resend_header *) skb->data; + struct resend_header *h = (struct resend_header *)skb->data; +#if 1 /* See strip.py */ const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); +#endif /* See strip.py */ struct busy_header busy; - if (rpc == NULL) { + if (!rpc) { tt_record4("resend request for unknown id %d, peer 0x%x:%d, offset %d; responding with UNKNOWN", - homa_local_id(h->common.sender_id), - tt_addr(saddr), ntohs(h->common.sport), - ntohl(h->offset)); + homa_local_id(h->common.sender_id), tt_addr(saddr), + ntohs(h->common.sport), ntohl(h->offset)); homa_xmit_unknown(skb, hsk); goto done; } tt_record4("resend request for id %llu, offset %d, length %d, prio %d", - rpc->id, ntohl(h->offset), ntohl(h->length), - h->priority); + rpc->id, ntohl(h->offset), ntohl(h->length), h->priority); if (!homa_is_client(rpc->id) && rpc->state != RPC_OUTGOING) { /* We are the server for this RPC and don't yet have a * response packet, so just send BUSY. */ tt_record2("sending BUSY from resend, id %d, state %d", - rpc->id, rpc->state); + rpc->id, rpc->state); homa_xmit_control(BUSY, &busy, sizeof(busy), rpc); goto done; } @@ -691,16 +696,15 @@ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, rpc->msgout.granted); homa_xmit_control(BUSY, &busy, sizeof(busy), rpc); } else { - if (ntohl(h->length) == 0) { + if (ntohl(h->length) == 0) /* This RESEND is from a server just trying to determine * whether the client still cares about the RPC; return * BUSY so the server doesn't time us out. */ homa_xmit_control(BUSY, &busy, sizeof(busy), rpc); - } homa_resend_data(rpc, ntohl(h->offset), - ntohl(h->offset) + ntohl(h->length), - h->priority); + ntohl(h->offset) + ntohl(h->length), + h->priority); } done: @@ -716,20 +720,19 @@ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, void homa_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc) { tt_record3("Received unknown for id %llu, peer %x:%d", - rpc->id, tt_addr(rpc->peer->addr), rpc->dport); + rpc->id, tt_addr(rpc->peer->addr), rpc->dport); if (homa_is_client(rpc->id)) { if (rpc->state == RPC_OUTGOING) { /* It appears that everything we've already transmitted * has been lost; retransmit it. 
*/ tt_record4("Restarting id %d to server 0x%x:%d, lost %d bytes", - rpc->id, tt_addr(rpc->peer->addr), - rpc->dport, - rpc->msgout.next_xmit_offset); + rpc->id, tt_addr(rpc->peer->addr), + rpc->dport, rpc->msgout.next_xmit_offset); homa_freeze(rpc, RESTART_RPC, "Freezing because of RPC restart, id %d, peer 0x%x"); homa_resend_data(rpc, 0, rpc->msgout.next_xmit_offset, - homa_unsched_priority(rpc->hsk->homa, - rpc->peer, rpc->msgout.length)); + homa_unsched_priority(rpc->hsk->homa, + rpc->peer, rpc->msgout.length)); goto done; } @@ -737,8 +740,8 @@ void homa_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc) rpc->id, homa_print_ipv6_addr(&rpc->peer->addr), rpc->dport, rpc->state); tt_record4("Discarding unknown for RPC id %d, peer 0x%x:%d: bad state %d", - rpc->id, tt_addr(rpc->peer->addr), rpc->dport, - rpc->state); + rpc->id, tt_addr(rpc->peer->addr), rpc->dport, + rpc->state); } else { if (rpc->hsk->homa->verbose) pr_notice("Freeing rpc id %llu from client %s:%d: unknown to client", @@ -784,9 +787,9 @@ void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk) * RPC exists. The RPC has been locked by the caller. */ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, - struct homa_rpc *rpc) + struct homa_rpc *rpc) { - struct common_header *h = (struct common_header *) skb->data; + struct common_header *h = (struct common_header *)skb->data; const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); __u64 id = homa_local_id(h->sender_id); struct homa_peer *peer; @@ -798,12 +801,14 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, * for this RPC (the RPC still exists and we haven't received * the entire response), or if we can't find peer info. */ - if ((rpc != NULL) && ((rpc->state != RPC_INCOMING) - || rpc->msgin.bytes_remaining)) { + if (rpc && (rpc->state != RPC_INCOMING || + rpc->msgin.bytes_remaining)) { +#if 1 /* See strip.py */ tt_record3("NEED_ACK arrived for id %d before message received, state %d, remaining %d", - rpc->id, rpc->state, rpc->msgin.bytes_remaining); + rpc->id, rpc->state, rpc->msgin.bytes_remaining); homa_freeze(rpc, NEED_ACK_MISSING_DATA, - "Freezing because NEED_ACK received before message complete, id %d, peer 0x%x"); + "Freezing because NEED_ACK received before message complete, id %d, peer 0x%x"); +#endif /* See strip.py */ goto done; } else { peer = homa_peer_find(hsk->homa->peers, &saddr, &hsk->inet); @@ -822,10 +827,11 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, ack.common.urgent = htons(HOMA_TCP_URGENT); ack.common.sender_id = cpu_to_be64(id); ack.num_acks = htons(homa_peer_get_acks(peer, - HOMA_MAX_ACKS_PER_PKT, ack.acks)); + HOMA_MAX_ACKS_PER_PKT, + ack.acks)); __homa_xmit_control(&ack, sizeof(ack), peer, hsk); tt_record3("Responded to NEED_ACK for id %d, peer %0x%x with %d other acks", - id, tt_addr(saddr), ntohs(ack.num_acks)); + id, tt_addr(saddr), ntohs(ack.num_acks)); done: kfree_skb(skb); @@ -841,13 +847,13 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, * be unlocked here. 
*/ void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, - struct homa_rpc *rpc) + struct homa_rpc *rpc) { const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); - struct ack_header *h = (struct ack_header *) skb->data; + struct ack_header *h = (struct ack_header *)skb->data; int i, count; - if (rpc != NULL) { + if (rpc) { tt_record1("homa_ack_pkt freeing rpc id %d", rpc->id); homa_rpc_free(rpc); homa_rpc_unlock(rpc); @@ -857,8 +863,7 @@ void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, for (i = 0; i < count; i++) homa_rpc_acked(hsk, &saddr, &h->acks[i]); tt_record3("ACK received for id %d, peer 0x%x, with %d other acks", - homa_local_id(h->common.sender_id), - tt_addr(saddr), count); + homa_local_id(h->common.sender_id), tt_addr(saddr), count); kfree_skb(skb); } @@ -957,12 +962,12 @@ void homa_rpc_abort(struct homa_rpc *rpc, int error) if (!homa_is_client(rpc->id)) { INC_METRIC(server_rpc_discards, 1); tt_record3("aborting server RPC: peer 0x%x, id %d, error %d", - tt_addr(rpc->peer->addr), rpc->id, error); + tt_addr(rpc->peer->addr), rpc->id, error); homa_rpc_free(rpc); return; } tt_record3("aborting client RPC: peer 0x%x, id %d, error %d", - tt_addr(rpc->peer->addr), rpc->id, error); + tt_addr(rpc->peer->addr), rpc->id, error); rpc->error = error; homa_sock_lock(rpc->hsk, "homa_rpc_abort"); if (!rpc->hsk->shutdown) @@ -980,15 +985,15 @@ void homa_rpc_abort(struct homa_rpc *rpc, int error) * @error: Negative errno value indicating the reason for the abort. */ void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, - int port, int error) + int port, int error) { struct homa_socktab_scan scan; struct homa_rpc *rpc, *tmp; struct homa_sock *hsk; rcu_read_lock(); - for (hsk = homa_socktab_start_scan(homa->port_map, &scan); - hsk != NULL; hsk = homa_socktab_next(&scan)) { + for (hsk = homa_socktab_start_scan(homa->port_map, &scan); hsk; + hsk = homa_socktab_next(&scan)) { /* Skip the (expensive) lock acquisition if there's no * work to do. */ @@ -997,10 +1002,10 @@ void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, if (!homa_protect_rpcs(hsk)) continue; list_for_each_entry_safe(rpc, tmp, &hsk->active_rpcs, - active_links) { + active_links) { if (!ipv6_addr_equal(&rpc->peer->addr, addr)) continue; - if ((port != 0) && (rpc->dport != port)) + if (port && rpc->dport != port) continue; homa_rpc_lock(rpc, "rpc_abort_rpcs"); homa_rpc_abort(rpc, error); @@ -1039,8 +1044,8 @@ void homa_abort_sock_rpcs(struct homa_sock *hsk, int error) continue; } tt_record4("homa_abort_sock_rpcs aborting id %u on port %d, peer 0x%x, error %d", - rpc->id, hsk->port, - tt_addr(rpc->peer->addr), error); + rpc->id, hsk->port, + tt_addr(rpc->peer->addr), error); if (error) homa_rpc_abort(rpc, error); else @@ -1070,7 +1075,7 @@ void homa_abort_sock_rpcs(struct homa_sock *hsk, int error) * interest. 
*/ int homa_register_interests(struct homa_interest *interest, - struct homa_sock *hsk, int flags, __u64 id) + struct homa_sock *hsk, int flags, __u64 id) { struct homa_rpc *rpc = NULL; @@ -1080,9 +1085,9 @@ int homa_register_interests(struct homa_interest *interest, if (!homa_is_client(id)) return -EINVAL; rpc = homa_find_client_rpc(hsk, id); - if (rpc == NULL) + if (!rpc) return -EINVAL; - if ((rpc->interest != NULL) && (rpc->interest != interest)) { + if (rpc->interest && rpc->interest != interest) { homa_rpc_unlock(rpc); return -EINVAL; } @@ -1110,10 +1115,9 @@ int homa_register_interests(struct homa_interest *interest, interest->locked = 0; if (flags & HOMA_RECVMSG_RESPONSE) { if (!list_empty(&hsk->ready_responses)) { - rpc = list_first_entry( - &hsk->ready_responses, - struct homa_rpc, - ready_links); + rpc = list_first_entry(&hsk->ready_responses, + struct homa_rpc, + ready_links); goto claim_rpc; } /* Insert this thread at the *front* of the list; @@ -1122,12 +1126,12 @@ int homa_register_interests(struct homa_interest *interest, * round-robining between threads. Same below. */ list_add(&interest->response_links, - &hsk->response_interests); + &hsk->response_interests); } if (flags & HOMA_RECVMSG_REQUEST) { if (!list_empty(&hsk->ready_requests)) { rpc = list_first_entry(&hsk->ready_requests, - struct homa_rpc, ready_links); + struct homa_rpc, ready_links); /* Make sure the interest isn't on the response list; * otherwise it might receive a second RPC. */ @@ -1143,7 +1147,7 @@ int homa_register_interests(struct homa_interest *interest, claim_rpc: list_del_init(&rpc->ready_links); if (!list_empty(&hsk->ready_requests) || - !list_empty(&hsk->ready_responses)) { + !list_empty(&hsk->ready_responses)) { // There are still more RPCs available, so let Linux know. hsk->sock.sk_data_ready(&hsk->sock); } @@ -1161,7 +1165,7 @@ int homa_register_interests(struct homa_interest *interest, interest->locked = 1; } atomic_andnot(RPC_HANDING_OFF, &rpc->flags); - atomic_long_set_release(&interest->ready_rpc, (long) rpc); + atomic_long_set_release(&interest->ready_rpc, (long)rpc); return 0; } @@ -1179,7 +1183,7 @@ int homa_register_interests(struct homa_interest *interest, * errno value. The RPC will be locked; the caller must unlock. */ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, - __u64 id) + __u64 id) { int error, blocked = 0, polled = 0; struct homa_rpc *result = NULL; @@ -1195,7 +1199,7 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, */ while (1) { error = homa_register_interests(&interest, hsk, flags, id); - rpc = (struct homa_rpc *) atomic_long_read(&interest.ready_rpc); + rpc = (struct homa_rpc *)atomic_long_read(&interest.ready_rpc); if (rpc) goto found_rpc; if (error < 0) { @@ -1212,15 +1216,15 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, while (1) { int reaper_result; - rpc = (struct homa_rpc *) atomic_long_read( - &interest.ready_rpc); + rpc = (struct homa_rpc *)atomic_long_read(&interest + .ready_rpc); if (rpc) { tt_record1("received RPC handoff while reaping, id %d", - rpc->id); + rpc->id); goto found_rpc; } reaper_result = homa_rpc_reap(hsk, - hsk->homa->reap_limit); + hsk->homa->reap_limit); if (reaper_result == 0) break; @@ -1270,13 +1274,13 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, /* Now it's time to sleep. 
*/ per_cpu(homa_offload_core, interest.core).last_app_active = now; set_current_state(TASK_INTERRUPTIBLE); - rpc = (struct homa_rpc *) atomic_long_read(&interest.ready_rpc); + rpc = (struct homa_rpc *)atomic_long_read(&interest.ready_rpc); if (!rpc && !hsk->shutdown) { __u64 end; __u64 start = get_cycles(); tt_record1("homa_wait_for_message sleeping, pid %d", - current->pid); + current->pid); schedule(); end = get_cycles(); blocked = 1; @@ -1297,10 +1301,9 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, * so they have to be checked again after locking the socket. */ UNIT_HOOK("found_rpc"); - if ((interest.reg_rpc) - || (interest.request_links.next != LIST_POISON1) - || (interest.response_links.next - != LIST_POISON1)) { + if (interest.reg_rpc || + interest.request_links.next != LIST_POISON1 || + interest.response_links.next != LIST_POISON1) { homa_sock_lock(hsk, "homa_wait_for_message"); if (interest.reg_rpc) interest.reg_rpc->interest = NULL; @@ -1315,17 +1318,18 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, * this could have happened anytime up until we reset the * interests above). */ - rpc = (struct homa_rpc *) atomic_long_read(&interest.ready_rpc); + rpc = (struct homa_rpc *)atomic_long_read(&interest.ready_rpc); if (rpc) { tt_record2("homa_wait_for_message found rpc id %d, pid %d", - rpc->id, current->pid); + rpc->id, current->pid); if (!interest.locked) { atomic_or(APP_NEEDS_LOCK, &rpc->flags); homa_rpc_lock(rpc, "homa_wait_for_message"); - atomic_andnot(APP_NEEDS_LOCK|RPC_HANDING_OFF, - &rpc->flags); - } else + atomic_andnot(APP_NEEDS_LOCK | RPC_HANDING_OFF, + &rpc->flags); + } else { atomic_andnot(RPC_HANDING_OFF, &rpc->flags); + } if (!rpc->error) rpc->error = homa_copy_to_user(rpc); if (rpc->state == RPC_DEAD) { @@ -1335,8 +1339,8 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, if (rpc->error) goto done; atomic_andnot(RPC_PKTS_READY, &rpc->flags); - if ((rpc->msgin.bytes_remaining == 0) - && (!skb_queue_len(&rpc->msgin.packets))) + if (rpc->msgin.bytes_remaining == 0 && + !skb_queue_len(&rpc->msgin.packets)) goto done; homa_rpc_unlock(rpc); } @@ -1356,7 +1360,6 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, else if (polled) INC_METRIC(fast_wakeups, 1); return rpc; - } /** @@ -1373,7 +1376,7 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, * currently busy doing Homa transport work. */ struct homa_interest *homa_choose_interest(struct homa *homa, - struct list_head *head, int offset) + struct list_head *head, int offset) { __u64 busy_time = get_cycles() - homa->busy_cycles; struct homa_interest *backup = NULL; @@ -1382,8 +1385,8 @@ struct homa_interest *homa_choose_interest(struct homa *homa, list_for_each(pos, head) { interest = (struct homa_interest *) (((char *) pos) - offset); - if (per_cpu(homa_offload_core, interest->core).last_active - < busy_time) { + if (per_cpu(homa_offload_core, interest->core).last_active < + busy_time) { if (backup != NULL) INC_METRIC(handoffs_alt_thread, 1); return interest; @@ -1408,8 +1411,8 @@ void homa_rpc_handoff(struct homa_rpc *rpc) struct homa_sock *hsk = rpc->hsk; struct homa_interest *interest; - if ((atomic_read(&rpc->flags) & RPC_HANDING_OFF) - || !list_empty(&rpc->ready_links)) + if ((atomic_read(&rpc->flags) & RPC_HANDING_OFF) || + !list_empty(&rpc->ready_links)) return; /* First, see if someone is interested in this RPC specifically. 
@@ -1422,16 +1425,18 @@ void homa_rpc_handoff(struct homa_rpc *rpc) /* Second, check the interest list for this type of RPC. */ if (homa_is_client(rpc->id)) { interest = homa_choose_interest(hsk->homa, - &hsk->response_interests, - offsetof(struct homa_interest, response_links)); + &hsk->response_interests, + offsetof(struct homa_interest, + response_links)); if (interest) goto thread_waiting; list_add_tail(&rpc->ready_links, &hsk->ready_responses); INC_METRIC(responses_queued, 1); } else { interest = homa_choose_interest(hsk->homa, - &hsk->request_interests, - offsetof(struct homa_interest, request_links)); + &hsk->request_interests, + offsetof(struct homa_interest, + request_links)); if (interest) goto thread_waiting; list_add_tail(&rpc->ready_links, &hsk->ready_requests); @@ -1445,7 +1450,7 @@ void homa_rpc_handoff(struct homa_rpc *rpc) /* Notify the poll mechanism. */ hsk->sock.sk_data_ready(&hsk->sock); tt_record2("homa_rpc_handoff finished queuing id %d for port %d", - rpc->id, hsk->port); + rpc->id, hsk->port); return; thread_waiting: @@ -1458,9 +1463,8 @@ void homa_rpc_handoff(struct homa_rpc *rpc) interest->locked = 0; INC_METRIC(handoffs_thread_waiting, 1); tt_record3("homa_rpc_handoff handing off id %d to pid %d on core %d", - rpc->id, interest->thread->pid, - task_cpu(interest->thread)); - atomic_long_set_release(&interest->ready_rpc, (long) rpc); + rpc->id, interest->thread->pid, task_cpu(interest->thread)); + atomic_long_set_release(&interest->ready_rpc, (long)rpc); /* Update the last_app_active time for the thread's core, so Homa * will try to avoid doing any work there. @@ -1511,14 +1515,14 @@ void homa_incoming_sysctl_changed(struct homa *homa) homa->poll_cycles = tmp; tmp = homa->busy_usecs; - tmp = (tmp*cpu_khz)/1000; + tmp = (tmp * cpu_khz) / 1000; homa->busy_cycles = tmp; tmp = homa->gro_busy_usecs; - tmp = (tmp*cpu_khz)/1000; + tmp = (tmp * cpu_khz) / 1000; homa->gro_busy_cycles = tmp; tmp = homa->bpage_lease_usecs; - tmp = (tmp*cpu_khz)/1000; + tmp = (tmp * cpu_khz) / 1000; homa->bpage_lease_cycles = tmp; } diff --git a/homa_outgoing.c b/homa_outgoing.c index f49b9c88..31f7adcb 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -64,7 +64,7 @@ void homa_message_out_init(struct homa_rpc *rpc, int length) * space. */ int homa_fill_data_interleaved(struct homa_rpc *rpc, struct sk_buff *skb, - struct iov_iter *iter) + struct iov_iter *iter) { struct homa_skb_info *homa_info = homa_get_skb_info(skb); int seg_length = homa_info->seg_length; @@ -82,7 +82,7 @@ int homa_fill_data_interleaved(struct homa_rpc *rpc, struct sk_buff *skb, if (bytes_left < seg_length) seg_length = bytes_left; err = homa_skb_append_from_iter(rpc->hsk->homa, skb, iter, - seg_length); + seg_length); if (err != 0) return err; bytes_left -= seg_length; @@ -93,7 +93,7 @@ int homa_fill_data_interleaved(struct homa_rpc *rpc, struct sk_buff *skb, seg.offset = htonl(offset); err = homa_skb_append_to_frag(rpc->hsk->homa, skb, &seg, - sizeof(seg)); + sizeof(seg)); if (err != 0) return err; } @@ -119,8 +119,8 @@ int homa_fill_data_interleaved(struct homa_rpc *rpc, struct sk_buff *skb, * Return: A pointer to the new packet, or a negative errno. 
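 * When the packet holds several segments, seg_headers are typically
 * interleaved with the payload (see homa_fill_data_interleaved
 * above); a single-segment packet appends its payload directly
 * from @iter.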
*/ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, - struct iov_iter *iter, int offset, int length, - int max_seg_data) + struct iov_iter *iter, int offset, + int length, int max_seg_data) { struct homa_skb_info *homa_info; int segs, err, gso_size; @@ -171,7 +171,7 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, } else { gso_size = max_seg_data; err = homa_skb_append_from_iter(rpc->hsk->homa, skb, iter, - length); + length); } if (err) goto error; @@ -236,10 +236,10 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) int err; homa_message_out_init(rpc, iter->count); - if (unlikely((rpc->msgout.length > HOMA_MAX_MESSAGE_LENGTH) - || (rpc->msgout.length == 0))) { + if (unlikely(rpc->msgout.length > HOMA_MAX_MESSAGE_LENGTH || + rpc->msgout.length == 0)) { tt_record2("homa_message_out_fill found bad length %d for id %d", - rpc->msgout.length, rpc->id); + rpc->msgout.length, rpc->id); err = -EINVAL; goto error; } @@ -255,21 +255,21 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) /* Round gso_size down to an even # of mtus. */ segs_per_gso = (gso_size - rpc->hsk->ip_header_length - - sizeof(struct data_header))/max_seg_data; + - sizeof(struct data_header)) / max_seg_data; if (segs_per_gso == 0) segs_per_gso = 1; max_gso_data = segs_per_gso * max_seg_data; UNIT_LOG("; ", "mtu %d, max_seg_data %d, max_gso_data %d", - mtu, max_seg_data, max_gso_data); + mtu, max_seg_data, max_gso_data); - overlap_xmit = rpc->msgout.length > 2*max_gso_data; + overlap_xmit = rpc->msgout.length > 2 * max_gso_data; rpc->msgout.granted = rpc->msgout.unscheduled; atomic_or(RPC_COPYING_FROM_USER, &rpc->flags); homa_skb_stash_pages(rpc->hsk->homa, rpc->msgout.length); /* Each iteration of the loop below creates one GSO packet. */ tt_record3("starting copy from user space for id %d, length %d, unscheduled %d", - rpc->id, rpc->msgout.length, rpc->msgout.unscheduled); + rpc->id, rpc->msgout.length, rpc->msgout.unscheduled); last_link = &rpc->msgout.packets; for (bytes_left = rpc->msgout.length; bytes_left > 0; ) { int skb_data_bytes, offset; @@ -289,7 +289,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) if (skb_data_bytes > bytes_left) skb_data_bytes = bytes_left; skb = homa_new_data_packet(rpc, iter, offset, skb_data_bytes, - max_seg_data); + max_seg_data); if (unlikely(!skb)) { err = PTR_ERR(skb); homa_rpc_lock(rpc, "homa_message_out_fill"); @@ -316,7 +316,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) } } tt_record2("finished copy from user space for id %d, length %d", - rpc->id, rpc->msgout.length); + rpc->id, rpc->msgout.length); atomic_andnot(RPC_COPYING_FROM_USER, &rpc->flags); INC_METRIC(sent_msg_bytes, rpc->msgout.length); if (!overlap_xmit && xmit) @@ -343,9 +343,9 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) * was a problem. */ int homa_xmit_control(enum homa_packet_type type, void *contents, - size_t length, struct homa_rpc *rpc) + size_t length, struct homa_rpc *rpc) { - struct common_header *h = (struct common_header *) contents; + struct common_header *h = contents; h->type = type; h->sport = htons(rpc->hsk->port); @@ -370,9 +370,11 @@ int homa_xmit_control(enum homa_packet_type type, void *contents, * was a problem. 
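 * Control packets are always sent at the highest available priority
 * level (num_priorities - 1), and short packets are padded out to
 * HOMA_MIN_PKT_LENGTH before transmission.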
*/ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, - struct homa_sock *hsk) + struct homa_sock *hsk) { +#if 1 /* See strip.py */ struct netdev_queue *txq; +#endif /* See strip.py */ struct common_header *h; struct dst_entry *dst; int result, priority; @@ -386,20 +388,21 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, dst_hold(dst); skb_dst_set(skb, dst); - h = (struct common_header *) skb_put(skb, length); + h = skb_put(skb, length); memcpy(h, contents, length); extra_bytes = HOMA_MIN_PKT_LENGTH - length; if (extra_bytes > 0) { memset(skb_put(skb, extra_bytes), 0, extra_bytes); UNIT_LOG(",", "padded control packet with %d bytes", - extra_bytes); + extra_bytes); } priority = hsk->homa->num_priorities-1; skb->ooo_okay = 1; skb_get(skb); if (hsk->inet.sk.sk_family == AF_INET6) { result = ip6_xmit(&hsk->inet.sk, skb, &peer->flow.u.ip6, 0, - NULL, hsk->homa->priority_map[priority] << 4, 0); + NULL, hsk->homa->priority_map[priority] << 4, + 0); } else { /* This will find its way to the DSCP field in the IPv4 hdr. */ hsk->inet.tos = hsk->homa->priority_map[priority]<<5; @@ -417,23 +420,33 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, * a bogus "reference count"). */ if (refcount_read(&skb->users) > 1) { +#if 1 /* See strip.py */ if (hsk->inet.sk.sk_family == AF_INET6) { pr_notice("ip6_xmit didn't free Homa control packet (type %d) after error %d\n", - h->type, result); + h->type, result); } else { pr_notice("ip_queue_xmit didn't free Homa control packet (type %d) after error %d\n", - h->type, result); + h->type, result); tt_record2("ip_queue_xmit didn't free Homa control packet (type %d) after error %d\n", - h->type, result); + h->type, result); } +#else /* See strip.py */ + if (hsk->inet.sk.sk_family == AF_INET6) + pr_notice("ip6_xmit didn't free Homa control packet (type %d) after error %d\n", + h->type, result); + else + pr_notice("ip_queue_xmit didn't free Homa control packet (type %d) after error %d\n", + h->type, result); +#endif /* See strip.py */ } } +#if 1 /* See strip.py */ txq = netdev_get_tx_queue(skb->dev, skb->queue_mapping); if (netif_tx_queue_stopped(txq)) tt_record4("__homa_xmit_control found stopped txq for id %d, qid %d, num_queued %d, limit %d", - be64_to_cpu(h->sender_id), - skb->queue_mapping, txq->dql.num_queued, - txq->dql.adj_limit); + be64_to_cpu(h->sender_id), skb->queue_mapping, + txq->dql.num_queued, txq->dql.adj_limit); +#endif /* See strip.py */ INC_METRIC(packets_sent[h->type - DATA], 1); INC_METRIC(priority_bytes[priority], skb->len); INC_METRIC(priority_packets[priority], 1); @@ -449,7 +462,7 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, */ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk) { - struct common_header *h = (struct common_header *) skb->data; + struct common_header *h = (struct common_header *)skb->data; struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); struct unknown_header unknown; struct homa_peer *peer; @@ -459,8 +472,8 @@ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk) homa_print_ipv6_addr(&saddr), ntohs(h->sport), homa_local_id(h->sender_id)); tt_record3("sending unknown to 0x%x:%d for id %llu", - tt_addr(saddr), ntohs(h->sport), - homa_local_id(h->sender_id)); + tt_addr(saddr), ntohs(h->sport), + homa_local_id(h->sender_id)); unknown.common.sport = h->dport; unknown.common.dport = h->sport; unknown.common.type = UNKNOWN; @@ -490,7 +503,9 @@ void homa_xmit_unknown(struct 
sk_buff *skb, struct homa_sock *hsk) void homa_xmit_data(struct homa_rpc *rpc, bool force) { struct homa *homa = rpc->hsk->homa; +#if 1 /* See strip.py */ struct netdev_queue *txq; +#endif /* See strip.py */ atomic_inc(&rpc->msgout.active_xmits); while (*rpc->msgout.next_xmit) { @@ -507,7 +522,8 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) if ((rpc->msgout.length - rpc->msgout.next_xmit_offset) >= homa->throttle_min_bytes) { if (!homa_check_nic_queue(homa, skb, force)) { - tt_record1("homa_xmit_data adding id %u to throttle queue", rpc->id); + tt_record1("homa_xmit_data adding id %u to throttle queue", + rpc->id); homa_add_to_throttled(rpc); break; } @@ -526,11 +542,13 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) homa_rpc_unlock(rpc); skb_get(skb); __homa_xmit_data(skb, rpc, priority); +#if 1 /* See strip.py */ txq = netdev_get_tx_queue(skb->dev, skb->queue_mapping); if (netif_tx_queue_stopped(txq)) tt_record4("homa_xmit_data found stopped txq for id %d, qid %d, num_queued %d, limit %d", - rpc->id, skb->queue_mapping, - txq->dql.num_queued, txq->dql.adj_limit); + rpc->id, skb->queue_mapping, + txq->dql.num_queued, txq->dql.adj_limit); +#endif /* See strip.py */ force = false; homa_rpc_lock(rpc, "homa_xmit_data"); if (rpc->state == RPC_DEAD) @@ -549,7 +567,9 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) */ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority) { +#if 1 /* See strip.py */ struct homa_skb_info *homa_info = homa_get_skb_info(skb); +#endif /* See strip.py */ struct dst_entry *dst; int err; @@ -569,25 +589,24 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority) skb->csum_offset = offsetof(struct common_header, checksum); if (rpc->hsk->inet.sk.sk_family == AF_INET6) { tt_record4("calling ip6_xmit: wire_bytes %d, peer 0x%x, id %d, offset %d", - homa_get_skb_info(skb)->wire_bytes, - tt_addr(rpc->peer->addr), rpc->id, - homa_info->offset); + homa_get_skb_info(skb)->wire_bytes, + tt_addr(rpc->peer->addr), rpc->id, + homa_info->offset); err = ip6_xmit(&rpc->hsk->inet.sk, skb, &rpc->peer->flow.u.ip6, - 0, NULL, - rpc->hsk->homa->priority_map[priority] << 4, 0); + 0, NULL, + rpc->hsk->homa->priority_map[priority] << 4, 0); } else { tt_record4("calling ip_queue_xmit: wire_bytes %d, peer 0x%x, id %d, offset %d", - homa_get_skb_info(skb)->wire_bytes, - tt_addr(rpc->peer->addr), rpc->id, - homa_info->offset); + homa_get_skb_info(skb)->wire_bytes, + tt_addr(rpc->peer->addr), rpc->id, + homa_info->offset); rpc->hsk->inet.tos = rpc->hsk->homa->priority_map[priority]<<5; err = ip_queue_xmit(&rpc->hsk->inet.sk, skb, &rpc->peer->flow); } tt_record4("Finished queueing packet: rpc id %llu, offset %d, len %d, qid %d", - rpc->id, homa_info->offset, - homa_get_skb_info(skb)->data_bytes, - skb->queue_mapping); + rpc->id, homa_info->offset, + homa_get_skb_info(skb)->data_bytes, skb->queue_mapping); if (err) INC_METRIC(data_xmit_errors, 1); INC_METRIC(packets_sent[0], 1); @@ -617,8 +636,7 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end, /* Each iteration of this loop checks one packet in the message * to see if it contains segments that need to be retransmitted. 
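 * Retransmission never reuses an original skb: for each overlapping
 * segment a fresh skb is allocated, the data_header is copied into it
 * with the retransmit flag set, and the payload is copied over with
 * homa_skb_append_from_skb.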
*/ - for (skb = rpc->msgout.packets; skb != NULL; - skb = homa_info->next_skb) { + for (skb = rpc->msgout.packets; skb; skb = homa_info->next_skb) { int seg_offset, offset, seg_length, data_left; struct data_header *h; @@ -632,15 +650,15 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end, offset = homa_info->offset; seg_offset = sizeof32(struct data_header); data_left = homa_info->data_bytes; - if (skb_shinfo(skb)->gso_segs <= 1) + if (skb_shinfo(skb)->gso_segs <= 1) { seg_length = data_left; - else { + } else { seg_length = homa_info->seg_length; - h = (struct data_header *) skb_transport_header(skb); + h = (struct data_header *)skb_transport_header(skb); } for ( ; data_left > 0; data_left -= seg_length, - offset += seg_length, - seg_offset += skb_shinfo(skb)->gso_size) { + offset += seg_length, + seg_offset += skb_shinfo(skb)->gso_size) { struct homa_skb_info *new_homa_info; struct sk_buff *new_skb; int err; @@ -663,9 +681,8 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end, UNIT_LOG("; ", "skb allocation error"); goto resend_done; } - h = (struct data_header *) __skb_put_data(new_skb, - skb_transport_header(skb), - sizeof32(struct data_header)); + h = __skb_put_data(new_skb, skb_transport_header(skb), + sizeof32(struct data_header)); h->common.sequence = htonl(offset); h->seg.offset = htonl(offset); h->retransmit = 1; @@ -676,12 +693,13 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end, else h->incoming = htonl(offset + seg_length); err = homa_skb_append_from_skb(rpc->hsk->homa, new_skb, - skb, seg_offset, seg_length); + skb, seg_offset, + seg_length); if (err != 0) { pr_err("%s got error %d from homa_skb_append_from_skb\n", - __func__, err); + __func__, err); UNIT_LOG("; ", "%s got error %d while copying data", - __func__, -err); + __func__, -err); kfree_skb(new_skb); goto resend_done; } @@ -694,7 +712,7 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end, new_homa_info->seg_length = seg_length; new_homa_info->offset = offset; tt_record3("retransmitting offset %d, length %d, id %d", - offset, seg_length, rpc->id); + offset, seg_length, rpc->id); homa_check_nic_queue(rpc->hsk->homa, new_skb, true); __homa_xmit_data(new_skb, rpc, priority); INC_METRIC(resent_packets, 1); @@ -716,10 +734,10 @@ void homa_outgoing_sysctl_changed(struct homa *homa) /* Code below is written carefully to avoid integer underflow or * overflow under expected usage patterns. Be careful when changing! 
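 * A worked example with hypothetical numbers: on a 2 GHz machine
 * (cpu_khz = 2000000) with link_mbps = 25000,
 *
 *	cycles_per_kbyte = (8 * 2000000) / 25000 = 640
 *	cycles_per_kbyte = (101 * 640) / 100     = 646
 *
 * so the pacer charges about 646 cycles of link time per KB sent;
 * the 101/100 scaling leaves a small margin of slack.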
*/ - homa->cycles_per_kbyte = (8*(__u64) cpu_khz)/homa->link_mbps; - homa->cycles_per_kbyte = (101*homa->cycles_per_kbyte)/100; + homa->cycles_per_kbyte = (8 * (__u64)cpu_khz) / homa->link_mbps; + homa->cycles_per_kbyte = (101 * homa->cycles_per_kbyte) / 100; tmp = homa->max_nic_queue_ns; - tmp = (tmp*cpu_khz)/1000000; + tmp = (tmp * cpu_khz) / 1000000; homa->max_nic_queue_cycles = tmp; } @@ -745,15 +763,16 @@ int homa_check_nic_queue(struct homa *homa, struct sk_buff *skb, bool force) __u64 idle, new_idle, clock; bytes = homa_get_skb_info(skb)->wire_bytes; - cycles_for_packet = (bytes * homa->cycles_per_kbyte)/1000; + cycles_for_packet = (bytes * homa->cycles_per_kbyte) / 1000; while (1) { clock = get_cycles(); idle = atomic64_read(&homa->link_idle_time); - if (((clock + homa->max_nic_queue_cycles) < idle) && !force - && !(homa->flags & HOMA_FLAG_DONT_THROTTLE)) + if ((clock + homa->max_nic_queue_cycles) < idle && !force && + !(homa->flags & HOMA_FLAG_DONT_THROTTLE)) return 0; if (!list_empty(&homa->throttled_rpcs)) INC_METRIC(pacer_bytes, bytes); +#if 1 /* See strip.py */ if (idle < clock) { if (homa->pacer_wake_time) { __u64 lost = (homa->pacer_wake_time > idle) @@ -763,12 +782,19 @@ int homa_check_nic_queue(struct homa *homa, struct sk_buff *skb, bool force) tt_record1("pacer lost %d cycles", lost); } new_idle = clock + cycles_for_packet; - } else + } else { + new_idle = idle + cycles_for_packet; + } +#else /* See strip.py */ + if (idle < clock) + new_idle = clock + cycles_for_packet; + else new_idle = idle + cycles_for_packet; +#endif /* See strip.py */ /* This method must be thread-safe. */ if (atomic64_cmpxchg_relaxed(&homa->link_idle_time, idle, - new_idle) == idle) + new_idle) == idle) break; } return 1; @@ -776,13 +802,13 @@ int homa_check_nic_queue(struct homa *homa, struct sk_buff *skb, bool force) /** * homa_pacer_main() - Top-level function for the pacer thread. - * @transportInfo: Pointer to struct homa. + * @transport: Pointer to struct homa. * * Return: Always 0. */ -int homa_pacer_main(void *transportInfo) +int homa_pacer_main(void *transport) { - struct homa *homa = (struct homa *) transportInfo; + struct homa *homa = (struct homa *)transport; homa->pacer_wake_time = get_cycles(); while (1) { @@ -799,10 +825,16 @@ int homa_pacer_main(void *transportInfo) * incoming packets from being handled). 
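 * This is the same race-free sleep idiom used elsewhere in Homa:
 * mark the task TASK_INTERRUPTIBLE, then re-check the throttled list
 * and revert to TASK_RUNNING if RPCs showed up in the meantime, so a
 * wakeup cannot slip between the check and schedule().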
*/ set_current_state(TASK_INTERRUPTIBLE); +#if 1 /* See strip.py */ if (list_first_or_null_rcu(&homa->throttled_rpcs, - struct homa_rpc, throttled_links) == NULL) + struct homa_rpc, throttled_links) == NULL) tt_record("pacer sleeping"); else +#else /* See strip.py */ + if (list_first_or_null_rcu(&homa->throttled_rpcs, + struct homa_rpc, + throttled_links) != NULL) +#endif /* See strip.py */ __set_current_state(TASK_RUNNING); INC_METRIC(pacer_cycles, get_cycles() - homa->pacer_wake_time); homa->pacer_wake_time = 0; @@ -881,21 +913,23 @@ void homa_pacer_xmit(struct homa *homa) homa->pacer_fifo_count += 1000; rpc = NULL; list_for_each_entry_rcu(cur, &homa->throttled_rpcs, - throttled_links) { + throttled_links) { if (cur->msgout.init_cycles < oldest) { rpc = cur; oldest = cur->msgout.init_cycles; } } - } else + } else { rpc = list_first_or_null_rcu(&homa->throttled_rpcs, - struct homa_rpc, throttled_links); - if (rpc == NULL) { + struct homa_rpc, + throttled_links); + } + if (!rpc) { homa_throttle_unlock(homa); break; } if (!homa_bucket_try_lock(rpc->bucket, rpc->id, - "homa_pacer_xmit")) { + "homa_pacer_xmit")) { homa_throttle_unlock(homa); INC_METRIC(pacer_skipped_rpcs, 1); break; @@ -903,9 +937,9 @@ void homa_pacer_xmit(struct homa *homa) homa_throttle_unlock(homa); tt_record4("pacer calling homa_xmit_data for rpc id %llu, port %d, offset %d, bytes_left %d", - rpc->id, rpc->hsk->port, - rpc->msgout.next_xmit_offset, - rpc->msgout.length - rpc->msgout.next_xmit_offset); + rpc->id, rpc->hsk->port, + rpc->msgout.next_xmit_offset, + rpc->msgout.length - rpc->msgout.next_xmit_offset); homa_xmit_data(rpc, true); /* Note: rpc->state could be RPC_DEAD here, but the code @@ -919,8 +953,7 @@ void homa_pacer_xmit(struct homa *homa) homa_throttle_lock(homa); if (!list_empty(&rpc->throttled_links)) { tt_record2("pacer removing id %d from throttled list, offset %d", - rpc->id, - rpc->msgout.next_xmit_offset); + rpc->id, rpc->msgout.next_xmit_offset); list_del_rcu(&rpc->throttled_links); if (list_empty(&homa->throttled_rpcs)) INC_METRIC(throttled_cycles, get_cycles() @@ -980,7 +1013,7 @@ void homa_add_to_throttled(struct homa_rpc *rpc) bytes_left = rpc->msgout.length - rpc->msgout.next_xmit_offset; homa_throttle_lock(homa); list_for_each_entry_rcu(candidate, &homa->throttled_rpcs, - throttled_links) { + throttled_links) { int bytes_left_cand; checks++; @@ -992,7 +1025,7 @@ void homa_add_to_throttled(struct homa_rpc *rpc) candidate->msgout.next_xmit_offset; if (bytes_left_cand > bytes_left) { list_add_tail_rcu(&rpc->throttled_links, - &candidate->throttled_links); + &candidate->throttled_links); goto done; } } diff --git a/homa_peer.c b/homa_peer.c index 60c0989a..0294867c 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -24,8 +24,8 @@ int homa_peertab_init(struct homa_peertab *peertab) spin_lock_init(&peertab->write_lock); INIT_LIST_HEAD(&peertab->dead_dsts); - peertab->buckets = vmalloc( - HOMA_PEERTAB_BUCKETS * sizeof(*peertab->buckets)); + peertab->buckets = vmalloc(HOMA_PEERTAB_BUCKETS * + sizeof(*peertab->buckets)); if (!peertab->buckets) return -ENOMEM; for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) @@ -51,7 +51,7 @@ void homa_peertab_destroy(struct homa_peertab *peertab) for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) { hlist_for_each_entry_safe(peer, next, &peertab->buckets[i], - peertab_links) { + peertab_links) { dst_release(peer->dst); kfree(peer); } @@ -118,9 +118,9 @@ struct homa_peer **homa_peertab_get_peers(struct homa_peertab *peertab, void homa_peertab_gc_dsts(struct homa_peertab *peertab, __u64 
now) { while (!list_empty(&peertab->dead_dsts)) { - struct homa_dead_dst *dead = list_first_entry( - &peertab->dead_dsts, struct homa_dead_dst, - dst_links); + struct homa_dead_dst *dead = list_first_entry(&peertab->dead_dsts, + struct homa_dead_dst, + dst_links); if (dead->gc_time > now) break; dst_release(dead->dst); @@ -143,7 +143,8 @@ void homa_peertab_gc_dsts(struct homa_peertab *peertab, __u64 now) * homa_peertab_destroy. */ struct homa_peer *homa_peer_find(struct homa_peertab *peertab, - const struct in6_addr *addr, struct inet_sock *inet) + const struct in6_addr *addr, + struct inet_sock *inet) { /* Note: this function uses RCU operators to ensure safety even * if a concurrent call is adding a new entry. @@ -158,7 +159,7 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab, bucket ^= hash_32(addr->in6_u.u6_addr32[2], HOMA_PEERTAB_BUCKET_BITS); bucket ^= hash_32(addr->in6_u.u6_addr32[3], HOMA_PEERTAB_BUCKET_BITS); hlist_for_each_entry_rcu(peer, &peertab->buckets[bucket], - peertab_links) { + peertab_links) { if (ipv6_addr_equal(&peer->addr, addr)) return peer; INC_METRIC(peer_hash_links, 1); @@ -172,13 +173,13 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab, */ spin_lock_bh(&peertab->write_lock); hlist_for_each_entry_rcu(peer, &peertab->buckets[bucket], - peertab_links) { + peertab_links) { if (ipv6_addr_equal(&peer->addr, addr)) goto done; } peer = kmalloc(sizeof(*peer), GFP_ATOMIC); if (!peer) { - peer = (struct homa_peer *) ERR_PTR(-ENOMEM); + peer = (struct homa_peer *)ERR_PTR(-ENOMEM); INC_METRIC(peer_kmalloc_errors, 1); goto done; } @@ -186,7 +187,7 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab, dst = homa_peer_get_dst(peer, inet); if (IS_ERR(dst)) { kfree(peer); - peer = (struct homa_peer *) PTR_ERR(dst); + peer = (struct homa_peer *)PTR_ERR(dst); INC_METRIC(peer_route_errors, 1); goto done; } @@ -221,7 +222,7 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab, * @hsk: Socket that will be used to transmit data to the peer. */ void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, - struct homa_sock *hsk) + struct homa_sock *hsk) { struct dst_entry *dst; @@ -246,7 +247,7 @@ void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, __u64 now = get_cycles(); dead->dst = peer->dst; - dead->gc_time = now + (cpu_khz<<7); + dead->gc_time = now + (cpu_khz << 7); list_add_tail(&dead->dst_links, &peertab->dead_dsts); homa_peertab_gc_dsts(peertab, now); } @@ -285,20 +286,20 @@ int homa_unsched_priority(struct homa *homa, struct homa_peer *peer, * Return: The dst structure (or an ERR_PTR). 
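 * For AF_INET sockets this reduces to filling in a flowi4 with
 * flowi4_init_output and asking ip_route_output_flow for a route;
 * IPv6 sockets go through the corresponding flowi6-based lookup.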
*/ struct dst_entry *homa_peer_get_dst(struct homa_peer *peer, - struct inet_sock *inet) + struct inet_sock *inet) { memset(&peer->flow, 0, sizeof(peer->flow)); if (inet->sk.sk_family == AF_INET) { struct rtable *rt; flowi4_init_output(&peer->flow.u.ip4, inet->sk.sk_bound_dev_if, - inet->sk.sk_mark, inet->tos, RT_SCOPE_UNIVERSE, - inet->sk.sk_protocol, 0, - peer->addr.in6_u.u6_addr32[3], inet->inet_saddr, - 0, 0, inet->sk.sk_uid); + inet->sk.sk_mark, inet->tos, RT_SCOPE_UNIVERSE, + inet->sk.sk_protocol, 0, + peer->addr.in6_u.u6_addr32[3], inet->inet_saddr, + 0, 0, inet->sk.sk_uid); security_sk_classify_flow(&inet->sk, &peer->flow.u.__fl_common); rt = ip_route_output_flow(sock_net(&inet->sk), - &peer->flow.u.ip4, &inet->sk); + &peer->flow.u.ip4, &inet->sk); if (IS_ERR(rt)) return (struct dst_entry *)(PTR_ERR(rt)); return &rt->dst; @@ -420,7 +421,7 @@ int homa_peer_get_acks(struct homa_peer *peer, int count, struct homa_ack *dst) if (count > peer->num_acks) count = peer->num_acks; memcpy(dst, &peer->acks[peer->num_acks - count], - count * sizeof(peer->acks[0])); + count * sizeof(peer->acks[0])); peer->num_acks -= count; homa_peer_unlock(peer); diff --git a/homa_peer.h b/homa_peer.h index b020a88d..e26574c8 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -40,7 +40,7 @@ struct homa_dead_dst { #define HOMA_PEERTAB_BUCKET_BITS 16 /** define HOME_PEERTAB_BUCKETS - Number of buckets in a homa_peertab. */ -#define HOMA_PEERTAB_BUCKETS (1 << HOMA_PEERTAB_BUCKET_BITS) +#define HOMA_PEERTAB_BUCKETS BIT(HOMA_PEERTAB_BUCKET_BITS) /** * struct homa_peertab - A hash table that maps from IPv6 addresses @@ -195,26 +195,26 @@ struct homa_peer { spinlock_t ack_lock; }; -extern void homa_dst_refresh(struct homa_peertab *peertab, - struct homa_peer *peer, struct homa_sock *hsk); -extern void homa_peertab_destroy(struct homa_peertab *peertab); -extern struct homa_peer ** +void homa_dst_refresh(struct homa_peertab *peertab, + struct homa_peer *peer, struct homa_sock *hsk); +void homa_peertab_destroy(struct homa_peertab *peertab); +struct homa_peer ** homa_peertab_get_peers(struct homa_peertab *peertab, - int *num_peers); -extern int homa_peertab_init(struct homa_peertab *peertab); -extern void homa_peer_add_ack(struct homa_rpc *rpc); -extern struct homa_peer + int *num_peers); +int homa_peertab_init(struct homa_peertab *peertab); +void homa_peer_add_ack(struct homa_rpc *rpc); +struct homa_peer *homa_peer_find(struct homa_peertab *peertab, - const struct in6_addr *addr, struct inet_sock *inet); -extern int homa_peer_get_acks(struct homa_peer *peer, int count, - struct homa_ack *dst); -extern struct dst_entry + const struct in6_addr *addr, struct inet_sock *inet); +int homa_peer_get_acks(struct homa_peer *peer, int count, + struct homa_ack *dst); +struct dst_entry *homa_peer_get_dst(struct homa_peer *peer, - struct inet_sock *inet); -extern void homa_peer_lock_slow(struct homa_peer *peer); -extern void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, - int c2, int c3, int c4, int c5, int c6, int c7); -extern void homa_peertab_gc_dsts(struct homa_peertab *peertab, __u64 now); + struct inet_sock *inet); +void homa_peer_lock_slow(struct homa_peer *peer); +void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, + int c2, int c3, int c4, int c5, int c6, int c7); +void homa_peertab_gc_dsts(struct homa_peertab *peertab, __u64 now); /** * homa_peer_lock() - Acquire the lock for a peer's @unacked_lock. 
If the lock @@ -244,7 +244,7 @@ static inline void homa_peer_unlock(struct homa_peer *peer) * Return Up-to-date destination for peer. */ static inline struct dst_entry *homa_get_dst(struct homa_peer *peer, - struct homa_sock *hsk) + struct homa_sock *hsk) { if (unlikely(peer->dst->obsolete > 0)) homa_dst_refresh(hsk->homa->peers, peer, hsk); diff --git a/homa_plumbing.c b/homa_plumbing.c index 095f495b..769d88cc 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -173,7 +173,7 @@ static struct net_protocol homa_protocol = { static struct inet6_protocol homav6_protocol = { .handler = homa_softirq, .err_handler = homa_err_handler_v6, - .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, + .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, }; /* Describes file operations implemented for /proc/net/homa_metrics. */ @@ -521,24 +521,24 @@ static int __init homa_load(void) pr_notice("Homa module loading\n"); pr_notice("Homa structure sizes: data_header %u, seg_header %u, ack %u, grant_header %u, peer %u, ip_hdr %u flowi %u ipv6_hdr %u, flowi6 %u tcp_sock %u homa_rpc %u sk_buff %u rcvmsg_control %u union sockaddr_in_union %u HOMA_MAX_BPAGES %u NR_CPUS %u nr_cpu_ids %u, MAX_NUMNODES %d\n", - sizeof32(struct data_header), - sizeof32(struct seg_header), - sizeof32(struct homa_ack), - sizeof32(struct grant_header), - sizeof32(struct homa_peer), - sizeof32(struct iphdr), - sizeof32(struct flowi), - sizeof32(struct ipv6hdr), - sizeof32(struct flowi6), - sizeof32(struct tcp_sock), - sizeof32(struct homa_rpc), - sizeof32(struct sk_buff), - sizeof32(struct homa_recvmsg_args), - sizeof32(union sockaddr_in_union), - HOMA_MAX_BPAGES, - NR_CPUS, - nr_cpu_ids, - MAX_NUMNODES); + sizeof32(struct data_header), + sizeof32(struct seg_header), + sizeof32(struct homa_ack), + sizeof32(struct grant_header), + sizeof32(struct homa_peer), + sizeof32(struct iphdr), + sizeof32(struct flowi), + sizeof32(struct ipv6hdr), + sizeof32(struct flowi6), + sizeof32(struct tcp_sock), + sizeof32(struct homa_rpc), + sizeof32(struct sk_buff), + sizeof32(struct homa_recvmsg_args), + sizeof32(union sockaddr_in_union), + HOMA_MAX_BPAGES, + NR_CPUS, + nr_cpu_ids, + MAX_NUMNODES); status = proto_register(&homa_prot, 1); if (status != 0) { pr_err("proto_register failed for homa_prot: %d\n", status); @@ -554,13 +554,13 @@ static int __init homa_load(void) status = inet_add_protocol(&homa_protocol, IPPROTO_HOMA); if (status != 0) { pr_err("inet_add_protocol failed in %s: %d\n", __func__, - status); + status); goto out_cleanup; } status = inet6_add_protocol(&homav6_protocol, IPPROTO_HOMA); if (status != 0) { pr_err("inet6_add_protocol failed in %s: %d\n", __func__, - status); + status); goto out_cleanup; } @@ -593,13 +593,15 @@ static int __init homa_load(void) if (IS_ERR(timer_kthread)) { status = PTR_ERR(timer_kthread); pr_err("couldn't create homa pacer thread: error %d\n", - status); + status); timer_kthread = NULL; goto out_cleanup; } homa_gro_hook_tcp(); +#if 1 /* See strip.py */ tt_init("timetrace", homa->temp); +#endif /* See strip.py */ return 0; @@ -627,7 +629,9 @@ static void __exit homa_unload(void) pr_notice("Homa module unloading\n"); exiting = true; +#if 1 /* See strip.py */ tt_destroy(); +#endif /* See strip.py */ homa_gro_unhook_tcp(); if (timer_kthread) @@ -662,7 +666,7 @@ module_exit(homa_unload); int homa_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct homa_sock *hsk = homa_sk(sock->sk); - union sockaddr_in_union *addr_in = (union sockaddr_in_union *) addr; + union sockaddr_in_union *addr_in = (union 
 	int port = 0;
 
 	if (unlikely(addr->sa_family != sock->sk->sk_family))
@@ -739,7 +743,7 @@ int homa_ioc_abort(struct sock *sk, int *karg)
 	struct homa_abort_args args;
 	struct homa_rpc *rpc;
 
-	if (unlikely(copy_from_user(&args, (void *) karg, sizeof(args))))
+	if (unlikely(copy_from_user(&args, (void *)karg, sizeof(args))))
 		return -EFAULT;
 
 	if (args._pad1 || args._pad2[0] || args._pad2[1])
@@ -750,7 +754,7 @@ int homa_ioc_abort(struct sock *sk, int *karg)
 	}
 
 	rpc = homa_find_client_rpc(hsk, args.id);
-	if (rpc == NULL)
+	if (!rpc)
 		return -EINVAL;
 	if (args.error == 0)
 		homa_rpc_free(rpc);
@@ -819,15 +823,15 @@ int homa_socket(struct sock *sk)
 * Return: 0 on success, otherwise a negative errno.
 */
 int homa_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
-		unsigned int optlen)
+		    unsigned int optlen)
 {
 	struct homa_sock *hsk = homa_sk(sk);
 	struct homa_set_buf_args args;
 	__u64 start = get_cycles();
 	int ret;
 
-	if ((level != IPPROTO_HOMA) || (optname != SO_HOMA_SET_BUF)
-			|| (optlen != sizeof(struct homa_set_buf_args)))
+	if (level != IPPROTO_HOMA || optname != SO_HOMA_SET_BUF ||
+	    optlen != sizeof(struct homa_set_buf_args))
 		return -EINVAL;
 
 	if (copy_from_sockptr(&args, optval, optlen))
@@ -845,7 +849,6 @@ int homa_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
 	INC_METRIC(so_set_buf_calls, 1);
 	INC_METRIC(so_set_buf_cycles, get_cycles() - start);
 	return ret;
-
 }
 
 /**
@@ -858,12 +861,11 @@ int homa_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
 * Return: 0 on success, otherwise a negative errno.
 */
 int homa_getsockopt(struct sock *sk, int level, int optname,
-		char __user *optval, int __user *option)
+		    char __user *optval, int __user *option)
 {
 	pr_warn("unimplemented getsockopt invoked on Homa socket: level %d, optname %d\n",
-			level, optname);
+		level, optname);
 	return -EINVAL;
-
 }
 
 /**
@@ -882,7 +884,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length)
 	__u64 finish;
 	int result = 0;
 	struct homa_rpc *rpc = NULL;
-	union sockaddr_in_union *addr = (union sockaddr_in_union *) msg->msg_name;
+	union sockaddr_in_union *addr = (union sockaddr_in_union *)msg->msg_name;
 
 	per_cpu(homa_offload_core, raw_smp_processor_id()).last_app_active = start;
 	if (unlikely(!msg->msg_control_is_user)) {
@@ -898,9 +900,9 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length)
 			result = -EAFNOSUPPORT;
 			goto error;
 		}
-		if ((msg->msg_namelen < sizeof(struct sockaddr_in))
-				|| ((msg->msg_namelen < sizeof(struct sockaddr_in6))
-				&& (addr->in6.sin6_family == AF_INET6))) {
+		if (msg->msg_namelen < sizeof(struct sockaddr_in) ||
+		    (msg->msg_namelen < sizeof(struct sockaddr_in6) &&
+		     addr->in6.sin6_family == AF_INET6)) {
 			tt_record("homa_sendmsg error: msg_namelen too short");
 			result = -EINVAL;
 			goto error;
@@ -916,11 +918,10 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length)
 		}
 		INC_METRIC(send_calls, 1);
 		tt_record4("homa_sendmsg request, target 0x%x:%d, id %u, length %d",
-				(addr->in6.sin6_family == AF_INET)
-				? ntohl(addr->in4.sin_addr.s_addr)
-				: tt_addr(addr->in6.sin6_addr),
-				ntohs(addr->in6.sin6_port), rpc->id,
-				length);
+			   (addr->in6.sin6_family == AF_INET)
+			   ? ntohl(addr->in4.sin_addr.s_addr)
+			   : tt_addr(addr->in6.sin6_addr),
+			   ntohs(addr->in6.sin6_port), rpc->id, length);
 		rpc->completion_cookie = args.completion_cookie;
 		result = homa_message_out_fill(rpc, &msg->msg_iter, 1);
 		if (result)
@@ -930,7 +931,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length)
 		rpc = NULL;
 
 		if (unlikely(copy_to_user(msg->msg_control, &args,
-				sizeof(args)))) {
+					  sizeof(args)))) {
 			rpc = homa_find_client_rpc(hsk, args.id);
 			result = -EFAULT;
 			goto error;
@@ -943,7 +944,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length)
 		INC_METRIC(reply_calls, 1);
 		tt_record4("homa_sendmsg response, id %llu, port %d, pid %d, length %d",
-				args.id, hsk->port, current->pid, length);
+			   args.id, hsk->port, current->pid, length);
 		if (args.completion_cookie != 0) {
 			tt_record("homa_sendmsg error: nonzero cookie");
 			result = -EINVAL;
@@ -952,14 +953,14 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length)
 
 		canonical_dest = canonical_ipv6_addr(addr);
 		rpc = homa_find_server_rpc(hsk, &canonical_dest,
-				ntohs(addr->in6.sin6_port), args.id);
+					   ntohs(addr->in6.sin6_port), args.id);
 		if (!rpc) {
 			/* Return without an error if the RPC doesn't exist;
 			 * this could be totally valid (e.g. client is
 			 * no longer interested in it).
 			 */
 			tt_record2("homa_sendmsg error: RPC id %d, peer 0x%x, doesn't exist",
-					args.id, tt_addr(canonical_dest));
+				   args.id, tt_addr(canonical_dest));
 			return 0;
 		}
 		if (rpc->error) {
@@ -967,7 +968,8 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length)
 			goto error;
 		}
 		if (rpc->state != RPC_IN_SERVICE) {
-			tt_record2("homa_sendmsg error: RPC id %d in bad state %d", rpc->id, rpc->state);
+			tt_record2("homa_sendmsg error: RPC id %d in bad state %d",
+				   rpc->id, rpc->state);
 			homa_rpc_unlock(rpc);
 			rpc = 0;
 			result = -EINVAL;
@@ -976,7 +978,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length)
 		rpc->state = RPC_OUTGOING;
 
 		result = homa_message_out_fill(rpc, &msg->msg_iter, 1);
-		if (result && (rpc->state != RPC_DEAD))
+		if (result && rpc->state != RPC_DEAD)
 			goto error;
 		homa_rpc_unlock(rpc);
 		finish = get_cycles();
@@ -991,7 +993,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length)
 		homa_rpc_unlock(rpc);
 	}
 	tt_record2("homa_sendmsg returning error %d for id %d",
-			result, args.id);
+		   result, args.id);
 	tt_freeze();
 	return result;
 }
@@ -1029,7 +1031,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
 		goto done;
 	}
 	if (unlikely(copy_from_user(&control, msg->msg_control,
-			sizeof(control)))) {
+				    sizeof(control)))) {
 		result = -EFAULT;
 		goto done;
 	}
@@ -1039,15 +1041,15 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
 		goto done;
 	}
 	tt_record3("homa_recvmsg starting, port %d, pid %d, flags %d",
-			hsk->port, current->pid, control.flags);
+		   hsk->port, current->pid, control.flags);
 
-	if ((control.num_bpages > HOMA_MAX_BPAGES)
-			|| (control.flags & ~HOMA_RECVMSG_VALID_FLAGS)) {
+	if (control.num_bpages > HOMA_MAX_BPAGES ||
+	    (control.flags & ~HOMA_RECVMSG_VALID_FLAGS)) {
 		result = -EINVAL;
 		goto done;
 	}
 	homa_pool_release_buffers(hsk->buffer_pool, control.num_bpages,
-			control.bpage_offsets);
+				  control.bpage_offsets);
 	control.num_bpages = 0;
 
 	rpc = homa_wait_for_message(hsk, (flags & MSG_DONTWAIT)
@@ -1088,7 +1090,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
 	if (likely(rpc->msgin.length >= 0)) {
 		control.num_bpages = rpc->msgin.num_bpages;
 		memcpy(control.bpage_offsets, rpc->msgin.bpage_offsets,
-				sizeof(control.bpage_offsets));
+		       sizeof(control.bpage_offsets));
 	}
 	if (sk->sk_family == AF_INET6) {
 		struct sockaddr_in6 *in6 = msg->msg_name;
@@ -1102,8 +1104,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
 
 		in4->sin_family = AF_INET;
 		in4->sin_port = htons(rpc->dport);
-		in4->sin_addr.s_addr = ipv6_to_ipv4(
-				rpc->peer->addr);
+		in4->sin_addr.s_addr = ipv6_to_ipv4(rpc->peer->addr);
 		*addr_len = sizeof(*in4);
 	}
 	memcpy(&control.peer_addr, msg->msg_name, *addr_len);
@@ -1135,8 +1136,8 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
 
 	finish = get_cycles();
 	tt_record3("homa_recvmsg returning id %d, length %d, bpage0 %d",
-			control.id, result,
-			control.bpage_offsets[0] >> HOMA_BPAGE_SHIFT);
+		   control.id, result,
+		   control.bpage_offsets[0] >> HOMA_BPAGE_SHIFT);
 	INC_METRIC(recv_cycles, finish - start);
 	return result;
 }
@@ -1195,9 +1196,9 @@ int homa_softirq(struct sk_buff *skb)
 	INC_METRIC(softirq_calls, 1);
 	per_cpu(homa_offload_core, raw_smp_processor_id()).last_active = start;
 	if ((start - last) > 1000000) {
-		int scaled_ms = (int) (10*(start-last)/cpu_khz);
+		int scaled_ms = (int)(10 * (start - last) / cpu_khz);
 
-		if ((scaled_ms >= 50) && (scaled_ms < 10000)) {
+		if (scaled_ms >= 50 && scaled_ms < 10000) {
 //			tt_record3("Gap in incoming packets: %d cycles "
 //					"(%d.%1d ms)",
 //					(int) (start - last), scaled_ms/10,
@@ -1219,9 +1220,11 @@ int homa_softirq(struct sk_buff *skb)
 	skb_shinfo(skb)->frag_list = NULL;
 	packets = skb;
 	prev_link = &packets;
-	for (skb = packets; skb != NULL; skb = next) {
+	for (skb = packets; skb; skb = next) {
+#if 1 /* See strip.py */
 		const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb);
 
+#endif /* See strip.py */
 		next = skb->next;
 
 		/* Make the header available at skb->data, even if the packet
@@ -1244,23 +1247,22 @@ int homa_softirq(struct sk_buff *skb)
 
 		/* Reject packets that are too short or have bogus types. */
 		h = (struct common_header *) skb->data;
-		if (unlikely((skb->len < sizeof(struct common_header))
-				|| (h->type < DATA)
-				|| (h->type >= BOGUS)
-				|| (skb->len < header_lengths[h->type-DATA]))) {
+		if (unlikely(skb->len < sizeof(struct common_header) ||
+			     h->type < DATA || h->type >= BOGUS ||
+			     skb->len < header_lengths[h->type-DATA])) {
 			if (homa->verbose)
 				pr_warn("Homa %s packet from %s too short: %d bytes\n",
-						homa_symbol_for_type(h->type),
-						homa_print_ipv6_addr(&saddr),
-						skb->len - header_offset);
+					homa_symbol_for_type(h->type),
+					homa_print_ipv6_addr(&saddr),
+					skb->len - header_offset);
 			INC_METRIC(short_packets, 1);
 			goto discard;
 		}
 
 		if (first_packet) {
 			tt_record4("homa_softirq: first packet from 0x%x:%d, id %llu, type %d",
-					tt_addr(saddr), ntohs(h->sport),
-					homa_local_id(h->sender_id), h->type);
+				   tt_addr(saddr), ntohs(h->sport),
+				   homa_local_id(h->sender_id), h->type);
 			first_packet = 0;
 		}
 
@@ -1282,15 +1284,16 @@ int homa_softirq(struct sk_buff *skb)
 		/* Process the packet now if it is a control packet or
 		 * if it contains an entire short message.
 		 */
-		if ((h->type != DATA) || (ntohl(((struct data_header *) h)
-				->message_length) < 1400)) {
+		if (h->type != DATA || ntohl(((struct data_header *)h)
+				->message_length) < 1400) {
 			UNIT_LOG("; ", "homa_softirq shortcut type 0x%x",
-					h->type);
+				 h->type);
 			*prev_link = skb->next;
 			skb->next = NULL;
 			homa_dispatch_pkts(skb, homa);
-		} else
+		} else {
 			prev_link = &skb->next;
+		}
 		continue;
 
 discard:
@@ -1302,7 +1305,7 @@ int homa_softirq(struct sk_buff *skb)
 	 * collects all of the packets for a particular RPC and dispatches
 	 * them.
	 */
-	while (packets != NULL) {
+	while (packets) {
 		struct in6_addr saddr, saddr2;
 		struct common_header *h2;
 		struct sk_buff *skb2;
@@ -1312,10 +1315,10 @@ int homa_softirq(struct sk_buff *skb)
 		saddr = skb_canonical_ipv6_saddr(skb);
 		other_pkts = NULL;
 		other_link = &other_pkts;
-		h = (struct common_header *) skb->data;
-		for (skb2 = skb->next; skb2 != NULL; skb2 = next) {
+		h = (struct common_header *)skb->data;
+		for (skb2 = skb->next; skb2; skb2 = next) {
 			next = skb2->next;
-			h2 = (struct common_header *) skb2->data;
+			h2 = (struct common_header *)skb2->data;
 			if (h2->sender_id == h->sender_id) {
 				saddr2 = skb_canonical_ipv6_saddr(skb2);
 				if (ipv6_addr_equal(&saddr, &saddr2)) {
@@ -1331,7 +1334,7 @@ int homa_softirq(struct sk_buff *skb)
 		*other_link = NULL;
 #ifdef __UNIT_TEST__
 		UNIT_LOG("; ", "id %lld, offsets", homa_local_id(h->sender_id));
-		for (skb2 = packets; skb2 != NULL; skb2 = skb2->next) {
+		for (skb2 = packets; skb2; skb2 = skb2->next) {
 			struct data_header *h3 = (struct data_header *)
 					skb2->data;
 			UNIT_LOG("", " %d", ntohl(h3->seg.offset));
@@ -1377,13 +1380,13 @@ int homa_err_handler_v4(struct sk_buff *skb, u32 info)
 	int type = icmp_hdr(skb)->type;
 	int code = icmp_hdr(skb)->code;
 
-	if ((type == ICMP_DEST_UNREACH) && (code == ICMP_PORT_UNREACH)) {
-		char *icmp = (char *) icmp_hdr(skb);
+	if (type == ICMP_DEST_UNREACH && code == ICMP_PORT_UNREACH) {
+		char *icmp = (char *)icmp_hdr(skb);
 		struct common_header *h;
 
-		iph = (struct iphdr *) (icmp + sizeof(struct icmphdr));
-		h = (struct common_header *) (icmp + sizeof(struct icmphdr)
-				+ iph->ihl*4);
+		iph = (struct iphdr *)(icmp + sizeof(struct icmphdr));
+		h = (struct common_header *)(icmp + sizeof(struct icmphdr)
+				+ iph->ihl * 4);
 		homa_abort_rpcs(homa, &saddr, htons(h->dport), -ENOTCONN);
 	} else if (type == ICMP_DEST_UNREACH) {
 		int error;
@@ -1393,11 +1396,11 @@ int homa_err_handler_v4(struct sk_buff *skb, u32 info)
 		else
 			error = -EHOSTUNREACH;
 		tt_record2("ICMP destination unreachable: 0x%x (daddr 0x%x)",
-				iph->saddr, iph->daddr);
+			   iph->saddr, iph->daddr);
 		homa_abort_rpcs(homa, &saddr, 0, error);
 	} else {
 		pr_notice("%s invoked with info %x, ICMP type %d, ICMP code %d\n",
-				__func__, info, type, code);
+			  __func__, info, type, code);
 	}
 	return 0;
 }
@@ -1419,12 +1422,12 @@ int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt,
 {
 	const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data;
 
-	if ((type == ICMPV6_DEST_UNREACH) && (code == ICMPV6_PORT_UNREACH)) {
-		char *icmp = (char *) icmp_hdr(skb);
+	if (type == ICMPV6_DEST_UNREACH && code == ICMPV6_PORT_UNREACH) {
+		char *icmp = (char *)icmp_hdr(skb);
 		struct common_header *h;
 
-		iph = (struct ipv6hdr *) (icmp + sizeof(struct icmphdr));
-		h = (struct common_header *) (icmp + sizeof(struct icmphdr)
+		iph = (struct ipv6hdr *)(icmp + sizeof(struct icmphdr));
+		h = (struct common_header *)(icmp + sizeof(struct icmphdr)
 				+ HOMA_IPV6_HEADER_LENGTH);
 		homa_abort_rpcs(homa, &iph->daddr, htons(h->dport), -ENOTCONN);
 	} else if (type == ICMPV6_DEST_UNREACH) {
@@ -1435,7 +1438,7 @@ int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		else
 			error = -EHOSTUNREACH;
 		tt_record2("ICMPv6 destination unreachable: 0x%x (daddr 0x%x)",
-				tt_addr(iph->saddr), tt_addr(iph->daddr));
+			   tt_addr(iph->saddr), tt_addr(iph->daddr));
 		homa_abort_rpcs(homa, &iph->daddr, 0, error);
 	} else {
 		if (homa->verbose)
@@ -1457,7 +1460,7 @@ int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt,
 * state of the socket.
 */
 __poll_t homa_poll(struct file *file, struct socket *sock,
-		struct poll_table_struct *wait)
+		   struct poll_table_struct *wait)
 {
 	struct sock *sk = sock->sk;
 	__poll_t mask;
@@ -1471,7 +1474,7 @@ __poll_t homa_poll(struct file *file, struct socket *sock,
 	mask = POLLOUT | POLLWRNORM;
 
 	if (!list_empty(&homa_sk(sk)->ready_requests) ||
-			!list_empty(&homa_sk(sk)->ready_responses))
+	    !list_empty(&homa_sk(sk)->ready_responses))
 		mask |= POLLIN | POLLRDNORM;
 	return mask;
 }
@@ -1640,13 +1643,13 @@ enum hrtimer_restart homa_hrtimer(struct hrtimer *timer)
 
 /**
 * homa_timer_main() - Top-level function for the timer thread.
- * @transportInfo:  Pointer to struct homa.
+ * @transport:  Pointer to struct homa.
 *
 * Return:  Always 0.
 */
-int homa_timer_main(void *transportInfo)
+int homa_timer_main(void *transport)
 {
-	struct homa *homa = (struct homa *) transportInfo;
+	struct homa *homa = (struct homa *)transport;
 	struct hrtimer hrtimer;
 	ktime_t tick_interval;
 	u64 nsec;
diff --git a/homa_pool.c b/homa_pool.c
index 3863fcff..e845466e 100644
--- a/homa_pool.c
+++ b/homa_pool.c
@@ -15,7 +15,7 @@
 #define MIN_EXTRA 4
 
 /* When running unit tests, allow HOMA_BPAGE_SIZE and HOMA_BPAGE_SHIFT
- * to be overriden.
+ * to be overridden.
 */
 #ifdef __UNIT_TEST__
 #include "mock.h"
@@ -53,10 +53,10 @@ int homa_pool_init(struct homa_sock *hsk, void *region, __u64 region_size)
 	struct homa_pool *pool = hsk->buffer_pool;
 	int i, result;
 
-	if (((__u64) region) & ~PAGE_MASK)
+	if (((__u64)region) & ~PAGE_MASK)
 		return -EINVAL;
 	pool->hsk = hsk;
-	pool->region = (char *) region;
+	pool->region = (char *)region;
 	pool->num_bpages = region_size >> HOMA_BPAGE_SHIFT;
 	pool->descriptors = NULL;
 	pool->cores = NULL;
@@ -65,7 +65,7 @@ int homa_pool_init(struct homa_sock *hsk, void *region, __u64 region_size)
 		goto error;
 	}
 	pool->descriptors = kmalloc_array(pool->num_bpages,
-			sizeof(struct homa_bpage), GFP_ATOMIC);
+					  sizeof(struct homa_bpage), GFP_ATOMIC);
 	if (!pool->descriptors) {
 		result = -ENOMEM;
 		goto error;
@@ -83,7 +83,7 @@ int homa_pool_init(struct homa_sock *hsk, void *region, __u64 region_size)
 
 	/* Allocate and initialize core-specific data. */
 	pool->cores = kmalloc_array(nr_cpu_ids, sizeof(struct homa_pool_core),
-			GFP_ATOMIC);
+				    GFP_ATOMIC);
 	if (!pool->cores) {
 		result = -ENOMEM;
 		goto error;
@@ -133,7 +133,7 @@ void homa_pool_destroy(struct homa_pool *pool)
 * Return: 0 for success, -1 if there wasn't enough free space in the pool.
 */
 int homa_pool_get_pages(struct homa_pool *pool, int num_pages, __u32 *pages,
-		int set_owner)
+			int set_owner)
 {
 	int core_num = raw_smp_processor_id();
 	struct homa_pool_core *core;
@@ -168,7 +168,7 @@ int homa_pool_get_pages(struct homa_pool *pool, int num_pages, __u32 *pages,
 
 	limit = pool->num_bpages - atomic_read(&pool->free_bpages);
-	extra = limit>>2;
+	extra = limit >> 2;
 	limit += (extra < MIN_EXTRA) ? MIN_EXTRA : extra;
 	if (limit > pool->num_bpages)
 		limit = pool->num_bpages;
@@ -195,14 +195,14 @@ int homa_pool_get_pages(struct homa_pool *pool, int num_pages, __u32 *pages,
		 * grabbed the page).
		 */
 		ref_count = atomic_read(&bpage->refs);
-		if ((ref_count >= 2) || ((ref_count == 1) && ((bpage->owner < 0)
-				|| (bpage->expiration > now))))
+		if (ref_count >= 2 || (ref_count == 1 && (bpage->owner < 0 ||
+							  bpage->expiration > now)))
 			continue;
 		if (!spin_trylock_bh(&bpage->lock))
 			continue;
 		ref_count = atomic_read(&bpage->refs);
-		if ((ref_count >= 2) || ((ref_count == 1) && ((bpage->owner < 0)
-				|| (bpage->expiration > now)))) {
+		if (ref_count >= 2 || (ref_count == 1 && (bpage->owner < 0 ||
+							  bpage->expiration > now))) {
 			spin_unlock_bh(&bpage->lock);
 			continue;
 		}
@@ -260,7 +260,7 @@ int homa_pool_allocate(struct homa_rpc *rpc)
 	/* The last chunk may be less than a full bpage; for this we use
 	 * the bpage that we own (and reuse it for multiple messages).
 	 */
-	partial = rpc->msgin.length & (HOMA_BPAGE_SIZE-1);
+	partial = rpc->msgin.length & (HOMA_BPAGE_SIZE - 1);
 	if (unlikely(partial == 0))
 		goto success;
 	core_id = raw_smp_processor_id();
@@ -301,7 +301,7 @@ int homa_pool_allocate(struct homa_rpc *rpc)
 new_page:
 	if (homa_pool_get_pages(pool, 1, pages, 1) != 0) {
 		homa_pool_release_buffers(pool, rpc->msgin.num_bpages,
-				rpc->msgin.bpage_offsets);
+					  rpc->msgin.bpage_offsets);
 		rpc->msgin.num_bpages = 0;
 		goto out_of_space;
 	}
@@ -316,8 +316,8 @@ int homa_pool_allocate(struct homa_rpc *rpc)
 
 success:
 	tt_record4("Allocated %d bpage pointers on port %d for id %d, free_bpages now %d",
-			rpc->msgin.num_bpages, pool->hsk->port, rpc->id,
-			atomic_read(&pool->free_bpages));
+		   rpc->msgin.num_bpages, pool->hsk->port, rpc->id,
+		   atomic_read(&pool->free_bpages));
 	return 0;
 
 	/* We get here if there wasn't enough buffer space for this
@@ -326,9 +326,8 @@ int homa_pool_allocate(struct homa_rpc *rpc)
 out_of_space:
 	INC_METRIC(buffer_alloc_failures, 1);
 	tt_record4("Buffer allocation failed, port %d, id %d, length %d, free_bpages %d",
-			pool->hsk->port, rpc->id,
-			rpc->msgin.length,
-			atomic_read(&pool->free_bpages));
+		   pool->hsk->port, rpc->id, rpc->msgin.length,
+		   atomic_read(&pool->free_bpages));
 	homa_sock_lock(pool->hsk, "homa_pool_allocate");
 	list_for_each_entry(other, &pool->hsk->waiting_for_bufs, buf_links) {
 		if (other->msgin.length > rpc->msgin.length) {
@@ -362,8 +361,8 @@ void *homa_pool_get_buffer(struct homa_rpc *rpc, int offset, int *available)
 
 	bpage_index = offset >> HOMA_BPAGE_SHIFT;
 	BUG_ON(bpage_index >= rpc->msgin.num_bpages);
-	bpage_offset = offset & (HOMA_BPAGE_SIZE-1);
-	*available = (bpage_index < (rpc->msgin.num_bpages-1))
+	bpage_offset = offset & (HOMA_BPAGE_SIZE - 1);
+	*available = (bpage_index < (rpc->msgin.num_bpages - 1))
 			? HOMA_BPAGE_SIZE - bpage_offset
 			: rpc->msgin.length - offset;
 	return rpc->hsk->buffer_pool->region +
			rpc->msgin.bpage_offsets[bpage_index]
@@ -380,7 +379,7 @@ void *homa_pool_get_buffer(struct homa_rpc *rpc, int offset, int *available)
 *           from the start of the pool to the buffer to be released.
 */
 void homa_pool_release_buffers(struct homa_pool *pool, int num_buffers,
-		__u32 *buffers)
+			       __u32 *buffers)
 {
 	int i;
 
@@ -396,8 +395,8 @@ void homa_pool_release_buffers(struct homa_pool *pool, int num_buffers,
 		}
 	}
 	tt_record3("Released %d bpages, free_bpages for port %d now %d",
-			num_buffers, pool->hsk->port,
-			atomic_read(&pool->free_bpages));
+		   num_buffers, pool->hsk->port,
+		   atomic_read(&pool->free_bpages));
 }
 
 /**
@@ -424,9 +423,9 @@ void homa_pool_check_waiting(struct homa_pool *pool)
 			break;
 		}
 		rpc = list_first_entry(&pool->hsk->waiting_for_bufs,
-				struct homa_rpc, buf_links);
+				       struct homa_rpc, buf_links);
 		if (!homa_bucket_try_lock(rpc->bucket, rpc->id,
-				"homa_pool_check_waiting")) {
+					  "homa_pool_check_waiting")) {
 			/* Can't just spin on the RPC lock because we're
 			 * holding the socket lock (see sync.txt). Instead,
 			 * release the socket lock and try the entire
@@ -443,9 +442,9 @@ void homa_pool_check_waiting(struct homa_pool *pool)
 		set_bpages_needed(pool);
 		homa_sock_unlock(pool->hsk);
 		tt_record4("Retrying buffer allocation for id %d, length %d, free_bpages %d, new bpages_needed %d",
-				rpc->id, rpc->msgin.length,
-				atomic_read(&pool->free_bpages),
-				pool->bpages_needed);
+			   rpc->id, rpc->msgin.length,
+			   atomic_read(&pool->free_bpages),
+			   pool->bpages_needed);
 		homa_pool_allocate(rpc);
 		if (rpc->msgin.num_bpages > 0) {
 			/* Allocation succeeded; "wake up" the RPC. */
diff --git a/homa_pool.h b/homa_pool.h
index bde038f2..bce2f40e 100644
--- a/homa_pool.h
+++ b/homa_pool.h
@@ -46,8 +46,9 @@ struct homa_bpage {
 		};
 	};
 };
+
 _Static_assert(sizeof(struct homa_bpage) == L1_CACHE_BYTES,
-		"homa_bpage overflowed a cache line");
+	       "homa_bpage overflowed a cache line");
 
 /**
 * struct homa_pool_core - Holds core-specific data for a homa_pool (a bpage
@@ -84,8 +85,9 @@ struct homa_pool_core {
 		};
 	};
 };
+
 _Static_assert(sizeof(struct homa_pool_core) == L1_CACHE_BYTES,
-		"homa_pool_core overflowed a cache line");
+	       "homa_pool_core overflowed a cache line");
 
 /**
 * struct homa_pool - Describes a pool of buffer space for incoming
@@ -141,16 +143,16 @@ struct homa_pool {
 	int check_waiting_invoked;
 };
 
-extern int homa_pool_allocate(struct homa_rpc *rpc);
-extern void homa_pool_check_waiting(struct homa_pool *pool);
-extern void homa_pool_destroy(struct homa_pool *pool);
-extern void *homa_pool_get_buffer(struct homa_rpc *rpc, int offset,
-		int *available);
-extern int homa_pool_get_pages(struct homa_pool *pool, int num_pages,
-		__u32 *pages, int leave_locked);
-extern int homa_pool_init(struct homa_sock *hsk, void *buf_region,
-		__u64 region_size);
-extern void homa_pool_release_buffers(struct homa_pool *pool,
-		int num_buffers, __u32 *buffers);
+int homa_pool_allocate(struct homa_rpc *rpc);
+void homa_pool_check_waiting(struct homa_pool *pool);
+void homa_pool_destroy(struct homa_pool *pool);
+void *homa_pool_get_buffer(struct homa_rpc *rpc, int offset,
+			   int *available);
+int homa_pool_get_pages(struct homa_pool *pool, int num_pages,
+			__u32 *pages, int leave_locked);
+int homa_pool_init(struct homa_sock *hsk, void *buf_region,
+		   __u64 region_size);
+void homa_pool_release_buffers(struct homa_pool *pool,
+			       int num_buffers, __u32 *buffers);
 
 #endif /* _HOMA_POOL_H */
diff --git a/homa_receiver.h b/homa_receiver.h
index 54805424..0d2b2282 100644
--- a/homa_receiver.h
+++ b/homa_receiver.h
@@ -1,6 +1,4 @@
-/* Copyright (c) Homa Developers
- * SPDX-License-Identifier: BSD-1-Clause
- */
+/* SPDX-License-Identifier: BSD-2-Clause */
 
 #pragma once
 
@@ -10,7 +8,6 @@
 #include "homa.h"
 
 namespace homa {
-
 /**
 * class homa::receiver - Helper class for receiving a series of messages
 * from a Homa socket. This class serves two purposes: first, it implements
@@ -48,9 +45,9 @@ class receiver {
 	{
 		if (static_cast(offset) >= msg_length)
 			return 0;
-		if ((offset >> HOMA_BPAGE_SHIFT) == (control.num_bpages-1))
+		if ((offset >> HOMA_BPAGE_SHIFT) == (control.num_bpages - 1))
 			return msg_length - offset;
-		return HOMA_BPAGE_SIZE - (offset & (HOMA_BPAGE_SIZE-1));
+		return HOMA_BPAGE_SIZE - (offset & (HOMA_BPAGE_SIZE - 1));
 	}
 
 	/**
@@ -58,7 +55,7 @@ class receiver {
 	 * cookie associated with the current message; result is undefined
 	 * if there is no current message.
 	 */
-	uint64_t completion_cookie() const
+	uint64_t completion_cookie(void) const
 	{
 		return control.completion_cookie;
 	}
@@ -99,7 +96,7 @@ class receiver {
 	 * id() - Return the Homa RPC identifier for the current message,
 	 * or 0 if there is no current message.
 	 */
-	inline uint64_t id() const
+	inline uint64_t id(void) const
 	{
 		return control.id;
 	}
@@ -109,7 +106,7 @@ class receiver {
 	 * is a request, and false if it is a response or if there is no
 	 * current message.
 	 */
-	bool is_request() const
+	bool is_request(void) const
 	{
 		return control.id & 1;
 	}
@@ -119,20 +116,20 @@ class receiver {
 	 * current message, or a negative value if there is no current
 	 * message.
	 */
-	ssize_t length() const
+	ssize_t length(void) const
 	{
 		return msg_length;
 	}
 
 	size_t receive(int flags, uint64_t id);
-	void release();
+	void release(void);
 
 	/**
	 * homa::receiver::src_addr() - Return a pointer to the address
	 * of the sender of the current message. The result is undefined
	 * if there is no current message.
	 */
-	const sockaddr_in_union *src_addr() const
+	const sockaddr_in_union *src_addr(void) const
 	{
 		return &source;
 	}
@@ -161,4 +158,5 @@ class receiver {
 	/** @buf_region: First byte of buffer space for this message. */
 	char *buf_region;
 };
-} // namespace homa
\ No newline at end of file
+
+} // namespace homa
diff --git a/homa_rpc.c b/homa_rpc.c
index 5dcbc021..e67b3fec 100644
--- a/homa_rpc.c
+++ b/homa_rpc.c
@@ -20,7 +20,7 @@
 *     caller must eventually unlock it.
 */
 struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk,
-		const union sockaddr_in_union *dest)
+				     const union sockaddr_in_union *dest)
 {
 	struct in6_addr dest_addr_as_ipv6 = canonical_ipv6_addr(dest);
 	struct homa_rpc_bucket *bucket;
@@ -40,7 +40,7 @@ struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk,
 	atomic_set(&crpc->flags, 0);
 	atomic_set(&crpc->grants_in_progress, 0);
 	crpc->peer = homa_peer_find(hsk->homa->peers, &dest_addr_as_ipv6,
-			&hsk->inet);
+				    &hsk->inet);
 	if (IS_ERR(crpc->peer)) {
 		tt_record("error in homa_peer_find");
 		err = PTR_ERR(crpc->peer);
@@ -106,8 +106,8 @@ struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk,
 *     to h, then it is returned instead of creating a new RPC.
 */
 struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk,
-		const struct in6_addr *source, struct data_header *h,
-		int *created)
+				     const struct in6_addr *source,
+				     struct data_header *h, int *created)
 {
 	__u64 id = homa_local_id(h->common.sender_id);
 	struct homa_rpc_bucket *bucket;
@@ -120,9 +120,9 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk,
 	bucket = homa_server_rpc_bucket(hsk, id);
 	homa_bucket_lock(bucket, id, "homa_rpc_new_server");
 	hlist_for_each_entry_rcu(srpc, &bucket->rpcs, hash_links) {
-		if ((srpc->id == id) &&
-				(srpc->dport == ntohs(h->common.sport)) &&
-				ipv6_addr_equal(&srpc->peer->addr, source)) {
+		if (srpc->id == id &&
+		    srpc->dport == ntohs(h->common.sport) &&
+		    ipv6_addr_equal(&srpc->peer->addr, source)) {
 			/* RPC already exists; just return it instead
 			 * of creating a new RPC.
 			 */
@@ -167,9 +167,9 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk,
 	srpc->magic = HOMA_RPC_MAGIC;
 	srpc->start_cycles = get_cycles();
 	tt_record2("Incoming message for id %d has %d unscheduled bytes",
-			srpc->id, ntohl(h->incoming));
+		   srpc->id, ntohl(h->incoming));
 	err = homa_message_in_init(srpc, ntohl(h->message_length),
-			ntohl(h->incoming));
+				   ntohl(h->incoming));
 	if (err != 0)
 		goto error;
 
@@ -182,7 +182,7 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk,
 	}
 	hlist_add_head(&srpc->hash_links, &bucket->rpcs);
 	list_add_tail_rcu(&srpc->active_links, &hsk->active_rpcs);
-	if ((ntohl(h->seg.offset) == 0) && (srpc->msgin.num_bpages > 0)) {
+	if (ntohl(h->seg.offset) == 0 && srpc->msgin.num_bpages > 0) {
 		atomic_or(RPC_PKTS_READY, &srpc->flags);
 		homa_rpc_handoff(srpc);
 	}
@@ -207,7 +207,7 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk,
 * @ack:    Information about an RPC from @saddr that may now be deleted safely.
 */
 void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr,
-		struct homa_ack *ack)
+		    struct homa_ack *ack)
 {
 	__u16 client_port = ntohs(ack->client_port);
 	__u16 server_port = ntohs(ack->server_port);
@@ -259,7 +259,7 @@ void homa_rpc_free(struct homa_rpc *rpc)
	 * function should only make changes needed to make the RPC
	 * inaccessible.
	 */
-	if (!rpc || (rpc->state == RPC_DEAD))
+	if (!rpc || rpc->state == RPC_DEAD)
 		return;
 	UNIT_LOG("; ", "homa_rpc_free invoked");
 	tt_record1("homa_rpc_free invoked for id %d", rpc->id);
@@ -279,7 +279,7 @@ void homa_rpc_free(struct homa_rpc *rpc)
 	list_add_tail_rcu(&rpc->dead_links, &rpc->hsk->dead_rpcs);
 	__list_del_entry(&rpc->ready_links);
 	__list_del_entry(&rpc->buf_links);
-	if (rpc->interest != NULL) {
+	if (rpc->interest) {
 		rpc->interest->reg_rpc = NULL;
 		wake_up_process(rpc->interest->thread);
 		rpc->interest = NULL;
@@ -291,9 +291,9 @@ void homa_rpc_free(struct homa_rpc *rpc)
 	if (rpc->msgin.length >= 0) {
 		rpc->hsk->dead_skbs += skb_queue_len(&rpc->msgin.packets);
 		while (1) {
-			struct homa_gap *gap = list_first_entry_or_null(
-					&rpc->msgin.gaps, struct homa_gap, links);
-			if (gap == NULL)
+			struct homa_gap *gap = list_first_entry_or_null(&rpc->msgin.gaps,
+									struct homa_gap, links);
+			if (!gap)
 				break;
 			list_del(&gap->links);
 			kfree(gap);
@@ -353,25 +353,24 @@ int homa_rpc_reap(struct homa_sock *hsk, int count)
 		if (batch_size > BATCH_MAX)
 			batch_size = BATCH_MAX;
 		count -= batch_size;
-		num_skbs = num_rpcs = 0;
+		num_skbs = 0;
+		num_rpcs = 0;
 
 		homa_sock_lock(hsk, "homa_rpc_reap");
 		if (atomic_read(&hsk->protect_count)) {
 			INC_METRIC(disabled_reaps, 1);
 			tt_record2("homa_rpc_reap returning: protect_count %d, dead_skbs %d",
-					atomic_read(&hsk->protect_count),
-					hsk->dead_skbs);
+				   atomic_read(&hsk->protect_count),
+				   hsk->dead_skbs);
 			homa_sock_unlock(hsk);
 			return 0;
 		}
 
 		/* Collect buffers and freeable RPCs. */
 		list_for_each_entry_rcu(rpc, &hsk->dead_rpcs, dead_links) {
-			if ((atomic_read(&rpc->flags) & RPC_CANT_REAP)
-					|| (atomic_read(&rpc->grants_in_progress)
-					!= 0)
-					|| (atomic_read(&rpc->msgout.active_xmits)
-					!= 0)) {
+			if ((atomic_read(&rpc->flags) & RPC_CANT_REAP) ||
+			    atomic_read(&rpc->grants_in_progress) != 0 ||
+			    atomic_read(&rpc->msgout.active_xmits) != 0) {
 				INC_METRIC(disabled_rpc_reaps, 1);
 				continue;
 			}
@@ -383,9 +382,8 @@ int homa_rpc_reap(struct homa_sock *hsk, int count)
 			if (rpc->msgout.length >= 0) {
 				while (rpc->msgout.packets) {
 					skbs[num_skbs] = rpc->msgout.packets;
-					rpc->msgout.packets = homa_get_skb_info(
-							rpc->msgout.packets)
-							->next_skb;
+					rpc->msgout.packets = homa_get_skb_info(rpc
+							->msgout.packets)->next_skb;
 					num_skbs++;
 					rpc->msgout.num_skbs--;
 					if (num_skbs >= batch_size)
@@ -425,8 +423,8 @@ int homa_rpc_reap(struct homa_sock *hsk, int count)
		 */
release:
 		hsk->dead_skbs -= num_skbs + rx_frees;
-		result = !list_empty(&hsk->dead_rpcs)
-				&& ((num_skbs + num_rpcs) != 0);
+		result = !list_empty(&hsk->dead_rpcs) &&
+			 (num_skbs + num_rpcs) != 0;
 		homa_sock_unlock(hsk);
 		homa_skb_free_many_tx(hsk->homa, skbs, num_skbs);
 		for (i = 0; i < num_rpcs; i++) {
@@ -441,29 +439,28 @@ int homa_rpc_reap(struct homa_sock *hsk, int count)
 			homa_rpc_unlock(rpc);
 
 			if (unlikely(rpc->msgin.num_bpages))
-				homa_pool_release_buffers(
-						rpc->hsk->buffer_pool,
-						rpc->msgin.num_bpages,
-						rpc->msgin.bpage_offsets);
+				homa_pool_release_buffers(rpc->hsk->buffer_pool,
+							  rpc->msgin.num_bpages,
+							  rpc->msgin.bpage_offsets);
 			if (rpc->msgin.length >= 0) {
 				while (1) {
-					struct homa_gap *gap = list_first_entry_or_null(
-							&rpc->msgin.gaps,
+					struct homa_gap *gap = list_first_entry_or_null(&rpc
							->msgin.gaps,
							struct homa_gap, links);
-					if (gap == NULL)
+					if (!gap)
 						break;
 					list_del(&gap->links);
 					kfree(gap);
 				}
 			}
 			tt_record1("homa_rpc_reap finished reaping id %d",
-					rpc->id);
+				   rpc->id);
 			rpc->state = 0;
 			kfree(rpc);
 		}
 		tt_record4("reaped %d skbs, %d rpcs; %d skbs remain for port %d",
-				num_skbs + rx_frees, num_rpcs, hsk->dead_skbs,
-				hsk->port);
+			   num_skbs + rx_frees, num_rpcs, hsk->dead_skbs,
+			   hsk->port);
 		if (!result)
 			break;
 	}
@@ -508,15 +505,16 @@ struct homa_rpc *homa_find_client_rpc(struct homa_sock *hsk, __u64 id)
 *          unlock it by invoking homa_rpc_unlock.
 */
 struct homa_rpc *homa_find_server_rpc(struct homa_sock *hsk,
-		const struct in6_addr *saddr, __u16 sport, __u64 id)
+				      const struct in6_addr *saddr, __u16 sport,
+				      __u64 id)
 {
 	struct homa_rpc_bucket *bucket = homa_server_rpc_bucket(hsk, id);
 	struct homa_rpc *srpc;
 
 	homa_bucket_lock(bucket, id, __func__);
 	hlist_for_each_entry_rcu(srpc, &bucket->rpcs, hash_links) {
-		if ((srpc->id == id) && (srpc->dport == sport) &&
-				ipv6_addr_equal(&srpc->peer->addr, saddr))
+		if (srpc->id == id && srpc->dport == sport &&
+		    ipv6_addr_equal(&srpc->peer->addr, saddr))
 			return srpc;
 	}
 	homa_bucket_unlock(bucket, id);
diff --git a/homa_rpc.h b/homa_rpc.h
index ce8ac920..27380eb1 100644
--- a/homa_rpc.h
+++ b/homa_rpc.h
@@ -408,39 +408,40 @@ struct homa_rpc {
	 * @start_cycles: time (from get_cycles()) when this RPC was created.
	 * Used (sometimes) for testing.
	 */
-	uint64_t start_cycles;
+	u64 start_cycles;
 };
 
-extern void homa_check_rpc(struct homa_rpc *rpc);
-extern struct homa_rpc
+void homa_check_rpc(struct homa_rpc *rpc);
+struct homa_rpc
 *homa_find_client_rpc(struct homa_sock *hsk, __u64 id);
-extern struct homa_rpc
+struct homa_rpc
 *homa_find_server_rpc(struct homa_sock *hsk,
-		const struct in6_addr *saddr, __u16 sport, __u64 id);
-extern void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr,
-		struct homa_ack *ack);
-extern void homa_rpc_free(struct homa_rpc *rpc);
-extern void homa_rpc_log(struct homa_rpc *rpc);
-extern void homa_rpc_log_active(struct homa *homa, uint64_t id);
-extern void homa_rpc_log_active_tt(struct homa *homa, int freeze_count);
-extern void homa_rpc_log_tt(struct homa_rpc *rpc);
-extern struct homa_rpc
+		      const struct in6_addr *saddr, __u16 sport,
+		      __u64 id);
+void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr,
+		    struct homa_ack *ack);
+void homa_rpc_free(struct homa_rpc *rpc);
+void homa_rpc_log(struct homa_rpc *rpc);
+void homa_rpc_log_active(struct homa *homa, uint64_t id);
+void homa_rpc_log_active_tt(struct homa *homa, int freeze_count);
+void homa_rpc_log_tt(struct homa_rpc *rpc);
+struct homa_rpc
 *homa_rpc_new_client(struct homa_sock *hsk,
-		const union sockaddr_in_union *dest);
-extern struct homa_rpc
+		     const union sockaddr_in_union *dest);
+struct homa_rpc
 *homa_rpc_new_server(struct homa_sock *hsk,
-		const struct in6_addr *source, struct data_header *h,
-		int *created);
-extern int homa_rpc_reap(struct homa_sock *hsk, int count);
-extern char *homa_symbol_for_state(struct homa_rpc *rpc);
-extern int homa_validate_incoming(struct homa *homa, int verbose,
-		int *link_errors);
+		     const struct in6_addr *source,
+		     struct data_header *h, int *created);
+int homa_rpc_reap(struct homa_sock *hsk, int count);
+char *homa_symbol_for_state(struct homa_rpc *rpc);
+int homa_validate_incoming(struct homa *homa, int verbose,
+			   int *link_errors);
 
 /**
 * homa_rpc_lock() - Acquire the lock for an RPC.
 * @rpc:    RPC to lock. Note: this function is only safe under
 *          limited conditions (in most cases homa_bucket_lock should be
-*          used). The caller must ensure that the RPC cannot be reaped
+*          used).  The caller must ensure that the RPC cannot be reaped
 *          before the lock is acquired. It cannot do that by acquiring
 *          the socket lock, since that violates lock ordering constraints.
 *          One approach is to use homa_protect_rpcs.
Don't use this function
@@ -496,20 +497,6 @@ static inline void homa_unprotect_rpcs(struct homa_sock *hsk)
 	atomic_dec(&hsk->protect_count);
 }
 
-/**
- * homa_rpc_validate() - Check to see if an RPC has been reaped (which
- * would mean it is no longer valid); if so, crash the kernel with a stack
- * trace.
- * @rpc:    RPC to validate.
- */
-static inline void homa_rpc_validate(struct homa_rpc *rpc)
-{
-	if (rpc->magic == HOMA_RPC_MAGIC)
-		return;
-	pr_err("Accessing reaped Homa RPC!\n");
-	BUG();
-}
-
 /**
 * homa_is_client(): returns true if we are the client for a particular RPC,
 * false if we are the server.
diff --git a/homa_sock.c b/homa_sock.c
index 9cdf7b3c..144a33ca 100644
--- a/homa_sock.c
+++ b/homa_sock.c
@@ -28,7 +28,7 @@ void homa_socktab_destroy(struct homa_socktab *socktab)
 	struct homa_socktab_scan scan;
 	struct homa_sock *hsk;
 
-	for (hsk = homa_socktab_start_scan(socktab, &scan); hsk != NULL;
+	for (hsk = homa_socktab_start_scan(socktab, &scan); hsk;
			hsk = homa_socktab_next(&scan)) {
 		homa_sock_destroy(hsk);
 	}
@@ -55,7 +55,7 @@ void homa_socktab_destroy(struct homa_socktab *socktab)
 *      being reclaimed during the scan.
 */
 struct homa_sock *homa_socktab_start_scan(struct homa_socktab *socktab,
-		struct homa_socktab_scan *scan)
+					  struct homa_socktab_scan *scan)
 {
 	scan->socktab = socktab;
 	scan->current_bucket = -1;
@@ -78,18 +78,18 @@ struct homa_sock *homa_socktab_next(struct homa_socktab_scan *scan)
 	struct homa_sock *hsk;
 
 	while (1) {
-		while (scan->next == NULL) {
+		while (!scan->next) {
 			scan->current_bucket++;
 			if (scan->current_bucket >= HOMA_SOCKTAB_BUCKETS)
 				return NULL;
 			scan->next = (struct homa_socktab_links *)
-					hlist_first_rcu(
-					&scan->socktab->buckets[scan->current_bucket]);
+					hlist_first_rcu(&scan->socktab->buckets
+					[scan->current_bucket]);
 		}
 		links = scan->next;
 		hsk = links->sock;
-		scan->next = (struct homa_socktab_links *) hlist_next_rcu(
-				&links->hash_links);
+		scan->next = (struct homa_socktab_links *)hlist_next_rcu(&links
				->hash_links);
 		return hsk;
 	}
 }
@@ -129,7 +129,7 @@ void homa_sock_init(struct homa_sock *hsk, struct homa *homa)
 	homa->next_client_port++;
 	hsk->socktab_links.sock = hsk;
 	hlist_add_head_rcu(&hsk->socktab_links.hash_links,
-			&socktab->buckets[homa_port_hash(hsk->port)]);
+			   &socktab->buckets[homa_port_hash(hsk->port)]);
 	INIT_LIST_HEAD(&hsk->active_rpcs);
 	INIT_LIST_HEAD(&hsk->dead_rpcs);
 	hsk->dead_skbs = 0;
@@ -213,10 +213,12 @@ void homa_sock_shutdown(struct homa_sock *hsk)
 	while (!list_empty(&hsk->dead_rpcs)) {
 		homa_rpc_reap(hsk, 1000);
 		i++;
+#if 1 /* See strip.py */
 		if (i == 5) {
 			tt_record("Freezing because reap seems hung");
 			tt_freeze();
 		}
+#endif /* See strip.py */
 	}
 
 	homa_pool_destroy(hsk->buffer_pool);
@@ -247,7 +249,7 @@ void homa_sock_destroy(struct homa_sock *hsk)
 * Return:  0 for success, otherwise a negative errno.
 */
 int homa_sock_bind(struct homa_socktab *socktab, struct homa_sock *hsk,
-		__u16 port)
+		   __u16 port)
 {
 	struct homa_sock *owner;
 	int result = 0;
@@ -264,7 +266,7 @@ int homa_sock_bind(struct homa_socktab *socktab, struct homa_sock *hsk,
 	}
 
 	owner = homa_sock_find(socktab, port);
-	if (owner != NULL) {
+	if (owner) {
 		if (owner != hsk)
 			result = -EADDRINUSE;
 		goto done;
@@ -274,7 +276,7 @@ int homa_sock_bind(struct homa_socktab *socktab, struct homa_sock *hsk,
 	hsk->inet.inet_num = port;
 	hsk->inet.inet_sport = htons(hsk->port);
 	hlist_add_head_rcu(&hsk->socktab_links.hash_links,
-			&socktab->buckets[homa_port_hash(port)]);
+			   &socktab->buckets[homa_port_hash(port)]);
done:
 	spin_unlock_bh(&socktab->write_lock);
 	homa_sock_unlock(hsk);
@@ -297,7 +299,7 @@ struct homa_sock *homa_sock_find(struct homa_socktab *socktab, __u16 port)
 	struct homa_sock *result = NULL;
 
 	hlist_for_each_entry_rcu(link, &socktab->buckets[homa_port_hash(port)],
-			hash_links) {
+				 hash_links) {
 		struct homa_sock *hsk = link->sock;
 
 		if (hsk->port == port) {
@@ -340,10 +342,10 @@ void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, __u64 id)
 	__u64 start = get_cycles();
 
 	tt_record2("beginning wait for rpc lock, id %d (bucket %d)",
-			id, bucket->id);
+		   id, bucket->id);
 	spin_lock_bh(&bucket->lock);
 	tt_record2("ending wait for bucket lock, id %d (bucket %d)",
-			id, bucket->id);
+		   id, bucket->id);
 	if (homa_is_client(id)) {
 		INC_METRIC(client_lock_misses, 1);
 		INC_METRIC(client_lock_miss_cycles, get_cycles() - start);
diff --git a/homa_sock.h b/homa_sock.h
index 5704ec53..bd56c734 100644
--- a/homa_sock.h
+++ b/homa_sock.h
@@ -9,7 +9,7 @@
 struct homa;
 struct homa_pool;
 
-extern void homa_sock_lock_slow(struct homa_sock *hsk);
+void homa_sock_lock_slow(struct homa_sock *hsk);
 
 /**
 * define HOMA_SOCKTAB_BUCKETS - Number of hash buckets in a homa_socktab.
@@ -252,20 +252,20 @@ struct homa_sock {
 	struct homa_pool *buffer_pool;
 };
 
-extern void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, __u64 id);
-extern int homa_sock_bind(struct homa_socktab *socktab,
-		struct homa_sock *hsk, __u16 port);
-extern void homa_sock_destroy(struct homa_sock *hsk);
-extern struct homa_sock *
+void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, __u64 id);
+int homa_sock_bind(struct homa_socktab *socktab,
+		   struct homa_sock *hsk, __u16 port);
+void homa_sock_destroy(struct homa_sock *hsk);
+struct homa_sock *
 homa_sock_find(struct homa_socktab *socktab, __u16 port);
-extern void homa_sock_init(struct homa_sock *hsk, struct homa *homa);
-extern void homa_sock_shutdown(struct homa_sock *hsk);
-extern int homa_socket(struct sock *sk);
-extern void homa_socktab_destroy(struct homa_socktab *socktab);
-extern void homa_socktab_init(struct homa_socktab *socktab);
-extern struct homa_sock
+void homa_sock_init(struct homa_sock *hsk, struct homa *homa);
+void homa_sock_shutdown(struct homa_sock *hsk);
+int homa_socket(struct sock *sk);
+void homa_socktab_destroy(struct homa_socktab *socktab);
+void homa_socktab_init(struct homa_socktab *socktab);
+struct homa_sock
 *homa_socktab_next(struct homa_socktab_scan *scan);
-extern struct homa_sock
+struct homa_sock
 *homa_socktab_start_scan(struct homa_socktab *socktab,
		struct homa_socktab_scan *scan);
 
@@ -319,8 +319,8 @@ static inline int homa_port_hash(__u16 port)
 *
 * Return:  The bucket in which this RPC will appear, if the RPC exists.
 */
-static inline struct homa_rpc_bucket *homa_client_rpc_bucket(
-		struct homa_sock *hsk, __u64 id)
+static inline struct homa_rpc_bucket *homa_client_rpc_bucket(struct homa_sock *hsk,
+							      __u64 id)
 {
 	/* We can use a really simple hash function here because RPC ids
	 * are allocated sequentially.
@@ -337,8 +337,8 @@ static inline struct homa_rpc_bucket *homa_client_rpc_bucket(
 *
 * Return:  The bucket in which this RPC will appear, if the RPC exists.
 */
-static inline struct homa_rpc_bucket *homa_server_rpc_bucket(
-		struct homa_sock *hsk, __u64 id)
+static inline struct homa_rpc_bucket *homa_server_rpc_bucket(struct homa_sock *hsk,
+							      __u64 id)
 {
 	/* Each client allocates RPC ids sequentially, so they will
	 * naturally distribute themselves across the hash space.
@@ -357,7 +357,7 @@ static inline struct homa_rpc_bucket *homa_server_rpc_bucket(
 *           but used occasionally for diagnostics and debugging.
 */
 static inline void homa_bucket_lock(struct homa_rpc_bucket *bucket,
-		__u64 id, const char *locker)
+				    __u64 id, const char *locker)
 {
 	if (!spin_trylock_bh(&bucket->lock))
 		homa_bucket_lock_slow(bucket, id);
@@ -374,7 +374,7 @@ static inline void homa_bucket_lock(struct homa_rpc_bucket *bucket,
 *           currently owned by someone else.
 */
 static inline int homa_bucket_try_lock(struct homa_rpc_bucket *bucket,
-		__u64 id, const char *locker)
+				       __u64 id, const char *locker)
 {
 	if (!spin_trylock_bh(&bucket->lock))
 		return 0;
@@ -396,4 +396,4 @@ static inline struct homa_sock *homa_sk(const struct sock *sk)
 	return (struct homa_sock *)sk;
 }
 
-#endif /* _HOMA_SOCK_H */
\ No newline at end of file
+#endif /* _HOMA_SOCK_H */
diff --git a/homa_stub.h b/homa_stub.h
new file mode 100644
index 00000000..fb2ba419
--- /dev/null
+++ b/homa_stub.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: BSD-2-Clause */
+
+/* This file contains stripped-down replacements for functionality that
+ * has been temporarily removed from Homa during the Linux upstreaming
+ * process.  By the time upstreaming is complete this file will
+ * have gone away.
+ */
+
+#ifndef _HOMA_STUB_H
+#define _HOMA_STUB_H
+
+#include "homa_impl.h"
+
+static inline int homa_skb_append_from_iter(struct homa *homa,
+					    struct sk_buff *skb,
+					    struct iov_iter *iter, int length)
+{
+	char *dst = skb_put(skb, length);
+
+	if (copy_from_iter(dst, length, iter) != length)
+		return -EFAULT;
+	return 0;
+}
+
+static inline int homa_skb_append_to_frag(struct homa *homa, struct sk_buff *skb,
+					  void *buf, int length)
+{
+	char *dst = skb_put(skb, length);
+
+	memcpy(dst, buf, length);
+	return 0;
+}
+
+static inline int homa_skb_append_from_skb(struct homa *homa,
+					   struct sk_buff *dst_skb,
+					   struct sk_buff *src_skb,
+					   int offset, int length)
+{
+	return homa_skb_append_to_frag(homa, dst_skb,
+			skb_transport_header(src_skb) + offset, length);
+}
+
+static inline void homa_skb_free_tx(struct homa *homa, struct sk_buff *skb)
+{
+	kfree_skb(skb);
+}
+
+static inline void homa_skb_free_many_tx(struct homa *homa,
+					 struct sk_buff **skbs, int count)
+{
+	int i;
+
+	for (i = 0; i < count; i++)
+		kfree_skb(skbs[i]);
+}
+
+static inline void homa_skb_get(struct sk_buff *skb, void *dest, int offset,
+				int length)
+{
+	memcpy(dest, skb_transport_header(skb) + offset, length);
+}
+
+static inline struct sk_buff *homa_skb_new_tx(int length)
+{
+	struct sk_buff *skb;
+
+	skb = alloc_skb(HOMA_SKB_EXTRA + HOMA_IPV6_HEADER_LENGTH
+			+ sizeof(struct homa_skb_info) + length,
+			GFP_KERNEL);
+	if (likely(skb)) {
+		skb_reserve(skb, HOMA_SKB_EXTRA + HOMA_IPV6_HEADER_LENGTH);
+		skb_reset_transport_header(skb);
+	}
+	return skb;
+}
+
+static inline void homa_skb_stash_pages(struct homa *homa, int length)
+{}
+
+#endif /* _HOMA_STUB_H */
diff --git a/homa_timer.c b/homa_timer.c
index 9969f532..f7ec816a 100644
--- a/homa_timer.c
+++ b/homa_timer.c
@@ -24,22 +24,22 @@ void homa_check_rpc(struct homa_rpc *rpc)
 	const char *us, *them;
 
 	/* See if we need to request an ack for this RPC. */
-	if (!homa_is_client(rpc->id) && (rpc->state == RPC_OUTGOING)
-			&& (rpc->msgout.next_xmit_offset >= rpc->msgout.length)) {
-		if (rpc->done_timer_ticks == 0)
+	if (!homa_is_client(rpc->id) && rpc->state == RPC_OUTGOING &&
+	    rpc->msgout.next_xmit_offset >= rpc->msgout.length) {
+		if (rpc->done_timer_ticks == 0) {
 			rpc->done_timer_ticks = homa->timer_ticks;
-		else {
+		} else {
			/* >= comparison that handles tick wrap-around.
			 */
 			if ((rpc->done_timer_ticks + homa->request_ack_ticks
-					- 1 - homa->timer_ticks) & 1<<31) {
+					- 1 - homa->timer_ticks) & 1 << 31) {
 				struct need_ack_header h;
 
 				homa_xmit_control(NEED_ACK, &h, sizeof(h), rpc);
 				tt_record4("Sent NEED_ACK for RPC id %d to peer 0x%x, port %d, ticks %d",
-						rpc->id,
-						tt_addr(rpc->peer->addr),
-						rpc->dport, homa->timer_ticks
-						- rpc->done_timer_ticks);
+					   rpc->id,
+					   tt_addr(rpc->peer->addr),
+					   rpc->dport, homa->timer_ticks
+					   - rpc->done_timer_ticks);
 			}
 		}
 	}
@@ -81,16 +81,16 @@ void homa_check_rpc(struct homa_rpc *rpc)
 	if (rpc->silent_ticks >= homa->timeout_ticks) {
 		INC_METRIC(rpc_timeouts, 1);
 		tt_record3("RPC id %d, peer 0x%x, aborted because of timeout, state %d",
-				rpc->id, tt_addr(rpc->peer->addr), rpc->state);
+			   rpc->id, tt_addr(rpc->peer->addr), rpc->state);
 		homa_rpc_log_active_tt(homa, 0);
 		tt_record1("Freezing because of RPC abort (id %d)", rpc->id);
 		homa_freeze_peers(homa);
 		tt_freeze();
 		if (homa->verbose)
 			pr_notice("RPC id %llu, peer %s, aborted because of timeout, state %d\n",
-					rpc->id,
-					homa_print_ipv6_addr(&rpc->peer->addr),
-					rpc->state);
+				  rpc->id,
+				  homa_print_ipv6_addr(&rpc->peer->addr),
+				  rpc->state);
 		homa_rpc_abort(rpc, -ETIMEDOUT);
 		return;
 	}
@@ -118,6 +118,7 @@ void homa_check_rpc(struct homa_rpc *rpc)
 	}
 	resend.priority = homa->num_priorities-1;
 	homa_xmit_control(RESEND, &resend, sizeof(resend), rpc);
+#if 1 /* See strip.py */
 	if (homa_is_client(rpc->id)) {
 		us = "client";
 		them = "server";
@@ -125,25 +126,26 @@
				rpc->id, tt_addr(rpc->peer->addr),
				rpc->dport, rpc->msgin.recv_end);
 		tt_record4("length %d, granted %d, rem %d, rec_incoming %d",
-				rpc->msgin.length, rpc->msgin.granted,
-				rpc->msgin.bytes_remaining,
-				rpc->msgin.rec_incoming);
+			   rpc->msgin.length, rpc->msgin.granted,
+			   rpc->msgin.bytes_remaining,
+			   rpc->msgin.rec_incoming);
 	} else {
 		us = "server";
 		them = "client";
 		tt_record4("Sent RESEND for server RPC id %llu, client 0x%x:%d offset %d",
-				rpc->id, tt_addr(rpc->peer->addr),
-				rpc->dport, rpc->msgin.recv_end);
+			   rpc->id, tt_addr(rpc->peer->addr), rpc->dport,
+			   rpc->msgin.recv_end);
 		tt_record4("length %d, granted %d, rem %d, rec_incoming %d",
-				rpc->msgin.length, rpc->msgin.granted,
-				rpc->msgin.bytes_remaining,
-				rpc->msgin.rec_incoming);
+			   rpc->msgin.length, rpc->msgin.granted,
+			   rpc->msgin.bytes_remaining,
+			   rpc->msgin.rec_incoming);
 	}
+#endif /* See strip.py */
 	if (homa->verbose)
 		pr_notice("Homa %s RESEND to %s %s:%d for id %llu, offset %d, length %d\n",
			  us, them,
-				homa_print_ipv6_addr(&rpc->peer->addr),
-				rpc->dport, rpc->id, rpc->msgin.recv_end,
-				rpc->msgin.granted - rpc->msgin.recv_end);
+			  homa_print_ipv6_addr(&rpc->peer->addr),
+			  rpc->dport, rpc->id, rpc->msgin.recv_end,
+			  rpc->msgin.granted - rpc->msgin.recv_end);
 }
 
 /**
@@ -201,7 +203,7 @@ void homa_timer(struct homa *homa)
	 */
 	rcu_read_lock();
 	for (hsk = homa_socktab_start_scan(homa->port_map, &scan);
-			hsk != NULL; hsk = homa_socktab_next(&scan)) {
+			hsk; hsk = homa_socktab_next(&scan)) {
 		while (hsk->dead_skbs >= homa->dead_buffs_limit) {
 			/* If we get here, it means that homa_wait_for_message
			 * isn't keeping up with RPC reaping, so we'll help
diff --git a/homa_utils.c b/homa_utils.c
index 8fc932b6..a92d28e7 100644
--- a/homa_utils.c
+++ b/homa_utils.c
@@ -56,9 +56,9 @@ int homa_init(struct homa *homa)
 	homa->throttle_min_bytes = 200;
 	atomic_set(&homa->total_incoming, 0);
 	homa->next_client_port = HOMA_MIN_DEFAULT_PORT;
-	homa->port_map = kmalloc(sizeof *homa->port_map, GFP_KERNEL);
+	homa->port_map = kmalloc(sizeof(*homa->port_map), GFP_KERNEL);
 	homa_socktab_init(homa->port_map);
-	homa->peers = kmalloc(sizeof *homa->peers, GFP_KERNEL);
+	homa->peers = kmalloc(sizeof(*homa->peers), GFP_KERNEL);
 	err = homa_peertab_init(homa->peers);
 	if (err) {
 		pr_err("Couldn't initialize peer table (errno %d)\n", -err);
@@ -101,7 +101,7 @@ int homa_init(struct homa *homa)
 	homa->dead_buffs_limit = 5000;
 	homa->max_dead_buffs = 0;
 	homa->pacer_kthread = kthread_run(homa_pacer_main, homa,
-			"homa_pacer");
+					  "homa_pacer");
 	if (IS_ERR(homa->pacer_kthread)) {
 		err = PTR_ERR(homa->pacer_kthread);
 		homa->pacer_kthread = NULL;
@@ -650,7 +650,7 @@ void homa_spin(int ns)
 {
 	__u64 end;
 
-	end = get_cycles() + (ns*cpu_khz)/1000000;
+	end = get_cycles() + (ns * cpu_khz) / 1000000;
 	while (get_cycles() < end)
 		/* Empty loop body.*/
 		;
diff --git a/homa_wire.h b/homa_wire.h
index 3cc38417..d68bc8e9 100644
--- a/homa_wire.h
+++ b/homa_wire.h
@@ -285,12 +285,11 @@ struct data_header {
 	struct seg_header seg;
 } __packed;
 _Static_assert(sizeof(struct data_header) <= HOMA_MAX_HEADER,
-		"data_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER");
+	       "data_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER");
 _Static_assert(sizeof(struct data_header) >= HOMA_MIN_PKT_LENGTH,
-		"data_header too small: Homa doesn't currently have codeto pad data packets");
-_Static_assert(((sizeof(struct data_header) - sizeof(struct seg_header))
-		& 0x3) == 0,
-		" data_header length not a multiple of 4 bytes (required for TCP/TSO compatibility");
+	       "data_header too small: Homa doesn't currently have code to pad data packets");
+_Static_assert(((sizeof(struct data_header) - sizeof(struct seg_header)) & 0x3) == 0,
+	       "data_header length not a multiple of 4 bytes (required for TCP/TSO compatibility)");
 
 /**
 * homa_data_len() - Returns the total number of bytes in a DATA packet
@@ -372,7 +371,7 @@ struct resend_header {
 	__u8 priority;
 } __packed;
 _Static_assert(sizeof(struct resend_header) <= HOMA_MAX_HEADER,
-		"resend_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER");
+	       "resend_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER");
 
 /**
 * struct unknown_header - Wire format for UNKNOWN packets.
@@ -388,7 +387,7 @@ struct unknown_header {
 	struct common_header common;
 } __packed;
 _Static_assert(sizeof(struct unknown_header) <= HOMA_MAX_HEADER,
-		"unknown_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER");
+	       "unknown_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER");
 
 /**
 * struct busy_header - Wire format for BUSY packets.
@@ -401,7 +400,7 @@ struct busy_header {
 	struct common_header common;
 } __packed;
 _Static_assert(sizeof(struct busy_header) <= HOMA_MAX_HEADER,
-		"busy_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER");
+	       "busy_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER");
 
 /**
 * struct cutoffs_header - Wire format for CUTOFFS packets.
@@ -454,7 +453,7 @@ struct need_ack_header {
 	struct common_header common;
 } __packed;
 _Static_assert(sizeof(struct need_ack_header) <= HOMA_MAX_HEADER,
-		"need_ack_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER");
+	       "need_ack_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER");
 
 /**
 * struct ack_header - Wire format for ACK packets.
@@ -474,7 +473,7 @@ struct ack_header {
 	struct homa_ack acks[HOMA_MAX_ACKS_PER_PKT];
 } __packed;
 _Static_assert(sizeof(struct ack_header) <= HOMA_MAX_HEADER,
-		"ack_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER");
+	       "ack_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER");
 
 /**
 * homa_local_id(): given an RPC identifier from an input packet (which
diff --git a/test/Makefile b/test/Makefile
index cb981d77..fb2faa5c 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -6,25 +6,25 @@ CXX ?= g++
 PERL ?= perl
 ARCH ?= x86
 
-CINCLUDES := -I. \
-	-I.. \
+all: test
+
+KERN_INCLUDES := \
	-I$(KDIR)/arch/x86/include \
	-I$(KDIR)/arch/x86/include/generated \
	-I$(KDIR)/include \
	-I$(KDIR)/arch/x86/include/uapi \
	-I$(KDIR)/arch/x86/include/generated/uapi \
	-I$(KDIR)/include/uapi \
-	-I$(KDIR)/include/generated/uapi \
+	-I$(KDIR)/include/generated/uapi
+
+CINCLUDES := \
+	-I. \
+	-I.. \
+	$(KERN_INCLUDES) \
	-include $(KDIR)/include/linux/kconfig.h
 
-CCINCLUDES := -I. \
+CCINCLUDES := \
+	-I. \
	-I.. \
-	-I$(KDIR)/arch/x86/include \
-	-I$(KDIR)/arch/x86/include/generated \
-	-I$(KDIR)/include \
-	-I$(KDIR)/arch/x86/include/uapi \
-	-I$(KDIR)/arch/x86/include/generated/uapi \
-	-I$(KDIR)/include/uapi \
-	-I$(KDIR)/include/generated/uapi
+	$(KERN_INCLUDES)
 
 DEFS := -D__KERNEL__ \
	-D__UNIT_TEST__ \
@@ -71,15 +71,12 @@ OTHER_SRCS := ccutils.cc \
	main.c \
	mock.c \
	utils.c
-
 OTHER_OBJS := $(patsubst %.c,%.o,$(patsubst %.cc,%.o,$(OTHER_SRCS)))
 
 OBJS := $(TEST_OBJS) $(HOMA_OBJS) $(OTHER_OBJS)
 
 CLEANS = unit $(OBJS) *.d .deps
 
-all: run_tests
-
 # This seems to be the only way to disable the built-in implicit rules
 # for %:%.c and %:%.cc.
 .SUFFIXES:
@@ -100,9 +97,49 @@ all: run_tests
 unit: $(OBJS)
	$(CXX) $(CFLAGS) $^ -o $@ -lasan
 
-run_tests: unit
+test: unit
	./unit
 
+# Additional definitions for running unit tests using stripped sources.
+
+S_HOMA_SRCS := $(patsubst %,stripped/%,$(filter-out timetrace.c, $(HOMA_SRCS)))
+S_HOMA_OBJS := $(patsubst %.c,%.o,$(S_HOMA_SRCS))
+S_HOMA_HDRS := stripped/homa.h \
+	stripped/homa_impl.h \
+	stripped/homa_peer.h \
+	stripped/homa_pool.h \
+	stripped/homa_receiver.h \
+	stripped/homa_rpc.h \
+	stripped/homa_sock.h \
+	stripped/homa_stub.h \
+	stripped/homa_wire.h
+stripped/%.c: ../%.c
+	../util/strip.py $< > $@
+stripped/%.h: ../%.h
+	../util/strip.py $< > $@
+S_TEST_OBJS := $(patsubst %,stripped/%,$(filter-out unit_timetrace.o, $(TEST_OBJS)))
+S_OBJS := $(S_HOMA_OBJS) $(S_TEST_OBJS) $(patsubst %,stripped/%,$(OTHER_OBJS))
+
+$(S_OBJS): | stripped $(S_HOMA_HDRS)
+
+stripped:
+	mkdir -p stripped
+
+stripped/%.o: stripped/%.c
+	$(CC) -c $(patsubst -I..,-Istripped,$(CFLAGS)) $< -o $@
+stripped/%.o: %.c
+	$(CC) -c $(patsubst -I..,-Istripped,$(CFLAGS)) $< -o $@
+stripped/%.o: %.cc
+	$(CXX) -c $(patsubst -I..,-Istripped,$(CCFLAGS)) $< -o $@
+
+s_unit: $(S_OBJS)
+	$(CXX) $(CFLAGS) $^ -o $@ -lasan
+
+s_test: s_unit
+	./s_unit
+
+CLEANS += s_unit
+
 # The target below shouldn't be needed: theoretically, any code that is
 # sensitive to IPv4 vs. IPv6 should be tested explicitly, regardless of
 # the --ipv4 argument.
@@ -113,13 +150,14 @@ test_both: unit
	./unit
 
 clean:
-	rm -f unit $(CLEANS)
+	rm -f $(CLEANS)
+	rm -rf stripped
 
 # This magic (along with the -MD gcc option) automatically generates makefile
 # dependencies for header files included from C source files we compile,
 # and keeps those dependencies up-to-date every time we recompile.
 # See 'mergedep.pl' for more information.
-.deps: $(wildcard *.d)
+.deps: $(wildcard *.d stripped/*.d)
	@mkdir -p $(@D)
	$(PERL) mergedep.pl $@ $^
 
-include .deps
@@ -128,4 +166,3 @@ clean:
 # prints the value of a make variable.
 print-%:
	@echo $* = $($*)
-
diff --git a/test/mock.c b/test/mock.c
index ec0bbfe3..ff0b369a 100644
--- a/test/mock.c
+++ b/test/mock.c
@@ -703,7 +703,11 @@ void kfree(const void *block)
 	free((void *) block);
 }
 
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0)
 void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
+#else
+void __kfree_skb(struct sk_buff *skb)
+#endif
 {
 	int i;
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
@@ -948,6 +952,12 @@ int sk_set_peek_off(struct sock *sk, int val)
 	return 0;
 }
 
+void sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb,
+			enum skb_drop_reason reason)
+{
+	__kfree_skb(skb);
+}
+
 int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
		struct iov_iter *iter, int size)
 {
diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c
index f6f5c4a7..ef4c64b1 100644
--- a/test/unit_homa_incoming.c
+++ b/test/unit_homa_incoming.c
@@ -754,6 +754,7 @@ TEST_F(homa_incoming, homa_copy_to_user__error_in_skb_copy_datagram_iter)
 	EXPECT_STREQ("", unit_log_get());
 	EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets));
 }
+#ifdef HOMA_TIMETRACE_H
 TEST_F(homa_incoming, homa_copy_to_user__timetrace_info)
 {
 	struct homa_rpc *crpc;
@@ -790,6 +791,7 @@ TEST_F(homa_incoming, homa_copy_to_user__timetrace_info)
			traces);
 	tt_destroy();
 }
+#endif
 
 TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_ipv4)
 {
diff --git a/timetrace.c b/timetrace.c
index 15f490ab..abcd1fc9 100644
--- a/timetrace.c
+++ b/timetrace.c
@@ -10,29 +10,29 @@
 //#define TT_KERNEL 1
 #endif
 #ifdef TT_KERNEL
-extern struct tt_buffer *tt_linux_buffers[];
-extern void (*tt_linux_freeze)(void);
-extern atomic_t *tt_linux_freeze_count;
-extern atomic_t tt_linux_freeze_no_homa;
-extern int *tt_linux_homa_temp;
-extern int tt_linux_homa_temp_default[];
-extern void (*tt_linux_inc_metrics)(int metric, __u64 count);
-extern void (*tt_linux_record)(struct tt_buffer *buffer, __u64 timestamp,
-		const char *format, __u32 arg0, __u32 arg1, __u32 arg2,
-		__u32 arg3);
-extern void tt_linux_skip_metrics(int metric, __u64 count);
-extern void (*tt_linux_printk)(void);
-extern void (*tt_linux_dbg1)(char *msg, ...);
-extern void (*tt_linux_dbg2)(char *msg, ...);
-extern void (*tt_linux_dbg3)(char *msg, ...);
-extern void tt_linux_nop(void);
-extern void homa_trace(__u64 u0, __u64 u1, int i0, int i1);
-
-extern void ltt_record_nop(struct tt_buffer *buffer, __u64 timestamp,
-		const char *format, __u32 arg0, __u32 arg1,
-		__u32 arg2, __u32 arg3);
+struct tt_buffer *tt_linux_buffers[];
+void (*tt_linux_freeze)(void);
+atomic_t *tt_linux_freeze_count;
+atomic_t tt_linux_freeze_no_homa;
+int *tt_linux_homa_temp;
+int tt_linux_homa_temp_default[];
+void (*tt_linux_inc_metrics)(int metric, __u64 count);
+void (*tt_linux_record)(struct tt_buffer *buffer, __u64 timestamp,
+			const char *format, __u32 arg0, __u32 arg1, __u32 arg2,
+			__u32 arg3);
+void tt_linux_skip_metrics(int metric, __u64 count);
+void (*tt_linux_printk)(void);
+void (*tt_linux_dbg1)(char *msg, ...);
+void (*tt_linux_dbg2)(char *msg, ...);
+void (*tt_linux_dbg3)(char *msg, ...);
+void tt_linux_nop(void);
+void homa_trace(__u64 u0, __u64 u1, int i0, int i1);
+
+void ltt_record_nop(struct tt_buffer *buffer, __u64 timestamp,
+		    const char *format, __u32 arg0, __u32 arg1,
+		    __u32 arg2, __u32 arg3);
 #endif
-extern void tt_inc_metric(int metric, __u64 count);
+void tt_inc_metric(int metric, __u64 count); /* Separate buffers for each core: this eliminates the need for * synchronization in tt_record, which improves performance significantly. @@ -110,19 +110,19 @@ int tt_init(char *proc_file, int *temp) struct tt_buffer *buffer; buffer = kmalloc(sizeof(*buffer), GFP_KERNEL); - if (buffer == NULL) { - pr_err("timetrace couldn't allocate tt_buffers\n"); + if (!buffer) { + pr_err("%s couldn't allocate tt_buffers\n", __func__); goto error; } memset(buffer, 0, sizeof(*buffer)); tt_buffers[i] = buffer; } - if (proc_file != NULL) { + if (proc_file) { tt_dir_entry = proc_create(proc_file, 0444, NULL, &tt_pops); if (!tt_dir_entry) { pr_err("couldn't create /proc/%s for timetrace reading\n", - proc_file); + proc_file); goto error; } } else { @@ -170,7 +170,7 @@ void tt_destroy(void) spin_lock(&tt_lock); if (init) { init = false; - if (tt_dir_entry != NULL) + if (tt_dir_entry) proc_remove(tt_dir_entry); } for (i = 0; i < nr_cpu_ids; i++) { @@ -239,8 +239,8 @@ void tt_freeze(void) * @arg3: Argument to use when printing a message about this event. */ void tt_record_buf(struct tt_buffer *buffer, __u64 timestamp, - const char *format, __u32 arg0, __u32 arg1, __u32 arg2, - __u32 arg3) + const char *format, __u32 arg0, __u32 arg1, __u32 arg2, + __u32 arg3) { struct tt_event *event; @@ -254,9 +254,9 @@ void tt_record_buf(struct tt_buffer *buffer, __u64 timestamp, event = &buffer->events[buffer->next_index]; buffer->next_index = (buffer->next_index + 1) #ifdef __UNIT_TEST__ - & (tt_buffer_size-1); + & (tt_buffer_size - 1); #else - & (TT_BUF_SIZE-1); + & (TT_BUF_SIZE - 1); #endif event->timestamp = timestamp; @@ -288,11 +288,11 @@ void tt_find_oldest(int *pos) for (i = 0; i < nr_cpu_ids; i++) { buffer = tt_buffers[i]; - if (buffer->events[tt_buffer_size-1].format == NULL) { + if (!buffer->events[tt_buffer_size - 1].format) { pos[i] = 0; } else { int index = (buffer->next_index + 1) - & (tt_buffer_size-1); + & (tt_buffer_size - 1); struct tt_event *event = &buffer->events[index]; pos[i] = index; @@ -306,9 +306,9 @@ void tt_find_oldest(int *pos) */ for (i = 0; i < nr_cpu_ids; i++) { buffer = tt_buffers[i]; - while ((buffer->events[pos[i]].timestamp < start_time) - && (pos[i] != buffer->next_index)) { - pos[i] = (pos[i] + 1) & (tt_buffer_size-1); + while (buffer->events[pos[i]].timestamp < start_time && + pos[i] != buffer->next_index) { + pos[i] = (pos[i] + 1) & (tt_buffer_size - 1); } } } @@ -332,7 +332,7 @@ int tt_proc_open(struct inode *inode, struct file *file) goto done; } pf = kmalloc(sizeof(*pf), GFP_KERNEL); - if (pf == NULL) { + if (!pf) { result = -ENOMEM; goto done; } @@ -346,7 +346,7 @@ int tt_proc_open(struct inode *inode, struct file *file) if (!tt_test_no_khz) { pf->bytes_available = snprintf(pf->msg_storage, TT_PF_BUF_SIZE, - "cpu_khz: %u\n", cpu_khz); + "cpu_khz: %u\n", cpu_khz); } done: @@ -368,7 +368,7 @@ int tt_proc_open(struct inode *inode, struct file *file) * file was reached, and a negative number indicates an error (-errno). 
*/ ssize_t tt_proc_read(struct file *file, char __user *user_buf, - size_t length, loff_t *offset) + size_t length, loff_t *offset) { struct tt_proc_file *pf = file->private_data; @@ -378,9 +378,9 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf, int copied_to_user = 0; spin_lock(&tt_lock); - if ((pf == NULL) || (pf->file != file)) { + if (!pf || pf->file != file) { pr_err("tt_metrics_read found damaged private_data: 0x%p\n", - file->private_data); + file->private_data); copied_to_user = -EINVAL; goto done; } @@ -403,8 +403,8 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf, struct tt_buffer *buffer = tt_buffers[i]; event = &buffer->events[pf->pos[i]]; - if ((pf->pos[i] != buffer->next_index) - && (event->timestamp < earliest_time)) { + if (pf->pos[i] != buffer->next_index && + event->timestamp < earliest_time) { current_core = i; earliest_time = event->timestamp; } @@ -415,16 +415,15 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf, } /* Format one event. */ - event = &(tt_buffers[current_core]->events[ - pf->pos[current_core]]); + event = &(tt_buffers[current_core]->events[pf->pos[current_core]]); available = tt_pf_storage - (pf->next_byte + pf->bytes_available - pf->msg_storage); if (available == 0) goto flush; entry_length = snprintf(pf->next_byte + pf->bytes_available, - available, "%lu [C%02d] ", - (unsigned long) event->timestamp, - current_core); + available, "%lu [C%02d] ", + (unsigned long)event->timestamp, + current_core); if (available >= entry_length) entry_length += snprintf(pf->next_byte + pf->bytes_available + entry_length, @@ -446,7 +445,7 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf, pf->next_byte[pf->bytes_available + entry_length] = '\n'; pf->bytes_available += entry_length + 1; pf->pos[current_core] = (pf->pos[current_core] + 1) - & (tt_buffer_size-1); + & (tt_buffer_size - 1); continue; flush: @@ -456,7 +455,7 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf, if (chunk_size == 0) goto done; failed_to_copy = copy_to_user(user_buf + copied_to_user, - pf->next_byte, chunk_size); + pf->next_byte, chunk_size); chunk_size -= failed_to_copy; pf->bytes_available -= chunk_size; if (pf->bytes_available == 0) @@ -476,7 +475,6 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf, return copied_to_user; } - /** * tt_proc_lseek() - This function is invoked to handle seeks on * /proc/timetrace. 
Right now seeks are ignored: the file must be @@ -503,9 +501,9 @@ int tt_proc_release(struct inode *inode, struct file *file) struct tt_proc_file *pf = file->private_data; int i; - if ((pf == NULL) || (pf->file != file)) { - pr_err("tt_metrics_release found damaged private_data: 0x%p\n", - file->private_data); + if (!pf || pf->file != file) { + pr_err("%s found damaged private_data: 0x%p\n", __func__, + file->private_data); return -EINVAL; } @@ -527,7 +525,7 @@ int tt_proc_release(struct inode *inode, struct file *file) for (i = 0; i < nr_cpu_ids; i++) { struct tt_buffer *buffer = tt_buffers[i]; - buffer->events[tt_buffer_size-1].format = NULL; + buffer->events[tt_buffer_size - 1].format = NULL; buffer->next_index = 0; } } @@ -578,7 +576,7 @@ void tt_print_file(char *path) filp = filp_open(path, O_WRONLY | O_CREAT, 0666); if (IS_ERR(filp)) { pr_err("%s couldn't open %s: error %ld\n", __func__, path, - -PTR_ERR(filp)); + -PTR_ERR(filp)); filp = NULL; goto done; } @@ -602,8 +600,8 @@ void tt_print_file(char *path) struct tt_buffer *buffer = tt_buffers[i]; event = &buffer->events[pos[i]]; - if ((pos[i] != buffer->next_index) - && (event->timestamp < earliest_time)) { + if (pos[i] != buffer->next_index && + event->timestamp < earliest_time) { current_core = i; earliest_time = event->timestamp; } @@ -612,15 +610,14 @@ void tt_print_file(char *path) /* None of the traces have any more events. */ break; } - event = &(tt_buffers[current_core]->events[ - pos[current_core]]); - pos[current_core] = (pos[current_core] + 1) - & (tt_buffer_size-1); + event = &(tt_buffers[current_core]->events[pos[current_core]]); + pos[current_core] = (pos[current_core] + 1) & + (tt_buffer_size - 1); bytes_used += snprintf(buffer + bytes_used, sizeof(buffer) - bytes_used, "%lu [C%02d] ", - (unsigned long) event->timestamp, + (unsigned long)event->timestamp, current_core); bytes_used += snprintf(buffer + bytes_used, sizeof(buffer) - bytes_used, @@ -632,10 +629,10 @@ void tt_print_file(char *path) } if ((bytes_used + 1000) >= sizeof(buffer)) { err = kernel_write(filp, buffer, bytes_used, - &offset); + &offset); if (err < 0) { pr_notice("%s got error %d writing %s\n", - __func__, -err, path); + __func__, -err, path); goto done; } bytes_used = 0; @@ -645,18 +642,18 @@ void tt_print_file(char *path) err = kernel_write(filp, buffer, bytes_used, &offset); if (err < 0) pr_err("%s got error %d writing %s\n", - __func__, -err, path); + __func__, -err, path); } done: - if (filp != NULL) { + if (filp) { err = vfs_fsync(filp, 0); if (err < 0) pr_err("%s got error %d in fsync\n", __func__, -err); err = filp_close(filp, NULL); if (err < 0) pr_err("%s got error %d in filp_close\n", __func__, - -err); + -err); } atomic_dec(&tt_freeze_count); atomic_set(&active, 0); @@ -703,8 +700,8 @@ void tt_printk(void) struct tt_buffer *buffer = tt_buffers[i]; event = &buffer->events[pos[i]]; - if ((pos[i] != buffer->next_index) - && (event->timestamp < earliest_time)) { + if (pos[i] != buffer->next_index && + event->timestamp < earliest_time) { current_core = i; earliest_time = event->timestamp; } @@ -713,16 +710,15 @@ void tt_printk(void) /* None of the traces have any more events. 
*/
 			break;
 		}
-		event = &(tt_buffers[current_core]->events[
-				pos[current_core]]);
+		event = &(tt_buffers[current_core]->events[pos[current_core]]);
 		pos[current_core] = (pos[current_core] + 1)
-				& (tt_buffer_size-1);
+				& (tt_buffer_size - 1);

 		snprintf(msg, sizeof(msg), event->format, event->arg0,
-				event->arg1, event->arg2, event->arg3);
+			 event->arg1, event->arg2, event->arg3);
 		pr_notice("%lu [C%02d] %s\n",
-				(unsigned long) event->timestamp,
-				current_core, msg);
+			  (unsigned long)event->timestamp,
+			  current_core, msg);
 	}

 	atomic_dec(&tt_freeze_count);
@@ -763,8 +759,8 @@ void tt_get_messages(char *buffer, size_t length)
 			struct tt_buffer *buffer = tt_buffers[i];

 			event = &buffer->events[pos[i]];
-			if ((pos[i] != buffer->next_index)
-					&& (event->timestamp < earliest_time)) {
+			if (pos[i] != buffer->next_index &&
+			    event->timestamp < earliest_time) {
 				current_core = i;
 				earliest_time = event->timestamp;
 			}
@@ -773,22 +769,21 @@ void tt_get_messages(char *buffer, size_t length)
 			/* None of the traces have any more events. */
 			break;
 		}
-		event = &(tt_buffers[current_core]->events[
-				pos[current_core]]);
+		event = &(tt_buffers[current_core]->events[pos[current_core]]);
 		pos[current_core] = (pos[current_core] + 1)
-				& (tt_buffer_size-1);
+				& (tt_buffer_size - 1);

 		if (printed > 0) {
 			result = snprintf(buffer + printed, length - printed,
-					"; ");
-			if ((result < 0) || (result >= (length - printed)))
+					  "; ");
+			if (result < 0 || result >= (length - printed))
 				break;
 			printed += result;
 		}
 		result = snprintf(buffer + printed, length - printed,
-				event->format, event->arg0, event->arg1,
-				event->arg2, event->arg3);
-		if ((result < 0) || (result >= (length - printed)))
+				  event->format, event->arg0, event->arg1,
+				  event->arg2, event->arg3);
+		if (result < 0 || result >= (length - printed))
 			break;
 		printed += result;
 	}
diff --git a/timetrace.h b/timetrace.h
index 7547e381..db8aa43d 100644
--- a/timetrace.h
+++ b/timetrace.h
@@ -45,7 +45,7 @@ struct tt_event {
 /* The number of events in a tt_buffer, as a power of 2. */
 #define TT_BUF_SIZE_EXP 14
-#define TT_BUF_SIZE (1<<TT_BUF_SIZE_EXP)
+#define TT_BUF_SIZE (1 << TT_BUF_SIZE_EXP)

 static inline __u32 tt_hi(void *p)
 {
-	return ((__u64) p) >> 32;
+	return ((__u64)p) >> 32;
 }

 static inline __u32 tt_lo(void *p)
 {
-	return ((__u64) p) & 0xffffffff;
+	return ((__u64)p) & 0xffffffff;
 }

-#define SPLIT_64(value) (int) (((__u64) (value)) >> 32), (int) (((__u64) (value)) & 0xffffffff)
-
 #endif // HOMA_TIMETRACE_H
diff --git a/util/strip.py b/util/strip.py
new file mode 100755
index 00000000..7493495b
--- /dev/null
+++ b/util/strip.py
@@ -0,0 +1,298 @@
+#!/usr/bin/python3
+
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+This script is used to copy information from the Homa GitHub repo to
+a Linux kernel repo, removing information that doesn't belong in the
+official kernel version (primarily calls to tt_record).
+
+Usage: strip.py file file file ... destdir
+
+Each of the files will be read, stripped as appropriate, and copied to a
+file by the same name in destdir. If there is only a single file and no
+destdir, then the stripped file is printed on standard output.
+
+In some cases, such as calls to tt_record*, information is removed
+automatically. In other cases, it is controlled with #if statements in
+the following ways:
+
+* This entire block will be removed in the stripped version:
+  #if 1 /* See strip.py */
+  ...
+ #else /* See strip.py */ + ... + #endif /* See strip.py */ +""" + +from collections import defaultdict +from glob import glob +from optparse import OptionParser +import math +import os +from pathlib import Path +import re +import string +import sys + +exit_code = 0 + +def remove_close(line): + """ + Given a line of text containing a '}', remove the '}' and any + following white space. If there is no '}', returns the original line. + """ + i = line.rfind('}') + if i < 0: + return line + for j in range(i+1, len(line), 1): + if line[j] != ' ': + break + return line[0:i] + line [j:] + +def remove_open(line): + """ + Given a line of text containing a '{', remove the '{' and any + preceding white space. If there is no '{', returns the original line. + """ + i = line.rfind('{') + if i < 0: + return line + j = -1 + for j in range(i-1, -1, -1): + if line[j] != ' ': + break + return line[0:j+1] + line [i+1:] + +def leading_space(line): + """ + Return the number of characters of leading space in a line (a tab counts + as 8 spaces). + """ + + count = 0 + for c in line: + if c == ' ': + count += 1 + elif c == '\t': + count += 8 + else: + break + return count + +def last_non_blank(s): + """ + Return the last non-blank character in s, or None if there is no + non-blank character in s. + """ + s2 = s.rstrip() + if s2: + return s2[-1] + return None + +def scan(file): + """ + Read a file, remove information that shouldn't appear in the Linux kernel + version, and return an array of lines representing the stripped file. + file: Pathname of file to read + """ + + global exit_code + + # True means we're in the middle of a '/* ... */' comment + in_comment = False + + # True means we're in the middle of a statement that should be skipped. + in_statement = False + + # Values of 0 or 1 mean we're in the middle of a group of lines labeled + # with '#if /* GitHubOnly */'. 0 means we're including lines, 1 means + # we're stripping them. None means we're not in such a group. + in_labeled_skip = None + + # True means we're in the middle of an '#ifdef __UNIT_TEST__' + in_unit = False + + # Array of lines containing the stripped version of the file + slines = [] + + # Index in slines of the most recent line ending with a '{', or None + # if none. Only valid for innermost blocks (those with no nested blocks). + open_index = None + + # Number of statements that have been seen since the last '{': used to + # eliminate curly braces around blocks that end up with only a single + # statement. Set to a number > 1 if there isn't an "interesting" + # current block. + statements_in_block = 100 + + # True means lines were automatically deleted in the current block; + # at the end of the block, see if curly braces are no longer needed. + check_braces = False + + # Used when deleted statements like tt_record are surrounded on both + # sides by empty lines; the second empty line will be deleted. + delete_empty_line = False + + line_num = 0 + + f = open(file) + for line in f: + line_num += 1 + + # pline is used for parsing; it is modified to remove + # uninteresting information such as comments and whitespace. + pline = line.rstrip() + + # Handle comments. 
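+        # Hypothetical example of what this pass does: for a source
+        # line such as "x = 1; /* set x */", pline is reduced to
+        # "x = 1;" so that the parsing checks below (such as whether
+        # the line ends in ';') see only real code, never comment
+        # text; the unmodified line is still what gets copied to the
+        # output if it survives the other filters.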
+ if in_comment: + index = pline.find('*/') + if index < 0: + slines.append(line) + continue + pline = pline[index+2:] + in_comment = False + index = pline.find('/*') + if index >= 0: + index2 = pline.find('*/', index+2) + if index2 >= 0: + pline = pline[0:index] + pline[index2+2:] + else: + pline = pline[0:index] + in_comment = True + index = pline.find('//') + if index >= 0: + pline = pline[0:index] + + pline = pline.strip() + + # Strip groups of lines labeled with special '#if' + if in_labeled_skip != None: + if line.startswith('#endif /* See strip.py */'): + in_labeled_skip = None + check_braces = False + continue + elif line.startswith('#else /* See strip.py */'): + in_labeled_skip = 0 + continue + if in_labeled_skip == 1: + continue + if line.startswith('#if 1 /* See strip.py */'): + if slines[-1].strip() == '': + slines.pop() + in_labeled_skip = 1 + check_braces = False + continue + if line.startswith('#if 0 /* See strip.py */'): + if slines[-1].strip() == '': + slines.pop() + in_labeled_skip = 0 + check_braces = False + continue + + # Strip tt_freeze() statements. + if pline == 'tt_freeze();': + check_braces = True + if slines[-1].strip() == '': + delete_empty_line = True + continue + + # Strip tt_record statements. + if in_statement: + if pline[-1] == ';': + in_statement = False + check_braces = True + continue + if re.match('tt_record[1-4]?[(]', pline): + # If this is the only statement in its block, delete the + # outer block statement (if, while, etc.). + indent = leading_space(line) + for i in range(len(slines)-1, -1, -1): + prev = slines[i] + prev_indent = leading_space(prev) + if last_non_blank(prev) == '{': + break + if prev_indent == 0: + # Label or method start; no need to continue further + break + if leading_space(prev) < indent: + if prev.lstrip().startswith('case'): + print('%s:%d: \'case\' before tt_record; don\'t know how to handle' + % (file, i), file=sys.stderr) + exit_code = 1 + break + slines = slines[:i] + break + + if pline[-1] != ';': + in_statement = True + if slines[-1].strip() == '': + delete_empty_line = True + check_braces = True + continue + + if not pline: + if not line.isspace() or not delete_empty_line: + slines.append(line) + delete_empty_line = False + continue + delete_empty_line = False + + # Remove braces for blocks that now have only a single statement + if pline[0] == '}': + if check_braces: + check_braces = False; + if open_index != None: + if statements_in_block == 0: + print('%s:%d: stripping creates empty block' % + (file, line_num), file=sys.stderr) + exit_code = 1 + if statements_in_block == 1: + slines[open_index] = remove_open(slines[open_index]) + line = remove_close(line) + if not line.strip(): + open_index = None + continue + open_index = None + if pline[-1] == '{' and line[0] != '{': + statements_in_block = 0 + open_index = len(slines) + + # Count statements + if pline[-1] == ';': + statements_in_block += 1 + + # The current line needs to be retained in the output. + slines.append(line) + f.close() + return slines + +if __name__ == '__main__': + f = sys.stdin + if len(sys.argv) < 2: + print('Usage: strip.py file [file ... 
destdir]', file=sys.stderr) + exit(1) + if len(sys.argv) == 2: + for line in scan(sys.argv[1]): + print(line, end='') + else: + for file in sys.argv[1:-1]: + dst_file = '%s/%s' % (sys.argv[-1], file) + print('Stripping %s into %s' % (file, dst_file)) + slines = scan(file) + dst = open(dst_file, 'w') + for line in slines: + print(line, end='', file=dst) + dst.close() + sys.exit(exit_code) \ No newline at end of file From 537a92412f2810f03ee6d287761b81ac7c0a2ea1 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 28 Oct 2024 14:38:04 -0700 Subject: [PATCH 049/625] Update notes.txt --- notes.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/notes.txt b/notes.txt index 4bcdc4a2..68152c32 100755 --- a/notes.txt +++ b/notes.txt @@ -30,6 +30,10 @@ Notes for Homa implementation in Linux: * Notes on Linux qdiscs: +* Need to rework timeout handling to keep separate "silent_ticks" values: + * One to use for retransmissions (consider only progress in getting data) + * One to use for timeouts (consider all packets received from host) + * Remedies to consider for the performance problems at 100 Gbps, where one tx channel gets very backed up: * Implement zero-copy on output in order to reduce memory bandwidth @@ -403,6 +407,9 @@ Notes for Homa implementation in Linux: ip_input.c: ip_rcv_finish ip_input.c: dst_input homa_plumbing.c: homa_softirq +<<<<<<< Updated upstream gcc -g -Wp,-MMD,/users/ouster/homaModule/.homa_offload.o.d -nostdinc -I./arch/x86/include -I./arch/x86/include/generated -I./include -I./arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I./include/uapi -I./include/generated/uapi -include ./include/linux/compiler-version.h -include ./include/linux/kconfig.h -include ./include/linux/compiler_types.h -D__KERNEL__ -fmacro-prefix-map=./= -std=gnu11 -fshort-wchar -funsigned-char -fno-common -fno-PIE -fno-strict-aliasing -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -fcf-protection=none -m64 -falign-jumps=1 -falign-loops=1 -mno-80387 -mno-fp-ret-in-387 -mpreferred-stack-boundary=3 -mskip-rax-setup -mtune=generic -mno-red-zone -mcmodel=kernel -Wno-sign-compare -fno-asynchronous-unwind-tables -mindirect-branch=thunk-extern -mindirect-branch-register -mindirect-branch-cs-prefix -mfunction-return=thunk-extern -fno-jump-tables -fpatchable-function-entry=16,16 -fno-delete-null-pointer-checks -O2 -fno-allow-store-data-races -fstack-protector-strong -fno-omit-frame-pointer -fno-optimize-sibling-calls -fno-stack-clash-protection -pg -mrecord-mcount -mfentry -DCC_USING_FENTRY -falign-functions=16 -fno-strict-overflow -fno-stack-check -fconserve-stack -Wall -Wundef -Werror=implicit-function-declaration -Werror=implicit-int -Werror=return-type -Werror=strict-prototypes -Wno-format-security -Wno-trigraphs -Wno-frame-address -Wno-address-of-packed-member -Wmissing-declarations -Wmissing-prototypes -Wframe-larger-than=2048 -Wno-main -Wvla -Wno-pointer-sign -Wcast-function-type -Wno-stringop-overflow -Wno-array-bounds -Wno-alloc-size-larger-than -Wimplicit-fallthrough=5 -Werror=date-time -Werror=incompatible-pointer-types -Werror=designated-init -Wenum-conversion -Wextra -Wunused -Wno-unused-but-set-variable -Wno-unused-const-variable -Wno-packed-not-aligned -Wno-format-overflow -Wno-format-truncation -Wno-stringop-truncation -Wno-override-init -Wno-missing-field-initializers -Wno-type-limits -Wno-shift-negative-value -Wno-maybe-uninitialized -Wno-sign-compare -Wno-unused-parameter -g -gdwarf-4 -g -DMODULE -DKBUILD_BASENAME='"homa_offload"' -DKBUILD_MODNAME='"homa"' 
-D__KBUILD_MODNAME=kmod_homa -E -o /users/ouster/homaModule/homa_offload.e /users/ouster/homaModule/homa_offload.c ; ./tools/objtool/objtool --hacks=jump_label --hacks=noinstr --hacks=skylake --retpoline --rethunk --stackval --static-call --uaccess --prefix=16 --module /users/ouster/homaModule/homa_offload.o +======= +>>>>>>> Stashed changes From 883aabead486ffa150e47f0770ff4d2cec5005d5 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 24 Oct 2024 10:23:36 -0700 Subject: [PATCH 050/625] Update internal modify dates for man pages --- man/homa.7 | 2 +- man/homa_abort.3 | 2 +- man/recvmsg.2 | 2 +- man/sendmsg.2 | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/man/homa.7 b/man/homa.7 index dd171de1..76adfe2a 100644 --- a/man/homa.7 +++ b/man/homa.7 @@ -1,4 +1,4 @@ -.TH HOMA 7 2022-12-13 "Homa" "Linux Programmer's Manual" +.TH HOMA 7 2024-7-26 "Homa" "Linux Programmer's Manual" .SH NAME homa \- Homa transport protocol .SH SYNOPSIS diff --git a/man/homa_abort.3 b/man/homa_abort.3 index edecbafc..04972654 100644 --- a/man/homa_abort.3 +++ b/man/homa_abort.3 @@ -1,4 +1,4 @@ -.TH HOMA_ABORT 3 2021-08-24 "Homa" "Linux Programmer's Manual" +.TH HOMA_ABORT 3 2022-9-15 "Homa" "Linux Programmer's Manual" .SH NAME homa_abort \- terminate an outgoing RPC .SH SYNOPSIS diff --git a/man/recvmsg.2 b/man/recvmsg.2 index 1db77b27..57162de7 100644 --- a/man/recvmsg.2 +++ b/man/recvmsg.2 @@ -1,4 +1,4 @@ -.TH RECVMSG 2 2022-12-13 "Homa" "Linux Programmer's Manual" +.TH RECVMSG 2 2024-7-16 "Homa" "Linux Programmer's Manual" .SH NAME recvmsg \- receive a Homa message .SH SYNOPSIS diff --git a/man/sendmsg.2 b/man/sendmsg.2 index 1efbc585..f5864a06 100644 --- a/man/sendmsg.2 +++ b/man/sendmsg.2 @@ -1,4 +1,4 @@ -.TH SENDMSG 2 2022-12-14 "Homa" "Linux Programmer's Manual" +.TH SENDMSG 2 2023-11-2 "Homa" "Linux Programmer's Manual" .SH NAME sendmsg \- send a Homa request or response message .SH SYNOPSIS From 966958b2d1c576ccaf030c635cf6aa38f13b1ad8 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 24 Oct 2024 11:42:35 -0700 Subject: [PATCH 051/625] Add --alt option to util/strip.py --- homa_impl.h | 2 +- test/Makefile | 4 ++-- util/strip.py | 35 +++++++++++++++++++++++++++-------- 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index b8f0118b..ddfbb56f 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -45,7 +45,7 @@ #include #include -#if 1 /* See strip.py */ +#if 1 /* See strip.py --alt */ #include "homa.h" #else /* See strip.py */ #include diff --git a/test/Makefile b/test/Makefile index fb2faa5c..ed0e9802 100644 --- a/test/Makefile +++ b/test/Makefile @@ -114,9 +114,9 @@ S_HOMA_HDRS := stripped/homa.h \ stripped/homa_stub.h \ stripped/homa_wire.h stripped/%.c: ../%.c - ../util/strip.py $< > $@ + ../util/strip.py --alt $< > $@ stripped/%.h: ../%.h - ../util/strip.py $< > $@ + ../util/strip.py --alt $< > $@ S_TEST_OBJS := $(patsubst %,stripped/%,$(filter-out unit_timetrace.o, $(TEST_OBJS))) S_OBJS := $(S_HOMA_OBJS) $(S_TEST_OBJS) $(patsubst %,stripped/%,$(OTHER_OBJS)) diff --git a/util/strip.py b/util/strip.py index 7493495b..387ec27d 100755 --- a/util/strip.py +++ b/util/strip.py @@ -7,7 +7,7 @@ a Linux kernel repo, removing information that doesn't belong in the official kernel version (primarily calls to tt_record). -Usage: strip.py file file file ... destdir +Usage: strip.py [--alt] file file file ... destdir Each of the files will be read, stripped as appropriate, and copied to a file by the same name in destdir. 
If there is only a single file and no
@@ -34,6 +34,16 @@
   #else /* See strip.py */
   ...
   #endif /* See strip.py */
+
+* It is also possible to strip using "alt" mode, with lines like this:
+  #if 1 /* See strip.py --alt */
+  #if 0 /* See strip.py --alt */
+  If the --alt option was not specified then these lines are handled as
+  if "--alt" wasn't present in the comments. However, if the --alt option
+  was specified then these lines are ignored.
+
+If the --alt option is specified, it means the output is intended for
+testing outside the Linux kernel. In this case, the lines marked with
+'--alt' are ignored and left in place.
 """

 from collections import defaultdict
 from glob import glob
@@ -101,11 +111,12 @@ def last_non_blank(s):
         return s2[-1]
     return None

-def scan(file):
+def scan(file, alt_mode):
     """
     Read a file, remove information that shouldn't appear in the Linux kernel
     version, and return an array of lines representing the stripped file.
-    file:      Pathname of file to read
+    file:      Pathname of file to read
+    alt_mode:  True means the --alt option was specified
     """

     global exit_code
@@ -188,13 +199,17 @@ def scan(file):
             continue
         if in_labeled_skip == 1:
             continue
-        if line.startswith('#if 1 /* See strip.py */'):
+        if line.startswith('#if 1 /* See strip.py */') or (
+                line.startswith('#if 1 /* See strip.py --alt */')
+                and not alt_mode):
             if slines[-1].strip() == '':
                 slines.pop()
             in_labeled_skip = 1
             check_braces = False
             continue
-        if line.startswith('#if 0 /* See strip.py */'):
+        if line.startswith('#if 0 /* See strip.py */') or (
+                line.startswith('#if 0 /* See strip.py --alt */')
+                and not alt_mode):
             if slines[-1].strip() == '':
                 slines.pop()
             in_labeled_skip = 0
@@ -280,17 +295,21 @@ def scan(file):

 if __name__ == '__main__':
     f = sys.stdin
+    alt_mode = False
+    if (len(sys.argv) >= 2) and (sys.argv[1] == '--alt'):
+        alt_mode = True
+        del sys.argv[1]
     if len(sys.argv) < 2:
-        print('Usage: strip.py file [file ... destdir]', file=sys.stderr)
+        print('Usage: strip.py [--alt] file [file ... destdir]', file=sys.stderr)
         exit(1)
     if len(sys.argv) == 2:
-        for line in scan(sys.argv[1]):
+        for line in scan(sys.argv[1], alt_mode):
             print(line, end='')
     else:
         for file in sys.argv[1:-1]:
             dst_file = '%s/%s' % (sys.argv[-1], file)
             print('Stripping %s into %s' % (file, dst_file))
-            slines = scan(file)
+            slines = scan(file, alt_mode)
             dst = open(dst_file, 'w')

From f7e47175085b48ae731118db51dc688a12ebbc6a Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 25 Oct 2024 13:50:29 -0700
Subject: [PATCH 052/625] Switch to IANA-assigned value for IPPROTO_HOMA

---
 homa.h                   | 6 ++----
 test/unit_homa_offload.c | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/homa.h b/homa.h
index f2920b99..382f9e5b 100644
--- a/homa.h
+++ b/homa.h
@@ -18,10 +18,8 @@ extern "C" {
 #endif

-/* Homa's protocol number within the IP protocol space (this is not an
- * officially allocated slot).
- */
-#define IPPROTO_HOMA 0xFD
+/* IANA-assigned Internet Protocol number for Homa.
*/ +#define IPPROTO_HOMA 146 /** * define HOMA_MAX_MESSAGE_LENGTH - Maximum bytes of payload in a Homa diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index 4cf374ee..52179d1d 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -195,7 +195,7 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_homa_ipv4) EXPECT_EQ(skb, cur_offload_core->held_skb); EXPECT_STREQ("", unit_log_get()); EXPECT_EQ(IPPROTO_HOMA, ip_hdr(skb)->protocol); - EXPECT_EQ(2303, ip_hdr(skb)->check); + EXPECT_EQ(29695, ip_hdr(skb)->check); kfree_skb(skb); homa_gro_unhook_tcp(); } From f1e10fdb964643b56add59bdc6c7ada3ab06332e Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 28 Oct 2024 15:46:19 -0700 Subject: [PATCH 053/625] Add conditional code for Linux 6.12 --- homa_impl.h | 10 ++++++++++ homa_plumbing.c | 12 ++++++++++++ test/mock.c | 7 ++++++- 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/homa_impl.h b/homa_impl.h index ddfbb56f..68a871ae 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -1105,8 +1105,13 @@ extern void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc); extern void homa_destroy(struct homa *homa); extern int homa_disconnect(struct sock *sk, int flags); extern void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa); +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) +extern int homa_dointvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +#else extern int homa_dointvec(const struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#endif extern int homa_err_handler_v4(struct sk_buff *skb, u32 info); extern int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, @@ -1176,8 +1181,13 @@ extern int homa_snprintf(char *buffer, int size, int used, extern int homa_softirq(struct sk_buff *skb); extern void homa_spin(int ns); extern char *homa_symbol_for_type(uint8_t type); +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) +extern int homa_sysctl_softirq_cores(struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#else extern int homa_sysctl_softirq_cores(const struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#endif extern void homa_timer(struct homa *homa); extern int homa_timer_main(void *transportInfo); extern void homa_unhash(struct sock *sk); diff --git a/homa_plumbing.c b/homa_plumbing.c index 769d88cc..417048ac 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -490,7 +490,9 @@ static struct ctl_table homa_ctl_table[] = { .mode = 0644, .proc_handler = homa_dointvec }, +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) {} +#endif }; /* Sizes of the headers for each Homa packet type, in bytes. */ @@ -1491,8 +1493,13 @@ __poll_t homa_poll(struct file *file, struct socket *sock, * * Return: 0 for success, nonzero for error. */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) +int homa_dointvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +#else int homa_dointvec(const struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) +#endif { int result; @@ -1567,8 +1574,13 @@ int homa_dointvec(const struct ctl_table *table, int write, * * Return: 0 for success, nonzero for error. 
*/ +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) +int homa_sysctl_softirq_cores(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +#else int homa_sysctl_softirq_cores(const struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) +#endif { struct homa_offload_core *offload_core; struct ctl_table table_copy; diff --git a/test/mock.c b/test/mock.c index ff0b369a..e3c3e344 100644 --- a/test/mock.c +++ b/test/mock.c @@ -849,8 +849,13 @@ struct proc_dir_entry *proc_create(const char *name, umode_t mode, return entry; } +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) +int proc_dointvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +#else int proc_dointvec(const struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) +#endif { return 0; } @@ -955,7 +960,7 @@ int sk_set_peek_off(struct sock *sk, int val) void sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { - __kfree_skb(skb); + kfree_skb(skb); } int skb_copy_datagram_iter(const struct sk_buff *from, int offset, From d3a8ee74fd6d472648c61091913fd0f154bd30cd Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 30 Oct 2024 09:14:08 -0700 Subject: [PATCH 054/625] Update notes.txt --- notes.txt | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/notes.txt b/notes.txt index 68152c32..b29de62d 100755 --- a/notes.txt +++ b/notes.txt @@ -29,10 +29,13 @@ Notes for Homa implementation in Linux: * Also consider the amount of data that is "stuck" in the NIC? * Notes on Linux qdiscs: - -* Need to rework timeout handling to keep separate "silent_ticks" values: - * One to use for retransmissions (consider only progress in getting data) - * One to use for timeouts (consider all packets received from host) + * qdisc_create() is in sch_api.c + * Packet transmission "starts" in __dev_xmit_skb in dev.c. + * sch_direct_xmit is called once it's time to actually transmit a + packet (or list of packets). However, the device driver can return + NETDEV_TX_BUSY, in which case the packet will be (re)queued in the qdisc. + * TCQ_F_NOLOCK seems to apply to the qdisc root lock: individual qdiscs + still get locked. 
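+ * Rough sketch of the requeue behavior described above (paraphrased
+   from net/sched/sch_generic.c, not exact kernel source):
+       status = netdev_start_xmit(skb, dev, txq, more);
+       if (status == NETDEV_TX_BUSY)
+               dev_requeue_skb(skb, q); /* retried from the qdisc later */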
* Remedies to consider for the performance problems at 100 Gbps, where one tx channel gets very backed up: @@ -407,9 +410,3 @@ Notes for Homa implementation in Linux: ip_input.c: ip_rcv_finish ip_input.c: dst_input homa_plumbing.c: homa_softirq -<<<<<<< Updated upstream - - - gcc -g -Wp,-MMD,/users/ouster/homaModule/.homa_offload.o.d -nostdinc -I./arch/x86/include -I./arch/x86/include/generated -I./include -I./arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I./include/uapi -I./include/generated/uapi -include ./include/linux/compiler-version.h -include ./include/linux/kconfig.h -include ./include/linux/compiler_types.h -D__KERNEL__ -fmacro-prefix-map=./= -std=gnu11 -fshort-wchar -funsigned-char -fno-common -fno-PIE -fno-strict-aliasing -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -fcf-protection=none -m64 -falign-jumps=1 -falign-loops=1 -mno-80387 -mno-fp-ret-in-387 -mpreferred-stack-boundary=3 -mskip-rax-setup -mtune=generic -mno-red-zone -mcmodel=kernel -Wno-sign-compare -fno-asynchronous-unwind-tables -mindirect-branch=thunk-extern -mindirect-branch-register -mindirect-branch-cs-prefix -mfunction-return=thunk-extern -fno-jump-tables -fpatchable-function-entry=16,16 -fno-delete-null-pointer-checks -O2 -fno-allow-store-data-races -fstack-protector-strong -fno-omit-frame-pointer -fno-optimize-sibling-calls -fno-stack-clash-protection -pg -mrecord-mcount -mfentry -DCC_USING_FENTRY -falign-functions=16 -fno-strict-overflow -fno-stack-check -fconserve-stack -Wall -Wundef -Werror=implicit-function-declaration -Werror=implicit-int -Werror=return-type -Werror=strict-prototypes -Wno-format-security -Wno-trigraphs -Wno-frame-address -Wno-address-of-packed-member -Wmissing-declarations -Wmissing-prototypes -Wframe-larger-than=2048 -Wno-main -Wvla -Wno-pointer-sign -Wcast-function-type -Wno-stringop-overflow -Wno-array-bounds -Wno-alloc-size-larger-than -Wimplicit-fallthrough=5 -Werror=date-time -Werror=incompatible-pointer-types -Werror=designated-init -Wenum-conversion -Wextra -Wunused -Wno-unused-but-set-variable -Wno-unused-const-variable -Wno-packed-not-aligned -Wno-format-overflow -Wno-format-truncation -Wno-stringop-truncation -Wno-override-init -Wno-missing-field-initializers -Wno-type-limits -Wno-shift-negative-value -Wno-maybe-uninitialized -Wno-sign-compare -Wno-unused-parameter -g -gdwarf-4 -g -DMODULE -DKBUILD_BASENAME='"homa_offload"' -DKBUILD_MODNAME='"homa"' -D__KBUILD_MODNAME=kmod_homa -E -o /users/ouster/homaModule/homa_offload.e /users/ouster/homaModule/homa_offload.c ; ./tools/objtool/objtool --hacks=jump_label --hacks=noinstr --hacks=skylake --retpoline --rethunk --stackval --static-call --uaccess --prefix=16 --module /users/ouster/homaModule/homa_offload.o -======= ->>>>>>> Stashed changes From e6f27c1ce12d93bcd29f5c00ee31af9d15d6f544 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 13 Nov 2024 13:20:44 -0800 Subject: [PATCH 055/625] Strip _Static_assert statement (Causes problems on machines with different word sizes) --- homa_pool.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/homa_pool.h b/homa_pool.h index bce2f40e..0c0f2bd2 100644 --- a/homa_pool.h +++ b/homa_pool.h @@ -86,8 +86,10 @@ struct homa_pool_core { }; }; +#if 1 /* See strip.py */ _Static_assert(sizeof(struct homa_pool_core) == L1_CACHE_BYTES, "homa_pool_core overflowed a cache line"); +#endif /* See strip.py */ /** * struct homa_pool - Describes a pool of buffer space for incoming From bf6d5dfaf12edff0e0a748c5a6e3c63dd471d539 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 
13 Nov 2024 14:09:09 -0800 Subject: [PATCH 056/625] Improve strip.py * Strip code related to unit tests * Strip tt_records even if in comments * Also, various changes to code files to make this work, plus a few other cleanups --- homa_impl.h | 12 ++--- homa_incoming.c | 8 +-- homa_plumbing.c | 26 ++++++---- homa_pool.c | 6 +-- homa_rpc.c | 4 +- homa_sock.c | 2 +- homa_timer.c | 4 -- timetrace.c | 6 +-- util/strip.py | 135 +++++++++++++++++++++++++++++++----------------- 9 files changed, 121 insertions(+), 82 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 68a871ae..f2561b58 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -8,7 +8,6 @@ #define _HOMA_IMPL_H #include -#if 1 /* See strip.py --alt */ #ifdef __UNIT_TEST__ #undef WARN #define WARN(...) @@ -21,8 +20,7 @@ #undef WARN_ON_ONCE #define WARN_ON_ONCE(condition) WARN_ON(condition) -#endif -#endif /* See strip.py */ +#endif /* __UNIT_TEST__ */ #include #include @@ -52,7 +50,6 @@ #endif /* See strip.py */ #include "homa_wire.h" -#if 1 /* See strip.py --alt */ #ifdef __UNIT_TEST__ #undef alloc_pages #define alloc_pages mock_alloc_pages @@ -130,6 +127,7 @@ void *mock_vmalloc(size_t size); #define per_cpu(name, core) (name[core]) #endif /* __UNIT_TEST__ */ +#if 1 /* See strip.py */ /* Null out things that confuse VSCode Intellisense */ #ifdef __VSCODE__ #define raw_smp_processor_id() 1 @@ -1070,16 +1068,18 @@ static inline __be32 tt_addr(const struct in6_addr x) : ntohl(x.in6_u.u6_addr32[1])); } +#if 1 /* See strip.py --alt */ #ifdef __UNIT_TEST__ void unit_log_printf(const char *separator, const char *format, ...) __printf(2, 3); #define UNIT_LOG unit_log_printf void unit_hook(char *id); #define UNIT_HOOK(msg) unit_hook(msg) -#else +#else /* __UNIT_TEST__ */ #define UNIT_LOG(...) #define UNIT_HOOK(...) 
-#endif +#endif /* __UNIT_TEST__ */ +#endif /* See strip.py */ extern void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, int port, int error); diff --git a/homa_incoming.c b/homa_incoming.c index 62b3e316..a412bb96 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -206,9 +206,9 @@ int homa_copy_to_user(struct homa_rpc *rpc) { #ifdef __UNIT_TEST__ #define MAX_SKBS 3 -#else +#else /* __UNIT_TEST__ */ #define MAX_SKBS 20 -#endif +#endif /* __UNIT_TEST__ */ struct sk_buff *skbs[MAX_SKBS]; #if 1 /* See strip.py */ int start_offset = 0; @@ -343,9 +343,9 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) { #ifdef __UNIT_TEST__ #define MAX_ACKS 2 -#else +#else /* __UNIT_TEST__ */ #define MAX_ACKS 10 -#endif +#endif /* __UNIT_TEST__ */ const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); struct data_header *h = (struct data_header *)skb->data; __u64 id = homa_local_id(h->common.sender_id); diff --git a/homa_plumbing.c b/homa_plumbing.c index 417048ac..ba3b22ff 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -11,7 +11,7 @@ #ifndef __UNIT_TEST__ MODULE_LICENSE("Dual MIT/GPL"); -#endif +#endif /* __UNIT_TEST__ */ MODULE_AUTHOR("John Ousterhout"); MODULE_DESCRIPTION("Homa transport protocol"); MODULE_VERSION("0.01"); @@ -1189,28 +1189,32 @@ int homa_softirq(struct sk_buff *skb) struct sk_buff **prev_link, **other_link; struct common_header *h; int first_packet = 1; - static __u64 last; int header_offset; +#if 1 /* See strip.py */ + static __u64 last; int pull_length; __u64 start; start = get_cycles(); INC_METRIC(softirq_calls, 1); - per_cpu(homa_offload_core, raw_smp_processor_id()).last_active = start; + per_cpu(homa_offload_core, raw_smp_processor_id()).last_active + = start; if ((start - last) > 1000000) { int scaled_ms = (int)(10 * (start - last) / cpu_khz); if (scaled_ms >= 50 && scaled_ms < 10000) { -// tt_record3("Gap in incoming packets: %d cycles " -// "(%d.%1d ms)", -// (int) (start - last), scaled_ms/10, -// scaled_ms%10); -// pr_notice("Gap in incoming packets: %llu " -// "cycles, (%d.%1d ms)", (start - last), -// scaled_ms/10, scaled_ms%10); +// tt_record3("Gap in incoming packets: %d cycles (%d.%1d ms)", +// (int) (start - last), scaled_ms/10, +// scaled_ms%10); +// pr_notice("Gap in incoming packets: %llu cycles, (%d.%1d ms)", +// (start - last), +// scaled_ms/10, scaled_ms%10); } } last = start; +#else /* See strip.py */ + int pull_length; +#endif /* See strip.py */ /* skb may actually contain many distinct packets, linked through * skb_shinfo(skb)->frag_list by the Homa GRO mechanism. Make a @@ -1341,7 +1345,7 @@ int homa_softirq(struct sk_buff *skb) skb2->data; UNIT_LOG("", " %d", ntohl(h3->seg.offset)); } -#endif +#endif /* __UNIT_TEST__ */ homa_dispatch_pkts(packets, homa); packets = other_pkts; } diff --git a/homa_pool.c b/homa_pool.c index e845466e..d4472a7b 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -14,16 +14,16 @@ /* Used when determining how many bpages to consider for allocation. */ #define MIN_EXTRA 4 +#ifdef __UNIT_TEST__ /* When running unit tests, allow HOMA_BPAGE_SIZE and HOMA_BPAGE_SHIFT * to be overridden. 
*/ -#ifdef __UNIT_TEST__ #include "mock.h" #undef HOMA_BPAGE_SIZE #define HOMA_BPAGE_SIZE mock_bpage_size #undef HOMA_BPAGE_SHIFT #define HOMA_BPAGE_SHIFT mock_bpage_shift -#endif +#endif /* __UNIT_TEST__ */ /** * set_bpages_needed() - Set the bpages_needed field of @pool based @@ -412,7 +412,7 @@ void homa_pool_check_waiting(struct homa_pool *pool) { #ifdef __UNIT_TEST__ pool->check_waiting_invoked += 1; -#endif +#endif /* __UNIT_TEST__ */ while (atomic_read(&pool->free_bpages) >= pool->bpages_needed) { struct homa_rpc *rpc; diff --git a/homa_rpc.c b/homa_rpc.c index e67b3fec..6a3c6ff6 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -331,9 +331,9 @@ int homa_rpc_reap(struct homa_sock *hsk, int count) { #ifdef __UNIT_TEST__ #define BATCH_MAX 3 -#else +#else /* __UNIT_TEST__ */ #define BATCH_MAX 20 -#endif +#endif /* __UNIT_TEST__ */ struct homa_rpc *rpcs[BATCH_MAX]; struct sk_buff *skbs[BATCH_MAX]; int num_skbs, num_rpcs; diff --git a/homa_sock.c b/homa_sock.c index 144a33ca..66f1ea8e 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -84,7 +84,7 @@ struct homa_sock *homa_socktab_next(struct homa_socktab_scan *scan) return NULL; scan->next = (struct homa_socktab_links *) hlist_first_rcu(&scan->socktab->buckets - [scan->current_bucket]); + [scan->current_bucket]); } links = scan->next; hsk = links->sock; diff --git a/homa_timer.c b/homa_timer.c index f7ec816a..52f12541 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -257,10 +257,6 @@ void homa_timer(struct homa *homa) tt_record4("homa_timer found %d incoming RPCs, incoming sum %d, rec_sum %d, homa->total_incoming %d", total_incoming_rpcs, sum_incoming, sum_incoming_rec, atomic_read(&homa->total_incoming)); - -// if (total_rpcs > 0) -// tt_record1("homa_timer finished scanning %d RPCs", total_rpcs); - homa_skb_release_pages(homa); end = get_cycles(); INC_METRIC(timer_cycles, end-start); diff --git a/timetrace.c b/timetrace.c index abcd1fc9..6213cdf3 100644 --- a/timetrace.c +++ b/timetrace.c @@ -8,7 +8,7 @@ * those stubs to allow the rest of the kernel to log in our buffers. */ //#define TT_KERNEL 1 -#endif +#endif /* __UNIT_TEST__ */ #ifdef TT_KERNEL struct tt_buffer *tt_linux_buffers[]; void (*tt_linux_freeze)(void); @@ -255,9 +255,9 @@ void tt_record_buf(struct tt_buffer *buffer, __u64 timestamp, buffer->next_index = (buffer->next_index + 1) #ifdef __UNIT_TEST__ & (tt_buffer_size - 1); -#else +#else /* __UNIT_TEST__ */ & (TT_BUF_SIZE - 1); -#endif +#endif /* __UNIT_TEST__ */ event->timestamp = timestamp; event->format = format; diff --git a/util/strip.py b/util/strip.py index 387ec27d..f3cb2534 100755 --- a/util/strip.py +++ b/util/strip.py @@ -5,7 +5,7 @@ """ This script is used to copy information from the Homa GitHub repo to a Linux kernel repo, removing information that doesn't belong in the -official kernel version (primarily calls to tt_record). +official kernel version (such as calls to tt_record). Usage: strip.py [--alt] file file file ... destdir @@ -13,9 +13,9 @@ file by the same name in destdir. If there is only a single file and no destdir, then the stripped file is printed on standard output. -In some cases, such as calls to tt_record*, information is removed -automatically. In other cases, it is controlled with #if statments in -the following ways: +In some cases, such as calls to tt_record* and code related to unit tests, +information is removed automatically. 
In other cases, it is controlled with
+#if statements in the following ways:
@@ -121,19 +121,28 @@ def scan(file, alt_mode):

     global exit_code

-    # True means we're in the middle of a '/* ... */' comment
+    # True means the current line is in the middle of a /* ... */ comment
     in_comment = False

-    # True means we're in the middle of a statement that should be skipped.
-    in_statement = False
+    # True means the current line is at least partially a comment line.
+    current_has_comment = False
+
+    # True means we're in the middle of a multi-line statement that
+    # should be skipped (drop until a semicolon is seen).
+    skip_statement = False

     # Values of 0 or 1 mean we're in the middle of a group of lines labeled
     # with '#if /* GitHubOnly */'. 0 means we're including lines, 1 means
     # we're stripping them. None means we're not in such a group.
     in_labeled_skip = None

-    # True means we're in the middle of an '#ifdef __UNIT_TEST__'
-    in_unit = False
+    # Used to strip out unit testing code. Value is one of:
+    # None:   We're not in the middle of an '#ifdef __UNIT_TEST__'
+    # 'if':   An '#ifdef __UNIT_TEST__' has been seen, but the
+    #         corresponding #else or #endif has not been seen yet
+    # 'else': We are in the middle of an '#else' clause for an
+    #         '#ifdef __UNIT_TEST__'
+    in_unit = None

     # Array of lines containing the stripped version of the file
     slines = []
@@ -166,25 +175,22 @@ def scan(file, alt_mode):
         # uninteresting information such as comments and whitespace.
         pline = line.rstrip()

-        # Handle comments.
+        # See if (part of) this line is a comment.
+        current_has_comment = in_comment
         if in_comment:
             index = pline.find('*/')
-            if index < 0:
-                slines.append(line)
-                continue
-            pline = pline[index+2:]
-            in_comment = False
-        index = pline.find('/*')
-        if index >= 0:
-            index2 = pline.find('*/', index+2)
-            if index2 >= 0:
-                pline = pline[0:index] + pline[index2+2:]
-            else:
-                pline = pline[0:index]
-                in_comment = True
-        index = pline.find('//')
-        if index >= 0:
-            pline = pline[0:index]
+            if index >= 0:
+                in_comment = False
+        else:
+            index = pline.find('/*')
+            if index >= 0:
+                current_has_comment = True
+                index2 = pline.find('*/', index+2)
+                if index2 < 0:
+                    in_comment = True
+        index = pline.find('//')
+        if index >= 0:
+            current_has_comment = True

         pline = pline.strip()
@@ -203,7 +209,7 @@ def scan(file, alt_mode):
                 line.startswith('#if 1 /* See strip.py --alt */')
                 and not alt_mode):
             if slines[-1].strip() == '':
-                slines.pop()
+                delete_empty_line = True
             in_labeled_skip = 1
             check_braces = False
             continue
@@ -224,39 +230,72 @@ def scan(file, alt_mode):
             continue

         # Strip tt_record statements.
-        if in_statement:
+        if skip_statement:
             if pline[-1] == ';':
-                in_statement = False
+                skip_statement = False
                 check_braces = True
             continue
-        if re.match('tt_record[1-4]?[(]', pline):
+        match = re.match('(//[ \t]*)?tt_record[1-4]?[(]', pline)
+        if match:
             # If this is the only statement in its block, delete the
             # outer block statement (if, while, etc.).
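+            # Hypothetical example of the effect: given
+            #     if (error)
+            #             tt_record1("error %d", error);
+            # both lines are removed, since stripping only the call
+            # would leave 'if (error)' controlling an empty statement.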
-            indent = leading_space(line)
-            for i in range(len(slines)-1, -1, -1):
-                prev = slines[i]
-                prev_indent = leading_space(prev)
-                if last_non_blank(prev) == '{':
-                    break
-                if prev_indent == 0:
-                    # Label or method start; no need to continue further
-                    break
-                if leading_space(prev) < indent:
-                    if prev.lstrip().startswith('case'):
-                        print('%s:%d: \'case\' before tt_record; don\'t know how to handle'
-                              % (file, i), file=sys.stderr)
-                        exit_code = 1
+            if not match.group(1):
+                indent = leading_space(line)
+                for i in range(len(slines)-1, -1, -1):
+                    prev = slines[i]
+                    prev_indent = leading_space(prev)
+                    if last_non_blank(prev) == '{':
+                        break
+                    if prev_indent == 0:
+                        # Label or method start; no need to continue further
+                        break
+                    if leading_space(prev) < indent:
+                        if prev.lstrip().startswith('case'):
+                            print('%s:%d: \'case\' before tt_record; don\'t know how to handle'
+                                  % (file, i), file=sys.stderr)
+                            exit_code = 1
+                            break
+                        slines = slines[:i]
                         break
-                    slines = slines[:i]
-                    break

         if pline[-1] != ';':
-            in_statement = True
+            skip_statement = True
             if slines[-1].strip() == '':
                 delete_empty_line = True
             check_braces = True
             continue

+        # Strip UNIT_LOG and UNIT_HOOK statements.
+        if not alt_mode and (pline.startswith('UNIT_LOG(') or
+                pline.startswith('UNIT_HOOK(')):
+            if pline[-1] != ';':
+                skip_statement = True
+            if slines[-1].strip() == '':
+                delete_empty_line = True
+            check_braces = True
+            continue
+
+        # Strip '#ifdef __UNIT_TEST__' blocks (keep #else clauses)
+        if in_unit:
+            if line.startswith('#endif /* __UNIT_TEST__ */'):
+                in_unit = None
+                continue
+            if line.startswith('#else /* __UNIT_TEST__ */'):
+                in_unit = 'else'
+                continue
+            if in_unit == 'if':
+                continue
+        elif line.startswith('#ifdef __UNIT_TEST__') and not alt_mode:
+            in_unit = 'if'
+            if slines[-1].strip() == '':
+                delete_empty_line = True
+            continue
+        elif line.startswith('#ifndef __UNIT_TEST__') and not alt_mode:
+            in_unit = 'else'
+            if slines[-1].strip() == '':
+                delete_empty_line = True
+            continue
+
         if not pline:
             if not line.isspace() or not delete_empty_line:
                 slines.append(line)
@@ -285,7 +324,7 @@ def scan(file, alt_mode):
             open_index = len(slines)

         # Count statements
-        if pline[-1] == ';':
+        if pline[-1] == ';' and not current_has_comment:
             statements_in_block += 1

         # The current line needs to be retained in the output.

From 921ebb2c045cdedc30e61defccb22d1aa0230a86 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Wed, 13 Nov 2024 14:26:55 -0800
Subject: [PATCH 057/625] Fix bug in Linux version portability

---
 test/mock.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/mock.c b/test/mock.c
index e3c3e344..95593d9c 100644
--- a/test/mock.c
+++ b/test/mock.c
@@ -960,7 +960,11 @@ int sk_set_peek_off(struct sock *sk, int val)
 void sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb,
 			enum skb_drop_reason reason)
 {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0)
 	kfree_skb(skb);
+#else
+	__kfree_skb(skb);
+#endif
 }

From d7a44824fb4f7955eb0f538c4c93bf5634bac1a7 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Thu, 31 Oct 2024 21:29:48 -0700
Subject: [PATCH 058/625] Make it safe to delete sockets during homa_socktab
 scans

* Keep track of active scans for each homa_socktab.
* Adjust scans when sockets are deleted.
* Add homa_socktab_end_scan to clean up state after scan.
* Add unit_homa_destroy to detect unclosed scans during unit tests.
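A minimal sketch of the resulting usage pattern (the caller code here is
hypothetical; the function names match the diffs below). Every scan must
now be closed with homa_socktab_end_scan, and the RCU read lock must be
held throughout:

    struct homa_socktab_scan scan;
    struct homa_sock *hsk;

    rcu_read_lock();
    for (hsk = homa_socktab_start_scan(homa->port_map, &scan);
            hsk != NULL; hsk = homa_socktab_next(&scan)) {
        /* Use hsk; if its socket is deleted concurrently,
         * homa_sock_unlink advances this scan past it safely.
         */
    }
    homa_socktab_end_scan(&scan);
    rcu_read_unlock();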
--- homa_incoming.c | 1 + homa_rpc.c | 3 + homa_sock.c | 56 ++++++++++++++++-- homa_sock.h | 43 ++++++++------ homa_timer.c | 1 + homa_utils.c | 5 ++ test/unit_homa_outgoing.c | 5 +- test/unit_homa_plumbing.c | 5 +- test/unit_homa_sock.c | 116 ++++++++++++++++++++++++++++++-------- test/utils.c | 11 ++++ test/utils.h | 1 + 11 files changed, 200 insertions(+), 47 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index a412bb96..6f8ab1aa 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -1013,6 +1013,7 @@ void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, } homa_unprotect_rpcs(hsk); } + homa_socktab_end_scan(&scan); rcu_read_unlock(); } diff --git a/homa_rpc.c b/homa_rpc.c index 6a3c6ff6..d6c6949d 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -585,6 +585,7 @@ void homa_rpc_log_active(struct homa *homa, uint64_t id) } homa_unprotect_rpcs(hsk); } + homa_socktab_end_scan(&scan); rcu_read_unlock(); pr_notice("Finished logging active Homa RPCs: %d active RPCs\n", count); } @@ -672,6 +673,7 @@ void homa_rpc_log_active_tt(struct homa *homa, int freeze_count) } homa_unprotect_rpcs(hsk); } + homa_socktab_end_scan(&scan); rcu_read_unlock(); tt_record1("Finished logging (%d active Homa RPCs)", count); } @@ -738,6 +740,7 @@ int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors) } homa_unprotect_rpcs(hsk); } + homa_socktab_end_scan(&scan); rcu_read_unlock(); actual = atomic_read(&homa->total_incoming); tt_record3("homa_validate_incoming diff %d (expected %d, got %d)", diff --git a/homa_sock.c b/homa_sock.c index 66f1ea8e..b3f274f2 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -17,6 +17,7 @@ void homa_socktab_init(struct homa_socktab *socktab) spin_lock_init(&socktab->write_lock); for (i = 0; i < HOMA_SOCKTAB_BUCKETS; i++) INIT_HLIST_HEAD(&socktab->buckets[i]); + INIT_LIST_HEAD(&socktab->active_scans); } /** @@ -32,6 +33,7 @@ void homa_socktab_destroy(struct homa_socktab *socktab) hsk = homa_socktab_next(&scan)) { homa_sock_destroy(hsk); } + homa_socktab_end_scan(&scan); } /** @@ -51,8 +53,12 @@ void homa_socktab_destroy(struct homa_socktab *socktab) * delete them while the scan is in progress. If a socket is removed from * the table during the scan, it may or may not be returned by * homa_socktab_next. New entries added during the scan may or may not be - * returned. The caller should use RCU to prevent socket storage from - * being reclaimed during the scan. + * returned. The caller must hold an RCU read lock when invoking the + * scan-related methods here, as well as when manipulating sockets returned + * during the scan. It is safe to release and reacquire the RCU read lock + * during a scan, as long as no socket is held when the read lock is + * released and homa_socktab_next isn't invoked until the RCU read lock + * is reacquired. */ struct homa_sock *homa_socktab_start_scan(struct homa_socktab *socktab, struct homa_socktab_scan *scan) @@ -60,6 +66,11 @@ struct homa_sock *homa_socktab_start_scan(struct homa_socktab *socktab, scan->socktab = socktab; scan->current_bucket = -1; scan->next = NULL; + + spin_lock_bh(&socktab->write_lock); + list_add_tail_rcu(&scan->scan_links, &socktab->active_scans); + spin_unlock_bh(&socktab->write_lock); + return homa_socktab_next(scan); } @@ -94,6 +105,17 @@ struct homa_sock *homa_socktab_next(struct homa_socktab_scan *scan) } } +/** + * homa_socktab_end_scan() - Must be invoked on completion of each scan + * to clean up state associated with the scan. + * @scan: State of the scan. 
+ */ +void homa_socktab_end_scan(struct homa_socktab_scan *scan) +{ + spin_lock_bh(&scan->socktab->write_lock); + list_del(&scan->scan_links); + spin_unlock_bh(&scan->socktab->write_lock); +} /** * homa_sock_init() - Constructor for homa_sock objects. This function * initializes only the parts of the socket that are owned by Homa. @@ -158,6 +180,30 @@ void homa_sock_init(struct homa_sock *hsk, struct homa *homa) spin_unlock_bh(&socktab->write_lock); } +/* + * homa_sock_unlink() - Unlinks a socket from its socktab and does + * related cleanups. Once this method returns, the socket will not be + * discoverable through the socktab. + */ +void homa_sock_unlink(struct homa_sock *hsk) +{ + struct homa_socktab *socktab = hsk->homa->port_map; + struct homa_socktab_scan *scan; + + /* If any scans refer to this socket, advance them to refer to + * the next socket instead. + */ + spin_lock_bh(&socktab->write_lock); + list_for_each_entry(scan, &socktab->active_scans, scan_links) { + if (!scan->next || (scan->next->sock != hsk)) + continue; + scan->next = (struct homa_socktab_links *)hlist_next_rcu( + &scan->next->hash_links); + } + hlist_del_rcu(&hsk->socktab_links.hash_links); + spin_unlock_bh(&socktab->write_lock); +} + /** * homa_sock_shutdown() - Disable a socket so that it can no longer * be used for either sending or receiving messages. Any system calls @@ -180,7 +226,7 @@ void homa_sock_shutdown(struct homa_sock *hsk) * active operations that hold RPC locks but not the socket lock. * 1. Set @shutdown; this ensures that no new RPCs will be created for * this socket (though some creations might already be in progress). - * 2. Remove the socket from the port map: this ensures that + * 2. Remove the socket from its socktab: this ensures that * incoming packets for the socket will be dropped. * 3. Go through all of the RPCs and delete them; this will * synchronize with any operations in progress. @@ -191,9 +237,7 @@ void homa_sock_shutdown(struct homa_sock *hsk) * See sync.txt for additional information about locking. */ hsk->shutdown = true; - spin_lock_bh(&hsk->homa->port_map->write_lock); - hlist_del_rcu(&hsk->socktab_links.hash_links); - spin_unlock_bh(&hsk->homa->port_map->write_lock); + homa_sock_unlink(hsk); homa_sock_unlock(hsk); list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { diff --git a/homa_sock.h b/homa_sock.h index bd56c734..aaccb41f 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -37,6 +37,12 @@ struct homa_socktab { * consist of homa_socktab_link objects. */ struct hlist_head buckets[HOMA_SOCKTAB_BUCKETS]; + + /** + * @active_scans: List of homa_socktab_scan structs for all scans + * currently underway on this homa_socktab. + */ + struct list_head active_scans; }; /** @@ -71,6 +77,11 @@ struct homa_socktab_scan { * more sockets in the current bucket. */ struct homa_socktab_links *next; + + /** + * @scan_links: Used to link this scan into @socktab->scans. 
+ */ + struct list_head scan_links; }; /** @@ -252,22 +263,22 @@ struct homa_sock { struct homa_pool *buffer_pool; }; -void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, __u64 id); -int homa_sock_bind(struct homa_socktab *socktab, - struct homa_sock *hsk, __u16 port); -void homa_sock_destroy(struct homa_sock *hsk); -struct homa_sock * - homa_sock_find(struct homa_socktab *socktab, __u16 port); -void homa_sock_init(struct homa_sock *hsk, struct homa *homa); -void homa_sock_shutdown(struct homa_sock *hsk); -int homa_socket(struct sock *sk); -void homa_socktab_destroy(struct homa_socktab *socktab); -void homa_socktab_init(struct homa_socktab *socktab); -struct homa_sock - *homa_socktab_next(struct homa_socktab_scan *scan); -struct homa_sock - *homa_socktab_start_scan(struct homa_socktab *socktab, - struct homa_socktab_scan *scan); +void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, + __u64 id); +int homa_sock_bind(struct homa_socktab *socktab, + struct homa_sock *hsk, __u16 port); +void homa_sock_destroy(struct homa_sock *hsk); +struct homa_sock *homa_sock_find(struct homa_socktab *socktab, __u16 port); +void homa_sock_init(struct homa_sock *hsk, struct homa *homa); +void homa_sock_shutdown(struct homa_sock *hsk); +void homa_sock_unlink(struct homa_sock *hsk); +int homa_socket(struct sock *sk); +void homa_socktab_destroy(struct homa_socktab *socktab); +void homa_socktab_end_scan(struct homa_socktab_scan *scan); +void homa_socktab_init(struct homa_socktab *socktab); +struct homa_sock *homa_socktab_next(struct homa_socktab_scan *scan); +struct homa_sock *homa_socktab_start_scan(struct homa_socktab *socktab, + struct homa_socktab_scan *scan); /** * homa_sock_lock() - Acquire the lock for a socket. If the socket diff --git a/homa_timer.c b/homa_timer.c index 52f12541..6b131a99 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -253,6 +253,7 @@ void homa_timer(struct homa *homa) } homa_unprotect_rpcs(hsk); } + homa_socktab_end_scan(&scan); rcu_read_unlock(); tt_record4("homa_timer found %d incoming RPCs, incoming sum %d, rec_sum %d, homa->total_incoming %d", total_incoming_rpcs, sum_incoming, sum_incoming_rec, diff --git a/homa_utils.c b/homa_utils.c index a92d28e7..6e9fb6f7 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -140,6 +140,10 @@ int homa_init(struct homa *homa) */ void homa_destroy(struct homa *homa) { +#ifdef __UNIT_TEST__ +#include "utils.h" + unit_homa_destroy(homa); +#endif /* __UNIT_TEST__ */ if (homa->pacer_kthread) { homa_pacer_stop(homa); wait_for_completion(&homa_pacer_kthread_done); @@ -500,6 +504,7 @@ void homa_freeze_peers(struct homa *homa) /* Find a socket to use (any will do). */ hsk = homa_socktab_start_scan(homa->port_map, &scan); + homa_socktab_end_scan(&scan); if (hsk == NULL) { tt_record("homa_freeze_peers couldn't find a socket"); return; diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 4abc8403..612139ab 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -16,7 +16,10 @@ static void unlock_hook(char *id) { if (strcmp(id, "unlock") != 0) return; - homa_rpc_free(hook_rpc); + if (hook_rpc) { + homa_rpc_free(hook_rpc); + hook_rpc = NULL; + } } /* The following hook function frees an RPC when it is locked. 
*/ diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 9f152b22..6893149a 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -17,7 +17,10 @@ static void unlock_hook(char *id) { if (strcmp(id, "unlock") != 0) return; - homa_rpc_free(hook_rpc); + if (hook_rpc) { + homa_rpc_free(hook_rpc); + hook_rpc = 0; + } } FIXTURE(homa_plumbing) { diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c index e2b7caec..5f60f73b 100644 --- a/test/unit_homa_sock.c +++ b/test/unit_homa_sock.c @@ -11,6 +11,16 @@ #define n(x) htons(x) #define N(x) htonl(x) +int num_active_scans(struct homa_socktab *socktab) +{ + struct homa_socktab_scan *scan; + int count = 0; + + list_for_each_entry(scan, &socktab->active_scans, scan_links) + count++; + return count; +} + FIXTURE(homa_sock) { struct homa homa; struct homa_sock hsk; @@ -53,6 +63,8 @@ TEST_F(homa_sock, homa_socktab_start_scan) EXPECT_EQ(&self->hsk, homa_socktab_start_scan(self->homa.port_map, &scan)); EXPECT_EQ(100, scan.current_bucket); + EXPECT_EQ(1, num_active_scans(self->homa.port_map)); + homa_socktab_end_scan(&scan); } TEST_F(homa_sock, homa_socktab_next__basics) @@ -81,30 +93,26 @@ TEST_F(homa_sock, homa_socktab_next__basics) homa_sock_destroy(&hsk2); homa_sock_destroy(&hsk3); homa_sock_destroy(&hsk4); + homa_socktab_end_scan(&scan); } -TEST_F(homa_sock, homa_socktab_next__deleted_socket) + +TEST_F(homa_sock, homa_socktab_end_scan) { - struct homa_sock hsk1, hsk2, hsk3, *hsk; - struct homa_socktab_scan scan; - int first_port = 34000; + struct homa_socktab_scan scan1, scan2, scan3; homa_destroy(&self->homa); homa_init(&self->homa); - mock_sock_init(&hsk1, &self->homa, first_port); - mock_sock_init(&hsk2, &self->homa, first_port+HOMA_SOCKTAB_BUCKETS); - mock_sock_init(&hsk3, &self->homa, first_port+2*HOMA_SOCKTAB_BUCKETS); - hsk = homa_socktab_start_scan(self->homa.port_map, &scan); - EXPECT_EQ(first_port+2*HOMA_SOCKTAB_BUCKETS, hsk->port); - homa_sock_destroy(&hsk2); - hsk = homa_socktab_next(&scan); - EXPECT_EQ(first_port+HOMA_SOCKTAB_BUCKETS, hsk->port); - EXPECT_EQ(1, hsk->shutdown); - hsk = homa_socktab_next(&scan); - EXPECT_EQ(first_port, hsk->port); - hsk = homa_socktab_next(&scan); - EXPECT_EQ(NULL, hsk); - homa_sock_destroy(&hsk1); - homa_sock_destroy(&hsk3); + mock_sock_init(&self->hsk, &self->homa, HOMA_MIN_DEFAULT_PORT+100); + homa_socktab_start_scan(self->homa.port_map, &scan1); + homa_socktab_start_scan(self->homa.port_map, &scan2); + homa_socktab_start_scan(self->homa.port_map, &scan3); + EXPECT_EQ(3, num_active_scans(self->homa.port_map)); + homa_socktab_end_scan(&scan2); + EXPECT_EQ(2, num_active_scans(self->homa.port_map)); + homa_socktab_end_scan(&scan1); + EXPECT_EQ(1, num_active_scans(self->homa.port_map)); + homa_socktab_end_scan(&scan3); + EXPECT_EQ(0, num_active_scans(self->homa.port_map)); } TEST_F(homa_sock, homa_sock_init__skip_port_in_use) @@ -146,7 +154,58 @@ TEST_F(homa_sock, homa_sock_init__hijack_tcp) homa_sock_destroy(&no_hijack); } -TEST_F(homa_sock, homa_sock_shutdown__basics) +TEST_F(homa_sock, homa_sock_unlink__update_scans) +{ + struct homa_sock hsk1, hsk2, hsk3, hsk4, *hska, *hskb; + struct homa_socktab_scan scana, scanb; + int first_port = 34000; + + homa_destroy(&self->homa); + homa_init(&self->homa); + mock_sock_init(&hsk1, &self->homa, first_port); + mock_sock_init(&hsk2, &self->homa, first_port+HOMA_SOCKTAB_BUCKETS); + mock_sock_init(&hsk3, &self->homa, first_port+2*HOMA_SOCKTAB_BUCKETS); + mock_sock_init(&hsk4, &self->homa, first_port+3*HOMA_SOCKTAB_BUCKETS); + + /* 
Set scana to first socket in hash list. */ + hska = homa_socktab_start_scan(self->homa.port_map, &scana); + EXPECT_NE(NULL, hska); + EXPECT_EQ(first_port + 3*HOMA_SOCKTAB_BUCKETS, hska->port); + + /* Set scanb to second socket in hash list. */ + homa_socktab_start_scan(self->homa.port_map, &scanb); + hskb = homa_socktab_next(&scanb); + EXPECT_NE(NULL, hskb); + EXPECT_EQ(first_port + 2*HOMA_SOCKTAB_BUCKETS, hskb->port); + + /* Delete third socket. */ + homa_sock_destroy(&hsk2); + EXPECT_NE(NULL, scana.next); + EXPECT_EQ(first_port + 2*HOMA_SOCKTAB_BUCKETS, scana.next->sock->port); + EXPECT_NE(NULL, scanb.next); + EXPECT_EQ(first_port, scanb.next->sock->port); + + /* Delete second socket. */ + homa_sock_destroy(&hsk3); + EXPECT_NE(NULL, scana.next); + EXPECT_EQ(first_port, scana.next->sock->port); + EXPECT_NE(NULL, scanb.next); + EXPECT_EQ(first_port, scanb.next->sock->port); + + /* Delete last socket. */ + homa_sock_destroy(&hsk1); + EXPECT_EQ(NULL, scana.next); + EXPECT_EQ(NULL, scanb.next); + + /* Delete first socket. */ + homa_sock_destroy(&hsk4); + EXPECT_EQ(NULL, scana.next); + EXPECT_EQ(NULL, scanb.next); + + homa_socktab_end_scan(&scana); + homa_socktab_end_scan(&scanb); +} +TEST_F(homa_sock, homa_sock_unlink__remove_from_map) { struct homa_sock hsk2, hsk3; int client2, client3; @@ -158,21 +217,32 @@ TEST_F(homa_sock, homa_sock_shutdown__basics) client3 = hsk3.port; EXPECT_EQ(&hsk2, homa_sock_find(self->homa.port_map, client2)); - EXPECT_EQ(&hsk2, homa_sock_find(self->homa.port_map, 100)); EXPECT_EQ(&hsk3, homa_sock_find(self->homa.port_map, client3)); homa_sock_shutdown(&hsk2); EXPECT_EQ(NULL, homa_sock_find(self->homa.port_map, client2)); - EXPECT_EQ(NULL, homa_sock_find(self->homa.port_map, 100)); EXPECT_EQ(&hsk3, homa_sock_find(self->homa.port_map, client3)); homa_sock_shutdown(&hsk3); EXPECT_EQ(NULL, homa_sock_find(self->homa.port_map, client2)); - EXPECT_EQ(NULL, homa_sock_find(self->homa.port_map, 100)); EXPECT_EQ(NULL, homa_sock_find(self->homa.port_map, client3)); } + +TEST_F(homa_sock, homa_sock_shutdown__unlink_socket) +{ + struct homa_sock hsk; + int client; + + mock_sock_init(&hsk, &self->homa, 0); + EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk, 100)); + client = hsk.port; + EXPECT_EQ(&hsk, homa_sock_find(self->homa.port_map, client)); + + homa_sock_shutdown(&hsk); + EXPECT_EQ(NULL, homa_sock_find(self->homa.port_map, client)); +} TEST_F(homa_sock, homa_sock_shutdown__already_shutdown) { unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, diff --git a/test/utils.c b/test/utils.c index b10457a6..5c85cf90 100644 --- a/test/utils.c +++ b/test/utils.c @@ -451,3 +451,14 @@ char *unit_ack_string(struct homa_ack *ack) be64_to_cpu(ack->client_id)); return buffer; } + +/** + * unit_homa_destroy() - When unit tests are run, this function is invoked + * by homa_destroy. It checks for various errors and reports them. + * @homa: Homa shared data that is about to be deleted. 
+ */ +void unit_homa_destroy(struct homa *homa) +{ + if (!list_empty(&homa->port_map->active_scans)) + FAIL("struct homa deleted with active socktab scans"); +} \ No newline at end of file diff --git a/test/utils.h b/test/utils.h index 203b8acc..9f876e5e 100644 --- a/test/utils.h +++ b/test/utils.h @@ -38,6 +38,7 @@ extern struct homa_rpc int req_length, int resp_length); extern struct in6_addr unit_get_in_addr(char *s); +extern void unit_homa_destroy(struct homa *homa); extern struct iov_iter *unit_iov_iter(void *buffer, size_t length); extern int unit_list_length(struct list_head *head); From 88619202d5d06db07127fcca04f3f5ab8927efe0 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 1 Nov 2024 10:53:16 -0700 Subject: [PATCH 059/625] Use uintptr_t for portability to 32-bit platforms --- homa_pool.c | 2 +- timetrace.h | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/homa_pool.c b/homa_pool.c index d4472a7b..1e82e8c9 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -53,7 +53,7 @@ int homa_pool_init(struct homa_sock *hsk, void *region, __u64 region_size) struct homa_pool *pool = hsk->buffer_pool; int i, result; - if (((__u64)region) & ~PAGE_MASK) + if (((uintptr_t)region) & ~PAGE_MASK) return -EINVAL; pool->hsk = hsk; pool->region = (char *)region; diff --git a/timetrace.h b/timetrace.h index db8aa43d..ba7a5b72 100644 --- a/timetrace.h +++ b/timetrace.h @@ -200,12 +200,15 @@ static inline void tt_record(const char *format) static inline __u32 tt_hi(void *p) { - return ((__u64)p) >> 32; +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshift-count-overflow" + return ((uintptr_t)p) >> 32; +#pragma GCC diagnostic pop } static inline __u32 tt_lo(void *p) { - return ((__u64)p) & 0xffffffff; + return ((uintptr_t)p) & 0xffffffff; } #endif // HOMA_TIMETRACE_H From f92b3846762c0114926b240afc003c03e90a4c42 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 1 Nov 2024 10:59:31 -0700 Subject: [PATCH 060/625] Use do_div instead of "/" for portability --- homa_impl.h | 4 ++-- homa_incoming.c | 12 ++++++------ homa_outgoing.c | 24 +++++++++++++++--------- homa_plumbing.c | 21 +-------------------- homa_utils.c | 4 +++- 5 files changed, 27 insertions(+), 38 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index f2561b58..83b0c3df 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -1223,11 +1223,11 @@ static inline void homa_check_pacer(struct homa *homa, int softirq) if (list_empty(&homa->throttled_rpcs)) return; - /* The "/2" in the line below gives homa_pacer_main the first chance + /* The ">> 1" in the line below gives homa_pacer_main the first chance * to queue new packets; if the NIC queue becomes more than half * empty, then we will help out here. 
*/ - if ((get_cycles() + homa->max_nic_queue_cycles / 2) < + if ((get_cycles() + (homa->max_nic_queue_cycles >> 1)) < atomic64_read(&homa->link_idle_time)) return; tt_record("homa_check_pacer calling homa_pacer_xmit"); diff --git a/homa_incoming.c b/homa_incoming.c index 6f8ab1aa..7c9ec6de 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -1515,15 +1515,15 @@ void homa_incoming_sysctl_changed(struct homa *homa) tmp = (tmp*cpu_khz)/1000; homa->poll_cycles = tmp; - tmp = homa->busy_usecs; - tmp = (tmp * cpu_khz) / 1000; + tmp = homa->busy_usecs * cpu_khz; + do_div(tmp, 1000); homa->busy_cycles = tmp; - tmp = homa->gro_busy_usecs; - tmp = (tmp * cpu_khz) / 1000; + tmp = homa->gro_busy_usecs * cpu_khz; + do_div(tmp, 1000); homa->gro_busy_cycles = tmp; - tmp = homa->bpage_lease_usecs; - tmp = (tmp * cpu_khz) / 1000; + tmp = homa->bpage_lease_usecs * cpu_khz; + do_div(tmp, 1000); homa->bpage_lease_cycles = tmp; } diff --git a/homa_outgoing.c b/homa_outgoing.c index 31f7adcb..24673b4d 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -123,9 +123,10 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, int length, int max_seg_data) { struct homa_skb_info *homa_info; - int segs, err, gso_size; struct data_header *h; struct sk_buff *skb; + int err, gso_size; + uint64_t segs; /* Initialize the overall skb. */ skb = homa_skb_new_tx(sizeof32(struct data_header)); @@ -153,7 +154,8 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, int length, h->retransmit = 0; h->seg.offset = -1; - segs = (length + max_seg_data - 1) / max_seg_data; + segs = length + max_seg_data - 1; + do_div(segs, max_seg_data); homa_info = homa_get_skb_info(skb); homa_info->next_skb = NULL; homa_info->wire_bytes = length + segs * (sizeof(struct data_header) @@ -254,8 +256,9 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) gso_size = rpc->hsk->homa->max_gso_size; /* Round gso_size down to an even # of mtus. */ - segs_per_gso = (gso_size - rpc->hsk->ip_header_length - - sizeof(struct data_header)) / max_seg_data; + segs_per_gso = gso_size - rpc->hsk->ip_header_length + - sizeof(struct data_header); + do_div(segs_per_gso, max_seg_data); if (segs_per_gso == 0) segs_per_gso = 1; max_gso_data = segs_per_gso * max_seg_data; @@ -734,10 +737,12 @@ void homa_outgoing_sysctl_changed(struct homa *homa) /* Code below is written carefully to avoid integer underflow or * overflow under expected usage patterns. Be careful when changing!
*/ - homa->cycles_per_kbyte = (8 * (__u64)cpu_khz) / homa->link_mbps; - homa->cycles_per_kbyte = (101 * homa->cycles_per_kbyte) / 100; - tmp = homa->max_nic_queue_ns; - tmp = (tmp * cpu_khz) / 1000000; + homa->cycles_per_kbyte = 8 * (__u64)cpu_khz; + do_div(homa->cycles_per_kbyte, homa->link_mbps); + homa->cycles_per_kbyte = 101 * homa->cycles_per_kbyte; + do_div(homa->cycles_per_kbyte, 100); + tmp = homa->max_nic_queue_ns * cpu_khz; + do_div(tmp, 1000000); homa->max_nic_queue_cycles = tmp; } @@ -763,7 +768,8 @@ int homa_check_nic_queue(struct homa *homa, struct sk_buff *skb, bool force) __u64 idle, new_idle, clock; bytes = homa_get_skb_info(skb)->wire_bytes; - cycles_for_packet = (bytes * homa->cycles_per_kbyte) / 1000; + cycles_for_packet = bytes * homa->cycles_per_kbyte; + do_div(cycles_for_packet, 1000); while (1) { clock = get_cycles(); idle = atomic64_read(&homa->link_idle_time); diff --git a/homa_plumbing.c b/homa_plumbing.c index ba3b22ff..cde33667 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1190,31 +1190,12 @@ int homa_softirq(struct sk_buff *skb) struct common_header *h; int first_packet = 1; int header_offset; -#if 1 /* See strip.py */ - static __u64 last; int pull_length; __u64 start; start = get_cycles(); INC_METRIC(softirq_calls, 1); - per_cpu(homa_offload_core, raw_smp_processor_id()).last_active - = start; - if ((start - last) > 1000000) { - int scaled_ms = (int)(10 * (start - last) / cpu_khz); - - if (scaled_ms >= 50 && scaled_ms < 10000) { -// tt_record3("Gap in incoming packets: %d cycles (%d.%1d ms)", -// (int) (start - last), scaled_ms/10, -// scaled_ms%10); -// pr_notice("Gap in incoming packets: %llu cycles, (%d.%1d ms)", -// (start - last), -// scaled_ms/10, scaled_ms%10); - } - } - last = start; -#else /* See strip.py */ - int pull_length; -#endif /* See strip.py */ + per_cpu(homa_offload_core, raw_smp_processor_id()).last_active = start; /* skb may actually contain many distinct packets, linked through * skb_shinfo(skb)->frag_list by the Homa GRO mechanism. Make a diff --git a/homa_utils.c b/homa_utils.c index 6e9fb6f7..3eb53a54 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -655,7 +655,9 @@ void homa_spin(int ns) { __u64 end; - end = get_cycles() + (ns * cpu_khz) / 1000000; + end = ns * cpu_khz; + do_div(end, 1000000); + end += get_cycles(); while (get_cycles() < end) /* Empty loop body.*/ ; From 63051ed9693cd037d8aa8aaf471a0f3764cc7797 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 1 Nov 2024 16:24:04 -0700 Subject: [PATCH 061/625] Use sched_clock() instead of get_cycles() Also, eliminate usage of cpu_khz. 
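The conversion is mechanical and follows one pattern throughout; the
sketch below is condensed from the hunks that follow (it is not a
literal hunk). Elapsed-time measurements switch from cycle counts to
nanoseconds, and the cpu_khz unit conversions disappear:

    /* Before: cycle-based timing; usec settings scaled via cpu_khz. */
    __u64 start = get_cycles();
    /* ... code being timed ... */
    INC_METRIC(poll_cycles, get_cycles() - start);
    homa->busy_cycles = (homa->busy_usecs * cpu_khz) / 1000;

    /* After: sched_clock() returns nanoseconds directly. */
    __u64 start = sched_clock();
    /* ... code being timed ... */
    INC_METRIC(poll_ns, sched_clock() - start);
    homa->busy_ns = homa->busy_usecs * 1000;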
--- homa_grant.c | 12 +-- homa_grant.h | 6 +- homa_impl.h | 61 +++++-------- homa_incoming.c | 75 +++++++--------- homa_metrics.c | 132 ++++++++++++++-------------- homa_metrics.h | 175 +++++++++++++++++--------------------- homa_offload.c | 30 +++---- homa_offload.h | 6 +- homa_outgoing.c | 65 +++++++------- homa_peer.c | 12 +-- homa_peer.h | 2 +- homa_plumbing.c | 32 +++---- homa_pool.c | 10 +-- homa_pool.h | 2 +- homa_rpc.c | 4 +- homa_rpc.h | 12 +-- homa_skb.c | 20 ++--- homa_sock.c | 10 +-- homa_timer.c | 10 +-- homa_utils.c | 16 ++-- test/mock.c | 17 +++- test/mock.h | 2 + test/unit_homa_grant.c | 24 +++--- test/unit_homa_incoming.c | 54 +++++++----- test/unit_homa_offload.c | 20 ++--- test/unit_homa_outgoing.c | 67 +++++++-------- test/unit_homa_peer.c | 18 ++-- test/unit_homa_pool.c | 24 +++--- test/unit_homa_rpc.c | 12 +-- test/unit_homa_skb.c | 18 ++-- test/unit_homa_sock.c | 6 +- timetrace.c | 6 +- timetrace.h | 6 ++ util/metrics.py | 102 +++++++++++----------- 34 files changed, 516 insertions(+), 552 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index de92507b..d540b162 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -80,7 +80,7 @@ void homa_grant_add_rpc(struct homa_rpc *rpc) /* Message not yet tracked; add it in priority order to * the peer's list. */ - __u64 time = get_cycles(); + __u64 time = sched_clock(); INC_METRIC(grantable_rpcs_integral, homa->num_grantable_rpcs * (time - homa->last_grantable_change)); @@ -161,7 +161,7 @@ void homa_grant_remove_rpc(struct homa_rpc *rpc) struct homa *homa = rpc->hsk->homa; struct homa_peer *peer = rpc->peer; struct homa_rpc *candidate; - __u64 time = get_cycles(); + __u64 time = sched_clock(); struct homa_rpc *head; if (list_empty(&rpc->grantable_links)) @@ -401,7 +401,7 @@ void homa_grant_recalc(struct homa *homa, int locked) return; } } - start = get_cycles(); + start = sched_clock(); /* We may have to recalculate multiple times if grants sent in one * round cause messages to be completely granted, opening up @@ -487,7 +487,7 @@ void homa_grant_recalc(struct homa *homa, int locked) break; } } - INC_METRIC(grant_recalc_cycles, get_cycles() - start); + INC_METRIC(grant_recalc_ns, sched_clock() - start); } /** @@ -647,7 +647,7 @@ void homa_grant_free_rpc(struct homa_rpc *rpc) int homa_grantable_lock_slow(struct homa *homa, int recalc) { int starting_count = atomic_read(&homa->grant_recalc_count); - __u64 start = get_cycles(); + __u64 start = sched_clock(); int result = 0; tt_record("beginning wait for grantable lock"); @@ -664,7 +664,7 @@ int homa_grantable_lock_slow(struct homa *homa, int recalc) } } INC_METRIC(grantable_lock_misses, 1); - INC_METRIC(grantable_lock_miss_cycles, get_cycles() - start); + INC_METRIC(grantable_lock_miss_ns, sched_clock() - start); return result; } diff --git a/homa_grant.h b/homa_grant.h index 411f40c6..68feca11 100644 --- a/homa_grant.h +++ b/homa_grant.h @@ -41,7 +41,7 @@ static inline int homa_grantable_lock(struct homa *homa, int recalc) result = 1; else result = homa_grantable_lock_slow(homa, recalc); - homa->grantable_lock_time = get_cycles(); + homa->grantable_lock_time = sched_clock(); return result; } @@ -51,8 +51,8 @@ static inline int homa_grantable_lock(struct homa *homa, int recalc) */ static inline void homa_grantable_unlock(struct homa *homa) { - INC_METRIC(grantable_lock_cycles, get_cycles() - - homa->grantable_lock_time); + INC_METRIC(grantable_lock_ns, sched_clock() - + homa->grantable_lock_time); spin_unlock_bh(&homa->grantable_lock); } diff --git a/homa_impl.h b/homa_impl.h index 
83b0c3df..5009dcd1 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -65,10 +66,6 @@ int mock_cpu_to_node(int cpu); #define current current_task extern struct task_struct *current_task; -#undef get_cycles -#define get_cycles mock_get_cycles -cycles_t mock_get_cycles(void); - #define get_page mock_get_page void mock_get_page(struct page *page); @@ -255,7 +252,7 @@ struct homa { atomic64_t next_outgoing_id; /** - * @link_idle_time: The time, measured by get_cycles() at which we + * @link_idle_time: The time, measured by sched_clock, at which we * estimate that all of the packets we have passed to Linux for * transmission will have been transmitted. May be in the past. * This estimate assumes that only Homa is transmitting data, so @@ -271,7 +268,7 @@ struct homa { spinlock_t grantable_lock __aligned(L1_CACHE_BYTES); /** - * @grantable_lock_time: get_cycles() time when grantable_lock + * @grantable_lock_time: sched_clock() time when grantable_lock * was last locked. */ __u64 grantable_lock_time; @@ -302,7 +299,7 @@ struct homa { /** @num_grantable_rpcs: The number of RPCs in grantable_rpcs. */ int num_grantable_rpcs; - /** @last_grantable_change: The get_cycles time of the most recent + /** @last_grantable_change: The sched_clock() time of the most recent * increment or decrement of num_grantable_rpcs; used for computing * statistics. */ @@ -316,7 +313,7 @@ struct homa { int max_grantable_rpcs; /** - * @oldest_rpc: The RPC with incoming data whose start_cycles is + * @oldest_rpc: The RPC with incoming data whose start_ns is * farthest in the past). NULL means either there are no incoming * RPCs or the oldest needs to be recomputed. Must hold grantable_lock * to update. @@ -383,8 +380,8 @@ struct homa { int pacer_fifo_count; /** - * @pacer_start: get_cycles() time when the pacer last woke up - * (if the pacer is running) or 0 if the pacer is sleeping. + * @pacer_wake_time: time (in sched_clock units) when the pacer last + * woke up (if the pacer is running) or 0 if the pacer is sleeping. */ __u64 pacer_wake_time; @@ -404,8 +401,8 @@ struct homa { struct list_head throttled_rpcs; /** - * @throttle_add: The get_cycles() time when the most recent RPC - * was added to @throttled_rpcs. + * @throttle_add: The time (in sched_clock() units) when the most + * recent RPC was added to @throttled_rpcs. */ __u64 throttle_add; @@ -486,7 +483,7 @@ struct homa { int pages_to_free_slots; /** - * @skb_page_free_time: Time (in get_cycles() units) when the + * @skb_page_free_time: Time (in sched_clock() units) when the * next sk_buff page should be freed. Could be in the past. */ __u64 skb_page_free_time; @@ -534,12 +531,6 @@ struct homa { */ int poll_usecs; - /** - * @poll_cycles: The value of @poll_usecs in the units returned - * by get_cycles(). - */ - int poll_cycles; - /** * @num_priorities: The total number of priority levels available for * Homa's use. Internally, Homa will use priorities from 0 to @@ -693,18 +684,12 @@ struct homa { int max_nic_queue_ns; /** - * @max_nic_queue_cycles: Same as max_nic_queue_ns, except in units - * of get_cycles(). - */ - int max_nic_queue_cycles; - - /** - * @cycles_per_kbyte: the number of cycles, as measured by get_cycles(), - * that it takes to transmit 1000 bytes on our uplink. This is actually - * a slight overestimate of the value, to ensure that we don't - * underestimate NIC queue length and queue too many packets. 
+ * @ns_per_mbyte: the number of ns that it takes to transmit + * 10**6 bytes on our uplink. This is actually a slight overestimate + * of the value, to ensure that we don't underestimate NIC queue + * length and queue too many packets. */ - __u32 cycles_per_kbyte; + __u32 ns_per_mbyte; /** * @verbose: Nonzero enables additional logging. Set externally via @@ -786,8 +771,8 @@ struct homa { */ int busy_usecs; - /** @busy_cycles: Same as busy_usecs except in get_cycles() units. */ - int busy_cycles; + /** @busy_ns: Same as busy_usecs except in sched_clock() units. */ + int busy_ns; /* * @gro_busy_usecs: if the gap between the completion of @@ -799,8 +784,8 @@ struct homa { */ int gro_busy_usecs; - /** @gro_busy_cycles: Same as busy_usecs except in get_cycles() units. */ - int gro_busy_cycles; + /** @gro_busy_ns: Same as gro_busy_usecs except in sched_clock() units. */ + int gro_busy_ns; /** * @timer_ticks: number of times that homa_timer has been invoked @@ -855,12 +840,6 @@ struct homa { */ int bpage_lease_usecs; - /** - * @bpage_lease_cycles: The value of @bpage_lease_usecs in get_cycles - * units. - */ - int bpage_lease_cycles; /** * @next_id: Set via sysctl; causes next_outgoing_id to be set to * this value; always reads as zero. Typically used while debugging to @@ -1227,7 +1206,7 @@ static inline void homa_check_pacer(struct homa *homa, int softirq) * to queue new packets; if the NIC queue becomes more than half * empty, then we will help out here. */ - if ((get_cycles() + (homa->max_nic_queue_cycles >> 1)) < + if ((sched_clock() + (homa->max_nic_queue_ns >> 1)) < atomic64_read(&homa->link_idle_time)) return; tt_record("homa_check_pacer calling homa_pacer_xmit"); diff --git a/homa_incoming.c b/homa_incoming.c index 7c9ec6de..a98bd80b 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -68,7 +68,7 @@ struct homa_gap *homa_gap_new(struct list_head *next, int start, int end) gap = kmalloc(sizeof(*gap), GFP_KERNEL); gap->start = start; gap->end = end; - gap->time = get_cycles(); + gap->time = sched_clock(); list_add_tail(&gap->links, next); return gap; } @@ -313,10 +313,10 @@ int homa_copy_to_user(struct homa_rpc *rpc) end_offset = 0; } #endif /* See strip.py */ - start = get_cycles(); + start = sched_clock(); for (i = 0; i < n; i++) kfree_skb(skbs[i]); - INC_METRIC(skb_free_cycles, get_cycles() - start); + INC_METRIC(skb_free_ns, sched_clock() - start); INC_METRIC(skb_frees, n); tt_record2("finished freeing %d skbs for id %d", n, rpc->id); @@ -524,12 +524,11 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) * nor homa_timer can keep up with reaping dead * RPCs. See reap.txt for details.
*/ - uint64_t start = get_cycles(); + uint64_t start = sched_clock(); tt_record("homa_data_pkt calling homa_rpc_reap"); homa_rpc_reap(hsk, hsk->homa->reap_limit); - INC_METRIC(data_pkt_reap_cycles, - get_cycles() - start); + INC_METRIC(data_pkt_reap_ns, sched_clock() - start); } } @@ -1186,11 +1185,11 @@ int homa_register_interests(struct homa_interest *interest, struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, __u64 id) { + uint64_t poll_start, poll_end, now; int error, blocked = 0, polled = 0; struct homa_rpc *result = NULL; struct homa_interest interest; struct homa_rpc *rpc = NULL; - uint64_t poll_start, now; /* Each iteration of this loop finds an RPC, but it might not be * in a state where we can return it (e.g., there might be packets @@ -1208,9 +1207,6 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, goto found_rpc; } -// tt_record3("Preparing to poll, socket %d, flags 0x%x, pid %d", -// hsk->client_port, flags, current->pid); - /* There is no ready RPC so far. Clean up dead RPCs before * going to sleep (or returning, if in nonblocking mode). */ @@ -1237,10 +1233,15 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, goto found_rpc; } + // tt_record4("Preparing to poll, socket %d, flags 0x%x, pid %d, poll_usecs %d", + // hsk->port, flags, current->pid, + // hsk->homa->poll_usecs); + /* Busy-wait for a while before going to sleep; this avoids * context-switching overhead to wake up. */ - poll_start = now = get_cycles(); + poll_start = now = sched_clock(); + poll_end = now + (1000 * hsk->homa->poll_usecs); while (1) { __u64 blocked; @@ -1251,26 +1252,23 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, rpc->id, hsk->port, current->pid); polled = 1; - INC_METRIC(poll_cycles, now - poll_start); + INC_METRIC(poll_ns, now - poll_start); goto found_rpc; } - if (now >= (poll_start + hsk->homa->poll_cycles)) + if (now >= poll_end) { + INC_METRIC(poll_ns, now - poll_start); break; - blocked = get_cycles(); + } + blocked = sched_clock(); schedule(); - now = get_cycles(); + now = sched_clock(); blocked = now - blocked; - if (blocked > 5000) { - /* Looks like another thread ran (or perhaps - * SoftIRQ). Count this time as blocked. - */ - INC_METRIC(blocked_cycles, blocked); - poll_start += blocked; - } + INC_METRIC(blocked_ns, blocked); + poll_start += blocked; } tt_record2("Poll ended unsuccessfully on socket %d, pid %d", hsk->port, current->pid); - INC_METRIC(poll_cycles, now - poll_start); + INC_METRIC(poll_ns, now - poll_start); /* Now it's time to sleep. 
*/ per_cpu(homa_offload_core, interest.core).last_app_active = now; @@ -1278,14 +1276,14 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, rpc = (struct homa_rpc *)atomic_long_read(&interest.ready_rpc); if (!rpc && !hsk->shutdown) { __u64 end; - __u64 start = get_cycles(); + __u64 start = sched_clock(); tt_record1("homa_wait_for_message sleeping, pid %d", current->pid); schedule(); - end = get_cycles(); + end = sched_clock(); blocked = 1; - INC_METRIC(blocked_cycles, end - start); + INC_METRIC(blocked_ns, end - start); } __set_current_state(TASK_RUNNING); @@ -1379,7 +1377,7 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, struct homa_interest *homa_choose_interest(struct homa *homa, struct list_head *head, int offset) { - __u64 busy_time = get_cycles() - homa->busy_cycles; + __u64 busy_time = sched_clock() - homa->busy_ns; struct homa_interest *backup = NULL; struct homa_interest *interest; struct list_head *pos; @@ -1470,7 +1468,8 @@ void homa_rpc_handoff(struct homa_rpc *rpc) /* Update the last_app_active time for the thread's core, so Homa * will try to avoid doing any work there. */ - per_cpu(homa_offload_core, interest->core).last_app_active = get_cycles(); + per_cpu(homa_offload_core, interest->core).last_app_active = + sched_clock(); /* Clear the interest. This serves two purposes. First, it saves * the waking thread from acquiring the socket lock again, which @@ -1508,22 +1507,6 @@ void homa_incoming_sysctl_changed(struct homa *homa) if (homa->max_overcommit > HOMA_MAX_GRANTS) homa->max_overcommit = HOMA_MAX_GRANTS; - /* Code below is written carefully to avoid integer underflow or - * overflow under expected usage patterns. Be careful when changing! - */ - tmp = homa->poll_usecs; - tmp = (tmp*cpu_khz)/1000; - homa->poll_cycles = tmp; - - tmp = homa->busy_usecs * cpu_khz; - do_div(tmp, 1000); - homa->busy_cycles = tmp; - - tmp = homa->gro_busy_usecs * cpu_khz; - do_div(tmp, 1000); - homa->gro_busy_cycles = tmp; - - tmp = homa->bpage_lease_usecs * cpu_khz; - do_div(tmp, 1000); - homa->bpage_lease_cycles = tmp; + homa->busy_ns = homa->busy_usecs * 1000; + homa->gro_busy_ns = homa->gro_busy_usecs * 1000; } diff --git a/homa_metrics.c b/homa_metrics.c index 879efe5d..7783c3b5 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -82,10 +82,8 @@ char *homa_metrics_print(struct homa *homa) homa->metrics_length = 0; #define M(...) 
homa_metric_append(homa, __VA_ARGS__) - M("rdtsc_cycles %20llu RDTSC cycle counter when metrics were gathered\n", - get_cycles()); - M("cpu_khz %15llu Clock rate for RDTSC counter, in khz\n", - cpu_khz); + M("time_ns %20llu sched_clock() time when metrics were gathered\n", + sched_clock()); for (core = 0; core < nr_cpu_ids; core++) { struct homa_metrics *m = &per_cpu(homa_metrics, core); __s64 delta; @@ -135,16 +133,16 @@ char *homa_metrics_print(struct homa *homa) } M("skb_allocs %15llu sk_buffs allocated\n", m->skb_allocs); - M("skb_alloc_cycles %15llu Time spent allocating sk_buffs\n", - m->skb_alloc_cycles); + M("skb_alloc_ns %15llu Time spent allocating sk_buffs\n", + m->skb_alloc_ns); M("skb_frees %15llu Data sk_buffs freed in normal paths\n", m->skb_frees); - M("skb_free_cycles %15llu Time spent freeing data sk_buffs\n", - m->skb_free_cycles); + M("skb_free_ns %15llu Time spent freeing data sk_buffs\n", + m->skb_free_ns); M("skb_page_allocs %15llu Pages allocated for sk_buff frags\n", m->skb_page_allocs); - M("skb_page_alloc_cycles %15llu Time spent allocating pages for sk_buff frags\n", - m->skb_page_alloc_cycles); + M("skb_page_alloc_ns %15llu Time spent allocating pages for sk_buff frags\n", + m->skb_page_alloc_ns); M("requests_received %15llu Incoming request messages\n", m->requests_received); M("requests_queued %15llu Requests for which no thread was waiting\n", @@ -161,74 +159,74 @@ char *homa_metrics_print(struct homa *homa) m->handoffs_thread_waiting); M("handoffs_alt_thread %15llu RPC handoffs not to first on list (avoid busy core)\n", m->handoffs_alt_thread); - M("poll_cycles %15llu Time spent polling for incoming messages\n", - m->poll_cycles); + M("poll_ns %15llu Time spent polling for incoming messages\n", + m->poll_ns); M("softirq_calls %15llu Calls to homa_softirq (i.e. # GRO pkts received)\n", m->softirq_calls); - M("softirq_cycles %15llu Time spent in homa_softirq during SoftIRQ\n", - m->softirq_cycles); + M("softirq_ns %15llu Time spent in homa_softirq during SoftIRQ\n", + m->softirq_ns); - M("bypass_softirq_cycles %15llu Time spent in homa_softirq during bypass from GRO\n", - m->bypass_softirq_cycles); + M("bypass_softirq_ns %15llu Time spent in homa_softirq during bypass from GRO\n", + m->bypass_softirq_ns); - M("linux_softirq_cycles %15llu Time spent in all Linux SoftIRQ\n", - m->linux_softirq_cycles); + M("linux_softirq_ns %15llu Time spent in all Linux SoftIRQ\n", + m->linux_softirq_ns); - M("napi_cycles %15llu Time spent in NAPI-level packet handling\n", - m->napi_cycles); + M("napi_ns %15llu Time spent in NAPI-level packet handling\n", + m->napi_ns); - M("send_cycles %15llu Time spent in homa_sendmsg for requests\n", - m->send_cycles); + M("send_ns %15llu Time spent in homa_sendmsg for requests\n", + m->send_ns); M("send_calls %15llu Total invocations of homa_sendmsg for requests\n", m->send_calls); // It is possible for us to get here at a time when a // thread has been blocked for a long time and has - // recorded blocked_cycles, but hasn't finished the - // system call so recv_cycles hasn't been incremented + // recorded blocked_ns, but hasn't finished the + // system call so recv_ns hasn't been incremented // yet. If that happens, just record 0 to prevent // underflow errors.
- delta = m->recv_cycles - m->blocked_cycles; + delta = m->recv_ns - m->blocked_ns; if (delta < 0) delta = 0; - M("recv_cycles %15llu Unblocked time spent in recvmsg kernel call\n", + M("recv_ns %15llu Unblocked time spent in recvmsg kernel call\n", delta); M("recv_calls %15llu Total invocations of recvmsg kernel call\n", m->recv_calls); - M("blocked_cycles %15llu Time spent blocked in homa_recvmsg\n", - m->blocked_cycles); + M("blocked_ns %15llu Time spent blocked in homa_recvmsg\n", + m->blocked_ns); - M("reply_cycles %15llu Time spent in homa_sendmsg for responses\n", - m->reply_cycles); + M("reply_ns %15llu Time spent in homa_sendmsg for responses\n", + m->reply_ns); M("reply_calls %15llu Total invocations of homa_sendmsg for responses\n", m->reply_calls); - M("abort_cycles %15llu Time spent in homa_ioc_abort kernel call\n", - m->reply_cycles); + M("abort_ns %15llu Time spent in homa_ioc_abort kernel call\n", + m->abort_ns); M("abort_calls %15llu Total invocations of abort kernel call\n", m->abort_calls); - M("so_set_buf_cycles %15llu Time spent in setsockopt SO_HOMA_SET_BUF\n", - m->so_set_buf_cycles); + M("so_set_buf_ns %15llu Time spent in setsockopt SO_HOMA_SET_BUF\n", + m->so_set_buf_ns); M("so_set_buf_calls %15llu Total invocations of setsockopt SO_HOMA_SET_BUF\n", m->so_set_buf_calls); - M("grantable_lock_cycles %15llu Time spent with homa->grantable_lock locked\n", - m->grantable_lock_cycles); + M("grantable_lock_ns %15llu Time spent with homa->grantable_lock locked\n", + m->grantable_lock_ns); - M("timer_cycles %15llu Time spent in homa_timer\n", - m->timer_cycles); + M("timer_ns %15llu Time spent in homa_timer\n", + m->timer_ns); - M("timer_reap_cycles %15llu Time in homa_timer spent reaping RPCs\n", - m->timer_reap_cycles); + M("timer_reap_ns %15llu Time in homa_timer spent reaping RPCs\n", + m->timer_reap_ns); - M("data_pkt_reap_cycles %15llu Time in homa_data_pkt spent reaping RPCs\n", - m->data_pkt_reap_cycles); + M("data_pkt_reap_ns %15llu Time in homa_data_pkt spent reaping RPCs\n", + m->data_pkt_reap_ns); - M("pacer_cycles %15llu Time spent in homa_pacer_main\n", - m->pacer_cycles); + M("pacer_ns %15llu Time spent in homa_pacer_main\n", + m->pacer_ns); - M("homa_cycles %15llu Total time in all Homa-related functions\n", - m->softirq_cycles + m->napi_cycles + - m->send_cycles + m->recv_cycles + - m->reply_cycles - m->blocked_cycles + - m->timer_cycles + m->pacer_cycles); + M("homa_ns %15llu Total time in all Homa-related functions\n", + m->softirq_ns + m->napi_ns + + m->send_ns + m->recv_ns + + m->reply_ns - m->blocked_ns + + m->timer_ns + m->pacer_ns); - M("pacer_lost_cycles %15llu Lost transmission time because pacer was slow\n", - m->pacer_lost_cycles); + M("pacer_lost_ns %15llu Lost transmission time because pacer was slow\n", + m->pacer_lost_ns); M("pacer_bytes %15llu Bytes transmitted when the pacer was active\n", m->pacer_bytes); M("pacer_skipped_rpcs %15llu Pacer aborts because of locked RPCs\n", m->pacer_skipped_rpcs); M("pacer_needed_help %15llu homa_pacer_xmit invocations from homa_check_pacer\n", m->pacer_needed_help); - M("throttled_cycles %15llu Time when the throttled queue was nonempty\n", - m->throttled_cycles); + M("throttled_ns %15llu Time when the throttled queue was nonempty\n", + m->throttled_ns); M("resent_packets %15llu DATA packets sent in response to RESENDs\n", m->resent_packets); M("peer_hash_links %15llu Hash chain link traversals in peer table\n", @@ -265,34 +263,34 @@ char *homa_metrics_print(struct homa *homa) m->server_rpcs_unknown);
M("client_lock_misses %15llu Bucket lock misses for client RPCs\n", m->client_lock_misses); - M("client_lock_miss_cycles %15llu Time lost waiting for client bucket locks\n", - m->client_lock_miss_cycles); + M("client_lock_miss_ns %15llu Time lost waiting for client bucket locks\n", + m->client_lock_miss_ns); M("server_lock_misses %15llu Bucket lock misses for server RPCs\n", m->server_lock_misses); - M("server_lock_miss_cycles %15llu Time lost waiting for server bucket locks\n", - m->server_lock_miss_cycles); + M("server_lock_miss_ns %15llu Time lost waiting for server bucket locks\n", + m->server_lock_miss_ns); M("socket_lock_misses %15llu Socket lock misses\n", m->socket_lock_misses); - M("socket_lock_miss_cycles %15llu Time lost waiting for socket locks\n", - m->socket_lock_miss_cycles); + M("socket_lock_miss_ns %15llu Time lost waiting for socket locks\n", + m->socket_lock_miss_ns); M("throttle_lock_misses %15llu Throttle lock misses\n", m->throttle_lock_misses); - M("throttle_lock_miss_cycles %15llu Time lost waiting for throttle locks\n", - m->throttle_lock_miss_cycles); + M("throttle_lock_miss_ns %15llu Time lost waiting for throttle locks\n", + m->throttle_lock_miss_ns); M("peer_ack_lock_misses %15llu Misses on peer ack locks\n", m->peer_ack_lock_misses); - M("peer_ack_lock_miss_cycles %15llu Time lost waiting for peer ack locks\n", - m->peer_ack_lock_miss_cycles); + M("peer_ack_lock_miss_ns %15llu Time lost waiting for peer ack locks\n", + m->peer_ack_lock_miss_ns); M("grantable_lock_misses %15llu Grantable lock misses\n", m->grantable_lock_misses); - M("grantable_lock_miss_cycles%15llu Time lost waiting for grantable lock\n", - m->grantable_lock_miss_cycles); + M("grantable_lock_miss_ns %15llu Time lost waiting for grantable lock\n", + m->grantable_lock_miss_ns); M("grantable_rpcs_integral %15llu Integral of homa->num_grantable_rpcs*dt\n", m->grantable_rpcs_integral); M("grant_recalc_calls %15llu Number of calls to homa_grant_recalc\n", m->grant_recalc_calls); - M("grant_recalc_cycles %15llu Time spent in homa_grant_recalc\n", - m->grant_recalc_cycles); + M("grant_recalc_ns %15llu Time spent in homa_grant_recalc\n", + m->grant_recalc_ns); M("grant_recalc_skips %15llu Number of times homa_grant_recalc skipped redundant work\n", m->grant_recalc_skips); M("grant_recalc_loops %15llu Number of times homa_grant_recalc looped back\n", diff --git a/homa_metrics.h b/homa_metrics.h index a7ddcb05..23d3ecf6 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -78,11 +78,8 @@ struct homa_metrics { */ __u64 skb_allocs; - /** - * @skb_alloc_cycles: total time spent in homa_skb_new_tx, as - * measured with get_cycles(). - */ - __u64 skb_alloc_cycles; + /** @skb_alloc_ns: total time spent in homa_skb_new_tx. */ + __u64 skb_alloc_ns; /** * @skb_frees: total number of sk_buffs for data packets that have @@ -90,22 +87,16 @@ struct homa_metrics { */ __u64 skb_frees; - /** - * @skb_free_cycles: total time spent freeing sk_buffs, as - * measured with get_cycles(). - */ - __u64 skb_free_cycles; + /** @skb_free_ns: total time spent freeing sk_buffs. */ + __u64 skb_free_ns; /** * @skb_page_allocs: total number of calls to homa_skb_page_alloc. */ __u64 skb_page_allocs; - /** - * @skb_page_alloc_cycles: total time spent in homa_skb_page_alloc, as - * measured with get_cycles(). - */ - __u64 skb_page_alloc_cycles; + /** @skb_page_alloc_ns: total time spent in homa_skb_page_alloc. */ + __u64 skb_page_alloc_ns; /** * @requests_received: total number of request messages received. 
@@ -155,10 +146,10 @@ struct homa_metrics { __u64 handoffs_alt_thread; /** - * @poll_cycles: total time spent in the polling loop in - * homa_wait_for_message, as measured with get_cycles(). + * @poll_ns: total time spent in the polling loop in + * homa_wait_for_message. */ - __u64 poll_cycles; + __u64 poll_ns; /** * @softirq_calls: total number of calls to homa_softirq (i.e., @@ -168,36 +159,36 @@ struct homa_metrics { __u64 softirq_calls; /** - * @softirq_cycles: total time spent executing homa_softirq when - * invoked under Linux's SoftIRQ handler, as measured with get_cycles(). + * @softirq_ns: total time spent executing homa_softirq when + * invoked under Linux's SoftIRQ handler. */ - __u64 softirq_cycles; + __u64 softirq_ns; /** - * @bypass_softirq_cycles: total time spent executing homa_softirq when + * @bypass_softirq_ns: total time spent executing homa_softirq when * invoked during GRO, bypassing the SoftIRQ mechanism. */ - __u64 bypass_softirq_cycles; + __u64 bypass_softirq_ns; /** - * @linux_softirq_cycles: total time spent executing all softirq - * activities, as measured by the linux softirq module, in get_cycles() - * units. Only available with modified Linux kernels. + * @linux_softirq_ns: total time spent executing all softirq + * activities, as measured by the linux softirq module. Only + * available with modified Linux kernels. */ - __u64 linux_softirq_cycles; + __u64 linux_softirq_ns; /** - * @napi_cycles: total time spent executing all NAPI activities, - * as measured by the linux softirq module, in get_cycles() units. - * Only available with modified Linux kernels. + * @napi_ns: total time spent executing all NAPI activities, as + * measured by the linux softirq module. Only available with modified + * Linux kernels. */ - __u64 napi_cycles; + __u64 napi_ns; /** - * @send_cycles: total time spent executing the homa_sendmsg kernel - * call handler to send requests, as measured with get_cycles(). + * @send_ns: total time spent executing the homa_sendmsg kernel + * call handler to send requests. */ - __u64 send_cycles; + __u64 send_ns; /** @send_calls: total number of invocations of homa_sendmsg * for requests. @@ -205,25 +196,25 @@ struct homa_metrics { __u64 send_calls; /** - * @recv_cycles: total time spent executing homa_recvmsg (including - * time when the thread is blocked), as measured with get_cycles(). + * @recv_ns: total time spent executing homa_recvmsg (including + * time when the thread is blocked). */ - __u64 recv_cycles; + __u64 recv_ns; /** @recv_calls: total number of invocations of homa_recvmsg. */ __u64 recv_calls; /** - * @blocked_cycles: total time threads spend in blocked state + * @blocked_ns: total time spent by threads in blocked state * while executing the homa_recvmsg kernel call handler. */ - __u64 blocked_cycles; + __u64 blocked_ns; /** - * @reply_cycles: total time spent executing the homa_sendmsg kernel - * call handler to send responses, as measured with get_cycles(). + * @reply_ns: total time spent executing the homa_sendmsg kernel + * call handler to send responses. */ - __u64 reply_cycles; + __u64 reply_ns; /** * @reply_calls: total number of invocations of homa_sendmsg * for responses. @@ -232,10 +223,10 @@ struct homa_metrics { __u64 reply_calls; /** - * @abort_cycles: total time spent executing the homa_ioc_abort - * kernel call handler, as measured with get_cycles(). + * @abort_ns: total time spent executing the homa_ioc_abort + * kernel call handler.
*/ - __u64 abort_cycles; + __u64 abort_ns; /** * @abort_calls: total number of invocations of the homa_ioc_abort @@ -244,10 +235,10 @@ struct homa_metrics { __u64 abort_calls; /** - * @so_set_buf_cycles: total time spent executing the homa_ioc_set_buf - * kernel call handler, as measured with get_cycles(). + * @so_set_buf_ns: total time spent executing the homa_ioc_set_buf + * kernel call handler. */ - __u64 so_set_buf_cycles; + __u64 so_set_buf_ns; /** * @so_set_buf_calls: total number of invocations of the homa_ioc_set_buf @@ -256,42 +247,38 @@ struct homa_metrics { __u64 so_set_buf_calls; /** - * @grantable_lock_cycles: total time spent with homa->grantable_lock + * @grantable_lock_ns: total time spent with homa->grantable_lock * locked. */ - __u64 grantable_lock_cycles; + __u64 grantable_lock_ns; - /** - * @timer_cycles: total time spent in homa_timer, as measured with - * get_cycles(). - */ - __u64 timer_cycles; + /** @timer_ns: total time spent in homa_timer. */ + __u64 timer_ns; /** - * @timer_reap_cycles: total time spent by homa_timer to reap dead - * RPCs, as measured with get_cycles(). This time is included in - * @timer_cycles. + * @timer_reap_ns: total time spent by homa_timer to reap dead + * RPCs. This time is included in @timer_ns. */ - __u64 timer_reap_cycles; + __u64 timer_reap_ns; /** - * @data_pkt_reap_cycles: total time spent by homa_data_pkt to reap - * dead RPCs, as measured with get_cycles(). + * @data_pkt_reap_ns: total time spent by homa_data_pkt to reap + * dead RPCs. */ - __u64 data_pkt_reap_cycles; + __u64 data_pkt_reap_ns; /** - * @pacer_cycles: total time spent executing in homa_pacer_main - * (not including blocked time), as measured with get_cycles(). + * @pacer_ns: total time spent executing in homa_pacer_main + * (not including blocked time). */ - __u64 pacer_cycles; + __u64 pacer_ns; /** - * @pacer_lost_cycles: unnecessary delays in transmitting packets + * @pacer_lost_ns: unnecessary delays in transmitting packets * (i.e. wasted output bandwidth) because the pacer was slow or got * descheduled. */ - __u64 pacer_lost_cycles; + __u64 pacer_lost_ns; /** * @pacer_bytes: total number of bytes transmitted when @@ -313,10 +300,10 @@ struct homa_metrics { __u64 pacer_needed_help; /** - * @throttled_cycles: total amount of time that @homa->throttled_rpcs - * is nonempty, as measured with get_cycles(). + * @throttled_ns: total amount of time that @homa->throttled_rpcs + * is nonempty. */ - __u64 throttled_cycles; + __u64 throttled_ns; /** * @resent_packets: total number of data packets issued in response to @@ -431,10 +418,10 @@ struct homa_metrics { __u64 client_lock_misses; /** - * @client_lock_miss_cycles: total time spent waiting for client - * bucket lock misses, measured by get_cycles(). + * @client_lock_miss_ns: total time spent waiting for client + * bucket lock misses. */ - __u64 client_lock_miss_cycles; + __u64 client_lock_miss_ns; /** * @server_lock_misses: total number of times that Homa had to wait @@ -443,16 +430,16 @@ struct homa_metrics { __u64 server_lock_misses; /** - * @server_lock_miss_cycles: total time spent waiting for server - * bucket lock misses, measured by get_cycles(). + * @server_lock_miss_ns: total time spent waiting for server + * bucket lock misses. */ - __u64 server_lock_miss_cycles; + __u64 server_lock_miss_ns; /** - * @socket_lock_miss_cycles: total time spent waiting for socket - * lock misses, measured by get_cycles(). + * @socket_lock_miss_ns: total time spent waiting for socket + * lock misses. 
*/ - __u64 socket_lock_miss_cycles; + __u64 socket_lock_miss_ns; /** * @socket_lock_misses: total number of times that Homa had to wait @@ -461,10 +448,10 @@ struct homa_metrics { __u64 socket_lock_misses; /** - * @throttle_lock_miss_cycles: total time spent waiting for throttle - * lock misses, measured by get_cycles(). + * @throttle_lock_miss_ns: total time spent waiting for throttle + * lock misses. */ - __u64 throttle_lock_miss_cycles; + __u64 throttle_lock_miss_ns; /** * @throttle_lock_misses: total number of times that Homa had to wait @@ -473,10 +460,9 @@ struct homa_metrics { __u64 throttle_lock_misses; /** - * @peer_acklock_miss_cycles: total time spent waiting for peer - * lock misses, measured by get_cycles(). + * @peer_ack_lock_miss_ns: total time spent waiting for peer lock misses. */ - __u64 peer_ack_lock_miss_cycles; + __u64 peer_ack_lock_miss_ns; /** * @peer_ack_lock_misses: total number of times that Homa had to wait @@ -485,10 +471,10 @@ struct homa_metrics { __u64 peer_ack_lock_misses; /** - * @grantable_lock_miss_cycles: total time spent waiting for grantable - * lock misses, measured by get_cycles(). + * @grantable_lock_miss_ns: total time spent waiting for grantable + * lock misses. */ - __u64 grantable_lock_miss_cycles; + __u64 grantable_lock_miss_ns; /** * @grantable_lock_misses: total number of times that Homa had to wait @@ -498,8 +484,8 @@ struct homa_metrics { /** * @grantable_rpcs_integral: cumulative sum of time_delta*grantable, - * where time_delta is a get_cycles time and grantable is the - * value of homa->num_grantable_rpcs over that time period. + * where time_delta is in nanoseconds and grantable is the value of + * homa->num_grantable_rpcs over that time period. */ __u64 grantable_rpcs_integral; /** @@ -509,11 +495,8 @@ struct homa_metrics { */ __u64 grant_recalc_calls; - /** - * @grant_recalc_cycles: total time spent in homa_grant_recalc, - * in get_cycles() units. - */ - __u64 grant_recalc_cycles; + /** @grant_recalc_ns: total time spent in homa_grant_recalc. */ + __u64 grant_recalc_ns; /** * @grant_recalc_loops: cumulative number of times homa_grant_recalc @@ -676,7 +659,7 @@ static inline struct homa_metrics *homa_metrics_per_cpu(void) * happens is that one of the INC_METRICs is lost, which isn't a big deal. */ #define INC_METRIC(metric, count) per_cpu(homa_metrics, \ - raw_smp_processor_id()).metric+= (count) + raw_smp_processor_id()).metric += (count) extern void homa_metric_append(struct homa *homa, const char *format, ...); extern loff_t homa_metrics_lseek(struct file *file, loff_t offset, diff --git a/homa_offload.c b/homa_offload.c index 9111c14d..f9f6b184 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -278,13 +278,13 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, * gro_list by the caller, so it will be considered for merges * in the future.
*/ - __u64 saved_softirq_metric, softirq_cycles; + __u64 saved_softirq_metric, softirq_ns; struct homa_offload_core *offload_core; struct sk_buff *result = NULL; - __u64 *softirq_cycles_metric; + __u64 *softirq_ns_metric; struct data_header *h_new; struct sk_buff *held_skb; - __u64 now = get_cycles(); + __u64 now = sched_clock(); int priority; __u32 saddr; __u32 hash; @@ -292,7 +292,7 @@ h_new = (struct data_header *) skb_transport_header(skb); offload_core = &per_cpu(homa_offload_core, raw_smp_processor_id()); - busy = (now - offload_core->last_gro) < homa->gro_busy_cycles; + busy = (now - offload_core->last_gro) < homa->gro_busy_ns; offload_core->last_active = now; if (skb_is_ipv6(skb)) { priority = ipv6_hdr(skb)->priority; @@ -433,20 +433,20 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, done: homa_check_pacer(homa, 1); - offload_core->last_gro = get_cycles(); + offload_core->last_gro = sched_clock(); return result; bypass: /* Record SoftIRQ time in a different metric to reflect that * it happened during bypass. */ - softirq_cycles_metric = &homa_metrics_per_cpu()->softirq_cycles; - saved_softirq_metric = *softirq_cycles_metric; + softirq_ns_metric = &homa_metrics_per_cpu()->softirq_ns; + saved_softirq_metric = *softirq_ns_metric; homa_softirq(skb); - softirq_cycles = *softirq_cycles_metric - saved_softirq_metric; - *softirq_cycles_metric = saved_softirq_metric; - INC_METRIC(bypass_softirq_cycles, softirq_cycles); - offload_core->last_gro = get_cycles(); + softirq_ns = *softirq_ns_metric - saved_softirq_metric; + *softirq_ns_metric = saved_softirq_metric; + INC_METRIC(bypass_softirq_ns, softirq_ns); + offload_core->last_gro = sched_clock(); /* This return value indicates that we have freed skb. */ return ERR_PTR(-EINPROGRESS); @@ -473,7 +473,7 @@ void homa_gro_gen2(struct sk_buff *skb) int this_core = raw_smp_processor_id(); struct homa_offload_core *offload_core; int candidate = this_core; - __u64 now = get_cycles(); + __u64 now = sched_clock(); int i; for (i = CORES_TO_CHECK; i > 0; i--) { @@ -483,7 +483,7 @@ offload_core = &per_cpu(homa_offload_core, candidate); if (atomic_read(&offload_core->softirq_backlog) > 0) continue; - if ((offload_core->last_gro + homa->busy_cycles) > now) + if ((offload_core->last_gro + homa->busy_ns) > now) continue; tt_record3("homa_gro_gen2 chose core %d for id %d offset %d", candidate, homa_local_id(h->common.sender_id), @@ -531,8 +531,8 @@ void homa_gro_gen3(struct sk_buff *skb) candidates = per_cpu(homa_offload_core, raw_smp_processor_id()).gen3_softirq_cores; - now = get_cycles(); - busy_time = now - homa->busy_cycles; + now = sched_clock(); + busy_time = now - homa->busy_ns; core = candidates[0]; for (i = 0; i < NUM_GEN3_SOFTIRQ_CORES; i++) { diff --git a/homa_offload.h b/homa_offload.h index c0f3c9bb..6bf79154 100644 --- a/homa_offload.h +++ b/homa_offload.h @@ -13,14 +13,14 @@ */ struct homa_offload_core { /** - * @last_active: the last time (in get_cycle() units) that + * @last_active: the last time (in sched_clock() units) that * there was system activity, such as NAPI or SoftIRQ, on this * core. Used for load balancing. */ __u64 last_active; /** - * @last_gro: the last time (in get_cycle() units) that + * @last_gro: the last time (in sched_clock() units) that * homa_gro_receive returned on this core. Used to determine * whether GRO is keeping a core busy.
*/ @@ -50,7 +50,7 @@ struct homa_offload_core { int gen3_softirq_cores[NUM_GEN3_SOFTIRQ_CORES]; /** - * @last_app_active: the most recent time (get_cycles() units) + * @last_app_active: the most recent time (sched_clock() units) * when an application was actively using Homa on this core (e.g., * by sending or receiving messages). Used for load balancing * (see balance.txt). diff --git a/homa_outgoing.c b/homa_outgoing.c index 24673b4d..bd5b2727 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -48,7 +48,7 @@ void homa_message_out_init(struct homa_rpc *rpc, int length) if (rpc->msgout.unscheduled > length) rpc->msgout.unscheduled = length; rpc->msgout.sched_priority = 0; - rpc->msgout.init_cycles = get_cycles(); + rpc->msgout.init_ns = sched_clock(); } /** @@ -734,16 +734,12 @@ void homa_outgoing_sysctl_changed(struct homa *homa) { __u64 tmp; - /* Code below is written carefully to avoid integer underflow or - * overflow under expected usage patterns. Be careful when changing! - */ - homa->cycles_per_kbyte = 8 * (__u64)cpu_khz; - do_div(homa->cycles_per_kbyte, homa->link_mbps); - homa->cycles_per_kbyte = 101 * homa->cycles_per_kbyte; - do_div(homa->cycles_per_kbyte, 100); - tmp = homa->max_nic_queue_ns * cpu_khz; - do_div(tmp, 1000000); - homa->max_nic_queue_cycles = tmp; + tmp = 8 * 1000ULL * 1000ULL * 1000ULL; + + /* Underestimate link bandwidth (overestimate time) by 1%. */ + tmp = tmp * 101 / 100; + do_div(tmp, homa->link_mbps); + homa->ns_per_mbyte = tmp; } /** @@ -764,16 +760,17 @@ void homa_outgoing_sysctl_changed(struct homa *homa) */ int homa_check_nic_queue(struct homa *homa, struct sk_buff *skb, bool force) { - int cycles_for_packet, bytes; - __u64 idle, new_idle, clock; + __u64 idle, new_idle, clock, ns_for_packet; + int bytes; bytes = homa_get_skb_info(skb)->wire_bytes; - cycles_for_packet = bytes * homa->cycles_per_kbyte; - do_div(cycles_for_packet, 1000); + ns_for_packet = homa->ns_per_mbyte; + ns_for_packet *= bytes; + do_div(ns_for_packet, 1000000); while (1) { - clock = get_cycles(); + clock = sched_clock(); idle = atomic64_read(&homa->link_idle_time); - if ((clock + homa->max_nic_queue_cycles) < idle && !force && + if ((clock + homa->max_nic_queue_ns) < idle && !force && !(homa->flags & HOMA_FLAG_DONT_THROTTLE)) return 0; if (!list_empty(&homa->throttled_rpcs)) @@ -784,18 +781,18 @@ int homa_check_nic_queue(struct homa *homa, struct sk_buff *skb, bool force) __u64 lost = (homa->pacer_wake_time > idle) ? clock - homa->pacer_wake_time : clock - idle; - INC_METRIC(pacer_lost_cycles, lost); + INC_METRIC(pacer_lost_ns, lost); tt_record1("pacer lost %d ns", lost); } - new_idle = clock + cycles_for_packet; + new_idle = clock + ns_for_packet; } else { - new_idle = idle + cycles_for_packet; + new_idle = idle + ns_for_packet; } #else /* See strip.py */ if (idle < clock) - new_idle = clock + cycles_for_packet; + new_idle = clock + ns_for_packet; else - new_idle = idle + cycles_for_packet; + new_idle = idle + ns_for_packet; #endif /* See strip.py */ /* This method must be thread-safe.
*/ @@ -816,7 +813,7 @@ int homa_pacer_main(void *transport) { struct homa *homa = (struct homa *)transport; - homa->pacer_wake_time = get_cycles(); + homa->pacer_wake_time = sched_clock(); while (1) { if (homa->pacer_exit) { homa->pacer_wake_time = 0; @@ -842,10 +839,10 @@ int homa_pacer_main(void *transport) throttled_links) != NULL) #endif /* See strip.py */ __set_current_state(TASK_RUNNING); - INC_METRIC(pacer_cycles, get_cycles() - homa->pacer_wake_time); + INC_METRIC(pacer_ns, sched_clock() - homa->pacer_wake_time); homa->pacer_wake_time = 0; schedule(); - homa->pacer_wake_time = get_cycles(); + homa->pacer_wake_time = sched_clock(); __set_current_state(TASK_RUNNING); } kthread_complete_and_exit(&homa_pacer_kthread_done, 0); @@ -885,16 +882,16 @@ void homa_pacer_xmit(struct homa *homa) __u64 idle_time, now; /* If the NIC queue is too long, wait until it gets shorter. */ - now = get_cycles(); + now = sched_clock(); idle_time = atomic64_read(&homa->link_idle_time); - while ((now + homa->max_nic_queue_cycles) < idle_time) { + while ((now + homa->max_nic_queue_ns) < idle_time) { /* If we've xmitted at least one packet then * return (this helps with testing and also * allows homa_pacer_main to yield the core). */ if (i != 0) goto done; - now = get_cycles(); + now = sched_clock(); } /* Note: when we get here, it's possible that the NIC queue is * still too long because other threads have queued packets, @@ -920,9 +917,9 @@ void homa_pacer_xmit(struct homa *homa) rpc = NULL; list_for_each_entry_rcu(cur, &homa->throttled_rpcs, throttled_links) { - if (cur->msgout.init_cycles < oldest) { + if (cur->msgout.init_ns < oldest) { rpc = cur; - oldest = cur->msgout.init_cycles; + oldest = cur->msgout.init_ns; } } } else { @@ -962,7 +959,7 @@ void homa_pacer_xmit(struct homa *homa) rpc->id, rpc->msgout.next_xmit_offset); list_del_rcu(&rpc->throttled_links); if (list_empty(&homa->throttled_rpcs)) - INC_METRIC(throttled_cycles, get_cycles() + INC_METRIC(throttled_ns, sched_clock() - homa->throttle_add); /* Note: this reinitialization is only safe @@ -1012,9 +1009,9 @@ void homa_add_to_throttled(struct homa_rpc *rpc) if (!list_empty(&rpc->throttled_links)) return; - now = get_cycles(); + now = sched_clock(); if (!list_empty(&homa->throttled_rpcs)) - INC_METRIC(throttled_cycles, now - homa->throttle_add); + INC_METRIC(throttled_ns, now - homa->throttle_add); homa->throttle_add = now; bytes_left = rpc->msgout.length - rpc->msgout.next_xmit_offset; homa_throttle_lock(homa); @@ -1056,7 +1053,7 @@ void homa_remove_from_throttled(struct homa_rpc *rpc) homa_throttle_lock(rpc->hsk->homa); list_del(&rpc->throttled_links); if (list_empty(&rpc->hsk->homa->throttled_rpcs)) - INC_METRIC(throttled_cycles, get_cycles() + INC_METRIC(throttled_ns, sched_clock() - rpc->hsk->homa->throttle_add); homa_throttle_unlock(rpc->hsk->homa); INIT_LIST_HEAD(&rpc->throttled_links); diff --git a/homa_peer.c b/homa_peer.c index 0294867c..1a643481 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -111,7 +111,7 @@ struct homa_peer **homa_peertab_get_peers(struct homa_peertab *peertab, * homa_peertab_gc_dsts() - Invoked to free unused dst_entries, if it is * safe to do so. * @peertab: The table in which to free entries. - * @now: Current time, in get_cycles units; entries with expiration + * @now: Current time, in sched_clock() units; entries with expiration * dates no later than this will be freed. Specify ~0 to * free all entries. 
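 *		Entries appear on dead_dsts in increasing gc_time order,
 *		so the scan can stop at the first entry that is too new.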
*/ @@ -120,7 +120,7 @@ void homa_peertab_gc_dsts(struct homa_peertab *peertab, __u64 now) while (!list_empty(&peertab->dead_dsts)) { struct homa_dead_dst *dead = list_first_entry(&peertab->dead_dsts, struct homa_dead_dst, - dst_links); + dst_links); if (dead->gc_time > now) break; dst_release(dead->dst); @@ -244,10 +244,10 @@ void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, */ dst_release(peer->dst); } else { - __u64 now = get_cycles(); + __u64 now = sched_clock(); dead->dst = peer->dst; - dead->gc_time = now + (cpu_khz << 7); + dead->gc_time = now + 125000000; list_add_tail(&dead->dst_links, &peertab->dead_dsts); homa_peertab_gc_dsts(peertab, now); } @@ -360,13 +360,13 @@ void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, int c2, */ void homa_peer_lock_slow(struct homa_peer *peer) { - __u64 start = get_cycles(); + __u64 start = sched_clock(); tt_record("beginning wait for peer lock"); spin_lock_bh(&peer->ack_lock); tt_record("ending wait for peer lock"); INC_METRIC(peer_ack_lock_misses, 1); - INC_METRIC(peer_ack_lock_miss_cycles, get_cycles() - start); + INC_METRIC(peer_ack_lock_miss_ns, sched_clock() - start); } /** diff --git a/homa_peer.h b/homa_peer.h index e26574c8..50fc2d20 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -23,7 +23,7 @@ struct homa_dead_dst { struct dst_entry *dst; /** - * @gc_time: Time (in units of get_cycles) when it is safe + * @gc_time: Time (in units of sched_clock()) when it is safe * to free @dst. */ __u64 gc_time; diff --git a/homa_plumbing.c b/homa_plumbing.c index cde33667..96e3e06d 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -696,7 +696,7 @@ void homa_close(struct sock *sk, long timeout) homa_sock_destroy(hsk); sk_common_release(sk); - tt_record1("closed socket, port %d\n", hsk->port); + tt_record1("closed socket, port %d", hsk->port); if (hsk->homa->freeze_type == SOCKET_CLOSE) tt_freeze(); } @@ -778,13 +778,13 @@ int homa_ioc_abort(struct sock *sk, int *karg) int homa_ioctl(struct sock *sk, int cmd, int *karg) { int result; - __u64 start = get_cycles(); + __u64 start = sched_clock(); switch (cmd) { case HOMAIOCABORT: result = homa_ioc_abort(sk, karg); INC_METRIC(abort_calls, 1); - INC_METRIC(abort_cycles, get_cycles() - start); + INC_METRIC(abort_ns, sched_clock() - start); break; case HOMAIOCFREEZE: tt_record1("Freezing timetrace because of HOMAIOCFREEZE ioctl, pid %d", current->pid); @@ -829,7 +829,7 @@ int homa_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, { struct homa_sock *hsk = homa_sk(sk); struct homa_set_buf_args args; - __u64 start = get_cycles(); + __u64 start = sched_clock(); int ret; if (level != IPPROTO_HOMA || optname != SO_HOMA_SET_BUF || @@ -849,7 +849,7 @@ int homa_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, ret = homa_pool_init(hsk, args.start, args.length); homa_sock_unlock(hsk); INC_METRIC(so_set_buf_calls, 1); - INC_METRIC(so_set_buf_cycles, get_cycles() - start); + INC_METRIC(so_set_buf_ns, sched_clock() - start); return ret; } @@ -882,7 +882,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) { struct homa_sock *hsk = homa_sk(sk); struct homa_sendmsg_args args; - __u64 start = get_cycles(); + __u64 start = sched_clock(); __u64 finish; int result = 0; struct homa_rpc *rpc = NULL; @@ -938,8 +938,8 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) result = -EFAULT; goto error; } - finish = get_cycles(); - INC_METRIC(send_cycles, finish - start); + finish = sched_clock(); + 
INC_METRIC(send_ns, finish - start); } else { /* This is a response message. */ struct in6_addr canonical_dest; @@ -983,8 +983,8 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) if (result && rpc->state != RPC_DEAD) goto error; homa_rpc_unlock(rpc); - finish = get_cycles(); - INC_METRIC(reply_cycles, finish - start); + finish = sched_clock(); + INC_METRIC(reply_ns, finish - start); } tt_record1("homa_sendmsg finished, id %d", args.id); return 0; @@ -1015,7 +1015,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, { struct homa_sock *hsk = homa_sk(sk); struct homa_recvmsg_args control; - __u64 start = get_cycles(); + __u64 start = sched_clock(); struct homa_rpc *rpc; __u64 finish; int result; @@ -1071,7 +1071,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, * for performance debugging). */ if (rpc->hsk->homa->freeze_type == SLOW_RPC) { - uint64_t elapsed = (get_cycles() - rpc->start_cycles)>>10; + uint64_t elapsed = (sched_clock() - rpc->start_ns)>>10; if ((elapsed <= hsk->homa->temp[1]) && (elapsed >= hsk->homa->temp[0]) @@ -1136,11 +1136,11 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, result = -EFAULT; } - finish = get_cycles(); + finish = sched_clock(); tt_record3("homa_recvmsg returning id %d, length %d, bpage0 %d", control.id, result, control.bpage_offsets[0] >> HOMA_BPAGE_SHIFT); - INC_METRIC(recv_cycles, finish - start); + INC_METRIC(recv_ns, finish - start); return result; } @@ -1193,7 +1193,7 @@ int homa_softirq(struct sk_buff *skb) int pull_length; __u64 start; - start = get_cycles(); + start = sched_clock(); INC_METRIC(softirq_calls, 1); per_cpu(homa_offload_core, raw_smp_processor_id()).last_active = start; @@ -1332,7 +1332,7 @@ int homa_softirq(struct sk_buff *skb) } atomic_dec(&per_cpu(homa_offload_core, raw_smp_processor_id()).softirq_backlog); - INC_METRIC(softirq_cycles, get_cycles() - start); + INC_METRIC(softirq_ns, sched_clock() - start); return 0; } diff --git a/homa_pool.c b/homa_pool.c index 1e82e8c9..fe809b8f 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -137,7 +137,7 @@ int homa_pool_get_pages(struct homa_pool *pool, int num_pages, __u32 *pages, { int core_num = raw_smp_processor_id(); struct homa_pool_core *core; - __u64 now = get_cycles(); + __u64 now = sched_clock(); int alloced = 0; int limit = 0; @@ -211,8 +211,8 @@ int homa_pool_get_pages(struct homa_pool *pool, int num_pages, __u32 *pages, if (set_owner) { atomic_set(&bpage->refs, 2); bpage->owner = core_num; - bpage->expiration = now - + pool->hsk->homa->bpage_lease_cycles; + bpage->expiration = now + 1000 * + pool->hsk->homa->bpage_lease_usecs; } else { atomic_set(&bpage->refs, 1); bpage->owner = -1; @@ -241,7 +241,6 @@ int homa_pool_allocate(struct homa_rpc *rpc) __u32 pages[HOMA_MAX_BPAGES]; struct homa_pool_core *core; struct homa_bpage *bpage; - __u64 now = get_cycles(); struct homa_rpc *other; if (!pool->region) @@ -292,7 +291,8 @@ int homa_pool_allocate(struct homa_rpc *rpc) goto new_page; } } - bpage->expiration = now + pool->hsk->homa->bpage_lease_cycles; + bpage->expiration = sched_clock() + + 1000 * pool->hsk->homa->bpage_lease_usecs; atomic_inc(&bpage->refs); spin_unlock_bh(&bpage->lock); goto allocate_partial; diff --git a/homa_pool.h b/homa_pool.h index 0c0f2bd2..0ac08d49 100644 --- a/homa_pool.h +++ b/homa_pool.h @@ -38,7 +38,7 @@ struct homa_bpage { int owner; /** - * @expiration: time (in get_cycles units) after + * @expiration: time (in sched_clock() units) after * which 
it's OK to steal this page from its current * owner (if @refs is 1). */ diff --git a/homa_rpc.c b/homa_rpc.c index d6c6949d..9baca396 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -63,7 +63,7 @@ struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk, crpc->resend_timer_ticks = hsk->homa->timer_ticks; crpc->done_timer_ticks = 0; crpc->magic = HOMA_RPC_MAGIC; - crpc->start_cycles = get_cycles(); + crpc->start_ns = sched_clock(); /* Initialize fields that require locking. This allows the most * expensive work, such as copying in the message from user space, @@ -165,7 +165,7 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, srpc->resend_timer_ticks = hsk->homa->timer_ticks; srpc->done_timer_ticks = 0; srpc->magic = HOMA_RPC_MAGIC; - srpc->start_cycles = get_cycles(); + srpc->start_ns = sched_clock(); tt_record2("Incoming message for id %d has %d unscheduled bytes", srpc->id, ntohl(h->incoming)); err = homa_message_in_init(srpc, ntohl(h->message_length), diff --git a/homa_rpc.h b/homa_rpc.h index 27380eb1..8eebe08a 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -84,10 +84,10 @@ struct homa_message_out { __u8 sched_priority; /** - * @init_cycles: Time in get_cycles units when this structure was + * @init_ns: Time in sched_clock units when this structure was * initialized. Used to find the oldest outgoing message. */ - __u64 init_cycles; + __u64 init_ns; }; /** @@ -102,7 +102,7 @@ struct homa_gap { int end; /** - * @time: time (in get_cycles units) when the gap was first detected. + * @time: time (in sched_clock units) when the gap was first detected. * As of 7/2024 this isn't used for anything. */ __u64 time; @@ -174,7 +174,7 @@ struct homa_message_in { __u8 resend_all; /** - * @birth: get_cycles time when this RPC was added to the grantable + * @birth: sched_clock() time when this RPC was added to the grantable * list. Invalid if RPC isn't in the grantable list. */ __u64 birth; @@ -405,10 +405,10 @@ struct homa_rpc { int magic; /** - * @start_cycles: time (from get_cycles()) when this RPC was created. + * @start_ns: time (from sched_clock()) when this RPC was created. * Used (sometimes) for testing. */ - u64 start_cycles; + u64 start_ns; }; void homa_check_rpc(struct homa_rpc *rpc); diff --git a/homa_skb.c b/homa_skb.c index f4e0776e..57bd18b9 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -118,7 +118,7 @@ void homa_skb_cleanup(struct homa *homa) */ struct sk_buff *homa_skb_new_tx(int length) { - __u64 start = get_cycles(); + __u64 start = sched_clock(); struct sk_buff *skb; /* Note: allocate space for an IPv6 header, which is larger than @@ -132,7 +132,7 @@ struct sk_buff *homa_skb_new_tx(int length) skb_reset_transport_header(skb); } INC_METRIC(skb_allocs, 1); - INC_METRIC(skb_alloc_cycles, get_cycles() - start); + INC_METRIC(skb_alloc_ns, sched_clock() - start); return skb; } @@ -276,17 +276,17 @@ bool homa_skb_page_alloc(struct homa *homa, struct homa_skb_core *skb_core) /* Step 3: can we allocate a new big page? */ INC_METRIC(skb_page_allocs, 1); - start = get_cycles(); + start = sched_clock(); skb_core->skb_page = alloc_pages((GFP_KERNEL & ~__GFP_RECLAIM) | __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY, HOMA_SKB_PAGE_ORDER); if (likely(skb_core->skb_page)) { - INC_METRIC(skb_page_alloc_cycles, get_cycles() - start); + INC_METRIC(skb_page_alloc_ns, sched_clock() - start); goto success; } /* Step 4: can we allocate a normal page? 
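	 * This is the fallback when the big-page allocation in step 3
	 * fails; page_size drops to PAGE_SIZE.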
*/ skb_core->skb_page = alloc_page(GFP_KERNEL); - INC_METRIC(skb_page_alloc_cycles, get_cycles() - start); + INC_METRIC(skb_page_alloc_ns, sched_clock() - start); if (likely(skb_core->skb_page)) { skb_core->page_size = PAGE_SIZE; goto success; @@ -445,7 +445,7 @@ void homa_skb_free_many_tx(struct homa *homa, struct sk_buff **skbs, int count) #define MAX_PAGES_AT_ONCE 50 #endif struct page *pages_to_cache[MAX_PAGES_AT_ONCE]; - __u64 start = get_cycles(); + __u64 start = sched_clock(); int num_pages = 0; int i, j; @@ -484,7 +484,7 @@ void homa_skb_free_many_tx(struct homa *homa, struct sk_buff **skbs, int count) if (num_pages > 0) homa_skb_cache_pages(homa, pages_to_cache, num_pages); INC_METRIC(skb_frees, count); - INC_METRIC(skb_free_cycles, get_cycles() - start); + INC_METRIC(skb_free_ns, sched_clock() - start); } /** @@ -575,15 +575,13 @@ void homa_skb_release_pages(struct homa *homa) { int i, max_low_mark, min_pages, release, release_max; struct homa_page_pool *max_pool; - __u64 now = get_cycles(); - __s64 interval; + __u64 now = sched_clock(); if (now < homa->skb_page_free_time) return; /* Free pages every 0.5 second. */ - interval = cpu_khz*500; - homa->skb_page_free_time = now + interval; + homa->skb_page_free_time = now + 500000000ULL; release_max = homa->skb_page_frees_per_sec/2; if (homa->pages_to_free_slots < release_max) { if (homa->skb_pages_to_free != NULL) diff --git a/homa_sock.c b/homa_sock.c index b3f274f2..521f0841 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -363,13 +363,13 @@ struct homa_sock *homa_sock_find(struct homa_socktab *socktab, __u16 port) */ void homa_sock_lock_slow(struct homa_sock *hsk) { - __u64 start = get_cycles(); + __u64 start = sched_clock(); tt_record("beginning wait for socket lock"); spin_lock_bh(&hsk->lock); tt_record("ending wait for socket lock"); INC_METRIC(socket_lock_misses, 1); - INC_METRIC(socket_lock_miss_cycles, get_cycles() - start); + INC_METRIC(socket_lock_miss_ns, sched_clock() - start); } /** @@ -383,7 +383,7 @@ void homa_sock_lock_slow(struct homa_sock *hsk) */ void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, __u64 id) { - __u64 start = get_cycles(); + __u64 start = sched_clock(); tt_record2("beginning wait for rpc lock, id %d (bucket %d)", id, bucket->id); @@ -392,9 +392,9 @@ void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, __u64 id) id, bucket->id); if (homa_is_client(id)) { INC_METRIC(client_lock_misses, 1); - INC_METRIC(client_lock_miss_cycles, get_cycles() - start); + INC_METRIC(client_lock_miss_ns, sched_clock() - start); } else { INC_METRIC(server_lock_misses, 1); - INC_METRIC(server_lock_miss_cycles, get_cycles() - start); + INC_METRIC(server_lock_miss_ns, sched_clock() - start); } } diff --git a/homa_timer.c b/homa_timer.c index 6b131a99..36eb8dca 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -169,7 +169,7 @@ void homa_timer(struct homa *homa) int rpc_count = 0; int core; - start = get_cycles(); + start = sched_clock(); homa->timer_ticks++; total_grants = 0; @@ -209,12 +209,12 @@ void homa_timer(struct homa *homa) * isn't keeping up with RPC reaping, so we'll help * out. See reap.txt for more info. 
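		 * Keep calling homa_rpc_reap (up to reap_limit RPCs per
		 * call) until it returns 0, meaning nothing is left to reap.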
*/ - uint64_t start = get_cycles(); + uint64_t start = sched_clock(); tt_record("homa_timer calling homa_rpc_reap"); if (homa_rpc_reap(hsk, hsk->homa->reap_limit) == 0) break; - INC_METRIC(timer_reap_cycles, get_cycles() - start); + INC_METRIC(timer_reap_ns, sched_clock() - start); } if (list_empty(&hsk->active_rpcs) || hsk->shutdown) @@ -259,6 +259,6 @@ void homa_timer(struct homa *homa) total_incoming_rpcs, sum_incoming, sum_incoming_rec, atomic_read(&homa->total_incoming)); homa_skb_release_pages(homa); - end = get_cycles(); - INC_METRIC(timer_cycles, end-start); + end = sched_clock(); + INC_METRIC(timer_ns, end-start); } diff --git a/homa_utils.c b/homa_utils.c index 3eb53a54..f98c816f 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -29,14 +29,14 @@ int homa_init(struct homa *homa) homa->pacer_kthread = NULL; init_completion(&homa_pacer_kthread_done); atomic64_set(&homa->next_outgoing_id, 2); - atomic64_set(&homa->link_idle_time, get_cycles()); + atomic64_set(&homa->link_idle_time, sched_clock()); spin_lock_init(&homa->grantable_lock); homa->grantable_lock_time = 0; atomic_set(&homa->grant_recalc_count, 0); INIT_LIST_HEAD(&homa->grantable_peers); INIT_LIST_HEAD(&homa->grantable_rpcs); homa->num_grantable_rpcs = 0; - homa->last_grantable_change = get_cycles(); + homa->last_grantable_change = sched_clock(); homa->max_grantable_rpcs = 0; homa->oldest_rpc = NULL; homa->num_active_rpcs = 0; @@ -110,7 +110,7 @@ int homa_init(struct homa *homa) } homa->pacer_exit = false; homa->max_nic_queue_ns = 2000; - homa->cycles_per_kbyte = 0; + homa->ns_per_mbyte = 0; homa->verbose = 0; homa->max_gso_size = 10000; homa->gso_force_software = 0; @@ -655,10 +655,8 @@ void homa_spin(int ns) { __u64 end; - end = ns * cpu_khz; - do_div(end, 1000000); - end += get_cycles(); - while (get_cycles() < end) + end = sched_clock() + ns; + while (sched_clock() < end) /* Empty loop body.*/ ; } @@ -672,13 +670,13 @@ void homa_spin(int ns) */ void homa_throttle_lock_slow(struct homa *homa) { - __u64 start = get_cycles(); + __u64 start = sched_clock(); tt_record("beginning wait for throttle lock"); spin_lock_bh(&homa->throttle_lock); tt_record("ending wait for throttle lock"); INC_METRIC(throttle_lock_misses, 1); - INC_METRIC(throttle_lock_miss_cycles, get_cycles() - start); + INC_METRIC(throttle_lock_miss_ns, sched_clock() - start); } /** diff --git a/test/mock.c b/test/mock.c index 95593d9c..30677b2b 100644 --- a/test/mock.c +++ b/test/mock.c @@ -125,10 +125,17 @@ static int mock_active_locks; static int mock_active_rcu_locks; /* Used as the return value for calls to get_cycles. A value of ~0 means - * return actual clock time. + * return actual clock time. Shouldn't be used much anymore (get_cycles + * shouldn't be used). */ cycles_t mock_cycles; +/* Used as the return value for calls to sched_clock. */ +__u64 mock_ns; + +/* Add this value to mock_ns every time sched_clock is invoked. */ +__u64 mock_ns_tick; + /* Indicates whether we should be simulation IPv6 or IPv4 in the * current test. Can be overridden by a test. 
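 * (mock_teardown resets it to mock_ipv6_default after each test.)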
*/ @@ -936,6 +943,12 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) {} +__u64 sched_clock(void) +{ + mock_ns += mock_ns_tick; + return mock_ns; +} + void schedule(void) { UNIT_HOOK("schedule"); @@ -1510,6 +1523,8 @@ void mock_teardown(void) mock_copy_to_user_errors = 0; mock_cpu_idle = 0; mock_cycles = 0; + mock_ns = 0; + mock_ns_tick = 0; mock_ipv6 = mock_ipv6_default; mock_import_ubuf_errors = 0; mock_import_iovec_errors = 0; diff --git a/test/mock.h b/test/mock.h index 20dcb2bf..89ed992b 100644 --- a/test/mock.h +++ b/test/mock.h @@ -28,6 +28,8 @@ extern int mock_max_skb_frags; extern int mock_mtu; extern struct net_device mock_net_device; +extern __u64 mock_ns; +extern __u64 mock_ns_tick; extern int mock_numa_mask; extern int mock_page_nid_mask; extern int mock_route_errors; diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 3448ac6a..423713b9 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -32,7 +32,7 @@ static void grantable_spinlock_hook(char *id) return; if (hook_homa != NULL) atomic_inc(&hook_homa->grant_recalc_count); - mock_cycles = 1000; + mock_ns = 1000; } FIXTURE(homa_grant) { @@ -66,7 +66,7 @@ FIXTURE_SETUP(homa_grant) self->server_id = 1235; homa_init(&self->homa); self->homa.num_priorities = 1; - self->homa.poll_cycles = 0; + self->homa.poll_usecs = 0; self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; self->homa.pacer_fifo_fraction = 0; self->homa.grant_fifo_fraction = 0; @@ -190,7 +190,7 @@ TEST_F(homa_grant, homa_grant_add_rpc__update_metrics) { self->homa.last_grantable_change = 100; self->homa.num_grantable_rpcs = 3; - mock_cycles = 200; + mock_ns = 200; test_rpc(self, 100, self->server_ip, 100000); EXPECT_EQ(4, self->homa.num_grantable_rpcs); EXPECT_EQ(300, homa_metrics_per_cpu()->grantable_rpcs_integral); @@ -340,7 +340,7 @@ TEST_F(homa_grant, homa_grant_remove_rpc__update_metrics) EXPECT_EQ(1, self->homa.num_grantable_rpcs); self->homa.last_grantable_change = 100; self->homa.num_grantable_rpcs = 3; - mock_cycles = 200; + mock_ns = 200; homa_grant_remove_rpc(rpc); EXPECT_EQ(2, self->homa.num_grantable_rpcs); @@ -752,7 +752,7 @@ TEST_F(homa_grant, homa_grant_recalc__basics) rpc4 = test_rpc(self, 106, self->server_ip+1, 35000); self->homa.max_incoming = 100000; self->homa.max_overcommit = 3; - mock_cycles = ~0; + mock_ns_tick = 10; unit_log_clear(); homa_grant_recalc(&self->homa, 0); @@ -773,7 +773,7 @@ TEST_F(homa_grant, homa_grant_recalc__basics) EXPECT_EQ(2, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(-1, atomic_read(&rpc4->msgin.rank)); - EXPECT_NE(0, homa_metrics_per_cpu()->grant_recalc_cycles); + EXPECT_NE(0, homa_metrics_per_cpu()->grant_recalc_ns); } TEST_F(homa_grant, homa_grant_recalc__already_locked) { @@ -1016,7 +1016,7 @@ TEST_F(homa_grant, homa_grant_pick_rpcs__first_rpc_of_peer_doesnt_fit) TEST_F(homa_grant, homa_grant_find_oldest__basics) { - mock_cycles = ~0; + mock_ns_tick = 10; unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, 11, 40000, 100); unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+1, @@ -1033,7 +1033,7 @@ TEST_F(homa_grant, homa_grant_find_oldest__fifo_grant_unused) { struct homa_rpc *srpc1, *srpc2; - mock_cycles = ~0; + mock_ns_tick = 10; srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, 11, 400000, 100); srpc2 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+1, @@ -1111,18 +1111,18 @@ TEST_F(homa_grant, 
homa_grant_free_rpc__not_in_active_list) TEST_F(homa_grant, homa_grantable_lock_slow__basics) { - mock_cycles = 500; + mock_ns = 500; unit_hook_register(grantable_spinlock_hook); EXPECT_EQ(1, homa_grantable_lock_slow(&self->homa, 0)); homa_grantable_unlock(&self->homa); EXPECT_EQ(1, homa_metrics_per_cpu()->grantable_lock_misses); - EXPECT_EQ(500, homa_metrics_per_cpu()->grantable_lock_miss_cycles); + EXPECT_EQ(500, homa_metrics_per_cpu()->grantable_lock_miss_ns); } TEST_F(homa_grant, homa_grantable_lock_slow__recalc_count) { - mock_cycles = 500; + mock_ns = 500; unit_hook_register(grantable_spinlock_hook); hook_homa = &self->homa; mock_trylock_errors = 0xff; @@ -1131,7 +1131,7 @@ TEST_F(homa_grant, homa_grantable_lock_slow__recalc_count) hook_homa = NULL; EXPECT_EQ(1, homa_metrics_per_cpu()->grantable_lock_misses); - EXPECT_EQ(500, homa_metrics_per_cpu()->grantable_lock_miss_cycles); + EXPECT_EQ(500, homa_metrics_per_cpu()->grantable_lock_miss_ns); /* Make sure the check only occurs if the recalc argument is set. */ mock_trylock_errors = 0xff; diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index ef4c64b1..3e3b14e0 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -148,7 +148,7 @@ FIXTURE_SETUP(homa_incoming) self->server_id = 1235; homa_init(&self->homa); self->homa.num_priorities = 1; - self->homa.poll_cycles = 0; + self->homa.poll_usecs = 0; self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; self->homa.pacer_fifo_fraction = 0; self->homa.grant_fifo_fraction = 0; @@ -257,7 +257,7 @@ TEST_F(homa_incoming, homa_add_packet__basics) homa_message_in_init(crpc, 10000, 0); unit_log_clear(); - mock_cycles = 5000; + mock_ns = 5000; self->data.seg.offset = htonl(1400); homa_add_packet(crpc, mock_skb_new(self->client_ip, &self->data.common, 1400, 1400)); @@ -527,7 +527,7 @@ TEST_F(homa_incoming, homa_add_packet__packet_in_middle_of_gap) homa_message_in_init(crpc, 10000, 0); unit_log_clear(); - mock_cycles = 1000; + mock_ns = 1000; self->data.seg.offset = htonl(0); homa_add_packet(crpc, mock_skb_new(self->client_ip, &self->data.common, 1400, 0)); @@ -539,7 +539,7 @@ TEST_F(homa_incoming, homa_add_packet__packet_in_middle_of_gap) unit_print_gaps(crpc)); self->data.seg.offset = htonl(2000); - mock_cycles = 2000; + mock_ns = 2000; homa_add_packet(crpc, mock_skb_new(self->client_ip, &self->data.common, 1400, 2000)); EXPECT_EQ(3, skb_queue_len(&crpc->msgin.packets)); @@ -1074,6 +1074,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__forced_reap) UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 20000); struct homa_rpc *srpc; + mock_ns_tick = 10; homa_rpc_free(dead); EXPECT_EQ(31, self->hsk.dead_skbs); @@ -1082,14 +1083,13 @@ TEST_F(homa_incoming, homa_dispatch_pkts__forced_reap) 10000, 5000); ASSERT_NE(NULL, srpc); self->homa.dead_buffs_limit = 16; - mock_cycles = ~0; /* First packet: below the threshold for reaps. */ self->data.common.dport = htons(self->hsk.port); homa_dispatch_pkts(mock_skb_new(self->client_ip, &self->data.common, 1400, 0), &self->homa); EXPECT_EQ(31, self->hsk.dead_skbs); - EXPECT_EQ(0, homa_metrics_per_cpu()->data_pkt_reap_cycles); + EXPECT_EQ(0, homa_metrics_per_cpu()->data_pkt_reap_ns); /* Second packet: must reap. 
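	 * dead_buffs_limit (15) is now below hsk->dead_skbs (31), so
	 * homa_dispatch_pkts must invoke the reaper.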
*/ self->homa.dead_buffs_limit = 15; @@ -1097,7 +1097,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__forced_reap) homa_dispatch_pkts(mock_skb_new(self->client_ip, &self->data.common, 1400, 0), &self->homa); EXPECT_EQ(21, self->hsk.dead_skbs); - EXPECT_NE(0, homa_metrics_per_cpu()->data_pkt_reap_cycles); + EXPECT_NE(0, homa_metrics_per_cpu()->data_pkt_reap_ns); } TEST_F(homa_incoming, homa_data_pkt__basics) @@ -2214,7 +2214,6 @@ TEST_F(homa_incoming, homa_wait_for_message__rpc_arrives_while_polling) ASSERT_NE(NULL, crpc1); hook_rpc = crpc1; poll_count = 5; - self->homa.poll_cycles = 1000000; unit_hook_register(poll_hook); unit_log_clear(); rpc = homa_wait_for_message(&self->hsk, 0, self->client_id); @@ -2454,8 +2453,8 @@ TEST_F(homa_incoming, homa_choose_interest__find_idle_core) interest3.core = 3; list_add_tail(&interest3.request_links, &self->hsk.request_interests); - mock_cycles = 5000; - self->homa.busy_cycles = 1000; + mock_ns = 5000; + self->homa.busy_ns = 1000; per_cpu(homa_offload_core, 1).last_active = 4100; per_cpu(homa_offload_core, 2).last_active = 3500; per_cpu(homa_offload_core, 3).last_active = 2000; @@ -2481,8 +2480,8 @@ TEST_F(homa_incoming, homa_choose_interest__all_cores_busy) interest3.core = 3; list_add_tail(&interest3.request_links, &self->hsk.request_interests); - mock_cycles = 5000; - self->homa.busy_cycles = 1000; + mock_ns = 5000; + self->homa.busy_ns = 1000; per_cpu(homa_offload_core, 1).last_active = 4100; per_cpu(homa_offload_core, 2).last_active = 4001; per_cpu(homa_offload_core, 3).last_active = 4800; @@ -2683,7 +2682,7 @@ TEST_F(homa_incoming, homa_rpc_handoff__update_last_app_active) interest.reg_rpc = crpc; interest.core = 2; crpc->interest = &interest; - mock_cycles = 10000; + mock_ns = 10000; per_cpu(homa_offload_core, 2).last_app_active = 444; homa_rpc_handoff(crpc); EXPECT_STREQ("wake_up_process pid 0", unit_log_get()); @@ -2692,13 +2691,6 @@ TEST_F(homa_incoming, homa_rpc_handoff__update_last_app_active) } TEST_F(homa_incoming, homa_incoming_sysctl_changed__grant_nonfifo) -{ - cpu_khz = 2000000; - self->homa.poll_usecs = 40; - homa_incoming_sysctl_changed(&self->homa); - EXPECT_EQ(80000, self->homa.poll_cycles); -} -TEST_F(homa_incoming, homa_incoming_sysctl_changed__poll_cycles) { self->homa.fifo_grant_increment = 10000; self->homa.grant_fifo_fraction = 0; @@ -2717,3 +2709,25 @@ TEST_F(homa_incoming, homa_incoming_sysctl_changed__poll_cycles) homa_incoming_sysctl_changed(&self->homa); EXPECT_EQ(10000, self->homa.grant_nonfifo); } +TEST_F(homa_incoming, homa_incoming_sysctl_changed__limit_on_max_overcommit) +{ + self->homa.max_overcommit = 2; + homa_incoming_sysctl_changed(&self->homa); + EXPECT_EQ(2, self->homa.max_overcommit); + + self->homa.max_overcommit = HOMA_MAX_GRANTS; + homa_incoming_sysctl_changed(&self->homa); + EXPECT_EQ(HOMA_MAX_GRANTS, self->homa.max_overcommit); + + self->homa.max_overcommit = HOMA_MAX_GRANTS+1; + homa_incoming_sysctl_changed(&self->homa); + EXPECT_EQ(HOMA_MAX_GRANTS, self->homa.max_overcommit); +} +TEST_F(homa_incoming, homa_incoming_sysctl_changed__convert_usec_to_ns) +{ + self->homa.busy_usecs = 53; + self->homa.gro_busy_usecs = 140; + homa_incoming_sysctl_changed(&self->homa); + EXPECT_EQ(53000, self->homa.busy_ns); + EXPECT_EQ(140000, self->homa.gro_busy_ns); +} diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index 52179d1d..b8897825 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -89,8 +89,8 @@ FIXTURE_SETUP(homa_offload) unit_log_clear(); /* Configure so core isn't 
considered too busy for bypasses. */ - mock_cycles = 1000; - self->homa.gro_busy_cycles = 500; + mock_ns = 1000; + self->homa.gro_busy_ns = 500; cur_offload_core->last_gro = 400; } FIXTURE_TEARDOWN(homa_offload) @@ -521,8 +521,8 @@ TEST_F(homa_offload, homa_gro_receive__max_gro_skbs) TEST_F(homa_offload, homa_gro_gen2) { homa->gro_policy = HOMA_GRO_GEN2; - mock_cycles = 1000; - homa->busy_cycles = 100; + mock_ns = 1000; + homa->busy_ns = 100; mock_set_core(5); atomic_set(&per_cpu(homa_offload_core, 6).softirq_backlog, 1); per_cpu(homa_offload_core, 6).last_gro = 0; @@ -569,8 +569,8 @@ TEST_F(homa_offload, homa_gro_gen3__basics) offload3->last_app_active = 4100; offload7->last_app_active = 3900; offload5->last_app_active = 2000; - mock_cycles = 5000; - self->homa.busy_cycles = 1000; + mock_ns = 5000; + self->homa.busy_ns = 1000; homa_gro_complete(self->skb, 0); EXPECT_EQ(7, self->skb->hash - 32); @@ -587,8 +587,8 @@ TEST_F(homa_offload, homa_gro_gen3__stop_on_negative_core_id) offload_core->gen3_softirq_cores[2] = 5; per_cpu(homa_offload_core, 3).last_app_active = 4100; per_cpu(homa_offload_core, 5).last_app_active = 2000; - mock_cycles = 5000; - self->homa.busy_cycles = 1000; + mock_ns = 5000; + self->homa.busy_ns = 1000; homa_gro_complete(self->skb, 0); EXPECT_EQ(3, self->skb->hash - 32); @@ -605,8 +605,8 @@ TEST_F(homa_offload, homa_gro_gen3__all_cores_busy_so_pick_first) per_cpu(homa_offload_core, 3).last_app_active = 4100; per_cpu(homa_offload_core, 7).last_app_active = 4001; per_cpu(homa_offload_core, 5).last_app_active = 4500; - mock_cycles = 5000; - self->homa.busy_cycles = 1000; + mock_ns = 5000; + self->homa.busy_ns = 1000; homa_gro_complete(self->skb, 0); EXPECT_EQ(3, self->skb->hash - 32); diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 612139ab..49245fe5 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -54,9 +54,9 @@ FIXTURE_SETUP(homa_outgoing) self->client_id = 1234; self->server_id = 1235; homa_init(&self->homa); - mock_cycles = 10000; + mock_ns = 10000; atomic64_set(&self->homa.link_idle_time, 10000); - self->homa.cycles_per_kbyte = 1000; + self->homa.ns_per_mbyte = 1000000; self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; mock_sock_init(&self->hsk, &self->homa, self->client_port); self->server_addr.in6.sin6_family = AF_INET; @@ -657,7 +657,7 @@ TEST_F(homa_outgoing, homa_xmit_data__below_throttle_min) unit_log_clear(); atomic64_set(&self->homa.link_idle_time, 11000); - self->homa.max_nic_queue_cycles = 500; + self->homa.max_nic_queue_ns = 500; self->homa.throttle_min_bytes = 250; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; homa_xmit_data(crpc, false); @@ -677,7 +677,7 @@ TEST_F(homa_outgoing, homa_xmit_data__force) /* First, get an RPC on the throttled list. 
*/ atomic64_set(&self->homa.link_idle_time, 11000); - self->homa.max_nic_queue_cycles = 3000; + self->homa.max_nic_queue_ns = 3000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; homa_xmit_data(crpc1, false); unit_log_clear(); @@ -702,7 +702,7 @@ TEST_F(homa_outgoing, homa_xmit_data__throttle) unit_log_clear(); atomic64_set(&self->homa.link_idle_time, 11000); - self->homa.max_nic_queue_cycles = 3000; + self->homa.max_nic_queue_ns = 3000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; homa_xmit_data(crpc, false); @@ -929,20 +929,15 @@ TEST_F(homa_outgoing, homa_outgoing_sysctl_changed) { self->homa.link_mbps = 10000; homa_outgoing_sysctl_changed(&self->homa); - EXPECT_EQ(808, self->homa.cycles_per_kbyte); + EXPECT_EQ(808000, self->homa.ns_per_mbyte); self->homa.link_mbps = 1000; homa_outgoing_sysctl_changed(&self->homa); - EXPECT_EQ(8080, self->homa.cycles_per_kbyte); + EXPECT_EQ(8080000, self->homa.ns_per_mbyte); self->homa.link_mbps = 40000; homa_outgoing_sysctl_changed(&self->homa); - EXPECT_EQ(202, self->homa.cycles_per_kbyte); - - self->homa.max_nic_queue_ns = 200; - cpu_khz = 2000000; - homa_outgoing_sysctl_changed(&self->homa); - EXPECT_EQ(400, self->homa.max_nic_queue_cycles); + EXPECT_EQ(202000, self->homa.ns_per_mbyte); } TEST_F(homa_outgoing, homa_check_nic_queue__basics) @@ -954,8 +949,8 @@ TEST_F(homa_outgoing, homa_check_nic_queue__basics) homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; unit_log_clear(); atomic64_set(&self->homa.link_idle_time, 9000); - mock_cycles = 8000; - self->homa.max_nic_queue_cycles = 1000; + mock_ns = 8000; + self->homa.max_nic_queue_ns = 1000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; EXPECT_EQ(1, homa_check_nic_queue(&self->homa, crpc->msgout.packets, false)); @@ -970,8 +965,8 @@ TEST_F(homa_outgoing, homa_check_nic_queue__queue_full) homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; unit_log_clear(); atomic64_set(&self->homa.link_idle_time, 9000); - mock_cycles = 7999; - self->homa.max_nic_queue_cycles = 1000; + mock_ns = 7999; + self->homa.max_nic_queue_ns = 1000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; EXPECT_EQ(0, homa_check_nic_queue(&self->homa, crpc->msgout.packets, false)); @@ -986,8 +981,8 @@ TEST_F(homa_outgoing, homa_check_nic_queue__queue_full_but_force) homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; unit_log_clear(); atomic64_set(&self->homa.link_idle_time, 9000); - mock_cycles = 7999; - self->homa.max_nic_queue_cycles = 1000; + mock_ns = 7999; + self->homa.max_nic_queue_ns = 1000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; EXPECT_EQ(1, homa_check_nic_queue(&self->homa, crpc->msgout.packets, true)); @@ -1004,14 +999,14 @@ TEST_F(homa_outgoing, homa_check_nic_queue__pacer_metrics) unit_log_clear(); atomic64_set(&self->homa.link_idle_time, 9000); self->homa.pacer_wake_time = 9800; - mock_cycles = 10000; - self->homa.max_nic_queue_cycles = 1000; + mock_ns = 10000; + self->homa.max_nic_queue_ns = 1000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; EXPECT_EQ(1, homa_check_nic_queue(&self->homa, crpc->msgout.packets, true)); EXPECT_EQ(10500, atomic64_read(&self->homa.link_idle_time)); EXPECT_EQ(500, homa_metrics_per_cpu()->pacer_bytes); - EXPECT_EQ(200, homa_metrics_per_cpu()->pacer_lost_cycles); + EXPECT_EQ(200, homa_metrics_per_cpu()->pacer_lost_ns); } TEST_F(homa_outgoing, homa_check_nic_queue__queue_empty) { @@ -1022,8 +1017,8 @@ TEST_F(homa_outgoing, homa_check_nic_queue__queue_empty) homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; unit_log_clear(); 
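	/* The queue went idle in the past (idle time < current time), so
	 * the new idle time must be based on the current time, not the
	 * stale idle time.
	 */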
	atomic64_set(&self->homa.link_idle_time, 9000);
-	mock_cycles = 10000;
-	self->homa.max_nic_queue_cycles = 1000;
+	mock_ns = 10000;
+	self->homa.max_nic_queue_ns = 1000;
	self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE;
	EXPECT_EQ(1, homa_check_nic_queue(&self->homa, crpc->msgout.packets,
			true));
@@ -1050,7 +1045,7 @@ TEST_F(homa_outgoing, homa_pacer_xmit__basics)
	homa_add_to_throttled(crpc1);
	homa_add_to_throttled(crpc2);
	homa_add_to_throttled(crpc3);
-	self->homa.max_nic_queue_cycles = 2000;
+	self->homa.max_nic_queue_ns = 2000;
	self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE;
	unit_log_clear();
	homa_pacer_xmit(&self->homa);
@@ -1066,13 +1061,13 @@ TEST_F(homa_outgoing, homa_pacer_xmit__xmit_fifo)
 {
	struct homa_rpc *crpc1, *crpc2, *crpc3;
-	mock_cycles = 10000;
+	mock_ns = 10000;
	crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip,
			self->server_ip, self->server_port, 2, 20000, 1000);
-	mock_cycles = 11000;
+	mock_ns = 11000;
	crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip,
			self->server_ip, self->server_port, 4, 10000, 1000);
-	mock_cycles = 12000;
+	mock_ns = 12000;
	crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip,
			self->server_ip, self->server_port, 6, 30000, 1000);
	homa_add_to_throttled(crpc1);
@@ -1080,10 +1075,10 @@ TEST_F(homa_outgoing, homa_pacer_xmit__xmit_fifo)
	homa_add_to_throttled(crpc3);
	/* First attempt: pacer_fifo_count doesn't reach zero. */
-	self->homa.max_nic_queue_cycles = 1300;
+	self->homa.max_nic_queue_ns = 1300;
	self->homa.pacer_fifo_count = 200;
	self->homa.pacer_fifo_fraction = 150;
-	mock_cycles = 13000;
+	mock_ns = 13000;
	atomic64_set(&self->homa.link_idle_time, 10000);
	self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE;
	unit_log_clear();
@@ -1119,7 +1114,7 @@ TEST_F(homa_outgoing, homa_pacer_xmit__pacer_busy)
			10000, 1000);
	homa_add_to_throttled(crpc);
-	self->homa.max_nic_queue_cycles = 2000;
+	self->homa.max_nic_queue_ns = 2000;
	self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE;
	mock_trylock_errors = 1;
	unit_log_clear();
@@ -1131,7 +1126,7 @@
 }
 TEST_F(homa_outgoing, homa_pacer_xmit__queue_empty)
 {
-	self->homa.max_nic_queue_cycles = 2000;
+	self->homa.max_nic_queue_ns = 2000;
	self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE;
	unit_log_clear();
	homa_pacer_xmit(&self->homa);
@@ -1146,8 +1141,8 @@ TEST_F(homa_outgoing, homa_pacer_xmit__nic_queue_fills)
			10000, 1000);
	homa_add_to_throttled(crpc);
-	self->homa.max_nic_queue_cycles = 2001;
-	mock_cycles = 10000;
+	self->homa.max_nic_queue_ns = 2001;
+	mock_ns = 10000;
	atomic64_set(&self->homa.link_idle_time, 12000);
	self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE;
	unit_log_clear();
@@ -1165,7 +1160,7 @@ TEST_F(homa_outgoing, homa_pacer_xmit__rpc_locked)
			5000, 1000);
	homa_add_to_throttled(crpc);
-	self->homa.max_nic_queue_cycles = 2000;
+	self->homa.max_nic_queue_ns = 2000;
	self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE;
	unit_log_clear();
	mock_trylock_errors = ~1;
@@ -1191,7 +1186,7 @@ TEST_F(homa_outgoing, homa_pacer_xmit__remove_from_queue)
	homa_add_to_throttled(crpc1);
	homa_add_to_throttled(crpc2);
-	self->homa.max_nic_queue_cycles = 2000;
+	self->homa.max_nic_queue_ns = 2000;
	self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE;
	unit_log_clear();
	homa_pacer_xmit(&self->homa);
diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c
index c076be2d..d2c49480 100644
--- a/test/unit_homa_peer.c
+++ b/test/unit_homa_peer.c
@@ -54,7 +54,7 @@ static void peer_spinlock_hook(char *id)
 {
	if (strcmp(id, "spin_lock") != 0)
		return;
-	mock_cycles += 1000;
+
mock_ns += 1000; } TEST_F(homa_peer, homa_peer_find__basics) @@ -109,11 +109,11 @@ TEST_F(homa_peer, homa_peertab_gc_dsts) struct homa_peer *peer; peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); - mock_cycles = 0; + mock_ns = 0; homa_dst_refresh(&self->peertab, peer, &self->hsk); - mock_cycles = 50000000; + mock_ns = 50000000; homa_dst_refresh(&self->peertab, peer, &self->hsk); - mock_cycles = 100000000; + mock_ns = 100000000; homa_dst_refresh(&self->peertab, peer, &self->hsk); EXPECT_EQ(3, dead_count(&self->peertab)); @@ -267,11 +267,11 @@ TEST_F(homa_peer, homa_dst_refresh__free_old_dsts) ASSERT_NE(NULL, peer); EXPECT_EQ_IP(*ip1111, peer->addr); - mock_cycles = 0; + mock_ns = 0; homa_dst_refresh(self->homa.peers, peer, &self->hsk); homa_dst_refresh(self->homa.peers, peer, &self->hsk); EXPECT_EQ(2, dead_count(self->homa.peers)); - mock_cycles = 500000000; + mock_ns = 500000000; homa_dst_refresh(self->homa.peers, peer, &self->hsk); EXPECT_EQ(1, dead_count(self->homa.peers)); } @@ -337,17 +337,17 @@ TEST_F(homa_peer, homa_peer_lock_slow) &self->hsk.inet); ASSERT_NE(NULL, peer); - mock_cycles = 10000; + mock_ns = 10000; homa_peer_lock(peer); EXPECT_EQ(0, homa_metrics_per_cpu()->peer_ack_lock_misses); - EXPECT_EQ(0, homa_metrics_per_cpu()->peer_ack_lock_miss_cycles); + EXPECT_EQ(0, homa_metrics_per_cpu()->peer_ack_lock_miss_ns); homa_peer_unlock(peer); mock_trylock_errors = 1; unit_hook_register(peer_spinlock_hook); homa_peer_lock(peer); EXPECT_EQ(1, homa_metrics_per_cpu()->peer_ack_lock_misses); - EXPECT_EQ(1000, homa_metrics_per_cpu()->peer_ack_lock_miss_cycles); + EXPECT_EQ(1000, homa_metrics_per_cpu()->peer_ack_lock_miss_ns); homa_peer_unlock(peer); } diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index 1ad35fc5..43109255 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -44,11 +44,11 @@ static void steal_bpages_hook(char *id) case 2: atomic_set(&cur_pool->descriptors[1].refs, 1); cur_pool->descriptors[1].owner = 3; - cur_pool->descriptors[1].expiration = mock_cycles + 1; + cur_pool->descriptors[1].expiration = mock_ns + 1; case 3: atomic_set(&cur_pool->descriptors[2].refs, 1); cur_pool->descriptors[2].owner = 3; - cur_pool->descriptors[2].expiration = mock_cycles - 1; + cur_pool->descriptors[2].expiration = mock_ns - 1; case 4: atomic_set(&cur_pool->descriptors[3].refs, 1); } @@ -168,14 +168,14 @@ TEST_F(homa_pool, homa_pool_get_pages__skip_unusable_bpages) struct homa_pool *pool = self->hsk.buffer_pool; __u32 pages[10]; - mock_cycles = 1000; + mock_ns = 1000; atomic_set(&pool->descriptors[0].refs, 2); atomic_set(&pool->descriptors[1].refs, 1); pool->descriptors[1].owner = 3; - pool->descriptors[1].expiration = mock_cycles + 1; + pool->descriptors[1].expiration = mock_ns + 1; atomic_set(&pool->descriptors[2].refs, 1); pool->descriptors[2].owner = 3; - pool->descriptors[2].expiration = mock_cycles - 1; + pool->descriptors[2].expiration = mock_ns - 1; atomic_set(&pool->descriptors[3].refs, 1); EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); EXPECT_EQ(2, pages[0]); @@ -186,7 +186,7 @@ TEST_F(homa_pool, homa_pool_get_pages__cant_lock_pages) struct homa_pool *pool = self->hsk.buffer_pool; __u32 pages[10]; - mock_cycles = 1000; + mock_ns = 1000; mock_trylock_errors = 3; EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); EXPECT_EQ(2, pages[0]); @@ -197,7 +197,7 @@ TEST_F(homa_pool, homa_pool_get_pages__state_changes_while_locking) struct homa_pool *pool = self->hsk.buffer_pool; __u32 pages[10]; - mock_cycles = 1000; + mock_ns = 1000; 
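	/* steal_bpages_hook mutates bpage state while locking is in
	 * progress, simulating interference from other cores.
	 */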
unit_hook_register(steal_bpages_hook); EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); EXPECT_EQ(2, pages[0]); @@ -209,8 +209,8 @@ TEST_F(homa_pool, homa_pool_get_pages__steal_expired_page) __u32 pages[10]; pool->descriptors[0].owner = 5; - mock_cycles = 5000; - pool->descriptors[0].expiration = mock_cycles - 1; + mock_ns = 5000; + pool->descriptors[0].expiration = mock_ns - 1; atomic_set(&pool->free_bpages, 20); EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); EXPECT_EQ(0, pages[0]); @@ -223,11 +223,11 @@ TEST_F(homa_pool, homa_pool_get_pages__set_owner) struct homa_pool *pool = self->hsk.buffer_pool; __u32 pages[10]; - self->homa.bpage_lease_cycles = 1000; - mock_cycles = 5000; + self->homa.bpage_lease_usecs = 1; + mock_ns = 5000; EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 1)); EXPECT_EQ(1, pool->descriptors[pages[0]].owner); - EXPECT_EQ(mock_cycles + 1000, + EXPECT_EQ(mock_ns + 1000, pool->descriptors[pages[1]].expiration); EXPECT_EQ(2, atomic_read(&pool->descriptors[1].refs)); } diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 23aa15aa..17e111e0 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -270,7 +270,7 @@ TEST_F(homa_rpc, homa_bucket_lock_slow) struct homa_rpc *crpc, *srpc; int created; - mock_cycles = ~0; + mock_ns_tick = 10; crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); ASSERT_FALSE(IS_ERR(crpc)); homa_rpc_free(crpc); @@ -281,17 +281,17 @@ TEST_F(homa_rpc, homa_bucket_lock_slow) homa_rpc_unlock(srpc); EXPECT_EQ(0, homa_metrics_per_cpu()->client_lock_misses); - EXPECT_EQ(0, homa_metrics_per_cpu()->client_lock_miss_cycles); + EXPECT_EQ(0, homa_metrics_per_cpu()->client_lock_miss_ns); homa_bucket_lock_slow(crpc->bucket, crpc->id); homa_rpc_unlock(crpc); EXPECT_EQ(1, homa_metrics_per_cpu()->client_lock_misses); - EXPECT_NE(0, homa_metrics_per_cpu()->client_lock_miss_cycles); + EXPECT_NE(0, homa_metrics_per_cpu()->client_lock_miss_ns); EXPECT_EQ(0, homa_metrics_per_cpu()->server_lock_misses); - EXPECT_EQ(0, homa_metrics_per_cpu()->server_lock_miss_cycles); + EXPECT_EQ(0, homa_metrics_per_cpu()->server_lock_miss_ns); homa_bucket_lock_slow(srpc->bucket, srpc->id); homa_rpc_unlock(srpc); EXPECT_EQ(1, homa_metrics_per_cpu()->server_lock_misses); - EXPECT_NE(0, homa_metrics_per_cpu()->server_lock_miss_cycles); + EXPECT_EQ(10, homa_metrics_per_cpu()->server_lock_miss_ns); } TEST_F(homa_rpc, homa_rpc_acked__basics) @@ -632,7 +632,7 @@ TEST_F(homa_rpc, homa_rpc_reap__free_gaps) ASSERT_NE(NULL, crpc); homa_gap_new(&crpc->msgin.gaps, 1000, 2000); - mock_cycles = 1000; + mock_ns = 1000; homa_gap_new(&crpc->msgin.gaps, 5000, 6000); EXPECT_STREQ("start 1000, end 2000; start 5000, end 6000, time 1000", diff --git a/test/unit_homa_skb.c b/test/unit_homa_skb.c index 4806dcbe..cae8a987 100644 --- a/test/unit_homa_skb.c +++ b/test/unit_homa_skb.c @@ -337,20 +337,20 @@ TEST_F(homa_skb, homa_skb_page_alloc__new_large_page) { struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); - mock_cycles = ~0; + mock_ns_tick = 100; EXPECT_EQ(0, skb_core->pool->avail); EXPECT_EQ(0, skb_core->num_stashed_pages); EXPECT_TRUE(homa_skb_page_alloc(&self->homa, skb_core)); EXPECT_NE(NULL, skb_core->skb_page); EXPECT_EQ(HOMA_SKB_PAGE_SIZE, skb_core->page_size); EXPECT_EQ(1, homa_metrics_per_cpu()->skb_page_allocs); - EXPECT_NE(0, homa_metrics_per_cpu()->skb_page_alloc_cycles); + EXPECT_EQ(100, homa_metrics_per_cpu()->skb_page_alloc_ns); } TEST_F(homa_skb, homa_skb_page_alloc__high_order_page_not_available) { struct homa_skb_core *skb_core = 
get_skb_core(2);
-	mock_cycles = ~0;
+	mock_ns_tick = 50;
	mock_alloc_page_errors = 1;
	EXPECT_TRUE(homa_skb_page_alloc(&self->homa, skb_core));
	EXPECT_NE(NULL, skb_core->skb_page);
@@ -358,7 +358,7 @@ TEST_F(homa_skb, homa_skb_page_alloc__high_order_page_not_available)
	EXPECT_EQ(PAGE_SIZE, skb_core->page_size);
	EXPECT_EQ(0, skb_core->page_inuse);
	EXPECT_EQ(1, homa_metrics_per_cpu()->skb_page_allocs);
-	EXPECT_NE(0, homa_metrics_per_cpu()->skb_page_alloc_cycles);
+	EXPECT_EQ(50, homa_metrics_per_cpu()->skb_page_alloc_ns);
 }
 TEST_F(homa_skb, homa_skb_page_alloc__no_pages_available)
 {
@@ -660,7 +660,7 @@ TEST_F(homa_skb, homa_skb_get)
 TEST_F(homa_skb, homa_skb_release_pages__basics)
 {
	EXPECT_EQ(0UL, self->homa.skb_page_free_time);
-	mock_cycles = 1000000;
+	mock_ns = 1000000;
	self->homa.skb_page_free_time = 500000;
	self->homa.skb_page_frees_per_sec = 10;
	self->homa.skb_page_pool_min_kb = 0;
@@ -677,7 +677,7 @@ TEST_F(homa_skb, homa_skb_release_pages__basics)
 TEST_F(homa_skb, homa_skb_release_pages__not_time_to_free)
 {
	EXPECT_EQ(0UL, self->homa.skb_page_free_time);
-	mock_cycles = 1000000;
+	mock_ns = 1000000;
	self->homa.skb_page_free_time = 1000001;
	self->homa.skb_page_frees_per_sec = 10;
	self->homa.skb_page_pool_min_kb = 0;
@@ -689,7 +689,7 @@ TEST_F(homa_skb, homa_skb_release_pages__not_time_to_free)
 TEST_F(homa_skb, homa_skb_release_pages__allocate_skb_pages_to_free)
 {
	EXPECT_EQ(0, self->homa.pages_to_free_slots);
-	mock_cycles = 1000000;
+	mock_ns = 1000000;
	self->homa.skb_page_frees_per_sec = 10;
	self->homa.skb_page_free_time = 500000;
@@ -706,7 +706,7 @@ TEST_F(homa_skb, homa_skb_release_pages__allocate_skb_pages_to_free)
 TEST_F(homa_skb, homa_skb_release_pages__limited_by_min_kb)
 {
	EXPECT_EQ(0UL, self->homa.skb_page_free_time);
-	mock_cycles = 1000000;
+	mock_ns = 1000000;
	self->homa.skb_page_free_time = 500000;
	self->homa.skb_page_frees_per_sec = 20;
	self->homa.skb_page_pool_min_kb = (5 * HOMA_SKB_PAGE_SIZE) / 1000;
@@ -719,7 +719,7 @@ TEST_F(homa_skb, homa_skb_release_pages__limited_by_min_kb)
 TEST_F(homa_skb, homa_skb_release_pages__empty_pool)
 {
	EXPECT_EQ(0UL, self->homa.skb_page_free_time);
-	mock_cycles = 2000000;
+	mock_ns = 2000000;
	self->homa.skb_page_free_time = 500000;
	self->homa.skb_page_frees_per_sec = 1000;
	self->homa.skb_page_pool_min_kb = 0;
diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c
index 5f60f73b..667c32df 100644
--- a/test/unit_homa_sock.c
+++ b/test/unit_homa_sock.c
@@ -371,16 +371,16 @@ TEST_F(homa_sock, homa_sock_find__long_hash_chain)
 TEST_F(homa_sock, homa_sock_lock_slow)
 {
-	mock_cycles = ~0;
+	mock_ns_tick = 100;
	homa_sock_lock(&self->hsk, "unit test");
	EXPECT_EQ(0, homa_metrics_per_cpu()->socket_lock_misses);
-	EXPECT_EQ(0, homa_metrics_per_cpu()->socket_lock_miss_cycles);
+	EXPECT_EQ(0, homa_metrics_per_cpu()->socket_lock_miss_ns);
	homa_sock_unlock(&self->hsk);
	mock_trylock_errors = 1;
	homa_sock_lock(&self->hsk, "unit test");
	EXPECT_EQ(1, homa_metrics_per_cpu()->socket_lock_misses);
-	EXPECT_NE(0, homa_metrics_per_cpu()->socket_lock_miss_cycles);
+	EXPECT_EQ(100, homa_metrics_per_cpu()->socket_lock_miss_ns);
	homa_sock_unlock(&self->hsk);
 }
diff --git a/timetrace.c b/timetrace.c
index 6213cdf3..c4165936 100644
--- a/timetrace.c
+++ b/timetrace.c
@@ -15,7 +15,7 @@ void (*tt_linux_freeze)(void);
atomic_t *tt_linux_freeze_count;
atomic_t tt_linux_freeze_no_homa;
int *tt_linux_homa_temp;
-int tt_linux_homa_temp_default[];
+int tt_linux_homa_temp_default[16];
void (*tt_linux_inc_metrics)(int metric, __u64 count);
void (*tt_linux_record)(struct
tt_buffer *buffer, __u64 timestamp,
		const char *format, __u32 arg0, __u32 arg1, __u32 arg2,
@@ -837,8 +837,8 @@ void tt_inc_metric(int metric, __u64 count)
	 * for the legal values of metric.
	 */
	static int offsets[] = {
-		offsetof(struct homa_metrics, napi_cycles),
-		offsetof(struct homa_metrics, linux_softirq_cycles),
+		offsetof(struct homa_metrics, napi_ns),
+		offsetof(struct homa_metrics, linux_softirq_ns),
		offsetof(struct homa_metrics, linux_pkt_alloc_bytes),
	};
	__u64 *metric_addr = (__u64 *)(((char *) homa_metrics_per_cpu())
diff --git a/timetrace.h b/timetrace.h
index ba7a5b72..44767ede 100644
--- a/timetrace.h
+++ b/timetrace.h
@@ -5,6 +5,12 @@
 #include
+#ifdef __UNIT_TEST__
+#undef get_cycles
+#define get_cycles mock_get_cycles
+cycles_t mock_get_cycles(void);
+#endif /* __UNIT_TEST__ */
+
 // Change 1 -> 0 in the following line to disable time tracing globally.
 // Used only in debugging.
 #define ENABLE_TIME_TRACE 1
diff --git a/util/metrics.py b/util/metrics.py
index e569ad88..61121a74 100755
--- a/util/metrics.py
+++ b/util/metrics.py
@@ -115,7 +115,7 @@ def scale_number(number):
 # Sum all of the individual core counts for both the new and old data and
 # compute the difference in "deltas"
 for symbol in symbols:
-    if (symbol == "rdtsc_cycles") or (symbol == "cpu_khz") or (symbol == "core"):
+    if (symbol == "time_ns") or (symbol == "core"):
         # This symbol shouldn't be summed.
         continue
     total_cur = 0
@@ -134,25 +134,21 @@ def scale_number(number):
 elapsed_secs = 0
 reaper_calls = 0
 pad = ""
-cpu_khz = float(cur[0]["cpu_khz"])
 if len(prev) > 0:
-    time_delta = cur[0]["rdtsc_cycles"] - prev[0]["rdtsc_cycles"]
-    elapsed_secs = float(time_delta)/(cpu_khz * 1000.0)
+    time_delta = cur[0]["time_ns"] - prev[0]["time_ns"]
+    elapsed_secs = float(time_delta)*1e-09
     pad = pad.ljust(13)
     secs = "(%.1f s)" % (elapsed_secs)
     secs = secs.ljust(12)
-    print("%-28s %15d %s %s" % ("rdtsc_cycles", time_delta, secs,
-            docs["rdtsc_cycles"]))
+    print("%-28s %15d %s %s" % ("time_ns", time_delta, secs,
+            docs["time_ns"]))
 else:
-    print("%-15s %28d %s%s" % ("rdtsc_cycles", cur[0]["rdtsc_cycles"],
-            "", docs["rdtsc_cycles"]))
-
-print("%-28s %5.2f %sCPU clock rate (GHz)" % ("clock_rate",
-        cpu_khz/1e06, pad))
+    print("%-15s %28d %s%s" % ("time_ns", cur[0]["time_ns"],
+            "", docs["time_ns"]))
 for symbol in symbols:
-    if (symbol == "rdtsc_cycles") or (symbol == "cpu_khz"):
+    if (symbol == "time_ns"):
         # This symbol is handled specially above
         continue
     delta = deltas[symbol]
@@ -164,7 +160,7 @@ def scale_number(number):
         rate_info = ("(%s/s) " % (scale_number(rate))).ljust(13)
     if ("msg_bytes" in symbol) and (symbol != "sent_msg_bytes"):
         total_received_bytes += delta
-    if symbol.endswith("_cycles") and (time_delta != 0):
+    if symbol.endswith("_ns") and (time_delta != 0):
         percent = "(%.1f%%)" % (100.0*delta/time_delta)
         percent = percent.ljust(12)
         print("%-28s %15d %s %s" % (symbol, delta, percent, doc))
@@ -185,10 +181,10 @@ def scale_number(number):
     if (symbol == "reaper_dead_skbs") and ("reaper_calls" in deltas):
         print("%-28s %6.1f %sAvg. hsk->dead_skbs in reaper" % (
                 "avg_dead_skbs", delta/deltas["reaper_calls"], pad))
-    if symbol.endswith("_miss_cycles") and (time_delta != 0):
-        prefix = symbol[:-12]
+    if symbol.endswith("_miss_ns") and (time_delta != 0):
+        prefix = symbol[:-8]
         if ((prefix + "_misses") in deltas) and (deltas[prefix
                 + "_misses"] != 0):
-            ns = (delta/deltas[prefix + "_misses"])/(cpu_khz * 1e-06)
+            ns = (delta/deltas[prefix + "_misses"])
             print("%-28s %6.1f %sAvg.
wait time per %s miss (ns)" % ( prefix + "_miss_delay", ns, pad, prefix)) if (symbol == "large_msg_bytes") and (total_received_bytes != 0) \ @@ -227,9 +223,9 @@ def scale_number(number): for where in ["napi", "softirq", "send", "recv", "reply", "timer", "pacer"]: if where == "softirq": - symbol = "linux_softirq_cycles" + symbol = "linux_softirq_ns" else: - symbol = where + "_cycles" + symbol = where + "_ns" line = "%-10s " % (where) for core in range(first_core, end_core): frac = float(cur[core][symbol] - prev[core][symbol]) / float( @@ -266,7 +262,7 @@ def scale_number(number): total_cores_used = 0.0 total_syscalls = 0 - time = float(deltas["send_cycles"]) + time = float(deltas["send_ns"]) cores = time/time_delta total_cores_used += cores calls = float(deltas["send_calls"]) @@ -274,10 +270,10 @@ def scale_number(number): if calls == 0: us_per = 0 else: - us_per = (time/calls)/(cpu_khz/1e03) + us_per = (time/calls)/1000 print("send syscall %6.2f %7.2f us/syscall" % (cores, us_per)) - time = float(deltas["recv_cycles"]) - float(deltas["poll_cycles"]) + time = float(deltas["recv_ns"]) - float(deltas["poll_ns"]) cores = time/time_delta total_cores_used += cores calls = float(deltas["recv_calls"]) @@ -285,10 +281,10 @@ def scale_number(number): if calls == 0: us_per = 0 else: - us_per = (time/calls)/(cpu_khz/1e03) + us_per = (time/calls)/1000 print("recv syscall (-poll) %6.2f %7.2f us/syscall" % (cores, us_per)) - time = float(deltas["reply_cycles"]) + time = float(deltas["reply_ns"]) cores = time/time_delta total_cores_used += cores calls = float(deltas["reply_calls"]) @@ -296,31 +292,31 @@ def scale_number(number): if calls == 0: us_per = 0 else: - us_per = (time/calls)/(cpu_khz/1e03) + us_per = (time/calls)/1000 print("reply syscall %6.2f %7.2f us/syscall" % (cores, us_per)) - for print_name, symbol in [["NAPI", "napi_cycles"], - [" Bypass homa_softirq", "bypass_softirq_cycles"], - ["Linux SoftIRQ", "linux_softirq_cycles"], - [" Normal homa_softirq", "softirq_cycles"]]: + for print_name, symbol in [["NAPI", "napi_ns"], + [" Bypass homa_softirq", "bypass_softirq_ns"], + ["Linux SoftIRQ", "linux_softirq_ns"], + [" Normal homa_softirq", "softirq_ns"]]: cpu_time = float(deltas[symbol]) cores = cpu_time/time_delta if packets_received > 0: print("%s %6.2f %7.2f us/packet" % (print_name.ljust(22), - cores, (cpu_time/packets_received) / (cpu_khz/1e03))) + cores, (cpu_time/packets_received) / 1000)) else: print("%s %6.2f" % (print_name.ljust(22), cores)) - cpu_time = float(deltas["napi_cycles"]) + cpu_time = float(deltas["napi_ns"]) if cpu_time == 0: - cpu_time = float(deltas["bypass_softirq_cycles"]) + cpu_time = float(deltas["bypass_softirq_ns"]) total_cores_used += cpu_time/time_delta - cpu_time = float(deltas["linux_softirq_cycles"]) + cpu_time = float(deltas["linux_softirq_ns"]) if cpu_time == 0: - cpu_time = float(deltas["softirq_cycles"]) + cpu_time = float(deltas["softirq_ns"]) total_cores_used += cpu_time/time_delta - for print_name, symbol in [["Pacer", "pacer_cycles"], - ["Timer handler", "timer_cycles"]]: + for print_name, symbol in [["Pacer", "pacer_ns"], + ["Timer handler", "timer_ns"]]: cpu_time = float(deltas[symbol]) cores = cpu_time/time_delta total_cores_used += cores @@ -329,13 +325,13 @@ def scale_number(number): print("----------------------------------") print("Total Core Utilization %6.2f" % (total_cores_used)) - time = float(deltas["poll_cycles"]) + time = float(deltas["poll_ns"]) cores = time/time_delta calls = float(deltas["recv_calls"]) if calls == 0: us_per = 0 else: - 
us_per = (time/calls)/(cpu_khz/1e03) + us_per = (time/calls)/1000 print("") print("Polling in recv %6.2f %7.2f us/syscall" % (cores, us_per)) @@ -343,31 +339,31 @@ def scale_number(number): if calls == 0: us_per = 0 else: - us_per = (deltas["skb_alloc_cycles"]/calls)/(cpu_khz/1e03) + us_per = (deltas["skb_alloc_ns"]/calls)/1000 print("Skb allocation %6.2f %7.2f us/skb" % ( - deltas["skb_alloc_cycles"]/time_delta, us_per)) + deltas["skb_alloc_ns"]/time_delta, us_per)) calls = deltas["skb_frees"] if calls == 0: us_per = 0 else: - us_per = (deltas["skb_free_cycles"]/calls)/(cpu_khz/1e03) + us_per = (deltas["skb_free_ns"]/calls)/1000 print("Skb freeing %6.2f %7.2f us/skb" % ( - deltas["skb_free_cycles"]/time_delta, us_per)) + deltas["skb_free_ns"]/time_delta, us_per)) print("\nLock Misses:") print("------------") print(" Misses/sec. ns/Miss %CPU") for lock in ["client", "server", "socket", "grantable", "throttle", "peer_ack"]: misses = float(deltas[lock + "_lock_misses"]) - cycles = float(deltas[lock + "_lock_miss_cycles"]) + ns = float(deltas[lock + "_lock_miss_ns"]) if misses == 0: - cycles_per_miss = 0.0 + ns_per_miss = 0.0 else: - cycles_per_miss = cycles/misses + ns_per_miss = ns/misses print("%-10s %s %6.1f %5.1f" % (lock, scale_number(misses/elapsed_secs), - cycles_per_miss/(cpu_khz/1e06), 100.0*cycles/time_delta)) + ns_per_miss, 100.0*ns/time_delta)) total_messages = float(deltas["requests_received"] + deltas["responses_received"]) @@ -421,25 +417,25 @@ def scale_number(number): print(" %5.2f Gbps/core (goodput)" % ( 8e-9*(total_received_bytes + float(deltas["sent_msg_bytes"])) /(total_cores_used * elapsed_secs))) - if deltas["pacer_cycles"] != 0: - pacer_secs = float(deltas["pacer_cycles"])/(cpu_khz * 1000.0) + if deltas["pacer_ns"] != 0: + pacer_secs = float(deltas["pacer_ns"])*1e-09 print("Pacer throughput: %6.2f Gbps (pacer output when pacer running)" % ( deltas["pacer_bytes"]*8e-09/pacer_secs)) - if deltas["throttled_cycles"] != 0: - throttled_secs = float(deltas["throttled_cycles"])/(cpu_khz * 1000.0) + if deltas["throttled_ns"] != 0: + throttled_secs = float(deltas["throttled_ns"])*1e-09 print("Throttled throughput: %5.2f Gbps (pacer output when throttled)" % ( deltas["pacer_bytes"]*8e-09/throttled_secs)) if deltas["skb_allocs"] != 0: print("Skb alloc time: %4.2f usec/skb" % ( - float(deltas["skb_alloc_cycles"]) / (cpu_khz / 1000.0) / + float(deltas["skb_alloc_ns"]) / 1000 / deltas["skb_allocs"])) if deltas["skb_page_allocs"] != 0: print("Skb page alloc time: %5.2f usec/skb" % ( - float(deltas["skb_page_alloc_cycles"]) / (cpu_khz / 1000.0) / + float(deltas["skb_page_alloc_ns"]) / 1000 / deltas["skb_page_allocs"])) if deltas["grant_recalc_calls"] != 0: print("homa_grant_recalc: %5.2f usec/call" % ( - float(deltas["grant_recalc_cycles"]) / (cpu_khz / 1000.0) / + float(deltas["grant_recalc_ns"]) / 1000 / deltas["grant_recalc_calls"])) print("\nCanaries (possible problem indicators):") @@ -466,8 +462,8 @@ def scale_number(number): rate_info = ("(%s/s) " % (scale_number(rate))).ljust(13) print("%-28s %15d %s%s" % (symbol, deltas[symbol], rate_info, docs[symbol])) - for symbol in ["pacer_lost_cycles", "timer_reap_cycles", - "data_pkt_reap_cycles", "grantable_lock_cycles"]: + for symbol in ["pacer_lost_ns", "timer_reap_ns", + "data_pkt_reap_ns", "grantable_lock_ns"]: delta = deltas[symbol] if delta == 0 or time_delta == 0: continue From 79a01262a664f27721b64d13de70ebd089c74c19 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 1 Nov 2024 17:19:49 -0700 Subject: [PATCH 062/625] Add 
missing checks of kmalloc return value --- homa_incoming.c | 18 +++++++++++++-- homa_skb.c | 7 +++++- homa_skb.h | 2 +- homa_utils.c | 33 +++++++++++++++++++++------- test/mock.c | 30 ++++++++++++++++++++++++- test/mock.h | 2 ++ test/unit_homa_incoming.c | 46 +++++++++++++++++++++++++++++++++++++++ test/unit_homa_skb.c | 15 +++++++++++-- test/unit_homa_utils.c | 43 ++++++++++++++++++++++++++++++++++++ test/utils.c | 2 +- 10 files changed, 182 insertions(+), 16 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index a98bd80b..abf13ab8 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -59,13 +59,16 @@ int homa_message_in_init(struct homa_rpc *rpc, int length, int unsched) * @next: Add the new gap just before this list element. * @start: Offset of first byte covered by the gap. * @end: Offset of byte just after the last one covered by the gap. - * Return: Pointer to the new gap. + * Return: Pointer to the new gap, or NULL if memory couldn't be allocated + * for the gap object. */ struct homa_gap *homa_gap_new(struct list_head *next, int start, int end) { struct homa_gap *gap; gap = kmalloc(sizeof(*gap), GFP_KERNEL); + if (!gap) + return NULL; gap->start = start; gap->end = end; gap->time = sched_clock(); @@ -122,7 +125,12 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) if (start > rpc->msgin.recv_end) { /* Packet creates a new gap. */ - homa_gap_new(&rpc->msgin.gaps, rpc->msgin.recv_end, start); + if (!homa_gap_new(&rpc->msgin.gaps, rpc->msgin.recv_end, start)) { + pr_err("Homa couldn't allocate gap: insufficient memory\n"); + tt_record2("Couldn't allocate gap for id %d (start %d): no memory", + rpc->id, start); + goto discard; + } rpc->msgin.recv_end = end; goto keep; } @@ -170,6 +178,12 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) /* Packet is in the middle of the gap; must split the gap. */ gap2 = homa_gap_new(&gap->links, gap->start, start); + if (!gap2) { + pr_err("Homa couldn't allocate gap for split: insufficient memory\n"); + tt_record2("Couldn't allocate gap for split for id %d (end %d): no memory", + rpc->id, end); + goto discard; + } gap2->time = gap->time; gap->start = end; goto keep; diff --git a/homa_skb.c b/homa_skb.c index 57bd18b9..d06262b6 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -25,8 +25,10 @@ static inline void frag_page_set(skb_frag_t *frag, struct page *page) /** * homa_skb_init() - Invoked when a struct homa is created to initialize * information related to sk_buff management. 
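+ *
+ * A minimal sketch of the check-and-return pattern this patch adds
+ * throughout (names taken from the surrounding code):
+ *
+ *	pool = kmalloc(sizeof(*pool), GFP_KERNEL);
+ *	if (!pool)
+ *		return -ENOMEM;
+ *
+ * Callers must propagate the errno, and cleanup paths must tolerate a
+ * partially initialized structure. In the unit tests below,
+ * mock_kmalloc_errors appears to act as a bit mask: bit N forces the
+ * Nth kmalloc() call to fail, so 0x2 fails the second allocation.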
+ * @homa: Shared information about the Homa transport + * Return: 0 for success, negative errno on error */ -void homa_skb_init(struct homa *homa) +int homa_skb_init(struct homa *homa) { int i; @@ -51,6 +53,8 @@ void homa_skb_init(struct homa *homa) struct homa_page_pool *pool; pool = kmalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return -ENOMEM; pool->avail = 0; pool->low_mark = 0; memset(pool->pages, 0, sizeof(pool->pages)); @@ -59,6 +63,7 @@ void homa_skb_init(struct homa *homa) skb_core->pool = homa->page_pools[numa]; } pr_notice("homa_skb_init found max NUMA node %d\n", homa->max_numa); + return 0; } /** diff --git a/homa_skb.h b/homa_skb.h index c613c717..fd52919a 100644 --- a/homa_skb.h +++ b/homa_skb.h @@ -113,7 +113,7 @@ extern void homa_skb_free_many_tx(struct homa *homa, struct sk_buff **skbs, int count); extern void homa_skb_get(struct sk_buff *skb, void *dest, int offset, int length); -extern void homa_skb_init(struct homa *homa); +extern int homa_skb_init(struct homa *homa); extern struct sk_buff *homa_skb_new_tx(int length); extern bool homa_skb_page_alloc(struct homa *homa, diff --git a/homa_utils.c b/homa_utils.c index f98c816f..1c114a62 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -57,14 +57,27 @@ int homa_init(struct homa *homa) atomic_set(&homa->total_incoming, 0); homa->next_client_port = HOMA_MIN_DEFAULT_PORT; homa->port_map = kmalloc(sizeof(*homa->port_map), GFP_KERNEL); + if (!homa->port_map) { + pr_err("homa_init couldn't create port_map: kmalloc failure\n"); + return -ENOMEM; + } homa_socktab_init(homa->port_map); homa->peers = kmalloc(sizeof(*homa->peers), GFP_KERNEL); + if (!homa->peers) { + pr_err("homa_init couldn't create peers: kmalloc failure\n"); + return -ENOMEM; + } err = homa_peertab_init(homa->peers); if (err) { pr_err("Couldn't initialize peer table (errno %d)\n", -err); return err; } - homa_skb_init(homa); + err = homa_skb_init(homa); + if (err) { + pr_err("Couldn't initialize skb management (errno %d)\n", + -err); + return err; + } /* Wild guesses to initialize configuration values... */ homa->unsched_bytes = 10000; @@ -149,13 +162,17 @@ void homa_destroy(struct homa *homa) wait_for_completion(&homa_pacer_kthread_done); } - /* The order of the following 2 statements matters! */ - homa_socktab_destroy(homa->port_map); - kfree(homa->port_map); - homa->port_map = NULL; - homa_peertab_destroy(homa->peers); - kfree(homa->peers); - homa->peers = NULL; + /* The order of the following statements matters! */ + if (homa->port_map) { + homa_socktab_destroy(homa->port_map); + kfree(homa->port_map); + homa->port_map = NULL; + } + if (homa->peers) { + homa_peertab_destroy(homa->peers); + kfree(homa->peers); + homa->peers = NULL; + } homa_skb_cleanup(homa); kfree(homa->metrics); homa->metrics = NULL; diff --git a/test/mock.c b/test/mock.c index 30677b2b..dee45528 100644 --- a/test/mock.c +++ b/test/mock.c @@ -43,6 +43,7 @@ int mock_import_iovec_errors; int mock_ip6_xmit_errors; int mock_ip_queue_xmit_errors; int mock_kmalloc_errors; +int mock_kthread_create_errors; int mock_route_errors; int mock_spin_lock_held; int mock_trylock_errors; @@ -171,6 +172,9 @@ int mock_compound_order_mask; */ int mock_page_nid_mask; +/* Used to collect printk output. */ +char mock_printk_output[5000]; + struct dst_ops mock_dst_ops = {.mtu = mock_get_mtu}; struct netdev_queue mock_net_queue = {.state = 0}; struct net_device mock_net_device = { @@ -767,6 +771,8 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), const char namefmt[], ...) 
{ + if (mock_check_error(&mock_kthread_create_errors)) + return ERR_PTR(-EACCES); return NULL; } @@ -835,8 +841,28 @@ long prepare_to_wait_event(struct wait_queue_head *wq_head, return 0; } -int _printk(const char *fmt, ...) +int _printk(const char *format, ...) { + int len = strlen(mock_printk_output); + int available; + va_list ap; + + available = sizeof(mock_printk_output) - len; + if (available >= 10) { + if (len != 0) { + strcpy(mock_printk_output + len, "; "); + len += 2; + available -= 2; + } + va_start(ap, format); + vsnprintf(mock_printk_output + len, available, format, ap); + va_end(ap); + + /* Remove trailing newline. */ + len += strlen(mock_printk_output + len); + if (mock_printk_output[len-1] == '\n') + mock_printk_output[len-1] = 0; + } return 0; } @@ -1531,6 +1557,7 @@ void mock_teardown(void) mock_ip6_xmit_errors = 0; mock_ip_queue_xmit_errors = 0; mock_kmalloc_errors = 0; + mock_kthread_create_errors = 0; mock_copy_to_user_dont_copy = 0; mock_bpage_size = 0x10000; mock_bpage_shift = 16; @@ -1549,6 +1576,7 @@ void mock_teardown(void) mock_numa_mask = 5; mock_compound_order_mask = 0; mock_page_nid_mask = 0; + mock_printk_output[0] = 0; mock_net_device.gso_max_size = 0; mock_net_device.gso_max_segs = 1000; memset(inet_offloads, 0, sizeof(inet_offloads)); diff --git a/test/mock.h b/test/mock.h index 89ed992b..96e9af01 100644 --- a/test/mock.h +++ b/test/mock.h @@ -21,6 +21,7 @@ extern int mock_ip_queue_xmit_errors; extern bool mock_ipv6; extern bool mock_ipv6_default; extern int mock_kmalloc_errors; +extern int mock_kthread_create_errors; extern char mock_xmit_prios[]; extern int mock_log_rcu_sched; extern int mock_max_grants; @@ -32,6 +33,7 @@ extern __u64 mock_ns; extern __u64 mock_ns_tick; extern int mock_numa_mask; extern int mock_page_nid_mask; +extern char mock_printk_output[]; extern int mock_route_errors; extern int mock_spin_lock_held; extern struct task_struct diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 3e3b14e0..9e27e390 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -339,6 +339,25 @@ TEST_F(homa_incoming, homa_add_packet__new_gap) EXPECT_EQ(5600, crpc->msgin.recv_end); EXPECT_EQ(2, skb_queue_len(&crpc->msgin.packets)); } +TEST_F(homa_incoming, homa_add_packet__no_memory_for_new_gap) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, 99, 1000, 1000); + + homa_message_in_init(crpc, 10000, 0); + unit_log_clear(); + homa_add_packet(crpc, mock_skb_new(self->client_ip, + &self->data.common, 1400, 0)); + + self->data.seg.offset = htonl(4200); + mock_kmalloc_errors = 1; + homa_add_packet(crpc, mock_skb_new(self->client_ip, + &self->data.common, 1400, 4200)); + EXPECT_STREQ("", unit_print_gaps(crpc)); + EXPECT_EQ(1400, crpc->msgin.recv_end); + EXPECT_EQ(1, skb_queue_len(&crpc->msgin.packets)); +} TEST_F(homa_incoming, homa_add_packet__packet_before_gap) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -546,6 +565,33 @@ TEST_F(homa_incoming, homa_add_packet__packet_in_middle_of_gap) EXPECT_STREQ("start 1400, end 2000, time 1000; start 3400, end 4200, time 1000", unit_print_gaps(crpc)); } +TEST_F(homa_incoming, homa_add_packet__kmalloc_failure_while_splitting_gap) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, 99, 1000, 1000); + + homa_message_in_init(crpc, 10000, 0); + unit_log_clear(); + mock_ns = 1000; + self->data.seg.offset = htonl(0); + 
homa_add_packet(crpc, mock_skb_new(self->client_ip, + &self->data.common, 1400, 0)); + + self->data.seg.offset = htonl(4200); + homa_add_packet(crpc, mock_skb_new(self->client_ip, + &self->data.common, 1400, 4200)); + EXPECT_STREQ("start 1400, end 4200, time 1000", + unit_print_gaps(crpc)); + + self->data.seg.offset = htonl(2000); + mock_ns = 2000; + mock_kmalloc_errors = 1; + homa_add_packet(crpc, mock_skb_new(self->client_ip, + &self->data.common, 1400, 2000)); + EXPECT_EQ(2, skb_queue_len(&crpc->msgin.packets)); + EXPECT_STREQ("start 1400, end 4200, time 1000", unit_print_gaps(crpc)); +} TEST_F(homa_incoming, homa_add_packet__scan_multiple_gaps) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, diff --git a/test/unit_homa_skb.c b/test/unit_homa_skb.c index cae8a987..81558548 100644 --- a/test/unit_homa_skb.c +++ b/test/unit_homa_skb.c @@ -92,12 +92,12 @@ FIXTURE_TEARDOWN(homa_skb) unit_teardown(); } -TEST_F(homa_skb, homa_skb_init) +TEST_F(homa_skb, homa_skb_init__success) { homa_skb_cleanup(&self->homa); EXPECT_EQ(NULL, self->homa.page_pools[0]); mock_numa_mask = 0x83; - homa_skb_init(&self->homa); + EXPECT_EQ(0, homa_skb_init(&self->homa)); EXPECT_NE(NULL, self->homa.page_pools[0]); EXPECT_NE(NULL, self->homa.page_pools[1]); EXPECT_EQ(NULL, self->homa.page_pools[2]); @@ -108,6 +108,17 @@ TEST_F(homa_skb, homa_skb_init) EXPECT_EQ(self->homa.page_pools[1], get_skb_core(7)->pool); EXPECT_EQ(1, self->homa.max_numa); } +TEST_F(homa_skb, homa_skb_init__kmalloc_failure) +{ + homa_skb_cleanup(&self->homa); + EXPECT_EQ(NULL, self->homa.page_pools[0]); + mock_numa_mask = 0x2; + mock_kmalloc_errors = 0x2; + EXPECT_EQ(ENOMEM, -homa_skb_init(&self->homa)); + EXPECT_NE(NULL, self->homa.page_pools[0]); + EXPECT_EQ(NULL, self->homa.page_pools[1]); + EXPECT_EQ(NULL, self->homa.page_pools[2]); +} TEST_F(homa_skb, homa_skb_cleanup) { diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index ec04cb11..10c5f1b4 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -50,6 +50,49 @@ static void set_cutoffs(struct homa *homa, int c0, int c1, int c2, homa->unsched_cutoffs[7] = c7; } +TEST_F(homa_utils, homa_init__kmalloc_failure_for_port_map) +{ + struct homa homa2; + + memset(&homa2, 0, sizeof(homa2)); + mock_kmalloc_errors = 1; + EXPECT_EQ(ENOMEM, -homa_init(&homa2)); + EXPECT_EQ(NULL, homa2.port_map); + homa_destroy(&homa2); +} +TEST_F(homa_utils, homa_init__kmalloc_failure_for_peers) +{ + struct homa homa2; + + memset(&homa2, 0, sizeof(homa2)); + mock_kmalloc_errors = 2; + EXPECT_EQ(ENOMEM, -homa_init(&homa2)); + EXPECT_NE(NULL, homa2.port_map); + EXPECT_EQ(NULL, homa2.peers); + homa_destroy(&homa2); +} +TEST_F(homa_utils, homa_init__homa_skb_init_failure) +{ + struct homa homa2; + + memset(&homa2, 0, sizeof(homa2)); + mock_kmalloc_errors = 4; + EXPECT_EQ(ENOMEM, -homa_init(&homa2)); + EXPECT_SUBSTR("Couldn't initialize skb management (errno 12)", + mock_printk_output); + homa_destroy(&homa2); +} +TEST_F(homa_utils, homa_init__cant_create_pacer_thread) +{ + struct homa homa2; + + memset(&homa2, 0, sizeof(homa2)); + mock_kthread_create_errors = 1; + EXPECT_EQ(EACCES, -homa_init(&homa2)); + EXPECT_EQ(NULL, homa2.pacer_kthread); + homa_destroy(&homa2); +} + TEST_F(homa_utils, homa_print_ipv4_addr) { struct in6_addr test_addr1 = unit_get_in_addr("192.168.0.1"); diff --git a/test/utils.c b/test/utils.c index 5c85cf90..b520ad90 100644 --- a/test/utils.c +++ b/test/utils.c @@ -459,6 +459,6 @@ char *unit_ack_string(struct homa_ack *ack) */ void unit_homa_destroy(struct homa *homa) { - if 
(!list_empty(&homa->port_map->active_scans)) + if (homa->port_map && !list_empty(&homa->port_map->active_scans)) FAIL("struct homa deleted with active socktab scans"); } \ No newline at end of file From 2f834c72fa5235a10311a16acbf18325f4538f18 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 15 Nov 2024 11:59:10 -0800 Subject: [PATCH 063/625] Don't include linux/version.h in stripped code --- homa_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/homa_impl.h b/homa_impl.h index 5009dcd1..92c87427 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -45,6 +44,7 @@ #include #if 1 /* See strip.py --alt */ +#include #include "homa.h" #else /* See strip.py */ #include From c672115f5d069ed32d5c5eef55de01abfcba97a3 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Sat, 2 Nov 2024 13:44:44 -0700 Subject: [PATCH 064/625] Remove inline qualifier from functions in .c files Also remove now-unused function set_priority. --- homa_grant.c | 4 ++-- homa_offload.c | 2 +- homa_outgoing.c | 20 -------------------- homa_pool.c | 2 +- homa_skb.c | 2 +- 5 files changed, 5 insertions(+), 25 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index d540b162..e799579a 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -17,7 +17,7 @@ * @rpc1: First RPC to consider. * @rpc2: Second RPC to consider. */ -inline int homa_grant_outranks(struct homa_rpc *rpc1, struct homa_rpc *rpc2) +int homa_grant_outranks(struct homa_rpc *rpc1, struct homa_rpc *rpc2) { /* Fewest bytes remaining is the primary criterion; if those are * equal, then favor the older RPC. @@ -41,7 +41,7 @@ inline int homa_grant_outranks(struct homa_rpc *rpc1, struct homa_rpc *rpc2) * may be possible to send out additional grants to some RPCs (doing * this is left to the caller). */ -inline int homa_grant_update_incoming(struct homa_rpc *rpc, struct homa *homa) +int homa_grant_update_incoming(struct homa_rpc *rpc, struct homa *homa) { int incoming = rpc->msgin.granted - (rpc->msgin.length - rpc->msgin.bytes_remaining); diff --git a/homa_offload.c b/homa_offload.c index f9f6b184..174d9465 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -160,7 +160,7 @@ struct sk_buff *homa_tcp_gro_receive(struct list_head *held_list, * @cpu: Index of core to which the packet should be directed for * SoftIRQ processing. */ -static inline void homa_set_softirq_cpu(struct sk_buff *skb, int cpu) +static void homa_set_softirq_cpu(struct sk_buff *skb, int cpu) { struct rps_sock_flow_table *sock_flow_table; int hash; diff --git a/homa_outgoing.c b/homa_outgoing.c index bd5b2727..e3410a2f 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -10,26 +10,6 @@ #include "homa_skb.h" #include "homa_wire.h" -/** - * set_priority() - Arrange for an outgoing packet to have a particular - * priority level. - * @skb: The packet was priority should be set. - * @hsk: Socket on which the packet will be sent. - * @priority: Priority level for the packet; must be less than - * HOMA_MAX_PRIORITIES. - */ -static inline void set_priority(struct sk_buff *skb, struct homa_sock *hsk, - int priority) -{ - /* Note: this code initially specified the priority in the VLAN - * header, but as of 3/2020, this performed badly on the CloudLab - * cluster being used for testing: 100 us of extra delay occurred - * whenever a packet's VLAN priority differed from the previous - * packet. So, now we use the DSCP field in the IP header instead. 
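- * For example, with 3-bit priorities the mapped value lands in the
- * top bits of the TOS byte (the high DSCP bits); if priority_map[p]
- * is 6, then:
- *
- *	hsk->inet.tos = hsk->homa->priority_map[p] << 5;  // 6 << 5 == 0xc0, DSCP 48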
- */ - hsk->inet.tos = hsk->homa->priority_map[priority]<<5; -} - /** * homa_message_out_init() - Initialize rpc->msgout. * @rpc: RPC whose output message should be initialized. diff --git a/homa_pool.c b/homa_pool.c index fe809b8f..89d313df 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -31,7 +31,7 @@ * The caller must own the lock for @pool->hsk. * @pool: Pool to update. */ -static inline void set_bpages_needed(struct homa_pool *pool) +static void set_bpages_needed(struct homa_pool *pool) { struct homa_rpc *rpc = list_first_entry(&pool->hsk->waiting_for_bufs, struct homa_rpc, buf_links); diff --git a/homa_skb.c b/homa_skb.c index d06262b6..dede213c 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -17,7 +17,7 @@ extern int mock_max_skb_frags; #define HOMA_MAX_SKB_FRAGS MAX_SKB_FRAGS #endif -static inline void frag_page_set(skb_frag_t *frag, struct page *page) +static void frag_page_set(skb_frag_t *frag, struct page *page) { frag->netmem = page_to_netmem(page); } From bd8be84b065305bc949d21f954dbb784461cc464 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Sat, 2 Nov 2024 13:47:59 -0700 Subject: [PATCH 065/625] Clean up lint in homa.h * Use more precise type "uint32_t" rather than "int" * Document that pad fields must be zero --- homa.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/homa.h b/homa.h index 382f9e5b..e7c35d4d 100644 --- a/homa.h +++ b/homa.h @@ -109,7 +109,7 @@ struct homa_recvmsg_args { * @flags: (in) OR-ed combination of bits that control the operation. * See below for values. */ - int flags; + uint32_t flags; /** * @error_addr: the address of the peer is stored here when available. @@ -127,6 +127,7 @@ struct homa_recvmsg_args { */ uint32_t num_bpages; + /* Reserved for future use; must be zero. */ uint32_t _pad[1]; /** From ce717fbc0b8f0a5ae966853846f4d21bd731e853 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 5 Nov 2024 14:50:09 -0800 Subject: [PATCH 066/625] Fix errors/warnings from sparse and checkpatch.pl --- homa.h | 2 +- homa_grant.c | 4 + homa_grant.h | 36 +++---- homa_impl.h | 243 ++++++++++++++++++++++++------------------------ homa_incoming.c | 8 +- homa_metrics.h | 24 ++--- homa_offload.c | 22 +++-- homa_offload.h | 31 +++--- homa_outgoing.c | 9 +- homa_peer.c | 15 ++- homa_peer.h | 2 + homa_plumbing.c | 68 ++++++++------ homa_rpc.c | 8 +- homa_skb.c | 2 +- homa_skb.h | 53 +++++------ homa_sock.c | 21 +++-- homa_sock.h | 5 +- homa_utils.c | 10 +- 18 files changed, 306 insertions(+), 257 deletions(-) diff --git a/homa.h b/homa.h index e7c35d4d..218a101d 100644 --- a/homa.h +++ b/homa.h @@ -127,7 +127,7 @@ struct homa_recvmsg_args { */ uint32_t num_bpages; - /* Reserved for future use; must be zero. */ + /* Reserved for future use; must be zero. */ uint32_t _pad[1]; /** diff --git a/homa_grant.c b/homa_grant.c index e799579a..3bb5571b 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -148,6 +148,7 @@ void homa_grant_add_rpc(struct homa_rpc *rpc) list_add(&prev_peer->grantable_links, &peer->grantable_links); } done: + return; } /** @@ -270,6 +271,7 @@ int homa_grant_send(struct homa_rpc *rpc, struct homa *homa) * WILL RELEASE THE LOCK before returning. */ void homa_grant_check_rpc(struct homa_rpc *rpc) + __releases(rpc->bucket_lock) { /* Overall design notes: * The grantable lock has proven to be a performance bottleneck, @@ -605,6 +607,7 @@ void homa_grant_find_oldest(struct homa *homa) * @rpc: The RPC to clean up. Must be locked by the caller. 
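 *
 * (The __releases() annotation added below drives sparse's context
 * checking: it declares that the function returns holding one less
 * lock than it entered with, so unbalanced paths show up as "context
 * imbalance" warnings. A rough sketch; example_free and struct foo are
 * made up for illustration:
 *
 *	void example_free(struct foo *f)
 *		__releases(f->lock)
 *	{
 *		...
 *		spin_unlock_bh(&f->lock);
 *	}
 * )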
*/ void homa_grant_free_rpc(struct homa_rpc *rpc) + __releases(rpc->bucket_lock) { struct homa *homa = rpc->hsk->homa; @@ -645,6 +648,7 @@ void homa_grant_free_rpc(struct homa_rpc *rpc) * thread started a fresh calculation after this method was invoked. */ int homa_grantable_lock_slow(struct homa *homa, int recalc) + __acquires(&homa->grantable_lock) { int starting_count = atomic_read(&homa->grant_recalc_count); __u64 start = sched_clock(); diff --git a/homa_grant.h b/homa_grant.h index 68feca11..45073e68 100644 --- a/homa_grant.h +++ b/homa_grant.h @@ -5,22 +5,22 @@ #ifndef _HOMA_GRANT_H #define _HOMA_GRANT_H -extern int homa_grantable_lock_slow(struct homa *homa, int recalc); -extern void homa_grant_add_rpc(struct homa_rpc *rpc); -extern void homa_grant_check_rpc(struct homa_rpc *rpc); -extern void homa_grant_find_oldest(struct homa *homa); -extern void homa_grant_free_rpc(struct homa_rpc *rpc); -extern void homa_grant_log_tt(struct homa *homa); -extern int homa_grant_outranks(struct homa_rpc *rpc1, - struct homa_rpc *rpc2); -extern int homa_grant_pick_rpcs(struct homa *homa, struct homa_rpc **rpcs, - int max_rpcs); -extern void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc); -extern void homa_grant_recalc(struct homa *homa, int locked); -extern void homa_grant_remove_rpc(struct homa_rpc *rpc); -extern int homa_grant_send(struct homa_rpc *rpc, struct homa *homa); -extern int homa_grant_update_incoming(struct homa_rpc *rpc, - struct homa *homa); +int homa_grantable_lock_slow(struct homa *homa, int recalc); +void homa_grant_add_rpc(struct homa_rpc *rpc); +void homa_grant_check_rpc(struct homa_rpc *rpc); +void homa_grant_find_oldest(struct homa *homa); +void homa_grant_free_rpc(struct homa_rpc *rpc); +void homa_grant_log_tt(struct homa *homa); +int homa_grant_outranks(struct homa_rpc *rpc1, + struct homa_rpc *rpc2); +int homa_grant_pick_rpcs(struct homa *homa, struct homa_rpc **rpcs, + int max_rpcs); +void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc); +void homa_grant_recalc(struct homa *homa, int locked); +void homa_grant_remove_rpc(struct homa_rpc *rpc); +int homa_grant_send(struct homa_rpc *rpc, struct homa *homa); +int homa_grant_update_incoming(struct homa_rpc *rpc, + struct homa *homa); /** * homa_grantable_lock() - Acquire the grantable lock. If the lock @@ -34,6 +34,7 @@ extern int homa_grant_update_incoming(struct homa_rpc *rpc, * thread started a fresh calculation after this method was invoked. */ static inline int homa_grantable_lock(struct homa *homa, int recalc) + __acquires(&homa->grantable_lock) { int result; @@ -50,10 +51,11 @@ static inline int homa_grantable_lock(struct homa *homa, int recalc) * @homa: Overall data about the Homa protocol implementation. */ static inline void homa_grantable_unlock(struct homa *homa) + __releases(&homa->grantable_lock) { INC_METRIC(grantable_lock_ns, sched_clock() - homa->grantable_lock_time); spin_unlock_bh(&homa->grantable_lock); } -#endif /* _HOMA_GRANT_H */ \ No newline at end of file +#endif /* _HOMA_GRANT_H */ diff --git a/homa_impl.h b/homa_impl.h index 92c87427..a3e0a2a3 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -936,6 +936,7 @@ static inline void homa_set_doff(struct data_header *h, int size) * @homa: Overall data about the Homa protocol implementation. 
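 *
 * (Design note: the helper below pairs a trylock fast path with a slow
 * path that spins and records contention, feeding the throttle lock
 * miss metrics reported by metrics.py:
 *
 *	if (!spin_trylock_bh(&homa->throttle_lock))
 *		homa_throttle_lock_slow(homa);	// spins; bumps miss metrics
 *
 * so the uncontended acquire pays no bookkeeping cost.)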
*/ static inline void homa_throttle_lock(struct homa *homa) + __acquires(&homa->throttle_lock) { if (!spin_trylock_bh(&homa->throttle_lock)) homa_throttle_lock_slow(homa); @@ -946,6 +947,7 @@ static inline void homa_throttle_lock(struct homa *homa) * @homa: Overall data about the Homa protocol implementation. */ static inline void homa_throttle_unlock(struct homa *homa) + __releases(&homa->throttle_lock) { spin_unlock_bh(&homa->throttle_lock); } @@ -967,7 +969,7 @@ static inline struct in6_addr ipv4_to_ipv6(__be32 ip4) { struct in6_addr ret = {}; - if (ip4 == INADDR_ANY) + if (ip4 == htonl(INADDR_ANY)) return in6addr_any; ret.in6_u.u6_addr32[2] = htonl(0xffff); ret.in6_u.u6_addr32[3] = ip4; @@ -1040,7 +1042,7 @@ static inline bool is_homa_pkt(struct sk_buff *skb) * provide a unique identifier for the address in a timetrace record. * @x: Address (either IPv6 or IPv4-mapped IPv6) */ -static inline __be32 tt_addr(const struct in6_addr x) +static inline uint32_t tt_addr(const struct in6_addr x) { return is_mapped_ipv4(x) ? ntohl(x.in6_u.u6_addr32[3]) : (x.in6_u.u6_addr32[3] ? ntohl(x.in6_u.u6_addr32[3]) @@ -1060,132 +1062,129 @@ void unit_hook(char *id); #endif /* __UNIT_TEST__ */ #endif /* See strip.py */ -extern void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, - int port, int error); -extern void homa_abort_sock_rpcs(struct homa_sock *hsk, int error); -extern void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, - struct homa_rpc *rpc); -extern void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb); -extern void homa_add_to_throttled(struct homa_rpc *rpc); -extern int homa_backlog_rcv(struct sock *sk, struct sk_buff *skb); -extern int homa_bind(struct socket *sk, struct sockaddr *addr, - int addr_len); -extern int homa_check_nic_queue(struct homa *homa, struct sk_buff *skb, - bool force); -extern struct homa_rpc - *homa_choose_fifo_grant(struct homa *homa); -extern struct homa_interest - *homa_choose_interest(struct homa *homa, struct list_head *head, - int offset); -extern void homa_close(struct sock *sock, long timeout); -extern int homa_copy_to_user(struct homa_rpc *rpc); -extern void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk); -extern void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc); -extern void homa_destroy(struct homa *homa); -extern int homa_disconnect(struct sock *sk, int flags); -extern void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa); +void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, + int port, int error); +void homa_abort_sock_rpcs(struct homa_sock *hsk, int error); +void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, + struct homa_rpc *rpc); +void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb); +void homa_add_to_throttled(struct homa_rpc *rpc); +int homa_backlog_rcv(struct sock *sk, struct sk_buff *skb); +int homa_bind(struct socket *sk, struct sockaddr *addr, + int addr_len); +int homa_check_nic_queue(struct homa *homa, struct sk_buff *skb, + bool force); +struct homa_rpc *homa_choose_fifo_grant(struct homa *homa); +struct homa_interest *homa_choose_interest(struct homa *homa, + struct list_head *head, + int offset); +void homa_close(struct sock *sock, long timeout); +int homa_copy_to_user(struct homa_rpc *rpc); +void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk); +void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc); +void homa_destroy(struct homa *homa); +int homa_disconnect(struct sock *sk, int flags); +void 
homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa); #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) -extern int homa_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); +int homa_dointvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); #else -extern int homa_dointvec(const struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); +int homa_dointvec(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); #endif -extern int homa_err_handler_v4(struct sk_buff *skb, u32 info); -extern int homa_err_handler_v6(struct sk_buff *skb, - struct inet6_skb_parm *opt, u8 type, u8 code, int offset, - __be32 info); -extern int homa_fill_data_interleaved(struct homa_rpc *rpc, - struct sk_buff *skb, struct iov_iter *iter); -extern void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, - char *format); -extern void homa_freeze_peers(struct homa *homa); -extern struct homa_gap - *homa_gap_new(struct list_head *next, int start, int end); -extern void homa_gap_retry(struct homa_rpc *rpc); -extern int homa_get_port(struct sock *sk, unsigned short snum); -extern int homa_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *option); -extern int homa_hash(struct sock *sk); -extern enum hrtimer_restart - homa_hrtimer(struct hrtimer *timer); -extern int homa_init(struct homa *homa); -extern void homa_incoming_sysctl_changed(struct homa *homa); -extern int homa_ioc_abort(struct sock *sk, int *karg); -extern int homa_ioctl(struct sock *sk, int cmd, int *karg); -extern void homa_log_throttled(struct homa *homa); -extern int homa_message_in_init(struct homa_rpc *rpc, int length, - int unsched); -extern int homa_message_out_fill(struct homa_rpc *rpc, - struct iov_iter *iter, int xmit); -extern void homa_message_out_init(struct homa_rpc *rpc, int length); -extern void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, - struct homa_rpc *rpc); -extern struct sk_buff - *homa_new_data_packet(struct homa_rpc *rpc, - struct iov_iter *iter, int offset, int length, - int max_seg_data); -extern void homa_outgoing_sysctl_changed(struct homa *homa); -extern int homa_pacer_main(void *transportInfo); -extern void homa_pacer_stop(struct homa *homa); -extern void homa_pacer_xmit(struct homa *homa); -extern __poll_t homa_poll(struct file *file, struct socket *sock, - struct poll_table_struct *wait); -extern char *homa_print_ipv4_addr(__be32 addr); -extern char *homa_print_ipv6_addr(const struct in6_addr *addr); -extern char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len); -extern char *homa_print_packet_short(struct sk_buff *skb, char *buffer, - int buf_len); -extern void homa_prios_changed(struct homa *homa); -extern int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len); -extern int homa_register_interests(struct homa_interest *interest, - struct homa_sock *hsk, int flags, __u64 id); -extern void homa_remove_from_throttled(struct homa_rpc *rpc); -extern void homa_resend_data(struct homa_rpc *rpc, int start, int end, - int priority); -extern void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, - struct homa_sock *hsk); -extern void homa_rpc_abort(struct homa_rpc *crpc, int error); -extern void homa_rpc_acked(struct homa_sock *hsk, - const struct in6_addr *saddr, struct homa_ack *ack); -extern void homa_rpc_free(struct homa_rpc *rpc); -extern void 
homa_rpc_handoff(struct homa_rpc *rpc); -extern int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); -extern int homa_setsockopt(struct sock *sk, int level, int optname, - sockptr_t __user optval, unsigned int optlen); -extern int homa_shutdown(struct socket *sock, int how); -extern int homa_snprintf(char *buffer, int size, int used, - const char *format, ...) __printf(4, 5); -extern int homa_softirq(struct sk_buff *skb); -extern void homa_spin(int ns); -extern char *homa_symbol_for_type(uint8_t type); +int homa_err_handler_v4(struct sk_buff *skb, u32 info); +int homa_err_handler_v6(struct sk_buff *skb, + struct inet6_skb_parm *opt, u8 type, u8 code, + int offset, __be32 info); +int homa_fill_data_interleaved(struct homa_rpc *rpc, + struct sk_buff *skb, struct iov_iter *iter); +void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, + char *format); +void homa_freeze_peers(struct homa *homa); +struct homa_gap *homa_gap_new(struct list_head *next, int start, int end); +void homa_gap_retry(struct homa_rpc *rpc); +int homa_get_port(struct sock *sk, unsigned short snum); +int homa_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *option); +int homa_hash(struct sock *sk); +enum hrtimer_restart homa_hrtimer(struct hrtimer *timer); +int homa_init(struct homa *homa); +void homa_incoming_sysctl_changed(struct homa *homa); +int homa_ioc_abort(struct sock *sk, int *karg); +int homa_ioctl(struct sock *sk, int cmd, int *karg); +void homa_log_throttled(struct homa *homa); +int homa_message_in_init(struct homa_rpc *rpc, int length, + int unsched); +int homa_message_out_fill(struct homa_rpc *rpc, + struct iov_iter *iter, int xmit); +void homa_message_out_init(struct homa_rpc *rpc, int length); +void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, + struct homa_rpc *rpc); +struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, + struct iov_iter *iter, int offset, + int length, int max_seg_data); +void homa_outgoing_sysctl_changed(struct homa *homa); +int homa_pacer_main(void *transportInfo); +void homa_pacer_stop(struct homa *homa); +void homa_pacer_xmit(struct homa *homa); +__poll_t homa_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait); +char *homa_print_ipv4_addr(__be32 addr); +char *homa_print_ipv6_addr(const struct in6_addr *addr); +char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len); +char *homa_print_packet_short(struct sk_buff *skb, char *buffer, + int buf_len); +void homa_prios_changed(struct homa *homa); +int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int flags, int *addr_len); +int homa_register_interests(struct homa_interest *interest, + struct homa_sock *hsk, int flags, __u64 id); +void homa_remove_from_throttled(struct homa_rpc *rpc); +void homa_resend_data(struct homa_rpc *rpc, int start, int end, + int priority); +void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, + struct homa_sock *hsk); +void homa_rpc_abort(struct homa_rpc *crpc, int error); +void homa_rpc_acked(struct homa_sock *hsk, + const struct in6_addr *saddr, struct homa_ack *ack); +void homa_rpc_free(struct homa_rpc *rpc); +void homa_rpc_handoff(struct homa_rpc *rpc); +int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); +int homa_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen); +int homa_shutdown(struct socket *sock, int how); +int homa_snprintf(char *buffer, int size, int used, + const char *format, ...) 
__printf(4, 5); +int homa_softirq(struct sk_buff *skb); +void homa_spin(int ns); +char *homa_symbol_for_type(uint8_t type); #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) -extern int homa_sysctl_softirq_cores(struct ctl_table *table, - int write, void __user *buffer, size_t *lenp, loff_t *ppos); +int homa_sysctl_softirq_cores(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); #else -extern int homa_sysctl_softirq_cores(const struct ctl_table *table, - int write, void __user *buffer, size_t *lenp, loff_t *ppos); +int homa_sysctl_softirq_cores(const struct ctl_table *table, + int write, void *buffer, size_t *lenp, + loff_t *ppos); #endif -extern void homa_timer(struct homa *homa); -extern int homa_timer_main(void *transportInfo); -extern void homa_unhash(struct sock *sk); -extern void homa_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc); -extern int homa_unsched_priority(struct homa *homa, - struct homa_peer *peer, int length); -extern int homa_validate_incoming(struct homa *homa, int verbose, - int *link_errors); -extern struct homa_rpc - *homa_wait_for_message(struct homa_sock *hsk, int flags, - __u64 id); -extern int homa_xmit_control(enum homa_packet_type type, void *contents, - size_t length, struct homa_rpc *rpc); -extern int __homa_xmit_control(void *contents, size_t length, - struct homa_peer *peer, struct homa_sock *hsk); -extern void homa_xmit_data(struct homa_rpc *rpc, bool force); -extern void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, - int priority); -extern void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk); +void homa_timer(struct homa *homa); +int homa_timer_main(void *transportInfo); +void homa_unhash(struct sock *sk); +void homa_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc); +int homa_unsched_priority(struct homa *homa, struct homa_peer *peer, + int length); +int homa_validate_incoming(struct homa *homa, int verbose, + int *link_errors); +struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, + __u64 id); +int homa_xmit_control(enum homa_packet_type type, void *contents, + size_t length, struct homa_rpc *rpc); +int __homa_xmit_control(void *contents, size_t length, + struct homa_peer *peer, struct homa_sock *hsk); +void homa_xmit_data(struct homa_rpc *rpc, bool force); +void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, + int priority); +void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk); /** * homa_check_pacer() - This method is invoked at various places in Homa to diff --git a/homa_incoming.c b/homa_incoming.c index abf13ab8..a8f17d85 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -217,6 +217,8 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) * will be RPC_DEAD. 
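 *
 * (Annotating one function with both __releases() and __acquires() of
 * the same lock, as below, tells sparse that the lock is dropped and
 * retaken inside: net context is unchanged, but callers must not
 * assume it was held continuously. A sketch, where homa_rpc_lock is
 * assumed to be the matching acquire helper:
 *
 *	homa_rpc_unlock(rpc);	// drop around the copy to user space
 *	...copy data without holding the bucket lock...
 *	homa_rpc_lock(rpc);	// retake before returning
 * )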
*/ int homa_copy_to_user(struct homa_rpc *rpc) + __releases(rpc->bucket_lock) + __acquires(rpc->bucket_lock) { #ifdef __UNIT_TEST__ #define MAX_SKBS 3 @@ -295,8 +297,8 @@ int homa_copy_to_user(struct homa_rpc *rpc) } chunk_size = buf_bytes; } - error = import_ubuf(READ, dst, chunk_size, - &iter); + error = import_ubuf(READ, (void __user *)dst, + chunk_size, &iter); if (error) goto free_skbs; error = skb_copy_datagram_iter(skbs[i], @@ -861,6 +863,7 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, */ void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, struct homa_rpc *rpc) + __releases(rpc->bucket_lock) { const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); struct ack_header *h = (struct ack_header *)skb->data; @@ -1198,6 +1201,7 @@ int homa_register_interests(struct homa_interest *interest, */ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, __u64 id) + __acquires(&rpc->bucket_lock) { uint64_t poll_start, poll_end, now; int error, blocked = 0, polled = 0; diff --git a/homa_metrics.h b/homa_metrics.h index 23d3ecf6..f43db27d 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -661,15 +661,15 @@ static inline struct homa_metrics *homa_metrics_per_cpu(void) #define INC_METRIC(metric, count) per_cpu(homa_metrics, \ raw_smp_processor_id()).metric += (count) -extern void homa_metric_append(struct homa *homa, const char *format, ...); -extern loff_t homa_metrics_lseek(struct file *file, loff_t offset, - int whence); -extern int homa_metrics_open(struct inode *inode, struct file *file); -extern char *homa_metrics_print(struct homa *homa); -extern ssize_t homa_metrics_read(struct file *file, char __user *buffer, - size_t length, loff_t *offset); -extern int homa_metrics_release(struct inode *inode, struct file *file); -extern int homa_proc_read_metrics(char *buffer, char **start, off_t offset, - int count, int *eof, void *data); - -#endif /* _HOMA_METRICS_H */ \ No newline at end of file +void homa_metric_append(struct homa *homa, const char *format, ...); +loff_t homa_metrics_lseek(struct file *file, loff_t offset, + int whence); +int homa_metrics_open(struct inode *inode, struct file *file); +char *homa_metrics_print(struct homa *homa); +ssize_t homa_metrics_read(struct file *file, char __user *buffer, + size_t length, loff_t *offset); +int homa_metrics_release(struct inode *inode, struct file *file); +int homa_proc_read_metrics(char *buffer, char **start, off_t offset, + int count, int *eof, void *data); + +#endif /* _HOMA_METRICS_H */ diff --git a/homa_offload.c b/homa_offload.c index 174d9465..e1e7597f 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -24,8 +24,8 @@ extern struct homa *homa; /* Pointers to TCP's net_offload structures. NULL means homa_gro_hook_tcp * hasn't been called yet. 
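 *
 * (inet_offloads[] and inet6_offloads[] are declared __rcu, so sparse
 * insists on the RCU accessor for loads, as in the hunks below:
 *
 *	tcp_net_offload = rcu_dereference(inet_offloads[IPPROTO_TCP]);
 *
 * and stores need a (struct net_offload __rcu *) cast to restore the
 * address space; the casts generate no code.)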
*/ -const struct net_offload *tcp_net_offload; -const struct net_offload *tcp6_net_offload; +static const struct net_offload *tcp_net_offload; +static const struct net_offload *tcp6_net_offload; /* * Identical to *tcp_net_offload except that the gro_receive function @@ -91,15 +91,17 @@ void homa_gro_hook_tcp(void) return; pr_notice("Homa setting up TCP hijacking\n"); - tcp_net_offload = inet_offloads[IPPROTO_TCP]; + tcp_net_offload = rcu_dereference(inet_offloads[IPPROTO_TCP]); hook_tcp_net_offload = *tcp_net_offload; hook_tcp_net_offload.callbacks.gro_receive = homa_tcp_gro_receive; - inet_offloads[IPPROTO_TCP] = &hook_tcp_net_offload; + inet_offloads[IPPROTO_TCP] = (struct net_offload __rcu *) + &hook_tcp_net_offload; - tcp6_net_offload = inet6_offloads[IPPROTO_TCP]; + tcp6_net_offload = rcu_dereference(inet6_offloads[IPPROTO_TCP]); hook_tcp6_net_offload = *tcp6_net_offload; hook_tcp6_net_offload.callbacks.gro_receive = homa_tcp_gro_receive; - inet6_offloads[IPPROTO_TCP] = &hook_tcp6_net_offload; + inet6_offloads[IPPROTO_TCP] = (struct net_offload __rcu *) + &hook_tcp6_net_offload; } /** @@ -112,9 +114,11 @@ void homa_gro_unhook_tcp(void) if (tcp_net_offload == NULL) return; pr_notice("Homa cancelling TCP hijacking\n"); - inet_offloads[IPPROTO_TCP] = tcp_net_offload; + inet_offloads[IPPROTO_TCP] = (struct net_offload __rcu *) + tcp_net_offload; tcp_net_offload = NULL; - inet6_offloads[IPPROTO_TCP] = tcp6_net_offload; + inet6_offloads[IPPROTO_TCP] = (struct net_offload __rcu *) + tcp6_net_offload; tcp6_net_offload = NULL; } @@ -307,7 +311,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, // tt_record("homa_gro_receive can't pull enough data " // "from packet for trace"); if (h_new->common.type == DATA) { - if (h_new->seg.offset == -1) { + if (h_new->seg.offset == (__force __be32)-1) { tt_record2("homa_gro_receive replaced offset %d with %d", ntohl(h_new->seg.offset), ntohl(h_new->common.sequence)); diff --git a/homa_offload.h b/homa_offload.h index 6bf79154..21628914 100644 --- a/homa_offload.h +++ b/homa_offload.h @@ -73,22 +73,19 @@ struct homa_offload_core { }; DECLARE_PER_CPU(struct homa_offload_core, homa_offload_core); -extern int homa_gro_complete(struct sk_buff *skb, int thoff); -extern void homa_gro_gen2(struct sk_buff *skb); -extern void homa_gro_gen3(struct sk_buff *skb); -extern void homa_gro_hook_tcp(void); -extern void homa_gro_unhook_tcp(void); -extern struct sk_buff - *homa_gro_receive(struct list_head *gro_list, - struct sk_buff *skb); -extern struct sk_buff - *homa_gso_segment(struct sk_buff *skb, - netdev_features_t features); -extern int homa_offload_end(void); -extern int homa_offload_init(void); -extern void homa_send_ipis(void); -extern struct sk_buff - *homa_tcp_gro_receive(struct list_head *held_list, - struct sk_buff *skb); +int homa_gro_complete(struct sk_buff *skb, int thoff); +void homa_gro_gen2(struct sk_buff *skb); +void homa_gro_gen3(struct sk_buff *skb); +void homa_gro_hook_tcp(void); +void homa_gro_unhook_tcp(void); +struct sk_buff *homa_gro_receive(struct list_head *gro_list, + struct sk_buff *skb); +struct sk_buff *homa_gso_segment(struct sk_buff *skb, + netdev_features_t features); +int homa_offload_end(void); +int homa_offload_init(void); +void homa_send_ipis(void); +struct sk_buff *homa_tcp_gro_receive(struct list_head *held_list, + struct sk_buff *skb); #endif /* _HOMA_OFFLOAD_H */ diff --git a/homa_outgoing.c b/homa_outgoing.c index e3410a2f..9d8ea657 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -28,7 +28,7 @@ void 
homa_message_out_init(struct homa_rpc *rpc, int length) if (rpc->msgout.unscheduled > length) rpc->msgout.unscheduled = length; rpc->msgout.sched_priority = 0; - rpc->msgout.init_ns= sched_clock(); + rpc->msgout.init_ns = sched_clock(); } /** @@ -132,7 +132,7 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, homa_peer_get_acks(rpc->peer, 1, &h->ack); h->cutoff_version = rpc->peer->cutoff_version; h->retransmit = 0; - h->seg.offset = -1; + h->seg.offset = htonl(-1); segs = length + max_seg_data - 1; do_div(segs, max_seg_data); @@ -195,6 +195,8 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, * rpc->state will be RPC_DEAD. */ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) + __releases(rpc->bucket_lock) + __acquires(rpc->bucket_lock) { /* Geometry information for packets: * mtu: largest size for an on-the-wire packet (including @@ -484,6 +486,8 @@ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk) * the NIC queue is sufficiently long. */ void homa_xmit_data(struct homa_rpc *rpc, bool force) + __releases(rpc->bucket_lock) + __acquires(rpc->bucket_lock) { struct homa *homa = rpc->hsk->homa; #if 1 /* See strip.py */ @@ -703,6 +707,7 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end, } resend_done: + return; } /** diff --git a/homa_peer.c b/homa_peer.c index 1a643481..7293409b 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -153,11 +153,15 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab, struct dst_entry *dst; // Should use siphash or jhash here: - __u32 bucket = hash_32(addr->in6_u.u6_addr32[0], HOMA_PEERTAB_BUCKET_BITS); - - bucket ^= hash_32(addr->in6_u.u6_addr32[1], HOMA_PEERTAB_BUCKET_BITS); - bucket ^= hash_32(addr->in6_u.u6_addr32[2], HOMA_PEERTAB_BUCKET_BITS); - bucket ^= hash_32(addr->in6_u.u6_addr32[3], HOMA_PEERTAB_BUCKET_BITS); + __u32 bucket = hash_32((__force __u32)addr->in6_u.u6_addr32[0], + HOMA_PEERTAB_BUCKET_BITS); + + bucket ^= hash_32((__force __u32)addr->in6_u.u6_addr32[1], + HOMA_PEERTAB_BUCKET_BITS); + bucket ^= hash_32((__force __u32)addr->in6_u.u6_addr32[2], + HOMA_PEERTAB_BUCKET_BITS); + bucket ^= hash_32((__force __u32)addr->in6_u.u6_addr32[3], + HOMA_PEERTAB_BUCKET_BITS); hlist_for_each_entry_rcu(peer, &peertab->buckets[bucket], peertab_links) { if (ipv6_addr_equal(&peer->addr, addr)) @@ -359,6 +363,7 @@ void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, int c2, * @peer: Peer to lock. */ void homa_peer_lock_slow(struct homa_peer *peer) + __acquires(&peer->ack_lock) { __u64 start = sched_clock(); diff --git a/homa_peer.h b/homa_peer.h index 50fc2d20..babf1a56 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -222,6 +222,7 @@ void homa_peertab_gc_dsts(struct homa_peertab *peertab, __u64 now); * @peer: Peer to lock. */ static inline void homa_peer_lock(struct homa_peer *peer) + __acquires(&peer->ack_lock) { if (!spin_trylock_bh(&peer->ack_lock)) homa_peer_lock_slow(peer); @@ -232,6 +233,7 @@ static inline void homa_peer_lock(struct homa_peer *peer) * @peer: Peer to lock. 
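 *
 * (On the hash change above: sparse treats __be32 as a "bitwise" type
 * distinct from __u32, so feeding raw network-order words to hash_32()
 * needs a __force cast, which swaps no bytes:
 *
 *	__u32 bits = (__force __u32)addr->in6_u.u6_addr32[0];
 *
 * bucket selection only needs uniformly distributed bits, so byte
 * order is irrelevant here.)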
*/ static inline void homa_peer_unlock(struct homa_peer *peer) + __releases(&peer->ack_lock) { spin_unlock_bh(&peer->ack_lock); } diff --git a/homa_plumbing.c b/homa_plumbing.c index 96e3e06d..0f89f83c 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -17,14 +17,19 @@ MODULE_DESCRIPTION("Homa transport protocol"); MODULE_VERSION("0.01"); /* Not yet sure what these variables are for */ -long sysctl_homa_mem[3] __read_mostly; -int sysctl_homa_rmem_min __read_mostly; -int sysctl_homa_wmem_min __read_mostly; +static long sysctl_homa_mem[3] __read_mostly; +static int sysctl_homa_rmem_min __read_mostly; +static int sysctl_homa_wmem_min __read_mostly; /* Global data for Homa. Never reference homa_data directly. Always use * the homa variable instead; this allows overriding during unit tests. */ -struct homa homa_data; +static struct homa homa_data; + +/* This variable should almost never be used directly; it is normally + * passed as a parameter to functions that need it. Thus it is not declared + * in a header file. + */ struct homa *homa = &homa_data; /* True means that the Homa module is in the process of unloading itself, @@ -46,7 +51,7 @@ static int action; * be implemented by PF_INET6 functions that are independent of the * Homa protocol. */ -const struct proto_ops homa_proto_ops = { +static const struct proto_ops homa_proto_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, @@ -67,7 +72,7 @@ const struct proto_ops homa_proto_ops = { .set_peek_off = sk_set_peek_off, }; -const struct proto_ops homav6_proto_ops = { +static const struct proto_ops homav6_proto_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, @@ -94,7 +99,7 @@ const struct proto_ops homav6_proto_ops = { * protocol family, and in many cases they are invoked by functions in * homa_proto_ops. Most of these functions have Homa-specific implementations. */ -struct proto homa_prot = { +static struct proto homa_prot = { .name = "HOMA", .owner = THIS_MODULE, .close = homa_close, @@ -102,7 +107,7 @@ struct proto homa_prot = { .disconnect = homa_disconnect, .ioctl = homa_ioctl, .init = homa_socket, - .destroy = 0, + .destroy = NULL, .setsockopt = homa_setsockopt, .getsockopt = homa_getsockopt, .sendmsg = homa_sendmsg, @@ -118,7 +123,7 @@ struct proto homa_prot = { .no_autobind = 1, }; -struct proto homav6_prot = { +static struct proto homav6_prot = { .name = "HOMAv6", .owner = THIS_MODULE, .close = homa_close, @@ -126,7 +131,7 @@ struct proto homav6_prot = { .disconnect = homa_disconnect, .ioctl = homa_ioctl, .init = homa_socket, - .destroy = 0, + .destroy = NULL, .setsockopt = homa_setsockopt, .getsockopt = homa_getsockopt, .sendmsg = homa_sendmsg, @@ -147,7 +152,7 @@ struct proto homav6_prot = { }; /* Top-level structure describing the Homa protocol. 
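 *
 * (Marking these definitions static is safe because registration
 * passes pointers; presumably, though not shown in these hunks, module
 * init registers them with the standard calls:
 *
 *	inet_register_protosw(&homa_protosw);		// IPv4
 *	inet6_register_protosw(&homav6_protosw);	// IPv6
 * )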
*/ -struct inet_protosw homa_protosw = { +static struct inet_protosw homa_protosw = { .type = SOCK_DGRAM, .protocol = IPPROTO_HOMA, .prot = &homa_prot, @@ -155,7 +160,7 @@ struct inet_protosw homa_protosw = { .flags = INET_PROTOSW_REUSE, }; -struct inet_protosw homav6_protosw = { +static struct inet_protosw homav6_protosw = { .type = SOCK_DGRAM, .protocol = IPPROTO_HOMA, .prot = &homav6_prot, @@ -745,7 +750,7 @@ int homa_ioc_abort(struct sock *sk, int *karg) struct homa_abort_args args; struct homa_rpc *rpc; - if (unlikely(copy_from_user(&args, (void *)karg, sizeof(args)))) + if (unlikely(copy_from_user(&args, (void __user *)karg, sizeof(args)))) return -EFAULT; if (args._pad1 || args._pad2[0] || args._pad2[1]) @@ -824,8 +829,8 @@ int homa_socket(struct sock *sk) * @optlen: Number of bytes of data at @optval. * Return: 0 on success, otherwise a negative errno. */ -int homa_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, - unsigned int optlen) +int homa_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct homa_sock *hsk = homa_sk(sk); struct homa_set_buf_args args; @@ -842,7 +847,7 @@ int homa_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, /* Do a trivial test to make sure we can at least write the first * page of the region. */ - if (copy_to_user(args.start, &args, sizeof(args))) + if (copy_to_user((void __user *)args.start, &args, sizeof(args))) return -EFAULT; homa_sock_lock(hsk, "homa_setsockopt SO_HOMA_SET_BUF"); @@ -888,13 +893,15 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) struct homa_rpc *rpc = NULL; union sockaddr_in_union *addr = (union sockaddr_in_union *)msg->msg_name; - per_cpu(homa_offload_core, raw_smp_processor_id()).last_app_active = start; + per_cpu(homa_offload_core, raw_smp_processor_id()).last_app_active = + start; if (unlikely(!msg->msg_control_is_user)) { tt_record("homa_sendmsg error: !msg->msg_control_is_user"); result = -EINVAL; goto error; } - if (unlikely(copy_from_user(&args, msg->msg_control, sizeof(args)))) { + if (unlikely(copy_from_user(&args, (void __user *)msg->msg_control, + sizeof(args)))) { result = -EFAULT; goto error; } @@ -932,8 +939,8 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) homa_rpc_unlock(rpc); rpc = NULL; - if (unlikely(copy_to_user(msg->msg_control, &args, - sizeof(args)))) { + if (unlikely(copy_to_user((void __user *)msg->msg_control, + &args, sizeof(args)))) { rpc = homa_find_client_rpc(hsk, args.id); result = -EFAULT; goto error; @@ -973,7 +980,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) tt_record2("homa_sendmsg error: RPC id %d in bad state %d", rpc->id, rpc->state); homa_rpc_unlock(rpc); - rpc = 0; + rpc = NULL; result = -EINVAL; goto error; } @@ -1032,7 +1039,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, result = -EINVAL; goto done; } - if (unlikely(copy_from_user(&control, msg->msg_control, + if (unlikely(copy_from_user(&control, (void __user *)msg->msg_control, sizeof(control)))) { result = -EFAULT; goto done; @@ -1130,7 +1137,8 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, homa_rpc_unlock(rpc); done: - if (unlikely(copy_to_user(msg->msg_control, &control, sizeof(control)))) { + if (unlikely(copy_to_user((void __user *)msg->msg_control, &control, + sizeof(control)))) { /* Note: in this case the message's buffers will be leaked. 
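 *
 * (msg->msg_control is typed as a plain kernel pointer even when it
 * refers to user memory, hence the (void __user *) casts that sparse
 * wants on every copy in this function:
 *
 *	if (unlikely(copy_to_user((void __user *)msg->msg_control,
 *				  &control, sizeof(control))))
 *		result = -EFAULT;
 *
 * the cast is annotation only and generates no code.)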
*/ pr_notice("%s couldn't copy back args\n", __func__); result = -EFAULT; @@ -1374,7 +1382,7 @@ int homa_err_handler_v4(struct sk_buff *skb, u32 info) iph = (struct iphdr *)(icmp + sizeof(struct icmphdr)); h = (struct common_header *)(icmp + sizeof(struct icmphdr) + iph->ihl * 4); - homa_abort_rpcs(homa, &saddr, htons(h->dport), -ENOTCONN); + homa_abort_rpcs(homa, &saddr, ntohs(h->dport), -ENOTCONN); } else if (type == ICMP_DEST_UNREACH) { int error; @@ -1383,7 +1391,7 @@ int homa_err_handler_v4(struct sk_buff *skb, u32 info) else error = -EHOSTUNREACH; tt_record2("ICMP destination unreachable: 0x%x (daddr 0x%x)", - iph->saddr, iph->daddr); + ntohl(iph->saddr), ntohl(iph->daddr)); homa_abort_rpcs(homa, &saddr, 0, error); } else { pr_notice("%s invoked with info %x, ICMP type %d, ICMP code %d\n", @@ -1416,7 +1424,7 @@ int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, iph = (struct ipv6hdr *)(icmp + sizeof(struct icmphdr)); h = (struct common_header *)(icmp + sizeof(struct icmphdr) + HOMA_IPV6_HEADER_LENGTH); - homa_abort_rpcs(homa, &iph->daddr, htons(h->dport), -ENOTCONN); + homa_abort_rpcs(homa, &iph->daddr, ntohs(h->dport), -ENOTCONN); } else if (type == ICMPV6_DEST_UNREACH) { int error; @@ -1450,7 +1458,7 @@ __poll_t homa_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { struct sock *sk = sock->sk; - __poll_t mask; + __u32 mask; /* It seems to be standard practice for poll functions *not* to * acquire the socket lock, so we don't do it here; not sure @@ -1463,7 +1471,7 @@ __poll_t homa_poll(struct file *file, struct socket *sock, if (!list_empty(&homa_sk(sk)->ready_requests) || !list_empty(&homa_sk(sk)->ready_responses)) mask |= POLLIN | POLLRDNORM; - return mask; + return (__poll_t)mask; } /** @@ -1483,7 +1491,7 @@ int homa_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) #else int homa_dointvec(const struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) #endif { int result; @@ -1564,7 +1572,7 @@ int homa_sysctl_softirq_cores(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) #else int homa_sysctl_softirq_cores(const struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) #endif { struct homa_offload_core *offload_core; diff --git a/homa_rpc.c b/homa_rpc.c index 9baca396..9f938186 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -21,6 +21,7 @@ */ struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk, const union sockaddr_in_union *dest) + __acquires(&crpc->bucket->lock) { struct in6_addr dest_addr_as_ipv6 = canonical_ipv6_addr(dest); struct homa_rpc_bucket *bucket; @@ -108,6 +109,7 @@ struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk, struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, const struct in6_addr *source, struct data_header *h, int *created) + __acquires(&srpc->bucket->lock) { __u64 id = homa_local_id(h->common.sender_id); struct homa_rpc_bucket *bucket; @@ -244,6 +246,8 @@ void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, * not be locked. */ void homa_rpc_free(struct homa_rpc *rpc) + __acquires(&rpc->hsk->lock) + __releases(&rpc->hsk->lock) { /* The goal for this function is to make the RPC inaccessible, * so that no other code will ever access it again. 
However, don't @@ -369,7 +373,7 @@ int homa_rpc_reap(struct homa_sock *hsk, int count) /* Collect buffers and freeable RPCs. */ list_for_each_entry_rcu(rpc, &hsk->dead_rpcs, dead_links) { if ((atomic_read(&rpc->flags) & RPC_CANT_REAP) || - atomic_read(&rpc->grants_in_progress)!= 0 || + atomic_read(&rpc->grants_in_progress) != 0 || atomic_read(&rpc->msgout.active_xmits) != 0) { INC_METRIC(disabled_rpc_reaps, 1); continue; @@ -479,6 +483,7 @@ int homa_rpc_reap(struct homa_sock *hsk, int count) * by invoking homa_rpc_unlock. */ struct homa_rpc *homa_find_client_rpc(struct homa_sock *hsk, __u64 id) + __acquires(&crpc->bucket->lock) { struct homa_rpc_bucket *bucket = homa_client_rpc_bucket(hsk, id); struct homa_rpc *crpc; @@ -507,6 +512,7 @@ struct homa_rpc *homa_find_client_rpc(struct homa_sock *hsk, __u64 id) struct homa_rpc *homa_find_server_rpc(struct homa_sock *hsk, const struct in6_addr *saddr, __u16 sport, __u64 id) + __acquires(&srpc->bucket->lock) { struct homa_rpc_bucket *bucket = homa_server_rpc_bucket(hsk, id); struct homa_rpc *srpc; diff --git a/homa_skb.c b/homa_skb.c index dede213c..07aeb21e 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -257,7 +257,7 @@ bool homa_skb_page_alloc(struct homa *homa, struct homa_skb_core *skb_core) skb_core->page_inuse = 0; if (skb_core->num_stashed_pages > 0) { skb_core->num_stashed_pages--; - skb_core->skb_page =skb_core->stashed_pages[ + skb_core->skb_page = skb_core->stashed_pages[ skb_core->num_stashed_pages]; goto success; } diff --git a/homa_skb.h b/homa_skb.h index fd52919a..db617670 100644 --- a/homa_skb.h +++ b/homa_skb.h @@ -96,29 +96,30 @@ struct homa_skb_core { }; DECLARE_PER_CPU(struct homa_skb_core, homa_skb_core); -extern int homa_skb_append_from_iter(struct homa *homa, - struct sk_buff *skb, struct iov_iter *iter, int length); -extern int homa_skb_append_from_skb(struct homa *homa, - struct sk_buff *dst_skb, struct sk_buff *src_skb, - int offset, int length); -extern int homa_skb_append_to_frag(struct homa *homa, struct sk_buff *skb, - void *buf, int length); -extern void homa_skb_cache_pages(struct homa *homa, struct page **pages, - int count); -extern void homa_skb_cleanup(struct homa *homa); -extern void *homa_skb_extend_frags(struct homa *homa, struct sk_buff *skb, - int *length); -extern void homa_skb_free_tx(struct homa *homa, struct sk_buff *skb); -extern void homa_skb_free_many_tx(struct homa *homa, struct sk_buff **skbs, - int count); -extern void homa_skb_get(struct sk_buff *skb, void *dest, int offset, - int length); -extern int homa_skb_init(struct homa *homa); -extern struct sk_buff - *homa_skb_new_tx(int length); -extern bool homa_skb_page_alloc(struct homa *homa, - struct homa_skb_core *core); -extern void homa_skb_release_pages(struct homa *homa); -extern void homa_skb_stash_pages(struct homa *homa, int length); - -#endif /* _HOMA_SKB_H */ \ No newline at end of file +int homa_skb_append_from_iter(struct homa *homa, + struct sk_buff *skb, struct iov_iter *iter, + int length); +int homa_skb_append_from_skb(struct homa *homa, + struct sk_buff *dst_skb, + struct sk_buff *src_skb, int offset, + int length); +int homa_skb_append_to_frag(struct homa *homa, struct sk_buff *skb, + void *buf, int length); +void homa_skb_cache_pages(struct homa *homa, struct page **pages, + int count); +void homa_skb_cleanup(struct homa *homa); +void *homa_skb_extend_frags(struct homa *homa, struct sk_buff *skb, + int *length); +void homa_skb_free_tx(struct homa *homa, struct sk_buff *skb); +void homa_skb_free_many_tx(struct homa *homa, struct 
sk_buff **skbs, + int count); +void homa_skb_get(struct sk_buff *skb, void *dest, int offset, + int length); +int homa_skb_init(struct homa *homa); +struct sk_buff *homa_skb_new_tx(int length); +bool homa_skb_page_alloc(struct homa *homa, + struct homa_skb_core *core); +void homa_skb_release_pages(struct homa *homa); +void homa_skb_stash_pages(struct homa *homa, int length); + +#endif /* _HOMA_SKB_H */ diff --git a/homa_sock.c b/homa_sock.c index 521f0841..9f01b2ce 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -90,17 +90,19 @@ struct homa_sock *homa_socktab_next(struct homa_socktab_scan *scan) while (1) { while (!scan->next) { + struct hlist_head *bucket; + scan->current_bucket++; if (scan->current_bucket >= HOMA_SOCKTAB_BUCKETS) return NULL; + bucket = &scan->socktab->buckets[scan->current_bucket]; scan->next = (struct homa_socktab_links *) - hlist_first_rcu(&scan->socktab->buckets - [scan->current_bucket]); + rcu_dereference(hlist_first_rcu(bucket)); } links = scan->next; hsk = links->sock; - scan->next = (struct homa_socktab_links *)hlist_next_rcu(&links - ->hash_links); + scan->next = (struct homa_socktab_links *) + rcu_dereference(hlist_next_rcu(&links->hash_links)); return hsk; } } @@ -116,6 +118,7 @@ void homa_socktab_end_scan(struct homa_socktab_scan *scan) list_del(&scan->scan_links); spin_unlock_bh(&scan->socktab->write_lock); } + /** * homa_sock_init() - Constructor for homa_sock objects. This function * initializes only the parts of the socket that are owned by Homa. @@ -195,10 +198,10 @@ void homa_sock_unlink(struct homa_sock *hsk) */ spin_lock_bh(&socktab->write_lock); list_for_each_entry(scan, &socktab->active_scans, scan_links) { - if (!scan->next || (scan->next->sock != hsk)) + if (!scan->next || scan->next->sock != hsk) continue; - scan->next = (struct homa_socktab_links *)hlist_next_rcu( - &scan->next->hash_links); + scan->next = (struct homa_socktab_links *)rcu_dereference( + hlist_next_rcu(&scan->next->hash_links)); } hlist_del_rcu(&hsk->socktab_links.hash_links); spin_unlock_bh(&socktab->write_lock); @@ -211,6 +214,8 @@ void homa_sock_unlink(struct homa_sock *hsk) * @hsk: Socket to shut down. */ void homa_sock_shutdown(struct homa_sock *hsk) + __acquires(&hsk->lock) + __releases(&hsk->lock) { struct homa_interest *interest; struct homa_rpc *rpc; @@ -362,6 +367,7 @@ struct homa_sock *homa_sock_find(struct homa_socktab *socktab, __u16 port) * @hsk: socket to lock. */ void homa_sock_lock_slow(struct homa_sock *hsk) + __acquires(&hsk->lock) { __u64 start = sched_clock(); @@ -382,6 +388,7 @@ void homa_sock_lock_slow(struct homa_sock *hsk) * share a single bucket lock). */ void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, __u64 id) + __acquires(&bucket->lock) { __u64 start = sched_clock(); diff --git a/homa_sock.h b/homa_sock.h index aaccb41f..5c832116 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -278,7 +278,7 @@ void homa_socktab_end_scan(struct homa_socktab_scan *scan); void homa_socktab_init(struct homa_socktab *socktab); struct homa_sock *homa_socktab_next(struct homa_socktab_scan *scan); struct homa_sock *homa_socktab_start_scan(struct homa_socktab *socktab, - struct homa_socktab_scan *scan); + struct homa_socktab_scan *scan); /** * homa_sock_lock() - Acquire the lock for a socket. If the socket @@ -288,6 +288,7 @@ struct homa_sock *homa_socktab_start_scan(struct homa_socktab *socktab, * used to track down deadlocks. 
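The __acquires()/__releases() annotations added throughout these hunks are sparse context annotations: they declare that a function returns holding (or having dropped) the named lock, so sparse can check callers and callees for lock imbalances. A minimal sketch of how the two annotations pair up on a lock/unlock helper, assuming a plain spinlock (the function names here are illustrative, not from the patch):

	static inline void example_lock(spinlock_t *lock)
		__acquires(lock)
	{
		spin_lock_bh(lock);
	}

	static inline void example_unlock(spinlock_t *lock)
		__releases(lock)
	{
		spin_unlock_bh(lock);
	}
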
*/ static inline void homa_sock_lock(struct homa_sock *hsk, const char *locker) + __acquires(&hsk->lock) { if (!spin_trylock_bh(&hsk->lock)) { // printk(KERN_NOTICE "Slow path for socket %d, last locker %s", @@ -302,6 +303,7 @@ static inline void homa_sock_lock(struct homa_sock *hsk, const char *locker) * @hsk: Socket to lock. */ static inline void homa_sock_unlock(struct homa_sock *hsk) + __releases(&hsk->lock) { spin_unlock_bh(&hsk->lock); } @@ -398,6 +400,7 @@ static inline int homa_bucket_try_lock(struct homa_rpc_bucket *bucket, * @id: ID of the RPC that was using the lock. */ static inline void homa_bucket_unlock(struct homa_rpc_bucket *bucket, __u64 id) + __releases(&bucket->lock) { spin_unlock_bh(&bucket->lock); } diff --git a/homa_utils.c b/homa_utils.c index 1c114a62..da0273b9 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -58,18 +58,19 @@ int homa_init(struct homa *homa) homa->next_client_port = HOMA_MIN_DEFAULT_PORT; homa->port_map = kmalloc(sizeof(*homa->port_map), GFP_KERNEL); if (!homa->port_map) { - pr_err("homa_init could create port_map: kmalloc failure"); + pr_err("%s couldn't create port_map: kmalloc failure", __func__); return -ENOMEM; } homa_socktab_init(homa->port_map); homa->peers = kmalloc(sizeof(*homa->peers), GFP_KERNEL); if (!homa->peers) { - pr_err("homa_init could create peers: kmalloc failure"); + pr_err("%s couldn't create peers: kmalloc failure", __func__); return -ENOMEM; } err = homa_peertab_init(homa->peers); if (err) { - pr_err("Couldn't initialize peer table (errno %d)\n", -err); + pr_err("%s couldn't initialize peer table (errno %d)\n", + __func__, -err); return err; } err = homa_skb_init(homa); @@ -310,7 +311,7 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) ", message_length %d, offset %d, data_length %d, incoming %d", ntohl(h->message_length), offset, seg_length, ntohl(h->incoming)); - if (ntohs(h->cutoff_version != 0)) + if (ntohs(h->cutoff_version) != 0) used = homa_snprintf(buffer, buf_len, used, ", cutoff_version %d", ntohs(h->cutoff_version)); @@ -686,6 +687,7 @@ void homa_spin(int ns) * @homa: Overall data about the Homa protocol implementation. */ void homa_throttle_lock_slow(struct homa *homa) + __acquires(&homa->throttle_lock) { __u64 start = sched_clock(); From 60792443b50caebe3a2e861981e017bb453d69cb Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 11 Nov 2024 14:10:10 -0800 Subject: [PATCH 067/625] Remove sockaddr_in_union and peer_addr from homa.h sockaddr_in_union is no longer in any public interfaces. --- homa.h | 43 +++++++++------------------------------ homa_api.c | 31 ++++++++++++++-------------- homa_impl.h | 11 +++++++++- homa_plumbing.c | 6 +----- homa_receiver.h | 10 ++++++--- man/homa_reply.3 | 23 +++++---------------- man/homa_send.3 | 29 ++++++++------------------ man/recvmsg.2 | 26 +++++++++++++---------- test/unit_homa_plumbing.c | 14 ++----------- util/buffer_server.c | 1 + util/cp_node.cc | 26 ++++++++++++++--------- util/homa_test.cc | 21 ++++++++++++------- util/send_raw.c | 1 + util/server.cc | 3 ++- util/test_utils.cc | 2 +- util/test_utils.h | 20 ++++++++++++++++++ 16 files changed, 129 insertions(+), 138 deletions(-) diff --git a/homa.h b/homa.h index 218a101d..8f5ae1ca 100644 --- a/homa.h +++ b/homa.h @@ -49,16 +49,6 @@ extern "C" */ #define HOMA_MIN_DEFAULT_PORT 0x8000 -/** - * Holds either an IPv4 or IPv6 address (smaller and easier to use than - * sockaddr_storage). 
- */ -union sockaddr_in_union { - struct sockaddr sa; - struct sockaddr_in in4; - struct sockaddr_in6 in6; -}; - /** * struct homa_sendmsg_args - Provides information needed by Homa's * sendmsg; passed to sendmsg using the msg_control field. @@ -111,15 +101,6 @@ struct homa_recvmsg_args { */ uint32_t flags; - /** - * @error_addr: the address of the peer is stored here when available. - * This field is different from the msg_name field in struct msghdr - * in that the msg_name field isn't set after errors. This field will - * always be set when peer information is available, which includes - * some error cases. - */ - union sockaddr_in_union peer_addr; - /** * @num_bpages: (in/out) Number of valid entries in @bpage_offsets. * Passes in bpages from previous messages that can now be @@ -127,9 +108,6 @@ struct homa_recvmsg_args { */ uint32_t num_bpages; - /* Reserved for future use; must be zero. */ - uint32_t _pad[1]; - /** * @bpage_offsets: (in/out) Each entry is an offset into the buffer * region for the socket pool. When returned from recvmsg, the @@ -144,9 +122,9 @@ struct homa_recvmsg_args { }; #if !defined(__cplusplus) -_Static_assert(sizeof(struct homa_recvmsg_args) >= 120, +_Static_assert(sizeof(struct homa_recvmsg_args) >= 88, "homa_recvmsg_args shrunk"); -_Static_assert(sizeof(struct homa_recvmsg_args) <= 120, +_Static_assert(sizeof(struct homa_recvmsg_args) <= 88, "homa_recvmsg_args grew"); #endif @@ -214,18 +192,17 @@ struct homa_set_buf_args { int homa_abortp(int fd, struct homa_abort_args *args); int homa_send(int sockfd, const void *message_buf, - size_t length, const union sockaddr_in_union *dest_addr, - uint64_t *id, uint64_t completion_cookie); + size_t length, const struct sockaddr *dest_addr, + uint32_t addrlen, uint64_t *id, uint64_t completion_cookie); int homa_sendv(int sockfd, const struct iovec *iov, - int iovcnt, const union sockaddr_in_union *dest_addr, - uint64_t *id, uint64_t completion_cookie); + int iovcnt, const struct sockaddr *dest_addr, + uint32_t addrlen, uint64_t *id, uint64_t completion_cookie); ssize_t homa_reply(int sockfd, const void *message_buf, - size_t length, const union sockaddr_in_union *dest_addr, - uint64_t id); + size_t length, const struct sockaddr *dest_addr, + uint32_t addrlen, uint64_t id); ssize_t homa_replyv(int sockfd, const struct iovec *iov, - int iovcnt, const union sockaddr_in_union *dest_addr, - uint64_t id); -int homa_abort(int sockfd, uint64_t id, int error); + int iovcnt, const struct sockaddr *dest_addr, + uint32_t addrlen, uint64_t id); #ifdef __cplusplus } diff --git a/homa_api.c b/homa_api.c index e351dda8..7f45e352 100644 --- a/homa_api.c +++ b/homa_api.c @@ -24,6 +24,7 @@ * @length: Number of bytes in the message at @message_buf. * @dest_addr: Address of the RPC's client (returned by recvmsg when * the message was received). + * @addrlen: # bytes at *dest_addr. * @id: Unique identifier for the request, as returned by recvmsg * when the request was received. * @@ -35,7 +36,8 @@ * error occurred, -1 is returned and errno is set appropriately. 
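With the declarations above, callers now pass an ordinary struct sockaddr_in or sockaddr_in6 plus its exact length, rather than a Homa-specific union. A user-space sketch of issuing a request under the new signature (the socket, address, and port here are hypothetical):

	#include <arpa/inet.h>
	#include <string.h>
	#include "homa.h"

	static int send_request(int fd, const void *buf, size_t len)
	{
		struct sockaddr_in dest;
		uint64_t id;

		memset(&dest, 0, sizeof(dest));
		dest.sin_family = AF_INET;
		dest.sin_port = htons(4000);        /* hypothetical port */
		inet_pton(AF_INET, "10.0.0.1", &dest.sin_addr);
		return homa_send(fd, buf, len, (struct sockaddr *)&dest,
				 sizeof(dest), &id, 0);
	}
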
*/ ssize_t homa_reply(int sockfd, const void *message_buf, size_t length, - const union sockaddr_in_union *dest_addr, uint64_t id) + const struct sockaddr *dest_addr, uint32_t addrlen, + uint64_t id) { struct homa_sendmsg_args args; struct msghdr hdr; @@ -49,7 +51,7 @@ ssize_t homa_reply(int sockfd, const void *message_buf, size_t length, vec.iov_len = length; hdr.msg_name = (void *)dest_addr; - hdr.msg_namelen = sizeof(*dest_addr); + hdr.msg_namelen = addrlen; hdr.msg_iov = &vec; hdr.msg_iovlen = 1; hdr.msg_control = &args; @@ -67,6 +69,7 @@ ssize_t homa_reply(int sockfd, const void *message_buf, size_t length, * @iovcnt: Number of elements in @iov. * @dest_addr: Address of the RPC's client (returned by recvmsg when * the message was received). + * @addrlen: # bytes at *dest_addr. * @id: Unique identifier for the request, as returned by recvmsg * when the request was received. * @@ -78,7 +81,8 @@ ssize_t homa_reply(int sockfd, const void *message_buf, size_t length, * error occurred, -1 is returned and errno is set appropriately. */ ssize_t homa_replyv(int sockfd, const struct iovec *iov, int iovcnt, - const union sockaddr_in_union *dest_addr, uint64_t id) + const struct sockaddr *dest_addr, uint32_t addrlen, + uint64_t id) { struct homa_sendmsg_args args; struct msghdr hdr; @@ -88,7 +92,7 @@ ssize_t homa_replyv(int sockfd, const struct iovec *iov, int iovcnt, args.completion_cookie = 0; hdr.msg_name = (void *)dest_addr; - hdr.msg_namelen = sizeof(*dest_addr); + hdr.msg_namelen = addrlen; hdr.msg_iov = (struct iovec *)iov; hdr.msg_iovlen = iovcnt; hdr.msg_control = &args; @@ -104,6 +108,7 @@ ssize_t homa_replyv(int sockfd, const struct iovec *iov, int iovcnt, * @message_buf: First byte of buffer containing the request message. * @length: Number of bytes at @message_buf. * @dest_addr: Address of server to which the request should be sent. + * @addrlen: # bytes at *dest_addr. * @id: A unique identifier for the request will be returned * here; this can be used later to find the response for * this request. @@ -113,8 +118,8 @@ ssize_t homa_replyv(int sockfd, const struct iovec *iov, int iovcnt, * error occurred, -1 is returned and errno is set appropriately. */ int homa_send(int sockfd, const void *message_buf, size_t length, - const union sockaddr_in_union *dest_addr, uint64_t *id, - uint64_t completion_cookie) + const struct sockaddr *dest_addr, uint32_t addrlen, + uint64_t *id, uint64_t completion_cookie) { struct homa_sendmsg_args args; struct msghdr hdr; @@ -128,12 +133,7 @@ int homa_send(int sockfd, const void *message_buf, size_t length, vec.iov_len = length; hdr.msg_name = (void *)dest_addr; - /* For some unknown reason, this change improves short-message P99 - * latency by 20% in W3 under IPv4 (as of December 2022). - */ -// hdr.msg_namelen = sizeof(*dest_addr); - hdr.msg_namelen = dest_addr->in4.sin_family == AF_INET ? - sizeof(dest_addr->in4) : sizeof(dest_addr->in6); + hdr.msg_namelen = addrlen; hdr.msg_iov = &vec; hdr.msg_iovlen = 1; hdr.msg_control = &args; @@ -153,6 +153,7 @@ int homa_send(int sockfd, const void *message_buf, size_t length, * message. * @iovcnt: Number of elements in @iov. * @dest_addr: Address of server to which the request should be sent. + * @addrlen: # bytes at *dest_addr. * @id: A unique identifier for the request will be returned * here; this can be used later to find the response for * this request. @@ -162,8 +163,8 @@ int homa_send(int sockfd, const void *message_buf, size_t length, * error occurred, -1 is returned and errno is set appropriately. 
*/ int homa_sendv(int sockfd, const struct iovec *iov, int iovcnt, - const union sockaddr_in_union *dest_addr, uint64_t *id, - uint64_t completion_cookie) + const struct sockaddr *dest_addr, uint32_t addrlen, + uint64_t *id, uint64_t completion_cookie) { struct homa_sendmsg_args args; struct msghdr hdr; @@ -173,7 +174,7 @@ int homa_sendv(int sockfd, const struct iovec *iov, int iovcnt, args.completion_cookie = completion_cookie; hdr.msg_name = (void *)dest_addr; - hdr.msg_namelen = sizeof(*dest_addr); + hdr.msg_namelen = addrlen; hdr.msg_iov = (struct iovec *)iov; hdr.msg_iovlen = iovcnt; hdr.msg_control = &args; diff --git a/homa_impl.h b/homa_impl.h index a3e0a2a3..c42bf887 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -156,6 +156,15 @@ void homa_throttle_lock_slow(struct homa *homa); */ #define HOMA_MAX_GRANTS 10 +/* Holds either an IPv4 or IPv6 address (smaller and easier to use than + * sockaddr_storage). + */ +union sockaddr_in_union { + struct sockaddr sa; + struct sockaddr_in in4; + struct sockaddr_in6 in6; +}; + /** * struct homa_interest - Contains various information used while waiting * for incoming messages (indicates what kinds of messages a particular @@ -990,7 +999,7 @@ static inline __be32 ipv6_to_ipv4(const struct in6_addr ip6) * skb_canonical_ipv6_addr() - Convert a socket address to the "standard" * form used in Homa, which is always an IPv6 address; if the original address * was IPv4, convert it to an IPv4-mapped IPv6 address. - * @addr: Address to canonicalize. + * @addr: Address to canonicalize (if NULL, "any" is returned). */ static inline struct in6_addr canonical_ipv6_addr(const union sockaddr_in_union *addr) { diff --git a/homa_plumbing.c b/homa_plumbing.c index 0f89f83c..24963778 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1045,10 +1045,6 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, goto done; } control.completion_cookie = 0; - if (control._pad[0]) { - result = -EINVAL; - goto done; - } tt_record3("homa_recvmsg starting, port %d, pid %d, flags %d", hsk->port, current->pid, control.flags); @@ -1116,7 +1112,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, in4->sin_addr.s_addr = ipv6_to_ipv4(rpc->peer->addr); *addr_len = sizeof(*in4); } - memcpy(&control.peer_addr, msg->msg_name, *addr_len); + /* This indicates that the application now owns the buffers, so * we won't free them in homa_rpc_free. */ diff --git a/homa_receiver.h b/homa_receiver.h index 0d2b2282..770ff511 100644 --- a/homa_receiver.h +++ b/homa_receiver.h @@ -129,9 +129,9 @@ class receiver { * of the sender of the current message. The result is undefined * if there is no current message. */ - const sockaddr_in_union *src_addr(void) const + const struct sockaddr *src_addr(void) const { - return &source; + return &source.sa; } protected: @@ -150,7 +150,11 @@ class receiver { struct homa_recvmsg_args control; /** @source: Address of the node that sent the current message. */ - sockaddr_in_union source; + union { + struct sockaddr sa; + struct sockaddr_in in4; + struct sockaddr_in6 in6; + } source; /** @length: Length of the current message, or < 0 if none. 
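On the receive side, the slimmed-down homa_recvmsg_args (peer_addr and _pad removed above; the sender's address now comes back only through msg_name) is used roughly as sketched below, assuming the HOMA_RECVMSG_REQUEST/HOMA_RECVMSG_RESPONSE flag bits defined in homa.h:

	#include <string.h>
	#include <sys/socket.h>
	#include "homa.h"

	static ssize_t receive_once(int fd)
	{
		struct homa_recvmsg_args args;
		struct sockaddr_in6 src;
		struct msghdr hdr;

		memset(&args, 0, sizeof(args));
		args.flags = HOMA_RECVMSG_REQUEST | HOMA_RECVMSG_RESPONSE;
		memset(&hdr, 0, sizeof(hdr));
		hdr.msg_name = &src;       /* sender's address returned here */
		hdr.msg_namelen = sizeof(src);
		hdr.msg_control = &args;
		hdr.msg_controllen = sizeof(args);
		return recvmsg(fd, &hdr, 0);
	}

	/* Passing the same args back on the next recvmsg returns the bpages
	 * in args.bpage_offsets[0..args.num_bpages-1] to Homa, as described
	 * in the man/recvmsg.2 changes below.
	 */
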
*/ ssize_t msg_length; diff --git a/man/homa_reply.3 b/man/homa_reply.3 index 4a898cb9..f5956ac7 100644 --- a/man/homa_reply.3 +++ b/man/homa_reply.3 @@ -1,4 +1,4 @@ -.TH HOMA_REPLY 3 2022-12-13 "Homa" "Linux Programmer's Manual" +.TH HOMA_REPLY 3 2024-11-11 "Homa" "Linux Programmer's Manual" .SH NAME homa_reply, homa_replyv \- send a Homa response message .SH SYNOPSIS @@ -46,23 +46,10 @@ descriptors at In either case the total message length must not exceed .BR HOMA_MAX_MESSAGE_LENGTH . The destination for the response is given by -.I dest_addr , -which can hold either an IPv4 or an IPv6 address: -.PP -.in +4n -.ps -1 -.vs -2 -.EX -typedef union sockaddr_in_union { - struct sockaddr sa; - struct sockaddr_in in4; - struct sockaddr_in6 in6; -} sockaddr_in_union; -.EE -.vs +2 -.ps +1 -.in -.PP +.IR dest_addr , +which can hold either an IPv4 or an IPv6 address. The length +of the address is given by +.IR addrlen . The argument .I id is an identifier previously returned by diff --git a/man/homa_send.3 b/man/homa_send.3 index 56f0bad3..0cc63409 100644 --- a/man/homa_send.3 +++ b/man/homa_send.3 @@ -1,4 +1,4 @@ -.TH HOMA_SEND 3 2022-12-13 "Homa" "Linux Programmer's Manual" +.TH HOMA_SEND 3 2024-11-11 "Homa" "Linux Programmer's Manual" .SH NAME homa_send, homa_sendv \- send a request message .SH SYNOPSIS @@ -6,13 +6,13 @@ homa_send, homa_sendv \- send a request message .B #include .PP .BI "int homa_send(int " sockfd ", const void *" message_buf ", size_t " length \ -", const sockaddr_in_union *" dest_addr ", -.BI " uint64_t *" id ", uint64_t " \ +", const struct sockaddr *" dest_addr ", +.BI " size_t " addrlen ", uint64_t *" id ", uint64_t " \ "completion_cookie" ); .PP .BI "int homa_sendv(int " sockfd ", const struct iovec *" iov ", size_t " \ -iovcnt ", const sockaddr_in_union *" dest_addr , -.BI " uint64_t *" id ", uint64_t " \ +iovcnt ", const sockaddr *" dest_addr , +.BI " size_t " addrlen ", uint64_t *" id ", uint64_t " \ "completion_cookie" ); .fi .SH DESCRIPTION @@ -45,22 +45,9 @@ In either case, the total message length must not exceed .BR HOMA_MAX_MESSAGE_LENGTH . The destination socket for the request is given by .IR dest_addr , -which can hold either an IPv4 or IPv6 address: -.PP -.in +4n -.ps -1 -.vs -2 -.EX -typedef union sockaddr_in_union { - struct sockaddr sa; - struct sockaddr_in in4; - struct sockaddr_in6 in6; -} sockaddr_in_union; -.EE -.vs +2 -.ps +1 -.in -.PP +which can hold either an IPv4 or IPv6 address. The length of +the address is given by +.IR addrlen . If .I id is not NULL, an identifier for the request is returned at diff --git a/man/recvmsg.2 b/man/recvmsg.2 index 57162de7..b8c63dee 100644 --- a/man/recvmsg.2 +++ b/man/recvmsg.2 @@ -1,4 +1,4 @@ -.TH RECVMSG 2 2024-7-16 "Homa" "Linux Programmer's Manual" +.TH RECVMSG 2 2024-11-11 "Homa" "Linux Programmer's Manual" .SH NAME recvmsg \- receive a Homa message .SH SYNOPSIS @@ -57,13 +57,13 @@ field must refer to a structure of the following type: .vs -2 .EX struct homa_recvmsg_args { - uint64_t id; /* If nonzero, specifies id of + uint64_t id; /* If nonzero, specifies id of * desired RPC. */ - uint64_t completion_cookie; /* Value from sendmsg for request. */ - int flags; /* OR-ed combination of bits. */ - uint32_t num_bpages; /* Number of valid entries in + uint64_t completion_cookie; /* Value from sendmsg for request. */ + int flags; /* OR-ed combination of bits. */ + uint32_t num_bpages; /* Number of valid entries in * bpage_offsets. */ - uint32_t bpage_offsets[HOMA_MAX_BPAGES] /* Tokens for buffer pages. 
*/ + uint32_t bpage_offsets[HOMA_MAX_BPAGES]; /* Tokens for buffer pages. */ }; .EE .vs +2 @@ -98,8 +98,8 @@ is nonzero, then the caller is interested in receiving a response for the RPC given by .B id. .IP \[bu] -Homa will use the structs to return information about the message received. -The +On a successful return Homa will use the structs to return information +about the message received. The .B id field will be set to the RPC identifier for the received message (if .B id @@ -111,7 +111,7 @@ is not NULL, then a .B sockaddr_in or .B sockaddr_in6 -will be stored at its target (depnding on the address family of +will be stored at its target (depending on the address family of .IR sockfd ), describing the sender of the message. For response messages, the @@ -209,7 +209,7 @@ is zero after an error, it means that the failure prevented a message from being received. If .B id -is zero, it means that a specific RPC has failed. +is nonzero, it means that a specific RPC has failed. For request messages, this can happen after errors such as .B EFAULT errors (the kernel couldn't write to user space to return @@ -226,7 +226,11 @@ or can occur if there was no server at the specified address, it couldn't be reached, or it timed out, respectively. .B ENOMEM -can also occur for responses. +can also occur for responses. If +.B id +is nonzero and even (i.e. the message that failed was a response) then the +.B completion_cookie +field will also be set. .PP After sucessfully receiving a message, an application has two responsibilities. First, it must eventually return the message's bpages to Homa as described diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 6893149a..8cb51836 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -477,14 +477,6 @@ TEST_F(homa_plumbing, homa_recvmsg__clear_cookie) 0, 0, &self->recvmsg_hdr.msg_namelen)); EXPECT_EQ(0, self->recvmsg_args.completion_cookie); } -TEST_F(homa_plumbing, homa_recvmsg__nonzero_pad) -{ - EXPECT_EQ(EAGAIN, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, &self->recvmsg_hdr.msg_namelen)); - self->recvmsg_args._pad[0] = 1; - EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, &self->recvmsg_hdr.msg_namelen)); -} TEST_F(homa_plumbing, homa_recvmsg__num_bpages_too_large) { self->recvmsg_args.num_bpages = HOMA_MAX_BPAGES + 1; @@ -557,8 +549,6 @@ TEST_F(homa_plumbing, homa_recvmsg__normal_completion_ipv4) EXPECT_EQ(AF_INET, self->addr.in4.sin_family); EXPECT_STREQ("1.2.3.4", homa_print_ipv4_addr( self->addr.in4.sin_addr.s_addr)); - EXPECT_STREQ("1.2.3.4", homa_print_ipv4_addr( - self->recvmsg_args.peer_addr.in4.sin_addr.s_addr)); EXPECT_EQ(sizeof32(struct sockaddr_in), self->recvmsg_hdr.msg_namelen); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); @@ -610,9 +600,9 @@ TEST_F(homa_plumbing, homa_recvmsg__rpc_has_error) &self->recvmsg_hdr.msg_namelen)); EXPECT_EQ(self->client_id, self->recvmsg_args.id); EXPECT_EQ(44444, self->recvmsg_args.completion_cookie); - EXPECT_EQ(AF_INET6, self->recvmsg_args.peer_addr.in6.sin6_family); + EXPECT_EQ(AF_INET6, self->addr.in6.sin6_family); EXPECT_STREQ("1.2.3.4", homa_print_ipv6_addr( - &self->recvmsg_args.peer_addr.in6.sin6_addr)); + &self->addr.in6.sin6_addr)); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); EXPECT_EQ(0, self->recvmsg_args.num_bpages); } diff --git a/util/buffer_server.c b/util/buffer_server.c index e98f8e0e..142ba4ae 100644 --- a/util/buffer_server.c +++ b/util/buffer_server.c @@ -21,6 +21,7 @@ #include 
#include "homa.h" +#include "test_utils.h" int main(int argc, char** argv) { int fd, port; diff --git a/util/cp_node.cc b/util/cp_node.cc index 36808d8a..4407fb11 100644 --- a/util/cp_node.cc +++ b/util/cp_node.cc @@ -661,9 +661,8 @@ int tcp_connection::read(bool loop, && (errno == ECONNRESET))) { /* Connection was closed by the client. */ snprintf(error_message, sizeof(error_message), - "TCP connection on port %d " - "(fd %d) closed by peer %s", - port, fd, print_address(&peer)); + "TCP connection on port %d (fd %d) closed by peer %s", + port, fd, print_address(&peer)); return 1; } @@ -1108,7 +1107,8 @@ void homa_server::server(int thread_id, server_metrics *metrics) num_vecs++; } result = homa_replyv(fd, vecs, num_vecs, receiver.src_addr(), - receiver.id()); + sockaddr_size(receiver.src_addr()), + receiver.id()); if (result < 0) { log(NORMAL, "FATAL: homa_reply failed for server " "port %d: %s\n", @@ -2056,7 +2056,8 @@ bool homa_client::wait_response(homa::receiver *receiver, uint64_t rpc_id) log(NORMAL, "FATAL: error in Homa recvmsg: %s (id %lu, " "server %s)\n", strerror(errno), rpc_id, - print_address(receiver->src_addr())); + print_address((union sockaddr_in_union *) + receiver->src_addr())); exit(1); } header = receiver->get(0); @@ -2131,10 +2132,14 @@ void homa_client::sender() vec[1].iov_base = sender_buffer + 20; vec[1].iov_len = header->length - 20; status = homa_sendv(fd, vec, 2, - &server_addrs[server], &rpc_id, 0); + &server_addrs[server].sa, + sockaddr_size(&server_addrs[server].sa), + &rpc_id, 0); } else status = homa_send(fd, sender_buffer, header->length, - &server_addrs[server], &rpc_id, 0); + &server_addrs[server].sa, + sockaddr_size(&server_addrs[server].sa), + &rpc_id, 0); if (status < 0) { log(NORMAL, "FATAL: error in homa_send: %s (request " "length %d)\n", strerror(errno), @@ -2196,8 +2201,8 @@ uint64_t homa_client::measure_rtt(int server, int length, char *buffer, header->cid = server_conns[server]; header->cid.client_port = id; start = rdtsc(); - status = homa_send(fd, buffer, header->length, - &server_addrs[server], &rpc_id, 0); + status = homa_send(fd, buffer, header->length, &server_addrs[server].sa, + sockaddr_size(&server_addrs[server].sa), &rpc_id, 0); if (status < 0) { log(NORMAL, "FATAL: error in homa_send: %s (request " "length %d)\n", strerror(errno), @@ -2211,7 +2216,8 @@ uint64_t homa_client::measure_rtt(int server, int length, char *buffer, log(NORMAL, "FATAL: measure_rtt got error in recvmsg: %s " "(id %lu, server %s)\n", strerror(errno), rpc_id, - print_address(receiver->src_addr())); + print_address((union sockaddr_in_union *) + receiver->src_addr())); exit(1); } return rdtsc() - start; diff --git a/util/homa_test.cc b/util/homa_test.cc index 8082b8c6..af9eb01f 100644 --- a/util/homa_test.cc +++ b/util/homa_test.cc @@ -81,7 +81,8 @@ void send_fd(int fd, const sockaddr_in_union *addr, char *request) int status; sleep(1); - status = homa_send(fd, request, length, addr, &id, 0); + status = homa_send(fd, request, length, &addr->sa, + sockaddr_size(&addr->sa), &id, 0); if (status < 0) { printf("Error in homa_send: %s\n", strerror(errno)); @@ -168,7 +169,8 @@ void test_fill_memory(int fd, const sockaddr_in_union *dest, char *request) uint64_t start = rdtsc(); for (int i = 1; i <= count; i++) { - status = homa_send(fd, request, length, dest, &id, 0); + status = homa_send(fd, request, length, &dest->sa, + sockaddr_size(&dest->sa), &id, 0); if (status < 0) { printf("Error in homa_send: %s\n", strerror(errno)); @@ -216,7 +218,8 @@ void test_invoke(int fd, 
const sockaddr_in_union *dest, char *request) int status; ssize_t resp_length; - status = homa_send(fd, request, length, dest, &id, 0); + status = homa_send(fd, request, length, &dest->sa, + sockaddr_size(&dest->sa), &id, 0); if (status < 0) { printf("Error in homa_send: %s\n", strerror(errno)); return; @@ -370,7 +373,8 @@ void test_rtt(int fd, const sockaddr_in_union *dest, char *request) for (int i = -10; i < count; i++) { start = rdtsc(); - status = homa_send(fd, request, length, dest, NULL, 0); + status = homa_send(fd, request, length, &dest->sa, + sockaddr_size(&dest->sa), NULL, 0); if (status < 0) { printf("Error in homa_send: %s\n", strerror(errno)); @@ -407,7 +411,8 @@ void test_send(int fd, const sockaddr_in_union *dest, char *request) uint64_t id; int status; - status = homa_send(fd, request, length, dest, &id, 0); + status = homa_send(fd, request, length, &dest->sa, + sockaddr_size(&dest->sa), &id, 0); if (status < 0) { printf("Error in homa_send: %s\n", strerror(errno)); @@ -509,7 +514,8 @@ void test_stream(int fd, const sockaddr_in_union *dest) seed_buffer(buffers[i]+2, length - 2*sizeof32(int), 1000*i); } for (i = 0; i < count; i++) { - status = homa_send(fd, buffers[i], length, dest, &id, 0); + status = homa_send(fd, buffers[i], length, &dest->sa, + sockaddr_size(&dest->sa), &id, 0); if (status < 0) { printf("Error in homa_send: %s\n", strerror(errno)); return; @@ -537,7 +543,8 @@ void test_stream(int fd, const sockaddr_in_union *dest) resp_length); response = (int *) (buf_region + recv_args.bpage_offsets[0]); status = homa_send(fd, buffers[(response[2]/1000) %count], - length, dest, &id, 0); + length, &dest->sa, sockaddr_size(&dest->sa), + &id, 0); if (status < 0) { printf("Error in homa_send: %s\n", strerror(errno)); return; diff --git a/util/send_raw.c b/util/send_raw.c index 41b6d8af..1ebfb8e2 100644 --- a/util/send_raw.c +++ b/util/send_raw.c @@ -18,6 +18,7 @@ #include #include "homa.h" +#include "test_utils.h" int main(int argc, char** argv) { int fd, status; diff --git a/util/server.cc b/util/server.cc index 1f402fab..fb034781 100644 --- a/util/server.cc +++ b/util/server.cc @@ -147,7 +147,8 @@ void homa_server(int port) resp_length -= vecs[num_vecs].iov_len; num_vecs++; } - result = homa_replyv(fd, vecs, num_vecs, &source, recv_args.id); + result = homa_replyv(fd, vecs, num_vecs, &source.sa, + sockaddr_size(&source.sa), recv_args.id); if (result < 0) { printf("homa_reply failed: %s\n", strerror(errno)); } diff --git a/util/test_utils.cc b/util/test_utils.cc index 444390d3..3a7415a9 100644 --- a/util/test_utils.cc +++ b/util/test_utils.cc @@ -226,7 +226,7 @@ void seed_buffer(void *buffer, size_t length, int seed) * strings, so callers don't have to worry about allocating space, even if * several addresses are in use at once. This function is also thread-safe. */ -const char *print_address(const sockaddr_in_union *addr) +const char *print_address(const union sockaddr_in_union *addr) { // Avoid cache line conflicts: diff --git a/util/test_utils.h b/util/test_utils.h index a9c9a7c8..2dc1b6df 100644 --- a/util/test_utils.h +++ b/util/test_utils.h @@ -19,6 +19,26 @@ extern "C" { #endif +/** + * Holds either an IPv4 or IPv6 address (smaller and easier to use than + * sockaddr_storage). + */ +union sockaddr_in_union { + struct sockaddr sa; + struct sockaddr_in in4; + struct sockaddr_in6 in6; +}; + +/** + * sockaddr_size() - Return the number of bytes used by the argument. + * @sa: Pointer to either an IPv4 or an IPv6 address. 
+ */ +static inline uint32_t sockaddr_size(const struct sockaddr *sa) +{ + return (sa->sa_family == AF_INET) ? sizeof(struct sockaddr_in) : + sizeof(struct sockaddr_in6); +} + #define sizeof32(type) static_cast(sizeof(type)) extern int check_buffer(void *buffer, size_t length); From b1f390f8c7dc363cd379c978175543be6ccb4cd5 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 11 Nov 2024 14:31:58 -0800 Subject: [PATCH 068/625] Strip wrapper function declarations from upstream code --- homa.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/homa.h b/homa.h index 8f5ae1ca..1ff26e2d 100644 --- a/homa.h +++ b/homa.h @@ -189,8 +189,8 @@ struct homa_set_buf_args { #define HOMAIOCABORT _IOWR(0x89, 0xe3, struct homa_abort_args) #define HOMAIOCFREEZE _IO(0x89, 0xef) -int homa_abortp(int fd, struct homa_abort_args *args); - +#if 1 /* See strip.py */ +int homa_abort(int sockfd, uint64_t id, int error); int homa_send(int sockfd, const void *message_buf, size_t length, const struct sockaddr *dest_addr, uint32_t addrlen, uint64_t *id, uint64_t completion_cookie); @@ -203,6 +203,7 @@ ssize_t homa_reply(int sockfd, const void *message_buf, ssize_t homa_replyv(int sockfd, const struct iovec *iov, int iovcnt, const struct sockaddr *dest_addr, uint32_t addrlen, uint64_t id); +#endif /* See strip.py */ #ifdef __cplusplus } From 8e7e7c29c5b753c94134dce9f5a4d201eb3b631c Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 12 Nov 2024 10:07:45 -0800 Subject: [PATCH 069/625] Clean up issues from checkpatch and kdoc --- homa.h | 12 ++++++++---- homa_impl.h | 20 +++++++++++--------- homa_incoming.c | 15 ++++++++------- homa_outgoing.c | 5 +++-- homa_peer.c | 10 +++++----- homa_peer.h | 6 +++--- homa_plumbing.c | 19 ++++++++++++------- homa_pool.c | 17 ++++++++++------- homa_pool.h | 10 +++++----- homa_rpc.c | 16 ++++++++++------ homa_rpc.h | 6 ++++-- homa_sock.h | 17 +++++++++-------- homa_stub.h | 5 +++-- homa_utils.c | 3 ++- homa_wire.h | 9 +++++---- 15 files changed, 98 insertions(+), 72 deletions(-) diff --git a/homa.h b/homa.h index 1ff26e2d..006f5f42 100644 --- a/homa.h +++ b/homa.h @@ -31,11 +31,11 @@ extern "C" * define HOMA_BPAGE_SIZE - Number of bytes in pages used for receive * buffers. Must be power of two. */ -#define HOMA_BPAGE_SHIFT 16 #define HOMA_BPAGE_SIZE (1 << HOMA_BPAGE_SHIFT) +#define HOMA_BPAGE_SHIFT 16 /** - * define HOMA_MAX_BPAGES: The largest number of bpages that will be required + * define HOMA_MAX_BPAGES - The largest number of bpages that will be required * to store an incoming message. */ #define HOMA_MAX_BPAGES ((HOMA_MAX_MESSAGE_LENGTH + HOMA_BPAGE_SIZE - 1) \ @@ -170,14 +170,18 @@ struct homa_set_buf_args { size_t length; }; +/* Meanings of the bits in Homa's flag word, which can be set using + * "sysctl /net/homa/flags". + */ + /** * Meanings of the bits in Homa's flag word, which can be set using * "sysctl /net/homa/flags". */ /** - * Disable the output throttling mechanism: always send all packets - * immediately. + * define HOMA_FLAG_DONT_THROTTLE - disable the output throttling mechanism: + * always send all packets immediately. */ #define HOMA_FLAG_DONT_THROTTLE 2 diff --git a/homa_impl.h b/homa_impl.h index c42bf887..e6492ed1 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -156,8 +156,9 @@ void homa_throttle_lock_slow(struct homa *homa); */ #define HOMA_MAX_GRANTS 10 -/* Holds either an IPv4 or IPv6 address (smaller and easier to use than - * sockaddr_storage). 
+/** + * union sockaddr_in_union - Holds either an IPv4 or IPv6 address (smaller + * and easier to use than sockaddr_storage). */ union sockaddr_in_union { struct sockaddr sa; @@ -745,8 +746,8 @@ struct homa { /* Bits that can be specified for gro_policy. These were created for * testing, in order to evaluate various possible policies; you almost * certainly should not use any value other than HOMA_GRO_NORMAL. - * HOMA_GRO_SAME_CORE If isolated packets arrive (not part of - * a batch) use the GRO core for SoftIRQ also. + * HOMA_GRO_SAME_CORE If isolated packets arrive (not part of a + * batch) use the GRO core for SoftIRQ also. * HOMA_GRO_IDLE Use old mechanism for selecting an idle * core for SoftIRQ (deprecated). * HOMA_GRO_NEXT Always use the next core in circular @@ -970,8 +971,8 @@ static inline bool skb_is_ipv6(const struct sk_buff *skb) } /** - * Given an IPv4 address, return an equivalent IPv6 address (an IPv4-mapped - * one) + * ipv4_to_ipv6() - Given an IPv4 address, return an equivalent IPv6 address + * (an IPv4-mapped one). * @ip4: IPv4 address, in network byte order. */ static inline struct in6_addr ipv4_to_ipv6(__be32 ip4) @@ -996,12 +997,13 @@ static inline __be32 ipv6_to_ipv4(const struct in6_addr ip6) } /** - * skb_canonical_ipv6_addr() - Convert a socket address to the "standard" + * canonical_ipv6_addr() - Convert a socket address to the "standard" * form used in Homa, which is always an IPv6 address; if the original address * was IPv4, convert it to an IPv4-mapped IPv6 address. * @addr: Address to canonicalize (if NULL, "any" is returned). */ -static inline struct in6_addr canonical_ipv6_addr(const union sockaddr_in_union *addr) +static inline struct in6_addr canonical_ipv6_addr(const union sockaddr_in_union + *addr) { if (addr) { return (addr->sa.sa_family == AF_INET6) @@ -1046,6 +1048,7 @@ static inline bool is_homa_pkt(struct sk_buff *skb) (tcp_hdr(skb)->urg_ptr == htons(HOMA_TCP_URGENT)))); } +#if 1 /* See strip.py --alt */ /** * tt_addr() - Given an address, return a 4-byte id that will (hopefully) * provide a unique identifier for the address in a timetrace record. @@ -1058,7 +1061,6 @@ static inline uint32_t tt_addr(const struct in6_addr x) : ntohl(x.in6_u.u6_addr32[1])); } -#if 1 /* See strip.py --alt */ #ifdef __UNIT_TEST__ void unit_log_printf(const char *separator, const char *format, ...) __printf(2, 3); diff --git a/homa_incoming.c b/homa_incoming.c index a8f17d85..5cd361f7 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -125,7 +125,8 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) if (start > rpc->msgin.recv_end) { /* Packet creates a new gap. 
*/ - if (!homa_gap_new(&rpc->msgin.gaps, rpc->msgin.recv_end, start)) { + if (!homa_gap_new(&rpc->msgin.gaps, + rpc->msgin.recv_end, start)) { pr_err("Homa couldn't allocate gap: insufficient memory\n"); tt_record2("Couldn't allocate gap for id %d (start %d): no memory", rpc->id, start); @@ -302,9 +303,8 @@ int homa_copy_to_user(struct homa_rpc *rpc) if (error) goto free_skbs; error = skb_copy_datagram_iter(skbs[i], - sizeof(*h) + copied, - &iter, - chunk_size); + sizeof(*h) + copied, &iter, + chunk_size); if (error) goto free_skbs; copied += chunk_size; @@ -339,7 +339,8 @@ int homa_copy_to_user(struct homa_rpc *rpc) n = 0; atomic_or(APP_NEEDS_LOCK, &rpc->flags); homa_rpc_lock(rpc, "homa_copy_to_user"); - atomic_andnot(APP_NEEDS_LOCK | RPC_COPYING_TO_USER, &rpc->flags); + atomic_andnot(APP_NEEDS_LOCK | RPC_COPYING_TO_USER, + &rpc->flags); if (error) break; } @@ -439,8 +440,8 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) } } else { rpc = homa_find_server_rpc(hsk, &saddr, - ntohs(h->common.sport), - id); + ntohs(h->common.sport), + id); } } else { rpc = homa_find_client_rpc(hsk, id); diff --git a/homa_outgoing.c b/homa_outgoing.c index 9d8ea657..25fe3d8a 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -42,6 +42,7 @@ void homa_message_out_init(struct homa_rpc *rpc, int length) * homa_skb_info has been filled in with the packet geometry. * @iter: Describes location(s) of (remaining) message data in user * space. + * Return: Either a negative errno or 0 (for success). */ int homa_fill_data_interleaved(struct homa_rpc *rpc, struct sk_buff *skb, struct iov_iter *iter) @@ -935,8 +936,8 @@ void homa_pacer_xmit(struct homa *homa) */ if (!*rpc->msgout.next_xmit || (rpc->msgout.next_xmit_offset >= rpc->msgout.granted)) { - /* Nothing more to transmit from this message (right now), - * so remove it from the throttled list. + /* Nothing more to transmit from this message (right + * now), so remove it from the throttled list. */ homa_throttle_lock(homa); if (!list_empty(&rpc->throttled_links)) { diff --git a/homa_peer.c b/homa_peer.c index 7293409b..77cf4715 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -118,9 +118,9 @@ struct homa_peer **homa_peertab_get_peers(struct homa_peertab *peertab, void homa_peertab_gc_dsts(struct homa_peertab *peertab, __u64 now) { while (!list_empty(&peertab->dead_dsts)) { - struct homa_dead_dst *dead = list_first_entry(&peertab->dead_dsts, - struct homa_dead_dst, - dst_links); + struct homa_dead_dst *dead = + list_first_entry(&peertab->dead_dsts, + struct homa_dead_dst, dst_links); if (dead->gc_time > now) break; dst_release(dead->dst); @@ -299,8 +299,8 @@ struct dst_entry *homa_peer_get_dst(struct homa_peer *peer, flowi4_init_output(&peer->flow.u.ip4, inet->sk.sk_bound_dev_if, inet->sk.sk_mark, inet->tos, RT_SCOPE_UNIVERSE, inet->sk.sk_protocol, 0, - peer->addr.in6_u.u6_addr32[3], inet->inet_saddr, - 0, 0, inet->sk.sk_uid); + peer->addr.in6_u.u6_addr32[3], + inet->inet_saddr, 0, 0, inet->sk.sk_uid); security_sk_classify_flow(&inet->sk, &peer->flow.u.__fl_common); rt = ip_route_output_flow(sock_net(&inet->sk), &peer->flow.u.ip4, &inet->sk); diff --git a/homa_peer.h b/homa_peer.h index babf1a56..da2d40fb 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -33,7 +33,7 @@ struct homa_dead_dst { }; /** - * define HOMA_PEERTAB_BUCKETS - Number of bits in the bucket index for a + * define HOMA_PEERTAB_BUCKET_BITS - Number of bits in the bucket index for a * homa_peertab. 
Should be large enough to hold an entry for every server * in a datacenter without long hash chains. */ @@ -110,13 +110,13 @@ struct homa_peer { __be16 cutoff_version; /** - * last_update_jiffies: time in jiffies when we sent the most + * @last_update_jiffies: time in jiffies when we sent the most * recent CUTOFFS packet to this peer. */ unsigned long last_update_jiffies; /** - * grantable_rpcs: Contains all homa_rpcs (both requests and + * @grantable_rpcs: Contains all homa_rpcs (both requests and * responses) involving this peer whose msgins require (or required * them in the past) and have not been fully received. The list is * sorted in priority order (head has fewest bytes_remaining). diff --git a/homa_plumbing.c b/homa_plumbing.c index 24963778..1a890e75 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -147,7 +147,8 @@ static struct proto homav6_prot = { /* IPv6 data comes *after* Homa's data, and isn't included in * struct homa_sock. */ - .obj_size = sizeof(struct homa_sock) + sizeof(struct ipv6_pinfo), + .obj_size = sizeof(struct homa_sock) + + sizeof(struct ipv6_pinfo), .no_autobind = 1, }; @@ -847,11 +848,13 @@ int homa_setsockopt(struct sock *sk, int level, int optname, /* Do a trivial test to make sure we can at least write the first * page of the region. */ - if (copy_to_user((void __user *)args.start, &args, sizeof(args))) + if (copy_to_user((__force void __user *)args.start, &args, + sizeof(args))) return -EFAULT; homa_sock_lock(hsk, "homa_setsockopt SO_HOMA_SET_BUF"); - ret = homa_pool_init(hsk, args.start, args.length); + ret = homa_pool_init(hsk, (__force void __user *)args.start, + args.length); homa_sock_unlock(hsk); INC_METRIC(so_set_buf_calls, 1); INC_METRIC(so_set_buf_ns, sched_clock() - start); @@ -891,7 +894,8 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) __u64 finish; int result = 0; struct homa_rpc *rpc = NULL; - union sockaddr_in_union *addr = (union sockaddr_in_union *)msg->msg_name; + union sockaddr_in_union *addr = (union sockaddr_in_union *) + msg->msg_name; per_cpu(homa_offload_core, raw_smp_processor_id()).last_app_active = start; @@ -1133,8 +1137,8 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, homa_rpc_unlock(rpc); done: - if (unlikely(copy_to_user((void __user *)msg->msg_control, &control, - sizeof(control)))) { + if (unlikely(copy_to_user((__force void __user *)msg->msg_control, + &control, sizeof(control)))) { /* Note: in this case the message's buffers will be leaked. */ pr_notice("%s couldn't copy back args\n", __func__); result = -EFAULT; @@ -1662,7 +1666,8 @@ int homa_timer_main(void *transport) while (1) { set_current_state(TASK_UNINTERRUPTIBLE); if (!exiting) { - hrtimer_start(&hrtimer, tick_interval, HRTIMER_MODE_REL); + hrtimer_start(&hrtimer, tick_interval, + HRTIMER_MODE_REL); schedule(); } __set_current_state(TASK_RUNNING); diff --git a/homa_pool.c b/homa_pool.c index 89d313df..b3f417ce 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -48,7 +48,8 @@ static void set_bpages_needed(struct homa_pool *pool) * @region_size: Total number of bytes available at @buf_region. * Return: Either zero (for success) or a negative errno for failure. 
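The region handed to homa_pool_init() originates in user space: an application mmaps a buffer area and registers it with the SO_HOMA_SET_BUF socket option, which homa_setsockopt() forwards here. A user-space sketch of that registration (the region size is arbitrary; assumes SO_HOMA_SET_BUF, struct homa_set_buf_args, and HOMA_BPAGE_SIZE from homa.h, and that the option lives at level IPPROTO_HOMA):

	#include <sys/mman.h>
	#include <sys/socket.h>
	#include "homa.h"

	static int set_buffers(int fd)
	{
		struct homa_set_buf_args args;

		args.start = mmap(NULL, 64 * HOMA_BPAGE_SIZE,
				  PROT_READ | PROT_WRITE,
				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (args.start == MAP_FAILED)
			return -1;
		args.length = 64 * HOMA_BPAGE_SIZE;
		return setsockopt(fd, IPPROTO_HOMA, SO_HOMA_SET_BUF,
				  &args, sizeof(args));
	}
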
*/ -int homa_pool_init(struct homa_sock *hsk, void *region, __u64 region_size) +int homa_pool_init(struct homa_sock *hsk, void __user *region, + __u64 region_size) { struct homa_pool *pool = hsk->buffer_pool; int i, result; @@ -65,7 +66,8 @@ int homa_pool_init(struct homa_sock *hsk, void *region, __u64 region_size) goto error; } pool->descriptors = kmalloc_array(pool->num_bpages, - sizeof(struct homa_bpage), GFP_ATOMIC); + sizeof(struct homa_bpage), + GFP_ATOMIC); if (!pool->descriptors) { result = -ENOMEM; goto error; @@ -196,13 +198,13 @@ int homa_pool_get_pages(struct homa_pool *pool, int num_pages, __u32 *pages, */ ref_count = atomic_read(&bpage->refs); if (ref_count >= 2 || (ref_count == 1 && (bpage->owner < 0 || - bpage->expiration > now))) + bpage->expiration > now))) continue; if (!spin_trylock_bh(&bpage->lock)) continue; ref_count = atomic_read(&bpage->refs); if (ref_count >= 2 || (ref_count == 1 && (bpage->owner < 0 || - bpage->expiration > now))) { + bpage->expiration > now))) { spin_unlock_bh(&bpage->lock); continue; } @@ -252,7 +254,8 @@ int homa_pool_allocate(struct homa_rpc *rpc) if (homa_pool_get_pages(pool, full_pages, pages, 0) != 0) goto out_of_space; for (i = 0; i < full_pages; i++) - rpc->msgin.bpage_offsets[i] = pages[i] << HOMA_BPAGE_SHIFT; + rpc->msgin.bpage_offsets[i] = pages[i] << + HOMA_BPAGE_SHIFT; } rpc->msgin.num_bpages = full_pages; @@ -365,8 +368,8 @@ void *homa_pool_get_buffer(struct homa_rpc *rpc, int offset, int *available) *available = (bpage_index < (rpc->msgin.num_bpages - 1)) ? HOMA_BPAGE_SIZE - bpage_offset : rpc->msgin.length - offset; - return rpc->hsk->buffer_pool->region + rpc->msgin.bpage_offsets[bpage_index] - + bpage_offset; + return rpc->hsk->buffer_pool->region + + rpc->msgin.bpage_offsets[bpage_index] + bpage_offset; } /** diff --git a/homa_pool.h b/homa_pool.h index 0ac08d49..6f41ecbf 100644 --- a/homa_pool.h +++ b/homa_pool.h @@ -126,9 +126,9 @@ struct homa_pool { atomic_t free_bpages; /** - * The number of free bpages required to satisfy the needs of the - * first RPC on @hsk->waiting_for_bufs, or INT_MAX if that queue - * is empty. + * @bpages_needed: the number of free bpages required to satisfy the + * needs of the first RPC on @hsk->waiting_for_bufs, or INT_MAX if + * that queue is empty. */ int bpages_needed; @@ -148,8 +148,8 @@ struct homa_pool { int homa_pool_allocate(struct homa_rpc *rpc); void homa_pool_check_waiting(struct homa_pool *pool); void homa_pool_destroy(struct homa_pool *pool); -void *homa_pool_get_buffer(struct homa_rpc *rpc, int offset, - int *available); +void __user *homa_pool_get_buffer(struct homa_rpc *rpc, int offset, + int *available); int homa_pool_get_pages(struct homa_pool *pool, int num_pages, __u32 *pages, int leave_locked); int homa_pool_init(struct homa_sock *hsk, void *buf_region, diff --git a/homa_rpc.c b/homa_rpc.c index 9f938186..014c9c8d 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -206,7 +206,8 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, * to the RPC, but can sometimes be used to avoid a socket lookup. * @saddr: Source address from which the act was received (the client * note for the RPC) - * @ack: Information about an RPC from @saddr that may now be deleted safely. + * @ack: Information about an RPC from @saddr that may now be deleted + * safely. 
*/ void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, struct homa_ack *ack) @@ -295,7 +296,8 @@ void homa_rpc_free(struct homa_rpc *rpc) if (rpc->msgin.length >= 0) { rpc->hsk->dead_skbs += skb_queue_len(&rpc->msgin.packets); while (1) { - struct homa_gap *gap = list_first_entry_or_null(&rpc->msgin.gaps, + struct homa_gap *gap = list_first_entry_or_null( + &rpc->msgin.gaps, struct homa_gap, links); if (!gap) break; @@ -386,8 +388,8 @@ int homa_rpc_reap(struct homa_sock *hsk, int count) if (rpc->msgout.length >= 0) { while (rpc->msgout.packets) { skbs[num_skbs] = rpc->msgout.packets; - rpc->msgout.packets = homa_get_skb_info(rpc - ->msgout.packets)->next_skb; + rpc->msgout.packets = homa_get_skb_info( + rpc->msgout.packets)->next_skb; num_skbs++; rpc->msgout.num_skbs--; if (num_skbs >= batch_size) @@ -448,8 +450,10 @@ int homa_rpc_reap(struct homa_sock *hsk, int count) rpc->msgin.bpage_offsets); if (rpc->msgin.length >= 0) { while (1) { - struct homa_gap *gap = list_first_entry_or_null(&rpc - ->msgin.gaps, + struct homa_gap *gap; + + gap = list_first_entry_or_null( + &rpc->msgin.gaps, struct homa_gap, links); if (!gap) break; diff --git a/homa_rpc.h b/homa_rpc.h index 8eebe08a..ca8b81e2 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -185,7 +185,8 @@ struct homa_message_in { */ __u32 num_bpages; - /** @bpage_offsets: Describes buffer space allocated for this message. + /** + * @bpage_offsets: Describes buffer space allocated for this message. * Each entry is an offset from the start of the buffer region. * All but the last pointer refer to areas of size HOMA_BPAGE_SIZE. */ @@ -201,7 +202,8 @@ struct homa_rpc { /** @hsk: Socket that owns the RPC. */ struct homa_sock *hsk; - /** @bucket: Pointer to the bucket in hsk->client_rpc_buckets or + /** + * @bucket: Pointer to the bucket in hsk->client_rpc_buckets or * hsk->server_rpc_buckets where this RPC is linked. Used primarily * for locking the RPC (which is done by locking its bucket). */ diff --git a/homa_sock.h b/homa_sock.h index 5c832116..b7d1041a 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -26,7 +26,7 @@ void homa_sock_lock_slow(struct homa_sock *hsk); */ struct homa_socktab { /** - * @mutex: Controls all modifications to this object; not needed + * @write_lock: Controls all modifications to this object; not needed * for socket lookups (RCU is used instead). Also used to * synchronize port allocation. */ @@ -101,7 +101,8 @@ struct homa_rpc_bucket { /** @rpcs: list of RPCs that hash to this bucket. */ struct hlist_head rpcs; - /** @id: identifier for this bucket, used in error messages etc. + /** + * @id: identifier for this bucket, used in error messages etc. * It's the index of the bucket within its hash table bucket * array, with an additional offset to separate server and * client RPCs. @@ -184,7 +185,7 @@ struct homa_sock { int ip_header_length; /** - * @client_socktab_links: Links this socket into the homa_socktab + * @socktab_links: Links this socket into the homa_socktab * based on @port. */ struct homa_socktab_links socktab_links; @@ -309,7 +310,7 @@ static inline void homa_sock_unlock(struct homa_sock *hsk) } /** - * port_hash() - Hash function for port numbers. + * homa_port_hash() - Hash function for port numbers. * @port: Port number being looked up. * * Return: The index of the bucket in which this port will be found (if @@ -332,8 +333,8 @@ static inline int homa_port_hash(__u16 port) * * Return: The bucket in which this RPC will appear, if the RPC exists. 
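The bucket-lookup helpers reindented above lean on the property called out in their comments: RPC ids are assigned sequentially (with the low bit distinguishing client from server ids), so a power-of-two mask is all the hashing needed. An illustrative reduction of the idea (this helper and its parameter are hypothetical, not part of the patch):

	static inline int example_bucket_index(__u64 id, int num_buckets)
	{
		/* num_buckets is assumed to be a power of two; dropping
		 * the client/server bit and masking spreads sequential
		 * ids evenly across the buckets.
		 */
		return (id >> 1) & (num_buckets - 1);
	}
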
*/ -static inline struct homa_rpc_bucket *homa_client_rpc_bucket(struct homa_sock *hsk, - __u64 id) +static inline struct homa_rpc_bucket *homa_client_rpc_bucket( + struct homa_sock *hsk, __u64 id) { /* We can use a really simple hash function here because RPC ids * are allocated sequentially. @@ -350,8 +351,8 @@ static inline struct homa_rpc_bucket *homa_client_rpc_bucket(struct homa_sock *h * * Return: The bucket in which this RPC will appear, if the RPC exists. */ -static inline struct homa_rpc_bucket *homa_server_rpc_bucket(struct homa_sock *hsk, - __u64 id) +static inline struct homa_rpc_bucket *homa_server_rpc_bucket( + struct homa_sock *hsk, __u64 id) { /* Each client allocates RPC ids sequentially, so they will * naturally distribute themselves across the hash space. diff --git a/homa_stub.h b/homa_stub.h index fb2ba419..7b47b63e 100644 --- a/homa_stub.h +++ b/homa_stub.h @@ -22,8 +22,9 @@ static inline int homa_skb_append_from_iter(struct homa *homa, return 0; } -static inline int homa_skb_append_to_frag(struct homa *homa, struct sk_buff *skb, - void *buf, int length) +static inline int homa_skb_append_to_frag(struct homa *homa, + struct sk_buff *skb, void *buf, + int length) { char *dst = skb_put(skb, length); diff --git a/homa_utils.c b/homa_utils.c index da0273b9..d737f047 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -58,7 +58,8 @@ int homa_init(struct homa *homa) homa->next_client_port = HOMA_MIN_DEFAULT_PORT; homa->port_map = kmalloc(sizeof(*homa->port_map), GFP_KERNEL); if (!homa->port_map) { - pr_err("%s couldn't create port_map: kmalloc failure", __func__); + pr_err("%s couldn't create port_map: kmalloc failure", + __func__); return -ENOMEM; } homa_socktab_init(homa->port_map); diff --git a/homa_wire.h b/homa_wire.h index d68bc8e9..13e5d708 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -104,8 +104,7 @@ struct common_header { */ __be32 sequence; - /** - * The fields below correspond to the acknowledgment field in TCP + /* The fields below correspond to the acknowledgment field in TCP * headers; not used by Homa, except for the low-order 8 bits, which * specify the Homa packet type (one of the values in the * homa_packet_type enum). @@ -173,7 +172,7 @@ struct common_header { */ struct homa_ack { /** - * @id: The client's identifier for the RPC. 0 means this ack + * @client_id: The client's identifier for the RPC. 0 means this ack * is invalid. */ __be64 client_id; @@ -295,6 +294,7 @@ _Static_assert(((sizeof(struct data_header) - sizeof(struct seg_header)) & 0x3) * homa_data_len() - Returns the total number of bytes in a DATA packet * after the data_header. Note: if the packet is a GSO packet, the result * may include metadata as well as packet data. + * @skb: Incoming data packet */ static inline int homa_data_len(struct sk_buff *skb) { @@ -466,10 +466,11 @@ struct ack_header { /** @common: Fields common to all packet types. */ struct common_header common; - /** @num_acks: number of (leading) elements in @acks that are valid. */ + /** @num_acks: Number of (leading) elements in @acks that are valid. */ __be16 num_acks; #define HOMA_MAX_ACKS_PER_PKT 5 + /** @acks: Info about RPCs that are no longer active. 
*/ struct homa_ack acks[HOMA_MAX_ACKS_PER_PKT]; } __packed; _Static_assert(sizeof(struct ack_header) <= HOMA_MAX_HEADER, From 4ff8b00662a60ccc927646dcf819aabf33a9a8f7 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 12 Nov 2024 10:20:23 -0800 Subject: [PATCH 070/625] Fix do_div type errors found by test robot --- homa_outgoing.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/homa_outgoing.c b/homa_outgoing.c index 25fe3d8a..70115dc9 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -210,9 +210,10 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) */ int mtu, max_seg_data, max_gso_data; - int overlap_xmit, segs_per_gso; struct sk_buff **last_link; struct dst_entry *dst; + uint64_t segs_per_gso; + int overlap_xmit; /* Bytes of the message that haven't yet been copied into skbs. */ int bytes_left; From 3feb2edf4076b32aec4eda5788aa066a50b37951 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 13 Nov 2024 13:14:52 -0800 Subject: [PATCH 071/625] Fix minor comment error --- homa.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/homa.h b/homa.h index 006f5f42..4a17f8ea 100644 --- a/homa.h +++ b/homa.h @@ -174,11 +174,6 @@ struct homa_set_buf_args { * "sysctl /net/homa/flags". */ -/** - * Meanings of the bits in Homa's flag word, which can be set using - * "sysctl /net/homa/flags". - */ - /** * define HOMA_FLAG_DONT_THROTTLE - disable the output throttling mechanism: * always send all packets immediately. From abb5317dbc25f860d3003c1ba8a858bf5a2bf7ad Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 20 Nov 2024 11:02:32 -0800 Subject: [PATCH 072/625] Restore "externs" incorrectly dropped in 89b1e9963b --- timetrace.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/timetrace.c b/timetrace.c index c4165936..3fb8eb28 100644 --- a/timetrace.c +++ b/timetrace.c @@ -10,25 +10,25 @@ //#define TT_KERNEL 1 #endif /* __UNIT_TEST__ */ #ifdef TT_KERNEL -struct tt_buffer *tt_linux_buffers[]; -void (*tt_linux_freeze)(void); -atomic_t *tt_linux_freeze_count; -atomic_t tt_linux_freeze_no_homa; -int *tt_linux_homa_temp; -int tt_linux_homa_temp_default[16]; -void (*tt_linux_inc_metrics)(int metric, __u64 count); -void (*tt_linux_record)(struct tt_buffer *buffer, __u64 timestamp, +extern struct tt_buffer *tt_linux_buffers[]; +extern void (*tt_linux_freeze)(void); +extern atomic_t *tt_linux_freeze_count; +extern atomic_t tt_linux_freeze_no_homa; +extern int *tt_linux_homa_temp; +extern int tt_linux_homa_temp_default[16]; +extern void (*tt_linux_inc_metrics)(int metric, __u64 count); +extern void (*tt_linux_record)(struct tt_buffer *buffer, __u64 timestamp, const char *format, __u32 arg0, __u32 arg1, __u32 arg2, __u32 arg3); -void tt_linux_skip_metrics(int metric, __u64 count); -void (*tt_linux_printk)(void); -void (*tt_linux_dbg1)(char *msg, ...); -void (*tt_linux_dbg2)(char *msg, ...); -void (*tt_linux_dbg3)(char *msg, ...); -void tt_linux_nop(void); -void homa_trace(__u64 u0, __u64 u1, int i0, int i1); - -void ltt_record_nop(struct tt_buffer *buffer, __u64 timestamp, +extern void tt_linux_skip_metrics(int metric, __u64 count); +extern void (*tt_linux_printk)(void); +extern void (*tt_linux_dbg1)(char *msg, ...); +extern void (*tt_linux_dbg2)(char *msg, ...); +extern void (*tt_linux_dbg3)(char *msg, ...); +extern void tt_linux_nop(void); +extern void homa_trace(__u64 u0, __u64 u1, int i0, int i1); + +extern void ltt_record_nop(struct tt_buffer *buffer, __u64 timestamp, const 
char *format, __u32 arg0, __u32 arg1, __u32 arg2, __u32 arg3); #endif From 565f4cb826677d349217c1f02f2111d826e0fa4a Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 20 Nov 2024 21:54:27 -0800 Subject: [PATCH 073/625] Include netdev_queue address in txpkts output --- util/tthoma.py | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index 36aa161e..d33cddd9 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -171,6 +171,7 @@ def __missing__(self, id): # present if xmit is present) # tx_core: Core on which ip*xmit was invoked # tx_qid: NIC channel on which packet was transmitted +# tx_queue: Hex address of queue corresponding to tx_qid, if known # rx_node: Name of node on which packet was received # gro_core: Core on which homa_gro_receive was invoked # softirq_core: Core on which SoftIRQ processed the packet @@ -234,7 +235,7 @@ def __missing__(self, key): # but not yet returned via the tx completion queue, as of the # end of the interval # tx_qdisc: Bytes of data that have been passed to ip*xmit but not -# yet transmitted, as of the end of the interval (large +# yet passed to the NIC, as of the end of the interval (large # numbers probably due to qdisc) # tx_q: Estimate of the number of unsent bytes in the NIC (based # on when packets passed to the NIC if available, otherwise @@ -1130,25 +1131,28 @@ def __mlx_data(self, trace, time, core, match, interests): peer = match.group(1) id = int(match.group(2)) offset = int(match.group(3)) + tx_queue = match.group(4) for interest in interests: - interest.tt_mlx_data(trace, time, core, peer, id, offset) + interest.tt_mlx_data(trace, time, core, peer, id, offset, tx_queue) patterns.append({ 'name': 'mlx_data', 'regexp': 'mlx sent homa data packet to ([^,]+), id ([0-9]+), ' - 'offset ([0-9]+)' + 'offset ([0-9]+), queue (0x[0-9a-f]+)' }) def __mlx_grant(self, trace, time, core, match, interests): peer = match.group(1) id = int(match.group(2)) offset = int(match.group(3)) + tx_queue = match.group(4) for interest in interests: - interest.tt_mlx_grant(trace, time, core, peer, id, offset) + interest.tt_mlx_grant(trace, time, core, peer, id, offset, tx_queue) patterns.append({ 'name': 'mlx_grant', - 'regexp': 'mlx sent homa grant to ([^,]+), id ([0-9]+), offset ([0-9]+)' + 'regexp': 'mlx sent homa grant to ([^,]+), id ([0-9]+), ' + 'offset ([0-9]+), queue (0x[0-9a-f]+)' }) def __free_tx_skb(self, trace, time, core, match, interests): @@ -5338,12 +5342,13 @@ def tt_ip_xmit(self, trace, t, core, id, offset): p['tx_core'] = core rpcs[id]['send_data_pkts'].append(p) - def tt_mlx_data(self, trace, t, core, peer, id, offset): + def tt_mlx_data(self, trace, t, core, peer, id, offset, tx_queue): global packets p = packets[pkt_id(id, offset)] if not 'retransmits' in p: p['nic'] = t p['tx_node'] = trace['node'] + p['tx_queue'] = tx_queue def tt_free_tx_skb(self, trace, t, core, id, offset, qid, msg_length): global packets @@ -5421,11 +5426,12 @@ def tt_send_grant(self, trace, t, core, id, offset, priority, increment): g['tx_node'] = trace['node'] g['increment'] = increment - def tt_mlx_grant(self, trace, t, core, peer, id, offset): + def tt_mlx_grant(self, trace, t, core, peer, id, offset, tx_queue): global grants g = grants[pkt_id(id, offset)] g['nic'] = t g['tx_node'] = trace['node'] + g['tx_queue'] = tx_queue def tt_gro_grant(self, trace, t, core, peer, id, offset, priority): global grants @@ -7168,7 +7174,7 @@ def output(self): f.write('# Pkts: Packets transmitted 
during the interval\n')
         f.write('# QDisc: KB of data that have been passed to ip*xmit '
                 'but not yet\n')
-        f.write('# transmitted by NIC, as of the end of the '
+        f.write('# passed to the NIC, as of the end of the '
                 'interval\n')
         f.write('# NicKB: KB of data passed to NIC during the interval\n')
         f.write('# NQEst: Estimate of NIC queue length at the end '
@@ -7295,6 +7301,7 @@ def output(self):
         print('Summary statistics on delays related to outgoing packets:')
         print('Node: Name of node')
         print('Qid: Identifier of transmit queue')
+        print('TxQueue: Address of netdev_queue struct for Qid')
         print('Tsos: Total number of TSO frames transmitted by node '
                 'or queue')
         print('Segs: Total number of segments (packets received by GRO) '
@@ -7379,6 +7386,9 @@ def output(self):
         # longer than options.threshold in the NIC.
         qid_slow_bytes = defaultdict(lambda: 0)

+        # Tx queue number -> hex address of netdev_queue
+        qid_tx_queue = defaultdict(lambda: '')
+
         total_pkts = 0

         f = open('%s/txpkts_%s.dat' % (options.data, node), 'w')
@@ -7420,6 +7430,8 @@ def output(self):
             qid_segs[qid] += segs
             if 'pacer' in pkt:
                 qid_pacer_segs[qid] += segs
+            if 'tx_queue' in pkt:
+                qid_tx_queue[qid] = pkt['tx_queue']
             qid_string = str(qid)
             if (options.tx_qid != None) and (qid != options.tx_qid):
                 continue
@@ -7484,9 +7496,11 @@ def print_type(delays):
             if not first_node:
                 q_details += '\n'
             q_details += 'Transmit queues for %s\n' % (node)
-            q_details += 'Qid Tsos Segs PSegs Backlog BFrac NicP10 NicP50 NicP90 '
+            q_details += 'Qid TxQueue Tsos Segs PSegs Backlog BFrac '
+            q_details += 'NicP10 NicP50 NicP90 '
             q_details += 'GroP10 GroP50 GroP90 FreP10 FreP50 FreP90\n'
-            q_details += '------------------------------------------------------------'
+            q_details += '-------------------------------------------------'
+            q_details += '----------------------'
             q_details += '------------------------------------------\n'
             first_node = False
             totals = defaultdict(list)
@@ -7495,8 +7509,9 @@ def print_type(delays):
                 q_delays = delays[qid]
                 for type, d in q_delays.items():
                     totals[type].extend(d)
-                q_details += '%4d %5d %5d %5d %6.1f %5.2f %s %s %s\n' % (
-                    qid, qid_tsos[qid], qid_segs[qid], qid_pacer_segs[qid],
+                q_details += '%4d %10s %5d %5d %5d %6.1f %5.2f %s %s %s\n' % (
+                    qid, qid_tx_queue[qid], qid_tsos[qid], qid_segs[qid],
+                    qid_pacer_segs[qid],
                     1e-3*qid_backlog[qid]/total_time,
                     qid_slow_bytes[qid]/qid_total_bytes[qid],
                     print_type(q_delays['nic']),
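As background for the patch above: tthoma.py's updated 'mlx_data' and
'mlx_grant' regexps expect the driver's timetrace lines to end with the hex
address of the netdev_queue, which is what populates the new TxQueue column.
A minimal standalone sketch of that matching step; the sample trace line and
variable names below are hypothetical illustrations, not taken from tthoma.py:

    import re

    # Same pattern shape as the updated 'mlx_data' entry in tthoma.py:
    # peer, RPC id, offset, then the hex address of the netdev_queue.
    mlx_data = re.compile('mlx sent homa data packet to ([^,]+), id ([0-9]+), '
                          'offset ([0-9]+), queue (0x[0-9a-f]+)')

    # Hypothetical timetrace message of the form the pattern expects.
    line = ('mlx sent homa data packet to node2, id 4066, offset 9000, '
            'queue 0xffff93b44d3e5000')
    match = mlx_data.match(line)
    if match:
        peer, rpc_id = match.group(1), int(match.group(2))
        # group(4) is the tx_queue field recorded with each packet.
        offset, tx_queue = int(match.group(3)), match.group(4)

Capturing the queue address lets the txpkts output correlate each qid with
the kernel's netdev_queue struct when diagnosing transmit backlogs.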
Strip "#if LINUX_VERSION" code --- homa_plumbing.c | 2 ++ homa_rpc.c | 2 ++ util/strip.py | 61 ++++++++++++++++++++++++++++++++++--------------- 3 files changed, 47 insertions(+), 18 deletions(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index 1a890e75..8f37f312 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1525,8 +1525,10 @@ int homa_dointvec(const struct ctl_table *table, int write, if (action == 2) homa_rpc_log_active(homa, 0); else if (action == 3) { +#if 1 /* See strip.py */ tt_record("Freezing because of sysctl"); tt_freeze(); +#endif /* See strip.py */ } else if (action == 4) homa_log_throttled(homa); else if (action == 5) diff --git a/homa_rpc.c b/homa_rpc.c index 014c9c8d..357bf23c 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -606,6 +606,7 @@ void homa_rpc_log_active(struct homa *homa, uint64_t id) */ void homa_rpc_log_tt(struct homa_rpc *rpc) { +#if 1 /* See strip.py */ if (rpc->state == RPC_INCOMING) { int received = rpc->msgin.length - rpc->msgin.bytes_remaining; @@ -639,6 +640,7 @@ void homa_rpc_log_tt(struct homa_rpc *rpc) } else { tt_record2("RPC id %d is in state %d", rpc->id, rpc->state); } +#endif /* See strip.py */ } /** diff --git a/util/strip.py b/util/strip.py index f3cb2534..185ab9a2 100755 --- a/util/strip.py +++ b/util/strip.py @@ -43,7 +43,7 @@ was specified then these lines are ignored. If the --alt option is specified, it means the output is intended for -testing outside the Linux kernel. In this case, the lines +testing outside the Linux kernel. In this case, the lines should remain. """ from collections import defaultdict @@ -138,12 +138,20 @@ def scan(file, alt_mode): # Used to strip out unit testing code. Value is one of: # None: We're not in the middle of an '#ifdef __UNIT_TEST__' - # 'if': An '#idfdef __UNIT_TEST__" has been seen, but not the - # corresponding #else or #endif not been seen yet + # 'if': An '#idfdef __UNIT_TEST__" has been seen, but the + # corresponding #else or #endif has not been seen yet # 'else': We are in the middle of an '#else' clause for an # '#ifdef __UNIT_TEST__' in_unit = None + # Used to strip out conditional code based on version + # None: We're not in the middle of an '#if LINUX_VERSION_CODE' + # 'if': An '#if LINUX_VERSION_CODE" has been seen, but not the + # corresponding #else or #endif (code should be stripped) + # 'else': We are in the middle of an '#else' clause for an + # '#if LINUX_VERSION_CODE' (this code should remain) + in_version = None + # Array of lines containing the stripped version of the file slines = [] @@ -190,9 +198,13 @@ def scan(file, alt_mode): in_comment = True index = pline.find('//') if index >= 0: - current_has_comment = True + current_has_comment = True pline = pline.strip() + if pline.startswith('//') and not 'SPDX-License' in pline: + # Strip // comment lines: these are used only for commenting + # out debugging code. + continue # Strip groups of lines labeled with special '#if' if in_labeled_skip != None: @@ -238,7 +250,8 @@ def scan(file, alt_mode): match = re.match('(//[ \t]*)?tt_record[1-4]?[(]', pline) if match: # If this is the only statement in its block, delete the - # outer block statement (if, while, etc.). + # outer block statement (if, while, etc.). Don't delete case + # statements. 
if not match.group(1): indent = leading_space(line) for i in range(len(slines)-1, -1, -1): @@ -250,12 +263,8 @@ def scan(file, alt_mode): # Label or method start; no need to continue further break if leading_space(prev) < indent: - if prev.lstrip().startswith('case'): - print('%s:%d: \'case\' before tt_record; don\'t know how to handle' - % (file, i), file=sys.stderr) - exit_code = 1 - break - slines = slines[:i] + if not prev.lstrip().startswith('case'): + slines = slines[:i] break if pline[-1] != ';': @@ -286,15 +295,31 @@ def scan(file, alt_mode): if in_unit == 'if': continue elif line.startswith('#ifdef __UNIT_TEST__') and not alt_mode: - in_unit = 'if' - if slines[-1].strip() == '': - delete_empty_line = True - continue + in_unit = 'if' + if slines[-1].strip() == '': + delete_empty_line = True + continue elif line.startswith('#ifndef __UNIT_TEST__') and not alt_mode: - in_unit = 'else' - if slines[-1].strip() == '': - delete_empty_line = True + in_unit = 'else' + if slines[-1].strip() == '': + delete_empty_line = True + continue + + # Strip 'if LINUX_VERSION_CODE' blocks (keep #else clauses) + if in_version: + if line.startswith('#endif'): + in_version = None + continue + if line.startswith('#else'): + in_version = 'else' continue + if in_version == 'if': + continue + elif line.startswith('#if LINUX_VERSION_CODE') and not alt_mode: + in_version = 'if' + if slines[-1].strip() == '': + delete_empty_line = True + continue if not pline: if not line.isspace() or not delete_empty_line: From dd48bbc8557953479a9a27edb996d0b2b59ce41c Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 2 Dec 2024 15:29:53 -0800 Subject: [PATCH 075/625] Fix problems from checkpatch.pl --- homa_grant.c | 113 +++++++++--------- homa_impl.h | 8 +- homa_incoming.c | 108 +++++++++-------- homa_metrics.c | 241 +++++++++++++++++++------------------- homa_metrics.h | 4 +- homa_offload.c | 100 ++++++++-------- homa_outgoing.c | 52 ++++---- homa_peer.c | 20 ++-- homa_plumbing.c | 79 +++++++------ homa_pool.c | 7 +- homa_rpc.c | 86 +++++++------- homa_skb.c | 82 +++++++------ homa_skb.h | 2 +- homa_sock.c | 4 +- homa_sock.h | 8 +- homa_timer.c | 21 ++-- homa_utils.c | 138 +++++++++++----------- homa_wire.h | 6 +- test/unit_homa_incoming.c | 2 +- timetrace.c | 18 ++- 20 files changed, 559 insertions(+), 540 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 3bb5571b..71f40a01 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -22,10 +22,10 @@ int homa_grant_outranks(struct homa_rpc *rpc1, struct homa_rpc *rpc2) /* Fewest bytes remaining is the primary criterion; if those are * equal, then favor the older RPC. 
*/ - return (rpc1->msgin.bytes_remaining < rpc2->msgin.bytes_remaining) - || ((rpc1->msgin.bytes_remaining - == rpc2->msgin.bytes_remaining) - && (rpc1->msgin.birth < rpc2->msgin.birth)); + return (rpc1->msgin.bytes_remaining < rpc2->msgin.bytes_remaining) || + ((rpc1->msgin.bytes_remaining == + rpc2->msgin.bytes_remaining) && + (rpc1->msgin.birth < rpc2->msgin.birth)); } /** @@ -53,8 +53,8 @@ int homa_grant_update_incoming(struct homa_rpc *rpc, struct homa *homa) int old = atomic_fetch_add(delta, &homa->total_incoming); rpc->msgin.rec_incoming = incoming; - return ((old >= homa->max_incoming) - && ((old + delta) < homa->max_incoming)); + return (old >= homa->max_incoming && + (old + delta) < homa->max_incoming); } return 0; } @@ -87,22 +87,23 @@ void homa_grant_add_rpc(struct homa_rpc *rpc) homa->last_grantable_change = time; homa->num_grantable_rpcs++; tt_record2("Incremented num_grantable_rpcs to %d, id %d", - homa->num_grantable_rpcs, rpc->id); + homa->num_grantable_rpcs, rpc->id); if (homa->num_grantable_rpcs > homa->max_grantable_rpcs) homa->max_grantable_rpcs = homa->num_grantable_rpcs; rpc->msgin.birth = time; list_for_each_entry(candidate, &peer->grantable_rpcs, - grantable_links) { + grantable_links) { if (homa_grant_outranks(rpc, candidate)) { list_add_tail(&rpc->grantable_links, - &candidate->grantable_links); + &candidate->grantable_links); goto position_peer; } } list_add_tail(&rpc->grantable_links, &peer->grantable_rpcs); } else { while (rpc != list_first_entry(&peer->grantable_rpcs, - struct homa_rpc, grantable_links)) { + struct homa_rpc, + grantable_links)) { /* Message is on the list, but its priority may have * increased because of the recent packet arrival. If * so, adjust its position in the list. @@ -123,12 +124,13 @@ void homa_grant_add_rpc(struct homa_rpc *rpc) if (list_empty(&peer->grantable_links)) { /* Must add peer to the overall Homa list. */ list_for_each_entry(peer_cand, &homa->grantable_peers, - grantable_links) { + grantable_links) { candidate = list_first_entry(&peer_cand->grantable_rpcs, - struct homa_rpc, grantable_links); + struct homa_rpc, + grantable_links); if (homa_grant_outranks(rpc, candidate)) { list_add_tail(&peer->grantable_links, - &peer_cand->grantable_links); + &peer_cand->grantable_links); goto done; } } @@ -137,11 +139,11 @@ void homa_grant_add_rpc(struct homa_rpc *rpc) } /* The peer is on Homa's list, but it may need to move upward. 
*/ while (peer != list_first_entry(&homa->grantable_peers, - struct homa_peer, grantable_links)) { - struct homa_peer *prev_peer = list_prev_entry( - peer, grantable_links); + struct homa_peer, grantable_links)) { + struct homa_peer *prev_peer = list_prev_entry(peer, + grantable_links); candidate = list_first_entry(&prev_peer->grantable_rpcs, - struct homa_rpc, grantable_links); + struct homa_rpc, grantable_links); if (!homa_grant_outranks(rpc, candidate)) goto done; __list_del_entry(&prev_peer->grantable_links); @@ -172,14 +174,14 @@ void homa_grant_remove_rpc(struct homa_rpc *rpc) homa->oldest_rpc = NULL; head = list_first_entry(&peer->grantable_rpcs, - struct homa_rpc, grantable_links); + struct homa_rpc, grantable_links); list_del_init(&rpc->grantable_links); INC_METRIC(grantable_rpcs_integral, homa->num_grantable_rpcs * (time - homa->last_grantable_change)); homa->last_grantable_change = time; homa->num_grantable_rpcs--; tt_record2("Decremented num_grantable_rpcs to %d, id %d", - homa->num_grantable_rpcs, rpc->id); + homa->num_grantable_rpcs, rpc->id); if (rpc != head) return; @@ -196,13 +198,13 @@ void homa_grant_remove_rpc(struct homa_rpc *rpc) * an RPC can't cause the peer to move up). */ head = list_first_entry(&peer->grantable_rpcs, - struct homa_rpc, grantable_links); + struct homa_rpc, grantable_links); while (peer != list_last_entry(&homa->grantable_peers, struct homa_peer, - grantable_links)) { - struct homa_peer *next_peer = list_next_entry( - peer, grantable_links); + grantable_links)) { + struct homa_peer *next_peer = list_next_entry(peer, + grantable_links); candidate = list_first_entry(&next_peer->grantable_rpcs, - struct homa_rpc, grantable_links); + struct homa_rpc, grantable_links); if (!homa_grant_outranks(rpc, candidate)) break; __list_del_entry(&peer->grantable_links); @@ -255,8 +257,8 @@ int homa_grant_send(struct homa_rpc *rpc, struct homa *homa) grant.resend_all = rpc->msgin.resend_all; rpc->msgin.resend_all = 0; tt_record4("sending grant for id %llu, offset %d, priority %d, increment %d", - rpc->id, rpc->msgin.granted, rpc->msgin.priority, - increment); + rpc->id, rpc->msgin.granted, rpc->msgin.priority, + increment); homa_xmit_control(GRANT, &grant, sizeof(grant), rpc); return 1; } @@ -288,9 +290,8 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) struct homa *homa = rpc->hsk->homa; int rank, recalc; - - if ((rpc->msgin.length < 0) || (rpc->state == RPC_DEAD) - || (rpc->msgin.num_bpages <= 0)) { + if (rpc->msgin.length < 0 || rpc->state == RPC_DEAD || + rpc->msgin.num_bpages <= 0) { homa_rpc_unlock(rpc); goto done; } @@ -302,8 +303,8 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) } tt_record4("homa_grant_check_rpc starting for id %d, granted %d, recv_end %d, length %d", - rpc->id, rpc->msgin.granted, rpc->msgin.recv_end, - rpc->msgin.length); + rpc->id, rpc->msgin.granted, rpc->msgin.recv_end, + rpc->msgin.length); /* This message requires grants; if it is a new message, set up * granting. 
@@ -312,9 +313,10 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) homa_grant_update_incoming(rpc, homa); homa_grantable_lock(homa, 0); homa_grant_add_rpc(rpc); - recalc = ((homa->num_active_rpcs < homa->max_overcommit) - || (rpc->msgin.bytes_remaining < atomic_read( - &homa->active_remaining[homa->max_overcommit-1]))); + recalc = (homa->num_active_rpcs < homa->max_overcommit || + rpc->msgin.bytes_remaining < + atomic_read(&homa->active_remaining + [homa->max_overcommit - 1])); homa_rpc_unlock(rpc); if (recalc) homa_grant_recalc(homa, 1); @@ -327,8 +329,7 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) rank = atomic_read(&rpc->msgin.rank); if (rank < 0) { homa_grant_update_incoming(rpc, homa); - if (rpc->msgin.bytes_remaining < atomic_read( - &homa->active_remaining[homa->max_overcommit-1])) { + if (rpc->msgin.bytes_remaining < atomic_read(&homa->active_remaining[homa->max_overcommit - 1])) { homa_rpc_unlock(rpc); INC_METRIC(grant_priority_bumps, 1); homa_grant_recalc(homa, 0); @@ -338,8 +339,8 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) goto done; } atomic_set(&homa->active_remaining[rank], rpc->msgin.bytes_remaining); - if ((rank > 0) && (rpc->msgin.bytes_remaining < atomic_read( - &homa->active_remaining[rank-1]))) { + if (rank > 0 && rpc->msgin.bytes_remaining < + atomic_read(&homa->active_remaining[rank - 1])) { homa_grant_update_incoming(rpc, homa); homa_rpc_unlock(rpc); INC_METRIC(grant_priority_bumps, 1); @@ -421,7 +422,7 @@ void homa_grant_recalc(struct homa *homa, int locked) * about them. */ active = homa_grant_pick_rpcs(homa, homa->active_rpcs, - homa->max_overcommit); + homa->max_overcommit); homa->num_active_rpcs = active; for (i = 0; i < active; i++) { struct homa_rpc *rpc = homa->active_rpcs[i]; @@ -431,7 +432,7 @@ void homa_grant_recalc(struct homa *homa, int locked) atomic_inc(&rpc->grants_in_progress); atomic_set(&rpc->msgin.rank, i); atomic_set(&homa->active_remaining[i], - rpc->msgin.bytes_remaining); + rpc->msgin.bytes_remaining); /* Compute the priority to use for this RPC's grants: * if there aren't enough RPCs to consume all of the @@ -458,8 +459,8 @@ void homa_grant_recalc(struct homa *homa, int locked) if (homa->window_param != 0) homa->grant_window = homa->window_param; else - homa->grant_window = homa->max_incoming/ - (homa->num_active_rpcs+1); + homa->grant_window = homa->max_incoming / + (homa->num_active_rpcs + 1); /* See comment above, which explains why this is here. */ homa_grantable_unlock(homa); @@ -467,7 +468,6 @@ void homa_grant_recalc(struct homa *homa, int locked) for (i = 0; i < active; i++) { struct homa_rpc *rpc = active_rpcs[i]; - homa_rpc_lock(rpc, "homa_grant_recalc"); homa_grant_send(rpc, homa); try_again += homa_grant_update_incoming(rpc, homa); @@ -502,7 +502,7 @@ void homa_grant_recalc(struct homa *homa, int locked) * Return: The number of RPCs actually stored in @rpcs. */ int homa_grant_pick_rpcs(struct homa *homa, struct homa_rpc **rpcs, - int max_rpcs) + int max_rpcs) { struct homa_peer *peer; struct homa_rpc *rpc; @@ -518,13 +518,13 @@ int homa_grant_pick_rpcs(struct homa *homa, struct homa_rpc **rpcs, * in decreasing order of priority. */ list_for_each_entry(rpc, &peer->grantable_rpcs, - grantable_links) { + grantable_links) { int i, pos; /* Figure out where this RPC should be positioned * in the result. 
*/ - for (i = num_rpcs-1; i >= 0; i--) { + for (i = num_rpcs - 1; i >= 0; i--) { if (!homa_grant_outranks(rpc, rpcs[i])) break; } @@ -534,12 +534,12 @@ int homa_grant_pick_rpcs(struct homa *homa, struct homa_rpc **rpcs, if (pos >= max_rpcs) break; if (num_rpcs < max_rpcs) { - for (i = num_rpcs-1; i >= pos; i--) - rpcs[i+1] = rpcs[i]; + for (i = num_rpcs - 1; i >= pos; i--) + rpcs[i + 1] = rpcs[i]; num_rpcs++; } else { - for (i = max_rpcs-2; i >= pos; i--) - rpcs[i+1] = rpcs[i]; + for (i = max_rpcs - 2; i >= pos; i--) + rpcs[i + 1] = rpcs[i]; } rpcs[pos] = rpc; rpcs_from_peer++; @@ -556,7 +556,6 @@ int homa_grant_pick_rpcs(struct homa *homa, struct homa_rpc **rpcs, return num_rpcs; } - /** * homa_grant_find_oldest() - Recompute the value of homa->oldest_rpc. * @homa: Overall data about the Homa protocol implementation. The @@ -564,7 +563,7 @@ int homa_grant_pick_rpcs(struct homa *homa, struct homa_rpc **rpcs, */ void homa_grant_find_oldest(struct homa *homa) { - int max_incoming = homa->grant_window + 2*homa->fifo_grant_increment; + int max_incoming = homa->grant_window + 2 * homa->fifo_grant_increment; struct homa_rpc *rpc, *oldest; struct homa_peer *peer; __u64 oldest_birth; @@ -577,7 +576,7 @@ void homa_grant_find_oldest(struct homa *homa) */ list_for_each_entry(peer, &homa->grantable_peers, grantable_links) { list_for_each_entry(rpc, &peer->grantable_rpcs, - grantable_links) { + grantable_links) { int received, incoming; if (rpc->msgin.birth >= oldest_birth) @@ -601,6 +600,7 @@ void homa_grant_find_oldest(struct homa *homa) } homa->oldest_rpc = oldest; } + /** * homa_grant_free_rpc() - This function is invoked when an RPC is freed; * it cleans up any state related to grants for that RPC's incoming message. @@ -626,8 +626,9 @@ void homa_grant_free_rpc(struct homa_rpc *rpc) homa_rpc_unlock(rpc); homa_grant_recalc(homa, 1); homa_rpc_lock(rpc, "homa_grant_free_rpc"); - } else + } else { homa_grantable_unlock(homa); + } } if (rpc->msgin.rec_incoming != 0) @@ -683,10 +684,10 @@ void homa_grant_log_tt(struct homa *homa) homa_grantable_lock(homa, 0); tt_record1("homa_grant_log_tt found %d active RPCs:", - homa->num_active_rpcs); + homa->num_active_rpcs); for (i = 0; i < homa->num_active_rpcs; i++) { tt_record2("active_rpcs[%d]: id %d", i, - homa->active_rpcs[i]->id); + homa->active_rpcs[i]->id); homa_rpc_log_tt(homa->active_rpcs[i]); } homa_grantable_unlock(homa); diff --git a/homa_impl.h b/homa_impl.h index e6492ed1..728d6329 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -1044,8 +1044,8 @@ static inline bool is_homa_pkt(struct sk_buff *skb) struct iphdr *iph = ip_hdr(skb); return ((iph->protocol == IPPROTO_HOMA) || - ((iph->protocol == IPPROTO_TCP) && - (tcp_hdr(skb)->urg_ptr == htons(HOMA_TCP_URGENT)))); + ((iph->protocol == IPPROTO_TCP) && + (tcp_hdr(skb)->urg_ptr == htons(HOMA_TCP_URGENT)))); } #if 1 /* See strip.py --alt */ @@ -1135,7 +1135,7 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, struct iov_iter *iter, int offset, int length, int max_seg_data); void homa_outgoing_sysctl_changed(struct homa *homa); -int homa_pacer_main(void *transportInfo); +int homa_pacer_main(void *transport); void homa_pacer_stop(struct homa *homa); void homa_pacer_xmit(struct homa *homa); __poll_t homa_poll(struct file *file, struct socket *sock, @@ -1179,7 +1179,7 @@ int homa_sysctl_softirq_cores(const struct ctl_table *table, loff_t *ppos); #endif void homa_timer(struct homa *homa); -int homa_timer_main(void *transportInfo); +int homa_timer_main(void *transport); void homa_unhash(struct sock 
*sk); void homa_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc); int homa_unsched_priority(struct homa *homa, struct homa_peer *peer, diff --git a/homa_incoming.c b/homa_incoming.c index 5cd361f7..601bf243 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -43,10 +43,10 @@ int homa_message_in_init(struct homa_rpc *rpc, int length, int unsched) */ rpc->msgin.granted = 0; } - if (length < HOMA_NUM_SMALL_COUNTS*64) { - INC_METRIC(small_msg_bytes[(length-1) >> 6], length); - } else if (length < HOMA_NUM_MEDIUM_COUNTS*1024) { - INC_METRIC(medium_msg_bytes[(length-1) >> 10], length); + if (length < HOMA_NUM_SMALL_COUNTS * 64) { + INC_METRIC(small_msg_bytes[(length - 1) >> 6], length); + } else if (length < HOMA_NUM_MEDIUM_COUNTS * 1024) { + INC_METRIC(medium_msg_bytes[(length - 1) >> 10], length); } else { INC_METRIC(large_msg_count, 1); INC_METRIC(large_msg_bytes, length); @@ -129,7 +129,7 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) rpc->msgin.recv_end, start)) { pr_err("Homa couldn't allocate gap: insufficient memory\n"); tt_record2("Couldn't allocate gap for id %d (start %d): no memory", - rpc->id, start); + rpc->id, start); goto discard; } rpc->msgin.recv_end = end; @@ -182,7 +182,7 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) if (!gap2) { pr_err("Homa couldn't allocate gap for split: insufficient memory\n"); tt_record2("Couldn't allocate gap for split for id %d (start %d): no memory", - rpc->id, end); + rpc->id, end); goto discard; } gap2->time = gap->time; @@ -303,8 +303,9 @@ int homa_copy_to_user(struct homa_rpc *rpc) if (error) goto free_skbs; error = skb_copy_datagram_iter(skbs[i], - sizeof(*h) + copied, &iter, - chunk_size); + sizeof(*h) + + copied, &iter, + chunk_size); if (error) goto free_skbs; copied += chunk_size; @@ -433,36 +434,37 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) h, &created); if (IS_ERR(rpc)) { pr_warn("homa_pkt_dispatch couldn't create server rpc: error %lu", - -PTR_ERR(rpc)); + -PTR_ERR(rpc)); INC_METRIC(server_cant_create_rpcs, 1); rpc = NULL; goto discard; } } else { rpc = homa_find_server_rpc(hsk, &saddr, - ntohs(h->common.sport), - id); + ntohs(h->common.sport), + id); } } else { rpc = homa_find_client_rpc(hsk, id); } } if (unlikely(!rpc)) { - if ((h->common.type != CUTOFFS) - && (h->common.type != NEED_ACK) - && (h->common.type != ACK) - && (h->common.type != RESEND)) { + if (h->common.type != CUTOFFS && + h->common.type != NEED_ACK && + h->common.type != ACK && + h->common.type != RESEND) { tt_record4("Discarding packet for unknown RPC, id %u, type %d, peer 0x%x:%d", - id, h->common.type, - tt_addr(saddr), - ntohs(h->common.sport)); - if ((h->common.type != GRANT) || homa_is_client(id)) + id, h->common.type, tt_addr(saddr), + ntohs(h->common.sport)); + if (h->common.type != GRANT || + homa_is_client(id)) INC_METRIC(unknown_rpcs, 1); goto discard; } } else { - if ((h->common.type == DATA) || (h->common.type == GRANT) - || (h->common.type == BUSY)) + if (h->common.type == DATA || + h->common.type == GRANT || + h->common.type == BUSY) rpc->silent_ticks = 0; rpc->peer->outstanding_resends = 0; } @@ -541,7 +543,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) * nor homa_timer can keep up with reaping dead * RPCs. See reap.txt for details. 
*/ - uint64_t start = sched_clock(); + __u64 start = sched_clock(); tt_record("homa_data_pkt calling homa_rpc_reap"); homa_rpc_reap(hsk, hsk->homa->reap_limit); @@ -558,7 +560,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) */ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) { - struct data_header *h = (struct data_header *) skb->data; + struct data_header *h = (struct data_header *)skb->data; struct homa *homa = rpc->hsk->homa; tt_record4("incoming data packet, id %d, peer 0x%x, offset %d/%d", @@ -572,7 +574,7 @@ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) INC_METRIC(responses_received, 1); rpc->state = RPC_INCOMING; tt_record2("Incoming message for id %d has %d unscheduled bytes", - rpc->id, ntohl(h->incoming)); + rpc->id, ntohl(h->incoming)); if (homa_message_in_init(rpc, ntohl(h->message_length), ntohl(h->incoming)) != 0) goto discard; @@ -644,16 +646,16 @@ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) */ void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc) { - struct grant_header *h = (struct grant_header *) skb->data; + struct grant_header *h = (struct grant_header *)skb->data; int new_offset = ntohl(h->offset); tt_record4("processing grant for id %llu, offset %d, priority %d, increment %d", - homa_local_id(h->common.sender_id), ntohl(h->offset), - h->priority, new_offset - rpc->msgout.granted); + homa_local_id(h->common.sender_id), ntohl(h->offset), + h->priority, new_offset - rpc->msgout.granted); if (rpc->state == RPC_OUTGOING) { if (h->resend_all) homa_resend_data(rpc, 0, rpc->msgout.next_xmit_offset, - h->priority); + h->priority); if (new_offset > rpc->msgout.granted) { rpc->msgout.granted = new_offset; @@ -708,8 +710,8 @@ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, * send BUSY instead. 
*/ tt_record3("sending BUSY from resend, id %d, offset %d, granted %d", - rpc->id, rpc->msgout.next_xmit_offset, - rpc->msgout.granted); + rpc->id, rpc->msgout.next_xmit_offset, + rpc->msgout.granted); homa_xmit_control(BUSY, &busy, sizeof(busy), rpc); } else { if (ntohl(h->length) == 0) @@ -745,25 +747,27 @@ void homa_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc) tt_record4("Restarting id %d to server 0x%x:%d, lost %d bytes", rpc->id, tt_addr(rpc->peer->addr), rpc->dport, rpc->msgout.next_xmit_offset); - homa_freeze(rpc, RESTART_RPC, "Freezing because of RPC restart, id %d, peer 0x%x"); + homa_freeze(rpc, RESTART_RPC, + "Freezing because of RPC restart, id %d, peer 0x%x"); homa_resend_data(rpc, 0, rpc->msgout.next_xmit_offset, homa_unsched_priority(rpc->hsk->homa, - rpc->peer, rpc->msgout.length)); + rpc->peer, + rpc->msgout.length)); goto done; } pr_err("Received unknown for RPC id %llu, peer %s:%d in bogus state %d; discarding unknown\n", - rpc->id, homa_print_ipv6_addr(&rpc->peer->addr), - rpc->dport, rpc->state); + rpc->id, homa_print_ipv6_addr(&rpc->peer->addr), + rpc->dport, rpc->state); tt_record4("Discarding unknown for RPC id %d, peer 0x%x:%d: bad state %d", rpc->id, tt_addr(rpc->peer->addr), rpc->dport, rpc->state); } else { if (rpc->hsk->homa->verbose) pr_notice("Freeing rpc id %llu from client %s:%d: unknown to client", - rpc->id, - homa_print_ipv6_addr(&rpc->peer->addr), - rpc->dport); + rpc->id, + homa_print_ipv6_addr(&rpc->peer->addr), + rpc->dport); homa_rpc_free(rpc); INC_METRIC(server_rpcs_unknown, 1); } @@ -779,7 +783,7 @@ void homa_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc) */ void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk) { - struct cutoffs_header *h = (struct cutoffs_header *) skb->data; + struct cutoffs_header *h = (struct cutoffs_header *)skb->data; const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); struct homa_peer *peer; int i; @@ -928,7 +932,7 @@ struct homa_rpc *homa_choose_fifo_grant(struct homa *homa) oldest = rpc; oldest_birth = rpc->msgin.birth; } - if (oldest == NULL) + if (!oldest) return NULL; INC_METRIC(fifo_grants, 1); if ((oldest->msgin.length - oldest->msgin.bytes_remaining) @@ -949,7 +953,7 @@ struct homa_rpc *homa_choose_fifo_grant(struct homa *homa) * will eventually get updated elsewhere. */ if (homa_bucket_try_lock(oldest->bucket, oldest->id, - "homa_choose_fifo_grant")) { + "homa_choose_fifo_grant")) { homa_grant_update_incoming(oldest, homa); homa_rpc_unlock(oldest); } @@ -1204,7 +1208,7 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, __u64 id) __acquires(&rpc->bucket_lock) { - uint64_t poll_start, poll_end, now; + __u64 poll_start, poll_end, now; int error, blocked = 0, polled = 0; struct homa_rpc *result = NULL; struct homa_interest interest; @@ -1259,17 +1263,17 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, /* Busy-wait for a while before going to sleep; this avoids * context-switching overhead to wake up. 
*/ - poll_start = now = sched_clock(); + now = sched_clock(); + poll_start = now; poll_end = now + (1000 * hsk->homa->poll_usecs); while (1) { __u64 blocked; - rpc = (struct homa_rpc *) atomic_long_read( - &interest.ready_rpc); + rpc = (struct homa_rpc *)atomic_long_read(&interest.ready_rpc); if (rpc) { tt_record3("received RPC handoff while polling, id %d, socket %d, pid %d", - rpc->id, hsk->port, - current->pid); + rpc->id, hsk->port, + current->pid); polled = 1; INC_METRIC(poll_ns, now - poll_start); goto found_rpc; @@ -1286,7 +1290,7 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, poll_start += blocked; } tt_record2("Poll ended unsuccessfully on socket %d, pid %d", - hsk->port, current->pid); + hsk->port, current->pid); INC_METRIC(poll_ns, now - poll_start); /* Now it's time to sleep. */ @@ -1402,14 +1406,14 @@ struct homa_interest *homa_choose_interest(struct homa *homa, struct list_head *pos; list_for_each(pos, head) { - interest = (struct homa_interest *) (((char *) pos) - offset); + interest = (struct homa_interest *)(((char *)pos) - offset); if (per_cpu(homa_offload_core, interest->core).last_active < busy_time) { - if (backup != NULL) + if (backup) INC_METRIC(handoffs_alt_thread, 1); return interest; } - if (backup == NULL) + if (!backup) backup = interest; } @@ -1519,8 +1523,8 @@ void homa_incoming_sysctl_changed(struct homa *homa) homa->grant_fifo_fraction = 500; tmp = homa->grant_fifo_fraction; if (tmp != 0) - tmp = (1000*homa->fifo_grant_increment)/tmp - - homa->fifo_grant_increment; + tmp = (1000 * homa->fifo_grant_increment) / tmp - + homa->fifo_grant_increment; homa->grant_nonfifo = tmp; if (homa->max_overcommit > HOMA_MAX_GRANTS) diff --git a/homa_metrics.c b/homa_metrics.c index 7783c3b5..853898bc 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -48,8 +48,8 @@ void homa_metric_append(struct homa *homa, const char *format, ...) while (true) { va_start(ap, format); new_chars = vsnprintf(homa->metrics + homa->metrics_length, - homa->metrics_capacity - homa->metrics_length, - format, ap); + homa->metrics_capacity - + homa->metrics_length, format, ap); va_end(ap); if ((homa->metrics_length + new_chars) < homa->metrics_capacity) break; @@ -83,98 +83,96 @@ char *homa_metrics_print(struct homa *homa) homa->metrics_length = 0; #define M(...) 
homa_metric_append(homa, __VA_ARGS__) M("time_ns %20llu sched_clock() time when metrics were gathered\n", - sched_clock()); + sched_clock()); for (core = 0; core < nr_cpu_ids; core++) { struct homa_metrics *m = &per_cpu(homa_metrics, core); __s64 delta; M("core %15d Core id for following metrics\n", - core); + core); for (i = 0; i < HOMA_NUM_SMALL_COUNTS; i++) { M("msg_bytes_%-9d %15llu Bytes in incoming messages containing %d-%d bytes\n", - (i+1)*64, m->small_msg_bytes[i], lower, - (i+1)*64); - lower = (i+1)*64 + 1; + (i + 1) * 64, m->small_msg_bytes[i], lower, + (i + 1) * 64); + lower = (i + 1) * 64 + 1; } - for (i = (HOMA_NUM_SMALL_COUNTS*64)/1024; + for (i = (HOMA_NUM_SMALL_COUNTS * 64) / 1024; i < HOMA_NUM_MEDIUM_COUNTS; i++) { M("msg_bytes_%-9d %15llu Bytes in incoming messages containing %d-%d bytes\n", - (i+1)*1024, m->medium_msg_bytes[i], lower, - (i+1)*1024); - lower = (i+1)*1024 + 1; + (i + 1) * 1024, m->medium_msg_bytes[i], lower, + (i + 1) * 1024); + lower = (i + 1) * 1024 + 1; } M("large_msg_count %15llu # of incoming messages >= %d bytes\n", - m->large_msg_count, lower); + m->large_msg_count, lower); M("large_msg_bytes %15llu Bytes in incoming messages >= %d bytes\n", - m->large_msg_bytes, lower); + m->large_msg_bytes, lower); M("sent_msg_bytes %15llu otal bytes in all outgoing messages\n", - m->sent_msg_bytes); + m->sent_msg_bytes); for (i = DATA; i < BOGUS; i++) { char *symbol = homa_symbol_for_type(i); M("packets_sent_%-7s %15llu %s packets sent\n", - symbol, m->packets_sent[i-DATA], - symbol); + symbol, m->packets_sent[i - DATA], symbol); } for (i = DATA; i < BOGUS; i++) { char *symbol = homa_symbol_for_type(i); M("packets_rcvd_%-7s %15llu %s packets received\n", - symbol, m->packets_received[i-DATA], - symbol); + symbol, m->packets_received[i - DATA], symbol); } for (i = 0; i < HOMA_MAX_PRIORITIES; i++) { M("priority%d_bytes %15llu Bytes sent at priority %d (including headers)\n", - i, m->priority_bytes[i], i); + i, m->priority_bytes[i], i); } for (i = 0; i < HOMA_MAX_PRIORITIES; i++) { M("priority%d_packets %15llu Packets sent at priority %d\n", - i, m->priority_packets[i], i); + i, m->priority_packets[i], i); } M("skb_allocs %15llu sk_buffs allocated\n", - m->skb_allocs); + m->skb_allocs); M("skb_alloc_ns %15llu Time spent allocating sk_buffs\n", - m->skb_alloc_ns); + m->skb_alloc_ns); M("skb_frees %15llu Data sk_buffs freed in normal paths\n", - m->skb_frees); + m->skb_frees); M("skb_free_ns %15llu Time spent freeing data sk_buffs\n", - m->skb_free_ns); + m->skb_free_ns); M("skb_page_allocs %15llu Pages allocated for sk_buff frags\n", - m->skb_page_allocs); + m->skb_page_allocs); M("skb_page_alloc_ns %15llu Time spent allocating pages for sk_buff frags\n", - m->skb_page_alloc_ns); + m->skb_page_alloc_ns); M("requests_received %15llu Incoming request messages\n", - m->requests_received); + m->requests_received); M("requests_queued %15llu Requests for which no thread was waiting\n", - m->requests_queued); + m->requests_queued); M("responses_received %15llu Incoming response messages\n", - m->responses_received); + m->responses_received); M("responses_queued %15llu Responses for which no thread was waiting\n", - m->responses_queued); + m->responses_queued); M("fast_wakeups %15llu Messages received while polling\n", - m->fast_wakeups); + m->fast_wakeups); M("slow_wakeups %15llu Messages received after thread went to sleep\n", - m->slow_wakeups); + m->slow_wakeups); M("handoffs_thread_waiting %15llu RPC handoffs to waiting threads (vs. 
queue)\n", - m->handoffs_thread_waiting); + m->handoffs_thread_waiting); M("handoffs_alt_thread %15llu RPC handoffs not to first on list (avoid busy core)\n", - m->handoffs_alt_thread); + m->handoffs_alt_thread); M("poll_ns %15llu Time spent polling for incoming messages\n", - m->poll_ns); + m->poll_ns); M("softirq_calls %15llu Calls to homa_softirq (i.e. # GRO pkts received)\n", - m->softirq_calls); + m->softirq_calls); M("softirq_ns %15llu Time spent in homa_softirq during SoftIRQ\n", - m->softirq_ns); + m->softirq_ns); M("bypass_softirq_ns %15llu Time spent in homa_softirq during bypass from GRO\n", - m->bypass_softirq_ns); + m->bypass_softirq_ns); M("linux_softirq_ns %15llu Time spent in all Linux SoftIRQ\n", - m->linux_softirq_ns); + m->linux_softirq_ns); M("napi_ns %15llu Time spent in NAPI-level packet handling\n", - m->napi_ns); + m->napi_ns); M("send_ns %15llu Time spent in homa_sendmsg for requests\n", - m->send_ns); + m->send_ns); M("send_calls %15llu Total invocations of homa_sendmsg for equests\n", - m->send_calls); + m->send_calls); // It is possible for us to get here at a time when a // thread has been blocked for a long time and has // recorded blocked_ns, but hasn't finished the @@ -185,163 +183,164 @@ char *homa_metrics_print(struct homa *homa) if (delta < 0) delta = 0; M("recv_ns %15llu Unblocked time spent in recvmsg kernel call\n", - delta); + delta); M("recv_calls %15llu Total invocations of recvmsg kernel call\n", - m->recv_calls); + m->recv_calls); M("blocked_ns %15llu Time spent blocked in homa_recvmsg\n", - m->blocked_ns); + m->blocked_ns); M("reply_ns %15llu Time spent in homa_sendmsg for responses\n", - m->reply_ns); + m->reply_ns); M("reply_calls %15llu Total invocations of homa_sendmsg for responses\n", - m->reply_calls); + m->reply_calls); M("abort_ns %15llu Time spent in homa_ioc_abort kernel call\n", - m->reply_ns); + m->reply_ns); M("abort_calls %15llu Total invocations of abort kernel call\n", - m->reply_calls); + m->reply_calls); M("so_set_buf_ns %15llu Time spent in setsockopt SO_HOMA_SET_BUF\n", - m->so_set_buf_ns); + m->so_set_buf_ns); M("so_set_buf_calls %15llu Total invocations of setsockopt SO_HOMA_SET_BUF\n", - m->so_set_buf_calls); + m->so_set_buf_calls); M("grantable_lock_ns %15llu Time spent with homa->grantable_lock locked\n", - m->grantable_lock_ns); + m->grantable_lock_ns); M("timer_ns %15llu Time spent in homa_timer\n", - m->timer_ns); + m->timer_ns); M("timer_reap_ns %15llu Time in homa_timer spent reaping RPCs\n", - m->timer_reap_ns); + m->timer_reap_ns); M("data_pkt_reap_ns %15llu Time in homa_data_pkt spent reaping RPCs\n", - m->data_pkt_reap_ns); + m->data_pkt_reap_ns); M("pacer_ns %15llu Time spent in homa_pacer_main\n", - m->pacer_ns); + m->pacer_ns); M("homa_ns %15llu Total time in all Homa-related functions\n", - m->softirq_ns + m->napi_ns + - m->send_ns + m->recv_ns + - m->reply_ns - m->blocked_ns + - m->timer_ns + m->pacer_ns); + m->softirq_ns + m->napi_ns + + m->send_ns + m->recv_ns + + m->reply_ns - m->blocked_ns + + m->timer_ns + m->pacer_ns); M("pacer_lost_ns %15llu Lost transmission time because pacer was slow\n", - m->pacer_lost_ns); + m->pacer_lost_ns); M("pacer_bytes %15llu Bytes transmitted when the pacer was active\n", - m->pacer_bytes); + m->pacer_bytes); M("pacer_skipped_rpcs %15llu Pacer aborts because of locked RPCs\n", - m->pacer_skipped_rpcs); + m->pacer_skipped_rpcs); M("pacer_needed_help %15llu homa_pacer_xmit invocations from homa_check_pacer\n", - m->pacer_needed_help); + m->pacer_needed_help); 
M("throttled_ns %15llu Time when the throttled queue was nonempty\n", - m->throttled_ns); + m->throttled_ns); M("resent_packets %15llu DATA packets sent in response to RESENDs\n", - m->resent_packets); + m->resent_packets); M("peer_hash_links %15llu Hash chain link traversals in peer table\n", - m->peer_hash_links); + m->peer_hash_links); M("peer_new_entries %15llu New entries created in peer table\n", - m->peer_new_entries); + m->peer_new_entries); M("peer_kmalloc_errors %15llu kmalloc failures creating peer table entries\n", - m->peer_kmalloc_errors); + m->peer_kmalloc_errors); M("peer_route_errors %15llu Routing failures creating peer table entries\n", - m->peer_route_errors); + m->peer_route_errors); M("control_xmit_errors %15llu Errors sending control packets\n", - m->control_xmit_errors); + m->control_xmit_errors); M("data_xmit_errors %15llu Errors sending data packets\n", - m->data_xmit_errors); + m->data_xmit_errors); M("unknown_rpcs %15llu Non-grant packets discarded because RPC unknown\n", - m->unknown_rpcs); + m->unknown_rpcs); M("server_cant_create_rpcs %15llu Packets discarded because server couldn't create RPC\n", - m->server_cant_create_rpcs); + m->server_cant_create_rpcs); M("unknown_packet_types %15llu Packets discarded because of unsupported type\n", - m->unknown_packet_types); + m->unknown_packet_types); M("short_packets %15llu Packets discarded because too short\n", - m->short_packets); + m->short_packets); M("packet_discards %15llu Non-resent packets discarded because data already received\n", - m->packet_discards); + m->packet_discards); M("resent_discards %15llu Resent packets discarded because data already received\n", - m->resent_discards); + m->resent_discards); M("resent_packets_used %15llu Retransmitted packets that were actually used\n", - m->resent_packets_used); + m->resent_packets_used); M("rpc_timeouts %15llu RPCs aborted because peer was nonresponsive\n", - m->rpc_timeouts); + m->rpc_timeouts); M("server_rpc_discards %15llu RPCs discarded by server because of errors\n", - m->server_rpc_discards); + m->server_rpc_discards); M("server_rpcs_unknown %15llu RPCs aborted by server because unknown to client\n", - m->server_rpcs_unknown); + m->server_rpcs_unknown); M("client_lock_misses %15llu Bucket lock misses for client RPCs\n", - m->client_lock_misses); + m->client_lock_misses); M("client_lock_miss_ns %15llu Time lost waiting for client bucket locks\n", - m->client_lock_miss_ns); + m->client_lock_miss_ns); M("server_lock_misses %15llu Bucket lock misses for server RPCs\n", - m->server_lock_misses); + m->server_lock_misses); M("server_lock_miss_ns %15llu Time lost waiting for server bucket locks\n", - m->server_lock_miss_ns); + m->server_lock_miss_ns); M("socket_lock_misses %15llu Socket lock misses\n", - m->socket_lock_misses); + m->socket_lock_misses); M("socket_lock_miss_ns %15llu Time lost waiting for socket locks\n", - m->socket_lock_miss_ns); + m->socket_lock_miss_ns); M("throttle_lock_misses %15llu Throttle lock misses\n", - m->throttle_lock_misses); + m->throttle_lock_misses); M("throttle_lock_miss_ns %15llu Time lost waiting for throttle locks\n", - m->throttle_lock_miss_ns); + m->throttle_lock_miss_ns); M("peer_ack_lock_misses %15llu Misses on peer ack locks\n", - m->peer_ack_lock_misses); + m->peer_ack_lock_misses); M("peer_ack_lock_miss_ns %15llu Time lost waiting for peer ack locks\n", - m->peer_ack_lock_miss_ns); + m->peer_ack_lock_miss_ns); M("grantable_lock_misses %15llu Grantable lock misses\n", - m->grantable_lock_misses); + 
m->grantable_lock_misses); M("grantable_lock_miss_ns %15llu Time lost waiting for grantable lock\n", - m->grantable_lock_miss_ns); + m->grantable_lock_miss_ns); M("grantable_rpcs_integral %15llu Integral of homa->num_grantable_rpcs*dt\n", - m->grantable_rpcs_integral); + m->grantable_rpcs_integral); M("grant_recalc_calls %15llu Number of calls to homa_grant_recalc\n", - m->grant_recalc_calls); + m->grant_recalc_calls); M("grant_recalc_ns %15llu Time spent in homa_grant_recalc\n", - m->grant_recalc_ns); + m->grant_recalc_ns); M("grant_recalc_skips %15llu Number of times homa_grant_recalc skipped redundant work\n", - m->grant_recalc_skips); + m->grant_recalc_skips); M("grant_recalc_loops %15llu Number of times homa_grant_recalc looped back\n", - m->grant_recalc_loops); + m->grant_recalc_loops); M("grant_priority_bumps %15llu Number of times an RPC moved up in the grant priority order\n", - m->grant_priority_bumps); + m->grant_priority_bumps); M("fifo_grants %15llu Grants issued using FIFO priority\n", - m->fifo_grants); + m->fifo_grants); M("fifo_grants_no_incoming %15llu FIFO grants to messages with no outstanding grants\n", - m->fifo_grants_no_incoming); + m->fifo_grants_no_incoming); M("disabled_reaps %15llu Reaper invocations that were disabled\n", - m->disabled_reaps); + m->disabled_reaps); M("disabled_rpc_reaps %15llu Disabled RPCs skipped by reaper\n", - m->disabled_rpc_reaps); + m->disabled_rpc_reaps); M("reaper_calls %15llu Reaper invocations that were not disabled\n", - m->reaper_calls); + m->reaper_calls); M("reaper_dead_skbs %15llu Sum of hsk->dead_skbs across all reaper calls\n", - m->reaper_dead_skbs); + m->reaper_dead_skbs); M("forced_reaps %15llu Reaps forced by accumulation of dead RPCs\n", - m->forced_reaps); + m->forced_reaps); M("throttle_list_adds %15llu Calls to homa_add_to_throttled\n", - m->throttle_list_adds); + m->throttle_list_adds); M("throttle_list_checks %15llu List elements checked in homa_add_to_throttled\n", - m->throttle_list_checks); + m->throttle_list_checks); M("ack_overflows %15llu Explicit ACKs sent because peer->acks was full\n", - m->ack_overflows); + m->ack_overflows); M("ignored_need_acks %15llu NEED_ACKs ignored because RPC result not yet received\n", - m->ignored_need_acks); + m->ignored_need_acks); M("bpage_reuses %15llu Buffer page could be reused because ref count was zero\n", - m->bpage_reuses); + m->bpage_reuses); M("buffer_alloc_failures %15llu homa_pool_allocate didn't find enough buffer space for an RPC\n", - m->buffer_alloc_failures); + m->buffer_alloc_failures); M("linux_pkt_alloc_bytes %15llu Bytes allocated in new packets by NIC driver due to cache overflows\n", - m->linux_pkt_alloc_bytes); + m->linux_pkt_alloc_bytes); M("dropped_data_no_bufs %15llu Data bytes dropped because app buffers full\n", - m->dropped_data_no_bufs); + m->dropped_data_no_bufs); M("gen3_handoffs %15llu GRO->SoftIRQ handoffs made by Gen3 balancer\n", - m->gen3_handoffs); + m->gen3_handoffs); M("gen3_alt_handoffs %15llu Gen3 handoffs to secondary core (primary was busy)\n", - m->gen3_alt_handoffs); + m->gen3_alt_handoffs); M("gro_grant_bypasses %15llu Grant packets passed directly to homa_softirq by homa_gro_receive\n", - m->gro_grant_bypasses); + m->gro_grant_bypasses); M("gro_data_bypasses %15llu Data packets passed directly to homa_softirq by homa_gro_receive\n", - m->gro_data_bypasses); + m->gro_data_bypasses); for (i = 0; i < NUM_TEMP_METRICS; i++) M("temp%-2d %15llu Temporary use in testing\n", - i, m->temp[i]); + i, m->temp[i]); } return homa->metrics; } + 
/** * homa_metrics_open() - This function is invoked when /proc/net/homa_metrics is * opened. @@ -380,7 +379,7 @@ int homa_metrics_open(struct inode *inode, struct file *file) * file was reached, and a negative number indicates an error (-errno). */ ssize_t homa_metrics_read(struct file *file, char __user *buffer, - size_t length, loff_t *offset) + size_t length, loff_t *offset) { size_t copied; diff --git a/homa_metrics.h b/homa_metrics.h index f43db27d..248573a8 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -59,13 +59,13 @@ struct homa_metrics { * @packets_sent: total number of packets sent for each packet type * (entry 0 corresponds to DATA, and so on). */ - __u64 packets_sent[BOGUS-DATA]; + __u64 packets_sent[BOGUS - DATA]; /** * @packets_received: total number of packets received for each * packet type (entry 0 corresponds to DATA, and so on). */ - __u64 packets_received[BOGUS-DATA]; + __u64 packets_received[BOGUS - DATA]; /** @priority_bytes: total bytes sent at each priority level. */ __u64 priority_bytes[HOMA_MAX_PRIORITIES]; diff --git a/homa_offload.c b/homa_offload.c index e1e7597f..d9db3186 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -52,7 +52,7 @@ int homa_offload_init(void) offload_core->last_gro = 0; atomic_set(&offload_core->softirq_backlog, 0); offload_core->softirq_offset = 0; - offload_core->gen3_softirq_cores[0] = i^1; + offload_core->gen3_softirq_cores[0] = i ^ 1; for (j = 1; j < NUM_GEN3_SOFTIRQ_CORES; j++) offload_core->gen3_softirq_cores[j] = -1; offload_core->last_app_active = 0; @@ -87,7 +87,7 @@ int homa_offload_end(void) */ void homa_gro_hook_tcp(void) { - if (tcp_net_offload != NULL) + if (tcp_net_offload) return; pr_notice("Homa setting up TCP hijacking\n"); @@ -111,7 +111,7 @@ void homa_gro_hook_tcp(void) */ void homa_gro_unhook_tcp(void) { - if (tcp_net_offload == NULL) + if (!tcp_net_offload) return; pr_notice("Homa cancelling TCP hijacking\n"); inet_offloads[IPPROTO_TCP] = (struct net_offload __rcu *) @@ -131,7 +131,7 @@ void homa_gro_unhook_tcp(void) * @skb: The newly arrived packet. 
*/ struct sk_buff *homa_tcp_gro_receive(struct list_head *held_list, - struct sk_buff *skb) + struct sk_buff *skb) { struct common_header *h = (struct common_header *) skb_transport_header(skb); @@ -139,7 +139,8 @@ struct sk_buff *homa_tcp_gro_receive(struct list_head *held_list, // tt_record4("homa_tcp_gro_receive got type 0x%x, flags 0x%x, " // "urgent 0x%x, id %d", h->type, h->flags, // ntohs(h->urgent), homa_local_id(h->sender_id)); - if ((h->flags != HOMA_TCP_FLAGS) || (ntohs(h->urgent) != HOMA_TCP_URGENT)) + if (h->flags != HOMA_TCP_FLAGS || + ntohs(h->urgent) != HOMA_TCP_URGENT) return tcp_net_offload->callbacks.gro_receive(held_list, skb); /* Change the packet's IP protocol to Homa so that it will get @@ -149,8 +150,8 @@ struct sk_buff *homa_tcp_gro_receive(struct list_head *held_list, ipv6_hdr(skb)->nexthdr = IPPROTO_HOMA; } else { ip_hdr(skb)->check = ~csum16_add(csum16_sub(~ip_hdr(skb)->check, - htons(ip_hdr(skb)->protocol)), - htons(IPPROTO_HOMA)); + htons(ip_hdr(skb)->protocol)), + htons(IPPROTO_HOMA)); ip_hdr(skb)->protocol = IPPROTO_HOMA; } return homa_gro_receive(held_list, skb); @@ -170,13 +171,12 @@ static void homa_set_softirq_cpu(struct sk_buff *skb, int cpu) int hash; sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); - if (sock_flow_table == NULL) + if (!sock_flow_table) return; hash = cpu + net_hotdata.rps_cpu_mask + 1; if (sock_flow_table->ents[hash] != hash) { rcu_read_lock(); - sock_flow_table = rcu_dereference( - net_hotdata.rps_sock_flow_table); + sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); sock_flow_table->ents[hash] = hash; rcu_read_unlock(); } @@ -223,12 +223,12 @@ void homa_send_ipis(void) * Return: A list of packets, or NULL if for the packet couldn't be split. */ struct sk_buff *homa_gso_segment(struct sk_buff *skb, - netdev_features_t features) + netdev_features_t features) { struct sk_buff *segs; tt_record2("homa_gso_segment invoked, frags %d, headlen %d", - skb_shinfo(skb)->nr_frags, skb_headlen(skb)); + skb_shinfo(skb)->nr_frags, skb_headlen(skb)); /* This is needed to separate header info (which is replicated * in each segment) from data, which is divided among the segments. @@ -244,7 +244,7 @@ struct sk_buff *homa_gso_segment(struct sk_buff *skb, struct sk_buff *seg; int i = 0; - for (seg = segs; seg != NULL; seg = seg->next) { + for (seg = segs; seg; seg = seg->next) { ip_hdr(seg)->id = htons(i); i++; } @@ -270,7 +270,7 @@ struct sk_buff *homa_gso_segment(struct sk_buff *skb, * passed up the stack immediately. */ struct sk_buff *homa_gro_receive(struct list_head *held_list, - struct sk_buff *skb) + struct sk_buff *skb) { /* This function will do one of the following things: * 1. 
Merge skb with a packet in gro_list by appending it to @@ -294,7 +294,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, __u32 hash; int busy; - h_new = (struct data_header *) skb_transport_header(skb); + h_new = (struct data_header *)skb_transport_header(skb); offload_core = &per_cpu(homa_offload_core, raw_smp_processor_id()); busy = (now - offload_core->last_gro) < homa->gro_busy_ns; offload_core->last_active = now; @@ -302,7 +302,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, priority = ipv6_hdr(skb)->priority; saddr = ntohl(ipv6_hdr(skb)->saddr.in6_u.u6_addr32[3]); } else { - priority = ((struct iphdr *) skb_network_header(skb))->tos >> 5; + priority = ((struct iphdr *)skb_network_header(skb))->tos >> 5; saddr = ntohl(ip_hdr(skb)->saddr); } @@ -313,24 +313,24 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, if (h_new->common.type == DATA) { if (h_new->seg.offset == (__force __be32)-1) { tt_record2("homa_gro_receive replaced offset %d with %d", - ntohl(h_new->seg.offset), - ntohl(h_new->common.sequence)); + ntohl(h_new->seg.offset), + ntohl(h_new->common.sequence)); h_new->seg.offset = h_new->common.sequence; } tt_record4("homa_gro_receive got packet from 0x%x id %llu, offset %d, priority %d", - saddr, homa_local_id(h_new->common.sender_id), - ntohl(h_new->seg.offset), priority); - if ((homa_data_len(skb) == ntohl(h_new->message_length)) - && (homa->gro_policy & HOMA_GRO_SHORT_BYPASS) - && !busy) { + saddr, homa_local_id(h_new->common.sender_id), + ntohl(h_new->seg.offset), priority); + if (homa_data_len(skb) == ntohl(h_new->message_length) && + (homa->gro_policy & HOMA_GRO_SHORT_BYPASS) && + !busy) { INC_METRIC(gro_data_bypasses, 1); goto bypass; } } else if (h_new->common.type == GRANT) { tt_record4("homa_gro_receive got grant from 0x%x id %llu, offset %d, priority %d", - saddr, homa_local_id(h_new->common.sender_id), - ntohl(((struct grant_header *) h_new)->offset), - priority); + saddr, homa_local_id(h_new->common.sender_id), + ntohl(((struct grant_header *)h_new)->offset), + priority); /* The following optimization handles grants here at NAPI * level, bypassing the SoftIRQ mechanism (and avoiding the * delay of handing off to a different core). This makes @@ -341,10 +341,11 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, INC_METRIC(gro_grant_bypasses, 1); goto bypass; } - } else + } else { tt_record4("homa_gro_receive got packet from 0x%x id %llu, type 0x%x, priority %d", - saddr, homa_local_id(h_new->common.sender_id), - h_new->common.type, priority); + saddr, homa_local_id(h_new->common.sender_id), + h_new->common.type, priority); + } /* The GRO mechanism tries to separate packets onto different * gro_lists by hash. This is bad for us, because we want to batch @@ -370,8 +371,8 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, * some other protocol). 
*/ list_for_each_entry(held_skb, - &napi->gro_hash[offload_core->held_bucket].list, - list) { + &napi->gro_hash[offload_core->held_bucket].list, + list) { int protocol; if (held_skb != offload_core->held_skb) @@ -382,7 +383,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, protocol = ip_hdr(held_skb)->protocol; if (protocol != IPPROTO_HOMA) { tt_record3("homa_gro_receive held_skb 0x%0x%0x isn't Homa: protocol %d", - SPLIT_64(held_skb), protocol); + SPLIT_64(held_skb), protocol); continue; } @@ -415,7 +416,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, napi->gro_hash[offload_core->held_bucket].count--; if (napi->gro_hash[offload_core->held_bucket].count == 0) __clear_bit(offload_core->held_bucket, - &napi->gro_bitmask); + &napi->gro_bitmask); result = ERR_PTR(-EINPROGRESS); } goto done; @@ -473,7 +474,7 @@ void homa_gro_gen2(struct sk_buff *skb) * cores. See balance.txt for overall design information on load * balancing. */ - struct data_header *h = (struct data_header *) skb_transport_header(skb); + struct data_header *h = (struct data_header *)skb_transport_header(skb); int this_core = raw_smp_processor_id(); struct homa_offload_core *offload_core; int candidate = this_core; @@ -490,8 +491,8 @@ void homa_gro_gen2(struct sk_buff *skb) if ((offload_core->last_gro + homa->busy_ns) > now) continue; tt_record3("homa_gro_gen2 chose core %d for id %d offset %d", - candidate, homa_local_id(h->common.sender_id), - ntohl(h->seg.offset)); + candidate, homa_local_id(h->common.sender_id), + ntohl(h->seg.offset)); break; } if (i <= 0) { @@ -508,8 +509,8 @@ void homa_gro_gen2(struct sk_buff *skb) while (candidate >= nr_cpu_ids) candidate -= nr_cpu_ids; tt_record3("homa_gro_gen2 chose core %d for id %d offset %d (all cores busy)", - candidate, homa_local_id(h->common.sender_id), - ntohl(h->seg.offset)); + candidate, homa_local_id(h->common.sender_id), + ntohl(h->seg.offset)); } atomic_inc(&per_cpu(homa_offload_core, candidate).softirq_backlog); homa_set_softirq_cpu(skb, candidate); @@ -528,13 +529,13 @@ void homa_gro_gen3(struct sk_buff *skb) /* See balance.txt for overall design information on the Gen3 * load balancer. 
*/ - struct data_header *h = (struct data_header *) skb_transport_header(skb); + struct data_header *h = (struct data_header *)skb_transport_header(skb); __u64 now, busy_time; int *candidates; int i, core; candidates = per_cpu(homa_offload_core, - raw_smp_processor_id()).gen3_softirq_cores; + raw_smp_processor_id()).gen3_softirq_cores; now = sched_clock(); busy_time = now - homa->busy_ns; @@ -553,9 +554,9 @@ void homa_gro_gen3(struct sk_buff *skb) homa_set_softirq_cpu(skb, core); per_cpu(homa_offload_core, core).last_active = now; tt_record4("homa_gro_gen3 chose core %d for id %d, offset %d, delta %d", - core, homa_local_id(h->common.sender_id), - ntohl(h->seg.offset), - now - per_cpu(homa_offload_core, core).last_app_active); + core, homa_local_id(h->common.sender_id), + ntohl(h->seg.offset), + now - per_cpu(homa_offload_core, core).last_app_active); INC_METRIC(gen3_handoffs, 1); if (core != candidates[0]) INC_METRIC(gen3_alt_handoffs, 1); @@ -573,7 +574,7 @@ void homa_gro_gen3(struct sk_buff *skb) */ int homa_gro_complete(struct sk_buff *skb, int hoffset) { - struct data_header *h = (struct data_header *) skb_transport_header(skb); + struct data_header *h = (struct data_header *)skb_transport_header(skb); // tt_record4("homa_gro_complete type %d, id %d, offset %d, count %d", // h->common.type, homa_local_id(h->common.sender_id), @@ -597,7 +598,8 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset) * hasn't done NAPI or SoftIRQ processing for Homa in the * longest time. */ - core = best = raw_smp_processor_id(); + best = raw_smp_processor_id(); + core = best; for (i = 0; i < CORES_TO_CHECK; i++) { core++; if (unlikely(core >= nr_cpu_ids)) @@ -610,8 +612,8 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset) } homa_set_softirq_cpu(skb, best); tt_record3("homa_gro_complete chose core %d for id %d offset %d with IDLE policy", - best, homa_local_id(h->common.sender_id), - ntohl(h->seg.offset)); + best, homa_local_id(h->common.sender_id), + ntohl(h->seg.offset)); } else if (homa->gro_policy & HOMA_GRO_NEXT) { /* Use the next core (in circular order) to handle the * SoftIRQ processing. @@ -622,8 +624,8 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset) target = 0; homa_set_softirq_cpu(skb, target); tt_record3("homa_gro_complete chose core %d for id %d offset %d with NEXT policy", - target, homa_local_id(h->common.sender_id), - ntohl(h->seg.offset)); + target, homa_local_id(h->common.sender_id), + ntohl(h->seg.offset)); } return 0; diff --git a/homa_outgoing.c b/homa_outgoing.c index 70115dc9..9ae3dfb7 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -107,7 +107,7 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, struct data_header *h; struct sk_buff *skb; int err, gso_size; - uint64_t segs; + __u64 segs; /* Initialize the overall skb. */ skb = homa_skb_new_tx(sizeof32(struct data_header)); @@ -145,7 +145,7 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, homa_info->seg_length = max_seg_data; homa_info->offset = offset; - if ((segs > 1) && (rpc->hsk->sock.sk_protocol != IPPROTO_TCP)) { + if (segs > 1 && rpc->hsk->sock.sk_protocol != IPPROTO_TCP) { homa_set_doff(h, sizeof(struct data_header) - sizeof32(struct seg_header)); h->seg.offset = htonl(offset); @@ -212,7 +212,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) struct sk_buff **last_link; struct dst_entry *dst; - uint64_t segs_per_gso; + __u64 segs_per_gso; int overlap_xmit; /* Bytes of the message that haven't yet been copied into skbs. 
*/ @@ -265,9 +265,8 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) homa_rpc_unlock(rpc); skb_data_bytes = max_gso_data; offset = rpc->msgout.length - bytes_left; - if ((offset < rpc->msgout.unscheduled) && - ((offset + skb_data_bytes) - > rpc->msgout.unscheduled)) { + if (offset < rpc->msgout.unscheduled && + (offset + skb_data_bytes) > rpc->msgout.unscheduled) { /* Insert a packet boundary at the unscheduled limit, * so we don't transmit extra data. */ @@ -296,8 +295,8 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) *last_link = NULL; rpc->msgout.num_skbs++; rpc->msgout.copied_from_user = rpc->msgout.length - bytes_left; - if (overlap_xmit && list_empty(&rpc->throttled_links) && xmit - && (offset < rpc->msgout.granted)) { + if (overlap_xmit && list_empty(&rpc->throttled_links) && + xmit && offset < rpc->msgout.granted) { tt_record1("waking up pacer for id %d", rpc->id); homa_add_to_throttled(rpc); } @@ -383,7 +382,7 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, UNIT_LOG(",", "padded control packet with %d bytes", extra_bytes); } - priority = hsk->homa->num_priorities-1; + priority = hsk->homa->num_priorities - 1; skb->ooo_okay = 1; skb_get(skb); if (hsk->inet.sk.sk_family == AF_INET6) { @@ -392,7 +391,7 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, 0); } else { /* This will find its way to the DSCP field in the IPv4 hdr. */ - hsk->inet.tos = hsk->homa->priority_map[priority]<<5; + hsk->inet.tos = hsk->homa->priority_map[priority] << 5; result = ip_queue_xmit(&hsk->inet.sk, skb, &peer->flow); } if (unlikely(result != 0)) { @@ -456,8 +455,8 @@ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk) if (hsk->homa->verbose) pr_notice("sending UNKNOWN to peer %s:%d for id %llu", - homa_print_ipv6_addr(&saddr), - ntohs(h->sport), homa_local_id(h->sender_id)); + homa_print_ipv6_addr(&saddr), + ntohs(h->sport), homa_local_id(h->sender_id)); tt_record3("sending unknown to 0x%x:%d for id %llu", tt_addr(saddr), ntohs(h->sport), homa_local_id(h->sender_id)); @@ -503,8 +502,8 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) if (rpc->msgout.next_xmit_offset >= rpc->msgout.granted) { tt_record3("homa_xmit_data stopping at offset %d for id %u: granted is %d", - rpc->msgout.next_xmit_offset, rpc->id, - rpc->msgout.granted); + rpc->msgout.next_xmit_offset, rpc->id, + rpc->msgout.granted); break; } @@ -520,7 +519,7 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) if (rpc->msgout.next_xmit_offset < rpc->msgout.unscheduled) { priority = homa_unsched_priority(homa, rpc->peer, - rpc->msgout.length); + rpc->msgout.length); } else { priority = rpc->msgout.sched_priority; } @@ -565,8 +564,8 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority) /* Update info that may have changed since the message was initially * created. 
*/ - ((struct data_header *) skb_transport_header(skb))->cutoff_version - = rpc->peer->cutoff_version; + ((struct data_header *)skb_transport_header(skb))->cutoff_version = + rpc->peer->cutoff_version; dst = homa_get_dst(rpc->peer, rpc->hsk); dst_hold(dst); @@ -590,7 +589,8 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority) tt_addr(rpc->peer->addr), rpc->id, homa_info->offset); - rpc->hsk->inet.tos = rpc->hsk->homa->priority_map[priority]<<5; + rpc->hsk->inet.tos = + rpc->hsk->homa->priority_map[priority] << 5; err = ip_queue_xmit(&rpc->hsk->inet.sk, skb, &rpc->peer->flow); } tt_record4("Finished queueing packet: rpc id %llu, offset %d, len %d, qid %d", @@ -614,7 +614,7 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority) * @priority: Priority level to use for the retransmitted data packets. */ void homa_resend_data(struct homa_rpc *rpc, int start, int end, - int priority) + int priority) { struct homa_skb_info *homa_info; struct sk_buff *skb; @@ -666,7 +666,7 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end, if (unlikely(!new_skb)) { if (rpc->hsk->homa->verbose) pr_notice("%s couldn't allocate skb\n", - __func__); + __func__); UNIT_LOG("; ", "skb allocation error"); goto resend_done; } @@ -935,8 +935,8 @@ void homa_pacer_xmit(struct homa *homa) /* Note: rpc->state could be RPC_DEAD here, but the code * below should work anyway. */ - if (!*rpc->msgout.next_xmit || (rpc->msgout.next_xmit_offset - >= rpc->msgout.granted)) { + if (!*rpc->msgout.next_xmit || rpc->msgout.next_xmit_offset >= + rpc->msgout.granted) { /* Nothing more to transmit from this message (right * now), so remove it from the throttled list. */ @@ -1055,7 +1055,7 @@ void homa_remove_from_throttled(struct homa_rpc *rpc) void homa_log_throttled(struct homa *homa) { struct homa_rpc *rpc; - int64_t bytes = 0; + __s64 bytes = 0; int rpcs = 0; pr_notice("Printing throttled list\n"); @@ -1063,11 +1063,11 @@ void homa_log_throttled(struct homa *homa) list_for_each_entry_rcu(rpc, &homa->throttled_rpcs, throttled_links) { rpcs++; if (!homa_bucket_try_lock(rpc->bucket, rpc->id, - "homa_log_throttled")) { + "homa_log_throttled")) { pr_notice("Skipping throttled RPC: locked\n"); continue; } - if (*rpc->msgout.next_xmit != NULL) + if (*rpc->msgout.next_xmit) bytes += rpc->msgout.length - rpc->msgout.next_xmit_offset; if (rpcs <= 20) @@ -1076,5 +1076,5 @@ void homa_log_throttled(struct homa *homa) } homa_throttle_unlock(homa); pr_notice("Finished printing throttle list: %d rpcs, %lld bytes\n", - rpcs, bytes); + rpcs, bytes); } diff --git a/homa_peer.c b/homa_peer.c index 77cf4715..5aad3696 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -70,7 +70,7 @@ void homa_peertab_destroy(struct homa_peertab *peertab) * are no peers, NULL is returned. 
*/ struct homa_peer **homa_peertab_get_peers(struct homa_peertab *peertab, - int *num_peers) + int *num_peers) { struct homa_peer **result; struct hlist_node *next; @@ -85,7 +85,7 @@ struct homa_peer **homa_peertab_get_peers(struct homa_peertab *peertab, count = 0; for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) { hlist_for_each_entry_safe(peer, next, &peertab->buckets[i], - peertab_links) + peertab_links) count++; } @@ -93,13 +93,13 @@ struct homa_peer **homa_peertab_get_peers(struct homa_peertab *peertab, return NULL; result = kmalloc_array(count, sizeof(peer), GFP_KERNEL); - if (result == NULL) + if (!result) return NULL; *num_peers = count; count = 0; for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) { hlist_for_each_entry_safe(peer, next, &peertab->buckets[i], - peertab_links) { + peertab_links) { result[count] = peer; count++; } @@ -196,8 +196,8 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab, goto done; } peer->dst = dst; - peer->unsched_cutoffs[HOMA_MAX_PRIORITIES-1] = 0; - peer->unsched_cutoffs[HOMA_MAX_PRIORITIES-2] = INT_MAX; + peer->unsched_cutoffs[HOMA_MAX_PRIORITIES - 1] = 0; + peer->unsched_cutoffs[HOMA_MAX_PRIORITIES - 2] = INT_MAX; peer->cutoff_version = 0; peer->last_update_jiffies = 0; INIT_LIST_HEAD(&peer->grantable_rpcs); @@ -236,7 +236,7 @@ void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, /* Retain the existing dst if we can't create a new one. */ if (hsk->homa->verbose) pr_notice("%s couldn't recreate dst: error %ld", - __func__, PTR_ERR(dst)); + __func__, PTR_ERR(dst)); INC_METRIC(peer_route_errors, 1); } else { struct homa_dead_dst *dead = (struct homa_dead_dst *) @@ -270,11 +270,11 @@ void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, * Return: A priority level. */ int homa_unsched_priority(struct homa *homa, struct homa_peer *peer, - int length) + int length) { int i; - for (i = homa->num_priorities-1; ; i--) { + for (i = homa->num_priorities - 1; ; i--) { if (peer->unsched_cutoffs[i] >= length) return i; } @@ -343,7 +343,7 @@ struct dst_entry *homa_peer_get_dst(struct homa_peer *peer, * @c7: Largest message size that will use priority 7. 
*/ void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, int c2, - int c3, int c4, int c5, int c6, int c7) + int c3, int c4, int c5, int c6, int c7) { peer->unsched_cutoffs[0] = c0; peer->unsched_cutoffs[1] = c1; diff --git a/homa_plumbing.c b/homa_plumbing.c index 8f37f312..1799adfa 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -394,7 +394,7 @@ static struct ctl_table homa_ctl_table[] = { { .procname = "priority_map", .data = &homa_data.priority_map, - .maxlen = HOMA_MAX_PRIORITIES*sizeof(int), + .maxlen = HOMA_MAX_PRIORITIES * sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, @@ -478,7 +478,7 @@ static struct ctl_table homa_ctl_table[] = { { .procname = "unsched_cutoffs", .data = &homa_data.unsched_cutoffs, - .maxlen = HOMA_MAX_PRIORITIES*sizeof(int), + .maxlen = HOMA_MAX_PRIORITIES * sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, @@ -576,7 +576,7 @@ static int __init homa_load(void) if (status) goto out_cleanup; metrics_dir_entry = proc_create("homa_metrics", 0444, - init_net.proc_net, &homa_metrics_pops); + init_net.proc_net, &homa_metrics_pops); if (!metrics_dir_entry) { pr_err("couldn't create /proc/net/homa_metrics\n"); status = -ENOMEM; @@ -584,7 +584,7 @@ static int __init homa_load(void) } homa_ctl_header = register_net_sysctl(&init_net, "net/homa", - homa_ctl_table); + homa_ctl_table); if (!homa_ctl_header) { pr_err("couldn't register Homa sysctl parameters\n"); status = -ENOMEM; @@ -793,7 +793,8 @@ int homa_ioctl(struct sock *sk, int cmd, int *karg) INC_METRIC(abort_ns, sched_clock() - start); break; case HOMAIOCFREEZE: - tt_record1("Freezing timetrace because of HOMAIOCFREEZE ioctl, pid %d", current->pid); + tt_record1("Freezing timetrace because of HOMAIOCFREEZE ioctl, pid %d", + current->pid); tt_freeze(); result = 0; break; @@ -905,7 +906,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) goto error; } if (unlikely(copy_from_user(&args, (void __user *)msg->msg_control, - sizeof(args)))) { + sizeof(args)))) { result = -EFAULT; goto error; } @@ -1078,18 +1079,18 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, * for performance debugging). */ if (rpc->hsk->homa->freeze_type == SLOW_RPC) { - uint64_t elapsed = (sched_clock() - rpc->start_ns)>>10; + u64 elapsed = (sched_clock() - rpc->start_ns) >> 10; - if ((elapsed <= hsk->homa->temp[1]) - && (elapsed >= hsk->homa->temp[0]) - && homa_is_client(rpc->id) - && (rpc->msgin.length >= hsk->homa->temp[2]) - && (rpc->msgin.length < hsk->homa->temp[3])) { + if (elapsed <= hsk->homa->temp[1] && + elapsed >= hsk->homa->temp[0] && + homa_is_client(rpc->id) && + rpc->msgin.length >= hsk->homa->temp[2] && + rpc->msgin.length < hsk->homa->temp[3]) { tt_record4("Long RTT: kcycles %d, id %d, peer 0x%x, length %d", - elapsed, rpc->id, - tt_addr(rpc->peer->addr), - rpc->msgin.length); - homa_freeze(rpc, SLOW_RPC, "Freezing because of long elapsed time for RPC id %d, peer 0x%x"); + elapsed, rpc->id, tt_addr(rpc->peer->addr), + rpc->msgin.length); + homa_freeze(rpc, SLOW_RPC, + "Freezing because of long elapsed time for RPC id %d, peer 0x%x"); } } @@ -1241,10 +1242,10 @@ int homa_softirq(struct sk_buff *skb) __skb_pull(skb, header_offset); /* Reject packets that are too short or have bogus types. 
*/ - h = (struct common_header *) skb->data; + h = (struct common_header *)skb->data; if (unlikely(skb->len < sizeof(struct common_header) || - h->type < DATA || h->type >= BOGUS || - skb->len < header_lengths[h->type-DATA])) { + h->type < DATA || h->type >= BOGUS || + skb->len < header_lengths[h->type - DATA])) { if (homa->verbose) pr_warn("Homa %s packet from %s too short: %d bytes\n", homa_symbol_for_type(h->type), @@ -1268,9 +1269,9 @@ int homa_softirq(struct sk_buff *skb) if (!tt_frozen) { homa_rpc_log_active_tt(homa, 0); tt_record4("Freezing because of request on port %d from 0x%x:%d, id %d", - ntohs(h->dport), tt_addr(saddr), - ntohs(h->sport), - homa_local_id(h->sender_id)); + ntohs(h->dport), tt_addr(saddr), + ntohs(h->sport), + homa_local_id(h->sender_id)); tt_freeze(); } goto discard; @@ -1438,7 +1439,7 @@ int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, } else { if (homa->verbose) pr_notice("%s invoked with info %x, ICMP type %d, ICMP code %d\n", - __func__, info, type, code); + __func__, info, type, code); } return 0; } @@ -1488,10 +1489,10 @@ __poll_t homa_poll(struct file *file, struct socket *sock, */ #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) int homa_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void __user *buffer, size_t *lenp, loff_t *ppos) #else int homa_dointvec(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) #endif { int result; @@ -1508,8 +1509,8 @@ int homa_dointvec(const struct ctl_table *table, int write, * particular value was written (don't want to increment * cutoff_version otherwise). */ - if ((table->data == &homa_data.unsched_cutoffs) - || (table->data == &homa_data.num_priorities)) { + if (table->data == &homa_data.unsched_cutoffs || + table->data == &homa_data.num_priorities) { homa_prios_changed(homa); } @@ -1522,18 +1523,18 @@ int homa_dointvec(const struct ctl_table *table, int write, * to print information to the log. 
*/ if (table->data == &action) { - if (action == 2) + if (action == 2) { homa_rpc_log_active(homa, 0); - else if (action == 3) { + } else if (action == 3) { #if 1 /* See strip.py */ tt_record("Freezing because of sysctl"); tt_freeze(); #endif /* See strip.py */ - } else if (action == 4) + } else if (action == 4) { homa_log_throttled(homa); - else if (action == 5) + } else if (action == 5) { tt_printk(); - else if (action == 6) { + } else if (action == 6) { tt_record("Calling homa_rpc_log_active because of action 6"); homa_rpc_log_active_tt(homa, 0); tt_record("Freezing because of action 6"); @@ -1546,11 +1547,12 @@ int homa_dointvec(const struct ctl_table *table, int write, tt_freeze(); } else if (action == 8) { pr_notice("homa_total_incoming is %d\n", - atomic_read(&homa->total_incoming)); + atomic_read(&homa->total_incoming)); } else if (action == 9) { tt_print_file("/users/ouster/node.tt"); - } else + } else { homa_rpc_log_active(homa, action); + } action = 0; } } @@ -1571,10 +1573,10 @@ int homa_dointvec(const struct ctl_table *table, int write, */ #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) int homa_sysctl_softirq_cores(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void __user *buffer, size_t *lenp, loff_t *ppos) #else int homa_sysctl_softirq_cores(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) #endif { struct homa_offload_core *offload_core; @@ -1584,7 +1586,7 @@ int homa_sysctl_softirq_cores(const struct ctl_table *table, int write, max_values = (NUM_GEN3_SOFTIRQ_CORES + 1) * nr_cpu_ids; values = kmalloc_array(max_values, sizeof(int), GFP_KERNEL); - if (values == NULL) + if (!values) return -ENOMEM; table_copy = *table; @@ -1607,7 +1609,8 @@ int homa_sysctl_softirq_cores(const struct ctl_table *table, int write, break; offload_core = &per_cpu(homa_offload_core, values[i]); for (j = 0; j < NUM_GEN3_SOFTIRQ_CORES; j++) - offload_core->gen3_softirq_cores[j] = values[i+j+1]; + offload_core->gen3_softirq_cores[j] = + values[i + j + 1]; } } else { /* Read: return values from all of the cores. */ diff --git a/homa_pool.c b/homa_pool.c index b3f417ce..84407149 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -198,13 +198,13 @@ int homa_pool_get_pages(struct homa_pool *pool, int num_pages, __u32 *pages, */ ref_count = atomic_read(&bpage->refs); if (ref_count >= 2 || (ref_count == 1 && (bpage->owner < 0 || - bpage->expiration > now))) + bpage->expiration > now))) continue; if (!spin_trylock_bh(&bpage->lock)) continue; ref_count = atomic_read(&bpage->refs); if (ref_count >= 2 || (ref_count == 1 && (bpage->owner < 0 || - bpage->expiration > now))) { + bpage->expiration > now))) { spin_unlock_bh(&bpage->lock); continue; } @@ -453,7 +453,8 @@ void homa_pool_check_waiting(struct homa_pool *pool) /* Allocation succeeded; "wake up" the RPC. 
*/ rpc->msgin.resend_all = 1; homa_grant_check_rpc(rpc); - } else + } else { homa_rpc_unlock(rpc); + } } } diff --git a/homa_rpc.c b/homa_rpc.c index 357bf23c..ea36033b 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -296,9 +296,9 @@ void homa_rpc_free(struct homa_rpc *rpc) if (rpc->msgin.length >= 0) { rpc->hsk->dead_skbs += skb_queue_len(&rpc->msgin.packets); while (1) { - struct homa_gap *gap = list_first_entry_or_null( - &rpc->msgin.gaps, - struct homa_gap, links); + struct homa_gap *gap = list_first_entry_or_null(&rpc->msgin.gaps, + struct homa_gap, + links); if (!gap) break; list_del(&gap->links); @@ -454,7 +454,8 @@ int homa_rpc_reap(struct homa_sock *hsk, int count) gap = list_first_entry_or_null( &rpc->msgin.gaps, - struct homa_gap, links); + struct homa_gap, + links); if (!gap) break; list_del(&gap->links); @@ -543,24 +544,19 @@ void homa_rpc_log(struct homa_rpc *rpc) if (rpc->state == RPC_INCOMING) pr_notice("%s RPC INCOMING, id %llu, peer %s:%d, %d/%d bytes received, incoming %d\n", - type, rpc->id, peer, rpc->dport, - rpc->msgin.length - - rpc->msgin.bytes_remaining, - rpc->msgin.length, rpc->msgin.granted); + type, rpc->id, peer, rpc->dport, + rpc->msgin.length - rpc->msgin.bytes_remaining, + rpc->msgin.length, rpc->msgin.granted); else if (rpc->state == RPC_OUTGOING) { pr_notice("%s RPC OUTGOING, id %llu, peer %s:%d, out length %d, left %d, granted %d, in left %d, resend_ticks %u, silent_ticks %d\n", - type, rpc->id, peer, rpc->dport, - rpc->msgout.length, - rpc->msgout.length - rpc->msgout.next_xmit_offset, - rpc->msgout.granted, - rpc->msgin.bytes_remaining, - rpc->resend_timer_ticks, - rpc->silent_ticks); + type, rpc->id, peer, rpc->dport, rpc->msgout.length, + rpc->msgout.length - rpc->msgout.next_xmit_offset, + rpc->msgout.granted, rpc->msgin.bytes_remaining, + rpc->resend_timer_ticks, rpc->silent_ticks); } else { pr_notice("%s RPC %s, id %llu, peer %s:%d, incoming length %d, outgoing length %d\n", - type, homa_symbol_for_state(rpc), - rpc->id, peer, rpc->dport, - rpc->msgin.length, rpc->msgout.length); + type, homa_symbol_for_state(rpc), rpc->id, peer, + rpc->dport, rpc->msgin.length, rpc->msgout.length); } } @@ -581,7 +577,7 @@ void homa_rpc_log_active(struct homa *homa, uint64_t id) pr_notice("Logging active Homa RPCs:\n"); rcu_read_lock(); for (hsk = homa_socktab_start_scan(homa->port_map, &scan); - hsk != NULL; hsk = homa_socktab_next(&scan)) { + hsk; hsk = homa_socktab_next(&scan)) { if (list_empty(&hsk->active_rpcs) || hsk->shutdown) continue; @@ -589,7 +585,7 @@ void homa_rpc_log_active(struct homa *homa, uint64_t id) continue; list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { count++; - if ((id != 0) && (id != rpc->id)) + if (id != 0 && id != rpc->id) continue; homa_rpc_log(rpc); } @@ -611,32 +607,32 @@ void homa_rpc_log_tt(struct homa_rpc *rpc) int received = rpc->msgin.length - rpc->msgin.bytes_remaining; tt_record4("Incoming RPC id %d, peer 0x%x, %d/%d bytes received", - rpc->id, tt_addr(rpc->peer->addr), - received, rpc->msgin.length); + rpc->id, tt_addr(rpc->peer->addr), + received, rpc->msgin.length); if (1) tt_record4("RPC id %d has incoming %d, granted %d, prio %d", rpc->id, - rpc->msgin.granted - received, - rpc->msgin.granted, rpc->msgin.priority); + rpc->msgin.granted - received, + rpc->msgin.granted, rpc->msgin.priority); tt_record4("RPC id %d: length %d, remaining %d, rank %d", - rpc->id, rpc->msgin.length, - rpc->msgin.bytes_remaining, - atomic_read(&rpc->msgin.rank)); + rpc->id, rpc->msgin.length, + rpc->msgin.bytes_remaining, + 
atomic_read(&rpc->msgin.rank)); if (rpc->msgin.num_bpages == 0) tt_record1("RPC id %d is blocked waiting for buffers", - rpc->id); + rpc->id); else tt_record2("RPC id %d has %d bpages allocated", - rpc->id, rpc->msgin.num_bpages); + rpc->id, rpc->msgin.num_bpages); } else if (rpc->state == RPC_OUTGOING) { tt_record4("Outgoing RPC id %d, peer 0x%x, %d/%d bytes sent", - rpc->id, tt_addr(rpc->peer->addr), - rpc->msgout.next_xmit_offset, - rpc->msgout.length); + rpc->id, tt_addr(rpc->peer->addr), + rpc->msgout.next_xmit_offset, + rpc->msgout.length); if (rpc->msgout.granted > rpc->msgout.next_xmit_offset) tt_record3("RPC id %d has %d unsent grants (granted %d)", - rpc->id, rpc->msgout.granted - - rpc->msgout.next_xmit_offset, - rpc->msgout.granted); + rpc->id, rpc->msgout.granted - + rpc->msgout.next_xmit_offset, + rpc->msgout.granted); } else { tt_record2("RPC id %d is in state %d", rpc->id, rpc->state); } @@ -661,7 +657,7 @@ void homa_rpc_log_active_tt(struct homa *homa, int freeze_count) tt_record("Logging active Homa RPCs:"); rcu_read_lock(); for (hsk = homa_socktab_start_scan(homa->port_map, &scan); - hsk != NULL; hsk = homa_socktab_next(&scan)) { + hsk; hsk = homa_socktab_next(&scan)) { if (list_empty(&hsk->active_rpcs) || hsk->shutdown) continue; @@ -710,11 +706,11 @@ int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors) int actual; tt_record1("homa_validate_incoming starting, total_incoming %d", - atomic_read(&homa->total_incoming)); + atomic_read(&homa->total_incoming)); *link_errors = 0; rcu_read_lock(); for (hsk = homa_socktab_start_scan(homa->port_map, &scan); - hsk != NULL; hsk = homa_socktab_next(&scan)) { + hsk; hsk = homa_socktab_next(&scan)) { if (list_empty(&hsk->active_rpcs) || hsk->shutdown) continue; @@ -735,18 +731,18 @@ int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors) total_incoming += rpc->msgin.rec_incoming; if (verbose) tt_record3("homa_validate_incoming: RPC id %d, ncoming %d, rec_incoming %d", - rpc->id, incoming, - rpc->msgin.rec_incoming); + rpc->id, incoming, + rpc->msgin.rec_incoming); if (rpc->msgin.granted >= rpc->msgin.length) continue; if (list_empty(&rpc->grantable_links)) { tt_record1("homa_validate_incoming: RPC id %d not linked in grantable list", - rpc->id); + rpc->id); *link_errors = 1; } if (list_empty(&rpc->grantable_links)) { tt_record1("homa_validate_incoming: RPC id %d peer not linked in grantable list", - rpc->id); + rpc->id); *link_errors = 1; } } @@ -756,7 +752,7 @@ int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors) rcu_read_unlock(); actual = atomic_read(&homa->total_incoming); tt_record3("homa_validate_incoming diff %d (expected %d, got %d)", - actual - total_incoming, total_incoming, actual); + actual - total_incoming, total_incoming, actual); return actual - total_incoming; } @@ -783,7 +779,7 @@ char *homa_symbol_for_state(struct homa_rpc *rpc) } /* See safety comment in homa_symbol_for_type. 
*/ - snprintf(buffer, sizeof(buffer)-1, "unknown(%u)", rpc->state); - buffer[sizeof(buffer)-1] = 0; + snprintf(buffer, sizeof(buffer) - 1, "unknown(%u)", rpc->state); + buffer[sizeof(buffer) - 1] = 0; return buffer; } diff --git a/homa_skb.c b/homa_skb.c index 07aeb21e..5775e72a 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -38,7 +38,7 @@ int homa_skb_init(struct homa *homa) homa->skb_pages_to_free = NULL; homa->pages_to_free_slots = 0; homa->skb_page_free_time = 0; - homa->skb_page_pool_min_kb = (3*HOMA_MAX_MESSAGE_LENGTH)/1000; + homa->skb_page_pool_min_kb = (3 * HOMA_MAX_MESSAGE_LENGTH) / 1000; /* Initialize NUMA-specfific page pools. */ homa->max_numa = -1; @@ -49,7 +49,7 @@ int homa_skb_init(struct homa *homa) BUG_ON(numa >= MAX_NUMNODES); if (numa > homa->max_numa) homa->max_numa = numa; - if (homa->page_pools[numa] == NULL) { + if (!homa->page_pools[numa]) { struct homa_page_pool *pool; pool = kmalloc(sizeof(*pool), GFP_KERNEL); @@ -62,7 +62,7 @@ int homa_skb_init(struct homa *homa) } skb_core->pool = homa->page_pools[numa]; } - pr_notice("homa_skb_init found max NUMA node %d\n", homa->max_numa); + pr_notice("%s found max NUMA node %d\n", __func__, homa->max_numa); return 0; } @@ -78,7 +78,7 @@ void homa_skb_cleanup(struct homa *homa) for (i = 0; i < nr_cpu_ids; i++) { struct homa_skb_core *skb_core = &per_cpu(homa_skb_core, i); - if (skb_core->skb_page != NULL) { + if (skb_core->skb_page) { put_page(skb_core->skb_page); skb_core->skb_page = NULL; skb_core->page_size = 0; @@ -93,7 +93,7 @@ void homa_skb_cleanup(struct homa *homa) for (i = 0; i < MAX_NUMNODES; i++) { struct homa_page_pool *pool = homa->page_pools[i]; - if (pool == NULL) + if (!pool) continue; for (j = pool->avail - 1; j >= 0; j--) put_page(pool->pages[j]); @@ -102,7 +102,7 @@ void homa_skb_cleanup(struct homa *homa) homa->page_pools[i] = NULL; } - if (homa->skb_pages_to_free != NULL) { + if (homa->skb_pages_to_free) { kfree(homa->skb_pages_to_free); homa->skb_pages_to_free = NULL; homa->pages_to_free_slots = 0; @@ -158,7 +158,7 @@ void homa_skb_stash_pages(struct homa *homa, int length) struct homa_page_pool *pool = skb_core->pool; int pages_needed = HOMA_MAX_STASHED(length); - if ((pages_needed < 2) || (skb_core->num_stashed_pages >= pages_needed)) + if (pages_needed < 2 || skb_core->num_stashed_pages >= pages_needed) return; spin_lock_bh(&homa->page_pool_mutex); while (pool->avail && (skb_core->num_stashed_pages < pages_needed)) { @@ -194,15 +194,18 @@ void *homa_skb_extend_frags(struct homa *homa, struct sk_buff *skb, int *length) /* Can we just extend the skb's last fragment? 
*/ skb_core = &per_cpu(homa_skb_core, raw_smp_processor_id()); frag = &shinfo->frags[shinfo->nr_frags - 1]; - if ((shinfo->nr_frags > 0) && (skb_frag_page(frag) == skb_core->skb_page) - && (skb_core->page_inuse < skb_core->page_size) - && ((frag->offset + skb_frag_size(frag)) - == skb_core->page_inuse)) { - if ((skb_core->page_size - skb_core->page_inuse) < actual_size) - actual_size = skb_core->page_size - skb_core->page_inuse; + if (shinfo->nr_frags > 0 && + skb_frag_page(frag) == skb_core->skb_page && + skb_core->page_inuse < skb_core->page_size && + (frag->offset + skb_frag_size(frag)) == skb_core->page_inuse) { + if ((skb_core->page_size - skb_core->page_inuse) < + actual_size) + actual_size = skb_core->page_size - + skb_core->page_inuse; *length = actual_size; skb_frag_size_add(frag, actual_size); - result = page_address(skb_frag_page(frag)) + skb_core->page_inuse; + result = page_address(skb_frag_page(frag)) + + skb_core->page_inuse; skb_core->page_inuse += actual_size; skb_len_add(skb, actual_size); return result; @@ -257,12 +260,11 @@ bool homa_skb_page_alloc(struct homa *homa, struct homa_skb_core *skb_core) skb_core->page_inuse = 0; if (skb_core->num_stashed_pages > 0) { skb_core->num_stashed_pages--; - skb_core->skb_page = skb_core->stashed_pages[ - skb_core->num_stashed_pages]; + skb_core->skb_page = skb_core->stashed_pages[skb_core->num_stashed_pages]; goto success; } - /* Step 2: can we retreive a page from the pool for this NUMA node? */ + /* Step 2: can we retrieve a page from the pool for this NUMA node? */ pool = skb_core->pool; if (pool->avail) { spin_lock_bh(&homa->page_pool_mutex); @@ -296,7 +298,8 @@ bool homa_skb_page_alloc(struct homa *homa, struct homa_skb_core *skb_core) skb_core->page_size = PAGE_SIZE; goto success; } - skb_core->page_size = skb_core->page_inuse = 0; + skb_core->page_size = 0; + skb_core->page_inuse = 0; return false; success: @@ -314,15 +317,15 @@ bool homa_skb_page_alloc(struct homa *homa, struct homa_skb_core *skb_core) * Return: 0 or a negative errno. */ int homa_skb_append_to_frag(struct homa *homa, struct sk_buff *skb, void *buf, - int length) + int length) { - char *src = (char *) buf; + char *src = buf; int chunk_length; char *dst; while (length > 0) { chunk_length = length; - dst = (char *) homa_skb_extend_frags(homa, skb, &chunk_length); + dst = (char *)homa_skb_extend_frags(homa, skb, &chunk_length); if (!dst) return -ENOMEM; memcpy(dst, src, chunk_length); @@ -343,14 +346,14 @@ int homa_skb_append_to_frag(struct homa *homa, struct sk_buff *skb, void *buf, * Return: 0 or a negative errno. */ int homa_skb_append_from_iter(struct homa *homa, struct sk_buff *skb, - struct iov_iter *iter, int length) + struct iov_iter *iter, int length) { int chunk_length; char *dst; while (length > 0) { chunk_length = length; - dst = (char *) homa_skb_extend_frags(homa, skb, &chunk_length); + dst = (char *)homa_skb_extend_frags(homa, skb, &chunk_length); if (!dst) return -ENOMEM; if (copy_from_iter(dst, chunk_length, iter) != chunk_length) @@ -375,7 +378,7 @@ int homa_skb_append_from_iter(struct homa *homa, struct sk_buff *skb, * Return: 0 for success or a negative errno if an error occurred. 
*/ int homa_skb_append_from_skb(struct homa *homa, struct sk_buff *dst_skb, - struct sk_buff *src_skb, int offset, int length) + struct sk_buff *src_skb, int offset, int length) { int src_frag_offset, src_frags_left, chunk_size, err, head_len; struct skb_shared_info *src_shinfo = skb_shinfo(src_skb); @@ -389,8 +392,8 @@ int homa_skb_append_from_skb(struct homa *homa, struct sk_buff *dst_skb, if (chunk_size > (head_len - offset)) chunk_size = head_len - offset; err = homa_skb_append_to_frag(homa, dst_skb, - skb_transport_header(src_skb) + offset, - chunk_size); + skb_transport_header(src_skb) + offset, + chunk_size); if (err) return err; offset += chunk_size; @@ -471,17 +474,18 @@ void homa_skb_free_many_tx(struct homa *homa, struct sk_buff **skbs, int count) for (j = 0; j < shinfo->nr_frags; j++) { struct page *page = skb_frag_page(&shinfo->frags[j]); - if ((compound_order(page) == HOMA_SKB_PAGE_ORDER) - && (page_ref_count(page) == 1)) { + if (compound_order(page) == HOMA_SKB_PAGE_ORDER && + page_ref_count(page) == 1) { pages_to_cache[num_pages] = page; num_pages++; if (num_pages == MAX_PAGES_AT_ONCE) { homa_skb_cache_pages(homa, pages_to_cache, - num_pages); + num_pages); num_pages = 0; } - } else + } else { put_page(page); + } } shinfo->nr_frags = 0; kfree_skb(skb); @@ -517,8 +521,9 @@ void homa_skb_cache_pages(struct homa *homa, struct page **pages, int count) if (pool->avail < LIMIT) { pool->pages[pool->avail] = page; pool->avail++; - } else + } else { put_page(pages[i]); + } } spin_unlock_bh(&homa->page_pool_mutex); } @@ -536,7 +541,7 @@ void homa_skb_get(struct sk_buff *skb, void *dest, int offset, int length) { int chunk_size, frags_left, frag_offset, head_len; struct skb_shared_info *shinfo = skb_shinfo(skb); - char *dst = (char *) dest; + char *dst = dest; skb_frag_t *frag; /* Copy bytes from the linear part of the skb, if any. */ @@ -587,12 +592,13 @@ void homa_skb_release_pages(struct homa *homa) /* Free pages every 0.5 second. */ homa->skb_page_free_time = now + 500000000ULL; - release_max = homa->skb_page_frees_per_sec/2; + release_max = homa->skb_page_frees_per_sec / 2; if (homa->pages_to_free_slots < release_max) { - if (homa->skb_pages_to_free != NULL) + if (homa->skb_pages_to_free) kfree(homa->skb_pages_to_free); homa->skb_pages_to_free = kmalloc_array(release_max, - sizeof(struct page *), GFP_KERNEL); + sizeof(struct page *), + GFP_KERNEL); homa->pages_to_free_slots = release_max; } @@ -602,14 +608,14 @@ void homa_skb_release_pages(struct homa *homa) for (i = 0; i <= homa->max_numa; i++) { struct homa_page_pool *pool = homa->page_pools[i]; - if (pool == NULL) + if (!pool) continue; if (pool->low_mark > max_low_mark) { max_low_mark = pool->low_mark; max_pool = pool; } tt_record3("NUMA node %d has %d pages in skb page pool, low mark %d", - i, pool->avail, pool->low_mark); + i, pool->avail, pool->low_mark); pool->low_mark = pool->avail; } @@ -633,7 +639,7 @@ void homa_skb_release_pages(struct homa *homa) struct page *page = homa->skb_pages_to_free[i]; tt_record2("homa_skb_release_pages releasing page 0x%08x%08x", - tt_hi(page), tt_lo(page)); + tt_hi(page), tt_lo(page)); put_page(page); } } diff --git a/homa_skb.h b/homa_skb.h index db617670..4c6103bb 100644 --- a/homa_skb.h +++ b/homa_skb.h @@ -78,7 +78,7 @@ struct homa_skb_core { * is 0). This is a rough guess, since it doesn't consider all of * the data_segments that will be needed for the packets. 
*/ -#define HOMA_MAX_STASHED(size) (((size - 1) / HOMA_SKB_PAGE_SIZE) + 1) +#define HOMA_MAX_STASHED(size) ((((size) - 1) / HOMA_SKB_PAGE_SIZE) + 1) /** * @num_stashed_pages: number of pages currently available in diff --git a/homa_sock.c b/homa_sock.c index 9f01b2ce..10d1faa6 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -200,8 +200,8 @@ void homa_sock_unlink(struct homa_sock *hsk) list_for_each_entry(scan, &socktab->active_scans, scan_links) { if (!scan->next || scan->next->sock != hsk) continue; - scan->next = (struct homa_socktab_links *)rcu_dereference( - hlist_next_rcu(&scan->next->hash_links)); + scan->next = (struct homa_socktab_links *) + rcu_dereference(hlist_next_rcu(&scan->next->hash_links)); } hlist_del_rcu(&hsk->socktab_links.hash_links); spin_unlock_bh(&socktab->write_lock); diff --git a/homa_sock.h b/homa_sock.h index b7d1041a..d6da08f9 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -333,8 +333,8 @@ static inline int homa_port_hash(__u16 port) * * Return: The bucket in which this RPC will appear, if the RPC exists. */ -static inline struct homa_rpc_bucket *homa_client_rpc_bucket( - struct homa_sock *hsk, __u64 id) +static inline struct homa_rpc_bucket *homa_client_rpc_bucket(struct homa_sock *hsk, + __u64 id) { /* We can use a really simple hash function here because RPC ids * are allocated sequentially. @@ -351,8 +351,8 @@ static inline struct homa_rpc_bucket *homa_client_rpc_bucket( * * Return: The bucket in which this RPC will appear, if the RPC exists. */ -static inline struct homa_rpc_bucket *homa_server_rpc_bucket( - struct homa_sock *hsk, __u64 id) +static inline struct homa_rpc_bucket *homa_server_rpc_bucket(struct homa_sock *hsk, + __u64 id) { /* Each client allocates RPC ids sequentially, so they will * naturally distribute themselves across the hash space. diff --git a/homa_timer.c b/homa_timer.c index 36eb8dca..0033921f 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -116,7 +116,7 @@ void homa_check_rpc(struct homa_rpc *rpc) if (resend.length == 0) return; } - resend.priority = homa->num_priorities-1; + resend.priority = homa->num_priorities - 1; homa_xmit_control(RESEND, &resend, sizeof(resend), rpc); #if 1 /* See strip.py */ if (homa_is_client(rpc->id)) { @@ -176,7 +176,7 @@ void homa_timer(struct homa *homa) for (core = 0; core < nr_cpu_ids; core++) { struct homa_metrics *m = homa_metrics_per_cpu(); - total_grants += m->packets_sent[GRANT-DATA]; + total_grants += m->packets_sent[GRANT - DATA]; } tt_record4("homa_timer found total_incoming %d, num_grantable_rpcs %d, num_active_rpcs %d, new grants %d", @@ -184,18 +184,19 @@ void homa_timer(struct homa *homa) homa->num_grantable_rpcs, homa->num_active_rpcs, total_grants - prev_grant_count); - if ((total_grants == prev_grant_count) - && (homa->num_grantable_rpcs > 20)) { + if (total_grants == prev_grant_count && + homa->num_grantable_rpcs > 20) { zero_count++; - if ((zero_count > 3) && !tt_frozen && 0) { + if (zero_count > 3 && !tt_frozen && 0) { pr_err("%s found no grants going out\n", __func__); homa_rpc_log_active_tt(homa, 0); tt_record("freezing because no grants are going out"); homa_freeze_peers(homa); tt_freeze(); } - } else + } else { zero_count = 0; + } prev_grant_count = total_grants; /* Scan all existing RPCs in all sockets. The rcu_read_lock @@ -209,7 +210,7 @@ void homa_timer(struct homa *homa) * isn't keeping up with RPC reaping, so we'll help * out. See reap.txt for more info. 
*/ - uint64_t start = sched_clock(); + __u64 start = sched_clock(); tt_record("homa_timer calling homa_rpc_reap"); if (homa_rpc_reap(hsk, hsk->homa->reap_limit) == 0) @@ -256,9 +257,9 @@ void homa_timer(struct homa *homa) homa_socktab_end_scan(&scan); rcu_read_unlock(); tt_record4("homa_timer found %d incoming RPCs, incoming sum %d, rec_sum %d, homa->total_incoming %d", - total_incoming_rpcs, sum_incoming, sum_incoming_rec, - atomic_read(&homa->total_incoming)); + total_incoming_rpcs, sum_incoming, sum_incoming_rec, + atomic_read(&homa->total_incoming)); homa_skb_release_pages(homa); end = sched_clock(); - INC_METRIC(timer_ns, end-start); + INC_METRIC(timer_ns, end - start); } diff --git a/homa_utils.c b/homa_utils.c index d737f047..d72604bb 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -90,10 +90,10 @@ int homa_init(struct homa *homa) for (i = 0; i < HOMA_MAX_PRIORITIES; i++) homa->priority_map[i] = i; homa->max_sched_prio = HOMA_MAX_PRIORITIES - 5; - homa->unsched_cutoffs[HOMA_MAX_PRIORITIES-1] = 200; - homa->unsched_cutoffs[HOMA_MAX_PRIORITIES-2] = 2800; - homa->unsched_cutoffs[HOMA_MAX_PRIORITIES-3] = 15000; - homa->unsched_cutoffs[HOMA_MAX_PRIORITIES-4] = HOMA_MAX_MESSAGE_LENGTH; + homa->unsched_cutoffs[HOMA_MAX_PRIORITIES - 1] = 200; + homa->unsched_cutoffs[HOMA_MAX_PRIORITIES - 2] = 2800; + homa->unsched_cutoffs[HOMA_MAX_PRIORITIES - 3] = 15000; + homa->unsched_cutoffs[HOMA_MAX_PRIORITIES - 4] = HOMA_MAX_MESSAGE_LENGTH; #ifdef __UNIT_TEST__ /* Unit tests won't send CUTOFFS messages unless the test changes * this variable. @@ -208,7 +208,7 @@ char *homa_print_ipv4_addr(__be32 addr) if (next_buf >= NUM_BUFS_IPV4) next_buf = 0; snprintf(buffer, BUF_SIZE_IPV4, "%u.%u.%u.%u", (a2 >> 24) & 0xff, - (a2 >> 16) & 0xff, (a2 >> 8) & 0xff, a2 & 0xff); + (a2 >> 16) & 0xff, (a2 >> 8) & 0xff, a2 & 0xff); return buffer; } @@ -224,7 +224,7 @@ char *homa_print_ipv4_addr(__be32 addr) */ char *homa_print_ipv6_addr(const struct in6_addr *addr) { -#define NUM_BUFS (1 << 2) +#define NUM_BUFS BIT(2) #define BUF_SIZE 64 static char buffers[NUM_BUFS][BUF_SIZE]; static int next_buf; @@ -245,10 +245,10 @@ char *homa_print_ipv6_addr(const struct in6_addr *addr) __u32 a2 = ntohl(addr->s6_addr32[3]); snprintf(buffer, BUF_SIZE, "%u.%u.%u.%u", (a2 >> 24) & 0xff, - (a2 >> 16) & 0xff, (a2 >> 8) & 0xff, a2 & 0xff); + (a2 >> 16) & 0xff, (a2 >> 8) & 0xff, a2 & 0xff); } else { const char *inet_ntop(int af, const void *src, char *dst, - size_t size); + size_t size); inet_ntop(AF_INET6, addr, buffer + 1, BUF_SIZE); buffer[0] = '['; strcat(buffer, "]"); @@ -275,25 +275,25 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) struct in6_addr saddr; int used = 0; - if (skb == NULL) { + if (!skb) { snprintf(buffer, buf_len, "skb is NULL!"); - buffer[buf_len-1] = 0; + buffer[buf_len - 1] = 0; return buffer; } homa_skb_get(skb, &header, 0, sizeof(header)); - common = (struct common_header *) header; + common = (struct common_header *)header; saddr = skb_canonical_ipv6_saddr(skb); used = homa_snprintf(buffer, buf_len, used, - "%s from %s:%u, dport %d, id %llu", - homa_symbol_for_type(common->type), - homa_print_ipv6_addr(&saddr), - ntohs(common->sport), ntohs(common->dport), - be64_to_cpu(common->sender_id)); + "%s from %s:%u, dport %d, id %llu", + homa_symbol_for_type(common->type), + homa_print_ipv6_addr(&saddr), + ntohs(common->sport), ntohs(common->dport), + be64_to_cpu(common->sender_id)); switch (common->type) { case DATA: { struct homa_skb_info *homa_info = homa_get_skb_info(skb); - struct 
data_header *h = (struct data_header *) header; + struct data_header *h = (struct data_header *)header; int data_left, i, seg_length, pos, offset; if (skb_shinfo(skb)->gso_segs == 0) { @@ -309,19 +309,19 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) if (offset == -1) offset = ntohl(h->common.sequence); used = homa_snprintf(buffer, buf_len, used, - ", message_length %d, offset %d, data_length %d, incoming %d", - ntohl(h->message_length), offset, - seg_length, ntohl(h->incoming)); + ", message_length %d, offset %d, data_length %d, incoming %d", + ntohl(h->message_length), offset, + seg_length, ntohl(h->incoming)); if (ntohs(h->cutoff_version) != 0) used = homa_snprintf(buffer, buf_len, used, - ", cutoff_version %d", - ntohs(h->cutoff_version)); + ", cutoff_version %d", + ntohs(h->cutoff_version)); if (h->retransmit) used = homa_snprintf(buffer, buf_len, used, - ", RETRANSMIT"); + ", RETRANSMIT"); if (skb_shinfo(skb)->gso_type == 0xd) used = homa_snprintf(buffer, buf_len, used, - ", TSO disabled"); + ", TSO disabled"); if (skb_shinfo(skb)->gso_segs <= 1) break; pos = skb_transport_offset(skb) + sizeof32(*h) + seg_length; @@ -338,28 +338,28 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) if (seg_length > data_left) seg_length = data_left; used = homa_snprintf(buffer, buf_len, used, - " %d@%d", seg_length, offset); + " %d@%d", seg_length, offset); data_left -= seg_length; pos += skb_shinfo(skb)->gso_size; }; break; } case GRANT: { - struct grant_header *h = (struct grant_header *) header; + struct grant_header *h = (struct grant_header *)header; char *resend = (h->resend_all) ? ", resend_all" : ""; used = homa_snprintf(buffer, buf_len, used, - ", offset %d, grant_prio %u%s", - ntohl(h->offset), h->priority, resend); + ", offset %d, grant_prio %u%s", + ntohl(h->offset), h->priority, resend); break; } case RESEND: { - struct resend_header *h = (struct resend_header *) header; + struct resend_header *h = (struct resend_header *)header; used = homa_snprintf(buffer, buf_len, used, - ", offset %d, length %d, resend_prio %u", - ntohl(h->offset), ntohl(h->length), - h->priority); + ", offset %d, length %d, resend_prio %u", + ntohl(h->offset), ntohl(h->length), + h->priority); break; } case UNKNOWN: @@ -369,19 +369,19 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) /* Nothing to add here. */ break; case CUTOFFS: { - struct cutoffs_header *h = (struct cutoffs_header *) header; + struct cutoffs_header *h = (struct cutoffs_header *)header; used = homa_snprintf(buffer, buf_len, used, - ", cutoffs %d %d %d %d %d %d %d %d, version %u", - ntohl(h->unsched_cutoffs[0]), - ntohl(h->unsched_cutoffs[1]), - ntohl(h->unsched_cutoffs[2]), - ntohl(h->unsched_cutoffs[3]), - ntohl(h->unsched_cutoffs[4]), - ntohl(h->unsched_cutoffs[5]), - ntohl(h->unsched_cutoffs[6]), - ntohl(h->unsched_cutoffs[7]), - ntohs(h->cutoff_version)); + ", cutoffs %d %d %d %d %d %d %d %d, version %u", + ntohl(h->unsched_cutoffs[0]), + ntohl(h->unsched_cutoffs[1]), + ntohl(h->unsched_cutoffs[2]), + ntohl(h->unsched_cutoffs[3]), + ntohl(h->unsched_cutoffs[4]), + ntohl(h->unsched_cutoffs[5]), + ntohl(h->unsched_cutoffs[6]), + ntohl(h->unsched_cutoffs[7]), + ntohs(h->cutoff_version)); break; } case FREEZE: @@ -391,23 +391,23 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) /* Nothing to add here. 
*/ break; case ACK: { - struct ack_header *h = (struct ack_header *) header; + struct ack_header *h = (struct ack_header *)header; int i, count; count = ntohs(h->num_acks); used = homa_snprintf(buffer, buf_len, used, ", acks"); for (i = 0; i < count; i++) { used = homa_snprintf(buffer, buf_len, used, - " [cp %d, sp %d, id %llu]", - ntohs(h->acks[i].client_port), - ntohs(h->acks[i].server_port), - be64_to_cpu(h->acks[i].client_id)); + " [cp %d, sp %d, id %llu]", + ntohs(h->acks[i].client_port), + ntohs(h->acks[i].server_port), + be64_to_cpu(h->acks[i].client_id)); } break; } } - buffer[buf_len-1] = 0; + buffer[buf_len - 1] = 0; return buffer; } @@ -426,7 +426,7 @@ char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len) char header[HOMA_MAX_HEADER]; struct common_header *common; - common = (struct common_header *) header; + common = (struct common_header *)header; homa_skb_get(skb, header, 0, HOMA_MAX_HEADER); switch (common->type) { case DATA: { @@ -447,8 +447,8 @@ char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len) pos = skb_transport_offset(skb) + sizeof32(*h) + seg_length; used = homa_snprintf(buffer, buf_len, 0, "DATA%s %d@%d", - h->retransmit ? " retrans" : "", - seg_length, offset); + h->retransmit ? " retrans" : "", + seg_length, offset); for (i = skb_shinfo(skb)->gso_segs - 1; i > 0; i--) { if (homa_info->seg_length < skb_shinfo(skb)->gso_size) { struct seg_header seg; @@ -461,26 +461,26 @@ char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len) if (seg_length > data_left) seg_length = data_left; used = homa_snprintf(buffer, buf_len, used, - " %d@%d", seg_length, offset); + " %d@%d", seg_length, offset); data_left -= seg_length; pos += skb_shinfo(skb)->gso_size; } break; } case GRANT: { - struct grant_header *h = (struct grant_header *) header; + struct grant_header *h = (struct grant_header *)header; char *resend = h->resend_all ? " resend_all" : ""; snprintf(buffer, buf_len, "GRANT %d@%d%s", ntohl(h->offset), - h->priority, resend); + h->priority, resend); break; } case RESEND: { - struct resend_header *h = (struct resend_header *) header; + struct resend_header *h = (struct resend_header *)header; snprintf(buffer, buf_len, "RESEND %d-%d@%d", ntohl(h->offset), - ntohl(h->offset) + ntohl(h->length) - 1, - h->priority); + ntohl(h->offset) + ntohl(h->length) - 1, + h->priority); break; } case UNKNOWN: @@ -503,7 +503,7 @@ char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len) break; default: snprintf(buffer, buf_len, "unknown packet type 0x%x", - common->type); + common->type); break; } return buffer; @@ -524,13 +524,13 @@ void homa_freeze_peers(struct homa *homa) /* Find a socket to use (any will do). 
*/ hsk = homa_socktab_start_scan(homa->port_map, &scan); homa_socktab_end_scan(&scan); - if (hsk == NULL) { + if (!hsk) { tt_record("homa_freeze_peers couldn't find a socket"); return; } peers = homa_peertab_get_peers(homa->peers, &num_peers); - if (peers == NULL) { + if (!peers) { tt_record("homa_freeze_peers couldn't find peers to freeze"); return; } @@ -544,8 +544,8 @@ void homa_freeze_peers(struct homa *homa) tt_record1("Sending freeze to 0x%x", tt_addr(peers[i]->addr)); err = __homa_xmit_control(&freeze, sizeof(freeze), peers[i], hsk); if (err != 0) - tt_record2("homa_freeze_peers got error %d in xmit to 0x%x\n", err, - tt_addr(peers[i]->addr)); + tt_record2("homa_freeze_peers got error %d in xmit to 0x%x\n", + err, tt_addr(peers[i]->addr)); } kfree(peers); } @@ -575,7 +575,7 @@ int homa_snprintf(char *buffer, int size, int used, const char *format, ...) va_start(ap, format); - if (used >= (size-1)) + if (used >= (size - 1)) return used; new_chars = vsnprintf(buffer + used, size - used, format, ap); @@ -623,8 +623,8 @@ char *homa_symbol_for_type(uint8_t type) * code below ensures that the string cannot run past the end of the * buffer, so the code is safe. */ - snprintf(buffer, sizeof(buffer)-1, "unknown(%u)", type); - buffer[sizeof(buffer)-1] = 0; + snprintf(buffer, sizeof(buffer) - 1, "unknown(%u)", type); + buffer[sizeof(buffer) - 1] = 0; return buffer; } @@ -648,7 +648,7 @@ void homa_prios_changed(struct homa *homa) */ homa->unsched_cutoffs[0] = INT_MAX; - for (i = HOMA_MAX_PRIORITIES-1; ; i--) { + for (i = HOMA_MAX_PRIORITIES - 1; ; i--) { if (i >= homa->num_priorities) { homa->unsched_cutoffs[i] = 0; continue; @@ -659,7 +659,7 @@ void homa_prios_changed(struct homa *homa) break; } if (homa->unsched_cutoffs[i] >= HOMA_MAX_MESSAGE_LENGTH) { - homa->max_sched_prio = i-1; + homa->max_sched_prio = i - 1; break; } } @@ -719,7 +719,7 @@ void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, char *format) int dummy; pr_notice("freezing in %s with freeze_type %d\n", __func__, - type); + type); tt_record1("homa_freeze calling homa_rpc_log_active with freeze_type %d", type); homa_rpc_log_active_tt(rpc->hsk->homa, 0); homa_validate_incoming(rpc->hsk->homa, 1, &dummy); diff --git a/homa_wire.h b/homa_wire.h index 13e5d708..d8032490 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -333,7 +333,7 @@ struct grant_header { __u8 resend_all; } __packed; _Static_assert(sizeof(struct grant_header) <= HOMA_MAX_HEADER, - "grant_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); + "grant_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** * struct resend_header - Wire format for RESEND packets. @@ -427,7 +427,7 @@ struct cutoffs_header { __be16 cutoff_version; } __packed; _Static_assert(sizeof(struct cutoffs_header) <= HOMA_MAX_HEADER, - "cutoffs_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); + "cutoffs_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** * struct freeze_header - Wire format for FREEZE packets. @@ -440,7 +440,7 @@ struct freeze_header { struct common_header common; } __packed; _Static_assert(sizeof(struct freeze_header) <= HOMA_MAX_HEADER, - "freeze_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); + "freeze_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** * struct need_ack_header - Wire format for NEED_ACK packets. 
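The `_Static_assert` reflows above preserve a pattern worth noting: each Homa wire header is declared `__packed` and then checked at compile time against HOMA_MAX_HEADER, so a field addition that overflows the fixed header buffer fails the build instead of corrupting packets. Below is a minimal sketch of that idiom; the struct, fields, and EXAMPLE_MAX_HEADER budget are invented for illustration (only `__be16`/`__be64`/`__u8`, `__packed`, and `_Static_assert` are real facilities), so this is not Homa's actual header definition.

    #include <linux/types.h>

    /* Illustrative only: a made-up header and size budget. Homa's real
     * check compares against HOMA_MAX_HEADER.
     */
    #define EXAMPLE_MAX_HEADER 64

    struct example_header {
            __be16 sport;       /* Sender's port, big-endian on the wire. */
            __be16 dport;       /* Destination port. */
            __be64 sender_id;   /* Without __packed, 4 padding bytes would
                                 * be inserted before this 8-byte field,
                                 * silently changing the wire layout.
                                 */
            __u8 type;          /* Packet type code. */
    } __packed;

    _Static_assert(sizeof(struct example_header) <= EXAMPLE_MAX_HEADER,
                   "example_header too large for EXAMPLE_MAX_HEADER");

Packing plus a compile-time size assert is cheap insurance for any struct that is copied directly onto the wire: layout drift becomes a build error rather than a protocol incompatibility.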
diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 9e27e390..4a789da6 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -2535,9 +2535,9 @@ TEST_F(homa_incoming, homa_choose_interest__all_cores_busy) struct homa_interest *result = homa_choose_interest(&self->homa, &self->hsk.request_interests, offsetof(struct homa_interest, request_links)); + INIT_LIST_HEAD(&self->hsk.request_interests); ASSERT_NE(NULL, result); EXPECT_EQ(1, result->core); - INIT_LIST_HEAD(&self->hsk.request_interests); } TEST_F(homa_incoming, homa_rpc_handoff__handoff_already_in_progress) diff --git a/timetrace.c b/timetrace.c index 3fb8eb28..721b3746 100644 --- a/timetrace.c +++ b/timetrace.c @@ -2,12 +2,17 @@ #include "homa_impl.h" +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" +#include +#pragma GCC diagnostic pop + #ifndef __UNIT_TEST__ /* Uncomment the line below if the main Linux kernel has been compiled with * timetrace stubs; we will then connect the timetrace mechanism here with * those stubs to allow the rest of the kernel to log in our buffers. */ -//#define TT_KERNEL 1 +#define TT_KERNEL 1 #endif /* __UNIT_TEST__ */ #ifdef TT_KERNEL extern struct tt_buffer *tt_linux_buffers[]; @@ -18,8 +23,8 @@ extern int *tt_linux_homa_temp; extern int tt_linux_homa_temp_default[16]; extern void (*tt_linux_inc_metrics)(int metric, __u64 count); extern void (*tt_linux_record)(struct tt_buffer *buffer, __u64 timestamp, - const char *format, __u32 arg0, __u32 arg1, __u32 arg2, - __u32 arg3); + const char *format, __u32 arg0, __u32 arg1, + __u32 arg2, __u32 arg3); extern void tt_linux_skip_metrics(int metric, __u64 count); extern void (*tt_linux_printk)(void); extern void (*tt_linux_dbg1)(char *msg, ...); @@ -29,8 +34,8 @@ extern void tt_linux_nop(void); extern void homa_trace(__u64 u0, __u64 u1, int i0, int i1); extern void ltt_record_nop(struct tt_buffer *buffer, __u64 timestamp, - const char *format, __u32 arg0, __u32 arg1, - __u32 arg2, __u32 arg3); + const char *format, __u32 arg0, __u32 arg1, + __u32 arg2, __u32 arg3); #endif void tt_inc_metric(int metric, __u64 count); @@ -145,6 +150,7 @@ int tt_init(char *proc_file, int *temp) tt_linux_dbg1 = tt_dbg1; tt_linux_dbg2 = tt_dbg2; tt_linux_dbg3 = tt_dbg3; + memset(tt_debug_int64, 0, sizeof(tt_debug_int64)); if (temp) tt_linux_homa_temp = temp; #endif @@ -841,7 +847,7 @@ void tt_inc_metric(int metric, __u64 count) offsetof(struct homa_metrics, linux_softirq_ns), offsetof(struct homa_metrics, linux_pkt_alloc_bytes), }; - __u64 *metric_addr = (__u64 *)(((char *) homa_metrics_per_cpu()) + __u64 *metric_addr = (__u64 *)(((char *)homa_metrics_per_cpu()) + offsets[metric]); *metric_addr += count; } From f2ab0562229dcabd9cd7c5d5a6b8031994fa1e06 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 3 Dec 2024 09:22:24 -0800 Subject: [PATCH 076/625] Set ipv6_pinfo_offset field in struct proto --- homa_plumbing.c | 8 +++----- homa_sock.h | 9 +++++++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index 1799adfa..d30a1dc1 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -144,11 +144,9 @@ static struct proto homav6_prot = { .sysctl_wmem = &sysctl_homa_wmem_min, .sysctl_rmem = &sysctl_homa_rmem_min, - /* IPv6 data comes *after* Homa's data, and isn't included in - * struct homa_sock. 
- */ - .obj_size = sizeof(struct homa_sock) + - sizeof(struct ipv6_pinfo), + .obj_size = sizeof(struct homa_v6_sock), + .ipv6_pinfo_offset = offsetof(struct homa_v6_sock, inet6), + .no_autobind = 1, }; diff --git a/homa_sock.h b/homa_sock.h index d6da08f9..b2e11e3e 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -264,6 +264,15 @@ struct homa_sock { struct homa_pool *buffer_pool; }; +/** + * struct homa_v6_sock - For IPv6, additional IPv6-specific information + * is present in the socket struct after Homa-specific information. + */ +struct homa_v6_sock { + struct homa_sock homa; + struct ipv6_pinfo inet6; +}; + void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, __u64 id); int homa_sock_bind(struct homa_socktab *socktab, From 995e0af5b5b05d15a7003d3f30fa3ff76ffc12eb Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 3 Dec 2024 10:04:09 -0800 Subject: [PATCH 077/625] Rename global homa variable to global_homa (Avoid confusion over name) --- homa_impl.h | 2 ++ homa_metrics.c | 10 +++++----- homa_offload.c | 14 ++++++++------ homa_offload.h | 4 ++-- homa_plumbing.c | 26 ++++++++++++++++++-------- test/unit_homa_metrics.c | 6 ++---- test/unit_homa_offload.c | 22 ++++++++++------------ test/unit_homa_plumbing.c | 6 ++---- 8 files changed, 49 insertions(+), 41 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 728d6329..668c13e1 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -1073,6 +1073,8 @@ void unit_hook(char *id); #endif /* __UNIT_TEST__ */ #endif /* See strip.py */ +extern struct homa *global_homa; + void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, int port, int error); void homa_abort_sock_rpcs(struct homa_sock *hsk, int error); diff --git a/homa_metrics.c b/homa_metrics.c index 853898bc..0800d484 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -8,11 +8,6 @@ DEFINE_PER_CPU(struct homa_metrics, homa_metrics); -/* For functions that are invoked directly by Linux, so they can't be - * passed a struct homa arguments. - */ -extern struct homa *homa; - /** * homa_metric_append() - Formats a new metric and appends it to homa->metrics. * @homa: The new data will appended to the @metrics field of @@ -351,6 +346,8 @@ char *homa_metrics_print(struct homa *homa) */ int homa_metrics_open(struct inode *inode, struct file *file) { + struct homa *homa = global_homa; + /* Collect all of the metrics when the file is opened, and save * these for use by subsequent reads (don't want the metrics to * change between reads). If there are concurrent opens on the @@ -381,6 +378,7 @@ int homa_metrics_open(struct inode *inode, struct file *file) ssize_t homa_metrics_read(struct file *file, char __user *buffer, size_t length, loff_t *offset) { + struct homa *homa = global_homa; size_t copied; if (*offset >= homa->metrics_length) @@ -417,6 +415,8 @@ loff_t homa_metrics_lseek(struct file *file, loff_t offset, int whence) */ int homa_metrics_release(struct inode *inode, struct file *file) { + struct homa *homa = global_homa; + spin_lock(&homa->metrics_lock); homa->metrics_active_opens--; spin_unlock(&homa->metrics_lock); diff --git a/homa_offload.c b/homa_offload.c index d9db3186..e4d7ea64 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -19,8 +19,6 @@ static const struct net_offload homa_offload = { }, }; -extern struct homa *homa; - /* Pointers to TCP's net_offload structures. NULL means homa_gro_hook_tcp * hasn't been called yet. 
*/ @@ -284,6 +282,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, */ __u64 saved_softirq_metric, softirq_ns; struct homa_offload_core *offload_core; + struct homa *homa = global_homa; struct sk_buff *result = NULL; __u64 *softirq_ns_metric; struct data_header *h_new; @@ -461,11 +460,12 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, * homa_gro_gen2() - When the Gen2 load balancer is being used this function * is invoked by homa_gro_complete to choose a core to handle SoftIRQ for a * batch of packets + * @homa: Overall information about the Homa transport. * @skb: First in a group of packets that are ready to be passed to SoftIRQ. * Information will be updated in the packet so that Linux will * direct it to the chosen core. */ -void homa_gro_gen2(struct sk_buff *skb) +void homa_gro_gen2(struct homa *homa, struct sk_buff *skb) { /* Scan the next several cores in order after the current core, * trying to find one that is not already busy with SoftIRQ processing, @@ -520,11 +520,12 @@ void homa_gro_gen2(struct sk_buff *skb) * homa_gro_gen3() - When the Gen3 load balancer is being used this function * is invoked by homa_gro_complete to choose a core to handle SoftIRQ for a * batch of packets + * @homa: Overall information about the Homa transport. * @skb: First in a group of packets that are ready to be passed to SoftIRQ. * Information will be updated in the packet so that Linux will * direct it to the chosen core. */ -void homa_gro_gen3(struct sk_buff *skb) +void homa_gro_gen3(struct homa *homa, struct sk_buff *skb) { /* See balance.txt for overall design information on the Gen3 * load balancer. @@ -575,6 +576,7 @@ void homa_gro_gen3(struct sk_buff *skb) int homa_gro_complete(struct sk_buff *skb, int hoffset) { struct data_header *h = (struct data_header *)skb_transport_header(skb); + struct homa *homa = global_homa; // tt_record4("homa_gro_complete type %d, id %d, offset %d, count %d", // h->common.type, homa_local_id(h->common.sender_id), @@ -583,9 +585,9 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset) per_cpu(homa_offload_core, raw_smp_processor_id()).held_skb = NULL; if (homa->gro_policy & HOMA_GRO_GEN3) { - homa_gro_gen3(skb); + homa_gro_gen3(homa, skb); } else if (homa->gro_policy & HOMA_GRO_GEN2) { - homa_gro_gen2(skb); + homa_gro_gen2(homa, skb); } else if (homa->gro_policy & HOMA_GRO_IDLE) { int i, core, best; __u64 best_time = ~0; diff --git a/homa_offload.h b/homa_offload.h index 21628914..cdc7b795 100644 --- a/homa_offload.h +++ b/homa_offload.h @@ -74,8 +74,8 @@ struct homa_offload_core { DECLARE_PER_CPU(struct homa_offload_core, homa_offload_core); int homa_gro_complete(struct sk_buff *skb, int thoff); -void homa_gro_gen2(struct sk_buff *skb); -void homa_gro_gen3(struct sk_buff *skb); +void homa_gro_gen2(struct homa *homa, struct sk_buff *skb); +void homa_gro_gen3(struct homa *homa, struct sk_buff *skb); void homa_gro_hook_tcp(void); void homa_gro_unhook_tcp(void); struct sk_buff *homa_gro_receive(struct list_head *gro_list, diff --git a/homa_plumbing.c b/homa_plumbing.c index d30a1dc1..049959e6 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -21,16 +21,18 @@ static long sysctl_homa_mem[3] __read_mostly; static int sysctl_homa_rmem_min __read_mostly; static int sysctl_homa_wmem_min __read_mostly; -/* Global data for Homa. Never reference homa_data directory. Always use - * the homa variable instead; this allows overriding during unit tests. +/* Global data for Homa. Never reference homa_data directly. 
Always use + * the global_homa variable instead; this allows overriding during unit tests. */ static struct homa homa_data; -/* This variable should almost never be used directly; it is normally - * passed as a parameter to functions that need it. Thus it is not declared - * in a header file. +/* This variable contains the address of the statically-allocated struct homa + * used throughout Homa. This variable should almost never be used directly: + * it should be passed as a parameter to functions that need it. This + * variable is used only by functions called from Linux (so they can't pass + * in a pointer). */ -struct homa *homa = &homa_data; +struct homa *global_homa = &homa_data; /* True means that the Homa module is in the process of unloading itself, * so everyone should clean up. @@ -523,6 +525,7 @@ static DECLARE_COMPLETION(timer_thread_done); */ static int __init homa_load(void) { + struct homa *homa = global_homa; int status; pr_notice("Homa module loading\n"); @@ -632,6 +635,8 @@ static int __init homa_load(void) */ static void __exit homa_unload(void) { + struct homa *homa = global_homa; + pr_notice("Homa module unloading\n"); exiting = true; @@ -671,8 +676,8 @@ module_exit(homa_unload); */ int homa_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { - struct homa_sock *hsk = homa_sk(sock->sk); union sockaddr_in_union *addr_in = (union sockaddr_in_union *)addr; + struct homa_sock *hsk = homa_sk(sock->sk); int port = 0; if (unlikely(addr->sa_family != sock->sk->sk_family)) @@ -686,7 +691,7 @@ int homa_bind(struct socket *sock, struct sockaddr *addr, int addr_len) return -EINVAL; port = ntohs(addr_in->in6.sin6_port); } - return homa_sock_bind(homa->port_map, hsk, port); + return homa_sock_bind(hsk->homa->port_map, hsk, port); } /** @@ -814,6 +819,7 @@ int homa_ioctl(struct sock *sk, int cmd, int *karg) int homa_socket(struct sock *sk) { struct homa_sock *hsk = homa_sk(sk); + struct homa *homa = global_homa; homa_sock_init(hsk, homa); return 0; @@ -1194,6 +1200,7 @@ int homa_softirq(struct sk_buff *skb) { struct sk_buff *packets, *other_pkts, *next; struct sk_buff **prev_link, **other_link; + struct homa *homa = global_homa; struct common_header *h; int first_packet = 1; int header_offset; @@ -1371,6 +1378,7 @@ int homa_err_handler_v4(struct sk_buff *skb, u32 info) { const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); const struct iphdr *iph = ip_hdr(skb); + struct homa *homa = global_homa; int type = icmp_hdr(skb)->type; int code = icmp_hdr(skb)->code; @@ -1415,6 +1423,7 @@ int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; + struct homa *homa = global_homa; if (type == ICMPV6_DEST_UNREACH && code == ICMPV6_PORT_UNREACH) { char *icmp = (char *)icmp_hdr(skb); @@ -1493,6 +1502,7 @@ int homa_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) #endif { + struct homa *homa = global_homa; int result; result = proc_dointvec(table, write, buffer, lenp, ppos); diff --git a/test/unit_homa_metrics.c b/test/unit_homa_metrics.c index e4a7df0a..0c02f42a 100644 --- a/test/unit_homa_metrics.c +++ b/test/unit_homa_metrics.c @@ -7,19 +7,17 @@ #include "mock.h" #include "utils.h" -extern struct homa *homa; - FIXTURE(homa_metrics) { struct homa homa; }; FIXTURE_SETUP(homa_metrics) { homa_init(&self->homa); - homa = &self->homa; + global_homa = &self->homa; } FIXTURE_TEARDOWN(homa_metrics) { - homa = NULL; + 
global_homa = NULL; homa_destroy(&self->homa); unit_teardown(); } diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index b8897825..921e7096 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -11,8 +11,6 @@ #define cur_offload_core (&per_cpu(homa_offload_core, raw_smp_processor_id())) -extern struct homa *homa; - static struct sk_buff *tcp_gro_receive(struct list_head *held_list, struct sk_buff *skb) { @@ -44,7 +42,7 @@ FIXTURE_SETUP(homa_offload) homa_init(&self->homa); self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; - homa = &self->homa; + global_homa = &self->homa; mock_sock_init(&self->hsk, &self->homa, 99); self->ip = unit_get_in_addr("196.168.0.1"); self->header = (struct data_header){.common = { @@ -101,7 +99,7 @@ FIXTURE_TEARDOWN(homa_offload) list_for_each_entry_safe(skb, tmp, &self->napi.gro_hash[2].list, list) kfree_skb(skb); homa_destroy(&self->homa); - homa = NULL; + global_homa = NULL; unit_teardown(); } @@ -479,7 +477,7 @@ TEST_F(homa_offload, homa_gro_receive__max_gro_skbs) struct sk_buff *skb; // First packet: fits below the limit. - homa->max_gro_skbs = 3; + self->homa.max_gro_skbs = 3; cur_offload_core->held_skb = self->skb2; cur_offload_core->held_bucket = 2; self->header.seg.offset = htonl(6000); @@ -504,7 +502,7 @@ TEST_F(homa_offload, homa_gro_receive__max_gro_skbs) // Third packet also hits the limit for skb, causing the bucket // to become empty. - homa->max_gro_skbs = 2; + self->homa.max_gro_skbs = 2; cur_offload_core->held_skb = self->skb; skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); unit_log_clear(); @@ -520,9 +518,9 @@ TEST_F(homa_offload, homa_gro_receive__max_gro_skbs) TEST_F(homa_offload, homa_gro_gen2) { - homa->gro_policy = HOMA_GRO_GEN2; + self->homa.gro_policy = HOMA_GRO_GEN2; mock_ns = 1000; - homa->busy_ns = 100; + self->homa.busy_ns = 100; mock_set_core(5); atomic_set(&per_cpu(homa_offload_core, 6).softirq_backlog, 1); per_cpu(homa_offload_core, 6).last_gro = 0; @@ -562,7 +560,7 @@ TEST_F(homa_offload, homa_gro_gen3__basics) struct homa_offload_core *offload5 = &per_cpu(homa_offload_core, 5); struct homa_offload_core *offload7 = &per_cpu(homa_offload_core, 7); - homa->gro_policy = HOMA_GRO_GEN3; + self->homa.gro_policy = HOMA_GRO_GEN3; offload_core->gen3_softirq_cores[0] = 3; offload_core->gen3_softirq_cores[1] = 7; offload_core->gen3_softirq_cores[2] = 5; @@ -581,7 +579,7 @@ TEST_F(homa_offload, homa_gro_gen3__stop_on_negative_core_id) { struct homa_offload_core *offload_core = cur_offload_core; - homa->gro_policy = HOMA_GRO_GEN3; + self->homa.gro_policy = HOMA_GRO_GEN3; offload_core->gen3_softirq_cores[0] = 3; offload_core->gen3_softirq_cores[1] = -1; offload_core->gen3_softirq_cores[2] = 5; @@ -598,7 +596,7 @@ TEST_F(homa_offload, homa_gro_gen3__all_cores_busy_so_pick_first) { struct homa_offload_core *offload_core = cur_offload_core; - homa->gro_policy = HOMA_GRO_GEN3; + self->homa.gro_policy = HOMA_GRO_GEN3; offload_core->gen3_softirq_cores[0] = 3; offload_core->gen3_softirq_cores[1] = 7; offload_core->gen3_softirq_cores[2] = 5; @@ -625,7 +623,7 @@ TEST_F(homa_offload, homa_gro_complete__clear_held_skb) } TEST_F(homa_offload, homa_gro_complete__GRO_IDLE) { - homa->gro_policy = HOMA_GRO_IDLE; + self->homa.gro_policy = HOMA_GRO_IDLE; per_cpu(homa_offload_core, 6).last_active = 30; per_cpu(homa_offload_core, 7).last_active = 25; per_cpu(homa_offload_core, 0).last_active = 20; diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 8cb51836..fb748da9 100644 --- 
a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -9,8 +9,6 @@ #include "mock.h" #include "utils.h" -extern struct homa *homa; - /* The following hook function frees hook_rpc. */ static struct homa_rpc *hook_rpc; static void unlock_hook(char *id) @@ -57,7 +55,7 @@ FIXTURE_SETUP(homa_plumbing) self->client_addr.in6.sin6_port = htons(self->client_port); self->server_addr.in6.sin6_addr = self->server_ip[0]; self->server_addr.in6.sin6_port = htons(self->server_port); - homa = &self->homa; + global_homa = &self->homa; homa_init(&self->homa); mock_sock_init(&self->hsk, &self->homa, 0); self->client_addr.in6.sin6_family = self->hsk.inet.sk.sk_family; @@ -107,7 +105,7 @@ FIXTURE_TEARDOWN(homa_plumbing) { homa_destroy(&self->homa); unit_teardown(); - homa = NULL; + global_homa = NULL; } TEST_F(homa_plumbing, homa_bind__version_mismatch) From 1f2500e34a52cb9e48c9b464c28358caefd26718 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 3 Dec 2024 14:20:20 -0800 Subject: [PATCH 078/625] Check return value from inet6_register_protosw --- homa_impl.h | 2 ++ homa_plumbing.c | 11 ++++++++--- test/mock.c | 13 ++++++++++++- test/mock.h | 1 + test/unit_homa_plumbing.c | 14 ++++++++++++++ 5 files changed, 37 insertions(+), 4 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 668c13e1..b06148fb 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -1125,6 +1125,7 @@ int homa_init(struct homa *homa); void homa_incoming_sysctl_changed(struct homa *homa); int homa_ioc_abort(struct sock *sk, int *karg); int homa_ioctl(struct sock *sk, int cmd, int *karg); +int homa_load(void); void homa_log_throttled(struct homa *homa); int homa_message_in_init(struct homa_rpc *rpc, int length, int unsched); @@ -1184,6 +1185,7 @@ void homa_timer(struct homa *homa); int homa_timer_main(void *transport); void homa_unhash(struct sock *sk); void homa_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc); +void homa_unload(void); int homa_unsched_priority(struct homa *homa, struct homa_peer *peer, int length); int homa_validate_incoming(struct homa *homa, int verbose, diff --git a/homa_plumbing.c b/homa_plumbing.c index 049959e6..cb7cea1b 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -523,7 +523,7 @@ static DECLARE_COMPLETION(timer_thread_done); * homa_load() - invoked when this module is loaded into the Linux kernel * Return: 0 on success, otherwise a negative errno. */ -static int __init homa_load(void) +int __init homa_load(void) { struct homa *homa = global_homa; int status; @@ -559,7 +559,12 @@ static int __init homa_load(void) goto out; } inet_register_protosw(&homa_protosw); - inet6_register_protosw(&homav6_protosw); + status = inet6_register_protosw(&homav6_protosw); + if (status != 0) { + pr_err("inet6_register_protosw failed in %s: %d\n", __func__, + status); + goto out_cleanup; + } status = inet_add_protocol(&homa_protocol, IPPROTO_HOMA); if (status != 0) { pr_err("inet_add_protocol failed in %s: %d\n", __func__, @@ -633,7 +638,7 @@ static int __init homa_load(void) /** * homa_unload() - invoked when this module is unloaded from the Linux kernel. 
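 * (No longer static: together with the homa_load declaration added to
 * homa_impl.h above, this lets unit tests such as
 * homa_load__error_in_inet6_register_protosw call the load/unload pair
 * directly.)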
*/ -static void __exit homa_unload(void) +void __exit homa_unload(void) { struct homa *homa = global_homa; diff --git a/test/mock.c b/test/mock.c index dee45528..227328cd 100644 --- a/test/mock.c +++ b/test/mock.c @@ -44,6 +44,7 @@ int mock_ip6_xmit_errors; int mock_ip_queue_xmit_errors; int mock_kmalloc_errors; int mock_kthread_create_errors; +int mock_register_protosw_errors; int mock_route_errors; int mock_spin_lock_held; int mock_trylock_errors; @@ -183,6 +184,8 @@ struct net_device mock_net_device = { ._tx = &mock_net_queue}; const struct net_offload *inet_offloads[MAX_INET_PROTOS]; const struct net_offload *inet6_offloads[MAX_INET_PROTOS]; +struct net_offload tcp_offload; +struct net_offload tcp_v6_offload; static struct hrtimer_clock_base clock_base; unsigned int cpu_khz = 1000000; @@ -460,6 +463,8 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) int inet6_register_protosw(struct inet_protosw *p) { + if (mock_check_error(&mock_register_protosw_errors)) + return -EINVAL; return 0; } @@ -895,6 +900,8 @@ int proc_dointvec(const struct ctl_table *table, int write, void proc_remove(struct proc_dir_entry *de) { + if (!de) + return; if (!proc_files_in_use || unit_hash_get(proc_files_in_use, de) == NULL) { FAIL("%s on unknown dir_entry", __func__); @@ -1376,7 +1383,7 @@ void mock_rcu_read_unlock(void) struct ctl_table_header *mock_register_net_sysctl(struct net *net, const char *path, struct ctl_table *table) { - return NULL; + return (struct ctl_table_header *)11111; } /** @@ -1558,6 +1565,7 @@ void mock_teardown(void) mock_ip_queue_xmit_errors = 0; mock_kmalloc_errors = 0; mock_kthread_create_errors = 0; + mock_register_protosw_errors = 0; mock_copy_to_user_dont_copy = 0; mock_bpage_size = 0x10000; mock_bpage_shift = 16; @@ -1580,7 +1588,10 @@ void mock_teardown(void) mock_net_device.gso_max_size = 0; mock_net_device.gso_max_segs = 1000; memset(inet_offloads, 0, sizeof(inet_offloads)); + inet_offloads[IPPROTO_TCP] = (struct net_offload __rcu *) &tcp_offload; memset(inet6_offloads, 0, sizeof(inet6_offloads)); + inet6_offloads[IPPROTO_TCP] = (struct net_offload __rcu *) + &tcp_v6_offload; count = unit_hash_size(skbs_in_use); if (count > 0) diff --git a/test/mock.h b/test/mock.h index 96e9af01..9391ec88 100644 --- a/test/mock.h +++ b/test/mock.h @@ -22,6 +22,7 @@ extern bool mock_ipv6; extern bool mock_ipv6_default; extern int mock_kmalloc_errors; extern int mock_kthread_create_errors; +extern int mock_register_protosw_errors; extern char mock_xmit_prios[]; extern int mock_log_rcu_sched; extern int mock_max_grants; diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index fb748da9..b7e1709c 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -108,6 +108,20 @@ FIXTURE_TEARDOWN(homa_plumbing) global_homa = NULL; } +TEST_F(homa_plumbing, homa_load__error_in_inet6_register_protosw) +{ + homa_destroy(&self->homa); + + /* First attempt fails. */ + mock_register_protosw_errors = 1; + EXPECT_EQ(EINVAL, -homa_load()); + + /* Second attempt succeeds. 
*/ + EXPECT_EQ(0, -homa_load()); + + homa_unload(); +} + TEST_F(homa_plumbing, homa_bind__version_mismatch) { struct sockaddr addr = {}; From 777a3be542eb20ea97931a7d6d8b8659f4019a0f Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 3 Dec 2024 15:09:15 -0800 Subject: [PATCH 079/625] Check return value from kzalloc in homa_sock_init --- homa_plumbing.c | 7 +++++-- homa_sock.c | 16 +++++++++++----- homa_sock.h | 2 +- test/mock.c | 4 +++- test/unit_homa_plumbing.c | 15 +++++++++++++++ test/unit_homa_sock.c | 8 ++++++++ 6 files changed, 43 insertions(+), 9 deletions(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index cb7cea1b..9b26fd92 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -825,9 +825,12 @@ int homa_socket(struct sock *sk) { struct homa_sock *hsk = homa_sk(sk); struct homa *homa = global_homa; + int result; - homa_sock_init(hsk, homa); - return 0; + result = homa_sock_init(hsk, homa); + if (result != 0) + homa_sock_destroy(hsk); + return result; } /** diff --git a/homa_sock.c b/homa_sock.c index 10d1faa6..47b58f9f 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -125,11 +125,12 @@ void homa_socktab_end_scan(struct homa_socktab_scan *scan) * @hsk: Object to initialize. * @homa: Homa implementation that will manage the socket. * - * Return: always 0 (success). + * Return: 0 for success, otherwise a negative errno. */ -void homa_sock_init(struct homa_sock *hsk, struct homa *homa) +int homa_sock_init(struct homa_sock *hsk, struct homa *homa) { struct homa_socktab *socktab = homa->port_map; + int result = 0; int i; spin_lock_bh(&socktab->write_lock); @@ -178,9 +179,12 @@ void homa_sock_init(struct homa_sock *hsk, struct homa *homa) bucket->id = i + 1000000; } hsk->buffer_pool = kzalloc(sizeof(*hsk->buffer_pool), GFP_KERNEL); + if (!hsk->buffer_pool) + result = -ENOMEM; if (homa->hijack_tcp) hsk->sock.sk_protocol = IPPROTO_TCP; spin_unlock_bh(&socktab->write_lock); + return result; } /* @@ -270,9 +274,11 @@ void homa_sock_shutdown(struct homa_sock *hsk) #endif /* See strip.py */ } - homa_pool_destroy(hsk->buffer_pool); - kfree(hsk->buffer_pool); - hsk->buffer_pool = NULL; + if (hsk->buffer_pool) { + homa_pool_destroy(hsk->buffer_pool); + kfree(hsk->buffer_pool); + hsk->buffer_pool = NULL; + } } /** diff --git a/homa_sock.h b/homa_sock.h index b2e11e3e..06f34aab 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -279,7 +279,7 @@ int homa_sock_bind(struct homa_socktab *socktab, struct homa_sock *hsk, __u16 port); void homa_sock_destroy(struct homa_sock *hsk); struct homa_sock *homa_sock_find(struct homa_socktab *socktab, __u16 port); -void homa_sock_init(struct homa_sock *hsk, struct homa *homa); +int homa_sock_init(struct homa_sock *hsk, struct homa *homa); void homa_sock_shutdown(struct homa_sock *hsk); void homa_sock_unlink(struct homa_sock *hsk); int homa_socket(struct sock *sk); diff --git a/test/mock.c b/test/mock.c index 227328cd..ae83a97f 100644 --- a/test/mock.c +++ b/test/mock.c @@ -712,7 +712,7 @@ void kfree(const void *block) if (block == NULL) return; if (!kmallocs_in_use || unit_hash_get(kmallocs_in_use, block) == NULL) { - FAIL("%s on unknown block", __func__); + FAIL("%s on unknown block %p", __func__, block); return; } unit_hash_erase(kmallocs_in_use, block); @@ -760,6 +760,8 @@ void *mock_kmalloc(size_t size, gfp_t flags) FAIL("malloc failed"); return NULL; } + if (flags & __GFP_ZERO) + memset(block, 0, size); if (!kmallocs_in_use) kmallocs_in_use = unit_hash_new(); unit_hash_set(kmallocs_in_use, block, "used"); diff --git a/test/unit_homa_plumbing.c 
b/test/unit_homa_plumbing.c
index b7e1709c..56246c65 100644
--- a/test/unit_homa_plumbing.c
+++ b/test/unit_homa_plumbing.c
@@ -249,6 +249,21 @@ TEST_F(homa_plumbing, homa_ioc_abort__nonexistent_rpc)
 	EXPECT_EQ(EINVAL, -homa_ioc_abort(&self->hsk.inet.sk, (int *) &args));
 }
 
+TEST_F(homa_plumbing, homa_socket__success)
+{
+	struct homa_sock sock;
+
+	EXPECT_EQ(0, homa_socket(&sock.sock));
+	homa_sock_destroy(&sock);
+}
+TEST_F(homa_plumbing, homa_socket__homa_sock_init_failure)
+{
+	struct homa_sock sock;
+
+	mock_kmalloc_errors = 1;
+	EXPECT_EQ(ENOMEM, -homa_socket(&sock.sock));
+}
+
 TEST_F(homa_plumbing, homa_set_sock_opt__bad_level)
 {
 	EXPECT_EQ(EINVAL, -homa_setsockopt(&self->hsk.sock, 0, 0,
diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c
index 667c32df..fe02beda 100644
--- a/test/unit_homa_sock.c
+++ b/test/unit_homa_sock.c
@@ -140,6 +140,14 @@ TEST_F(homa_sock, homa_sock_init__ip_header_length)
 	homa_sock_destroy(&hsk_v4);
 	homa_sock_destroy(&hsk_v6);
 }
+TEST_F(homa_sock, homa_sock_init__kzalloc_failure)
+{
+	struct homa_sock sock;
+
+	mock_kmalloc_errors = 1;
+	EXPECT_EQ(ENOMEM, -homa_sock_init(&sock, &self->homa));
+	homa_sock_destroy(&sock);
+}
 TEST_F(homa_sock, homa_sock_init__hijack_tcp)
 {
 	struct homa_sock hijack, no_hijack;

From fd4b526bd4251633c7dbd64605be47c8a41b57a1 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Wed, 4 Dec 2024 10:00:34 -0800
Subject: [PATCH 080/625] Fix problem related to strip.py

---
 homa_offload.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/homa_offload.c b/homa_offload.c
index e4d7ea64..f8b8fa90 100644
--- a/homa_offload.c
+++ b/homa_offload.c
@@ -340,10 +340,12 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list,
 			INC_METRIC(gro_grant_bypasses, 1);
 			goto bypass;
 		}
+#if 1 /* See strip.py */
 	} else {
 		tt_record4("homa_gro_receive got packet from 0x%x id %llu, type 0x%x, priority %d",
 			   saddr, homa_local_id(h_new->common.sender_id),
 			   h_new->common.type, priority);
+#endif /* See strip.py */
 	}
 
 	/* The GRO mechanism tries to separate packets onto different

From 79f4b45c8ae52cbca90b889be10ffe6fdedf15fd Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Wed, 4 Dec 2024 11:00:56 -0800
Subject: [PATCH 081/625] Add MODULE_ALIAS_NET_PF_PROTO_TYPE to auto-load Homa module

---
 homa_plumbing.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/homa_plumbing.c b/homa_plumbing.c
index 9b26fd92..1592771f 100644
--- a/homa_plumbing.c
+++ b/homa_plumbing.c
@@ -9,13 +9,6 @@
 #include "homa_peer.h"
 #include "homa_pool.h"
-#ifndef __UNIT_TEST__
-MODULE_LICENSE("Dual MIT/GPL");
-#endif /* __UNIT_TEST__ */
-MODULE_AUTHOR("John Ousterhout");
-MODULE_DESCRIPTION("Homa transport protocol");
-MODULE_VERSION("0.01");
-
 /* Not yet sure what these variables are for */
 static long sysctl_homa_mem[3] __read_mostly;
 static int sysctl_homa_rmem_min __read_mostly;
 static int sysctl_homa_wmem_min __read_mostly;
@@ -1700,3 +1693,17 @@ int homa_timer_main(void *transport)
 	kthread_complete_and_exit(&timer_thread_done, 0);
 	return 0;
 }
+
+#ifndef __UNIT_TEST__
+MODULE_LICENSE("Dual BSD/GPL");
+#endif /* __UNIT_TEST__ */
+MODULE_AUTHOR("John Ousterhout ");
+MODULE_DESCRIPTION("Homa transport protocol");
+MODULE_VERSION("1.0");
+
+/* Arrange for this module to be loaded automatically when a Homa socket is
+ * opened. Apparently symbols don't work in the macros below, so must use
+ * numeric values for IPPROTO_HOMA (146) and SOCK_DGRAM(2).
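+ * A hypothetical userspace check of the auto-load path (illustrative
+ * only, not part of the module): on a machine where homa.ko is
+ * installed but not yet loaded,
+ *
+ *	int fd = socket(AF_INET, SOCK_DGRAM, 146);
+ *
+ * should now cause the kernel to request_module() through the aliases
+ * below and complete successfully without a prior insmod.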
+ */ +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 146, 2); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 146, 2); From bd3e20966198b72572d215fcb55af90209c03e10 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 4 Dec 2024 11:14:34 -0800 Subject: [PATCH 082/625] Clean up more carefully after errors in homa_load (Don't uninitialize things that were never initialized) --- homa_plumbing.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index 1592771f..821bf568 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -544,42 +544,42 @@ int __init homa_load(void) status = proto_register(&homa_prot, 1); if (status != 0) { pr_err("proto_register failed for homa_prot: %d\n", status); - goto out; + goto proto_register_err; } status = proto_register(&homav6_prot, 1); if (status != 0) { pr_err("proto_register failed for homav6_prot: %d\n", status); - goto out; + goto proto_register_v6_err; } inet_register_protosw(&homa_protosw); status = inet6_register_protosw(&homav6_protosw); if (status != 0) { pr_err("inet6_register_protosw failed in %s: %d\n", __func__, status); - goto out_cleanup; + goto register_protosw_v6_err; } status = inet_add_protocol(&homa_protocol, IPPROTO_HOMA); if (status != 0) { pr_err("inet_add_protocol failed in %s: %d\n", __func__, status); - goto out_cleanup; + goto add_protocol_err; } status = inet6_add_protocol(&homav6_protocol, IPPROTO_HOMA); if (status != 0) { pr_err("inet6_add_protocol failed in %s: %d\n", __func__, status); - goto out_cleanup; + goto add_protocol_v6_err; } status = homa_init(homa); if (status) - goto out_cleanup; + goto homa_init_err; metrics_dir_entry = proc_create("homa_metrics", 0444, init_net.proc_net, &homa_metrics_pops); if (!metrics_dir_entry) { pr_err("couldn't create /proc/net/homa_metrics\n"); status = -ENOMEM; - goto out_cleanup; + goto metrics_err; } homa_ctl_header = register_net_sysctl(&init_net, "net/homa", @@ -587,13 +587,13 @@ int __init homa_load(void) if (!homa_ctl_header) { pr_err("couldn't register Homa sysctl parameters\n"); status = -ENOMEM; - goto out_cleanup; + goto sysctl_err; } status = homa_offload_init(); if (status != 0) { pr_err("Homa couldn't init offloads\n"); - goto out_cleanup; + goto offload_err; } timer_kthread = kthread_run(homa_timer_main, homa, "homa_timer"); @@ -602,7 +602,7 @@ int __init homa_load(void) pr_err("couldn't create homa pacer thread: error %d\n", status); timer_kthread = NULL; - goto out_cleanup; + goto timer_err; } homa_gro_hook_tcp(); @@ -612,19 +612,26 @@ int __init homa_load(void) return 0; -out_cleanup: - homa_gro_unhook_tcp(); +timer_err: homa_offload_end(); +offload_err: unregister_net_sysctl_table(homa_ctl_header); +sysctl_err: proc_remove(metrics_dir_entry); +metrics_err: homa_destroy(homa); - inet_del_protocol(&homa_protocol, IPPROTO_HOMA); - inet_unregister_protosw(&homa_protosw); +homa_init_err: inet6_del_protocol(&homav6_protocol, IPPROTO_HOMA); +add_protocol_v6_err: + inet_del_protocol(&homa_protocol, IPPROTO_HOMA); +add_protocol_err: inet6_unregister_protosw(&homav6_protosw); - proto_unregister(&homa_prot); +register_protosw_v6_err: + inet_unregister_protosw(&homa_protosw); proto_unregister(&homav6_prot); -out: +proto_register_v6_err: + proto_unregister(&homa_prot); +proto_register_err: return status; } From 9419c4220ff1ee1f5d9c34534a9c86762b0c1e1d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 4 Dec 2024 21:43:05 -0800 Subject: [PATCH 083/625] Patch around problem formatting man pages For 
some reason, piping through ps2pdf generates blank output for some man pages, whereas running ps2pdf as a separate command seems to work. --- man/Makefile | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/man/Makefile b/man/Makefile index aa829ec2..8986d89b 100644 --- a/man/Makefile +++ b/man/Makefile @@ -15,14 +15,24 @@ all: $(PDFS) clean: rm -f *.pdf +# Note: in the rules below, it doesn't seem to work to eliminate the +# temporary file and use ps2pdf in a pipeline; as of 12/2024, under +# Cygwin, this produces blank output for some man pages under some +# conditions. %.pdf: %.2 - pdfroff -man $< > $@ + groff -man -Tps $< > tmp.ps + ps2pdf tmp.ps $@ + rm tmp.ps %.pdf: %.3 - pdfroff -man $< > $@ + groff -man -Tps $< > tmp.ps + ps2pdf tmp.ps $@ + rm tmp.ps %.pdf: %.7 - pdfroff -man $< > $@ + groff -man -Tps $< > tmp.ps + ps2pdf tmp.ps $@ + rm tmp.ps # The following target is useful for debugging Makefiles; it # prints the value of a make variable. From 3256ceeb48383579296948a85928ffbebcf13e0e Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 5 Dec 2024 08:59:48 -0800 Subject: [PATCH 084/625] Changes to setsockopt/getsockopt * Rename SO_HOMA_SET_BUF to SO_HOMA_RCVBUF * Rename struct homa_set_buf_args to struct homa_rcvbuf_args * Implement getsockopt for SO_HOMA_RCVBUF * Return ENOPROTOOPT instead of EINVAL for level and optname mismatches --- homa.h | 8 +-- homa_impl.h | 2 +- homa_metrics.c | 4 +- homa_plumbing.c | 41 ++++++++++---- homa_pool.c | 21 ++++++- homa_pool.h | 2 + homa_receiver.cc | 2 +- man/homa.7 | 6 +- test/unit_homa_plumbing.c | 116 +++++++++++++++++++++++++++++++------- test/unit_homa_pool.c | 11 +++- util/cp_node.cc | 12 ++-- util/homa_test.cc | 12 ++-- util/server.cc | 6 +- 13 files changed, 182 insertions(+), 61 deletions(-) diff --git a/homa.h b/homa.h index 4a17f8ea..8391141b 100644 --- a/homa.h +++ b/homa.h @@ -158,11 +158,11 @@ _Static_assert(sizeof(struct homa_abort_args) >= 32, "homa_abort_args shrunk"); _Static_assert(sizeof(struct homa_abort_args) <= 32, "homa_abort_args grew"); #endif -/** define SO_HOMA_SET_BUF: setsockopt option for specifying buffer region. */ -#define SO_HOMA_SET_BUF 10 +/** define SO_HOMA_RCVBUF: setsockopt option for specifying buffer region. */ +#define SO_HOMA_RCVBUF 10 -/** struct homa_set_buf - setsockopt argument for SO_HOMA_SET_BUF. */ -struct homa_set_buf_args { +/** struct homa_rcvbuf_args - setsockopt argument for SO_HOMA_RCVBUF. */ +struct homa_rcvbuf_args { /** @start: First byte of buffer region. 
*/ void *start; diff --git a/homa_impl.h b/homa_impl.h index b06148fb..83bce282 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -1118,7 +1118,7 @@ struct homa_gap *homa_gap_new(struct list_head *next, int start, int end); void homa_gap_retry(struct homa_rpc *rpc); int homa_get_port(struct sock *sk, unsigned short snum); int homa_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *option); + char __user *optval, int __user *optlen); int homa_hash(struct sock *sk); enum hrtimer_restart homa_hrtimer(struct hrtimer *timer); int homa_init(struct homa *homa); diff --git a/homa_metrics.c b/homa_metrics.c index 0800d484..95b20c83 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -191,9 +191,9 @@ char *homa_metrics_print(struct homa *homa) m->reply_ns); M("abort_calls %15llu Total invocations of abort kernel call\n", m->reply_calls); - M("so_set_buf_ns %15llu Time spent in setsockopt SO_HOMA_SET_BUF\n", + M("so_set_buf_ns %15llu Time spent in setsockopt SO_HOMA_RCVBUF\n", m->so_set_buf_ns); - M("so_set_buf_calls %15llu Total invocations of setsockopt SO_HOMA_SET_BUF\n", + M("so_set_buf_calls %15llu Total invocations of setsockopt SO_HOMA_RCVBUF\n", m->so_set_buf_calls); M("grantable_lock_ns %15llu Time spent with homa->grantable_lock locked\n", m->grantable_lock_ns); diff --git a/homa_plumbing.c b/homa_plumbing.c index 821bf568..dcb5fa4d 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -847,12 +847,13 @@ int homa_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct homa_sock *hsk = homa_sk(sk); - struct homa_set_buf_args args; + struct homa_rcvbuf_args args; __u64 start = sched_clock(); int ret; - if (level != IPPROTO_HOMA || optname != SO_HOMA_SET_BUF || - optlen != sizeof(struct homa_set_buf_args)) + if (level != IPPROTO_HOMA || optname != SO_HOMA_RCVBUF) + return -ENOPROTOOPT; + if (optlen != sizeof(struct homa_rcvbuf_args)) return -EINVAL; if (copy_from_sockptr(&args, optval, optlen)) @@ -865,7 +866,7 @@ int homa_setsockopt(struct sock *sk, int level, int optname, sizeof(args))) return -EFAULT; - homa_sock_lock(hsk, "homa_setsockopt SO_HOMA_SET_BUF"); + homa_sock_lock(hsk, "homa_setsockopt SO_HOMA_RCV_BUF"); ret = homa_pool_init(hsk, (__force void __user *)args.start, args.length); homa_sock_unlock(hsk); @@ -877,18 +878,38 @@ int homa_setsockopt(struct sock *sk, int level, int optname, /** * homa_getsockopt() - Implements the getsockopt system call for Homa sockets. * @sk: Socket on which the system call was invoked. - * @level: ?? + * @level: Selects level in the network stack to handle the request; + * must be IPPROTO_HOMA. * @optname: Identifies a particular setsockopt operation. * @optval: Address in user space where the option's value should be stored. - * @option: ??. + * @optlen: Number of bytes available at optval; will be overwritten with + * actual number of bytes stored. * Return: 0 on success, otherwise a negative errno. 
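 *
 * A hypothetical userspace caller (illustrative only, not part of this
 * patch) would read back the buffer region registered earlier with
 * SO_HOMA_RCVBUF roughly like this:
 *
 *	struct homa_rcvbuf_args val;
 *	socklen_t len = sizeof(val);
 *
 *	if (getsockopt(fd, IPPROTO_HOMA, SO_HOMA_RCVBUF, &val, &len) == 0)
 *		printf("region %p, %zu bytes\n", val.start, val.length);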
*/ int homa_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *option) + char __user *optval, int __user *optlen) { - pr_warn("unimplemented getsockopt invoked on Homa socket: level %d, optname %d\n", - level, optname); - return -EINVAL; + struct homa_sock *hsk = homa_sk(sk); + struct homa_rcvbuf_args val; + int len; + + if (copy_from_sockptr(&len, USER_SOCKPTR(optlen), sizeof(int))) + return -EFAULT; + + if (level != IPPROTO_HOMA || optname != SO_HOMA_RCVBUF) + return -ENOPROTOOPT; + if (len < sizeof(val)) + return -EINVAL; + + homa_pool_get_rcvbuf(hsk, &val); + len = sizeof(val); + + if (copy_to_sockptr(USER_SOCKPTR(optlen), &len, sizeof(int))) + return -EFAULT; + + if (copy_to_sockptr(USER_SOCKPTR(optval), &val, len)) + return -EFAULT; + return 0; } /** diff --git a/homa_pool.c b/homa_pool.c index 84407149..d3460a31 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -40,8 +40,8 @@ static void set_bpages_needed(struct homa_pool *pool) } /** - * homa_pool_init() - Initialize a homa_pool; any previous contents of the - * objects are overwritten. + * homa_pool_init() - Initialize a homa_pool; any previous contents are + * destroyed. * @hsk: Socket containing the pool to initialize. * @region: First byte of the memory region for the pool, allocated * by the application; must be page-aligned. @@ -54,6 +54,8 @@ int homa_pool_init(struct homa_sock *hsk, void __user *region, struct homa_pool *pool = hsk->buffer_pool; int i, result; + homa_pool_destroy(hsk->buffer_pool); + if (((uintptr_t)region) & ~PAGE_MASK) return -EINVAL; pool->hsk = hsk; @@ -121,6 +123,21 @@ void homa_pool_destroy(struct homa_pool *pool) pool->region = NULL; } +/** + * homa_pool_get_rcvbuf() - Return information needed to handle getsockopt + * for HOMA_SO_RCVBUF. + * @hsk: Socket on which getsockopt request was made. + * @args: Store info here. + */ +void homa_pool_get_rcvbuf(struct homa_sock *hsk, + struct homa_rcvbuf_args *args) +{ + homa_sock_lock(hsk, "homa_pool_get_rcvbuf"); + args->start = hsk->buffer_pool->region; + args->length = hsk->buffer_pool->num_bpages << HOMA_BPAGE_SHIFT; + homa_sock_unlock(hsk); +} + /** * homa_pool_get_pages() - Allocate one or more full pages from the pool. * @pool: Pool from which to allocate pages diff --git a/homa_pool.h b/homa_pool.h index 6f41ecbf..3d6dc48f 100644 --- a/homa_pool.h +++ b/homa_pool.h @@ -152,6 +152,8 @@ void __user *homa_pool_get_buffer(struct homa_rpc *rpc, int offset, int *available); int homa_pool_get_pages(struct homa_pool *pool, int num_pages, __u32 *pages, int leave_locked); +void homa_pool_get_rcvbuf(struct homa_sock *hsk, + struct homa_rcvbuf_args *args); int homa_pool_init(struct homa_sock *hsk, void *buf_region, __u64 region_size); void homa_pool_release_buffers(struct homa_pool *pool, diff --git a/homa_receiver.cc b/homa_receiver.cc index 8aeb8418..0f7fb0ac 100644 --- a/homa_receiver.cc +++ b/homa_receiver.cc @@ -10,7 +10,7 @@ * homa::receiver::homa() - Constructor for receivers. * @fd: Homa socket from which this object will receive incoming * messages. The caller is responsible for setting up buffering - * on the socket using setsockopt with the SO_HOMA_SET_BUF option. + * on the socket using setsockopt with the SO_HOMA_RCVBUF option. * The file descriptor must be valid for the lifetime of this * object. 
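 *		A typical setup (sketch only, mirroring the pattern used
 *		in util/homa_test.cc later in this series) is to mmap an
 *		anonymous region, describe it in a struct
 *		homa_rcvbuf_args, and register it once with
 *		setsockopt(fd, IPPROTO_HOMA, SO_HOMA_RCVBUF, ...) before
 *		the first message is received.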
* @buf_region: Location of the buffer region that was allocated for diff --git a/man/homa.7 b/man/homa.7 index 76adfe2a..5e181c74 100644 --- a/man/homa.7 +++ b/man/homa.7 @@ -1,4 +1,4 @@ -.TH HOMA 7 2024-7-26 "Homa" "Linux Programmer's Manual" +.TH HOMA 7 2024-12-4 "Homa" "Linux Programmer's Manual" .SH NAME homa \- Homa transport protocol .SH SYNOPSIS @@ -147,7 +147,7 @@ is complete. Buffering must be set up by invoking .B setsockopt with the -.BR SO_HOMA_SET_BUF +.BR SO_HOMA_RCVBUF option. This call should be made exactly once per socket, before the first call to .BR recvmsg . @@ -167,7 +167,7 @@ arguments must refer to a struct of the following type: .ps -1 .vs -2 .EX -struct homa_set_buf_args { +struct homa_rcvbuf_args { void *start; size_t length; }; diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 56246c65..fb95d3e5 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -264,42 +264,42 @@ TEST_F(homa_plumbing, homa_socket__homa_sock_init_failure) EXPECT_EQ(ENOMEM, -homa_socket(&sock.sock)); } -TEST_F(homa_plumbing, homa_set_sock_opt__bad_level) +TEST_F(homa_plumbing, homs_setsockopt__bad_level) { - EXPECT_EQ(EINVAL, -homa_setsockopt(&self->hsk.sock, 0, 0, - self->optval, sizeof(struct homa_set_buf_args))); + EXPECT_EQ(ENOPROTOOPT, -homa_setsockopt(&self->hsk.sock, 0, 0, + self->optval, sizeof(struct homa_rcvbuf_args))); } -TEST_F(homa_plumbing, homa_set_sock_opt__bad_optname) +TEST_F(homa_plumbing, homs_setsockopt__bad_optname) { - EXPECT_EQ(EINVAL, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, 0, - self->optval, sizeof(struct homa_set_buf_args))); + EXPECT_EQ(ENOPROTOOPT, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, 0, + self->optval, sizeof(struct homa_rcvbuf_args))); } -TEST_F(homa_plumbing, homa_set_sock_opt__bad_optlen) +TEST_F(homa_plumbing, homs_setsockopt__bad_optlen) { EXPECT_EQ(EINVAL, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, - SO_HOMA_SET_BUF, self->optval, - sizeof(struct homa_set_buf_args) - 1)); + SO_HOMA_RCVBUF, self->optval, + sizeof(struct homa_rcvbuf_args) - 1)); } -TEST_F(homa_plumbing, homa_set_sock_opt__copy_from_sockptr_fails) +TEST_F(homa_plumbing, homs_setsockopt__copy_from_sockptr_fails) { mock_copy_data_errors = 1; EXPECT_EQ(EFAULT, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, - SO_HOMA_SET_BUF, self->optval, - sizeof(struct homa_set_buf_args))); + SO_HOMA_RCVBUF, self->optval, + sizeof(struct homa_rcvbuf_args))); } -TEST_F(homa_plumbing, homa_set_sock_opt__copy_to_user_fails) +TEST_F(homa_plumbing, homa_setsockopt__copy_to_user_fails) { - struct homa_set_buf_args args = {(void *) 0x100000, 5*HOMA_BPAGE_SIZE}; + struct homa_rcvbuf_args args = {(void *) 0x100000, 5*HOMA_BPAGE_SIZE}; self->optval.user = &args; mock_copy_to_user_errors = 1; EXPECT_EQ(EFAULT, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, - SO_HOMA_SET_BUF, self->optval, - sizeof(struct homa_set_buf_args))); + SO_HOMA_RCVBUF, self->optval, + sizeof(struct homa_rcvbuf_args))); } -TEST_F(homa_plumbing, homa_set_sock_opt__success) +TEST_F(homa_plumbing, homa_setsockopt__success) { - struct homa_set_buf_args args; + struct homa_rcvbuf_args args; char buffer[5000]; args.start = (void *) (((__u64) (buffer + PAGE_SIZE - 1)) @@ -308,13 +308,89 @@ TEST_F(homa_plumbing, homa_set_sock_opt__success) self->optval.user = &args; homa_pool_destroy(self->hsk.buffer_pool); EXPECT_EQ(0, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, - SO_HOMA_SET_BUF, self->optval, - sizeof(struct homa_set_buf_args))); + SO_HOMA_RCVBUF, self->optval, + sizeof(struct 
homa_rcvbuf_args))); EXPECT_EQ(args.start, self->hsk.buffer_pool->region); EXPECT_EQ(64, self->hsk.buffer_pool->num_bpages); EXPECT_EQ(1, homa_metrics_per_cpu()->so_set_buf_calls); } + +TEST_F(homa_plumbing, homa_getsockopt__success) +{ + struct homa_rcvbuf_args val; + int size = sizeof32(val) + 10; + + EXPECT_EQ(0, -homa_pool_init(&self->hsk, (void *)0x40000, + 10*HOMA_BPAGE_SIZE + 1000)); + EXPECT_EQ(0, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_RCVBUF, (char *)&val, &size)); + EXPECT_EQ((void *)0x40000, val.start); + EXPECT_EQ(10*HOMA_BPAGE_SIZE, val.length); + EXPECT_EQ(sizeof32(val), size); +} +TEST_F(homa_plumbing, homa_getsockopt__cant_read_size) +{ + struct homa_rcvbuf_args val; + int size = sizeof32(val); + + mock_copy_data_errors = 1; + EXPECT_EQ(EFAULT, -homa_getsockopt(&self->hsk.sock, 0, SO_HOMA_RCVBUF, + (char *)&val, &size)); +} +TEST_F(homa_plumbing, homa_getsockopt__bad_level) +{ + struct homa_rcvbuf_args val; + int size = sizeof32(val); + + EXPECT_EQ(ENOPROTOOPT, -homa_getsockopt(&self->hsk.sock, 0, SO_HOMA_RCVBUF, + (char *)&val, &size)); +} +TEST_F(homa_plumbing, homa_getsockopt__bad_optname) +{ + struct homa_rcvbuf_args val; + int size = sizeof32(val); + + EXPECT_EQ(ENOPROTOOPT, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_RCVBUF-1, (char *)&val, &size)); +} +TEST_F(homa_plumbing, homa_getsockopt__bad_length) +{ + struct homa_rcvbuf_args val; + int size = sizeof32(val) - 1; + + EXPECT_EQ(EINVAL, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_RCVBUF, (char *)&val, &size)); +} +TEST_F(homa_plumbing, homa_getsockopt__cant_copy_out_size) +{ + struct homa_rcvbuf_args val = {.start = NULL, .length = 0}; + int size = sizeof32(val) + 10; + + EXPECT_EQ(0, -homa_pool_init(&self->hsk, (void *)0x40000, + 10*HOMA_BPAGE_SIZE + 1000)); + mock_copy_to_user_errors = 1; + + EXPECT_EQ(EFAULT, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_RCVBUF, (char *)&val, &size)); + EXPECT_EQ(NULL, val.start); + EXPECT_EQ(sizeof32(val) + 10, size); +} +TEST_F(homa_plumbing, homa_getsockopt__cant_copy_out_value) +{ + struct homa_rcvbuf_args val = {.start = NULL, .length = 0}; + int size = sizeof32(val) + 10; + + EXPECT_EQ(0, -homa_pool_init(&self->hsk, (void *)0x40000, + 10*HOMA_BPAGE_SIZE + 1000)); + mock_copy_to_user_errors = 2; + + EXPECT_EQ(EFAULT, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_RCVBUF, (char *)&val, &size)); + EXPECT_EQ(NULL, val.start); + EXPECT_EQ(sizeof32(val), size); +} + TEST_F(homa_plumbing, homa_sendmsg__args_not_in_user_space) { self->sendmsg_hdr.msg_control_is_user = 0; diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index 43109255..c79e977e 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -112,10 +112,15 @@ TEST_F(homa_pool, homa_pool_init__cant_allocate_core_info) 100*HOMA_BPAGE_SIZE)); } -TEST_F(homa_pool, homa_pool_destroy__idempotent) +TEST_F(homa_pool, homa_pool_get_rcvbuf) { - homa_pool_destroy(self->hsk.buffer_pool); - homa_pool_destroy(self->hsk.buffer_pool); + struct homa_rcvbuf_args args; + + EXPECT_EQ(0, -homa_pool_init(&self->hsk, (void *)0x40000, + 10*HOMA_BPAGE_SIZE + 1000)); + homa_pool_get_rcvbuf(&self->hsk, &args); + EXPECT_EQ(args.start, (void *)0x40000); + EXPECT_EQ(10*HOMA_BPAGE_SIZE, args.length); } TEST_F(homa_pool, homa_pool_get_pages__basics) diff --git a/util/cp_node.cc b/util/cp_node.cc index 4407fb11..bc594427 100644 --- a/util/cp_node.cc +++ b/util/cp_node.cc @@ -972,7 +972,7 @@ homa_server::homa_server(int port, int id, int inet_family, int num_threads, 
, threads() { sockaddr_in_union addr; - struct homa_set_buf_args arg; + struct homa_rcvbuf_args arg; if (std::find(experiments.begin(), experiments.end(), experiment) == experiments.end()) @@ -1012,10 +1012,10 @@ homa_server::homa_server(int port, int id, int inet_family, int num_threads, } arg.start = buf_region; arg.length = buf_size; - int status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_SET_BUF, &arg, + int status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_RCVBUF, &arg, sizeof(arg)); if (status < 0) { - printf("FATAL: error in setsockopt(SO_HOMA_SET_BUF): %s\n", + printf("FATAL: error in setsockopt(SO_HOMA_RCVBUF): %s\n", strerror(errno)); exit(1); } @@ -1946,7 +1946,7 @@ homa_client::homa_client(int id, std::string& experiment) , receiving_threads() , sending_thread() { - struct homa_set_buf_args arg; + struct homa_rcvbuf_args arg; fd = socket(inet_family, SOCK_DGRAM, IPPROTO_HOMA); if (fd < 0) { @@ -1963,10 +1963,10 @@ homa_client::homa_client(int id, std::string& experiment) } arg.start = buf_region; arg.length = buf_size; - int status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_SET_BUF, &arg, + int status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_RCVBUF, &arg, sizeof(arg)); if (status < 0) { - printf("FATAL: error in setsockopt(SO_HOMA_SET_BUF): %s\n", + printf("FATAL: error in setsockopt(SO_HOMA_RCVBUF): %s\n", strerror(errno)); exit(1); } diff --git a/util/homa_test.cc b/util/homa_test.cc index af9eb01f..6c0b69be 100644 --- a/util/homa_test.cc +++ b/util/homa_test.cc @@ -430,7 +430,7 @@ void test_set_buf(int fd) int status; char *region = (char *) mmap(NULL, 64*HOMA_BPAGE_SIZE, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); - struct homa_set_buf_args arg; + struct homa_rcvbuf_args arg; if (region == MAP_FAILED) { printf("Couldn't mmap buffer region: %s\n", strerror(errno)); @@ -439,10 +439,10 @@ void test_set_buf(int fd) arg.start = region; arg.length = 64*HOMA_BPAGE_SIZE; - status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_SET_BUF, &arg, + status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_RCVBUF, &arg, sizeof(arg)); if (status < 0) - printf("Error in setsockopt(SO_HOMA_SET_BUF): %s\n", + printf("Error in setsockopt(SO_HOMA_RCVBUF): %s\n", strerror(errno)); } @@ -925,13 +925,13 @@ int main(int argc, char** argv) printf("Couldn't mmap buffer region: %s\n", strerror(errno)); exit(1); } - struct homa_set_buf_args arg; + struct homa_rcvbuf_args arg; arg.start = buf_region; arg.length = 1000*HOMA_BPAGE_SIZE; - status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_SET_BUF, &arg, + status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_RCVBUF, &arg, sizeof(arg)); if (status < 0) { - printf("Error in setsockopt(SO_HOMA_SET_BUF): %s\n", + printf("Error in setsockopt(SO_HOMA_RCVBUF): %s\n", strerror(errno)); exit(1); } diff --git a/util/server.cc b/util/server.cc index fb034781..c4632a83 100644 --- a/util/server.cc +++ b/util/server.cc @@ -61,7 +61,7 @@ void homa_server(int port) int length; struct homa_recvmsg_args recv_args; struct msghdr hdr; - struct homa_set_buf_args arg; + struct homa_rcvbuf_args arg; char *buf_region; struct iovec vecs[HOMA_MAX_BPAGES]; int num_vecs; @@ -91,10 +91,10 @@ void homa_server(int port) } arg.start = buf_region; arg.length = 1000*HOMA_BPAGE_SIZE; - int status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_SET_BUF, &arg, + int status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_RCVBUF, &arg, sizeof(arg)); if (status < 0) { - printf("Error in setsockopt(SO_HOMA_SET_BUF): %s\n", + printf("Error in setsockopt(SO_HOMA_RCVBUF): %s\n", strerror(errno)); return; } From 
ff1129b59f95f8ae5e1db1b6ba2180eafd66698a Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 5 Dec 2024 09:19:45 -0800 Subject: [PATCH 085/625] Fix crash if homa_pool_check_waiting called with no region in pool --- homa_pool.c | 2 ++ test/unit_homa_pool.c | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/homa_pool.c b/homa_pool.c index d3460a31..563f5484 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -433,6 +433,8 @@ void homa_pool_check_waiting(struct homa_pool *pool) #ifdef __UNIT_TEST__ pool->check_waiting_invoked += 1; #endif /* __UNIT_TEST__ */ + if (!pool->region) + return; while (atomic_read(&pool->free_bpages) >= pool->bpages_needed) { struct homa_rpc *rpc; diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index c79e977e..2648ba41 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -519,6 +519,15 @@ TEST_F(homa_pool, homa_pool_check_waiting__basics) EXPECT_EQ(2, crpc3->msgin.num_bpages); EXPECT_EQ(INT_MAX, pool->bpages_needed); } +TEST_F(homa_pool, homa_pool_check_waiting__pool_not_initialized) +{ + struct homa_pool pool; + + memset(&pool, 0, sizeof(pool)); + + /* Without the initialization check, this will crash. */ + homa_pool_check_waiting(&pool); +} TEST_F(homa_pool, homa_pool_check_waiting__bpages_needed_but_no_queued_rpcs) { struct homa_pool *pool = self->hsk.buffer_pool; From 283630be97d09cdb14fbf3ea32e394a4cdc39af0 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 5 Dec 2024 09:30:54 -0800 Subject: [PATCH 086/625] Check for msg->msg_name null in homa_sendmsg --- homa_plumbing.c | 15 ++++++++++----- test/unit_homa_plumbing.c | 7 +++++++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index dcb5fa4d..0eef3b98 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -924,15 +924,21 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) { struct homa_sock *hsk = homa_sk(sk); struct homa_sendmsg_args args; + union sockaddr_in_union *addr; __u64 start = sched_clock(); - __u64 finish; - int result = 0; struct homa_rpc *rpc = NULL; - union sockaddr_in_union *addr = (union sockaddr_in_union *) - msg->msg_name; + int result = 0; + __u64 finish; per_cpu(homa_offload_core, raw_smp_processor_id()).last_app_active = start; + + addr = (union sockaddr_in_union *)msg->msg_name; + if (!addr) { + result = -EINVAL; + goto error; + } + if (unlikely(!msg->msg_control_is_user)) { tt_record("homa_sendmsg error: !msg->msg_control_is_user"); result = -EINVAL; @@ -1041,7 +1047,6 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) } tt_record2("homa_sendmsg returning error %d for id %d", result, args.id); - tt_freeze(); return result; } diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index fb95d3e5..c1fed479 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -391,6 +391,13 @@ TEST_F(homa_plumbing, homa_getsockopt__cant_copy_out_value) EXPECT_EQ(sizeof32(val), size); } +TEST_F(homa_plumbing, homa_sendmsg__msg_name_null) +{ + self->sendmsg_hdr.msg_name = NULL; + EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, + &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); +} TEST_F(homa_plumbing, homa_sendmsg__args_not_in_user_space) { self->sendmsg_hdr.msg_control_is_user = 0; From 9bee642a279acefcfd0f1fce7f915b13b81de7bd Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 5 Dec 2024 09:34:40 -0800 Subject: [PATCH 087/625] Trivial name change to improve clarity --- 
homa_plumbing.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index 0eef3b98..c2fa2dad 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -949,7 +949,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) result = -EFAULT; goto error; } - if (addr->in6.sin6_family != sk->sk_family) { + if (addr->sa.sa_family != sk->sk_family) { result = -EAFNOSUPPORT; goto error; } From 8340c9a058bf812ef81f536760c56164b085b755 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 5 Dec 2024 10:19:43 -0800 Subject: [PATCH 088/625] Fix false errors when tests are run with --ipv4 --- test/mock.c | 14 ++++++++++++++ test/mock.h | 1 + test/unit_homa_offload.c | 1 + test/unit_homa_outgoing.c | 4 ++++ test/unit_homa_plumbing.c | 1 + 5 files changed, 21 insertions(+) diff --git a/test/mock.c b/test/mock.c index ae83a97f..d2f4dfad 100644 --- a/test/mock.c +++ b/test/mock.c @@ -1397,6 +1397,20 @@ void mock_set_core(int num) pcpu_hot.cpu_number = num; } +/** + * mock_set_ipv6() - Invoked by some tests to make them work when tests + * are run with --ipv4. Changes the socket to an IPv6 socket and sets + * mock_mtu and mock_ipv6. + * @hsk: Socket to reset for IPv6, if it's currently set for IPv4. + */ +void mock_set_ipv6(struct homa_sock *hsk) +{ + mock_ipv6 = true; + mock_mtu -= hsk->ip_header_length - HOMA_IPV6_HEADER_LENGTH; + hsk->ip_header_length = HOMA_IPV6_HEADER_LENGTH; + hsk->sock.sk_family = AF_INET6; +} + /** * mock_skb_new() - Allocate and return a packet buffer. The buffer is * initialized as if it just arrived from the network. diff --git a/test/mock.h b/test/mock.h index 9391ec88..a83ee106 100644 --- a/test/mock.h +++ b/test/mock.h @@ -62,6 +62,7 @@ extern struct ctl_table_header * mock_register_net_sysctl(struct net *net, const char *path, struct ctl_table *table); extern void mock_set_core(int num); +extern void mock_set_ipv6(struct homa_sock *hsk); extern void mock_spin_lock(spinlock_t *lock); extern void mock_spin_unlock(spinlock_t *lock); extern int mock_skb_count(void); diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index 921e7096..9b4211c7 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -156,6 +156,7 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_homa_ipv6) struct common_header *h; struct sk_buff *skb; + mock_ipv6 = true; homa_gro_hook_tcp(); self->header.seg.offset = htonl(6000); skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 49245fe5..50e0fd12 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -289,6 +289,8 @@ TEST_F(homa_outgoing, homa_message_out_fill__basics) struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); + mock_set_ipv6(&self->hsk); + ASSERT_FALSE(crpc == NULL); ASSERT_EQ(0, -homa_message_out_fill(crpc, unit_iov_iter((void *) 1000, 3000), 0)); @@ -913,6 +915,8 @@ TEST_F(homa_outgoing, homa_resend_data__set_homa_info) { struct homa_rpc *crpc; + mock_set_ipv6(&self->hsk); + mock_ipv6 = true; mock_net_device.gso_max_size = 5000; crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index c1fed479..abee54f6 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -701,6 +701,7 @@ TEST_F(homa_plumbing, homa_recvmsg__rpc_has_error) self->client_ip, self->server_ip, self->server_port, 
self->client_id, 100, 2000); + mock_set_ipv6(&self->hsk); EXPECT_NE(NULL, crpc); crpc->completion_cookie = 44444; homa_rpc_abort(crpc, -ETIMEDOUT); From 6042c40b54afab933942d2e21d0a80a3d14ce989 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 5 Dec 2024 10:30:13 -0800 Subject: [PATCH 089/625] Skip cleanup after some errors in homa_recvmsg --- homa_plumbing.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index c2fa2dad..234a1900 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1078,15 +1078,11 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, */ return -EINVAL; } - if (msg->msg_controllen != sizeof(control)) { - result = -EINVAL; - goto done; - } + if (msg->msg_controllen != sizeof(control)) + return -EINVAL; if (unlikely(copy_from_user(&control, (void __user *)msg->msg_control, - sizeof(control)))) { - result = -EFAULT; - goto done; - } + sizeof(control)))) + return -EFAULT; control.completion_cookie = 0; tt_record3("homa_recvmsg starting, port %d, pid %d, flags %d", hsk->port, current->pid, control.flags); From a9525072de7b3ad31171eb067f9d6160b6108f1d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 5 Dec 2024 10:45:39 -0800 Subject: [PATCH 090/625] Return an error from homa_recvmsg for bad returned receive buffers --- homa_plumbing.c | 4 +++- homa_pool.c | 9 +++++++-- homa_pool.h | 2 +- test/unit_homa_outgoing.c | 1 - test/unit_homa_plumbing.c | 9 +++++++++ test/unit_homa_pool.c | 7 +++++++ 6 files changed, 27 insertions(+), 5 deletions(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index 234a1900..089e2796 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1092,9 +1092,11 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, result = -EINVAL; goto done; } - homa_pool_release_buffers(hsk->buffer_pool, control.num_bpages, + result = homa_pool_release_buffers(hsk->buffer_pool, control.num_bpages, control.bpage_offsets); control.num_bpages = 0; + if (result != 0) + goto done; rpc = homa_wait_for_message(hsk, (flags & MSG_DONTWAIT) ? (control.flags | HOMA_RECVMSG_NONBLOCKING) diff --git a/homa_pool.c b/homa_pool.c index 563f5484..a7078fad 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -397,14 +397,16 @@ void *homa_pool_get_buffer(struct homa_rpc *rpc, int offset, int *available) * @num_buffers: How many buffers to release. * @buffers: Points to @num_buffers values, each of which is an offset * from the start of the pool to the buffer to be released. + * Return: 0 for success, otherwise a negative errno. 
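 * (As of this patch the only failure is -EINVAL, returned when an
 * offset lies beyond the pool's region; homa_recvmsg propagates that
 * error back to the application instead of going on to wait for a
 * message.)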
*/ -void homa_pool_release_buffers(struct homa_pool *pool, int num_buffers, +int homa_pool_release_buffers(struct homa_pool *pool, int num_buffers, __u32 *buffers) { + int result = 0; int i; if (!pool->region) - return; + return result; for (i = 0; i < num_buffers; i++) { __u32 bpage_index = buffers[i] >> HOMA_BPAGE_SHIFT; struct homa_bpage *bpage = &pool->descriptors[bpage_index]; @@ -412,11 +414,14 @@ void homa_pool_release_buffers(struct homa_pool *pool, int num_buffers, if (bpage_index < pool->num_bpages) { if (atomic_dec_return(&bpage->refs) == 0) atomic_inc(&pool->free_bpages); + } else { + result = -EINVAL; } } tt_record3("Released %d bpages, free_bpages for port %d now %d", num_buffers, pool->hsk->port, atomic_read(&pool->free_bpages)); + return result; } /** diff --git a/homa_pool.h b/homa_pool.h index 3d6dc48f..55967037 100644 --- a/homa_pool.h +++ b/homa_pool.h @@ -156,7 +156,7 @@ void homa_pool_get_rcvbuf(struct homa_sock *hsk, struct homa_rcvbuf_args *args); int homa_pool_init(struct homa_sock *hsk, void *buf_region, __u64 region_size); -void homa_pool_release_buffers(struct homa_pool *pool, +int homa_pool_release_buffers(struct homa_pool *pool, int num_buffers, __u32 *buffers); #endif /* _HOMA_POOL_H */ diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 50e0fd12..48102e4f 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -916,7 +916,6 @@ TEST_F(homa_outgoing, homa_resend_data__set_homa_info) struct homa_rpc *crpc; mock_set_ipv6(&self->hsk); - mock_ipv6 = true; mock_net_device.gso_max_size = 5000; crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index abee54f6..b372d252 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -614,6 +614,15 @@ TEST_F(homa_plumbing, homa_recvmsg__release_buffers) EXPECT_EQ(0, atomic_read(&self->hsk.buffer_pool->descriptors[0].refs)); EXPECT_EQ(0, atomic_read(&self->hsk.buffer_pool->descriptors[1].refs)); } +TEST_F(homa_plumbing, homa_recvmsg__error_in_release_buffers) +{ + self->recvmsg_args.num_bpages = 1; + self->recvmsg_args.bpage_offsets[0] = + self->hsk.buffer_pool->num_bpages << HOMA_BPAGE_SHIFT; + + EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, + 0, 0, &self->recvmsg_hdr.msg_namelen)); +} TEST_F(homa_plumbing, homa_recvmsg__error_in_homa_wait_for_message) { self->hsk.shutdown = true; diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index 2648ba41..75ce8bd8 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -485,6 +485,13 @@ TEST_F(homa_pool, homa_pool_release_buffers__basics) EXPECT_EQ(0, atomic_read(&pool->descriptors[0].refs)); pool->region = saved_region; } +TEST_F(homa_pool, homa_pool_release_buffers__bogus_offset) +{ + __u32 buffer = self->hsk.buffer_pool->num_bpages << HOMA_BPAGE_SHIFT; + + EXPECT_EQ(EINVAL, -homa_pool_release_buffers(self->hsk.buffer_pool, + 1, &buffer)); +} TEST_F(homa_pool, homa_pool_check_waiting__basics) { From 78ab5328cb6954d549c1721f405765e4fca04a73 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 5 Dec 2024 10:49:51 -0800 Subject: [PATCH 091/625] Trivial change for readability --- homa_plumbing.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index 089e2796..e64c8dfd 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1136,7 +1136,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t 
len, int flags, if (likely(rpc->msgin.length >= 0)) { control.num_bpages = rpc->msgin.num_bpages; memcpy(control.bpage_offsets, rpc->msgin.bpage_offsets, - sizeof(control.bpage_offsets)); + sizeof(rpc->msgin.bpage_offsets)); } if (sk->sk_family == AF_INET6) { struct sockaddr_in6 *in6 = msg->msg_name; From e15d79ac97eede5f58846a48d01a9e46199ac758 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 5 Dec 2024 11:28:46 -0800 Subject: [PATCH 092/625] Minor update to notes.txt --- notes.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/notes.txt b/notes.txt index b29de62d..4827dacb 100755 --- a/notes.txt +++ b/notes.txt @@ -29,6 +29,7 @@ Notes for Homa implementation in Linux: * Also consider the amount of data that is "stuck" in the NIC? * Notes on Linux qdiscs: + * Default qdisc is fq_codel * qdisc_create() is in sch_api.c * Packet transmission "starts" in __dev_xmit_skb in dev.c. * sch_direct_xmit is called once it's time to actually transmit a From c2fdc6aa05f790ef8d42a09f26108e427ff376c3 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 5 Dec 2024 11:28:57 -0800 Subject: [PATCH 093/625] Renamed homa_try_bucket_lock to homa_try_rpc_lock Also added comments to clarify lock-unlock pairs for RPCs. --- homa_incoming.c | 7 +++---- homa_outgoing.c | 6 ++---- homa_plumbing.c | 11 ++++++----- homa_pool.c | 5 ++--- homa_rpc.c | 2 +- homa_rpc.h | 15 +++++++++++++++ homa_sock.h | 18 ------------------ 7 files changed, 29 insertions(+), 35 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index 601bf243..e85b257b 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -531,7 +531,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) kfree_skb(skb); } if (rpc) - homa_grant_check_rpc(rpc); + homa_grant_check_rpc(rpc); /* Unlocks rpc. */ while (num_acks > 0) { num_acks--; @@ -952,8 +952,7 @@ struct homa_rpc *homa_choose_fifo_grant(struct homa *homa) * the RPC, just skip it (waiting could deadlock), and it * will eventually get updated elsewhere. */ - if (homa_bucket_try_lock(oldest->bucket, oldest->id, - "homa_choose_fifo_grant")) { + if (homa_rpc_try_lock(oldest, "homa_choose_fifo_grant")) { homa_grant_update_incoming(oldest, homa); homa_rpc_unlock(oldest); } @@ -1106,7 +1105,7 @@ int homa_register_interests(struct homa_interest *interest, if (id != 0) { if (!homa_is_client(id)) return -EINVAL; - rpc = homa_find_client_rpc(hsk, id); + rpc = homa_find_client_rpc(hsk, id); /* Locks rpc. 
*/ if (!rpc) return -EINVAL; if (rpc->interest && rpc->interest != interest) { diff --git a/homa_outgoing.c b/homa_outgoing.c index 9ae3dfb7..f6d94f54 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -918,8 +918,7 @@ void homa_pacer_xmit(struct homa *homa) homa_throttle_unlock(homa); break; } - if (!homa_bucket_try_lock(rpc->bucket, rpc->id, - "homa_pacer_xmit")) { + if (!homa_rpc_try_lock(rpc, "homa_pacer_xmit")) { homa_throttle_unlock(homa); INC_METRIC(pacer_skipped_rpcs, 1); break; @@ -1062,8 +1061,7 @@ void homa_log_throttled(struct homa *homa) homa_throttle_lock(homa); list_for_each_entry_rcu(rpc, &homa->throttled_rpcs, throttled_links) { rpcs++; - if (!homa_bucket_try_lock(rpc->bucket, rpc->id, - "homa_log_throttled")) { + if (!homa_rpc_try_lock(rpc, "homa_log_throttled")) { pr_notice("Skipping throttled RPC: locked\n"); continue; } diff --git a/homa_plumbing.c b/homa_plumbing.c index e64c8dfd..7d330cc6 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -776,7 +776,7 @@ int homa_ioc_abort(struct sock *sk, int *karg) homa_rpc_free(rpc); else homa_rpc_abort(rpc, -args.error); - homa_rpc_unlock(rpc); + homa_rpc_unlock(rpc); /* Locked by homa_find_client_rpc. */ return ret; } @@ -980,7 +980,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) if (result) goto error; args.id = rpc->id; - homa_rpc_unlock(rpc); + homa_rpc_unlock(rpc); /* Locked by homa_rpc_new_client. */ rpc = NULL; if (unlikely(copy_to_user((void __user *)msg->msg_control, @@ -1023,6 +1023,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) if (rpc->state != RPC_IN_SERVICE) { tt_record2("homa_sendmsg error: RPC id %d in bad state %d", rpc->id, rpc->state); + /* Locked by homa_find_server_rpc. */ homa_rpc_unlock(rpc); rpc = NULL; result = -EINVAL; @@ -1033,7 +1034,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) result = homa_message_out_fill(rpc, &msg->msg_iter, 1); if (result && rpc->state != RPC_DEAD) goto error; - homa_rpc_unlock(rpc); + homa_rpc_unlock(rpc); /* Locked by homa_find_server_rpc. */ finish = sched_clock(); INC_METRIC(reply_ns, finish - start); } @@ -1043,7 +1044,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) error: if (rpc) { homa_rpc_free(rpc); - homa_rpc_unlock(rpc); + homa_rpc_unlock(rpc); /* Locked by homa_find_server_rpc. */ } tt_record2("homa_sendmsg returning error %d for id %d", result, args.id); @@ -1171,7 +1172,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, else rpc->state = RPC_IN_SERVICE; } - homa_rpc_unlock(rpc); + homa_rpc_unlock(rpc); /* Locked by homa_wait_for_message. */ done: if (unlikely(copy_to_user((__force void __user *)msg->msg_control, diff --git a/homa_pool.c b/homa_pool.c index a7078fad..10889e21 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -451,8 +451,7 @@ void homa_pool_check_waiting(struct homa_pool *pool) } rpc = list_first_entry(&pool->hsk->waiting_for_bufs, struct homa_rpc, buf_links); - if (!homa_bucket_try_lock(rpc->bucket, rpc->id, - "homa_pool_check_waiting")) { + if (!homa_rpc_try_lock(rpc, "homa_pool_check_waiting")) { /* Can't just spin on the RPC lock because we're * holding the socket lock (see sync.txt). Instead, * release the socket lock and try the entire @@ -476,7 +475,7 @@ void homa_pool_check_waiting(struct homa_pool *pool) if (rpc->msgin.num_bpages > 0) { /* Allocation succeeded; "wake up" the RPC. */ rpc->msgin.resend_all = 1; - homa_grant_check_rpc(rpc); + homa_grant_check_rpc(rpc); /* Unlocks rpc. 
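 * homa_grant_check_rpc consumes the RPC lock, so this branch needs no
 * explicit homa_rpc_unlock; the else branch below releases it instead.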
*/ } else { homa_rpc_unlock(rpc); } diff --git a/homa_rpc.c b/homa_rpc.c index ea36033b..939e8da6 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -232,7 +232,7 @@ void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, if (rpc) { tt_record1("homa_rpc_acked freeing id %d", rpc->id); homa_rpc_free(rpc); - homa_rpc_unlock(rpc); + homa_rpc_unlock(rpc); /* Locked by homa_find_server_rpc. */ } done: diff --git a/homa_rpc.h b/homa_rpc.h index ca8b81e2..20b94df5 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -457,6 +457,21 @@ static inline void homa_rpc_lock(struct homa_rpc *rpc, const char *locker) homa_bucket_lock(rpc->bucket, rpc->id, locker); } +/** + * homa_rpc_try_lock() - Acquire the lock for an RPC if it is available. + * @rpc: RPC to lock. + * @locker: Static string identifying the locking code. Normally ignored, + * but used when debugging deadlocks. + * Return: Nonzero if lock was successfully acquired, zero if it is + * currently owned by someone else. + */ +static inline int homa_rpc_try_lock(struct homa_rpc *rpc, const char *locker) +{ + if (!spin_trylock_bh(&rpc->bucket->lock)) + return 0; + return 1; +} + /** * homa_rpc_unlock() - Release the lock for an RPC. * @rpc: RPC to unlock. diff --git a/homa_sock.h b/homa_sock.h index 06f34aab..ac268cb7 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -386,24 +386,6 @@ static inline void homa_bucket_lock(struct homa_rpc_bucket *bucket, homa_bucket_lock_slow(bucket, id); } -/** - * homa_bucket_try_lock() - Acquire the lock for an RPC hash table bucket if - * it is available. - * @bucket: Bucket to lock - * @id: ID of the RPC that is requesting the lock. - * @locker: Static string identifying the locking code. Normally ignored, - * but used when debugging deadlocks. - * Return: Nonzero if lock was successfully acquired, zero if it is - * currently owned by someone else. - */ -static inline int homa_bucket_try_lock(struct homa_rpc_bucket *bucket, - __u64 id, const char *locker) -{ - if (!spin_trylock_bh(&bucket->lock)) - return 0; - return 1; -} - /** * homa_bucket_unlock() - Release the lock for an RPC hash table bucket. * @bucket: Bucket to unlock. 
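[Note: taken together, these changes make trylock call sites read at the RPC
level rather than the bucket level. The pattern shared by the converted
callers, condensed here from the homa_pacer_xmit hunk above (a fragment for
illustration, not a complete function):

	/* The caller already holds another lock (the throttle lock), so
	 * it must not spin waiting for the RPC's bucket lock; instead it
	 * backs off when the lock is busy and retries later.
	 */
	if (!homa_rpc_try_lock(rpc, "homa_pacer_xmit")) {
		homa_throttle_unlock(homa);
		INC_METRIC(pacer_skipped_rpcs, 1);
		break;
	}
	/* ... transmit packets from rpc ... */
	homa_rpc_unlock(rpc);
]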
From a20fae808a3c979d446167d130cc05c4ed5fdee3 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 5 Dec 2024 21:08:02 -0800 Subject: [PATCH 094/625] Change trigger lines for strip.py * Use '#ifndef __STRIP__' instead of '#if 1' * Use '#ifdef __STRIP__' instead of '#if 0' --- homa.h | 2 +- homa_impl.h | 8 ++++---- homa_incoming.c | 10 +++++----- homa_offload.c | 2 +- homa_outgoing.c | 16 ++++++++-------- homa_plumbing.c | 8 ++++---- homa_pool.h | 2 +- homa_rpc.c | 2 +- homa_sock.c | 2 +- homa_timer.c | 2 +- util/strip.py | 23 ++++++++++++----------- 11 files changed, 39 insertions(+), 38 deletions(-) diff --git a/homa.h b/homa.h index 8391141b..cc890175 100644 --- a/homa.h +++ b/homa.h @@ -188,7 +188,7 @@ struct homa_rcvbuf_args { #define HOMAIOCABORT _IOWR(0x89, 0xe3, struct homa_abort_args) #define HOMAIOCFREEZE _IO(0x89, 0xef) -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ int homa_abort(int sockfd, uint64_t id, int error); int homa_send(int sockfd, const void *message_buf, size_t length, const struct sockaddr *dest_addr, diff --git a/homa_impl.h b/homa_impl.h index 83bce282..34ffde23 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -43,7 +43,7 @@ #include #include -#if 1 /* See strip.py --alt */ +#ifndef __STRIP__ /* See strip.py --alt */ #include #include "homa.h" #else /* See strip.py */ @@ -124,7 +124,7 @@ void *mock_vmalloc(size_t size); #define per_cpu(name, core) (name[core]) #endif /* __UNIT_TEST__ */ -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ /* Null out things that confuse VSCode Intellisense */ #ifdef __VSCODE__ #define raw_smp_processor_id() 1 @@ -139,7 +139,7 @@ struct homa_peer; struct homa_sock; struct homa; -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ #include "timetrace.h" #endif /* See strip.py */ #include "homa_metrics.h" @@ -1048,7 +1048,7 @@ static inline bool is_homa_pkt(struct sk_buff *skb) (tcp_hdr(skb)->urg_ptr == htons(HOMA_TCP_URGENT)))); } -#if 1 /* See strip.py --alt */ +#ifndef __STRIP__ /* See strip.py --alt */ /** * tt_addr() - Given an address, return a 4-byte id that will (hopefully) * provide a unique identifier for the address in a timetrace record. 
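[Note: the marker convention these hunks establish is uniform. Code that
should exist only in this research tree is wrapped as shown below, and
strip.py deletes the whole block, guards included, when producing an
upstreamable tree; an `#ifdef __STRIP__` block instead keeps its contents and
loses only the guard lines. A representative use (the tt_record call is taken
from a later hunk in this patch; the wrapping shown here is illustrative):

	#ifndef __STRIP__ /* See strip.py */
		/* Research-only instrumentation, removed by strip.py. */
		tt_record("pacer sleeping");
	#endif /* See strip.py */
]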
diff --git a/homa_incoming.c b/homa_incoming.c index e85b257b..a6b84144 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -227,7 +227,7 @@ int homa_copy_to_user(struct homa_rpc *rpc) #define MAX_SKBS 20 #endif /* __UNIT_TEST__ */ struct sk_buff *skbs[MAX_SKBS]; -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ int start_offset = 0; int end_offset = 0; #endif /* See strip.py */ @@ -310,7 +310,7 @@ int homa_copy_to_user(struct homa_rpc *rpc) goto free_skbs; copied += chunk_size; } -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ if (end_offset == 0) { start_offset = offset; } else if (end_offset != offset) { @@ -323,7 +323,7 @@ int homa_copy_to_user(struct homa_rpc *rpc) } free_skbs: -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ if (end_offset != 0) { tt_record3("copied out bytes %d-%d for id %d", start_offset, end_offset, rpc->id); @@ -681,7 +681,7 @@ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, struct homa_sock *hsk) { struct resend_header *h = (struct resend_header *)skb->data; -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); #endif /* See strip.py */ struct busy_header busy; @@ -823,7 +823,7 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, */ if (rpc && (rpc->state != RPC_INCOMING || rpc->msgin.bytes_remaining)) { -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ tt_record3("NEED_ACK arrived for id %d before message received, state %d, remaining %d", rpc->id, rpc->state, rpc->msgin.bytes_remaining); homa_freeze(rpc, NEED_ACK_MISSING_DATA, diff --git a/homa_offload.c b/homa_offload.c index f8b8fa90..c8248319 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -340,7 +340,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, INC_METRIC(gro_grant_bypasses, 1); goto bypass; } -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ } else { tt_record4("homa_gro_receive got packet from 0x%x id %llu, type 0x%x, priority %d", saddr, homa_local_id(h_new->common.sender_id), diff --git a/homa_outgoing.c b/homa_outgoing.c index f6d94f54..eadbddd3 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -358,7 +358,7 @@ int homa_xmit_control(enum homa_packet_type type, void *contents, int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, struct homa_sock *hsk) { -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ struct netdev_queue *txq; #endif /* See strip.py */ struct common_header *h; @@ -406,7 +406,7 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, * a bogus "reference count"). 
*/ if (refcount_read(&skb->users) > 1) { -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ if (hsk->inet.sk.sk_family == AF_INET6) { pr_notice("ip6_xmit didn't free Homa control packet (type %d) after error %d\n", h->type, result); @@ -426,7 +426,7 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, #endif /* See strip.py */ } } -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ txq = netdev_get_tx_queue(skb->dev, skb->queue_mapping); if (netif_tx_queue_stopped(txq)) tt_record4("__homa_xmit_control found stopped txq for id %d, qid %d, num_queued %d, limit %d", @@ -491,7 +491,7 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) __acquires(rpc->bucket_lock) { struct homa *homa = rpc->hsk->homa; -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ struct netdev_queue *txq; #endif /* See strip.py */ @@ -530,7 +530,7 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) homa_rpc_unlock(rpc); skb_get(skb); __homa_xmit_data(skb, rpc, priority); -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ txq = netdev_get_tx_queue(skb->dev, skb->queue_mapping); if (netif_tx_queue_stopped(txq)) tt_record4("homa_xmit_data found stopped txq for id %d, qid %d, num_queued %d, limit %d", @@ -555,7 +555,7 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) */ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority) { -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ struct homa_skb_info *homa_info = homa_get_skb_info(skb); #endif /* See strip.py */ struct dst_entry *dst; @@ -762,7 +762,7 @@ int homa_check_nic_queue(struct homa *homa, struct sk_buff *skb, bool force) return 0; if (!list_empty(&homa->throttled_rpcs)) INC_METRIC(pacer_bytes, bytes); -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ if (idle < clock) { if (homa->pacer_wake_time) { __u64 lost = (homa->pacer_wake_time > idle) @@ -815,7 +815,7 @@ int homa_pacer_main(void *transport) * incoming packets from being handled). 
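 * (The lines below use the standard kernel sleep idiom: the task marks
 * itself TASK_INTERRUPTIBLE before re-checking the wakeup condition, so
 * a wakeup that races with the check cannot be lost.)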
*/ set_current_state(TASK_INTERRUPTIBLE); -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ if (list_first_or_null_rcu(&homa->throttled_rpcs, struct homa_rpc, throttled_links) == NULL) tt_record("pacer sleeping"); diff --git a/homa_plumbing.c b/homa_plumbing.c index 7d330cc6..03db8006 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -606,7 +606,7 @@ int __init homa_load(void) } homa_gro_hook_tcp(); -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ tt_init("timetrace", homa->temp); #endif /* See strip.py */ @@ -645,7 +645,7 @@ void __exit homa_unload(void) pr_notice("Homa module unloading\n"); exiting = true; -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ tt_destroy(); #endif /* See strip.py */ @@ -1255,7 +1255,7 @@ int homa_softirq(struct sk_buff *skb) packets = skb; prev_link = &packets; for (skb = packets; skb; skb = next) { -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); #endif /* See strip.py */ @@ -1567,7 +1567,7 @@ int homa_dointvec(const struct ctl_table *table, int write, if (action == 2) { homa_rpc_log_active(homa, 0); } else if (action == 3) { -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ tt_record("Freezing because of sysctl"); tt_freeze(); #endif /* See strip.py */ diff --git a/homa_pool.h b/homa_pool.h index 55967037..35aa0526 100644 --- a/homa_pool.h +++ b/homa_pool.h @@ -86,7 +86,7 @@ struct homa_pool_core { }; }; -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ _Static_assert(sizeof(struct homa_pool_core) == L1_CACHE_BYTES, "homa_pool_core overflowed a cache line"); #endif /* See strip.py */ diff --git a/homa_rpc.c b/homa_rpc.c index 939e8da6..d52db34e 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -602,7 +602,7 @@ void homa_rpc_log_active(struct homa *homa, uint64_t id) */ void homa_rpc_log_tt(struct homa_rpc *rpc) { -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ if (rpc->state == RPC_INCOMING) { int received = rpc->msgin.length - rpc->msgin.bytes_remaining; diff --git a/homa_sock.c b/homa_sock.c index 47b58f9f..3cde648c 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -266,7 +266,7 @@ void homa_sock_shutdown(struct homa_sock *hsk) while (!list_empty(&hsk->dead_rpcs)) { homa_rpc_reap(hsk, 1000); i++; -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ if (i == 5) { tt_record("Freezing because reap seems hung"); tt_freeze(); diff --git a/homa_timer.c b/homa_timer.c index 0033921f..41966f42 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -118,7 +118,7 @@ void homa_check_rpc(struct homa_rpc *rpc) } resend.priority = homa->num_priorities - 1; homa_xmit_control(RESEND, &resend, sizeof(resend), rpc); -#if 1 /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ if (homa_is_client(rpc->id)) { us = "client"; them = "server"; diff --git a/util/strip.py b/util/strip.py index 185ab9a2..30f637b2 100755 --- a/util/strip.py +++ b/util/strip.py @@ -18,26 +18,26 @@ #if statments in the following ways: * This entire block will be removed in the stripped version: - #if 1 /* See strip.py */ + #ifndef __STRIP__ /* See strip.py */ ... #endif /* See strip.py */ * The #if and #endif statements will be removed, leaving just the code in between: - #if 0 /* See strip.py */ + #ifdef __STRIP__ /* See strip.py */ ... #endif /* See strip.py */ * Everything will be removed except the code between #else and #endif: - #if 1 /* See strip.py */ + #ifndef __STRIP__ /* See strip.py */ ... #else /* See strip.py */ ... 
#endif /* See strip.py */ * It is also possible to strip using "alt" mode, with lines like this: - #if 1 /* See strip.py --alt */ - #if 0 /* See strip.py --alt */ + #ifndef __STRIP__ /* See strip.py --alt */ + #ifdef __STRIP__ /* See strip.py --alt */ If the --alt option was not specified then these lines are handled as if "--alt" wasn't present in the comments. However, if the --alt option was specified then these lines are ignored. @@ -132,8 +132,9 @@ def scan(file, alt_mode): skip_statement = False # Values of 0 or 1 mean we're in the middle of a group of lines labeled - # with '#if /* GitHubOnly */'. 0 means we're including lines, 1 means - # we're stripping them. None means we're not in such a group. + # with '#ifndef __STRIP__' or "#ifdef __STRIP__". 0 means we're including + # lines, 1 means we're stripping them. None means we're not in such a + # group. in_labeled_skip = None # Used to strip out unit testing code. Value is one of: @@ -217,16 +218,16 @@ def scan(file, alt_mode): continue if in_labeled_skip == 1: continue - if line.startswith('#if 1 /* See strip.py */') or ( - line.startswith('#if 1 /* See strip.py --alt */') + if line.startswith('#ifndef __STRIP__ /* See strip.py */') or ( + line.startswith('#ifndef __STRIP__ /* See strip.py --alt */') and not alt_mode): if slines[-1].strip() == '': delete_empty_line = True in_labeled_skip = 1 check_braces = False continue - if line.startswith('#if 0 /* See strip.py */')or ( - line.startswith('#if 0 /* See strip.py --alt */') + if line.startswith('#ifdef __STRIP__ /* See strip.py */') or ( + line.startswith('#ifdef __STRIP__ /* See strip.py --alt */') and not alt_mode): if slines[-1].strip() == '': slines.pop() From 2296089a1c00f2003b63250d0151832d5f464d53 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 6 Dec 2024 16:42:14 -0800 Subject: [PATCH 095/625] Trivial Makefile update --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index fb0cc40f..c1590b4a 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Makefile to build Homa as a Linux module. -HOMA_OBJS = homa_grant.o \ +HOMA_OBJS := homa_grant.o \ homa_incoming.o \ homa_metrics.o \ homa_offload.o \ From ab9fb519574196cdba2c91d0a512ba99498fcfd7 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 6 Dec 2024 16:51:47 -0800 Subject: [PATCH 096/625] Change uint32_t to __u32 --- homa_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/homa_impl.h b/homa_impl.h index 34ffde23..b253efa2 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -1054,7 +1054,7 @@ static inline bool is_homa_pkt(struct sk_buff *skb) * provide a unique identifier for the address in a timetrace record. * @x: Address (either IPv6 or IPv4-mapped IPv6) */ -static inline uint32_t tt_addr(const struct in6_addr x) +static inline __u32 tt_addr(const struct in6_addr x) { return is_mapped_ipv4(x) ? ntohl(x.in6_u.u6_addr32[3]) : (x.in6_u.u6_addr32[3] ? ntohl(x.in6_u.u6_addr32[3]) From 005a672beea64c29f2a36ca53ec7387c2d5a9a52 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 6 Dec 2024 17:22:03 -0800 Subject: [PATCH 097/625] Trivial updates --- homa_outgoing.c | 3 ++- homa_pool.h | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/homa_outgoing.c b/homa_outgoing.c index eadbddd3..8d902817 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -983,9 +983,10 @@ void homa_pacer_stop(struct homa *homa) * homa_add_to_throttled() - Make sure that an RPC is on the throttled list * and wake up the pacer thread if necessary. 
* @rpc: RPC with outbound packets that have been granted but can't be - * sent because of NIC queue restrictions. + * sent because of NIC queue restrictions. Must be locked by caller. */ void homa_add_to_throttled(struct homa_rpc *rpc) + __must_hold(&rpc->bucket->lock) { struct homa *homa = rpc->hsk->homa; struct homa_rpc *candidate; diff --git a/homa_pool.h b/homa_pool.h index 35aa0526..da952c8a 100644 --- a/homa_pool.h +++ b/homa_pool.h @@ -47,8 +47,10 @@ struct homa_bpage { }; }; +#ifndef __STRIP__ /* See strip.py */ _Static_assert(sizeof(struct homa_bpage) == L1_CACHE_BYTES, "homa_bpage overflowed a cache line"); +#endif /* See strip.py */ /** * struct homa_pool_core - Holds core-specific data for a homa_pool (a bpage From 672963e2d79c9fc1c2c856a2bf5ccc295df9b584 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 9 Dec 2024 11:59:23 -0800 Subject: [PATCH 098/625] Don't reset crpc->silent_ticks on NEED_ACK packets --- homa_incoming.c | 3 ++- test/unit_homa_incoming.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index a6b84144..2a9958e9 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -464,7 +464,8 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) } else { if (h->common.type == DATA || h->common.type == GRANT || - h->common.type == BUSY) + h->common.type == BUSY || + h->common.type == NEED_ACK) rpc->silent_ticks = 0; rpc->peer->outstanding_resends = 0; } diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 4a789da6..6fbd00df 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -1041,7 +1041,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__reset_counters) EXPECT_EQ(0, crpc->peer->outstanding_resends); /* Don't reset silent_ticks for some packet types. */ - h.common.type = NEED_ACK; + h.common.type = CUTOFFS; crpc->silent_ticks = 5; crpc->peer->outstanding_resends = 2; homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), From 16af2eb111050dcfdcf64be3d5f51e6e6021e730 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 9 Dec 2024 12:03:05 -0800 Subject: [PATCH 099/625] Updated README.md Add note about IANA protocol number assignment --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index a417d495..099de49f 100644 --- a/README.md +++ b/README.md @@ -125,6 +125,7 @@ This repo contains an implementation of the Homa transport protocol as a Linux k sysctl mechanism. For details, see the man page `homa.7`. ## Significant recent improvements +- October 2024: Homa now has an official IANA IP protocol number (146). - August 2024: upgraded to Linux 6.10.6. - July 2024: introduced "TCP hijacking", where Homa packets are sent as legitimate TCP segments (using TCP as the IP protocol) and then reclaimed From 2486742ff42f947ee77dfc63fb66a3180e424ef9 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 9 Dec 2024 16:22:09 -0800 Subject: [PATCH 100/625] Turn off TT_KERNEL in timetrace.c (was accidentally left on) --- timetrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/timetrace.c b/timetrace.c index 721b3746..f924ce82 100644 --- a/timetrace.c +++ b/timetrace.c @@ -12,7 +12,7 @@ * timetrace stubs; we will then connect the timetrace mechanism here with * those stubs to allow the rest of the kernel to log in our buffers. 
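 * (TT_KERNEL can therefore only be enabled against a kernel that has
 * been patched with those stubs; presumably that is why leaving it on
 * by accident, as fixed below, breaks builds on stock kernels.)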
*/ -#define TT_KERNEL 1 +//#define TT_KERNEL 1 #endif /* __UNIT_TEST__ */ #ifdef TT_KERNEL extern struct tt_buffer *tt_linux_buffers[]; From dc5688a9dc33714c0ba8e051e2a7451e2adb5d1e Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 10 Dec 2024 09:15:14 -0800 Subject: [PATCH 101/625] Add tx_nic_rx field to intervals in tthoma.py --- util/tthoma.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index d33cddd9..0317c516 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -234,6 +234,10 @@ def __missing__(self, key): # tx_in_nic: Number of bytes of data that have been passed to the NIC # but not yet returned via the tx completion queue, as of the # end of the interval +# tx_nic_rx: Number of bytes of data that have been received by the +# destination but their packet buffers haven't been returned +# from the NIC via the completion queue, as of the end of +# the interval # tx_qdisc: Bytes of data that have been passed to ip*xmit but not # yet passed to the NIC, as of the end of the interval (large # numbers probably due to qdisc) @@ -3929,6 +3933,7 @@ def init_intervals(self): 'tx_nic_pkts': 0, 'tx_nic_bytes': 0, 'tx_in_nic': 0, + 'tx_nic_rx': 0, 'tx_qdisc': 0, 'tx_q': 0, 'tx_gro_bytes': 0, @@ -4084,7 +4089,6 @@ def analyze(self): nic_interval = get_interval(tx_node, tnic) else: tnic = None - tnic = pkt['nic'] if 'nic' in pkt else None tfree = pkt['free_tx_skb'] if 'free_tx_skb' in pkt else None tgro = pkt['gro'] if 'gro' in pkt else None @@ -4103,6 +4107,7 @@ def analyze(self): if 'nic' in pkt: tnic = pkt['nic'] + nic_interval = get_interval(tx_node, tnic) node_xmits[tx_node].append([pkt['nic'], tso_length + data_overhead_bytes]) nic_interval['tx_nic_pkts'] += 1 @@ -4127,6 +4132,9 @@ def analyze(self): start = traces[tx_node]['first_time'] add_to_intervals(tx_node, start, tfree, 'tx_in_nic', tso_length) + if tgro != None: + add_to_intervals(tx_node, tgro, tfree, 'tx_nic_rx', + tso_length) if tgro != None: interval = get_interval(tx_node, tgro) @@ -4144,6 +4152,8 @@ def analyze(self): tnic, 'rx_data_qdisc', length) elif not nic_data: tnic = txmit + if tnic != None: + nic_interval = get_interval(tx_node, tnic) elif txmit != None: add_to_intervals(rx_node, txmit, traces[tx_node]['last_time'], 'rx_data_qdisc', length) @@ -7167,10 +7177,10 @@ def output(self): 'of the interval\n') f.write('# Reqs: Request messages that have been started ' 'but not fully\n') - f.write(' transmitted as of the end of the interval\n') + f.write('# transmitted as of the end of the interval\n') f.write('# Resps: Response messages that have been started ' 'but not fully\n') - f.write(' transmitted as of the end of the interval\n') + f.write('# transmitted as of the end of the interval\n') f.write('# Pkts: Packets transmitted during the interval\n') f.write('# QDisc: KB of data that have been passed to ip*xmit ' 'but not yet\n') @@ -7184,9 +7194,13 @@ def output(self): f.write('# link speed)\n') f.write('# InNic: KB of data that have been queued for the ' 'NIC but whose packets\n') - f.write('# haven\'t been returned after transmission, ' - 'as of the end of\n') - f.write('# the interval\n') + f.write('# NicRx: KB of data that are still in the NIC\'s ' + 'possession (their packets\n') + f.write('# haven\'t been returned after transmission) ' + 'even though the data\n') + f.write('# has been received by the destination, as ' + 'of the end of the\n') + f.write('# interval\n') f.write('# FreeKB: KB of skb data freed after NIC notified 
' 'transmission complete\n') f.write('# MinFr: Smallest p[\'free_tx_skb\'] - p[\'nic\'] for a ' @@ -7214,7 +7228,7 @@ def output(self): 'during the interval\n') f.write('\n# Time Gbps TxKB RPCs Reqs Resps') - f.write(' Pkts Qdisc NicKB NQEst InNic FreeKB') + f.write(' Pkts Qdisc NicKB NQEst InNic NicRx FreeKB') f.write(' MinFr MaxFr MinGF MaxGF') f.write(' GXmit GGro GAvail GNew\n') total = 0 @@ -7230,11 +7244,12 @@ def output(self): interval['rpcs_live'], interval['tx_live_req'], interval['tx_live_resp'])) - f.write(' %4d %5.0f %5.0f %5.1f %5.0f %5.0f' % ( + f.write(' %4d %5.0f %5.0f %5.1f %5.0f %5.0f %5.0f' % ( interval['tx_pkts'], interval['tx_qdisc'] * 1e-3, interval['tx_nic_bytes'] * 1e-3, interval['tx_q'] * 8 / (options.gbps * 1000), interval['tx_in_nic'] * 1e-3, + interval['tx_nic_rx'] * 1e-3, interval['tx_free_bytes'] * 1e-3)) v = interval['tx_min_free'] min_free = '%.1f' % v if v != 0 else '' From 132db93fb8ccad246be06f0ebf5471668120e115 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Sat, 7 Dec 2024 18:12:31 -0800 Subject: [PATCH 102/625] Fix strip.py bug handling lines with code followed by /*...*/ --- util/strip.py | 51 +++++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/util/strip.py b/util/strip.py index 30f637b2..0357b03d 100755 --- a/util/strip.py +++ b/util/strip.py @@ -121,12 +121,9 @@ def scan(file, alt_mode): global exit_code - # True means the current line is in the middle of a /* ... */ comment + # True means the current line ends in the middle of a /* ... */ comment in_comment = False - # True means the current line is at least partially a comment line. - current_has_comment = False - # True means we're in the middle of a multi-line statement that # should be skipped (drop until a semicolon is seen). skip_statement = False @@ -181,32 +178,34 @@ def scan(file, alt_mode): line_num += 1 # pline is used for parsing; it is modified to remove - # uninteresting information such as comments and whitespace. - pline = line.rstrip() - - # See if (part of) this line is a comment. - current_has_comment = in_comment - if in_comment: - index = pline.find('*/') - if index >= 0: - in_comment = False - else: - index = pline.find('/*') - if index >= 0: - current_has_comment = True - index2 = pline.find('*/', index+2) - if index2 < 0: - in_comment = True - index = pline.find('//') - if index >= 0: - current_has_comment = True - - pline = pline.strip() + # uninteresting information such as whitespace. + pline = line.strip() + if pline.startswith('//') and not 'SPDX-License' in pline: # Strip // comment lines: these are used only for commenting # out debugging code. continue + # Extract the part of the line that is *not* in a /*...*/ comment + # (assume at most one comment per line). 
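 # Worked examples for the extraction below (illustrative only, not
 # part of the patch):
 #   "x = 1; /* note */ y = 2;"   ->  non_comment == "x = 1;  y = 2;"
 #   "foo(); /* continues..."     ->  non_comment == "foo();", in_comment = True
 #   line inside an open comment  ->  non_comment == ""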
+ cstart = pline.find('/*')
+ cend = pline.find('*/')
+ if cstart >= 0:
+ if cend >= 0:
+ non_comment = pline[0:cstart] + pline[cend+2:]
+ in_comment = False
+ else:
+ non_comment = pline[0:cstart]
+ in_comment = True
+ elif cend >= 0:
+ non_comment = pline[cend+2:]
+ in_comment = False
+ elif in_comment:
+ non_comment = ''
+ else:
+ non_comment = pline
+ non_comment = non_comment.strip()
+
 # Strip groups of lines labeled with special '#if'
 if in_labeled_skip != None:
 if line.startswith('#endif /* See strip.py */'):
@@ -350,7 +349,7 @@ def scan(file, alt_mode):
 open_index = len(slines)

 # Count statements
- if pline[-1] == ';' and not current_has_comment:
+ if non_comment and non_comment[-1] == ';':
 statements_in_block += 1

 # The current line needs to be retained in the output.
From c5c8e6f651ae1e85e21bb86024dbed8bc1a76366 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 9 Dec 2024 09:03:41 -0800
Subject: [PATCH 103/625] Fix checkpatch.pl issue

---
 homa_sock.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/homa_sock.h b/homa_sock.h
index ac268cb7..7b21f9fb 100644
--- a/homa_sock.h
+++ b/homa_sock.h
@@ -300,11 +300,8 @@ struct homa_sock *homa_socktab_start_scan(struct homa_socktab *socktab,
 static inline void homa_sock_lock(struct homa_sock *hsk, const char *locker)
 __acquires(&hsk->lock)
 {
- if (!spin_trylock_bh(&hsk->lock)) {
-// printk(KERN_NOTICE "Slow path for socket %d, last locker %s",
-// hsk->client_port, hsk->last_locker);
+ if (!spin_trylock_bh(&hsk->lock))
 homa_sock_lock_slow(hsk);
- }
 // hsk->last_locker = locker;
 }

From f29c2c03ec0544a69b22c755607d979f3b60ac0f Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 9 Dec 2024 09:03:55 -0800
Subject: [PATCH 104/625] Update to notes.txt

---
 notes.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/notes.txt b/notes.txt
index 4827dacb..e2907288 100755
--- a/notes.txt
+++ b/notes.txt
@@ -86,6 +86,9 @@ Notes for Homa implementation in Linux:
 * Eventually SoftIRQ wakes up to handle the original packet, which
 re-creates the RPC and it gets serviced a second time.

+* Use vmap to map the user-space buffer pool so that the kernel can use
+ memcpy rather than copy_to_user?
+
 * SoftIRQ processing can lock out kernel-to-user copies; add a preemption
 mechanism where the copying code can set a flag that it needs the lock,
 then SoftIRQ releases the lock until the flag is clear?

From 3b9d02f8c74e83fcd0f9d913357aebe3a176bcbb Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 13 Dec 2024 08:53:12 -0800
Subject: [PATCH 105/625] Fix issues found by patchwork builds

checkpatch.pl warnings, plus documentation warnings from kdoc
---
 homa.h | 10 +++++-----
 homa_impl.h | 17 +++++++++++++++--
 homa_outgoing.c | 5 +++--
 homa_peer.c | 4 ++--
 homa_peer.h | 5 +++--
 homa_plumbing.c | 4 +++-
 homa_pool.c | 8 ++++----
 homa_rpc.h | 1 +
 homa_sock.h | 7 ++++++-
 homa_utils.c | 13 +------------
 homa_wire.h | 6 ++++--
 11 files changed, 47 insertions(+), 33 deletions(-)

diff --git a/homa.h b/homa.h
index cc890175..f1100238 100644
--- a/homa.h
+++ b/homa.h
@@ -112,11 +112,11 @@ struct homa_recvmsg_args {
 * @bpage_offsets: (in/out) Each entry is an offset into the buffer
 * region for the socket pool. When returned from recvmsg, the
 * offsets indicate where fragments of the new message are stored. All
- * entries but the last refer to full buffer pages (HOMA_BPAGE_SIZE bytes)
- * and are bpage-aligned. The last entry may refer to a bpage fragment and
- * is not necessarily aligned.
The application now owns these bpages and - * must eventually return them to Homa, using bpage_offsets in a future - * recvmsg invocation. + * entries but the last refer to full buffer pages (HOMA_BPAGE_SIZE + * bytes) and are bpage-aligned. The last entry may refer to a bpage + * fragment and is not necessarily aligned. The application now owns + * these bpages and must eventually return them to Homa, using + * bpage_offsets in a future recvmsg invocation. */ uint32_t bpage_offsets[HOMA_MAX_BPAGES]; }; diff --git a/homa_impl.h b/homa_impl.h index b253efa2..956feb15 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -161,8 +161,14 @@ void homa_throttle_lock_slow(struct homa *homa); * and easier to use than sockaddr_storage). */ union sockaddr_in_union { + + /** @sa: Used to access as a generic sockaddr. */ struct sockaddr sa; + + /** @in4: Used to access as IPv4 socket. */ struct sockaddr_in in4; + + /** @in6: Used to access as IPv6 socket. */ struct sockaddr_in6 in6; }; @@ -453,7 +459,7 @@ struct homa { struct homa_socktab *port_map __aligned(L1_CACHE_BYTES); /** - * @peertab: Info about all the other hosts we have communicated with. + * @peers: Info about all the other hosts we have communicated with. * Dynamically allocated; must be kfreed. */ struct homa_peertab *peers; @@ -529,7 +535,7 @@ struct homa { int window_param; /** - * @link_bandwidth: The raw bandwidth of the network uplink, in + * @link_mbps: The raw bandwidth of the network uplink, in * units of 1e06 bits per second. Set externally via sysctl. */ int link_mbps; @@ -905,6 +911,7 @@ struct homa_skb_info { * homa_get_skb_info() - Return the address of Homa's private information * for an sk_buff. * @skb: Socket buffer whose info is needed. + * Return: address of Homa's private information for @skb. */ static inline struct homa_skb_info *homa_get_skb_info(struct sk_buff *skb) { @@ -915,6 +922,7 @@ static inline struct homa_skb_info *homa_get_skb_info(struct sk_buff *skb) /** * homa_next_skb() - Compute address of Homa's private link field in @skb. * @skb: Socket buffer containing private link field. + * Return: address of Homa's private link field for @skb. * * Homa needs to keep a list of buffers in a message, but it can't use the * links built into sk_buffs because Homa wants to retain its list even @@ -974,6 +982,7 @@ static inline bool skb_is_ipv6(const struct sk_buff *skb) * ipv4_to_ipv6() - Given an IPv4 address, return an equivalent IPv6 address * (an IPv4-mapped one). * @ip4: IPv4 address, in network byte order. + * Return: IPv6 address that is equivalent to @ip4. */ static inline struct in6_addr ipv4_to_ipv6(__be32 ip4) { @@ -990,6 +999,7 @@ static inline struct in6_addr ipv4_to_ipv6(__be32 ip4) * ipv6_to_ipv4() - Given an IPv6 address produced by ipv4_to_ipv6, return * the original IPv4 address (in network byte order). * @ip6: IPv6 address; assumed to be a mapped IPv4 address. + * Return: IPv4 address stored in @ip6. */ static inline __be32 ipv6_to_ipv4(const struct in6_addr ip6) { @@ -1001,6 +1011,7 @@ static inline __be32 ipv6_to_ipv4(const struct in6_addr ip6) * form used in Homa, which is always an IPv6 address; if the original address * was IPv4, convert it to an IPv4-mapped IPv6 address. * @addr: Address to canonicalize (if NULL, "any" is returned). + * Return: IPv6 address corresponding to @addr. 
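 * A hypothetical caller (illustration only):
 *
 *	struct in6_addr dest = canonical_ipv6_addr(addr);
 *
 * receives the same in6_addr form whether @addr held an IPv4 or an
 * IPv6 address, so downstream code needs only one comparison path.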
*/ static inline struct in6_addr canonical_ipv6_addr(const union sockaddr_in_union *addr) @@ -1020,6 +1031,7 @@ static inline struct in6_addr canonical_ipv6_addr(const union sockaddr_in_union * address; if the original address was IPv4, convert it to an IPv4-mapped * IPv6 address. * @skb: The source address will be extracted from this packet buffer. + * Return: IPv6 address for @skb's source machine. */ static inline struct in6_addr skb_canonical_ipv6_saddr(struct sk_buff *skb) { @@ -1031,6 +1043,7 @@ static inline struct in6_addr skb_canonical_ipv6_saddr(struct sk_buff *skb) * is_mapped_ipv4() - Return true if an IPv6 address is actually an * IPv4-mapped address, false otherwise. * @x: The address to check. + * Return: see above. */ static inline bool is_mapped_ipv4(const struct in6_addr x) { diff --git a/homa_outgoing.c b/homa_outgoing.c index 8d902817..4d56bdd0 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -109,6 +109,9 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, int err, gso_size; __u64 segs; + segs = length + max_seg_data - 1; + do_div(segs, max_seg_data); + /* Initialize the overall skb. */ skb = homa_skb_new_tx(sizeof32(struct data_header)); if (!skb) @@ -135,8 +138,6 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, h->retransmit = 0; h->seg.offset = htonl(-1); - segs = length + max_seg_data - 1; - do_div(segs, max_seg_data); homa_info = homa_get_skb_info(skb); homa_info->next_skb = NULL; homa_info->wire_bytes = length + segs * (sizeof(struct data_header) diff --git a/homa_peer.c b/homa_peer.c index 5aad3696..d2429827 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -297,8 +297,8 @@ struct dst_entry *homa_peer_get_dst(struct homa_peer *peer, struct rtable *rt; flowi4_init_output(&peer->flow.u.ip4, inet->sk.sk_bound_dev_if, - inet->sk.sk_mark, inet->tos, RT_SCOPE_UNIVERSE, - inet->sk.sk_protocol, 0, + inet->sk.sk_mark, inet->tos, + RT_SCOPE_UNIVERSE, inet->sk.sk_protocol, 0, peer->addr.in6_u.u6_addr32[3], inet->inet_saddr, 0, 0, inet->sk.sk_uid); security_sk_classify_flow(&inet->sk, &peer->flow.u.__fl_common); diff --git a/homa_peer.h b/homa_peer.h index da2d40fb..5a33d955 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -205,7 +205,8 @@ int homa_peertab_init(struct homa_peertab *peertab); void homa_peer_add_ack(struct homa_rpc *rpc); struct homa_peer *homa_peer_find(struct homa_peertab *peertab, - const struct in6_addr *addr, struct inet_sock *inet); + const struct in6_addr *addr, + struct inet_sock *inet); int homa_peer_get_acks(struct homa_peer *peer, int count, struct homa_ack *dst); struct dst_entry @@ -243,7 +244,7 @@ static inline void homa_peer_unlock(struct homa_peer *peer) * updating it if the cached information is stale. * @peer: Peer whose destination information is desired. * @hsk: Homa socket; needed by lower-level code to recreate the dst. - * Return Up-to-date destination for peer. + * Return: Up-to-date destination for peer. 
*/ static inline struct dst_entry *homa_get_dst(struct homa_peer *peer, struct homa_sock *hsk) diff --git a/homa_plumbing.c b/homa_plumbing.c index 03db8006..ce520dd9 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -522,6 +522,7 @@ int __init homa_load(void) int status; pr_notice("Homa module loading\n"); +#ifndef __STRIP__ /* See strip.py */ pr_notice("Homa structure sizes: data_header %u, seg_header %u, ack %u, grant_header %u, peer %u, ip_hdr %u flowi %u ipv6_hdr %u, flowi6 %u tcp_sock %u homa_rpc %u sk_buff %u rcvmsg_control %u union sockaddr_in_union %u HOMA_MAX_BPAGES %u NR_CPUS %u nr_cpu_ids %u, MAX_NUMNODES %d\n", sizeof32(struct data_header), sizeof32(struct seg_header), @@ -541,6 +542,7 @@ int __init homa_load(void) NR_CPUS, nr_cpu_ids, MAX_NUMNODES); +#endif /* See strip.py */ status = proto_register(&homa_prot, 1); if (status != 0) { pr_err("proto_register failed for homa_prot: %d\n", status); @@ -1094,7 +1096,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, goto done; } result = homa_pool_release_buffers(hsk->buffer_pool, control.num_bpages, - control.bpage_offsets); + control.bpage_offsets); control.num_bpages = 0; if (result != 0) goto done; diff --git a/homa_pool.c b/homa_pool.c index 10889e21..a21cf9ce 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -130,7 +130,7 @@ void homa_pool_destroy(struct homa_pool *pool) * @args: Store info here. */ void homa_pool_get_rcvbuf(struct homa_sock *hsk, - struct homa_rcvbuf_args *args) + struct homa_rcvbuf_args *args) { homa_sock_lock(hsk, "homa_pool_get_rcvbuf"); args->start = hsk->buffer_pool->region; @@ -215,13 +215,13 @@ int homa_pool_get_pages(struct homa_pool *pool, int num_pages, __u32 *pages, */ ref_count = atomic_read(&bpage->refs); if (ref_count >= 2 || (ref_count == 1 && (bpage->owner < 0 || - bpage->expiration > now))) + bpage->expiration > now))) continue; if (!spin_trylock_bh(&bpage->lock)) continue; ref_count = atomic_read(&bpage->refs); if (ref_count >= 2 || (ref_count == 1 && (bpage->owner < 0 || - bpage->expiration > now))) { + bpage->expiration > now))) { spin_unlock_bh(&bpage->lock); continue; } @@ -400,7 +400,7 @@ void *homa_pool_get_buffer(struct homa_rpc *rpc, int offset, int *available) * Return: 0 for success, otherwise a negative errno. */ int homa_pool_release_buffers(struct homa_pool *pool, int num_buffers, - __u32 *buffers) + __u32 *buffers) { int result = 0; int i; diff --git a/homa_rpc.h b/homa_rpc.h index 20b94df5..d272ffb8 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -518,6 +518,7 @@ static inline void homa_unprotect_rpcs(struct homa_sock *hsk) * homa_is_client(): returns true if we are the client for a particular RPC, * false if we are the server. * @id: Id of the RPC in question. + * Return: true if we are the client for RPC id, false otherwise */ static inline bool homa_is_client(__u64 id) { diff --git a/homa_sock.h b/homa_sock.h index 7b21f9fb..079ac7eb 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -50,8 +50,10 @@ struct homa_socktab { * of a homa_socktab. */ struct homa_socktab_links { - /* Must be the first element of the struct! */ + /** hash_links: links this element into the hash chain. */ struct hlist_node hash_links; + + /** @sock: Homa socket structure. */ struct homa_sock *sock; }; @@ -269,7 +271,10 @@ struct homa_sock { * is present in the socket struct after Homa-specific information. */ struct homa_v6_sock { + /** @homa: All socket info except for IPv6-specific stuff. */ struct homa_sock homa; + + /** @inet6: Socket info specific to IPv6. 
*/ struct ipv6_pinfo inet6; }; diff --git a/homa_utils.c b/homa_utils.c index d72604bb..d4dbb857 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -594,8 +594,6 @@ int homa_snprintf(char *buffer, int size, int used, const char *format, ...) */ char *homa_symbol_for_type(uint8_t type) { - static char buffer[20]; - switch (type) { case DATA: return "DATA"; @@ -616,16 +614,7 @@ char *homa_symbol_for_type(uint8_t type) case ACK: return "ACK"; } - - /* Using a static buffer can produce garbled text under concurrency, - * but (a) it's unlikely (this code only executes if the opcode is - * bogus), (b) this is mostly for testing and debugging, and (c) the - * code below ensures that the string cannot run past the end of the - * buffer, so the code is safe. - */ - snprintf(buffer, sizeof(buffer) - 1, "unknown(%u)", type); - buffer[sizeof(buffer) - 1] = 0; - return buffer; + return "??"; } /** diff --git a/homa_wire.h b/homa_wire.h index d8032490..78859065 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -287,7 +287,8 @@ _Static_assert(sizeof(struct data_header) <= HOMA_MAX_HEADER, "data_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); _Static_assert(sizeof(struct data_header) >= HOMA_MIN_PKT_LENGTH, "data_header too small: Homa doesn't currently have codeto pad data packets"); -_Static_assert(((sizeof(struct data_header) - sizeof(struct seg_header)) & 0x3) == 0, +_Static_assert(((sizeof(struct data_header) - sizeof(struct seg_header)) & + 0x3) == 0, " data_header length not a multiple of 4 bytes (required for TCP/TSO compatibility"); /** @@ -298,7 +299,8 @@ _Static_assert(((sizeof(struct data_header) - sizeof(struct seg_header)) & 0x3) */ static inline int homa_data_len(struct sk_buff *skb) { - return skb->len - skb_transport_offset(skb) - sizeof(struct data_header); + return skb->len - skb_transport_offset(skb) - + sizeof(struct data_header); } /** From 3754a5e3127adf3f78c404cd04bb7329a6c0a4b5 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 10 Dec 2024 10:50:21 -0800 Subject: [PATCH 106/625] Change tt_record message for homa_softirq Simplifies process of stripping for Linux upstreaming --- homa_plumbing.c | 18 +++++------------- util/tthoma.py | 2 +- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index ce520dd9..d8b7cf35 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1237,7 +1237,6 @@ int homa_softirq(struct sk_buff *skb) struct sk_buff **prev_link, **other_link; struct homa *homa = global_homa; struct common_header *h; - int first_packet = 1; int header_offset; int pull_length; __u64 start; @@ -1252,15 +1251,12 @@ int homa_softirq(struct sk_buff *skb) * leaving the longer packets in the list. Also, perform various * prep/cleanup/error checking functions. 
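 * (GRO batches arrivals: skb_shinfo(skb)->frag_list links the packets
 * that were aggregated with this one; the pass below walks that list so
 * short packets can be handled immediately while longer ones stay
 * queued for later processing.)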
*/ + tt_record("homa_softirq starting"); skb->next = skb_shinfo(skb)->frag_list; skb_shinfo(skb)->frag_list = NULL; packets = skb; prev_link = &packets; for (skb = packets; skb; skb = next) { -#ifndef __STRIP__ /* See strip.py */ - const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); - -#endif /* See strip.py */ next = skb->next; /* Make the header available at skb->data, even if the packet @@ -1286,6 +1282,8 @@ int homa_softirq(struct sk_buff *skb) if (unlikely(skb->len < sizeof(struct common_header) || h->type < DATA || h->type >= BOGUS || skb->len < header_lengths[h->type - DATA])) { + const struct in6_addr saddr = + skb_canonical_ipv6_saddr(skb); if (homa->verbose) pr_warn("Homa %s packet from %s too short: %d bytes\n", homa_symbol_for_type(h->type), @@ -1295,13 +1293,6 @@ int homa_softirq(struct sk_buff *skb) goto discard; } - if (first_packet) { - tt_record4("homa_softirq: first packet from 0x%x:%d, id %llu, type %d", - tt_addr(saddr), ntohs(h->sport), - homa_local_id(h->sender_id), h->type); - first_packet = 0; - } - /* Check for FREEZE here, rather than in homa_incoming.c, so * it will work even if the RPC and/or socket are unknown. */ @@ -1309,7 +1300,8 @@ int homa_softirq(struct sk_buff *skb) if (!tt_frozen) { homa_rpc_log_active_tt(homa, 0); tt_record4("Freezing because of request on port %d from 0x%x:%d, id %d", - ntohs(h->dport), tt_addr(saddr), + ntohs(h->dport), + tt_addr(skb_canonical_ipv6_saddr(skb)), ntohs(h->sport), homa_local_id(h->sender_id)); tt_freeze(); diff --git a/util/tthoma.py b/util/tthoma.py index 0317c516..fa7ebb06 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -1288,7 +1288,7 @@ def __softirq_start(self, trace, time, core, match, interests): patterns.append({ 'name': 'softirq_start', - 'regexp': 'homa_softirq: first packet' + 'regexp': 'homa_softirq starting' }) def __rpc_handoff(self, trace, time, core, match, interests): From a5f78d05fa76392063a96fc81c40d077933689ad Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 10 Dec 2024 11:21:25 -0800 Subject: [PATCH 107/625] Remove sport argument from homa_find_server_rpc Not needed since ids uniquely identify client RPCs. --- homa_incoming.c | 1 - homa_plumbing.c | 3 +-- homa_rpc.c | 10 +++------- homa_rpc.h | 3 +-- test/unit_homa_incoming.c | 4 ++-- test/unit_homa_rpc.c | 11 +++++------ 6 files changed, 12 insertions(+), 20 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index 2a9958e9..80b3bd62 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -441,7 +441,6 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) } } else { rpc = homa_find_server_rpc(hsk, &saddr, - ntohs(h->common.sport), id); } } else { diff --git a/homa_plumbing.c b/homa_plumbing.c index d8b7cf35..ec3f963e 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1007,8 +1007,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) } canonical_dest = canonical_ipv6_addr(addr); - rpc = homa_find_server_rpc(hsk, &canonical_dest, - ntohs(addr->in6.sin6_port), args.id); + rpc = homa_find_server_rpc(hsk, &canonical_dest, args.id); if (!rpc) { /* Return without an error if the RPC doesn't exist; * this could be totally valid (e.g. 
client is diff --git a/homa_rpc.c b/homa_rpc.c index d52db34e..04aa76b6 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -212,7 +212,6 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, struct homa_ack *ack) { - __u16 client_port = ntohs(ack->client_port); __u16 server_port = ntohs(ack->server_port); __u64 id = homa_local_id(ack->client_id); struct homa_sock *hsk2 = hsk; @@ -228,7 +227,7 @@ void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, if (!hsk2) goto done; } - rpc = homa_find_server_rpc(hsk2, saddr, client_port, id); + rpc = homa_find_server_rpc(hsk2, saddr, id); if (rpc) { tt_record1("homa_rpc_acked freeing id %d", rpc->id); homa_rpc_free(rpc); @@ -507,7 +506,6 @@ struct homa_rpc *homa_find_client_rpc(struct homa_sock *hsk, __u64 id) * a packet belongs to, if there is any. Thread-safe without socket lock. * @hsk: Socket via which packet was received. * @saddr: Address from which the packet was sent. - * @sport: Port at @saddr from which the packet was sent. * @id: Unique identifier for the RPC (must have server bit set). * * Return: A pointer to the homa_rpc matching the arguments, or NULL @@ -515,8 +513,7 @@ struct homa_rpc *homa_find_client_rpc(struct homa_sock *hsk, __u64 id) * unlock it by invoking homa_rpc_unlock. */ struct homa_rpc *homa_find_server_rpc(struct homa_sock *hsk, - const struct in6_addr *saddr, __u16 sport, - __u64 id) + const struct in6_addr *saddr, __u64 id) __acquires(&srpc->bucket->lock) { struct homa_rpc_bucket *bucket = homa_server_rpc_bucket(hsk, id); @@ -524,8 +521,7 @@ struct homa_rpc *homa_find_server_rpc(struct homa_sock *hsk, homa_bucket_lock(bucket, id, __func__); hlist_for_each_entry_rcu(srpc, &bucket->rpcs, hash_links) { - if (srpc->id == id && srpc->dport == sport && - ipv6_addr_equal(&srpc->peer->addr, saddr)) + if (srpc->id == id && ipv6_addr_equal(&srpc->peer->addr, saddr)) return srpc; } homa_bucket_unlock(bucket, id); diff --git a/homa_rpc.h b/homa_rpc.h index d272ffb8..c6e5043c 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -418,8 +418,7 @@ struct homa_rpc *homa_find_client_rpc(struct homa_sock *hsk, __u64 id); struct homa_rpc *homa_find_server_rpc(struct homa_sock *hsk, - const struct in6_addr *saddr, __u16 sport, - __u64 id); + const struct in6_addr *saddr, __u64 id); void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, struct homa_ack *ack); void homa_rpc_free(struct homa_rpc *rpc); diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 6fbd00df..c03ffec9 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -1789,9 +1789,9 @@ TEST_F(homa_incoming, homa_ack_pkt__target_rpc_doesnt_exist) self->client_ip, self->server_ip, self->client_port, self->server_id+2, 100, 5000); struct ack_header h = {.common = { - .sport = htons(self->client_port + 1), + .sport = htons(self->client_port), .dport = htons(self->hsk2.port), - .sender_id = cpu_to_be64(self->client_id), + .sender_id = cpu_to_be64(self->client_id + 10), .type = ACK}, .num_acks = htons(2)}; diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 17e111e0..6dc4577b 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -705,17 +705,16 @@ TEST_F(homa_rpc, homa_find_server_rpc) ASSERT_NE(NULL, srpc3); ASSERT_NE(NULL, srpc4); EXPECT_EQ(srpc1, homa_find_server_rpc(&self->hsk, self->client_ip, - self->client_port, srpc1->id)); + srpc1->id)); homa_rpc_unlock(srpc1); EXPECT_EQ(srpc2, homa_find_server_rpc(&self->hsk, 
self->client_ip, - self->client_port, srpc2->id)); + srpc2->id)); homa_rpc_unlock(srpc2); EXPECT_EQ(srpc3, homa_find_server_rpc(&self->hsk, self->client_ip, - self->client_port+1, srpc3->id)); + srpc3->id)); homa_rpc_unlock(srpc3); EXPECT_EQ(srpc4, homa_find_server_rpc(&self->hsk, self->client_ip, - self->client_port+1, srpc4->id)); + srpc4->id)); homa_rpc_unlock(srpc4); - EXPECT_EQ(NULL, homa_find_server_rpc(&self->hsk, self->client_ip, - self->client_port, 3)); + EXPECT_EQ(NULL, homa_find_server_rpc(&self->hsk, self->client_ip, 3)); } From be127d855cb21ce4b5c0e5baaf038fa5e28eec0b Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 10 Dec 2024 11:39:47 -0800 Subject: [PATCH 108/625] Remove client_port field from struct homa_ack No longer needed now that homa_find_server_rpc doesn't take a client_port argument. --- homa_peer.c | 1 - homa_utils.c | 3 +-- homa_wire.h | 7 ++----- test/unit_homa_grant.c | 2 +- test/unit_homa_incoming.c | 13 ++++--------- test/unit_homa_offload.c | 3 +-- test/unit_homa_outgoing.c | 3 +-- test/unit_homa_peer.c | 18 ++++++------------ test/unit_homa_rpc.c | 6 +----- test/utils.c | 9 ++++----- 10 files changed, 21 insertions(+), 44 deletions(-) diff --git a/homa_peer.c b/homa_peer.c index d2429827..e1e7f167 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -388,7 +388,6 @@ void homa_peer_add_ack(struct homa_rpc *rpc) homa_peer_lock(peer); if (peer->num_acks < HOMA_MAX_ACKS_PER_PKT) { peer->acks[peer->num_acks].client_id = cpu_to_be64(rpc->id); - peer->acks[peer->num_acks].client_port = htons(rpc->hsk->port); peer->acks[peer->num_acks].server_port = htons(rpc->dport); peer->num_acks++; homa_peer_unlock(peer); diff --git a/homa_utils.c b/homa_utils.c index d4dbb857..c51112fe 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -398,8 +398,7 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) used = homa_snprintf(buffer, buf_len, used, ", acks"); for (i = 0; i < count; i++) { used = homa_snprintf(buffer, buf_len, used, - " [cp %d, sp %d, id %llu]", - ntohs(h->acks[i].client_port), + " [sp %d, id %llu]", ntohs(h->acks[i].server_port), be64_to_cpu(h->acks[i].client_id)); } diff --git a/homa_wire.h b/homa_wire.h index 78859065..c2031942 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -177,9 +177,6 @@ struct homa_ack { */ __be64 client_id; - /** @client_port: The client-side port for the RPC. */ - __be16 client_port; - /** @server_port: The server-side port for the RPC. */ __be16 server_port; } __packed; @@ -278,7 +275,7 @@ struct data_header { */ __u8 retransmit; - __u8 pad; + char pad[3]; /** @seg: First of possibly many segments. 
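* When GSO is used without TCP hijacking, each later segment gets its own seg_header interleaved with the data (see homa_fill_data_interleaved), giving the wire layout data_header | seg 0 data | seg_header | seg 1 data | ...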
*/ struct seg_header seg; @@ -286,7 +283,7 @@ struct data_header { _Static_assert(sizeof(struct data_header) <= HOMA_MAX_HEADER, "data_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); _Static_assert(sizeof(struct data_header) >= HOMA_MIN_PKT_LENGTH, - "data_header too small: Homa doesn't currently have codeto pad data packets"); + "data_header too small: Homa doesn't currently have code to pad data packets"); _Static_assert(((sizeof(struct data_header) - sizeof(struct seg_header)) & 0x3) == 0, " data_header length not a multiple of 4 bytes (required for TCP/TSO compatibility"); diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 423713b9..b26e2c05 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -85,7 +85,7 @@ FIXTURE_SETUP(homa_grant) .sender_id = cpu_to_be64(self->client_id)}, .message_length = htonl(10000), .incoming = htonl(10000), .cutoff_version = 0, - .ack = {0, 0, 0}, + .ack = {0, 0}, .retransmit = 0, .seg = {.offset = 0}}; unit_log_clear(); diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index c03ffec9..3d59f344 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -164,7 +164,7 @@ FIXTURE_SETUP(homa_incoming) .sender_id = cpu_to_be64(self->client_id)}, .message_length = htonl(10000), .incoming = htonl(10000), .cutoff_version = 0, - .ack = {0, 0, 0}, + .ack = {0, 0}, .retransmit = 0, .seg = {.offset = 0}}; unit_log_clear(); @@ -1073,7 +1073,6 @@ TEST_F(homa_incoming, homa_dispatch_pkts__handle_ack) ASSERT_NE(NULL, srpc); self->data.ack = (struct homa_ack) { - .client_port = htons(self->client_port), .server_port = htons(self->server_port), .client_id = cpu_to_be64(self->client_id)}; self->data.common.sender_id = cpu_to_be64(self->client_id+10); @@ -1088,7 +1087,6 @@ TEST_F(homa_incoming, homa_dispatch_pkts__too_many_acks) struct sk_buff *skb, *skb2, *skb3; self->data.ack = (struct homa_ack) { - .client_port = htons(self->client_port), .server_port = htons(self->server_port), .client_id = cpu_to_be64(self->client_id)}; self->data.common.sender_id = cpu_to_be64(self->client_id+10); @@ -1748,14 +1746,13 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_doesnt_exist) .sender_id = cpu_to_be64(self->server_id), .type = NEED_ACK}}; - peer->acks[0].client_port = htons(self->client_port); peer->acks[0].server_port = htons(self->server_port); peer->acks[0].client_id = cpu_to_be64(self->client_id+2); peer->num_acks = 1; mock_xmit_log_verbose = 1; homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), &self->homa); - EXPECT_STREQ("xmit ACK from 0.0.0.0:32768, dport 99, id 1234, acks [cp 40000, sp 99, id 1236]", + EXPECT_STREQ("xmit ACK from 0.0.0.0:32768, dport 99, id 1234, acks [sp 99, id 1236]", unit_log_get()); } @@ -1800,11 +1797,9 @@ TEST_F(homa_incoming, homa_ack_pkt__target_rpc_doesnt_exist) EXPECT_EQ(2, unit_list_length(&self->hsk2.active_rpcs)); unit_log_clear(); mock_xmit_log_verbose = 1; - h.acks[0] = (struct homa_ack) {.client_port = htons(self->client_port), - .server_port = htons(self->server_port), + h.acks[0] = (struct homa_ack) {.server_port = htons(self->server_port), .client_id = cpu_to_be64(self->server_id+5)}; - h.acks[1] = (struct homa_ack) {.client_port = htons(self->client_port), - .server_port = htons(self->server_port), + h.acks[1] = (struct homa_ack) {.server_port = htons(self->server_port), .client_id = cpu_to_be64(self->server_id+1)}; homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), &self->homa); diff --git a/test/unit_homa_offload.c 
b/test/unit_homa_offload.c index 9b4211c7..a1cdfaf3 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -53,7 +53,7 @@ FIXTURE_SETUP(homa_offload) .sender_id = cpu_to_be64(1000)}, .message_length = htonl(10000), .incoming = htonl(10000), .cutoff_version = 0, - .ack = {0, 0, 0}, + .ack = {0, 0}, .retransmit = 0, .seg = {.offset = htonl(2000)}}; for (i = 0; i < GRO_HASH_BUCKETS; i++) { @@ -267,7 +267,6 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS) h.incoming = htonl(10000); h.cutoff_version = 0; h.ack.client_id = 0; - h.ack.client_port = 0; h.ack.server_port = 0; h.retransmit = 0; h.seg.offset = htonl(2000); diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 48102e4f..24477c67 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -381,7 +381,6 @@ TEST_F(homa_outgoing, homa_message_out_fill__include_acks) ASSERT_FALSE(crpc == NULL); crpc->peer->acks[0] = (struct homa_ack) { - .client_port = htons(100), .server_port = htons(200), .client_id = cpu_to_be64(1000)}; crpc->peer->num_acks = 1; @@ -389,7 +388,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__include_acks) unit_iov_iter((void *) 1000, 500), 0)); homa_rpc_unlock(crpc); homa_skb_get(crpc->msgout.packets, &h, 0, sizeof(h)); - EXPECT_STREQ("client_port 100, server_port 200, client_id 1000", + EXPECT_STREQ("server_port 200, client_id 1000", unit_ack_string(&h.ack)); } TEST_F(homa_outgoing, homa_message_out_fill__multiple_segs_per_skbuff) diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index d2c49480..4a443146 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -368,15 +368,12 @@ TEST_F(homa_peer, homa_peer_add_ack) /* Initialize 3 acks in the peer. */ peer->acks[0] = (struct homa_ack) { - .client_port = htons(1000), .server_port = htons(self->server_port), .client_id = cpu_to_be64(90)}; peer->acks[1] = (struct homa_ack) { - .client_port = htons(1001), .server_port = htons(self->server_port), .client_id = cpu_to_be64(91)}; peer->acks[2] = (struct homa_ack) { - .client_port = htons(1002), .server_port = htons(self->server_port), .client_id = cpu_to_be64(92)}; peer->num_acks = 3; @@ -384,13 +381,13 @@ TEST_F(homa_peer, homa_peer_add_ack) /* Add one RPC to unacked (fits). */ homa_peer_add_ack(crpc1); EXPECT_EQ(4, peer->num_acks); - EXPECT_STREQ("client_port 32768, server_port 99, client_id 101", + EXPECT_STREQ("server_port 99, client_id 101", unit_ack_string(&peer->acks[3])); /* Add another RPC to unacked (also fits). */ homa_peer_add_ack(crpc2); EXPECT_EQ(5, peer->num_acks); - EXPECT_STREQ("client_port 32768, server_port 99, client_id 102", + EXPECT_STREQ("server_port 99, client_id 102", unit_ack_string(&peer->acks[4])); /* Third RPC overflows, triggers ACK transmission. */ @@ -398,7 +395,7 @@ TEST_F(homa_peer, homa_peer_add_ack) mock_xmit_log_verbose = 1; homa_peer_add_ack(crpc3); EXPECT_EQ(0, peer->num_acks); - EXPECT_STREQ("xmit ACK from 0.0.0.0:32768, dport 99, id 103, acks [cp 1000, sp 99, id 90] [cp 1001, sp 99, id 91] [cp 1002, sp 99, id 92] [cp 32768, sp 99, id 101] [cp 32768, sp 99, id 102]", + EXPECT_STREQ("xmit ACK from 0.0.0.0:32768, dport 99, id 103, acks [sp 99, id 90] [sp 99, id 91] [sp 99, id 92] [sp 99, id 101] [sp 99, id 102]", unit_log_get()); } @@ -416,27 +413,24 @@ TEST_F(homa_peer, homa_peer_get_acks) // Second call: retrieve 2 out of 3. 
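// (homa_peer_get_acks drains from the end of the acks array, so ids 101 and 102 come back first, leaving id 100 for the third call.)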
peer->acks[0] = (struct homa_ack) { - .client_port = htons(4000), .server_port = htons(5000), .client_id = cpu_to_be64(100)}; peer->acks[1] = (struct homa_ack) { - .client_port = htons(4001), .server_port = htons(5001), .client_id = cpu_to_be64(101)}; peer->acks[2] = (struct homa_ack) { - .client_port = htons(4002), .server_port = htons(5002), .client_id = cpu_to_be64(102)}; peer->num_acks = 3; EXPECT_EQ(2, homa_peer_get_acks(peer, 2, acks)); - EXPECT_STREQ("client_port 4001, server_port 5001, client_id 101", + EXPECT_STREQ("server_port 5001, client_id 101", unit_ack_string(&acks[0])); - EXPECT_STREQ("client_port 4002, server_port 5002, client_id 102", + EXPECT_STREQ("server_port 5002, client_id 102", unit_ack_string(&acks[1])); EXPECT_EQ(1, peer->num_acks); // Third call: retrieve final id. EXPECT_EQ(1, homa_peer_get_acks(peer, 2, acks)); - EXPECT_STREQ("client_port 4000, server_port 5000, client_id 100", + EXPECT_STREQ("server_port 5000, client_id 100", unit_ack_string(&acks[0])); } diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 6dc4577b..c0497b73 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -47,7 +47,7 @@ FIXTURE_SETUP(homa_rpc) .sender_id = self->client_id}, .message_length = htonl(10000), .incoming = htonl(10000), .cutoff_version = 0, - .ack = {0, 0, 0}, + .ack = {0, 0}, .retransmit = 0, .seg = {.offset = 0}}; self->iovec.iov_base = (void *) 2000; @@ -305,7 +305,6 @@ TEST_F(homa_rpc, homa_rpc_acked__basics) self->server_ip, self->client_port, self->server_id, 100, 3000); ASSERT_NE(NULL, srpc); - ack.client_port = htons(self->client_port); ack.server_port = htons(self->server_port); ack.client_id = cpu_to_be64(self->client_id); homa_rpc_acked(&hsk, self->client_ip, &ack); @@ -324,7 +323,6 @@ TEST_F(homa_rpc, homa_rpc_acked__lookup_socket) self->server_ip, self->client_port, self->server_id, 100, 3000); ASSERT_NE(NULL, srpc); - ack.client_port = htons(self->client_port); ack.server_port = htons(self->server_port); ack.client_id = cpu_to_be64(self->client_id); homa_rpc_acked(&self->hsk, self->client_ip, &ack); @@ -343,7 +341,6 @@ TEST_F(homa_rpc, homa_rpc_acked__no_such_socket) self->server_ip, self->client_port, self->server_id, 100, 3000); ASSERT_NE(NULL, srpc); - ack.client_port = htons(self->client_port); ack.server_port = htons(self->server_port+1); ack.client_id = cpu_to_be64(self->client_id); homa_rpc_acked(&hsk, self->client_ip, &ack); @@ -362,7 +359,6 @@ TEST_F(homa_rpc, homa_rpc_acked__no_such_rpc) self->server_ip, self->client_port, self->server_id, 100, 3000); ASSERT_NE(NULL, srpc); - ack.client_port = htons(self->client_port); ack.server_port = htons(self->server_port); ack.client_id = cpu_to_be64(self->client_id+10); homa_rpc_acked(&hsk, self->client_ip, &ack); diff --git a/test/utils.c b/test/utils.c index b520ad90..c97b4ce8 100644 --- a/test/utils.c +++ b/test/utils.c @@ -67,7 +67,7 @@ struct homa_rpc *unit_client_rpc(struct homa_sock *hsk, }, .message_length = htonl(resp_length), .incoming = htonl(10000), - .ack = {0, 0, 0}, + .ack = {0, 0}, .cutoff_version = 0, .retransmit = 0, .seg = {.offset = 0} @@ -361,7 +361,7 @@ struct homa_rpc *unit_server_rpc(struct homa_sock *hsk, }, .message_length = htonl(req_length), .incoming = htonl(10000), - .ack = {0, 0, 0}, + .ack = {0, 0}, .cutoff_version = 0, .retransmit = 0, .seg = {.offset = 0} @@ -446,9 +446,8 @@ char *unit_ack_string(struct homa_ack *ack) static char buffer[1000]; snprintf(buffer, sizeof(buffer), - "client_port %d, server_port %d, client_id %llu", - ntohs(ack->client_port), 
ntohs(ack->server_port), - be64_to_cpu(ack->client_id)); + "server_port %d, client_id %llu", + ntohs(ack->server_port), be64_to_cpu(ack->client_id)); return buffer; } From c0e9b6f3d099cfaec66e5c8dcabb66ff7ed059bf Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 10 Dec 2024 16:56:20 -0800 Subject: [PATCH 109/625] Rework ICMP packet handling V6 handler was incorrect; also cleaned up V4 handler. --- homa_plumbing.c | 66 +++++++++++------------- test/mock.c | 92 +++++++++++++++++++-------------- test/unit_homa_plumbing.c | 104 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 187 insertions(+), 75 deletions(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index ec3f963e..cf22f370 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1387,7 +1387,7 @@ int homa_softirq(struct sk_buff *skb) */ int homa_backlog_rcv(struct sock *sk, struct sk_buff *skb) { - pr_warn("unimplemented backlog_rcv invoked on Homa socket\n"); + pr_warn_once("unimplemented backlog_rcv invoked on Homa socket\n"); kfree_skb(skb); return 0; } @@ -1402,34 +1402,34 @@ int homa_backlog_rcv(struct sock *sk, struct sk_buff *skb) */ int homa_err_handler_v4(struct sk_buff *skb, u32 info) { - const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); - const struct iphdr *iph = ip_hdr(skb); + const struct icmphdr *icmp = icmp_hdr(skb); struct homa *homa = global_homa; - int type = icmp_hdr(skb)->type; - int code = icmp_hdr(skb)->code; + struct in6_addr daddr; + int type = icmp->type; + int code = icmp->code; + struct iphdr *iph; + int error = 0; + int port = 0; + iph = (struct iphdr *)(skb->data); + daddr = ipv4_to_ipv6(iph->daddr); if (type == ICMP_DEST_UNREACH && code == ICMP_PORT_UNREACH) { - char *icmp = (char *)icmp_hdr(skb); - struct common_header *h; + struct common_header *h = (struct common_header *)(skb->data + + iph->ihl * 4); - iph = (struct iphdr *)(icmp + sizeof(struct icmphdr)); - h = (struct common_header *)(icmp + sizeof(struct icmphdr) - + iph->ihl * 4); - homa_abort_rpcs(homa, &saddr, ntohs(h->dport), -ENOTCONN); + port = ntohs(h->dport); + error = -ENOTCONN; } else if (type == ICMP_DEST_UNREACH) { - int error; - if (code == ICMP_PROT_UNREACH) error = -EPROTONOSUPPORT; else error = -EHOSTUNREACH; - tt_record2("ICMP destination unreachable: 0x%x (daddr 0x%x)", - ntohl(iph->saddr), ntohl(iph->daddr)); - homa_abort_rpcs(homa, &saddr, 0, error); } else { pr_notice("%s invoked with info %x, ICMP type %d, ICMP code %d\n", __func__, info, type, code); } + if (error != 0) + homa_abort_rpcs(homa, &daddr, port, error); return 0; } @@ -1450,30 +1450,22 @@ int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, { const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; struct homa *homa = global_homa; + int error = 0; + int port = 0; if (type == ICMPV6_DEST_UNREACH && code == ICMPV6_PORT_UNREACH) { - char *icmp = (char *)icmp_hdr(skb); - struct common_header *h; - - iph = (struct ipv6hdr *)(icmp + sizeof(struct icmphdr)); - h = (struct common_header *)(icmp + sizeof(struct icmphdr) - + HOMA_IPV6_HEADER_LENGTH); - homa_abort_rpcs(homa, &iph->daddr, ntohs(h->dport), -ENOTCONN); - } else if (type == ICMPV6_DEST_UNREACH) { - int error; - - if (code == ICMP_PROT_UNREACH) - error = -EPROTONOSUPPORT; - else - error = -EHOSTUNREACH; - tt_record2("ICMPv6 destination unreachable: 0x%x (daddr 0x%x)", - tt_addr(iph->saddr), tt_addr(iph->daddr)); - homa_abort_rpcs(homa, &iph->daddr, 0, error); - } else { - if (homa->verbose) - pr_notice("%s invoked with info %x, ICMP type %d, ICMP code %d\n", - 
__func__, info, type, code); + const struct common_header *h; + + h = (struct common_header *)(skb->data + sizeof(*iph)); + port = ntohs(h->dport); + error = -ENOTCONN; + } else if (type == ICMPV6_DEST_UNREACH && code == ICMPV6_ADDR_UNREACH) { + error = -EHOSTUNREACH; + } else if (type == ICMPV6_PARAMPROB && code == ICMPV6_UNK_NEXTHDR) { + error = -EPROTONOSUPPORT; } + if (error != 0) + homa_abort_rpcs(homa, &iph->daddr, port, error); return 0; } diff --git a/test/mock.c b/test/mock.c index d2f4dfad..3e43beb4 100644 --- a/test/mock.c +++ b/test/mock.c @@ -1417,7 +1417,9 @@ void mock_set_ipv6(struct homa_sock *hsk) * @saddr: IPv6 address to use as the sender of the packet, in * network byte order. * @h: Header for the buffer; actual length and contents depend - * on the type. + * on the type. If NULL then no Homa header is added; + * extra_bytes of total space will be allocated for the + * skb, initialized to zero. * @extra_bytes: How much additional data to add to the buffer after * the header. * @first_value: Determines the data contents: the first __u32 will have @@ -1433,37 +1435,41 @@ struct sk_buff *mock_skb_new(struct in6_addr *saddr, struct common_header *h, struct sk_buff *skb; unsigned char *p; - switch (h->type) { - case DATA: - header_size = sizeof(struct data_header); - break; - case GRANT: - header_size = sizeof(struct grant_header); - break; - case RESEND: - header_size = sizeof(struct resend_header); - break; - case UNKNOWN: - header_size = sizeof(struct unknown_header); - break; - case BUSY: - header_size = sizeof(struct busy_header); - break; - case CUTOFFS: - header_size = sizeof(struct cutoffs_header); - break; - case FREEZE: - header_size = sizeof(struct freeze_header); - break; - case NEED_ACK: - header_size = sizeof(struct need_ack_header); - break; - case ACK: - header_size = sizeof(struct ack_header); - break; - default: - header_size = sizeof(struct common_header); - break; + if (h) { + switch (h->type) { + case DATA: + header_size = sizeof(struct data_header); + break; + case GRANT: + header_size = sizeof(struct grant_header); + break; + case RESEND: + header_size = sizeof(struct resend_header); + break; + case UNKNOWN: + header_size = sizeof(struct unknown_header); + break; + case BUSY: + header_size = sizeof(struct busy_header); + break; + case CUTOFFS: + header_size = sizeof(struct cutoffs_header); + break; + case FREEZE: + header_size = sizeof(struct freeze_header); + break; + case NEED_ACK: + header_size = sizeof(struct need_ack_header); + break; + case ACK: + header_size = sizeof(struct ack_header); + break; + default: + header_size = sizeof(struct common_header); + break; + } + } else { + header_size = 0; } skb = malloc(sizeof(struct sk_buff)); memset(skb, 0, sizeof(*skb)); @@ -1474,17 +1480,27 @@ struct sk_buff *mock_skb_new(struct in6_addr *saddr, struct common_header *h, ip_size = mock_ipv6 ? 
sizeof(struct ipv6hdr) : sizeof(struct iphdr); data_size = SKB_DATA_ALIGN(ip_size + header_size + extra_bytes); shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - skb->head = malloc(data_size + shinfo_size); - memset(skb->head, 0, data_size + shinfo_size); + if (h) { + skb->head = malloc(data_size + shinfo_size); + memset(skb->head, 0, data_size + shinfo_size); + } else { + skb->head = malloc(extra_bytes); + memset(skb->head, 0, extra_bytes); + + } skb->data = skb->head; skb_reset_tail_pointer(skb); skb->end = skb->tail + data_size; skb_reserve(skb, ip_size); skb_reset_transport_header(skb); - p = skb_put(skb, header_size); - memcpy(skb->data, h, header_size); - p = skb_put(skb, extra_bytes); - unit_fill_data(p, extra_bytes, first_value); + if (header_size != 0) { + p = skb_put(skb, header_size); + memcpy(skb->data, h, header_size); + } + if (h && extra_bytes != 0) { + p = skb_put(skb, extra_bytes); + unit_fill_data(p, extra_bytes, first_value); + } skb->users.refs.counter = 1; if (mock_ipv6) { ipv6_hdr(skb)->version = 6; diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index b372d252..9ff6e018 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -974,3 +974,107 @@ TEST_F(homa_plumbing, homa_softirq__per_rpc_batching) "sk->sk_data_ready invoked", unit_log_get()); } + +TEST_F(homa_plumbing, homa_err_handler_v4__port_unreachable) +{ + struct homa_rpc *crpc; + struct icmphdr *icmph; + struct sk_buff *icmp, *failed; + + mock_ipv6 = false; + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 100, 100); + ASSERT_NE(NULL, crpc); + + failed = mock_skb_new(self->server_ip, &self->data.common, 100, 0); + ip_hdr(failed)->daddr = ipv6_to_ipv4(self->server_ip[0]); + + icmp = mock_skb_new(self->server_ip, NULL, 1000, 0); + icmph = skb_put(icmp, sizeof *icmph); + icmph->type = ICMP_DEST_UNREACH; + icmph->code = ICMP_PORT_UNREACH; + icmp->data = skb_tail_pointer(icmp); + memcpy(skb_put(icmp, failed->len), failed->head, failed->len); + + EXPECT_EQ(0, homa_err_handler_v4(icmp, 111)); + EXPECT_EQ(ENOTCONN, -crpc->error); + + kfree_skb(icmp); + kfree_skb(failed); +} +TEST_F(homa_plumbing, homa_err_handler_v4__host_unreachable) +{ + struct homa_rpc *crpc; + struct icmphdr *icmph; + struct sk_buff *icmp, *failed; + + mock_ipv6 = false; + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 100, 100); + ASSERT_NE(NULL, crpc); + + failed = mock_skb_new(self->server_ip, &self->data.common, 100, 0); + ip_hdr(failed)->daddr = ipv6_to_ipv4(self->server_ip[0]); + + icmp = mock_skb_new(self->server_ip, NULL, 1000, 0); + icmph = skb_put(icmp, sizeof *icmph); + icmph->type = ICMP_DEST_UNREACH; + icmph->code = ICMP_HOST_UNKNOWN; + icmp->data = skb_tail_pointer(icmp); + memcpy(skb_put(icmp, failed->len), failed->head, failed->len); + + EXPECT_EQ(0, homa_err_handler_v4(icmp, 111)); + EXPECT_EQ(EHOSTUNREACH, -crpc->error); + + kfree_skb(icmp); + kfree_skb(failed); +} + +TEST_F(homa_plumbing, homa_err_handler_v6__port_unreachable) +{ + struct homa_rpc *crpc; + struct sk_buff *icmp, *failed; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 100, 100); + ASSERT_NE(NULL, crpc); + + failed = mock_skb_new(self->server_ip, &self->data.common, 100, 0); + ipv6_hdr(failed)->daddr = self->server_ip[0]; + + icmp = mock_skb_new(self->server_ip, NULL, 1000, 0); + 
memcpy(skb_put(icmp, failed->len), failed->head, failed->len); + + EXPECT_EQ(0, homa_err_handler_v6(icmp, NULL, ICMPV6_DEST_UNREACH, + ICMPV6_PORT_UNREACH, 0, 111)); + EXPECT_EQ(ENOTCONN, -crpc->error); + + kfree_skb(icmp); + kfree_skb(failed); +} +TEST_F(homa_plumbing, homa_err_handler_v6__protocol_not_supported) +{ + struct homa_rpc *crpc; + struct sk_buff *icmp, *failed; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 100, 100); + ASSERT_NE(NULL, crpc); + + failed = mock_skb_new(self->server_ip, &self->data.common, 100, 0); + ipv6_hdr(failed)->daddr = self->server_ip[0]; + + icmp = mock_skb_new(self->server_ip, NULL, 1000, 0); + memcpy(skb_put(icmp, failed->len), failed->head, failed->len); + + EXPECT_EQ(0, homa_err_handler_v6(icmp, NULL, ICMPV6_PARAMPROB, + ICMPV6_UNK_NEXTHDR, 0, 111)); + EXPECT_EQ(EPROTONOSUPPORT, -crpc->error); + + kfree_skb(icmp); + kfree_skb(failed); +} From 579fcbbef5761b3e31ab0a48e175495c75be3aa2 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 10 Dec 2024 17:00:41 -0800 Subject: [PATCH 110/625] Remove unneeded comment --- homa_plumbing.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index cf22f370..74387d43 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1486,11 +1486,6 @@ __poll_t homa_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk; __u32 mask; - /* It seems to be standard practice for poll functions *not* to - * acquire the socket lock, so we don't do it here; not sure - * why... - */ - sock_poll_wait(file, sock, wait); mask = POLLOUT | POLLWRNORM; From b58bd8540d3dccc68264ead4eea8276a5452eeb3 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 10 Dec 2024 17:57:49 -0800 Subject: [PATCH 111/625] Minor comment improvement --- homa_plumbing.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index 74387d43..29490d37 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1330,7 +1330,8 @@ int homa_softirq(struct sk_buff *skb) /* Now process the longer packets. Each iteration of this loop * collects all of the packets for a particular RPC and dispatches - * them. + * them (batching the packets for an RPC allows more efficient + * generation of grants). 
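+ * A batch then needs only a single pass over the RPC's grant state, instead of one pass per packet.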
*/ while (packets) { struct in6_addr saddr, saddr2; From a3526b5d1033721162b9a96f3c8d965135b58f05 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 11 Dec 2024 09:30:36 -0800 Subject: [PATCH 112/625] Fix dead-code issues found by the test robot --- homa_sock.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/homa_sock.c b/homa_sock.c index 3cde648c..02c440a9 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -223,7 +223,9 @@ void homa_sock_shutdown(struct homa_sock *hsk) { struct homa_interest *interest; struct homa_rpc *rpc; - int i; +#ifndef __STRIP__ /* See strip.py */ + int i = 0; +#endif /* See strip.py */ homa_sock_lock(hsk, "homa_socket_shutdown"); if (hsk->shutdown) { @@ -262,11 +264,10 @@ void homa_sock_shutdown(struct homa_sock *hsk) wake_up_process(interest->thread); homa_sock_unlock(hsk); - i = 0; while (!list_empty(&hsk->dead_rpcs)) { homa_rpc_reap(hsk, 1000); - i++; #ifndef __STRIP__ /* See strip.py */ + i++; if (i == 5) { tt_record("Freezing because reap seems hung"); tt_freeze(); From 3588057af4f87db54f845d4f71b37052dfe6c870 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 11 Dec 2024 13:32:32 -0800 Subject: [PATCH 113/625] Check for socket shutdown in homa_poll --- homa_plumbing.c | 3 +++ test/unit_homa_plumbing.c | 24 ++++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/homa_plumbing.c b/homa_plumbing.c index 29490d37..7f48ff63 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1490,6 +1490,9 @@ __poll_t homa_poll(struct file *file, struct socket *sock, sock_poll_wait(file, sock, wait); mask = POLLOUT | POLLWRNORM; + if (homa_sk(sk)->shutdown) + mask |= POLLIN; + if (!list_empty(&homa_sk(sk)->ready_requests) || !list_empty(&homa_sk(sk)->ready_responses)) mask |= POLLIN | POLLRDNORM; diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 9ff6e018..e57500dc 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -1078,3 +1078,27 @@ TEST_F(homa_plumbing, homa_err_handler_v6__protocol_not_supported) kfree_skb(icmp); kfree_skb(failed); } + +TEST_F(homa_plumbing, homa_poll__not_readable) +{ + struct socket sock = {.sk = &self->hsk.sock}; + + EXPECT_EQ(POLLOUT | POLLWRNORM, homa_poll(NULL, &sock, NULL)); +} +TEST_F(homa_plumbing, homa_poll__socket_shutdown) +{ + struct socket sock = {.sk = &self->hsk.sock}; + + homa_sock_shutdown(&self->hsk); + EXPECT_EQ(POLLIN | POLLOUT | POLLWRNORM, homa_poll(NULL, &sock, NULL)); +} +TEST_F(homa_plumbing, homa_poll__socket_readable) +{ + struct socket sock = {.sk = &self->hsk.sock}; + + unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 100, 100); + EXPECT_EQ(POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM, + homa_poll(NULL, &sock, NULL)); +} From fe53afc1f5368d952ca3d2338015f63fe7ea41a3 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 13 Dec 2024 09:45:50 -0800 Subject: [PATCH 114/625] Remove unused ETHERNET_MAX_PAYLOAD declaration --- homa_wire.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/homa_wire.h b/homa_wire.h index c2031942..7a122ef2 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -63,13 +63,6 @@ enum homa_packet_type { */ #define HOMA_MAX_HEADER 90 -/** - * define ETHERNET_MAX_PAYLOAD - Maximum length of an Ethernet packet, - * excluding preamble, frame delimeter, VLAN header, CRC, and interpacket gap; - * i.e. all of this space is available for Homa. 
- */ -#define ETHERNET_MAX_PAYLOAD 1500 - /** * define HOMA_MAX_PRIORITIES - The maximum number of priority levels that * Homa can use (the actual number can be restricted to less than this at From 931015e336b407c157fc663daa0750fdf980bd04 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 13 Dec 2024 10:20:18 -0800 Subject: [PATCH 115/625] Rename structs in homa_wire.h to have homa_ prefix --- homa_grant.c | 2 +- homa_impl.h | 2 +- homa_incoming.c | 28 ++++----- homa_offload.c | 21 ++++--- homa_outgoing.c | 58 +++++++++--------- homa_peer.c | 2 +- homa_plumbing.c | 48 +++++++-------- homa_rpc.c | 4 +- homa_rpc.h | 2 +- homa_timer.c | 4 +- homa_utils.c | 32 +++++----- homa_wire.h | 122 +++++++++++++++++++------------------- test/mock.c | 28 ++++----- test/mock.h | 59 +++++++++--------- test/unit_homa_grant.c | 4 +- test/unit_homa_incoming.c | 62 +++++++++---------- test/unit_homa_offload.c | 28 ++++----- test/unit_homa_outgoing.c | 26 ++++---- test/unit_homa_plumbing.c | 8 +-- test/unit_homa_rpc.c | 4 +- test/utils.c | 4 +- 21 files changed, 276 insertions(+), 272 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 71f40a01..5015fb23 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -222,7 +222,7 @@ void homa_grant_remove_rpc(struct homa_rpc *rpc) int homa_grant_send(struct homa_rpc *rpc, struct homa *homa) { int incoming, increment, available; - struct grant_header grant; + struct homa_grant_hdr grant; /* Compute how many additional bytes to grant. */ incoming = rpc->msgin.granted - (rpc->msgin.length diff --git a/homa_impl.h b/homa_impl.h index 956feb15..a57b5086 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -943,7 +943,7 @@ static inline struct sk_buff **homa_next_skb(struct sk_buff *skb) * that should be replicated in each segment. The bytes after * this will be distributed among segments. */ -static inline void homa_set_doff(struct data_header *h, int size) +static inline void homa_set_doff(struct homa_data_hdr *h, int size) { h->common.doff = size << 2; } diff --git a/homa_incoming.c b/homa_incoming.c index 80b3bd62..53ce9526 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -83,7 +83,7 @@ struct homa_gap *homa_gap_new(struct list_head *next, int start, int end) */ void homa_gap_retry(struct homa_rpc *rpc) { - struct resend_header resend; + struct homa_resend_hdr resend; struct homa_gap *gap; list_for_each_entry(gap, &rpc->msgin.gaps, links) { @@ -105,7 +105,7 @@ */ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) { - struct data_header *h = (struct data_header *)skb->data; + struct homa_data_hdr *h = (struct homa_data_hdr *)skb->data; struct homa_gap *gap, *dummy, *gap2; int start = ntohl(h->seg.offset); int length = homa_data_len(skb); @@ -273,7 +273,7 @@ int homa_copy_to_user(struct homa_rpc *rpc) /* Each iteration of this loop copies out one skb.
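* Each packet carries its own message offset, which determines where in the user buffer its data is copied.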
*/ for (i = 0; i < n; i++) { - struct data_header *h = (struct data_header *) + struct homa_data_hdr *h = (struct homa_data_hdr *) skbs[i]->data; int pkt_length = homa_data_len(skbs[i]); int offset = ntohl(h->seg.offset); @@ -365,7 +365,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) #define MAX_ACKS 10 #endif /* __UNIT_TEST__ */ const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); - struct data_header *h = (struct data_header *)skb->data; + struct homa_data_hdr *h = (struct homa_data_hdr *)skb->data; __u64 id = homa_local_id(h->common.sender_id); int dport = ntohs(h->common.dport); @@ -403,7 +403,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) /* Each iteration through the following loop processes one packet. */ for (; skb; skb = next) { - h = (struct data_header *)skb->data; + h = (struct homa_data_hdr *)skb->data; next = skb->next; /* Relinquish the RPC lock temporarily if it's needed @@ -560,7 +560,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) */ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) { - struct data_header *h = (struct data_header *)skb->data; + struct homa_data_hdr *h = (struct homa_data_hdr *)skb->data; struct homa *homa = rpc->hsk->homa; tt_record4("incoming data packet, id %d, peer 0x%x, offset %d/%d", @@ -619,7 +619,7 @@ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) * packet. */ if (jiffies != rpc->peer->last_update_jiffies) { - struct cutoffs_header h2; + struct homa_cutoffs_hdr h2; int i; for (i = 0; i < HOMA_MAX_PRIORITIES; i++) { @@ -646,7 +646,7 @@ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) */ void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc) { - struct grant_header *h = (struct grant_header *)skb->data; + struct homa_grant_hdr *h = (struct homa_grant_hdr *)skb->data; int new_offset = ntohl(h->offset); tt_record4("processing grant for id %llu, offset %d, priority %d, increment %d", @@ -680,11 +680,11 @@ void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc) void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, struct homa_sock *hsk) { - struct resend_header *h = (struct resend_header *)skb->data; + struct homa_resend_hdr *h = (struct homa_resend_hdr *)skb->data; #ifndef __STRIP__ /* See strip.py */ const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); #endif /* See strip.py */ - struct busy_header busy; + struct homa_busy_hdr busy; if (!rpc) { tt_record4("resend request for unknown id %d, peer 0x%x:%d, offset %d; responding with UNKNOWN", @@ -783,7 +783,7 @@ void homa_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc) */ void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk) { - struct cutoffs_header *h = (struct cutoffs_header *)skb->data; + struct homa_cutoffs_hdr *h = (struct homa_cutoffs_hdr *)skb->data; const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); struct homa_peer *peer; int i; @@ -809,11 +809,11 @@ void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk) void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, struct homa_rpc *rpc) { - struct common_header *h = (struct common_header *)skb->data; + struct homa_common_hdr *h = (struct homa_common_hdr *)skb->data; const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); __u64 id = homa_local_id(h->sender_id); struct homa_peer *peer; - struct ack_header ack; + struct homa_ack_hdr ack; tt_record1("Received NEED_ACK for id %d", id); @@ -871,7 +871,7 @@ void homa_ack_pkt(struct sk_buff *skb, struct 
homa_sock *hsk, __releases(rpc->bucket_lock) { const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); - struct ack_header *h = (struct ack_header *)skb->data; + struct homa_ack_hdr *h = (struct homa_ack_hdr *)skb->data; int i, count; if (rpc) { diff --git a/homa_offload.c b/homa_offload.c index c8248319..d80fabdf 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -131,7 +131,7 @@ void homa_gro_unhook_tcp(void) struct sk_buff *homa_tcp_gro_receive(struct list_head *held_list, struct sk_buff *skb) { - struct common_header *h = (struct common_header *) + struct homa_common_hdr *h = (struct homa_common_hdr *) skb_transport_header(skb); // tt_record4("homa_tcp_gro_receive got type 0x%x, flags 0x%x, " @@ -231,8 +231,8 @@ struct sk_buff *homa_gso_segment(struct sk_buff *skb, /* This is needed to separate header info (which is replicated * in each segment) from data, which is divided among the segments. */ - __skb_pull(skb, sizeof(struct data_header) - - sizeof(struct seg_header)); + __skb_pull(skb, sizeof(struct homa_data_hdr) + - sizeof(struct homa_seg_hdr)); segs = skb_segment(skb, features); /* Set incrementing ids in each of the segments (mimics behavior @@ -284,8 +284,8 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, struct homa_offload_core *offload_core; struct homa *homa = global_homa; struct sk_buff *result = NULL; + struct homa_data_hdr *h_new; __u64 *softirq_ns_metric; - struct data_header *h_new; struct sk_buff *held_skb; __u64 now = sched_clock(); int priority; @@ -293,7 +293,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, __u32 hash; int busy; - h_new = (struct data_header *)skb_transport_header(skb); + h_new = (struct homa_data_hdr *)skb_transport_header(skb); offload_core = &per_cpu(homa_offload_core, raw_smp_processor_id()); busy = (now - offload_core->last_gro) < homa->gro_busy_ns; offload_core->last_active = now; @@ -328,7 +328,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, } else if (h_new->common.type == GRANT) { tt_record4("homa_gro_receive got grant from 0x%x id %llu, offset %d, priority %d", saddr, homa_local_id(h_new->common.sender_id), - ntohl(((struct grant_header *)h_new)->offset), + ntohl(((struct homa_grant_hdr *)h_new)->offset), priority); /* The following optimization handles grants here at NAPI * level, bypassing the SoftIRQ mechanism (and avoiding the @@ -476,7 +476,8 @@ void homa_gro_gen2(struct homa *homa, struct sk_buff *skb) * cores. See balance.txt for overall design information on load * balancing. */ - struct data_header *h = (struct data_header *)skb_transport_header(skb); + struct homa_data_hdr *h = + (struct homa_data_hdr *)skb_transport_header(skb); int this_core = raw_smp_processor_id(); struct homa_offload_core *offload_core; int candidate = this_core; @@ -532,7 +533,8 @@ void homa_gro_gen3(struct homa *homa, struct sk_buff *skb) /* See balance.txt for overall design information on the Gen3 * load balancer. 
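* (In brief: the packet is steered to one of a small set of candidate SoftIRQ cores associated with this NAPI core, skipping candidates that appear busy.)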
*/ - struct data_header *h = (struct data_header *)skb_transport_header(skb); + struct homa_data_hdr *h = + (struct homa_data_hdr *)skb_transport_header(skb); __u64 now, busy_time; int *candidates; int i, core; @@ -577,7 +579,8 @@ void homa_gro_gen3(struct homa *homa, struct sk_buff *skb) */ int homa_gro_complete(struct sk_buff *skb, int hoffset) { - struct data_header *h = (struct data_header *)skb_transport_header(skb); + struct homa_data_hdr *h = + (struct homa_data_hdr *)skb_transport_header(skb); struct homa *homa = global_homa; // tt_record4("homa_gro_complete type %d, id %d, offset %d, count %d", diff --git a/homa_outgoing.c b/homa_outgoing.c index 4d56bdd0..f1174e27 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -34,10 +34,10 @@ void homa_message_out_init(struct homa_rpc *rpc, int length) /** * homa_fill_data_interleaved() - This function is invoked to fill in the * part of a data packet after the initial header, when GSO is being used - * but TCP hijacking is not. As result, seg_headers must be interleaved + * but TCP hijacking is not. As result, homa_seg_hdrs must be interleaved * with the data to provide the correct offset for each segment. * @rpc: RPC whose output message is being created. - * @skb: The packet being filled. The initial data_header was + * @skb: The packet being filled. The initial homa_data_hdr was * created and initialized by the caller and the * homa_skb_info has been filled in with the packet geometry. * @iter: Describes location(s) of (remaining) message data in user @@ -54,11 +54,11 @@ int homa_fill_data_interleaved(struct homa_rpc *rpc, struct sk_buff *skb, int err; /* Each iteration of the following loop adds info for one packet, - * which includes a seg_header followed by the data for that - * segment. The first seg_header was already added by the caller. + * which includes a homa_seg_hdr followed by the data for that + * segment. The first homa_seg_hdr was already added by the caller. */ while (1) { - struct seg_header seg; + struct homa_seg_hdr seg; if (bytes_left < seg_length) seg_length = bytes_left; @@ -104,7 +104,7 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, int length, int max_seg_data) { struct homa_skb_info *homa_info; - struct data_header *h; + struct homa_data_hdr *h; struct sk_buff *skb; int err, gso_size; __u64 segs; @@ -113,19 +113,19 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, do_div(segs, max_seg_data); /* Initialize the overall skb. */ - skb = homa_skb_new_tx(sizeof32(struct data_header)); + skb = homa_skb_new_tx(sizeof32(struct homa_data_hdr)); if (!skb) return ERR_PTR(-ENOMEM); /* Fill in the Homa header (which will be replicated in every * network packet by GSO). 
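* Per-segment offsets are filled in below, either by interleaving homa_seg_hdrs (homa_fill_data_interleaved) or, with TCP hijacking, implicitly via the sequence number.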
*/ - h = (struct data_header *)skb_put(skb, sizeof(struct data_header)); + h = (struct homa_data_hdr *)skb_put(skb, sizeof(struct homa_data_hdr)); h->common.sport = htons(rpc->hsk->port); h->common.dport = htons(rpc->dport); h->common.sequence = htonl(offset); h->common.type = DATA; - homa_set_doff(h, sizeof(struct data_header)); + homa_set_doff(h, sizeof(struct homa_data_hdr)); h->common.flags = HOMA_TCP_FLAGS; h->common.checksum = 0; h->common.urgent = htons(HOMA_TCP_URGENT); @@ -140,17 +140,17 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, homa_info = homa_get_skb_info(skb); homa_info->next_skb = NULL; - homa_info->wire_bytes = length + segs * (sizeof(struct data_header) + homa_info->wire_bytes = length + segs * (sizeof(struct homa_data_hdr) + rpc->hsk->ip_header_length + HOMA_ETH_OVERHEAD); homa_info->data_bytes = length; homa_info->seg_length = max_seg_data; homa_info->offset = offset; if (segs > 1 && rpc->hsk->sock.sk_protocol != IPPROTO_TCP) { - homa_set_doff(h, sizeof(struct data_header) - - sizeof32(struct seg_header)); + homa_set_doff(h, sizeof(struct homa_data_hdr) - + sizeof32(struct homa_seg_hdr)); h->seg.offset = htonl(offset); - gso_size = max_seg_data + sizeof(struct seg_header); + gso_size = max_seg_data + sizeof(struct homa_seg_hdr); err = homa_fill_data_interleaved(rpc, skb, iter); } else { gso_size = max_seg_data; @@ -235,14 +235,14 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) dst = homa_get_dst(rpc->peer, rpc->hsk); mtu = dst_mtu(dst); max_seg_data = mtu - rpc->hsk->ip_header_length - - sizeof(struct data_header); + - sizeof(struct homa_data_hdr); gso_size = dst->dev->gso_max_size; if (gso_size > rpc->hsk->homa->max_gso_size) gso_size = rpc->hsk->homa->max_gso_size; /* Round gso_size down to an even # of mtus. */ segs_per_gso = gso_size - rpc->hsk->ip_header_length - - sizeof(struct data_header); + - sizeof(struct homa_data_hdr); do_div(segs_per_gso, max_seg_data); if (segs_per_gso == 0) segs_per_gso = 1; @@ -324,7 +324,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) * @length: Length of @contents (including the common header). * @rpc: The packet will go to the socket that handles the other end * of this RPC. Addressing info for the packet, including all of - * the fields of common_header except type, will be set from this. + * the fields of homa_common_hdr except type, will be set from this. * * Return: Either zero (for success), or a negative errno value if there * was a problem. 
@@ -332,7 +332,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) int homa_xmit_control(enum homa_packet_type type, void *contents, size_t length, struct homa_rpc *rpc) { - struct common_header *h = contents; + struct homa_common_hdr *h = contents; h->type = type; h->sport = htons(rpc->hsk->port); @@ -362,7 +362,7 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, #ifndef __STRIP__ /* See strip.py */ struct netdev_queue *txq; #endif /* See strip.py */ - struct common_header *h; + struct homa_common_hdr *h; struct dst_entry *dst; int result, priority; struct sk_buff *skb; @@ -449,9 +449,9 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, */ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk) { - struct common_header *h = (struct common_header *)skb->data; + struct homa_common_hdr *h = (struct homa_common_hdr *)skb->data; struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); - struct unknown_header unknown; + struct homa_unknown_hdr unknown; struct homa_peer *peer; if (hsk->homa->verbose) @@ -565,7 +565,7 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority) /* Update info that may have changed since the message was initially * created. */ - ((struct data_header *)skb_transport_header(skb))->cutoff_version = + ((struct homa_data_hdr *)skb_transport_header(skb))->cutoff_version = rpc->peer->cutoff_version; dst = homa_get_dst(rpc->peer, rpc->hsk); @@ -575,7 +575,7 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority) skb->ooo_okay = 1; skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct common_header, checksum); + skb->csum_offset = offsetof(struct homa_common_hdr, checksum); if (rpc->hsk->inet.sk.sk_family == AF_INET6) { tt_record4("calling ip6_xmit: wire_bytes %d, peer 0x%x, id %d, offset %d", homa_get_skb_info(skb)->wire_bytes, @@ -628,7 +628,7 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end, */ for (skb = rpc->msgout.packets; skb; skb = homa_info->next_skb) { int seg_offset, offset, seg_length, data_left; - struct data_header *h; + struct homa_data_hdr *h; homa_info = homa_get_skb_info(skb); offset = homa_info->offset; @@ -638,13 +638,13 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end, continue; offset = homa_info->offset; - seg_offset = sizeof32(struct data_header); + seg_offset = sizeof32(struct homa_data_hdr); data_left = homa_info->data_bytes; if (skb_shinfo(skb)->gso_segs <= 1) { seg_length = data_left; } else { seg_length = homa_info->seg_length; - h = (struct data_header *)skb_transport_header(skb); + h = (struct homa_data_hdr *)skb_transport_header(skb); } for ( ; data_left > 0; data_left -= seg_length, offset += seg_length, @@ -662,8 +662,8 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end, continue; /* This segment must be retransmitted. 
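* A new single-segment skb is built, reusing the original packet's header with offsets updated and the retransmit flag set.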
*/ - new_skb = homa_skb_new_tx(sizeof(struct data_header) - - sizeof(struct seg_header)); + new_skb = homa_skb_new_tx(sizeof(struct homa_data_hdr) + - sizeof(struct homa_seg_hdr)); if (unlikely(!new_skb)) { if (rpc->hsk->homa->verbose) pr_notice("%s couldn't allocate skb\n", @@ -672,7 +672,7 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end, goto resend_done; } h = __skb_put_data(new_skb, skb_transport_header(skb), - sizeof32(struct data_header)); + sizeof32(struct homa_data_hdr)); h->common.sequence = htonl(offset); h->seg.offset = htonl(offset); h->retransmit = 1; @@ -696,7 +696,7 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end, new_homa_info = homa_get_skb_info(new_skb); new_homa_info->wire_bytes = rpc->hsk->ip_header_length - + sizeof(struct data_header) + + sizeof(struct homa_data_hdr) + seg_length + HOMA_ETH_OVERHEAD; new_homa_info->data_bytes = seg_length; new_homa_info->seg_length = seg_length; diff --git a/homa_peer.c b/homa_peer.c index e1e7f167..b9c433da 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -383,7 +383,7 @@ void homa_peer_lock_slow(struct homa_peer *peer) void homa_peer_add_ack(struct homa_rpc *rpc) { struct homa_peer *peer = rpc->peer; - struct ack_header ack; + struct homa_ack_hdr ack; homa_peer_lock(peer); if (peer->num_acks < HOMA_MAX_ACKS_PER_PKT) { diff --git a/homa_plumbing.c b/homa_plumbing.c index 7f48ff63..93f1b458 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -496,15 +496,15 @@ static struct ctl_table homa_ctl_table[] = { /* Sizes of the headers for each Homa packet type, in bytes. */ static __u16 header_lengths[] = { - sizeof32(struct data_header), - sizeof32(struct grant_header), - sizeof32(struct resend_header), - sizeof32(struct unknown_header), - sizeof32(struct busy_header), - sizeof32(struct cutoffs_header), - sizeof32(struct freeze_header), - sizeof32(struct need_ack_header), - sizeof32(struct ack_header) + sizeof32(struct homa_data_hdr), + sizeof32(struct homa_grant_hdr), + sizeof32(struct homa_resend_hdr), + sizeof32(struct homa_unknown_hdr), + sizeof32(struct homa_busy_hdr), + sizeof32(struct homa_cutoffs_hdr), + sizeof32(struct homa_freeze_hdr), + sizeof32(struct homa_need_ack_hdr), + sizeof32(struct homa_ack_hdr) }; /* Used to remove sysctl values when the module is unloaded. 
*/ @@ -523,11 +523,11 @@ int __init homa_load(void) pr_notice("Homa module loading\n"); #ifndef __STRIP__ /* See strip.py */ - pr_notice("Homa structure sizes: data_header %u, seg_header %u, ack %u, grant_header %u, peer %u, ip_hdr %u flowi %u ipv6_hdr %u, flowi6 %u tcp_sock %u homa_rpc %u sk_buff %u rcvmsg_control %u union sockaddr_in_union %u HOMA_MAX_BPAGES %u NR_CPUS %u nr_cpu_ids %u, MAX_NUMNODES %d\n", - sizeof32(struct data_header), - sizeof32(struct seg_header), + pr_notice("Homa structure sizes: homa_data_hdr %u, homa_seg_hdr %u, ack %u, homa_grant_hdr %u, peer %u, ip_hdr %u flowi %u ipv6_hdr %u, flowi6 %u tcp_sock %u homa_rpc %u sk_buff %u rcvmsg_control %u union sockaddr_in_union %u HOMA_MAX_BPAGES %u NR_CPUS %u nr_cpu_ids %u, MAX_NUMNODES %d\n", + sizeof32(struct homa_data_hdr), + sizeof32(struct homa_seg_hdr), sizeof32(struct homa_ack), - sizeof32(struct grant_header), + sizeof32(struct homa_grant_hdr), sizeof32(struct homa_peer), sizeof32(struct iphdr), sizeof32(struct flowi), @@ -1235,7 +1235,7 @@ int homa_softirq(struct sk_buff *skb) struct sk_buff *packets, *other_pkts, *next; struct sk_buff **prev_link, **other_link; struct homa *homa = global_homa; - struct common_header *h; + struct homa_common_hdr *h; int header_offset; int pull_length; __u64 start; @@ -1277,8 +1277,8 @@ int homa_softirq(struct sk_buff *skb) __skb_pull(skb, header_offset); /* Reject packets that are too short or have bogus types. */ - h = (struct common_header *)skb->data; - if (unlikely(skb->len < sizeof(struct common_header) || + h = (struct homa_common_hdr *)skb->data; + if (unlikely(skb->len < sizeof(struct homa_common_hdr) || h->type < DATA || h->type >= BOGUS || skb->len < header_lengths[h->type - DATA])) { const struct in6_addr saddr = @@ -1311,7 +1311,7 @@ int homa_softirq(struct sk_buff *skb) /* Process the packet now if it is a control packet or * if it contains an entire short message. 
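* (Neither kind benefits from the per-RPC batching pass below.)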
*/ - if (h->type != DATA || ntohl(((struct data_header *)h) + if (h->type != DATA || ntohl(((struct homa_data_hdr *)h) ->message_length) < 1400) { UNIT_LOG("; ", "homa_softirq shortcut type 0x%x", h->type); @@ -1335,7 +1335,7 @@ int homa_softirq(struct sk_buff *skb) */ while (packets) { struct in6_addr saddr, saddr2; - struct common_header *h2; + struct homa_common_hdr *h2; struct sk_buff *skb2; skb = packets; @@ -1343,10 +1343,10 @@ int homa_softirq(struct sk_buff *skb) saddr = skb_canonical_ipv6_saddr(skb); other_pkts = NULL; other_link = &other_pkts; - h = (struct common_header *)skb->data; + h = (struct homa_common_hdr *)skb->data; for (skb2 = skb->next; skb2; skb2 = next) { next = skb2->next; - h2 = (struct common_header *)skb2->data; + h2 = (struct homa_common_hdr *)skb2->data; if (h2->sender_id == h->sender_id) { saddr2 = skb_canonical_ipv6_saddr(skb2); if (ipv6_addr_equal(&saddr, &saddr2)) { @@ -1363,7 +1363,7 @@ int homa_softirq(struct sk_buff *skb) #ifdef __UNIT_TEST__ UNIT_LOG("; ", "id %lld, offsets", homa_local_id(h->sender_id)); for (skb2 = packets; skb2; skb2 = skb2->next) { - struct data_header *h3 = (struct data_header *) + struct homa_data_hdr *h3 = (struct homa_data_hdr *) skb2->data; UNIT_LOG("", " %d", ntohl(h3->seg.offset)); } @@ -1415,7 +1415,7 @@ int homa_err_handler_v4(struct sk_buff *skb, u32 info) iph = (struct iphdr *)(skb->data); daddr = ipv4_to_ipv6(iph->daddr); if (type == ICMP_DEST_UNREACH && code == ICMP_PORT_UNREACH) { - struct common_header *h = (struct common_header *)(skb->data + + struct homa_common_hdr *h = (struct homa_common_hdr *)(skb->data + iph->ihl * 4); port = ntohs(h->dport); @@ -1455,9 +1455,9 @@ int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, int port = 0; if (type == ICMPV6_DEST_UNREACH && code == ICMPV6_PORT_UNREACH) { - const struct common_header *h; + const struct homa_common_hdr *h; - h = (struct common_header *)(skb->data + sizeof(*iph)); + h = (struct homa_common_hdr *)(skb->data + sizeof(*iph)); port = ntohs(h->dport); error = -ENOTCONN; } else if (type == ICMPV6_DEST_UNREACH && code == ICMPV6_ADDR_UNREACH) { diff --git a/homa_rpc.c b/homa_rpc.c index 04aa76b6..0e1941fd 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -108,7 +108,7 @@ struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk, */ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, const struct in6_addr *source, - struct data_header *h, int *created) + struct homa_data_hdr *h, int *created) __acquires(&srpc->bucket->lock) { __u64 id = homa_local_id(h->common.sender_id); @@ -660,7 +660,7 @@ void homa_rpc_log_active_tt(struct homa *homa, int freeze_count) if (!homa_protect_rpcs(hsk)) continue; list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { - struct freeze_header freeze; + struct homa_freeze_hdr freeze; count++; homa_rpc_log_tt(rpc); diff --git a/homa_rpc.h b/homa_rpc.h index c6e5043c..91aca525 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -432,7 +432,7 @@ struct homa_rpc struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, const struct in6_addr *source, - struct data_header *h, int *created); + struct homa_data_hdr *h, int *created); int homa_rpc_reap(struct homa_sock *hsk, int count); char *homa_symbol_for_state(struct homa_rpc *rpc); int homa_validate_incoming(struct homa *homa, int verbose, diff --git a/homa_timer.c b/homa_timer.c index 41966f42..586c0817 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -20,7 +20,7 @@ void homa_check_rpc(struct homa_rpc *rpc) { struct homa *homa = rpc->hsk->homa; - struct resend_header 
resend; + struct homa_resend_hdr resend; const char *us, *them; /* See if we need to request an ack for this RPC. */ @@ -32,7 +32,7 @@ void homa_check_rpc(struct homa_rpc *rpc) /* >= comparison that handles tick wrap-around. */ if ((rpc->done_timer_ticks + homa->request_ack_ticks - 1 - homa->timer_ticks) & 1 << 31) { - struct need_ack_header h; + struct homa_need_ack_hdr h; homa_xmit_control(NEED_ACK, &h, sizeof(h), rpc); tt_record4("Sent NEED_ACK for RPC id %d to peer 0x%x, port %d, ticks %d", diff --git a/homa_utils.c b/homa_utils.c index c51112fe..7c6cc20f 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -270,7 +270,7 @@ char *homa_print_ipv6_addr(const struct in6_addr *addr) */ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) { - struct common_header *common; + struct homa_common_hdr *common; char header[HOMA_MAX_HEADER]; struct in6_addr saddr; int used = 0; @@ -282,7 +282,7 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) } homa_skb_get(skb, &header, 0, sizeof(header)); - common = (struct common_header *)header; + common = (struct homa_common_hdr *)header; saddr = skb_canonical_ipv6_saddr(skb); used = homa_snprintf(buffer, buf_len, used, "%s from %s:%u, dport %d, id %llu", @@ -293,7 +293,7 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) switch (common->type) { case DATA: { struct homa_skb_info *homa_info = homa_get_skb_info(skb); - struct data_header *h = (struct data_header *)header; + struct homa_data_hdr *h = (struct homa_data_hdr *)header; int data_left, i, seg_length, pos, offset; if (skb_shinfo(skb)->gso_segs == 0) { @@ -328,7 +328,7 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) used = homa_snprintf(buffer, buf_len, used, ", extra segs"); for (i = skb_shinfo(skb)->gso_segs - 1; i > 0; i--) { if (homa_info->seg_length < skb_shinfo(skb)->gso_size) { - struct seg_header seg; + struct homa_seg_hdr seg; homa_skb_get(skb, &seg, pos, sizeof(seg)); offset = ntohl(seg.offset); @@ -345,7 +345,7 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) break; } case GRANT: { - struct grant_header *h = (struct grant_header *)header; + struct homa_grant_hdr *h = (struct homa_grant_hdr *)header; char *resend = (h->resend_all) ? ", resend_all" : ""; used = homa_snprintf(buffer, buf_len, used, @@ -354,7 +354,7 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) break; } case RESEND: { - struct resend_header *h = (struct resend_header *)header; + struct homa_resend_hdr *h = (struct homa_resend_hdr *)header; used = homa_snprintf(buffer, buf_len, used, ", offset %d, length %d, resend_prio %u", @@ -369,7 +369,7 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) /* Nothing to add here. */ break; case CUTOFFS: { - struct cutoffs_header *h = (struct cutoffs_header *)header; + struct homa_cutoffs_hdr *h = (struct homa_cutoffs_hdr *)header; used = homa_snprintf(buffer, buf_len, used, ", cutoffs %d %d %d %d %d %d %d %d, version %u", @@ -391,7 +391,7 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) /* Nothing to add here. 
*/ break; case ACK: { - struct ack_header *h = (struct ack_header *)header; + struct homa_ack_hdr *h = (struct homa_ack_hdr *)header; int i, count; count = ntohs(h->num_acks); @@ -422,14 +422,14 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) */ char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len) { + struct homa_common_hdr *common; char header[HOMA_MAX_HEADER]; - struct common_header *common; - common = (struct common_header *)header; + common = (struct homa_common_hdr *)header; homa_skb_get(skb, header, 0, HOMA_MAX_HEADER); switch (common->type) { case DATA: { - struct data_header *h = (struct data_header *)header; + struct homa_data_hdr *h = (struct homa_data_hdr *)header; struct homa_skb_info *homa_info = homa_get_skb_info(skb); int data_left, used, i, seg_length, pos, offset; @@ -450,7 +450,7 @@ char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len) seg_length, offset); for (i = skb_shinfo(skb)->gso_segs - 1; i > 0; i--) { if (homa_info->seg_length < skb_shinfo(skb)->gso_size) { - struct seg_header seg; + struct homa_seg_hdr seg; homa_skb_get(skb, &seg, pos, sizeof(seg)); offset = ntohl(seg.offset); @@ -467,7 +467,7 @@ char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len) break; } case GRANT: { - struct grant_header *h = (struct grant_header *)header; + struct homa_grant_hdr *h = (struct homa_grant_hdr *)header; char *resend = h->resend_all ? " resend_all" : ""; snprintf(buffer, buf_len, "GRANT %d@%d%s", ntohl(h->offset), @@ -475,7 +475,7 @@ char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len) break; } case RESEND: { - struct resend_header *h = (struct resend_header *)header; + struct homa_resend_hdr *h = (struct homa_resend_hdr *)header; snprintf(buffer, buf_len, "RESEND %d-%d@%d", ntohl(h->offset), ntohl(h->offset) + ntohl(h->length) - 1, @@ -515,7 +515,7 @@ char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len) void homa_freeze_peers(struct homa *homa) { struct homa_socktab_scan scan; - struct freeze_header freeze; + struct homa_freeze_hdr freeze; struct homa_peer **peers; int num_peers, i, err; struct homa_sock *hsk; @@ -703,7 +703,7 @@ void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, char *format) return; rpc->hsk->homa->freeze_type = 0; if (!tt_frozen) { -// struct freeze_header freeze; +// struct homa_freeze_hdr freeze; int dummy; pr_notice("freezing in %s with freeze_type %d\n", __func__, diff --git a/homa_wire.h b/homa_wire.h index 7a122ef2..f0d94c48 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -71,12 +71,12 @@ enum homa_packet_type { #define HOMA_MAX_PRIORITIES 8 /** - * struct common_header - Wire format for the first bytes in every Homa + * struct homa_common_hdr - Wire format for the first bytes in every Homa * packet. This must (mostly) match the format of a TCP header to enable * Homa packets to actually be transmitted as TCP packets (and thereby * take advantage of TSO and other features). */ -struct common_header { +struct homa_common_hdr { /** * @sport: Port on source machine from which packet was sent. * Must be in the same position as in a TCP header. @@ -108,7 +108,7 @@ struct common_header { /** * @doff: High order 4 bits holds the number of 4-byte chunks in a - * data_header (low-order bits unused). Used only for DATA packets; + * homa_data_hdr (low-order bits unused). Used only for DATA packets; * must be in the same position as the data offset in a TCP header. 
* Used by TSO to determine where the replicated header portion ends. */ @@ -174,8 +174,8 @@ struct homa_ack { __be16 server_port; } __packed; -/* struct data_header - Contains data for part or all of a Homa message. - * An incoming packet consists of a data_header followed by message data. +/* struct homa_data_hdr - Contains data for part or all of a Homa message. + * An incoming packet consists of a homa_data_hdr followed by message data. * An outgoing packet can have this simple format as well, or it can be * structured as a GSO packet. Homa supports two different formats for GSO * packets, depending on whether TCP hijacking is enabled: @@ -184,7 +184,7 @@ struct homa_ack { * * |-----------------------| |-----------------------| * | | | | - * | data_header | | data_header | + * | homa_data_hdr | | homa_data_hdr | * | | | | * |---------------------- | |-----------------------| * | | | | @@ -193,7 +193,7 @@ struct homa_ack { * | | | | * | | | | * |-----------------------| |-----------------------| - * | seg_header | | | + * | homa_seg_hdr | | | * |-----------------------| | | * | | | segment data | * | | | | @@ -201,7 +201,7 @@ struct homa_ack { * | | |-----------------------| * | | | | * |-----------------------| | | - * | seg_header | | segment data | + * | homa_seg_hdr | | segment data | * |-----------------------| | | * | | | | * | | |-----------------------| @@ -213,12 +213,12 @@ struct homa_ack { * With TCP hijacking, TSO will automatically adjust @common.sequence in * the segments, so that value can be used as the offset of the data within * the message. Without TCP hijacking, TSO will not adjust @common.sequence - * in the segments, so Homa sprinkles correct offsets (in seg_headers) - * throughout the segment data; TSO/GSO will include a different seg_header + * in the segments, so Homa sprinkles correct offsets (in homa_seg_hdrs) + * throughout the segment data; TSO/GSO will include a different homa_seg_hdr * in each generated packet. */ -struct seg_header { +struct homa_seg_hdr { /** * @offset: Offset within message of the first byte of data in * this segment. If this field is -1 it means that the packet was @@ -230,8 +230,8 @@ struct seg_header { __be32 offset; } __packed; -struct data_header { - struct common_header common; +struct homa_data_hdr { + struct homa_common_hdr common; /** @message_length: Total #bytes in the message. */ __be32 message_length; @@ -271,36 +271,36 @@ struct data_header { char pad[3]; /** @seg: First of possibly many segments. 
*/ - struct seg_header seg; + struct homa_seg_hdr seg; } __packed; -_Static_assert(sizeof(struct data_header) <= HOMA_MAX_HEADER, - "data_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); -_Static_assert(sizeof(struct data_header) >= HOMA_MIN_PKT_LENGTH, - "data_header too small: Homa doesn't currently have code to pad data packets"); -_Static_assert(((sizeof(struct data_header) - sizeof(struct seg_header)) & +_Static_assert(sizeof(struct homa_data_hdr) <= HOMA_MAX_HEADER, + "homa_data_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); +_Static_assert(sizeof(struct homa_data_hdr) >= HOMA_MIN_PKT_LENGTH, + "homa_data_hdr too small: Homa doesn't currently have code to pad data packets"); +_Static_assert(((sizeof(struct homa_data_hdr) - sizeof(struct homa_seg_hdr)) & 0x3) == 0, - " data_header length not a multiple of 4 bytes (required for TCP/TSO compatibility"); + " homa_data_hdr length not a multiple of 4 bytes (required for TCP/TSO compatibility"); /** * homa_data_len() - Returns the total number of bytes in a DATA packet - * after the data_header. Note: if the packet is a GSO packet, the result + * after the homa_data_hdr. Note: if the packet is a GSO packet, the result * may include metadata as well as packet data. * @skb: Incoming data packet */ static inline int homa_data_len(struct sk_buff *skb) { return skb->len - skb_transport_offset(skb) - - sizeof(struct data_header); + sizeof(struct homa_data_hdr); } /** - * struct grant_header - Wire format for GRANT packets, which are sent by + * struct homa_grant_hdr - Wire format for GRANT packets, which are sent by * the receiver back to the sender to indicate that the sender may transmit * additional bytes in the message. */ -struct grant_header { +struct homa_grant_hdr { /** @common: Fields common to all packet types. */ - struct common_header common; + struct homa_common_hdr common; /** * @offset: Byte offset within the message. @@ -324,20 +324,20 @@ struct grant_header { */ __u8 resend_all; } __packed; -_Static_assert(sizeof(struct grant_header) <= HOMA_MAX_HEADER, - "grant_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); +_Static_assert(sizeof(struct homa_grant_hdr) <= HOMA_MAX_HEADER, + "homa_grant_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** - * struct resend_header - Wire format for RESEND packets. + * struct homa_resend_hdr - Wire format for RESEND packets. * * A RESEND is sent by the receiver when it believes that message data may * have been lost in transmission (or if it is concerned that the sender may * have crashed). The receiver should resend the specified portion of the * message, even if it already sent it previously. */ -struct resend_header { +struct homa_resend_hdr { /** @common: Fields common to all packet types. */ - struct common_header common; + struct homa_common_hdr common; /** * @offset: Offset within the message of the first byte of data that @@ -362,11 +362,11 @@ struct resend_header { */ __u8 priority; } __packed; -_Static_assert(sizeof(struct resend_header) <= HOMA_MAX_HEADER, - "resend_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); +_Static_assert(sizeof(struct homa_resend_hdr) <= HOMA_MAX_HEADER, + "homa_resend_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** - * struct unknown_header - Wire format for UNKNOWN packets. + * struct homa_unknown_hdr - Wire format for UNKNOWN packets. 
* * An UNKNOWN packet is sent by either server or client when it receives a * packet for an RPC that is unknown to it. When a client receives an @@ -374,12 +374,12 @@ _Static_assert(sizeof(struct resend_header) <= HOMA_MAX_HEADER, * when a server receives an UNKNOWN packet it will typically discard its * state for the RPC. */ -struct unknown_header { +struct homa_unknown_hdr { /** @common: Fields common to all packet types. */ - struct common_header common; + struct homa_common_hdr common; } __packed; -_Static_assert(sizeof(struct unknown_header) <= HOMA_MAX_HEADER, - "unknown_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); +_Static_assert(sizeof(struct homa_unknown_hdr) <= HOMA_MAX_HEADER, + "homa_unknown_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** * struct busy_header - Wire format for BUSY packets. @@ -387,22 +387,22 @@ _Static_assert(sizeof(struct unknown_header) <= HOMA_MAX_HEADER, * These packets tell the recipient that the sender is still alive (even if * it isn't sending data expected by the recipient). */ -struct busy_header { +struct homa_busy_hdr { /** @common: Fields common to all packet types. */ - struct common_header common; + struct homa_common_hdr common; } __packed; -_Static_assert(sizeof(struct busy_header) <= HOMA_MAX_HEADER, - "busy_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); +_Static_assert(sizeof(struct homa_busy_hdr) <= HOMA_MAX_HEADER, + "homa_busy_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** - * struct cutoffs_header - Wire format for CUTOFFS packets. + * struct homa_cutoffs_hdr - Wire format for CUTOFFS packets. * * These packets tell the recipient how to assign priorities to * unscheduled packets. */ -struct cutoffs_header { +struct homa_cutoffs_hdr { /** @common: Fields common to all packet types. */ - struct common_header common; + struct homa_common_hdr common; /** * @unsched_cutoffs: priorities to use for unscheduled packets @@ -418,45 +418,45 @@ struct cutoffs_header { */ __be16 cutoff_version; } __packed; -_Static_assert(sizeof(struct cutoffs_header) <= HOMA_MAX_HEADER, - "cutoffs_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); +_Static_assert(sizeof(struct homa_cutoffs_hdr) <= HOMA_MAX_HEADER, + "homa_cutoffs_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** - * struct freeze_header - Wire format for FREEZE packets. + * struct homa_freeze_hdr - Wire format for FREEZE packets. * * These packets tell the recipient to freeze its timetrace; used * for debugging. */ -struct freeze_header { +struct homa_freeze_hdr { /** @common: Fields common to all packet types. */ - struct common_header common; + struct homa_common_hdr common; } __packed; -_Static_assert(sizeof(struct freeze_header) <= HOMA_MAX_HEADER, - "freeze_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); +_Static_assert(sizeof(struct homa_freeze_hdr) <= HOMA_MAX_HEADER, + "homa_freeze_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** - * struct need_ack_header - Wire format for NEED_ACK packets. + * struct homa_need_ack_hdr - Wire format for NEED_ACK packets. * * These packets ask the recipient (a client) to return an ACK message if * the packet's RPC is no longer active. */ -struct need_ack_header { +struct homa_need_ack_hdr { /** @common: Fields common to all packet types. 
*/ - struct common_header common; + struct homa_common_hdr common; } __packed; -_Static_assert(sizeof(struct need_ack_header) <= HOMA_MAX_HEADER, - "need_ack_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); +_Static_assert(sizeof(struct homa_need_ack_hdr) <= HOMA_MAX_HEADER, + "homa_need_ack_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** - * struct ack_header - Wire format for ACK packets. + * struct homa_ack_hdr - Wire format for ACK packets. * * These packets are sent from a client to a server to indicate that * a set of RPCs is no longer active on the client, so the server can * free any state it may have for them. */ -struct ack_header { +struct homa_ack_hdr { /** @common: Fields common to all packet types. */ - struct common_header common; + struct homa_common_hdr common; /** @num_acks: Number of (leading) elements in @acks that are valid. */ __be16 num_acks; @@ -465,8 +465,8 @@ struct ack_header { /** @acks: Info about RPCs that are no longer active. */ struct homa_ack acks[HOMA_MAX_ACKS_PER_PKT]; } __packed; -_Static_assert(sizeof(struct ack_header) <= HOMA_MAX_HEADER, - "ack_header too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); +_Static_assert(sizeof(struct homa_ack_hdr) <= HOMA_MAX_HEADER, + "homa_ack_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** * homa_local_id(): given an RPC identifier from an input packet (which diff --git a/test/mock.c b/test/mock.c index 3e43beb4..39865a5f 100644 --- a/test/mock.c +++ b/test/mock.c @@ -835,7 +835,7 @@ void mutex_unlock(struct mutex *lock) int netif_receive_skb(struct sk_buff *skb) { - struct data_header *h = (struct data_header *) + struct homa_data_hdr *h = (struct homa_data_hdr *) skb_transport_header(skb); unit_log_printf("; ", "netif_receive_skb, id %llu, offset %d", be64_to_cpu(h->common.sender_id), ntohl(h->seg.offset)); @@ -1084,7 +1084,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, netdev_features_t features) { struct sk_buff *skb1, *skb2; - struct data_header h; + struct homa_data_hdr h; int offset, length; /* Split the existing packet into two packets. */ @@ -1428,7 +1428,7 @@ void mock_set_ipv6(struct homa_sock *hsk) * Return: A packet buffer containing the information described above. * The caller owns this buffer and is responsible for freeing it. 
*/ -struct sk_buff *mock_skb_new(struct in6_addr *saddr, struct common_header *h, +struct sk_buff *mock_skb_new(struct in6_addr *saddr, struct homa_common_hdr *h, int extra_bytes, int first_value) { int header_size, ip_size, data_size, shinfo_size; @@ -1438,34 +1438,34 @@ struct sk_buff *mock_skb_new(struct in6_addr *saddr, struct common_header *h, if (h) { switch (h->type) { case DATA: - header_size = sizeof(struct data_header); + header_size = sizeof(struct homa_data_hdr); break; case GRANT: - header_size = sizeof(struct grant_header); + header_size = sizeof(struct homa_grant_hdr); break; case RESEND: - header_size = sizeof(struct resend_header); + header_size = sizeof(struct homa_resend_hdr); break; case UNKNOWN: - header_size = sizeof(struct unknown_header); + header_size = sizeof(struct homa_unknown_hdr); break; case BUSY: - header_size = sizeof(struct busy_header); + header_size = sizeof(struct homa_busy_hdr); break; case CUTOFFS: - header_size = sizeof(struct cutoffs_header); + header_size = sizeof(struct homa_cutoffs_hdr); break; case FREEZE: - header_size = sizeof(struct freeze_header); + header_size = sizeof(struct homa_freeze_hdr); break; case NEED_ACK: - header_size = sizeof(struct need_ack_header); + header_size = sizeof(struct homa_need_ack_hdr); break; case ACK: - header_size = sizeof(struct ack_header); + header_size = sizeof(struct homa_ack_hdr); break; default: - header_size = sizeof(struct common_header); + header_size = sizeof(struct homa_common_hdr); break; } } else { @@ -1553,7 +1553,7 @@ void mock_sock_init(struct homa_sock *hsk, struct homa *homa, int port) homa_sock_bind(homa->port_map, hsk, port); hsk->inet.pinet6 = &hsk_pinfo; mock_mtu = UNIT_TEST_DATA_PER_PACKET + hsk->ip_header_length - + sizeof(struct data_header); + + sizeof(struct homa_data_hdr); mock_net_device.gso_max_size = mock_mtu; homa_pool_init(hsk, (void *) 0x1000000, 100*HOMA_BPAGE_SIZE); } diff --git a/test/mock.h b/test/mock.h index a83ee106..36b03756 100644 --- a/test/mock.h +++ b/test/mock.h @@ -44,34 +44,35 @@ extern int mock_vmalloc_errors; extern int mock_xmit_log_verbose; extern int mock_xmit_log_homa_info; -extern struct page * +struct page * mock_alloc_pages(gfp_t gfp, unsigned order); -extern int mock_check_error(int *errorMask); -extern void mock_clear_xmit_prios(void); -extern void mock_data_ready(struct sock *sk); -extern cycles_t mock_get_cycles(void); -extern unsigned int - mock_get_mtu(const struct dst_entry *dst); -extern void mock_get_page(struct page *page); -extern int mock_page_refs(struct page *page); -extern int mock_page_refs(struct page *page); -extern void mock_put_page(struct page *page); -extern void mock_rcu_read_lock(void); -extern void mock_rcu_read_unlock(void); -extern struct ctl_table_header * - mock_register_net_sysctl(struct net *net, - const char *path, struct ctl_table *table); -extern void mock_set_core(int num); -extern void mock_set_ipv6(struct homa_sock *hsk); -extern void mock_spin_lock(spinlock_t *lock); -extern void mock_spin_unlock(spinlock_t *lock); -extern int mock_skb_count(void); -extern struct sk_buff * - mock_skb_new(struct in6_addr *saddr, struct common_header *h, +int mock_check_error(int *errorMask); +void mock_clear_xmit_prios(void); +void mock_data_ready(struct sock *sk); +cycles_t mock_get_cycles(void); +unsigned int + mock_get_mtu(const struct dst_entry *dst); +void mock_get_page(struct page *page); +int mock_page_refs(struct page *page); +int mock_page_refs(struct page *page); +void mock_put_page(struct page *page); +void 
mock_rcu_read_lock(void); +void mock_rcu_read_unlock(void); +struct ctl_table_header * + mock_register_net_sysctl(struct net *net, + const char *path, + struct ctl_table *table); +void mock_set_core(int num); +void mock_set_ipv6(struct homa_sock *hsk); +void mock_spin_lock(spinlock_t *lock); +void mock_spin_unlock(spinlock_t *lock); +int mock_skb_count(void); +struct sk_buff * + mock_skb_new(struct in6_addr *saddr, struct homa_common_hdr *h, int extra_bytes, int first_value); -extern void mock_sock_destroy(struct homa_sock *hsk, - struct homa_socktab *socktab); -extern void mock_sock_init(struct homa_sock *hsk, struct homa *homa, - int port); -extern void mock_teardown(void); -extern void *mock_vmalloc(size_t size); +void mock_sock_destroy(struct homa_sock *hsk, + struct homa_socktab *socktab); +void mock_sock_init(struct homa_sock *hsk, struct homa *homa, + int port); +void mock_teardown(void); +void *mock_vmalloc(size_t size); diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index b26e2c05..c99fda4b 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -45,7 +45,7 @@ FIXTURE(homa_grant) { union sockaddr_in_union server_addr; struct homa homa; struct homa_sock hsk; - struct data_header data; + struct homa_data_hdr data; int incoming_delta; }; FIXTURE_SETUP(homa_grant) @@ -78,7 +78,7 @@ FIXTURE_SETUP(homa_grant) self->server_addr.in6.sin6_family = self->hsk.inet.sk.sk_family; self->server_addr.in6.sin6_addr = self->server_ip[0]; self->server_addr.in6.sin6_port = htons(self->server_port); - self->data = (struct data_header){.common = { + self->data = (struct homa_data_hdr){.common = { .sport = htons(self->client_port), .dport = htons(self->server_port), .type = DATA, diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 3d59f344..7df7145e 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -130,7 +130,7 @@ FIXTURE(homa_incoming) { struct homa homa; struct homa_sock hsk; struct homa_sock hsk2; - struct data_header data; + struct homa_data_hdr data; struct homa_interest interest; }; FIXTURE_SETUP(homa_incoming) @@ -157,7 +157,7 @@ FIXTURE_SETUP(homa_incoming) self->server_addr.in6.sin6_family = self->hsk.inet.sk.sk_family; self->server_addr.in6.sin6_addr = self->server_ip[0]; self->server_addr.in6.sin6_port = htons(self->server_port); - self->data = (struct data_header){.common = { + self->data = (struct homa_data_hdr){.common = { .sport = htons(self->client_port), .dport = htons(self->server_port), .type = DATA, @@ -926,7 +926,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__non_data_packet_for_existing_server_rp struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_IN_SERVICE, self->client_ip, self->server_ip, self->client_port, self->server_id, 10000, 100); - struct resend_header resend = {.common = { + struct homa_resend_hdr resend = {.common = { .sport = htons(self->client_port), .dport = htons(self->server_port), .type = RESEND, @@ -951,7 +951,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__existing_client_rpc) EXPECT_EQ(10000, crpc->msgout.granted); unit_log_clear(); - struct grant_header h = {{.sport = htons(self->server_port), + struct homa_grant_hdr h = {{.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = GRANT}, @@ -964,7 +964,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__existing_client_rpc) } TEST_F(homa_incoming, homa_dispatch_pkts__unknown_client_rpc) { - struct grant_header h = {{.sport = htons(self->server_port), + struct homa_grant_hdr h = 
{{.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(99991), .type = UNKNOWN}}; @@ -976,7 +976,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_client_rpc) } TEST_F(homa_incoming, homa_dispatch_pkts__unknown_server_rpc) { - struct resend_header h = {{.sport = htons(self->client_port), + struct homa_resend_hdr h = {{.sport = htons(self->client_port), .dport = htons(self->server_port), .sender_id = cpu_to_be64(99990), .type = GRANT}}; @@ -988,7 +988,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_server_rpc) } TEST_F(homa_incoming, homa_dispatch_pkts__cutoffs_for_unknown_client_rpc) { - struct cutoffs_header h = {{.sport = htons(self->server_port), + struct homa_cutoffs_hdr h = {{.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(99991), .type = CUTOFFS}, @@ -1009,7 +1009,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__cutoffs_for_unknown_client_rpc) } TEST_F(homa_incoming, homa_dispatch_pkts__resend_for_unknown_server_rpc) { - struct resend_header h = {{.sport = htons(self->client_port), + struct homa_resend_hdr h = {{.sport = htons(self->client_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(99990), .type = RESEND}, @@ -1024,7 +1024,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__reset_counters) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - struct grant_header h = {.common = {.sport = htons(self->server_port), + struct homa_grant_hdr h = {.common = {.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = GRANT}, @@ -1059,7 +1059,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_type) EXPECT_EQ(10000, crpc->msgout.granted); unit_log_clear(); - struct common_header h = {.sport = htons(self->server_port), + struct homa_common_hdr h = {.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = 99}; homa_dispatch_pkts(mock_skb_new(self->client_ip, &h, 0, 0), &self->homa); @@ -1331,7 +1331,7 @@ TEST_F(homa_incoming, homa_grant_pkt__basics) struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, self->server_id, 100, 20000); - struct grant_header h = {{.sport = htons(srpc->dport), + struct homa_grant_hdr h = {{.sport = htons(srpc->dport), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->client_id), .type = GRANT}, @@ -1373,7 +1373,7 @@ TEST_F(homa_incoming, homa_grant_pkt__reset) struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, self->server_id, 100, 20000); - struct grant_header h = {{.sport = htons(srpc->dport), + struct homa_grant_hdr h = {{.sport = htons(srpc->dport), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->client_id), .type = GRANT}, @@ -1405,7 +1405,7 @@ TEST_F(homa_incoming, homa_grant_pkt__grant_past_end_of_message) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - struct grant_header h = {{.sport = htons(self->server_port), + struct homa_grant_hdr h = {{.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = GRANT}, @@ -1421,7 +1421,7 @@ TEST_F(homa_incoming, homa_grant_pkt__grant_past_end_of_message) TEST_F(homa_incoming, 
homa_resend_pkt__unknown_rpc) { - struct resend_header h = {{.sport = htons(self->client_port), + struct homa_resend_hdr h = {{.sport = htons(self->client_port), .dport = htons(self->server_port), .sender_id = cpu_to_be64(self->client_id), .type = RESEND}, @@ -1435,7 +1435,7 @@ TEST_F(homa_incoming, homa_resend_pkt__unknown_rpc) } TEST_F(homa_incoming, homa_resend_pkt__rpc_in_service_server_sends_busy) { - struct resend_header h = {{.sport = htons(self->client_port), + struct homa_resend_hdr h = {{.sport = htons(self->client_port), .dport = htons(self->server_port), .sender_id = cpu_to_be64(self->client_id), .type = RESEND}, @@ -1458,7 +1458,7 @@ TEST_F(homa_incoming, homa_resend_pkt__rpc_incoming_server_sends_busy) /* Entire msgin has not been received yet. But we have received * everything we have granted so far. */ - struct resend_header h = {{.sport = htons(self->client_port), + struct homa_resend_hdr h = {{.sport = htons(self->client_port), .dport = htons(self->server_port), .sender_id = cpu_to_be64(self->client_id), .type = RESEND}, @@ -1483,7 +1483,7 @@ TEST_F(homa_incoming, homa_resend_pkt__client_not_outgoing) /* Important to respond to resends even if client thinks the * server must already have received everything. */ - struct resend_header h = {{.sport = htons(self->server_port), + struct homa_resend_hdr h = {{.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = RESEND}, @@ -1503,7 +1503,7 @@ TEST_F(homa_incoming, homa_resend_pkt__client_not_outgoing) } TEST_F(homa_incoming, homa_resend_pkt__send_busy_instead_of_data) { - struct resend_header h = {{.sport = htons(self->server_port), + struct homa_resend_hdr h = {{.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = RESEND}, @@ -1523,7 +1523,7 @@ TEST_F(homa_incoming, homa_resend_pkt__send_busy_instead_of_data) } TEST_F(homa_incoming, homa_resend_pkt__client_send_data) { - struct resend_header h = {{.sport = htons(self->server_port), + struct homa_resend_hdr h = {{.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = RESEND}, @@ -1546,7 +1546,7 @@ TEST_F(homa_incoming, homa_resend_pkt__client_send_data) } TEST_F(homa_incoming, homa_resend_pkt__server_send_data) { - struct resend_header h = {{.sport = htons(self->client_port), + struct homa_resend_hdr h = {{.sport = htons(self->client_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->client_id), .type = RESEND}, @@ -1571,7 +1571,7 @@ TEST_F(homa_incoming, homa_resend_pkt__server_send_data) TEST_F(homa_incoming, homa_unknown_pkt__client_resend_all) { - struct unknown_header h = {{.sport = htons(self->server_port), + struct homa_unknown_hdr h = {{.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = UNKNOWN}}; @@ -1593,7 +1593,7 @@ TEST_F(homa_incoming, homa_unknown_pkt__client_resend_all) } TEST_F(homa_incoming, homa_unknown_pkt__client_resend_part) { - struct unknown_header h = {{.sport = htons(self->server_port), + struct homa_unknown_hdr h = {{.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = UNKNOWN}}; @@ -1615,7 +1615,7 @@ TEST_F(homa_incoming, homa_unknown_pkt__client_resend_part) } TEST_F(homa_incoming, homa_unknown_pkt__free_server_rpc) { - struct unknown_header h = {{.sport = htons(self->client_port), + struct homa_unknown_hdr h = 
{{.sport = htons(self->client_port), .dport = htons(self->hsk2.port), .sender_id = cpu_to_be64(self->client_id), .type = UNKNOWN}}; @@ -1636,7 +1636,7 @@ TEST_F(homa_incoming, homa_cutoffs_pkt_basics) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - struct cutoffs_header h = {{.sport = htons(self->server_port), + struct homa_cutoffs_hdr h = {{.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = CUTOFFS}, @@ -1656,7 +1656,7 @@ TEST_F(homa_incoming, homa_cutoffs_pkt_basics) } TEST_F(homa_incoming, homa_cutoffs__cant_find_peer) { - struct cutoffs_header h = {{.sport = htons(self->server_port), + struct homa_cutoffs_hdr h = {{.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = CUTOFFS}, @@ -1680,7 +1680,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_fully_received) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 100, 3000); - struct need_ack_header h = {.common = { + struct homa_need_ack_hdr h = {.common = { .sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), @@ -1701,7 +1701,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_not_fully_received) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, 100, 3000); - struct need_ack_header h = {.common = { + struct homa_need_ack_hdr h = {.common = { .sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), @@ -1721,7 +1721,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_not_incoming) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 100, 3000); - struct need_ack_header h = {.common = { + struct homa_need_ack_hdr h = {.common = { .sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), @@ -1740,7 +1740,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_doesnt_exist) { struct homa_peer *peer = homa_peer_find(self->homa.peers, self->server_ip, &self->hsk.inet); - struct need_ack_header h = {.common = { + struct homa_need_ack_hdr h = {.common = { .sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), @@ -1761,7 +1761,7 @@ TEST_F(homa_incoming, homa_ack_pkt__target_rpc_exists) struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, self->server_id, 100, 5000); - struct ack_header h = {.common = { + struct homa_ack_hdr h = {.common = { .sport = htons(self->client_port), .dport = htons(self->hsk2.port), .sender_id = cpu_to_be64(self->client_id), @@ -1785,7 +1785,7 @@ TEST_F(homa_incoming, homa_ack_pkt__target_rpc_doesnt_exist) struct homa_rpc *srpc2 = unit_server_rpc(&self->hsk2, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, self->server_id+2, 100, 5000); - struct ack_header h = {.common = { + struct homa_ack_hdr h = {.common = { .sport = htons(self->client_port), .dport = htons(self->hsk2.port), .sender_id = cpu_to_be64(self->client_id + 10), diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index a1cdfaf3..a745ac66 100644 --- 
a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -29,7 +29,7 @@ FIXTURE(homa_offload) struct homa homa; struct homa_sock hsk; struct in6_addr ip; - struct data_header header; + struct homa_data_hdr header; struct napi_struct napi; struct sk_buff *skb, *skb2; struct list_head empty_list; @@ -45,7 +45,7 @@ FIXTURE_SETUP(homa_offload) global_homa = &self->homa; mock_sock_init(&self->hsk, &self->homa, 99); self->ip = unit_get_in_addr("196.168.0.1"); - self->header = (struct data_header){.common = { + self->header = (struct homa_data_hdr){.common = { .sport = htons(40000), .dport = htons(99), .type = DATA, .flags = HOMA_TCP_FLAGS, @@ -130,13 +130,13 @@ TEST_F(homa_offload, homa_gro_hook_tcp) TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_tcp) { - struct common_header *h; + struct homa_common_hdr *h; struct sk_buff *skb; homa_gro_hook_tcp(); self->header.seg.offset = htonl(6000); skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); - h = (struct common_header *) skb_transport_header(skb); + h = (struct homa_common_hdr *) skb_transport_header(skb); h->flags = 0; EXPECT_EQ(NULL, homa_tcp_gro_receive(&self->empty_list, skb)); EXPECT_STREQ("tcp_gro_receive", unit_log_get()); @@ -144,7 +144,7 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_tcp) unit_log_clear(); skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); - h = (struct common_header *)skb_transport_header(skb); + h = (struct homa_common_hdr *)skb_transport_header(skb); h->urgent -= 1; EXPECT_EQ(NULL, homa_tcp_gro_receive(&self->empty_list, skb)); EXPECT_STREQ("tcp_gro_receive", unit_log_get()); @@ -153,7 +153,7 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_tcp) } TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_homa_ipv6) { - struct common_header *h; + struct homa_common_hdr *h; struct sk_buff *skb; mock_ipv6 = true; @@ -161,7 +161,7 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_homa_ipv6) self->header.seg.offset = htonl(6000); skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); ip_hdr(skb)->protocol = IPPROTO_TCP; - h = (struct common_header *)skb_transport_header(skb); + h = (struct homa_common_hdr *)skb_transport_header(skb); h->flags = HOMA_TCP_FLAGS; h->urgent = htons(HOMA_TCP_URGENT); NAPI_GRO_CB(skb)->same_flow = 0; @@ -176,7 +176,7 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_homa_ipv6) } TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_homa_ipv4) { - struct common_header *h; + struct homa_common_hdr *h; struct sk_buff *skb; mock_ipv6 = false; @@ -184,7 +184,7 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_homa_ipv4) self->header.seg.offset = htonl(6000); skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); ip_hdr(skb)->protocol = IPPROTO_TCP; - h = (struct common_header *)skb_transport_header(skb); + h = (struct homa_common_hdr *)skb_transport_header(skb); h->flags = HOMA_TCP_FLAGS; h->urgent = htons(HOMA_TCP_URGENT); NAPI_GRO_CB(skb)->same_flow = 0; @@ -222,7 +222,7 @@ TEST_F(homa_offload, homa_gso_segment_set_ip_ids) TEST_F(homa_offload, homa_gro_receive__update_offset_from_sequence) { struct sk_buff *skb, *skb2; - struct data_header *h; + struct homa_data_hdr *h; /* First call: copy offset from sequence number. 
*/ self->header.common.sequence = htonl(6000); @@ -232,7 +232,7 @@ TEST_F(homa_offload, homa_gro_receive__update_offset_from_sequence) cur_offload_core->held_skb = NULL; cur_offload_core->held_bucket = 99; EXPECT_EQ(NULL, homa_gro_receive(&self->empty_list, skb)); - h = (struct data_header *) skb_transport_header(skb); + h = (struct homa_data_hdr *) skb_transport_header(skb); EXPECT_EQ(6000, htonl(h->seg.offset)); /* Second call: offset already valid. */ @@ -241,7 +241,7 @@ TEST_F(homa_offload, homa_gro_receive__update_offset_from_sequence) skb2 = mock_skb_new(&self->ip, &self->header.common, 1400, 0); NAPI_GRO_CB(skb2)->same_flow = 0; EXPECT_EQ(NULL, homa_gro_receive(&self->empty_list, skb2)); - h = (struct data_header *)skb_transport_header(skb2); + h = (struct homa_data_hdr *)skb_transport_header(skb2); EXPECT_EQ(5000, htonl(h->seg.offset)); kfree_skb(skb); @@ -257,7 +257,7 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS) __u64 server_id = 1235; struct homa_rpc *srpc; int server_port = 99; - struct data_header h; + struct homa_data_hdr h; h.common.sport = htons(40000); h.common.dport = htons(server_port); @@ -318,11 +318,11 @@ TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization) struct in6_addr client_ip = unit_get_in_addr("196.168.0.1"); struct in6_addr server_ip = unit_get_in_addr("1.2.3.4"); struct sk_buff *skb, *skb2, *skb3, *result; + struct homa_grant_hdr h; int client_port = 40000; __u64 client_id = 1234; __u64 server_id = 1235; struct homa_rpc *srpc; - struct grant_header h; srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &client_ip, &server_ip, client_port, server_id, 100, diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 24477c67..a1c32b55 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -74,8 +74,8 @@ FIXTURE_TEARDOWN(homa_outgoing) TEST_F(homa_outgoing, set_priority__priority_mapping) { + struct homa_grant_hdr h; struct homa_rpc *srpc; - struct grant_header h; srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, 1111, 10000, 10000); @@ -110,8 +110,8 @@ TEST_F(homa_outgoing, homa_fill_data_interleaved) EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, message_length 10000, offset 10000, data_length 1500, incoming 10000, extra segs 1500@11500 1500@13000 500@14500", homa_print_packet(skb, buffer, sizeof(buffer))); - EXPECT_EQ(5000 + sizeof32(struct data_header) - + 3*sizeof32(struct seg_header), skb->len); + EXPECT_EQ(5000 + sizeof32(struct homa_data_hdr) + + 3*sizeof32(struct homa_seg_hdr), skb->len); kfree_skb(skb); } TEST_F(homa_outgoing, homa_fill_data_interleaved__error_copying_data) @@ -187,7 +187,7 @@ TEST_F(homa_outgoing, homa_new_data_packet__multiple_segments_homa_fill_data_int EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, message_length 10000, offset 10000, data_length 1500, incoming 10000, extra segs 1500@11500 1500@13000 500@14500", homa_print_packet(skb, buffer, sizeof(buffer))); - EXPECT_EQ(4*(sizeof(struct data_header) + crpc->hsk->ip_header_length + EXPECT_EQ(4*(sizeof(struct homa_data_hdr) + crpc->hsk->ip_header_length + HOMA_ETH_OVERHEAD) + 5000, homa_get_skb_info(skb)->wire_bytes); EXPECT_EQ(5000, homa_get_skb_info(skb)->data_bytes); @@ -262,7 +262,7 @@ TEST_F(homa_outgoing, homa_new_data_packet__gso_information) skb = homa_new_data_packet(crpc, iter, 10000, 5000, 1500); EXPECT_EQ(4, skb_shinfo(skb)->gso_segs); - EXPECT_EQ(1500 + sizeof(struct seg_header), + EXPECT_EQ(1500 + sizeof(struct homa_seg_hdr), 
skb_shinfo(skb)->gso_size); EXPECT_EQ(SKB_GSO_TCPV6, skb_shinfo(skb)->gso_type); kfree_skb(skb); @@ -377,7 +377,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__include_acks) { struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); - struct data_header h; + struct homa_data_hdr h; ASSERT_FALSE(crpc == NULL); crpc->peer->acks[0] = (struct homa_ack) { @@ -469,8 +469,8 @@ TEST_F(homa_outgoing, homa_message_out_fill__too_short_for_pipelining) TEST_F(homa_outgoing, homa_xmit_control__server_request) { + struct homa_grant_hdr h; struct homa_rpc *srpc; - struct grant_header h; homa_sock_bind(self->homa.port_map, &self->hsk, self->server_port); srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, @@ -491,8 +491,8 @@ TEST_F(homa_outgoing, homa_xmit_control__server_request) } TEST_F(homa_outgoing, homa_xmit_control__client_response) { + struct homa_grant_hdr h; struct homa_rpc *crpc; - struct grant_header h; crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, @@ -512,8 +512,8 @@ TEST_F(homa_outgoing, homa_xmit_control__client_response) TEST_F(homa_outgoing, __homa_xmit_control__cant_alloc_skb) { + struct homa_grant_hdr h; struct homa_rpc *srpc; - struct grant_header h; srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, 1111, 10000, 10000); @@ -533,7 +533,7 @@ TEST_F(homa_outgoing, __homa_xmit_control__cant_alloc_skb) TEST_F(homa_outgoing, __homa_xmit_control__pad_packet) { struct homa_rpc *srpc; - struct busy_header h; + struct homa_busy_hdr h; srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, 1111, 10000, 10000); @@ -546,8 +546,8 @@ TEST_F(homa_outgoing, __homa_xmit_control__pad_packet) } TEST_F(homa_outgoing, __homa_xmit_control__ipv4_error) { + struct homa_grant_hdr h; struct homa_rpc *srpc; - struct grant_header h; // Make sure the test uses IPv4. mock_ipv6 = false; @@ -570,8 +570,8 @@ TEST_F(homa_outgoing, __homa_xmit_control__ipv4_error) } TEST_F(homa_outgoing, __homa_xmit_control__ipv6_error) { + struct homa_grant_hdr h; struct homa_rpc *srpc; - struct grant_header h; // Make sure the test uses IPv6. 
mock_ipv6 = true; @@ -595,7 +595,7 @@ TEST_F(homa_outgoing, __homa_xmit_control__ipv6_error) TEST_F(homa_outgoing, homa_xmit_unknown) { - struct grant_header h = {{.sport = htons(self->client_port), + struct homa_grant_hdr h = {{.sport = htons(self->client_port), .dport = htons(self->server_port), .sender_id = cpu_to_be64(99990), .type = GRANT}, diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index e57500dc..0444ec5a 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -32,7 +32,7 @@ FIXTURE(homa_plumbing) { struct homa_sock hsk; union sockaddr_in_union client_addr; union sockaddr_in_union server_addr; - struct data_header data; + struct homa_data_hdr data; int starting_skb_count; struct msghdr recvmsg_hdr; struct homa_recvmsg_args recvmsg_args; @@ -67,7 +67,7 @@ FIXTURE_SETUP(homa_plumbing) ipv6_to_ipv4(self->server_addr.in6.sin6_addr); } homa_sock_bind(self->homa.port_map, &self->hsk, self->server_port); - self->data = (struct data_header){.common = { + self->data = (struct homa_data_hdr){.common = { .sport = htons(self->client_port), .dport = htons(self->server_port), .type = DATA, @@ -828,7 +828,7 @@ TEST_F(homa_plumbing, homa_softirq__remove_extra_headers) TEST_F(homa_plumbing, homa_softirq__packet_too_short) { struct sk_buff *skb; - struct ack_header h; + struct homa_ack_hdr h; h.common.type = ACK; skb = mock_skb_new(self->client_ip, &h.common, 0, 0); @@ -876,7 +876,7 @@ TEST_F(homa_plumbing, homa_softirq__process_short_messages_first) } TEST_F(homa_plumbing, homa_softirq__process_control_first) { - struct common_header unknown = { + struct homa_common_hdr unknown = { .sport = htons(self->client_port), .dport = htons(self->server_port), .type = UNKNOWN, diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index c0497b73..d1d03fea 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -22,7 +22,7 @@ FIXTURE(homa_rpc) { struct homa homa; struct homa_sock hsk; union sockaddr_in_union server_addr; - struct data_header data; + struct homa_data_hdr data; struct homa_rpc *crpc; struct iovec iovec; struct iov_iter iter; @@ -40,7 +40,7 @@ FIXTURE_SETUP(homa_rpc) self->server_addr.in6.sin6_port = htons(self->server_port); homa_init(&self->homa); mock_sock_init(&self->hsk, &self->homa, 0); - self->data = (struct data_header){.common = { + self->data = (struct homa_data_hdr){.common = { .sport = htons(self->client_port), .dport = htons(self->server_port), .type = DATA, diff --git a/test/utils.c b/test/utils.c index c97b4ce8..81d397b4 100644 --- a/test/utils.c +++ b/test/utils.c @@ -58,7 +58,7 @@ struct homa_rpc *unit_client_rpc(struct homa_sock *hsk, return crpc; crpc->msgout.next_xmit_offset = crpc->msgout.length; - struct data_header h = { + struct homa_data_hdr h = { .common = { .sport = htons(server_port), .dport = htons(hsk->port), @@ -352,7 +352,7 @@ struct homa_rpc *unit_server_rpc(struct homa_sock *hsk, int req_length, int resp_length) { int bytes_received, created; - struct data_header h = { + struct homa_data_hdr h = { .common = { .sport = htons(client_port), .dport = htons(hsk->port), From 78c3b37e42f081de3e1ae282fb296c93c1a3d304 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Sun, 15 Dec 2024 21:03:36 -0800 Subject: [PATCH 116/625] Update notes.txt --- notes.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/notes.txt b/notes.txt index e2907288..4d56b453 100755 --- a/notes.txt +++ b/notes.txt @@ -38,6 +38,21 @@ Notes for Homa implementation in Linux: * TCQ_F_NOLOCK seems to apply to the qdisc root lock: 
individual qdiscs still get locked. +* Notes on Homa qdisc: + * Keep separate packet queues for TCP and Homa + * Pace packet output to stay within network speed + * If both queues are nonempty, split bandwidth between the queues using + a static formula (have a relative weight for each of TCP and Homa?) + * For Homa, is it OK to call ip_queue_xmit for all available output and + let the qdisc queue them up? + * Potential risk: queues might get long, so insertion could be expensive. + Can organize the queue by RPC, not individual packets ... there probably + are not large #'s of ready RPCs at once? + * Keep track of all of the queues for a particular device, and potentially + move packets between queues (e.g. all long packets get transmitted on a + single queue; no short packets get transmitted there). + + * Remedies to consider for the performance problems at 100 Gbps, where one tx channel gets very backed up: * Implement zero-copy on output in order to reduce memory bandwidth From ef09002fe4c4e55238d88c3d2b73d39cb04896f0 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 16 Dec 2024 08:45:52 -0800 Subject: [PATCH 117/625] Adjust a few default values --- homa_utils.c | 6 +++--- test/unit_homa_incoming.c | 2 ++ test/unit_homa_offload.c | 2 ++ test/unit_homa_outgoing.c | 2 ++ test/unit_homa_pool.c | 2 ++ test/unit_homa_rpc.c | 2 ++ test/unit_homa_timer.c | 2 ++ 7 files changed, 15 insertions(+), 3 deletions(-) diff --git a/homa_utils.c b/homa_utils.c index 7c6cc20f..e765a838 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -82,8 +82,8 @@ int homa_init(struct homa *homa) } /* Wild guesses to initialize configuration values... */ - homa->unsched_bytes = 10000; - homa->window_param = 10000; + homa->unsched_bytes = 40000; + homa->window_param = 100000; homa->link_mbps = 25000; homa->poll_usecs = 50; homa->num_priorities = HOMA_MAX_PRIORITIES; @@ -124,7 +124,7 @@ int homa_init(struct homa *homa) return err; } homa->pacer_exit = false; - homa->max_nic_queue_ns = 2000; + homa->max_nic_queue_ns = 5000; homa->ns_per_mbyte = 0; homa->verbose = 0; homa->max_gso_size = 10000; diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 7df7145e..5e5617f7 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -152,6 +152,8 @@ FIXTURE_SETUP(homa_incoming) self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; self->homa.pacer_fifo_fraction = 0; self->homa.grant_fifo_fraction = 0; + self->homa.unsched_bytes = 10000; + self->homa.window_param = 10000; mock_sock_init(&self->hsk, &self->homa, 0); mock_sock_init(&self->hsk2, &self->homa, self->server_port); diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index a745ac66..9a655451 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -42,6 +42,8 @@ FIXTURE_SETUP(homa_offload) homa_init(&self->homa); self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; + self->homa.unsched_bytes = 10000; + self->homa.window_param = 10000; global_homa = &self->homa; mock_sock_init(&self->hsk, &self->homa, 99); diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index a1c32b55..98310f9b 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -58,6 +58,8 @@ FIXTURE_SETUP(homa_outgoing) atomic64_set(&self->homa.link_idle_time, 10000); self->homa.ns_per_mbyte = 1000000; self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; + self->homa.unsched_bytes = 10000; + self->homa.window_param = 10000;
mock_sock_init(&self->hsk, &self->homa, self->client_port); self->server_addr.in6.sin6_family = AF_INET; self->server_addr.in6.sin6_addr = self->server_ip[0]; diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index 75ce8bd8..6a6ca8f5 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -19,6 +19,8 @@ FIXTURE(homa_pool) { FIXTURE_SETUP(homa_pool) { homa_init(&self->homa); + self->homa.unsched_bytes = 10000; + self->homa.window_param = 10000; mock_sock_init(&self->hsk, &self->homa, 0); self->client_ip = unit_get_in_addr("196.168.0.1"); self->server_ip = unit_get_in_addr("1.2.3.4"); diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index d1d03fea..7033e35d 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -39,6 +39,8 @@ FIXTURE_SETUP(homa_rpc) self->server_addr.in6.sin6_addr = *self->server_ip; self->server_addr.in6.sin6_port = htons(self->server_port); homa_init(&self->homa); + self->homa.unsched_bytes = 10000; + self->homa.window_param = 10000; mock_sock_init(&self->hsk, &self->homa, 0); self->data = (struct homa_data_hdr){.common = { .sport = htons(self->client_port), diff --git a/test/unit_homa_timer.c b/test/unit_homa_timer.c index 474a0336..740de2ab 100644 --- a/test/unit_homa_timer.c +++ b/test/unit_homa_timer.c @@ -35,6 +35,8 @@ FIXTURE_SETUP(homa_timer) self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; self->homa.resend_ticks = 2; self->homa.timer_ticks = 100; + self->homa.unsched_bytes = 10000; + self->homa.window_param = 10000; mock_sock_init(&self->hsk, &self->homa, 0); unit_log_clear(); } From f96725d5ffac99c3bc5d5051a1dcab5058c5e629 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 16 Dec 2024 09:34:13 -0800 Subject: [PATCH 118/625] Fix trivial comment error --- homa_rpc.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/homa_rpc.h b/homa_rpc.h index 91aca525..4022d0b0 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -80,7 +80,10 @@ struct homa_message_out { */ int granted; - /** @priority: Priority level to use for future scheduled packets. */ + /** + * @sched_priority: Priority level to use for future scheduled + * packets. + */ __u8 sched_priority; /** From 0923c0ece0993e726643581a38acbdc194e9e75f Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 18 Dec 2024 08:59:54 -0800 Subject: [PATCH 119/625] Fix compilation problem related to SPLIT_64 Resolves #66 --- homa_offload.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/homa_offload.c b/homa_offload.c index d80fabdf..fad07bcc 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -384,7 +384,8 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, protocol = ip_hdr(held_skb)->protocol; if (protocol != IPPROTO_HOMA) { tt_record3("homa_gro_receive held_skb 0x%0x%0x isn't Homa: protocol %d", - SPLIT_64(held_skb), protocol); + tt_hi(held_skb), tt_lo(held_skb), + protocol); continue; } From 6f58bef36eb07e4fd0ff9537c65bc68faab62618 Mon Sep 17 00:00:00 2001 From: breakertt Date: Wed, 18 Dec 2024 09:40:43 -0800 Subject: [PATCH 120/625] Compute segs_per_gso more precisely Need different calculation with and without TCP hijacking. This resolves #67. 
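For illustration, here is a minimal standalone sketch of the two calculations
(not part of the patch itself). The 1400-byte segment payload matches the unit
tests below, and the 4-byte segment header matches sizeof(struct homa_seg_hdr)
(a single __be32 offset); the 100-byte value for the IP header plus
homa_data_hdr is a made-up example, not the real header size:

    #include <stdio.h>

    int main(void)
    {
            int max_seg_data = 1400; /* payload per segment (from the tests) */
            int seg_hdr = 4;         /* sizeof(struct homa_seg_hdr) */
            int hdr_space = 100;     /* IP header + homa_data_hdr (example) */
            int gso_size = hdr_space + 3 * max_seg_data;

            /* With TCP hijacking, TSO replicates only the homa_data_hdr,
             * so every byte after the headers is segment payload. */
            int hijack = (gso_size - hdr_space) / max_seg_data;

            /* Without hijacking, each segment also carries its own
             * homa_seg_hdr; one seg_hdr is added back to the numerator
             * because homa_data_hdr already contains the first one. */
            int no_hijack = (gso_size - hdr_space + seg_hdr)
                            / (max_seg_data + seg_hdr);

            printf("segs_per_gso: %d with hijacking, %d without\n",
                   hijack, no_hijack); /* prints 3 and 2 */
            return 0;
    }

The truncating integer division plays the role of do_div() in the patch: a
gso_size that holds three full segments with hijacking holds only two once
each segment after the first must make room for its own seg header.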
--- homa_outgoing.c | 21 ++++++++++++--- test/unit_homa_outgoing.c | 56 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 4 deletions(-) diff --git a/homa_outgoing.c b/homa_outgoing.c index f1174e27..02aed784 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -240,10 +240,23 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) if (gso_size > rpc->hsk->homa->max_gso_size) gso_size = rpc->hsk->homa->max_gso_size; - /* Round gso_size down to an even # of mtus. */ - segs_per_gso = gso_size - rpc->hsk->ip_header_length - - sizeof(struct homa_data_hdr); - do_div(segs_per_gso, max_seg_data); + /* Round gso_size down to an even # of mtus; calculation depends + * on whether we're doing TCP hijacking (need more space in TSO packet + * if no hijacking). + */ + if (rpc->hsk->sock.sk_protocol == IPPROTO_TCP) { + /* Hijacking */ + segs_per_gso = gso_size - rpc->hsk->ip_header_length + - sizeof(struct homa_data_hdr); + do_div(segs_per_gso, max_seg_data); + } else { + /* No hijacking */ + segs_per_gso = gso_size - rpc->hsk->ip_header_length - + sizeof(struct homa_data_hdr) + + sizeof(struct homa_seg_hdr); + do_div(segs_per_gso, max_seg_data + + sizeof(struct homa_seg_hdr)); + } if (segs_per_gso == 0) segs_per_gso = 1; max_gso_data = segs_per_gso * max_seg_data; diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 98310f9b..69c98732 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -333,6 +333,62 @@ TEST_F(homa_outgoing, homa_message_out_fill__zero_length_message) unit_iov_iter((void *) 1000, 0), 0)); homa_rpc_unlock(crpc); } +TEST_F(homa_outgoing, homa_message_out_fill__gso_geometry_hijacking) +{ + struct homa_rpc *crpc1 = homa_rpc_new_client(&self->hsk, + &self->server_addr); + struct homa_rpc *crpc2 = homa_rpc_new_client(&self->hsk, + &self->server_addr); + + ASSERT_FALSE(crpc1 == NULL); + ASSERT_FALSE(crpc2 == NULL); + mock_set_ipv6(&self->hsk); + self->hsk.sock.sk_protocol = IPPROTO_TCP; + + /* First try: not quite enough space for 3 packets in GSO. */ + mock_net_device.gso_max_size = mock_mtu - 1 + + 2 * UNIT_TEST_DATA_PER_PACKET; + ASSERT_EQ(0, -homa_message_out_fill(crpc1, + unit_iov_iter((void *) 1000, 10000), 0)); + homa_rpc_unlock(crpc1); + EXPECT_SUBSTR("max_seg_data 1400, max_gso_data 2800", unit_log_get()); + + /* Second try: just barely enough space for 3 packets in GSO. */ + mock_net_device.gso_max_size += 1; + unit_log_clear(); + ASSERT_EQ(0, -homa_message_out_fill(crpc2, + unit_iov_iter((void *) 1000, 10000), 0)); + homa_rpc_unlock(crpc2); + EXPECT_SUBSTR("max_seg_data 1400, max_gso_data 4200", unit_log_get()); +} +TEST_F(homa_outgoing, homa_message_out_fill__gso_geometry_no_hijacking) +{ + struct homa_rpc *crpc1 = homa_rpc_new_client(&self->hsk, + &self->server_addr); + struct homa_rpc *crpc2 = homa_rpc_new_client(&self->hsk, + &self->server_addr); + + ASSERT_FALSE(crpc1 == NULL); + ASSERT_FALSE(crpc2 == NULL); + mock_set_ipv6(&self->hsk); + + /* First try: not quite enough space for 3 packets in GSO. */ + mock_net_device.gso_max_size = mock_mtu - 1 + + 2 * (UNIT_TEST_DATA_PER_PACKET + + sizeof(struct homa_seg_hdr)); + ASSERT_EQ(0, -homa_message_out_fill(crpc1, + unit_iov_iter((void *) 1000, 10000), 0)); + homa_rpc_unlock(crpc1); + EXPECT_SUBSTR("max_seg_data 1400, max_gso_data 2800", unit_log_get()); + + /* Second try: just barely enough space for 3 packets in GSO. 
*/
+	mock_net_device.gso_max_size += 1;
+	unit_log_clear();
+	ASSERT_EQ(0, -homa_message_out_fill(crpc2,
+			unit_iov_iter((void *) 1000, 10000), 0));
+	homa_rpc_unlock(crpc2);
+	EXPECT_SUBSTR("max_seg_data 1400, max_gso_data 4200", unit_log_get());
+}
 TEST_F(homa_outgoing, homa_message_out_fill__gso_force_software)
 {
 	struct homa_rpc *crpc1 = homa_rpc_new_client(&self->hsk,

From 865dad6493bd0dfcac35d1ab6424a8b625f1f19a Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Tue, 17 Dec 2024 15:57:38 -0800
Subject: [PATCH 121/625] Fix issues from checkpatch.pl and kernel-doc

---
 homa_impl.h               |  1 -
 homa_incoming.c           |  8 ++++++-
 homa_plumbing.c           |  4 ++--
 homa_pool.c               | 10 +++++++--
 homa_sock.c               |  7 ++++--
 homa_wire.h               |  4 +++-
 test/unit_homa_incoming.c | 47 ++++++++++++++++++++++++++++++++++++++-
 test/unit_homa_pool.c     | 16 +++++++++++++
 util/strip.py             |  4 +++-
 9 files changed, 90 insertions(+), 11 deletions(-)

diff --git a/homa_impl.h b/homa_impl.h
index a57b5086..e52a7783 100644
--- a/homa_impl.h
+++ b/homa_impl.h
@@ -161,7 +161,6 @@ void homa_throttle_lock_slow(struct homa *homa);
  * and easier to use than sockaddr_storage).
  */
 union sockaddr_in_union {
-
 	/** @sa: Used to access as a generic sockaddr. */
 	struct sockaddr sa;

diff --git a/homa_incoming.c b/homa_incoming.c
index 53ce9526..41d376b3 100644
--- a/homa_incoming.c
+++ b/homa_incoming.c
@@ -519,7 +519,13 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa)
 		/* It isn't safe to process more packets once we've
 		 * released the RPC lock (this should never happen).
 		 */
-		BUG_ON(next);
+		while (next) {
+			WARN_ONCE(next, "%s found extra packets after ACK\n",
+				  __func__);
+			skb = next;
+			next = skb->next;
+			kfree_skb(skb);
+		}
 		break;
 	default:
 		INC_METRIC(unknown_packet_types, 1);
diff --git a/homa_plumbing.c b/homa_plumbing.c
index 93f1b458..49834bb3 100644
--- a/homa_plumbing.c
+++ b/homa_plumbing.c
@@ -1415,8 +1415,8 @@ int homa_err_handler_v4(struct sk_buff *skb, u32 info)
 	iph = (struct iphdr *)(skb->data);
 	daddr = ipv4_to_ipv6(iph->daddr);
 	if (type == ICMP_DEST_UNREACH && code == ICMP_PORT_UNREACH) {
-		struct homa_common_hdr *h = (struct homa_common_hdr *)(skb->data +
-			iph->ihl * 4);
+		struct homa_common_hdr *h = (struct homa_common_hdr *)(skb->data +
+				iph->ihl * 4);
 
 		port = ntohs(h->dport);
 		error = -ENOTCONN;
diff --git a/homa_pool.c b/homa_pool.c
index a21cf9ce..bc2bc2f1 100644
--- a/homa_pool.c
+++ b/homa_pool.c
@@ -371,7 +371,8 @@ int homa_pool_allocate(struct homa_rpc *rpc)
  *		been allocated for the message.
  * @offset:     Offset within @rpc's incoming message.
  * @available:  Will be filled in with the number of bytes of space available
- *		at the returned address.
+ *		at the returned address (could be zero if offset is
+ *		(erroneously) past the end of the message).
  * Return:      The application's virtual address for buffer space corresponding
  *		to @offset in the incoming message for @rpc.
  */
@@ -380,7 +381,12 @@ void *homa_pool_get_buffer(struct homa_rpc *rpc, int offset, int *available)
 	int bpage_index, bpage_offset;
 
 	bpage_index = offset >> HOMA_BPAGE_SHIFT;
-	BUG_ON(bpage_index >= rpc->msgin.num_bpages);
+	if (offset >= rpc->msgin.length) {
+		WARN_ONCE(true, "%s got offset %d >= message length %d\n",
+			  __func__, offset, rpc->msgin.length);
+		*available = 0;
+		return NULL;
+	}
 	bpage_offset = offset & (HOMA_BPAGE_SIZE - 1);
 	*available = (bpage_index < (rpc->msgin.num_bpages - 1)) ?
HOMA_BPAGE_SIZE - bpage_offset diff --git a/homa_sock.c b/homa_sock.c index 02c440a9..4d9a064c 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -264,16 +264,19 @@ void homa_sock_shutdown(struct homa_sock *hsk) wake_up_process(interest->thread); homa_sock_unlock(hsk); +#ifndef __STRIP__ /* See strip.py */ while (!list_empty(&hsk->dead_rpcs)) { homa_rpc_reap(hsk, 1000); -#ifndef __STRIP__ /* See strip.py */ i++; if (i == 5) { tt_record("Freezing because reap seems hung"); tt_freeze(); } -#endif /* See strip.py */ } +#else /* See strip.py */ + while (!list_empty(&hsk->dead_rpcs)) + homa_rpc_reap(hsk, 1000); +#endif /* See strip.py */ if (hsk->buffer_pool) { homa_pool_destroy(hsk->buffer_pool); diff --git a/homa_wire.h b/homa_wire.h index f0d94c48..94dc7c1a 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -286,6 +286,7 @@ _Static_assert(((sizeof(struct homa_data_hdr) - sizeof(struct homa_seg_hdr)) & * after the homa_data_hdr. Note: if the packet is a GSO packet, the result * may include metadata as well as packet data. * @skb: Incoming data packet + * Return: see above */ static inline int homa_data_len(struct sk_buff *skb) { @@ -382,7 +383,7 @@ _Static_assert(sizeof(struct homa_unknown_hdr) <= HOMA_MAX_HEADER, "homa_unknown_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** - * struct busy_header - Wire format for BUSY packets. + * struct homa_busy_hdr - Wire format for BUSY packets. * * These packets tell the recipient that the sender is still alive (even if * it isn't sending data expected by the recipient). @@ -473,6 +474,7 @@ _Static_assert(sizeof(struct homa_ack_hdr) <= HOMA_MAX_HEADER, * is network-encoded), return the decoded id we should use for that * RPC on this machine. * @sender_id: RPC id from an incoming packet, such as h->common.sender_id + * Return: see above */ static inline __u64 homa_local_id(__be64 sender_id) { diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 5e5617f7..bdd956ff 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -772,7 +772,27 @@ TEST_F(homa_incoming, homa_copy_to_user__many_chunks_for_one_skb) "skb_copy_datagram_iter: 440 bytes to 0x1000a00: 103560-103999", unit_log_get()); } -TEST_F(homa_incoming, homa_copy_to_user__error_in_import_single_range) +TEST_F(homa_incoming, homa_copy_to_user__skb_data_extends_past_message_end) +{ + struct homa_data_hdr *h; + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, self->client_id, + 1000, 4000); + ASSERT_NE(NULL, crpc); + self->data.message_length = htonl(4000); + homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, + 3000, 101000), crpc); + + unit_log_clear(); + mock_copy_to_user_dont_copy = -1; + h = (struct homa_data_hdr *)skb_peek(&crpc->msgin.packets)->data; + h->seg.offset = htonl(4000); + EXPECT_EQ(0, -homa_copy_to_user(crpc)); + EXPECT_STREQ("", unit_log_get()); +} +TEST_F(homa_incoming, homa_copy_to_user__error_in_import_ubuf) { struct homa_rpc *crpc; @@ -1051,6 +1071,31 @@ TEST_F(homa_incoming, homa_dispatch_pkts__reset_counters) EXPECT_EQ(5, crpc->silent_ticks); EXPECT_EQ(0, crpc->peer->outstanding_resends); } +TEST_F(homa_incoming, homa_dispatch_pkts__multiple_ack_packets) +{ + struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_OUTGOING, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 100, 3000); + struct sk_buff *skb, *skb2, *skb3; + struct homa_ack_hdr ack; + + ASSERT_NE(NULL, srpc); + ack.common = self->data.common; + 
ack.common.type = ACK; + ack.common.sender_id += 100; + ack.num_acks = htons(1); + ack.acks[0].server_port = htons(self->server_port); + ack.acks[0].client_id = cpu_to_be64(self->client_id + 4); + skb = mock_skb_new(self->client_ip, &ack.common, 0, 0); + skb2 = mock_skb_new(self->client_ip, &ack.common, 0, 0); + skb3 = mock_skb_new(self->client_ip, &ack.common, 0, 0); + skb->next = skb2; + skb2->next = skb3; + + unit_log_clear(); + homa_dispatch_pkts(skb, &self->homa); + EXPECT_SUBSTR("ack 1239", unit_log_get()); +} TEST_F(homa_incoming, homa_dispatch_pkts__unknown_type) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index 6a6ca8f5..b042a7d5 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -453,6 +453,22 @@ TEST_F(homa_pool, homa_pool_get_buffer) EXPECT_EQ((150000 & (HOMA_BPAGE_SIZE-1)) - 100, available); EXPECT_EQ((void *) (pool->region + 2*HOMA_BPAGE_SIZE + 100), buffer); } +TEST_F(homa_pool, homa_pool_get_buffer__bad_offset) +{ + struct homa_rpc *crpc; + int available; + void *buffer; + + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 150000); + ASSERT_NE(NULL, crpc); + buffer = homa_pool_get_buffer(crpc, 149900, &available); + EXPECT_NE(NULL, buffer); + EXPECT_EQ(100, available); + buffer = homa_pool_get_buffer(crpc, 150000, &available); + EXPECT_EQ(NULL, buffer); + EXPECT_EQ(0, available); +} TEST_F(homa_pool, homa_pool_release_buffers__basics) { diff --git a/util/strip.py b/util/strip.py index 0357b03d..a9f721a9 100755 --- a/util/strip.py +++ b/util/strip.py @@ -206,7 +206,9 @@ def scan(file, alt_mode): non_comment = pline non_comment = non_comment.strip() - # Strip groups of lines labeled with special '#if' + # Strip groups of lines labeled with special '#ifndef __STRIP__' + # Note: don't do brace elimination here: this allows greater control + # to the __STRIP__ code. if in_labeled_skip != None: if line.startswith('#endif /* See strip.py */'): in_labeled_skip = None From 559519aec3816b44ed64a0a304beedb6b39e9650 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 17 Dec 2024 16:21:01 -0800 Subject: [PATCH 122/625] Encapsulate access to interest->ready_rpc New functions homa_interest_get_rpc and homa_interest_set_rpc. --- homa_impl.h | 30 ++++++++++++++++++++++++++-- homa_incoming.c | 13 ++++++------ test/unit_homa_incoming.c | 42 +++++++++++++-------------------------- 3 files changed, 48 insertions(+), 37 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index e52a7783..efc17318 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -186,10 +186,13 @@ struct homa_interest { /** * @ready_rpc: This is actually a (struct homa_rpc *) identifying the * RPC that was found; NULL if no RPC has been found yet. This - * variable is used for synchronization to handoff the RPC, and - * must be set only after @locked is set. + * variable is used for lock-free synchronization to handoff a + * ready RPC to a receiving thread; read and write with functions + * below. */ atomic_long_t ready_rpc; +_Static_assert(sizeof(atomic_long_t) >= sizeof(struct homa_rpc *), + "atomic_long_t isn't large enough to store a homa_rpc *"); /** * @locked: Nonzero means that @ready_rpc is locked; only valid @@ -252,6 +255,29 @@ enum homa_freeze_type { NEED_ACK_MISSING_DATA = 6, }; +/** + * homa_interest_get_rpc() - Return the ready RPC stored in an interest, + * if there is one. + * @interest: Struct to check + * Return: the ready RPC, or NULL if none. 
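+ *
+ * Note: homa_interest_set_rpc below publishes the pointer with
+ * atomic_long_set_release, and requires @locked to be filled in
+ * first, so a non-NULL result here indicates a completed handoff.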
+ */ +static inline struct homa_rpc *homa_interest_get_rpc(struct homa_interest *interest) +{ + return (struct homa_rpc *)atomic_long_read(&interest->ready_rpc); +} + +/** + * homa_interest_set_rpc() - Hand off a ready RPC to an interest from a + * waiting receiver thread. Note: interest->locked must be set before + * calling this function. + * @interest: Owned by a thread that is ready to receive the RPC. + */ +static inline void homa_interest_set_rpc(struct homa_interest *interest, + struct homa_rpc *rpc) +{ + atomic_long_set_release(&interest->ready_rpc, (long)rpc); +} + /** * struct homa - Overall information about the Homa protocol implementation. * diff --git a/homa_incoming.c b/homa_incoming.c index 41d376b3..d11703b4 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -1192,7 +1192,7 @@ int homa_register_interests(struct homa_interest *interest, interest->locked = 1; } atomic_andnot(RPC_HANDING_OFF, &rpc->flags); - atomic_long_set_release(&interest->ready_rpc, (long)rpc); + homa_interest_set_rpc(interest, rpc); return 0; } @@ -1227,7 +1227,7 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, */ while (1) { error = homa_register_interests(&interest, hsk, flags, id); - rpc = (struct homa_rpc *)atomic_long_read(&interest.ready_rpc); + rpc = homa_interest_get_rpc(&interest); if (rpc) goto found_rpc; if (error < 0) { @@ -1241,8 +1241,7 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, while (1) { int reaper_result; - rpc = (struct homa_rpc *)atomic_long_read(&interest - .ready_rpc); + rpc = homa_interest_get_rpc(&interest); if (rpc) { tt_record1("received RPC handoff while reaping, id %d", rpc->id); @@ -1301,7 +1300,7 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, /* Now it's time to sleep. */ per_cpu(homa_offload_core, interest.core).last_app_active = now; set_current_state(TASK_INTERRUPTIBLE); - rpc = (struct homa_rpc *)atomic_long_read(&interest.ready_rpc); + rpc = homa_interest_get_rpc(&interest); if (!rpc && !hsk->shutdown) { __u64 end; __u64 start = sched_clock(); @@ -1345,7 +1344,7 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, * this could have happened anytime up until we reset the * interests above). */ - rpc = (struct homa_rpc *)atomic_long_read(&interest.ready_rpc); + rpc = homa_interest_get_rpc(&interest); if (rpc) { tt_record2("homa_wait_for_message found rpc id %d, pid %d", rpc->id, current->pid); @@ -1491,7 +1490,7 @@ void homa_rpc_handoff(struct homa_rpc *rpc) INC_METRIC(handoffs_thread_waiting, 1); tt_record3("homa_rpc_handoff handing off id %d to pid %d on core %d", rpc->id, interest->thread->pid, task_cpu(interest->thread)); - atomic_long_set_release(&interest->ready_rpc, (long)rpc); + homa_interest_set_rpc(interest, rpc); /* Update the last_app_active time for the thread's core, so Homa * will try to avoid doing any work there. 
diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index bdd956ff..0cf70863 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -2137,8 +2137,7 @@ TEST_F(homa_incoming, homa_register_interests__return_response_by_id) result = homa_register_interests(&self->interest, &self->hsk, 0, self->client_id); EXPECT_EQ(0, result); - EXPECT_EQ(crpc, (struct homa_rpc *) - atomic_long_read(&self->interest.ready_rpc)); + EXPECT_EQ(crpc, homa_interest_get_rpc(&self->interest)); homa_rpc_unlock(crpc); } TEST_F(homa_incoming, homa_register_interests__socket_shutdown) @@ -2162,8 +2161,7 @@ TEST_F(homa_incoming, homa_register_interests__specified_id_has_packets) result = homa_register_interests(&self->interest, &self->hsk, HOMA_RECVMSG_REQUEST, crpc->id); EXPECT_EQ(0, result); - EXPECT_EQ(crpc, (struct homa_rpc *) - atomic_long_read(&self->interest.ready_rpc)); + EXPECT_EQ(crpc, homa_interest_get_rpc(&self->interest)); homa_rpc_unlock(crpc); } TEST_F(homa_incoming, homa_register_interests__specified_id_has_error) @@ -2179,8 +2177,7 @@ TEST_F(homa_incoming, homa_register_interests__specified_id_has_error) result = homa_register_interests(&self->interest, &self->hsk, HOMA_RECVMSG_REQUEST|HOMA_RECVMSG_NONBLOCKING, crpc->id); EXPECT_EQ(0, result); - EXPECT_EQ(crpc, (struct homa_rpc *) - atomic_long_read(&self->interest.ready_rpc)); + EXPECT_EQ(crpc, homa_interest_get_rpc(&self->interest)); homa_rpc_unlock(crpc); } TEST_F(homa_incoming, homa_register_interests__specified_id_not_ready) @@ -2194,8 +2191,7 @@ TEST_F(homa_incoming, homa_register_interests__specified_id_not_ready) result = homa_register_interests(&self->interest, &self->hsk, HOMA_RECVMSG_REQUEST, crpc->id); EXPECT_EQ(0, result); - EXPECT_EQ(NULL, (struct homa_rpc *) - atomic_long_read(&self->interest.ready_rpc)); + EXPECT_EQ(NULL, homa_interest_get_rpc(&self->interest)); } TEST_F(homa_incoming, homa_register_interests__return_queued_response) { @@ -2208,8 +2204,7 @@ TEST_F(homa_incoming, homa_register_interests__return_queued_response) result = homa_register_interests(&self->interest, &self->hsk, HOMA_RECVMSG_REQUEST|HOMA_RECVMSG_RESPONSE, 0); EXPECT_EQ(0, result); - EXPECT_EQ(crpc, (struct homa_rpc *) - atomic_long_read(&self->interest.ready_rpc)); + EXPECT_EQ(crpc, homa_interest_get_rpc(&self->interest)); EXPECT_EQ(LIST_POISON1, self->interest.request_links.next); EXPECT_EQ(LIST_POISON1, self->interest.response_links.next); homa_rpc_unlock(crpc); @@ -2225,8 +2220,7 @@ TEST_F(homa_incoming, homa_register_interests__return_queued_request) result = homa_register_interests(&self->interest, &self->hsk, HOMA_RECVMSG_REQUEST|HOMA_RECVMSG_RESPONSE, 0); EXPECT_EQ(0, result); - EXPECT_EQ(srpc, (struct homa_rpc *) - atomic_long_read(&self->interest.ready_rpc)); + EXPECT_EQ(srpc, homa_interest_get_rpc(&self->interest)); EXPECT_EQ(LIST_POISON1, self->interest.request_links.next); EXPECT_EQ(LIST_POISON1, self->interest.response_links.next); homa_rpc_unlock(srpc); @@ -2246,8 +2240,7 @@ TEST_F(homa_incoming, homa_register_interests__call_sk_data_ready) result = homa_register_interests(&self->interest, &self->hsk, HOMA_RECVMSG_REQUEST|HOMA_RECVMSG_RESPONSE, 0); EXPECT_EQ(0, result); - EXPECT_EQ(srpc1, (struct homa_rpc *) - atomic_long_read(&self->interest.ready_rpc)); + EXPECT_EQ(srpc1, homa_interest_get_rpc(&self->interest)); EXPECT_STREQ("sk->sk_data_ready invoked", unit_log_get()); homa_rpc_unlock(srpc1); @@ -2257,8 +2250,7 @@ TEST_F(homa_incoming, homa_register_interests__call_sk_data_ready) 
HOMA_RECVMSG_REQUEST|HOMA_RECVMSG_RESPONSE |HOMA_RECVMSG_NONBLOCKING, 0); EXPECT_EQ(0, result); - EXPECT_EQ(srpc2, (struct homa_rpc *) - atomic_long_read(&self->interest.ready_rpc)); + EXPECT_EQ(srpc2, homa_interest_get_rpc(&self->interest)); EXPECT_STREQ("", unit_log_get()); homa_rpc_unlock(srpc2); } @@ -2600,8 +2592,7 @@ TEST_F(homa_incoming, homa_rpc_handoff__handoff_already_in_progress) atomic_or(RPC_HANDING_OFF, &crpc->flags); homa_rpc_handoff(crpc); crpc->interest = NULL; - EXPECT_EQ(NULL, (struct homa_rpc *) - atomic_long_read(&interest.ready_rpc)); + EXPECT_EQ(NULL, homa_interest_get_rpc(&interest)); EXPECT_STREQ("", unit_log_get()); atomic_andnot(RPC_HANDING_OFF, &crpc->flags); } @@ -2630,8 +2621,7 @@ TEST_F(homa_incoming, homa_rpc_handoff__rpc_already_enqueued) atomic_or(RPC_HANDING_OFF, &crpc->flags); homa_rpc_handoff(crpc); crpc->interest = NULL; - EXPECT_EQ(NULL, (struct homa_rpc *) - atomic_long_read(&interest.ready_rpc)); + EXPECT_EQ(NULL, homa_interest_get_rpc(&interest)); EXPECT_STREQ("", unit_log_get()); atomic_andnot(RPC_HANDING_OFF, &crpc->flags); } @@ -2652,8 +2642,7 @@ TEST_F(homa_incoming, homa_rpc_handoff__interest_on_rpc) crpc->interest = &interest; homa_rpc_handoff(crpc); crpc->interest = NULL; - EXPECT_EQ(crpc, (struct homa_rpc *) - atomic_long_read(&interest.ready_rpc)); + EXPECT_EQ(crpc, homa_interest_get_rpc(&interest)); EXPECT_EQ(NULL, interest.reg_rpc); EXPECT_EQ(NULL, crpc->interest); EXPECT_STREQ("wake_up_process pid 0", unit_log_get()); @@ -2674,8 +2663,7 @@ TEST_F(homa_incoming, homa_rpc_handoff__response_interests) interest.thread = &mock_task; list_add_tail(&interest.response_links, &self->hsk.response_interests); homa_rpc_handoff(crpc); - EXPECT_EQ(crpc, (struct homa_rpc *) - atomic_long_read(&interest.ready_rpc)); + EXPECT_EQ(crpc, homa_interest_get_rpc(&interest)); EXPECT_EQ(0, unit_list_length(&self->hsk.response_interests)); EXPECT_STREQ("wake_up_process pid 0", unit_log_get()); atomic_andnot(RPC_HANDING_OFF, &crpc->flags); @@ -2705,8 +2693,7 @@ TEST_F(homa_incoming, homa_rpc_handoff__request_interests) interest.thread = &mock_task; list_add_tail(&interest.request_links, &self->hsk.request_interests); homa_rpc_handoff(srpc); - EXPECT_EQ(srpc, (struct homa_rpc *) - atomic_long_read(&interest.ready_rpc)); + EXPECT_EQ(srpc, homa_interest_get_rpc(&interest)); EXPECT_EQ(0, unit_list_length(&self->hsk.request_interests)); EXPECT_STREQ("wake_up_process pid 0", unit_log_get()); atomic_andnot(RPC_HANDING_OFF, &srpc->flags); @@ -2746,8 +2733,7 @@ TEST_F(homa_incoming, homa_rpc_handoff__detach_interest) homa_rpc_handoff(crpc); crpc->interest = NULL; - EXPECT_EQ(crpc, (struct homa_rpc *) - atomic_long_read(&interest.ready_rpc)); + EXPECT_EQ(crpc, homa_interest_get_rpc(&interest)); EXPECT_EQ(NULL, interest.reg_rpc); EXPECT_EQ(NULL, crpc->interest); EXPECT_EQ(0, unit_list_length(&self->hsk.response_interests)); From 50b433fb391b4c7f22b8a5e360d4f439e97631ee Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 17 Dec 2024 16:36:34 -0800 Subject: [PATCH 123/625] Improve comments, arrange for debugging code to be stripped. --- homa_impl.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index efc17318..65e76e66 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -288,7 +288,8 @@ struct homa { /** * @next_outgoing_id: Id to use for next outgoing RPC request. * This is always even: it's used only to generate client-side ids. - * Accessed without locks. + * Accessed without locks. 
Note: RPC ids are unique within a + * single client machine. */ atomic64_t next_outgoing_id; @@ -746,7 +747,7 @@ struct homa { int max_gso_size; /** - * @gso_force_software: A non-zero value will cause Home to perform + * @gso_force_software: A non-zero value will cause Homa to perform * segmentation in software using GSO; zero means ask the NIC to * perform TSO. Set externally via sysctl. */ @@ -888,12 +889,14 @@ struct homa { */ int next_id; +#ifndef __STRIP__ /* See strip.py */ /** * @temp: the values in this array can be read and written with sysctl. * They have no officially defined purpose, and are available for * short-term use during testing. */ int temp[4]; +#endif /* See strip.py */ }; /** @@ -970,6 +973,9 @@ static inline struct sk_buff **homa_next_skb(struct sk_buff *skb) */ static inline void homa_set_doff(struct homa_data_hdr *h, int size) { + /* Drop the 2 low-order bits from size and set the 4 high-order + * bits of doff from what's left. + */ h->common.doff = size << 2; } From 3bf03950fa2b7a2a4f34c55d49f2d93be1e8b356 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 17 Dec 2024 21:40:53 -0800 Subject: [PATCH 124/625] Refactor a few functions * Eliminate homa_next_skb (not used, incorrect) * Replace ipv4_to_ipv6 with Linux function ipv6_addr_set_v4mapped * Replace is_mapped_ipv4 with Linux function ipv6_addr_v4mapped --- homa_impl.h | 68 ++++++++++--------------------------------------- homa_plumbing.c | 2 +- 2 files changed, 14 insertions(+), 56 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 65e76e66..341d0fdb 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -943,24 +943,7 @@ struct homa_skb_info { */ static inline struct homa_skb_info *homa_get_skb_info(struct sk_buff *skb) { - return (struct homa_skb_info *)(skb_end_pointer(skb) - - sizeof(struct homa_skb_info)); -} - -/** - * homa_next_skb() - Compute address of Homa's private link field in @skb. - * @skb: Socket buffer containing private link field. - * Return: address of Homa's private link field for @skb. - * - * Homa needs to keep a list of buffers in a message, but it can't use the - * links built into sk_buffs because Homa wants to retain its list even - * after sending the packet, and the built-in links get used during sending. - * Thus we allocate extra space at the very end of the packet's data - * area to hold a forward pointer for a list. - */ -static inline struct sk_buff **homa_next_skb(struct sk_buff *skb) -{ - return (struct sk_buff **)(skb_end_pointer(skb) - sizeof(char *)); + return (struct homa_skb_info *)(skb_end_pointer(skb)) - 1; } /** @@ -1009,23 +992,6 @@ static inline bool skb_is_ipv6(const struct sk_buff *skb) return ipv6_hdr(skb)->version == 6; } -/** - * ipv4_to_ipv6() - Given an IPv4 address, return an equivalent IPv6 address - * (an IPv4-mapped one). - * @ip4: IPv4 address, in network byte order. - * Return: IPv6 address that is equivalent to @ip4. - */ -static inline struct in6_addr ipv4_to_ipv6(__be32 ip4) -{ - struct in6_addr ret = {}; - - if (ip4 == htonl(INADDR_ANY)) - return in6addr_any; - ret.in6_u.u6_addr32[2] = htonl(0xffff); - ret.in6_u.u6_addr32[3] = ip4; - return ret; -} - /** * ipv6_to_ipv4() - Given an IPv6 address produced by ipv4_to_ipv6, return * the original IPv4 address (in network byte order). @@ -1047,13 +1013,14 @@ static inline __be32 ipv6_to_ipv4(const struct in6_addr ip6) static inline struct in6_addr canonical_ipv6_addr(const union sockaddr_in_union *addr) { + struct in6_addr mapped; if (addr) { - return (addr->sa.sa_family == AF_INET6) - ? 
addr->in6.sin6_addr - : ipv4_to_ipv6(addr->in4.sin_addr.s_addr); - } else { - return in6addr_any; + if (addr->sa.sa_family == AF_INET6) + return addr->in6.sin6_addr; + ipv6_addr_set_v4mapped(addr->in4.sin_addr.s_addr, &mapped); + return mapped; } + return in6addr_any; } /** @@ -1066,21 +1033,12 @@ static inline struct in6_addr canonical_ipv6_addr(const union sockaddr_in_union */ static inline struct in6_addr skb_canonical_ipv6_saddr(struct sk_buff *skb) { - return skb_is_ipv6(skb) ? ipv6_hdr(skb)->saddr - : ipv4_to_ipv6(ip_hdr(skb)->saddr); -} + struct in6_addr mapped; -/** - * is_mapped_ipv4() - Return true if an IPv6 address is actually an - * IPv4-mapped address, false otherwise. - * @x: The address to check. - * Return: see above. - */ -static inline bool is_mapped_ipv4(const struct in6_addr x) -{ - return ((x.in6_u.u6_addr32[0] == 0) && - (x.in6_u.u6_addr32[1] == 0) && - (x.in6_u.u6_addr32[2] == htonl(0xffff))); + if (skb_is_ipv6(skb)) + return ipv6_hdr(skb)->saddr; + ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &mapped); + return mapped; } static inline bool is_homa_pkt(struct sk_buff *skb) @@ -1100,7 +1058,7 @@ static inline bool is_homa_pkt(struct sk_buff *skb) */ static inline __u32 tt_addr(const struct in6_addr x) { - return is_mapped_ipv4(x) ? ntohl(x.in6_u.u6_addr32[3]) + return ipv6_addr_v4mapped(&x) ? ntohl(x.in6_u.u6_addr32[3]) : (x.in6_u.u6_addr32[3] ? ntohl(x.in6_u.u6_addr32[3]) : ntohl(x.in6_u.u6_addr32[1])); } diff --git a/homa_plumbing.c b/homa_plumbing.c index 49834bb3..eceae70c 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1413,7 +1413,7 @@ int homa_err_handler_v4(struct sk_buff *skb, u32 info) int port = 0; iph = (struct iphdr *)(skb->data); - daddr = ipv4_to_ipv6(iph->daddr); + ipv6_addr_set_v4mapped(iph->daddr, &daddr); if (type == ICMP_DEST_UNREACH && code == ICMP_PORT_UNREACH) { struct homa_common_hdr *h = (struct homa_common_hdr *)(skb->data + iph->ihl * 4); From ae7abad2bb489bd127bb9a9baf7a6de349daf8b2 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 19 Dec 2024 09:11:59 -0800 Subject: [PATCH 125/625] Refactor RPC handoff in struct homa_interest Replace @ready_rpc with @rpc and @rpc_ready (cleaner). --- homa_impl.h | 40 +++++++++++++++++++++++++--------------- homa_incoming.c | 14 +++++++------- test/unit_homa_rpc.c | 2 +- 3 files changed, 33 insertions(+), 23 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 341d0fdb..15e61318 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -184,19 +184,21 @@ struct homa_interest { struct task_struct *thread; /** - * @ready_rpc: This is actually a (struct homa_rpc *) identifying the - * RPC that was found; NULL if no RPC has been found yet. This - * variable is used for lock-free synchronization to handoff a - * ready RPC to a receiving thread; read and write with functions - * below. + * @rpc_ready: Non-zero means an appropriate incoming message has + * been assigned to this interest, and @rpc and @locked are valid + * (they must be set before setting this variable). */ - atomic_long_t ready_rpc; -_Static_assert(sizeof(atomic_long_t) >= sizeof(struct homa_rpc *), - "atomic_long_t isn't large enough to store a homa_rpc *"); + atomic_t rpc_ready; /** - * @locked: Nonzero means that @ready_rpc is locked; only valid - * if @ready_rpc is non-NULL. + * @rpc: If @rpc_ready is non-zero, points to an RPC with a ready + * incoming message that meets the requirements of this interest. + */ + struct homa_rpc *rpc; + + /** + * @locked: Nonzero means that @rpc is locked; only valid if + * @rpc_ready is non-zero. 
*/ int locked; @@ -234,7 +236,8 @@ _Static_assert(sizeof(atomic_long_t) >= sizeof(struct homa_rpc *), static inline void homa_interest_init(struct homa_interest *interest) { interest->thread = current; - atomic_long_set(&interest->ready_rpc, 0); + atomic_set(&interest->rpc_ready, 0); + interest->rpc = NULL; interest->locked = 0; interest->core = raw_smp_processor_id(); interest->reg_rpc = NULL; @@ -263,19 +266,26 @@ enum homa_freeze_type { */ static inline struct homa_rpc *homa_interest_get_rpc(struct homa_interest *interest) { - return (struct homa_rpc *)atomic_long_read(&interest->ready_rpc); + if (atomic_read(&interest->rpc_ready)) + return interest->rpc; + return NULL; } /** * homa_interest_set_rpc() - Hand off a ready RPC to an interest from a * waiting receiver thread. Note: interest->locked must be set before * calling this function. - * @interest: Owned by a thread that is ready to receive the RPC. + * @interest: Belongs to a thread that is waiting for an incoming message. + * @rpc: Ready rpc to assign to @interest. + * @locked: 1 means @rpc is locked, 0 means unlocked. */ static inline void homa_interest_set_rpc(struct homa_interest *interest, - struct homa_rpc *rpc) + struct homa_rpc *rpc, + int locked) { - atomic_long_set_release(&interest->ready_rpc, (long)rpc); + interest->rpc = rpc; + interest->locked = locked; + atomic_set_release(&interest->rpc_ready, 1); } /** diff --git a/homa_incoming.c b/homa_incoming.c index d11703b4..3fd78a75 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -1105,9 +1105,9 @@ int homa_register_interests(struct homa_interest *interest, struct homa_sock *hsk, int flags, __u64 id) { struct homa_rpc *rpc = NULL; + int locked = 1; homa_interest_init(interest); - interest->locked = 1; if (id != 0) { if (!homa_is_client(id)) return -EINVAL; @@ -1139,7 +1139,7 @@ int homa_register_interests(struct homa_interest *interest, homa_rpc_unlock(rpc); } - interest->locked = 0; + locked = 0; if (flags & HOMA_RECVMSG_RESPONSE) { if (!list_empty(&hsk->ready_responses)) { rpc = list_first_entry(&hsk->ready_responses, @@ -1185,14 +1185,14 @@ int homa_register_interests(struct homa_interest *interest, */ atomic_or(RPC_HANDING_OFF, &rpc->flags); homa_sock_unlock(hsk); - if (!interest->locked) { + if (!locked) { atomic_or(APP_NEEDS_LOCK, &rpc->flags); homa_rpc_lock(rpc, "homa_register_interests"); atomic_andnot(APP_NEEDS_LOCK, &rpc->flags); - interest->locked = 1; + locked = 1; } atomic_andnot(RPC_HANDING_OFF, &rpc->flags); - homa_interest_set_rpc(interest, rpc); + homa_interest_set_rpc(interest, rpc, locked); return 0; } @@ -1273,7 +1273,7 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, while (1) { __u64 blocked; - rpc = (struct homa_rpc *)atomic_long_read(&interest.ready_rpc); + rpc = homa_interest_get_rpc(&interest); if (rpc) { tt_record3("received RPC handoff while polling, id %d, socket %d, pid %d", rpc->id, hsk->port, @@ -1490,7 +1490,7 @@ void homa_rpc_handoff(struct homa_rpc *rpc) INC_METRIC(handoffs_thread_waiting, 1); tt_record3("homa_rpc_handoff handing off id %d to pid %d on core %d", rpc->id, interest->thread->pid, task_cpu(interest->thread)); - homa_interest_set_rpc(interest, rpc); + homa_interest_set_rpc(interest, rpc, 0); /* Update the last_app_active time for the thread's core, so Homa * will try to avoid doing any work there. 
diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 7033e35d..c68dbfc0 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -419,7 +419,7 @@ TEST_F(homa_rpc, homa_rpc_free__wakeup_interest) struct homa_interest interest = {}; ASSERT_NE(NULL, crpc); - atomic_long_set(&interest.ready_rpc, 0); + atomic_set(&interest.rpc_ready, 0); interest.reg_rpc = crpc; crpc->interest = &interest; unit_log_clear(); From 83670f5602f98d01f881293fc458281f096f9bc7 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 19 Dec 2024 09:46:52 -0800 Subject: [PATCH 126/625] Eliminate use of LIST_POISON in homa_interest structs (Use list_del_init instead of list_del, so that "inserted then deleted" state is the same as "never inserted") --- homa_impl.h | 4 ++-- homa_incoming.c | 24 ++++++++++++------------ test/unit_homa_incoming.c | 8 ++++---- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 15e61318..f23198a6 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -241,8 +241,8 @@ static inline void homa_interest_init(struct homa_interest *interest) interest->locked = 0; interest->core = raw_smp_processor_id(); interest->reg_rpc = NULL; - interest->request_links.next = LIST_POISON1; - interest->response_links.next = LIST_POISON1; + INIT_LIST_HEAD(&interest->request_links); + INIT_LIST_HEAD(&interest->response_links); } /** diff --git a/homa_incoming.c b/homa_incoming.c index 3fd78a75..a5485dc2 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -1162,8 +1162,8 @@ int homa_register_interests(struct homa_interest *interest, /* Make sure the interest isn't on the response list; * otherwise it might receive a second RPC. */ - if (interest->response_links.next != LIST_POISON1) - list_del(&interest->response_links); + if (!list_empty(&interest->response_links)) + list_del_init(&interest->response_links); goto claim_rpc; } list_add(&interest->request_links, &hsk->request_interests); @@ -1328,15 +1328,15 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, */ UNIT_HOOK("found_rpc"); if (interest.reg_rpc || - interest.request_links.next != LIST_POISON1 || - interest.response_links.next != LIST_POISON1) { + !list_empty(&interest.request_links) || + !list_empty(&interest.response_links)) { homa_sock_lock(hsk, "homa_wait_for_message"); if (interest.reg_rpc) interest.reg_rpc->interest = NULL; - if (interest.request_links.next != LIST_POISON1) - list_del(&interest.request_links); - if (interest.response_links.next != LIST_POISON1) - list_del(&interest.response_links); + if (!list_empty(&interest.request_links)) + list_del_init(&interest.request_links); + if (!list_empty(&interest.response_links)) + list_del_init(&interest.response_links); homa_sock_unlock(hsk); } @@ -1507,10 +1507,10 @@ void homa_rpc_handoff(struct homa_rpc *rpc) interest->reg_rpc->interest = NULL; interest->reg_rpc = NULL; } - if (interest->request_links.next != LIST_POISON1) - list_del(&interest->request_links); - if (interest->response_links.next != LIST_POISON1) - list_del(&interest->response_links); + if (!list_empty(&interest->request_links)) + list_del_init(&interest->request_links); + if (!list_empty(&interest->response_links)) + list_del_init(&interest->response_links); wake_up_process(interest->thread); } diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 0cf70863..42aa7daf 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -2205,8 +2205,8 @@ TEST_F(homa_incoming, homa_register_interests__return_queued_response) 
HOMA_RECVMSG_REQUEST|HOMA_RECVMSG_RESPONSE, 0); EXPECT_EQ(0, result); EXPECT_EQ(crpc, homa_interest_get_rpc(&self->interest)); - EXPECT_EQ(LIST_POISON1, self->interest.request_links.next); - EXPECT_EQ(LIST_POISON1, self->interest.response_links.next); + EXPECT_TRUE(list_empty(&self->interest.request_links)); + EXPECT_TRUE(list_empty(&self->interest.response_links)); homa_rpc_unlock(crpc); } TEST_F(homa_incoming, homa_register_interests__return_queued_request) @@ -2221,8 +2221,8 @@ TEST_F(homa_incoming, homa_register_interests__return_queued_request) HOMA_RECVMSG_REQUEST|HOMA_RECVMSG_RESPONSE, 0); EXPECT_EQ(0, result); EXPECT_EQ(srpc, homa_interest_get_rpc(&self->interest)); - EXPECT_EQ(LIST_POISON1, self->interest.request_links.next); - EXPECT_EQ(LIST_POISON1, self->interest.response_links.next); + EXPECT_TRUE(list_empty(&self->interest.request_links)); + EXPECT_TRUE(list_empty(&self->interest.response_links)); homa_rpc_unlock(srpc); } TEST_F(homa_incoming, homa_register_interests__call_sk_data_ready) From 4a851f548deb527fb00bb514ff0688563878e8b9 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 19 Dec 2024 10:29:01 -0800 Subject: [PATCH 127/625] Replace uint64_t with __u64 in homa.h and homa_api.c --- homa.h | 28 ++++++++++++++-------------- homa_api.c | 18 +++++++++--------- util/cp_node.cc | 6 +++--- util/homa_test.cc | 22 +++++++++++----------- util/server.cc | 4 ++-- 5 files changed, 39 insertions(+), 39 deletions(-) diff --git a/homa.h b/homa.h index f1100238..573b6681 100644 --- a/homa.h +++ b/homa.h @@ -60,14 +60,14 @@ struct homa_sendmsg_args { * id. If the message is a request, then the value is modified to * hold the id of the new RPC. */ - uint64_t id; + __u64 id; /** * @completion_cookie: (in) Used only for request messages; will be * returned by recvmsg when the RPC completes. Typically used to * locate app-specific info about the RPC. */ - uint64_t completion_cookie; + __u64 completion_cookie; }; #if !defined(__cplusplus) @@ -86,27 +86,27 @@ struct homa_recvmsg_args { * @id: (in/out) Initially specifies the id of the desired RPC, or 0 * if any RPC is OK; returns the actual id received. */ - uint64_t id; + __u64 id; /** * @completion_cookie: (out) If the incoming message is a response, * this will return the completion cookie specified when the * request was sent. For requests this will always be zero. */ - uint64_t completion_cookie; + __u64 completion_cookie; /** * @flags: (in) OR-ed combination of bits that control the operation. * See below for values. */ - uint32_t flags; + __u32 flags; /** * @num_bpages: (in/out) Number of valid entries in @bpage_offsets. * Passes in bpages from previous messages that can now be * recycled; returns bpages from the new message. */ - uint32_t num_bpages; + __u32 num_bpages; /** * @bpage_offsets: (in/out) Each entry is an offset into the buffer @@ -118,7 +118,7 @@ struct homa_recvmsg_args { * these bpages and must eventually return them to Homa, using * bpage_offsets in a future recvmsg invocation. */ - uint32_t bpage_offsets[HOMA_MAX_BPAGES]; + __u32 bpage_offsets[HOMA_MAX_BPAGES]; }; #if !defined(__cplusplus) @@ -141,7 +141,7 @@ _Static_assert(sizeof(struct homa_recvmsg_args) <= 88, */ struct homa_abort_args { /** @id: Id of RPC to abort, or zero to abort all RPCs on socket. 
*/ - uint64_t id; + __u64 id; /** * @error: Zero means destroy and free RPCs; nonzero means complete @@ -150,7 +150,7 @@ struct homa_abort_args { int error; int _pad1; - uint64_t _pad2[2]; + __u64 _pad2[2]; }; #if !defined(__cplusplus) @@ -189,19 +189,19 @@ struct homa_rcvbuf_args { #define HOMAIOCFREEZE _IO(0x89, 0xef) #ifndef __STRIP__ /* See strip.py */ -int homa_abort(int sockfd, uint64_t id, int error); +int homa_abort(int sockfd, __u64 id, int error); int homa_send(int sockfd, const void *message_buf, size_t length, const struct sockaddr *dest_addr, - uint32_t addrlen, uint64_t *id, uint64_t completion_cookie); + __u32 addrlen, __u64 *id, __u64 completion_cookie); int homa_sendv(int sockfd, const struct iovec *iov, int iovcnt, const struct sockaddr *dest_addr, - uint32_t addrlen, uint64_t *id, uint64_t completion_cookie); + __u32 addrlen, __u64 *id, __u64 completion_cookie); ssize_t homa_reply(int sockfd, const void *message_buf, size_t length, const struct sockaddr *dest_addr, - uint32_t addrlen, uint64_t id); + __u32 addrlen, __u64 id); ssize_t homa_replyv(int sockfd, const struct iovec *iov, int iovcnt, const struct sockaddr *dest_addr, - uint32_t addrlen, uint64_t id); + __u32 addrlen, __u64 id); #endif /* See strip.py */ #ifdef __cplusplus diff --git a/homa_api.c b/homa_api.c index 7f45e352..1a9d903b 100644 --- a/homa_api.c +++ b/homa_api.c @@ -36,8 +36,8 @@ * error occurred, -1 is returned and errno is set appropriately. */ ssize_t homa_reply(int sockfd, const void *message_buf, size_t length, - const struct sockaddr *dest_addr, uint32_t addrlen, - uint64_t id) + const struct sockaddr *dest_addr, __u32 addrlen, + __u64 id) { struct homa_sendmsg_args args; struct msghdr hdr; @@ -81,8 +81,8 @@ ssize_t homa_reply(int sockfd, const void *message_buf, size_t length, * error occurred, -1 is returned and errno is set appropriately. */ ssize_t homa_replyv(int sockfd, const struct iovec *iov, int iovcnt, - const struct sockaddr *dest_addr, uint32_t addrlen, - uint64_t id) + const struct sockaddr *dest_addr, __u32 addrlen, + __u64 id) { struct homa_sendmsg_args args; struct msghdr hdr; @@ -118,8 +118,8 @@ ssize_t homa_replyv(int sockfd, const struct iovec *iov, int iovcnt, * error occurred, -1 is returned and errno is set appropriately. */ int homa_send(int sockfd, const void *message_buf, size_t length, - const struct sockaddr *dest_addr, uint32_t addrlen, - uint64_t *id, uint64_t completion_cookie) + const struct sockaddr *dest_addr, __u32 addrlen, + __u64 *id, __u64 completion_cookie) { struct homa_sendmsg_args args; struct msghdr hdr; @@ -163,8 +163,8 @@ int homa_send(int sockfd, const void *message_buf, size_t length, * error occurred, -1 is returned and errno is set appropriately. */ int homa_sendv(int sockfd, const struct iovec *iov, int iovcnt, - const struct sockaddr *dest_addr, uint32_t addrlen, - uint64_t *id, uint64_t completion_cookie) + const struct sockaddr *dest_addr, __u32 addrlen, + __u64 *id, __u64 completion_cookie) { struct homa_sendmsg_args args; struct msghdr hdr; @@ -200,7 +200,7 @@ int homa_sendv(int sockfd, const struct iovec *iov, int iovcnt, * Return: If an error occurred, -1 is returned and errno is set * appropriately. Otherwise zero is returned. 
*/ -int homa_abort(int sockfd, uint64_t id, int error) +int homa_abort(int sockfd, __u64 id, int error) { struct homa_abort_args args = {id, error}; diff --git a/util/cp_node.cc b/util/cp_node.cc index bc594427..ee2241a5 100644 --- a/util/cp_node.cc +++ b/util/cp_node.cc @@ -2089,7 +2089,7 @@ void homa_client::sender() while (1) { uint64_t now; - uint64_t rpc_id; + __u64 rpc_id; int server; int status; int slot = get_rinfo(); @@ -2190,7 +2190,7 @@ uint64_t homa_client::measure_rtt(int server, int length, char *buffer, { message_header *header = reinterpret_cast(buffer); uint64_t start; - uint64_t rpc_id; + __u64 rpc_id; int status; header->length = length; @@ -2214,7 +2214,7 @@ uint64_t homa_client::measure_rtt(int server, int length, char *buffer, } while ((status < 0) && ((errno == EAGAIN) || (errno == EINTR))); if (status < 0) { log(NORMAL, "FATAL: measure_rtt got error in recvmsg: %s " - "(id %lu, server %s)\n", + "(id %llu, server %s)\n", strerror(errno), rpc_id, print_address((union sockaddr_in_union *) receiver->src_addr())); diff --git a/util/homa_test.cc b/util/homa_test.cc index 6c0b69be..fdcf4aa1 100644 --- a/util/homa_test.cc +++ b/util/homa_test.cc @@ -77,7 +77,7 @@ void close_fd(int fd) */ void send_fd(int fd, const sockaddr_in_union *addr, char *request) { - uint64_t id; + __u64 id; int status; sleep(1); @@ -87,7 +87,7 @@ void send_fd(int fd, const sockaddr_in_union *addr, char *request) printf("Error in homa_send: %s\n", strerror(errno)); } else { - printf("homa_send succeeded, id %lu\n", id); + printf("homa_send succeeded, id %llu\n", id); } } @@ -160,7 +160,7 @@ void test_close() */ void test_fill_memory(int fd, const sockaddr_in_union *dest, char *request) { - uint64_t id; + __u64 id; int status; int completed = 0; size_t total = 0; @@ -188,7 +188,7 @@ void test_fill_memory(int fd, const sockaddr_in_union *dest, char *request) recv_hdr.msg_controllen = sizeof(recv_args); received = recvmsg(fd, &recv_hdr, 0); if (received < 0) { - printf("Error in recvmsg for id %lu: %s\n", + printf("Error in recvmsg for id %llu: %s\n", id, strerror(errno)); } else { total += received; @@ -214,7 +214,7 @@ void test_fill_memory(int fd, const sockaddr_in_union *dest, char *request) */ void test_invoke(int fd, const sockaddr_in_union *dest, char *request) { - uint64_t id; + __u64 id; int status; ssize_t resp_length; @@ -224,7 +224,7 @@ void test_invoke(int fd, const sockaddr_in_union *dest, char *request) printf("Error in homa_send: %s\n", strerror(errno)); return; } else { - printf("homa_send succeeded, id %lu\n", id); + printf("homa_send succeeded, id %llu\n", id); } recv_args.id = 0; recv_args.flags = HOMA_RECVMSG_RESPONSE; @@ -237,7 +237,7 @@ void test_invoke(int fd, const sockaddr_in_union *dest, char *request) int seed = check_message(&recv_args, buf_region, resp_length, 2*sizeof32(int)); printf("Received message from %s with %lu bytes, " - "seed %d, id %lu\n", + "seed %d, id %llu\n", print_address(&source_addr), resp_length, seed, recv_args.id); } @@ -408,7 +408,7 @@ void test_rtt(int fd, const sockaddr_in_union *dest, char *request) */ void test_send(int fd, const sockaddr_in_union *dest, char *request) { - uint64_t id; + __u64 id; int status; status = homa_send(fd, request, length, &dest->sa, @@ -417,7 +417,7 @@ void test_send(int fd, const sockaddr_in_union *dest, char *request) printf("Error in homa_send: %s\n", strerror(errno)); } else { - printf("Homa_send succeeded, id %lu\n", id); + printf("Homa_send succeeded, id %llu\n", id); } } @@ -492,9 +492,9 @@ void test_stream(int fd, 
const sockaddr_in_union *dest) #define MAX_RPCS 100 int *buffers[MAX_RPCS]; ssize_t resp_length; - uint64_t id, end_cycles; + __u64 id; uint64_t start_cycles = 0; - uint64_t end_time; + uint64_t end_cycles, end_time; int status, i; int64_t bytes_sent = 0; int64_t start_bytes = 0; diff --git a/util/server.cc b/util/server.cc index c4632a83..03031a4c 100644 --- a/util/server.cc +++ b/util/server.cc @@ -125,13 +125,13 @@ void homa_server(int port) 2*sizeof32(int)); if (verbose) printf("Received message from %s with %d bytes, " - "id %lu, seed %d, response length %d\n", + "id %llu, seed %d, response length %d\n", print_address(&source), length, recv_args.id, seed, resp_length); } else if (verbose) printf("Received message from %s with " - "%d bytes, id %lu, response length %d\n", + "%d bytes, id %llu, response length %d\n", print_address(&source), length, recv_args.id, resp_length); From a4c40f92881d6853e76a848a40e912829320ff6c Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 19 Dec 2024 10:30:08 -0800 Subject: [PATCH 128/625] Patch around kernel-doc bugs --- homa.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/homa.h b/homa.h index 573b6681..533c4bc3 100644 --- a/homa.h +++ b/homa.h @@ -42,7 +42,7 @@ extern "C" >> HOMA_BPAGE_SHIFT) /** - * define HOMA_MIN_DEFAULT_PORT - The 16-bit port space is divided into + * define HOMA_MIN_DEFAULT_PORT - The 16 bit port space is divided into * two nonoverlapping regions. Ports 1-32767 are reserved exclusively * for well-defined server ports. The remaining ports are used for client * ports; these are allocated automatically by Homa. Port 0 is reserved. @@ -175,8 +175,8 @@ struct homa_rcvbuf_args { */ /** - * define HOMA_FLAG_DONT_THROTTLE - disable the output throttling mechanism: - * always send all packets immediately. + * define HOMA_FLAG_DONT_THROTTLE - disable the output throttling mechanism + * (always send all packets immediately). */ #define HOMA_FLAG_DONT_THROTTLE 2 From 76d61a5a33dec0228828bf6fdbf8db1754426471 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 20 Dec 2024 09:23:03 -0800 Subject: [PATCH 129/625] Remove 'extern "C" {}' from homa.h --- homa.h | 9 --------- homa_receiver.h | 2 ++ util/cp_node.cc | 2 ++ util/dist_to_proto.cc | 2 ++ util/homa_prio.cc | 2 ++ util/homa_test.cc | 2 ++ util/server.cc | 2 ++ 7 files changed, 12 insertions(+), 9 deletions(-) diff --git a/homa.h b/homa.h index 533c4bc3..6fbd293d 100644 --- a/homa.h +++ b/homa.h @@ -13,11 +13,6 @@ #include #endif -#ifdef __cplusplus -extern "C" -{ -#endif - /* IANA-assigned Internet Protocol number for Homa. 
*/ #define IPPROTO_HOMA 146 @@ -204,8 +199,4 @@ ssize_t homa_replyv(int sockfd, const struct iovec *iov, __u32 addrlen, __u64 id); #endif /* See strip.py */ -#ifdef __cplusplus -} -#endif - #endif /* _UAPI_LINUX_HOMA_H */ diff --git a/homa_receiver.h b/homa_receiver.h index 770ff511..fdb89181 100644 --- a/homa_receiver.h +++ b/homa_receiver.h @@ -5,7 +5,9 @@ #include #include +extern "C" { #include "homa.h" +} namespace homa { /** diff --git a/util/cp_node.cc b/util/cp_node.cc index ee2241a5..b8ba866d 100644 --- a/util/cp_node.cc +++ b/util/cp_node.cc @@ -42,7 +42,9 @@ #include #include "dist.h" +extern "C" { #include "homa.h" +} #include "homa_receiver.h" #include "test_utils.h" #include "time_trace.h" diff --git a/util/dist_to_proto.cc b/util/dist_to_proto.cc index 4fc0003c..88f4a454 100644 --- a/util/dist_to_proto.cc +++ b/util/dist_to_proto.cc @@ -3,7 +3,9 @@ */ #include "dist.h" +extern "C" { #include "homa.h" +} #include "iostream" /** diff --git a/util/homa_prio.cc b/util/homa_prio.cc index 0a92cde3..ba1d829f 100644 --- a/util/homa_prio.cc +++ b/util/homa_prio.cc @@ -18,7 +18,9 @@ #include #include +extern "C" { #include "homa.h" +} /* Values of command-line arguments (and their default values): */ diff --git a/util/homa_test.cc b/util/homa_test.cc index fdcf4aa1..3028d038 100644 --- a/util/homa_test.cc +++ b/util/homa_test.cc @@ -28,7 +28,9 @@ #include +extern "C" { #include "homa.h" +} #include "test_utils.h" /* Determines message size in bytes for tests. */ diff --git a/util/server.cc b/util/server.cc index 03031a4c..7eaffe8d 100644 --- a/util/server.cc +++ b/util/server.cc @@ -29,7 +29,9 @@ #include +extern "C" { #include "homa.h" +} #include "test_utils.h" /* Log events to standard output. */ From e3f7fa85c0167d23e4594c9ff7b8a84273a60ae9 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 20 Dec 2024 10:01:20 -0800 Subject: [PATCH 130/625] Fix kernel-doc issues --- homa_grant.c | 1 + homa_impl.h | 1 + homa_metrics.c | 1 + homa_metrics.h | 32 +++++++++++++++++--------------- homa_receiver.h | 13 ++++++------- homa_skb.h | 13 ++++++------- homa_wire.h | 20 +++++++++++--------- timetrace.c | 1 + timetrace.h | 15 ++++++--------- 9 files changed, 50 insertions(+), 47 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 5015fb23..5a51099b 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -16,6 +16,7 @@ * equivalent or rpc2 is higher priority. * @rpc1: First RPC to consider. * @rpc2: Second RPC to consider. + * Return: see above */ int homa_grant_outranks(struct homa_rpc *rpc1, struct homa_rpc *rpc2) { diff --git a/homa_impl.h b/homa_impl.h index f23198a6..51fb80bd 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -1065,6 +1065,7 @@ static inline bool is_homa_pkt(struct sk_buff *skb) * tt_addr() - Given an address, return a 4-byte id that will (hopefully) * provide a unique identifier for the address in a timetrace record. * @x: Address (either IPv6 or IPv4-mapped IPv6) + * Return: see above */ static inline __u32 tt_addr(const struct in6_addr x) { diff --git a/homa_metrics.c b/homa_metrics.c index 95b20c83..6cc854e2 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -399,6 +399,7 @@ ssize_t homa_metrics_read(struct file *file, char __user *buffer, * @file: Information about the file being read. * @offset: Distance to seek, in bytes * @whence: Starting point from which to measure the distance to seek. + * Return: current position within file. 
 */
 loff_t homa_metrics_lseek(struct file *file, loff_t offset, int whence)
 {
diff --git a/homa_metrics.h b/homa_metrics.h
index 248573a8..ed70018b 100644
--- a/homa_metrics.h
+++ b/homa_metrics.h
@@ -19,14 +19,13 @@
  *
  * All counters are free-running: they never reset.
  */
-#define HOMA_NUM_SMALL_COUNTS 64
-#define HOMA_NUM_MEDIUM_COUNTS 128
 struct homa_metrics {
 	/**
 	 * @small_msg_bytes: entry i holds the total number of bytes
 	 * received in messages whose length is between 64*i and 64*i + 63,
 	 * inclusive.
 	 */
+#define HOMA_NUM_SMALL_COUNTS 64
 	__u64 small_msg_bytes[HOMA_NUM_SMALL_COUNTS];
 
 	/**
@@ -35,6 +34,7 @@ struct homa_metrics {
 	 * 1024*i + 1023, inclusive. The first four entries are always 0
 	 * (small_msg_counts covers this range).
 	 */
+#define HOMA_NUM_MEDIUM_COUNTS 128
 	__u64 medium_msg_bytes[HOMA_NUM_MEDIUM_COUNTS];
 
 	/**
@@ -190,7 +190,8 @@ struct homa_metrics {
 	 */
 	__u64 send_ns;
 
-	/** @send_calls: total number of invocations of homa_semdmsg
+	/**
+	 * @send_calls: total number of invocations of homa_sendmsg
 	 * for requests.
 	 */
 	__u64 send_calls;
@@ -324,32 +325,32 @@ struct homa_metrics {
 	__u64 peer_new_entries;
 
 	/**
-	 * @peer_kmalloc errors: total number of times homa_peer_find
+	 * @peer_kmalloc_errors: total number of times homa_peer_find
 	 * returned an error because it couldn't allocate memory for a new
 	 * peer.
 	 */
 	__u64 peer_kmalloc_errors;
 
 	/**
-	 * @peer_route errors: total number of times homa_peer_find
+	 * @peer_route_errors: total number of times homa_peer_find
 	 * returned an error because it couldn't create a route to the peer.
 	 */
 	__u64 peer_route_errors;
 
 	/**
-	 * @control_xmit_errors errors: total number of times ip_queue_xmit
+	 * @control_xmit_errors: total number of times ip_queue_xmit
 	 * failed when transmitting a control packet.
 	 */
 	__u64 control_xmit_errors;
 
 	/**
-	 * @data_xmit_errors errors: total number of times ip_queue_xmit
+	 * @data_xmit_errors: total number of times ip_queue_xmit
 	 * failed when transmitting a data packet.
 	 */
 	__u64 data_xmit_errors;
 
 	/**
-	 * @unknown_rpc: total number of times an incoming packet was
+	 * @unknown_rpcs: total number of times an incoming packet was
 	 * discarded because it referred to a nonexistent RPC. Doesn't
 	 * count grant packets received by servers (since these are
 	 * fairly common).
@@ -357,13 +358,13 @@ struct homa_metrics {
 	__u64 unknown_rpcs;
 
 	/**
-	 * @cant_create_server_rpc: total number of times a server discarded
+	 * @server_cant_create_rpcs: total number of times a server discarded
 	 * an incoming packet because it couldn't create a homa_rpc object.
 	 */
 	__u64 server_cant_create_rpcs;
 
 	/**
-	 * @unknown_packet_type: total number of times a packet was discarded
+	 * @unknown_packet_types: total number of times a packet was discarded
 	 * because its type wasn't one of the supported values.
 	 */
 	__u64 unknown_packet_types;
@@ -460,7 +461,7 @@ struct homa_metrics {
 	__u64 throttle_lock_misses;
 
 	/**
-	 * @peer_acklock_miss_ns: total time spent waiting for peer lock misses.
+	 * @peer_ack_lock_miss_ns: total time spent waiting for peer lock misses.
 	 */
 	__u64 peer_ack_lock_miss_ns;
 
@@ -543,7 +544,7 @@ struct homa_metrics {
 	__u64 disabled_rpc_reaps;
 
 	/**
-	 * @reaper_runs: total number of times that the reaper was invoked
+	 * @reaper_calls: total number of times that the reaper was invoked
 	 * and was not disabled.
 	 */
 	__u64 reaper_calls;
 
@@ -572,7 +573,7 @@ struct homa_metrics {
 	__u64 throttle_list_checks;
 
 	/**
-	 * @unacked_overflows: total number of times that homa_peer_add_ack
+	 * @ack_overflows: total number of times that homa_peer_add_ack
 	 * found insufficient space for the new id and hence had to send an
 	 * ACK message.
 	 */
@@ -585,7 +586,7 @@ struct homa_metrics {
 	__u64 ignored_need_acks;
 
 	/**
-	 * @bpage_resuses: total number of times that, when an owned page
+	 * @bpage_reuses: total number of times that, when an owned page
 	 * reached the end, it could be reused because all existing
 	 * allocations had been released.
 	 */
@@ -645,8 +646,9 @@ struct homa_metrics {
 DECLARE_PER_CPU(struct homa_metrics, homa_metrics);
 
 /**
- * per_cpu_metrics() - Return the metrics structure for the current core.
+ * homa_metrics_per_cpu() - Return the metrics structure for the current core.
  * This is unsynchronized and doesn't guarantee non-preemption.
+ * Return: see above
  */
 static inline struct homa_metrics *homa_metrics_per_cpu(void)
 {
diff --git a/homa_receiver.h b/homa_receiver.h
index fdb89181..b8639643 100644
--- a/homa_receiver.h
+++ b/homa_receiver.h
@@ -10,13 +10,12 @@ extern "C" {
 }
 
 namespace homa {
-/**
- * class homa::receiver - Helper class for receiving a series of messages
- * from a Homa socket. This class serves two purposes: first, it implements
- * the application side of the Homa buffer management protocol, returning
- * receive buffer space to Homa when the application longer needs it. Second,
- * it provides convenience methods for accessing messages that are scattered\
- * over several discontiguous regions of buffer space.
+/* Helper class for receiving a series of messages from a Homa socket. This
+ * class serves two purposes: first, it implements the application side of
+ * the Homa buffer management protocol, returning receive buffer space to
+ * Homa when the application no longer needs it. Second, it provides convenience
+ * methods for accessing messages that are scattered over several discontiguous
+ * regions of buffer space.
  *
  * Typical usage:
  * - Call receive, which will invoke Homa to receive an incoming message.
diff --git a/homa_skb.h b/homa_skb.h
index 4c6103bb..ea7e0879 100644
--- a/homa_skb.h
+++ b/homa_skb.h
@@ -10,14 +10,14 @@
 #include 
 
 /**
- * define HOMA_PAGE_ORDER: power-of-two exponent determining how
+ * define HOMA_SKB_PAGE_ORDER - exponent (power of two) determining how
 * many pages to allocate in a high-order page for skb pages (e.g.,
 * 2 means allocate in units of 4 pages).
 */
 #define HOMA_SKB_PAGE_ORDER 4
 
 /**
- * define HOMA_PAGE_SIZE: number of bytes corresponding to HOMA_PAGE_ORDER.
+ * define HOMA_SKB_PAGE_SIZE - number of bytes corresponding to HOMA_SKB_PAGE_ORDER.
 */
 #define HOMA_SKB_PAGE_SIZE (PAGE_SIZE << HOMA_SKB_PAGE_ORDER)
 
@@ -72,11 +72,10 @@ struct homa_skb_core {
 	/** @page_size: total number of bytes available in @skb_page. */
 	int page_size;
 
-	/**
-	 * define HOMA_MAX_STASHED: maximum number of stashed pages that
-	 * can be consumed by a message of a given size (assumes page_inuse
-	 * is 0). This is a rough guess, since it doesn't consider all of
-	 * the data_segments that will be needed for the packets.
+	/* Maximum number of stashed pages that can be consumed by a message
+	 * of a given size (assumes page_inuse is 0). This is a rough guess,
+	 * since it doesn't consider all of the data_segments that will be
+	 * needed for the packets.
*/ #define HOMA_MAX_STASHED(size) ((((size) - 1) / HOMA_SKB_PAGE_SIZE) + 1) diff --git a/homa_wire.h b/homa_wire.h index 94dc7c1a..68836deb 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -7,8 +7,7 @@ #include -/** - * enum homa_packet_type - Defines the possible types of Homa packets. +/* Defines the possible types of Homa packets. * * See the xxx_header structs below for more information about each type. */ @@ -97,13 +96,16 @@ struct homa_common_hdr { */ __be32 sequence; - /* The fields below correspond to the acknowledgment field in TCP - * headers; not used by Homa, except for the low-order 8 bits, which - * specify the Homa packet type (one of the values in the - * homa_packet_type enum). + /** + * @ack: Corresponds to the high-order bits of the acknowledgment + * field in TCP headers; not used by Homa. + */ + char ack[3]; + + /** + * @type: Homa packet type (one of the values of the homa_packet_type + * enum). Corresponds to the low-order byte of the ack in TCP. */ - __be16 ack1; - __u8 ack2; __u8 type; /** @@ -131,7 +133,7 @@ struct homa_common_hdr { __be16 window; /** - * @checksum: not used by Homa, but must occupy the same bytes as + * @checksum: Not used by Homa, but must occupy the same bytes as * the checksum in a TCP header (TSO may modify this?). */ __be16 checksum; diff --git a/timetrace.c b/timetrace.c index f924ce82..ccb1ec49 100644 --- a/timetrace.c +++ b/timetrace.c @@ -488,6 +488,7 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf, * @file: Information about the file being read. * @offset: Distance to seek, in bytes * @whence: Starting point from which to measure the distance to seek. + * Return: current position within file. */ loff_t tt_proc_lseek(struct file *file, loff_t offset, int whence) { diff --git a/timetrace.h b/timetrace.h index 44767ede..0a202523 100644 --- a/timetrace.h +++ b/timetrace.h @@ -15,8 +15,7 @@ cycles_t mock_get_cycles(void); // Used only in debugging. #define ENABLE_TIME_TRACE 1 -/** - * Timetrace implements a circular buffer of entries, each of which +/* Timetrace implements a circular buffer of entries, each of which * consists of a fine-grain timestamp, a short descriptive string, and * a few additional values. It's typically used to record times at * various points in kernel operations, in order to find performance @@ -25,8 +24,7 @@ cycles_t mock_get_cycles(void); * analysis by reading a file in /proc. */ -/** - * This structure holds one entry in a tt_buffer. +/* This structure holds one entry in a tt_buffer. */ struct tt_event { /** @@ -53,8 +51,7 @@ struct tt_event { #define TT_BUF_SIZE_EXP 14 #define TT_BUF_SIZE BIT(TT_BUF_SIZE_EXP) -/** - * Represents a sequence of events, typically consisting of all those +/* Represents a sequence of events, typically consisting of all those * generated by one thread. Has a fixed capacity, so slots are re-used * on a circular basis. This class is not thread-safe. */ @@ -72,8 +69,7 @@ struct tt_buffer { struct tt_event events[TT_BUF_SIZE]; }; -/** - * Holds information about an attempt to read timetrace information +/* Holds information about an attempt to read timetrace information * using a /proc file. Several of these can exist simultaneously. */ struct tt_proc_file { @@ -136,6 +132,7 @@ extern void *tt_debug_ptr[100]; /** * tt_rdtsc(): return the current value of the fine-grain CPU cycle counter * (accessed via the RDTSC instruction). 
+ * Return: see above */ static inline __u64 tt_rdtsc(void) { @@ -145,7 +142,7 @@ static inline __u64 tt_rdtsc(void) return (((__u64)hi << 32) | lo); } -/** +/* * tt_recordN(): record an event, along with N parameters. * * @format: Format string for snprintf that will be used, along with From 77fdbcf9fbcdbd81b77dace1034a7c43c9300f76 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 20 Dec 2024 15:21:46 -0800 Subject: [PATCH 131/625] Change start in struct homa_rcvbuf_args from void * to __u64 Needed to properly handle 32-bit mode apps running on 64-bit kernels. --- homa.h | 4 ++-- homa_plumbing.c | 5 ++--- homa_pool.c | 7 ++++--- homa_pool.h | 2 +- test/unit_homa_plumbing.c | 16 ++++++++-------- test/unit_homa_pool.c | 2 +- util/cp_node.cc | 4 ++-- util/homa_test.cc | 4 ++-- util/server.cc | 2 +- 9 files changed, 23 insertions(+), 23 deletions(-) diff --git a/homa.h b/homa.h index 6fbd293d..c27f7c09 100644 --- a/homa.h +++ b/homa.h @@ -158,8 +158,8 @@ _Static_assert(sizeof(struct homa_abort_args) <= 32, "homa_abort_args grew"); /** struct homa_rcvbuf_args - setsockopt argument for SO_HOMA_RCVBUF. */ struct homa_rcvbuf_args { - /** @start: First byte of buffer region. */ - void *start; + /** @start: Address of first byte of buffer region in user space. */ + __u64 start; /** @length: Total number of bytes available at @start. */ size_t length; diff --git a/homa_plumbing.c b/homa_plumbing.c index eceae70c..775af398 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -864,13 +864,12 @@ int homa_setsockopt(struct sock *sk, int level, int optname, /* Do a trivial test to make sure we can at least write the first * page of the region. */ - if (copy_to_user((__force void __user *)args.start, &args, + if (copy_to_user(u64_to_user_ptr(args.start), &args, sizeof(args))) return -EFAULT; homa_sock_lock(hsk, "homa_setsockopt SO_HOMA_RCV_BUF"); - ret = homa_pool_init(hsk, (__force void __user *)args.start, - args.length); + ret = homa_pool_init(hsk, u64_to_user_ptr(args.start), args.length); homa_sock_unlock(hsk); INC_METRIC(so_set_buf_calls, 1); INC_METRIC(so_set_buf_ns, sched_clock() - start); diff --git a/homa_pool.c b/homa_pool.c index bc2bc2f1..5af6f1bc 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -59,7 +59,7 @@ int homa_pool_init(struct homa_sock *hsk, void __user *region, if (((uintptr_t)region) & ~PAGE_MASK) return -EINVAL; pool->hsk = hsk; - pool->region = (char *)region; + pool->region = (char __user *)region; pool->num_bpages = region_size >> HOMA_BPAGE_SHIFT; pool->descriptors = NULL; pool->cores = NULL; @@ -133,7 +133,7 @@ void homa_pool_get_rcvbuf(struct homa_sock *hsk, struct homa_rcvbuf_args *args) { homa_sock_lock(hsk, "homa_pool_get_rcvbuf"); - args->start = hsk->buffer_pool->region; + args->start = (uintptr_t)hsk->buffer_pool->region; args->length = hsk->buffer_pool->num_bpages << HOMA_BPAGE_SHIFT; homa_sock_unlock(hsk); } @@ -376,7 +376,8 @@ int homa_pool_allocate(struct homa_rpc *rpc) * Return: The application's virtual address for buffer space corresponding * to @offset in the incoming message for @rpc. */ -void *homa_pool_get_buffer(struct homa_rpc *rpc, int offset, int *available) +void __user *homa_pool_get_buffer(struct homa_rpc *rpc, int offset, + int *available) { int bpage_index, bpage_offset; diff --git a/homa_pool.h b/homa_pool.h index da952c8a..2fcd297f 100644 --- a/homa_pool.h +++ b/homa_pool.h @@ -111,7 +111,7 @@ struct homa_pool { * memory). Divided into bpages. 0 means the pool hasn't yet been * initialized. 
*/ - char *region; + char __user *region; /** @num_bpages: total number of bpages in the pool. */ int num_bpages; diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 0444ec5a..c379e175 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -289,7 +289,7 @@ TEST_F(homa_plumbing, homs_setsockopt__copy_from_sockptr_fails) } TEST_F(homa_plumbing, homa_setsockopt__copy_to_user_fails) { - struct homa_rcvbuf_args args = {(void *) 0x100000, 5*HOMA_BPAGE_SIZE}; + struct homa_rcvbuf_args args = {0x100000, 5*HOMA_BPAGE_SIZE}; self->optval.user = &args; mock_copy_to_user_errors = 1; @@ -302,7 +302,7 @@ TEST_F(homa_plumbing, homa_setsockopt__success) struct homa_rcvbuf_args args; char buffer[5000]; - args.start = (void *) (((__u64) (buffer + PAGE_SIZE - 1)) + args.start = (((uintptr_t)(buffer + PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1)); args.length = 64*HOMA_BPAGE_SIZE; self->optval.user = &args; @@ -310,7 +310,7 @@ TEST_F(homa_plumbing, homa_setsockopt__success) EXPECT_EQ(0, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, SO_HOMA_RCVBUF, self->optval, sizeof(struct homa_rcvbuf_args))); - EXPECT_EQ(args.start, self->hsk.buffer_pool->region); + EXPECT_EQ(args.start, (uintptr_t)self->hsk.buffer_pool->region); EXPECT_EQ(64, self->hsk.buffer_pool->num_bpages); EXPECT_EQ(1, homa_metrics_per_cpu()->so_set_buf_calls); } @@ -325,7 +325,7 @@ TEST_F(homa_plumbing, homa_getsockopt__success) 10*HOMA_BPAGE_SIZE + 1000)); EXPECT_EQ(0, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, SO_HOMA_RCVBUF, (char *)&val, &size)); - EXPECT_EQ((void *)0x40000, val.start); + EXPECT_EQ(0x40000, val.start); EXPECT_EQ(10*HOMA_BPAGE_SIZE, val.length); EXPECT_EQ(sizeof32(val), size); } @@ -364,7 +364,7 @@ TEST_F(homa_plumbing, homa_getsockopt__bad_length) } TEST_F(homa_plumbing, homa_getsockopt__cant_copy_out_size) { - struct homa_rcvbuf_args val = {.start = NULL, .length = 0}; + struct homa_rcvbuf_args val = {.start = 0, .length = 0}; int size = sizeof32(val) + 10; EXPECT_EQ(0, -homa_pool_init(&self->hsk, (void *)0x40000, @@ -373,12 +373,12 @@ TEST_F(homa_plumbing, homa_getsockopt__cant_copy_out_size) EXPECT_EQ(EFAULT, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, SO_HOMA_RCVBUF, (char *)&val, &size)); - EXPECT_EQ(NULL, val.start); + EXPECT_EQ(0, val.start); EXPECT_EQ(sizeof32(val) + 10, size); } TEST_F(homa_plumbing, homa_getsockopt__cant_copy_out_value) { - struct homa_rcvbuf_args val = {.start = NULL, .length = 0}; + struct homa_rcvbuf_args val = {.start = 0, .length = 0}; int size = sizeof32(val) + 10; EXPECT_EQ(0, -homa_pool_init(&self->hsk, (void *)0x40000, @@ -387,7 +387,7 @@ TEST_F(homa_plumbing, homa_getsockopt__cant_copy_out_value) EXPECT_EQ(EFAULT, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, SO_HOMA_RCVBUF, (char *)&val, &size)); - EXPECT_EQ(NULL, val.start); + EXPECT_EQ(0, val.start); EXPECT_EQ(sizeof32(val), size); } diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index b042a7d5..4fbbcc18 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -121,7 +121,7 @@ TEST_F(homa_pool, homa_pool_get_rcvbuf) EXPECT_EQ(0, -homa_pool_init(&self->hsk, (void *)0x40000, 10*HOMA_BPAGE_SIZE + 1000)); homa_pool_get_rcvbuf(&self->hsk, &args); - EXPECT_EQ(args.start, (void *)0x40000); + EXPECT_EQ(0x40000, args.start); EXPECT_EQ(10*HOMA_BPAGE_SIZE, args.length); } diff --git a/util/cp_node.cc b/util/cp_node.cc index b8ba866d..1f6e9192 100644 --- a/util/cp_node.cc +++ b/util/cp_node.cc @@ -1012,7 +1012,7 @@ homa_server::homa_server(int port, int id, int inet_family, int 
num_threads, port, strerror(errno)); exit(1); } - arg.start = buf_region; + arg.start = (uintptr_t)buf_region; arg.length = buf_size; int status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_RCVBUF, &arg, sizeof(arg)); @@ -1963,7 +1963,7 @@ homa_client::homa_client(int id, std::string& experiment) id, strerror(errno)); exit(1); } - arg.start = buf_region; + arg.start = (uintptr_t)buf_region; arg.length = buf_size; int status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_RCVBUF, &arg, sizeof(arg)); diff --git a/util/homa_test.cc b/util/homa_test.cc index 3028d038..6c3c30f5 100644 --- a/util/homa_test.cc +++ b/util/homa_test.cc @@ -439,7 +439,7 @@ void test_set_buf(int fd) return; } - arg.start = region; + arg.start = (uintptr_t)region; arg.length = 64*HOMA_BPAGE_SIZE; status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_RCVBUF, &arg, sizeof(arg)); @@ -928,7 +928,7 @@ int main(int argc, char** argv) exit(1); } struct homa_rcvbuf_args arg; - arg.start = buf_region; + arg.start = (uintptr_t)buf_region; arg.length = 1000*HOMA_BPAGE_SIZE; status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_RCVBUF, &arg, sizeof(arg)); diff --git a/util/server.cc b/util/server.cc index 7eaffe8d..29dbfc64 100644 --- a/util/server.cc +++ b/util/server.cc @@ -91,7 +91,7 @@ void homa_server(int port) printf("Couldn't mmap buffer region: %s\n", strerror(errno)); return; } - arg.start = buf_region; + arg.start = (uintptr_t)buf_region; arg.length = 1000*HOMA_BPAGE_SIZE; int status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_RCVBUF, &arg, sizeof(arg)); From 6c9722e1d9d76b066d4fa7657e8851f9f7e837c5 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 20 Dec 2024 15:52:46 -0800 Subject: [PATCH 132/625] Document restrictions on sharing sockets across processes --- man/homa.7 | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/man/homa.7 b/man/homa.7 index 5e181c74..6a01a415 100644 --- a/man/homa.7 +++ b/man/homa.7 @@ -60,7 +60,7 @@ A request fails only if Homa cannot maintain communication with the Homa transport module on the server. Homa ensures at-most-once semantics for an RPC. .PP -Home is intended for use between machines that are physically +Homa is intended for use between machines that are physically close, with round-trip latencies no more than a few tens of microseconds. Homa is not suitable for wide-area communication. .PP @@ -168,7 +168,7 @@ arguments must refer to a struct of the following type: .vs -2 .EX struct homa_rcvbuf_args { - void *start; + __u64 start; size_t length; }; .EE @@ -192,6 +192,10 @@ then .I recvmsg calls on the socket will return ENOMEM errors. +.PP +Because of this mechanism, a Homa socket cannot be shared by multiple +processes unless the processes also share the buffer space and map +it to the same virtual address in each sharing process. .SH SENDING MESSAGES .PP The From f00f176a4451e266e2f11e3f22195ad2e0492bdb Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 6 Jan 2025 09:36:00 -0800 Subject: [PATCH 133/625] Fix checkpatch.pl issue --- homa_pool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/homa_pool.c b/homa_pool.c index 5af6f1bc..bee403af 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -372,7 +372,7 @@ int homa_pool_allocate(struct homa_rpc *rpc) * @offset: Offset within @rpc's incoming message. * @available: Will be filled in with the number of bytes of space available * at the returned address (could be zero if offset is - * (erroneously) past the end of the message). + * (erroneously) past the end of the message). 
* Return: The application's virtual address for buffer space corresponding * to @offset in the incoming message for @rpc. */ From 16c3efee57f26de494e1bb9e7bf6e0f13ea7b718 Mon Sep 17 00:00:00 2001 From: breakertt Date: Mon, 6 Jan 2025 15:04:36 -0800 Subject: [PATCH 134/625] Fix invalid access to shinfo->frags[-1] when nr_frags is 0 There isn't actually an invalid access, but formation of the address causes warnings from some analysis tools. Resolves #68 --- homa_skb.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/homa_skb.c b/homa_skb.c index 5775e72a..7de18208 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -193,22 +193,24 @@ void *homa_skb_extend_frags(struct homa *homa, struct sk_buff *skb, int *length) /* Can we just extend the skb's last fragment? */ skb_core = &per_cpu(homa_skb_core, raw_smp_processor_id()); - frag = &shinfo->frags[shinfo->nr_frags - 1]; - if (shinfo->nr_frags > 0 && - skb_frag_page(frag) == skb_core->skb_page && - skb_core->page_inuse < skb_core->page_size && - (frag->offset + skb_frag_size(frag)) == skb_core->page_inuse) { - if ((skb_core->page_size - skb_core->page_inuse) < - actual_size) - actual_size = skb_core->page_size - - skb_core->page_inuse; - *length = actual_size; - skb_frag_size_add(frag, actual_size); - result = page_address(skb_frag_page(frag)) + - skb_core->page_inuse; - skb_core->page_inuse += actual_size; - skb_len_add(skb, actual_size); - return result; + if (shinfo->nr_frags > 0) { + frag = &shinfo->frags[shinfo->nr_frags - 1]; + if (skb_frag_page(frag) == skb_core->skb_page && + skb_core->page_inuse < skb_core->page_size && + (frag->offset + skb_frag_size(frag)) == + skb_core->page_inuse) { + if ((skb_core->page_size - skb_core->page_inuse) < + actual_size) + actual_size = skb_core->page_size - + skb_core->page_inuse; + *length = actual_size; + skb_frag_size_add(frag, actual_size); + result = page_address(skb_frag_page(frag)) + + skb_core->page_inuse; + skb_core->page_inuse += actual_size; + skb_len_add(skb, actual_size); + return result; + } } /* Need to add a new fragment to the skb. */ From c4f579f9a83728baeb638a6f11c15e8f0ddf65d8 Mon Sep 17 00:00:00 2001 From: breakertt Date: Thu, 20 Feb 2025 19:53:15 +0000 Subject: [PATCH 135/625] Fix incorrect skb check in homa_message_out_fill --- homa_outgoing.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/homa_outgoing.c b/homa_outgoing.c index 02aed784..2e035047 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -290,7 +290,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) skb_data_bytes = bytes_left; skb = homa_new_data_packet(rpc, iter, offset, skb_data_bytes, max_seg_data); - if (unlikely(!skb)) { + if (unlikely(IS_ERR(skb))) { err = PTR_ERR(skb); homa_rpc_lock(rpc, "homa_message_out_fill"); goto error; From ba19300e89c80241598c40ad450dc3c4a081ac19 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 7 Jan 2025 11:58:43 -0800 Subject: [PATCH 136/625] Make struct hrtimer in homa_timer_main a static If stack-allocated, causes complaints in debug mode and also triggers deadlock warning by kernel test robot. 
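For reference, a minimal sketch of the static-hrtimer pattern this patch
adopts (illustrative only: example_tick, example_timer_main, and the
one-second period are made-up names, not Homa's actual code):

    #include <linux/hrtimer.h>
    #include <linux/ktime.h>

    static enum hrtimer_restart example_tick(struct hrtimer *timer)
    {
            /* Do per-tick work here; the caller rearms the timer. */
            return HRTIMER_NORESTART;
    }

    static int example_timer_main(void *arg)
    {
            /* Static rather than stack-allocated: debug kernels embed
             * extra validation state in struct hrtimer and warn when
             * the object lives on a kthread's stack. This assumes the
             * thread is a singleton, since every instance would share
             * the one timer.
             */
            static struct hrtimer timer;

            hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
            timer.function = example_tick;
            hrtimer_start(&timer, ms_to_ktime(1000), HRTIMER_MODE_REL);
            /* ... main loop would block and do periodic work here ... */
            hrtimer_cancel(&timer);
            return 0;
    }

The singleton assumption is the cost of the workaround; it holds here
because only one timer thread is ever created.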
--- homa_plumbing.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index 775af398..6e174c20 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1684,7 +1684,11 @@ enum hrtimer_restart homa_hrtimer(struct hrtimer *timer) int homa_timer_main(void *transport) { struct homa *homa = (struct homa *)transport; - struct hrtimer hrtimer; + + /* The following variable is static because hrtimer_init will + * complain about a stack-allocated hrtimer if in debug mode. + */ + static struct hrtimer hrtimer; ktime_t tick_interval; u64 nsec; From 0a6e71b3f4af8369b3a2e6c61ccfe1b249e807b9 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 7 Jan 2025 12:01:21 -0800 Subject: [PATCH 137/625] Remove unnecessary cast --- homa_peer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/homa_peer.c b/homa_peer.c index b9c433da..b947e679 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -239,8 +239,7 @@ void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, __func__, PTR_ERR(dst)); INC_METRIC(peer_route_errors, 1); } else { - struct homa_dead_dst *dead = (struct homa_dead_dst *) - kmalloc(sizeof(*dead), GFP_KERNEL); + struct homa_dead_dst *dead = kmalloc(sizeof(*dead), GFP_KERNEL); if (unlikely(!dead)) { /* Can't allocate memory to keep track of the * dead dst; just free it immediately (a bit From bb43fe7ca79788bedfc69684992c3de2eaef8a0d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 7 Jan 2025 16:21:52 -0800 Subject: [PATCH 138/625] Refactor homa_dst_refresh * Use GFP_ATOMIC instead of GFP_KERNEL * Handle malloc failures better (retain old dst) --- homa_peer.c | 38 ++++++++++++++++++++------------------ test/unit_homa_peer.c | 16 ++++++++-------- 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/homa_peer.c b/homa_peer.c index b947e679..028cf4eb 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -228,9 +228,18 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab, void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, struct homa_sock *hsk) { + struct homa_dead_dst *save_dead; struct dst_entry *dst; + __u64 now; + + /* Need to keep around the current entry for a while in case + * someone is using it. If we can't do that, then don't update + * the entry. + */ + save_dead = kmalloc(sizeof(*save_dead), GFP_ATOMIC); + if (unlikely(!save_dead)) + return; - spin_lock_bh(&peertab->write_lock); dst = homa_peer_get_dst(peer, &hsk->inet); if (IS_ERR(dst)) { /* Retain the existing dst if we can't create a new one. */ @@ -238,24 +247,17 @@ void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, pr_notice("%s couldn't recreate dst: error %ld", __func__, PTR_ERR(dst)); INC_METRIC(peer_route_errors, 1); - } else { - struct homa_dead_dst *dead = kmalloc(sizeof(*dead), GFP_KERNEL); - if (unlikely(!dead)) { - /* Can't allocate memory to keep track of the - * dead dst; just free it immediately (a bit - * risky, admittedly). 
- */ - dst_release(peer->dst); - } else { - __u64 now = sched_clock(); - - dead->dst = peer->dst; - dead->gc_time = now + 125000000; - list_add_tail(&dead->dst_links, &peertab->dead_dsts); - homa_peertab_gc_dsts(peertab, now); - } - peer->dst = dst; + kfree(save_dead); + return; } + + spin_lock_bh(&peertab->write_lock); + now = sched_clock(); + save_dead->dst = peer->dst; + save_dead->gc_time = now + 100000000; + list_add_tail(&save_dead->dst_links, &peertab->dead_dsts); + homa_peertab_gc_dsts(peertab, now); + peer->dst = dst; spin_unlock_bh(&peertab->write_lock); } diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index 4a443146..44984c55 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -113,11 +113,11 @@ TEST_F(homa_peer, homa_peertab_gc_dsts) homa_dst_refresh(&self->peertab, peer, &self->hsk); mock_ns = 50000000; homa_dst_refresh(&self->peertab, peer, &self->hsk); - mock_ns = 100000000; + mock_ns = 90000000; homa_dst_refresh(&self->peertab, peer, &self->hsk); EXPECT_EQ(3, dead_count(&self->peertab)); - homa_peertab_gc_dsts(&self->peertab, 150000000); + homa_peertab_gc_dsts(&self->peertab, 110000000); EXPECT_EQ(2, dead_count(&self->peertab)); homa_peertab_gc_dsts(&self->peertab, ~0); EXPECT_EQ(0, dead_count(&self->peertab)); @@ -228,7 +228,7 @@ TEST_F(homa_peer, homa_dst_refresh__basics) EXPECT_NE(old_dst, peer->dst); EXPECT_EQ(1, dead_count(self->homa.peers)); } -TEST_F(homa_peer, homa_dst_refresh__routing_error) +TEST_F(homa_peer, homa_dst_refresh__malloc_error) { struct dst_entry *old_dst; struct homa_peer *peer; @@ -238,13 +238,12 @@ TEST_F(homa_peer, homa_dst_refresh__routing_error) EXPECT_EQ_IP(*ip1111, peer->addr); old_dst = homa_get_dst(peer, &self->hsk); - mock_route_errors = 1; + mock_kmalloc_errors = 1; homa_dst_refresh(self->homa.peers, peer, &self->hsk); EXPECT_EQ(old_dst, peer->dst); - EXPECT_EQ(1, homa_metrics_per_cpu()->peer_route_errors); EXPECT_EQ(0, dead_count(self->homa.peers)); } -TEST_F(homa_peer, homa_dst_refresh__malloc_error) +TEST_F(homa_peer, homa_dst_refresh__routing_error) { struct dst_entry *old_dst; struct homa_peer *peer; @@ -254,9 +253,10 @@ TEST_F(homa_peer, homa_dst_refresh__malloc_error) EXPECT_EQ_IP(*ip1111, peer->addr); old_dst = homa_get_dst(peer, &self->hsk); - mock_kmalloc_errors = 1; + mock_route_errors = 1; homa_dst_refresh(self->homa.peers, peer, &self->hsk); - EXPECT_NE(old_dst, peer->dst); + EXPECT_EQ(old_dst, peer->dst); + EXPECT_EQ(1, homa_metrics_per_cpu()->peer_route_errors); EXPECT_EQ(0, dead_count(self->homa.peers)); } TEST_F(homa_peer, homa_dst_refresh__free_old_dsts) From 39381591c5efe88b0d443cd9775d375ebb8e8f2e Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 8 Jan 2025 21:56:01 -0800 Subject: [PATCH 139/625] Fix compilation problems related to sanity checks Problems used to occur with LOCKDEP and other config options for kernel validation. 
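A hedged sketch of one kind of breakage involved (made-up names; the
real change guards a similar assertion in homa_pool.h): lock debugging
grows the kernel's lock structures, so a struct tuned to fill exactly
one cache line without CONFIG_LOCKDEP no longer fits with it, and the
compile-time size check must be skipped in that configuration:

    #include <linux/atomic.h>
    #include <linux/cache.h>
    #include <linux/spinlock.h>

    struct example_bpage {
            spinlock_t lock;        /* Larger when lock debugging adds
                                     * a lockdep map to spinlock_t. */
            atomic_t refs;
            int owner;
            __u64 expiration;
    };

    #ifndef CONFIG_LOCKDEP
    _Static_assert(sizeof(struct example_bpage) <= L1_CACHE_BYTES,
                   "example_bpage overflowed a cache line");
    #endif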
--- homa_impl.h | 3 +++ homa_pool.h | 2 ++ test/mock.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 69 insertions(+), 3 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 51fb80bd..6007c198 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -59,6 +59,9 @@ struct page *mock_alloc_pages(gfp_t gfp, unsigned int order); #define compound_order mock_compound_order unsigned int mock_compound_order(struct page *page); +#ifdef cpu_to_node +#undef cpu_to_node +#endif #define cpu_to_node mock_cpu_to_node int mock_cpu_to_node(int cpu); diff --git a/homa_pool.h b/homa_pool.h index 2fcd297f..94cb648c 100644 --- a/homa_pool.h +++ b/homa_pool.h @@ -48,8 +48,10 @@ struct homa_bpage { }; #ifndef __STRIP__ /* See strip.py */ +#ifndef CONFIG_LOCKDEP _Static_assert(sizeof(struct homa_bpage) == L1_CACHE_BYTES, "homa_bpage overflowed a cache line"); +#endif #endif /* See strip.py */ /** diff --git a/test/mock.c b/test/mock.c index 39865a5f..9911a972 100644 --- a/test/mock.c +++ b/test/mock.c @@ -197,6 +197,7 @@ unsigned int nr_cpu_ids = 8; unsigned long page_offset_base; unsigned long phys_base; unsigned long vmemmap_base; +kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES]; int __preempt_count; struct pcpu_hot pcpu_hot = {.cpu_number = 1}; char sock_flow_table[RPS_SOCK_FLOW_TABLE_SIZE(1024)]; @@ -204,6 +205,7 @@ struct net_hotdata net_hotdata = { .rps_cpu_mask = 0x1f, .rps_sock_flow_table = (struct rps_sock_flow_table *) sock_flow_table }; +int debug_locks; extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) @@ -331,6 +333,11 @@ void __copy_overflow(int size, unsigned long count) abort(); } +int debug_lockdep_rcu_enabled(void) +{ + return 0; +} + void dst_release(struct dst_entry *dst) { if (!dst) @@ -696,6 +703,14 @@ struct file *filp_open(const char *, int, umode_t) return NULL; } +void __fortify_panic(const u8 reason, const size_t avail, const size_t size) +{ + FAIL("__fortify_panic invoked"); + + /* API prohibits return. 
*/ + while (1) ; +} + ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { return 0; @@ -749,6 +764,11 @@ void __kfree_skb(struct sk_buff *skb) free(skb); } +void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size) +{ + return mock_kmalloc(size, gfpflags); +} + void *mock_kmalloc(size_t size, gfp_t flags) { void *block; @@ -789,21 +809,41 @@ int kthread_stop(struct task_struct *k) } #ifdef CONFIG_DEBUG_LIST -bool __list_add_valid(struct list_head *new, - struct list_head *prev, - struct list_head *next) +bool __list_add_valid(struct list_head *new, struct list_head *prev, + struct list_head *next) +{ + return true; +} +#endif + +bool __list_add_valid_or_report(struct list_head *new, struct list_head *prev, + struct list_head *next) { return true; } +#ifdef CONFIG_DEBUG_LIST bool __list_del_entry_valid(struct list_head *entry) { return true; } #endif +bool __list_del_entry_valid_or_report(struct list_head *entry) +{ + return true; +} + void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) {} +void lockdep_rcu_suspicious(const char *file, const int line, const char *s) +{} + +int lock_is_held_type(const struct lockdep_map *lock, int read) +{ + return 0; +} + void lock_sock_nested(struct sock *sk, int subclass) { mock_active_locks++; @@ -822,7 +862,11 @@ void __mutex_init(struct mutex *lock, const char *name, } +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void mutex_lock_nested(struct mutex *lock, unsigned int subclass) +#else void mutex_lock(struct mutex *lock) +#endif { mock_active_locks++; } @@ -937,6 +981,10 @@ void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) mock_active_locks++; } +void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, + struct lock_class_key *key, short inner) +{} + int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) { UNIT_HOOK("spin_lock"); @@ -961,6 +1009,16 @@ int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) return 1; } +int rcu_read_lock_held(void) +{ + return 0; +} + +int rcu_read_lock_bh_held(void) +{ + return 0; +} + bool rcuref_get_slowpath(rcuref_t *ref) { return true; @@ -1054,6 +1112,9 @@ struct sk_buff *skb_dequeue(struct sk_buff_head *list) return __skb_dequeue(list); } +void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) +{} + void *skb_pull(struct sk_buff *skb, unsigned int len) { if ((skb_tail_pointer(skb) - skb->data) < len) From 8047748c76844d40435c8b62a157c4bb144680ec Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 8 Jan 2025 21:58:07 -0800 Subject: [PATCH 140/625] Replace GFP_KERNEL with GFP_ATOMIC where appropriate Also add checks in unit tests for proper use of GFP_ATOMIC --- homa_incoming.c | 2 +- homa_metrics.c | 4 ++-- homa_peer.c | 2 +- homa_rpc.c | 4 ++-- homa_skb.c | 10 +++++----- homa_sock.c | 2 +- homa_stub.h | 2 +- test/mock.c | 32 +++++++++++++++++++++++--------- test/unit_homa_outgoing.c | 20 ++++++++++---------- timetrace.c | 2 +- 10 files changed, 47 insertions(+), 33 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index a5485dc2..3efbbf0a 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -66,7 +66,7 @@ struct homa_gap *homa_gap_new(struct list_head *next, int start, int end) { struct homa_gap *gap; - gap = kmalloc(sizeof(*gap), GFP_KERNEL); + gap = kmalloc(sizeof(*gap), GFP_ATOMIC); if (!gap) return NULL; gap->start = start; diff --git a/homa_metrics.c b/homa_metrics.c index 6cc854e2..dc36c7e1 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -28,7 +28,7 @@ void homa_metric_append(struct homa *homa, const char 
*format, ...) #else homa->metrics_capacity = 4096; #endif - homa->metrics = kmalloc(homa->metrics_capacity, GFP_KERNEL); + homa->metrics = kmalloc(homa->metrics_capacity, GFP_ATOMIC); if (!homa->metrics) { pr_warn("%s couldn't allocate memory\n", __func__); return; @@ -51,7 +51,7 @@ void homa_metric_append(struct homa *homa, const char *format, ...) /* Not enough room; expand buffer capacity. */ homa->metrics_capacity *= 2; - new_buffer = kmalloc(homa->metrics_capacity, GFP_KERNEL); + new_buffer = kmalloc(homa->metrics_capacity, GFP_ATOMIC); if (!new_buffer) { pr_warn("%s couldn't allocate memory\n", __func__); return; diff --git a/homa_peer.c b/homa_peer.c index 028cf4eb..e2304f48 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -92,7 +92,7 @@ struct homa_peer **homa_peertab_get_peers(struct homa_peertab *peertab, if (count == 0) return NULL; - result = kmalloc_array(count, sizeof(peer), GFP_KERNEL); + result = kmalloc_array(count, sizeof(peer), GFP_ATOMIC); if (!result) return NULL; *num_peers = count; diff --git a/homa_rpc.c b/homa_rpc.c index 0e1941fd..92d0cfff 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -134,7 +134,7 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, } /* Initialize fields that don't require the socket lock. */ - srpc = kmalloc(sizeof(*srpc), GFP_KERNEL); + srpc = kmalloc(sizeof(*srpc), GFP_ATOMIC); if (!srpc) { err = -ENOMEM; goto error; @@ -345,7 +345,7 @@ int homa_rpc_reap(struct homa_sock *hsk, int count) struct homa_rpc *rpc; int i, batch_size; int rx_frees = 0; - int result; + int result = 0; INC_METRIC(reaper_calls, 1); INC_METRIC(reaper_dead_skbs, hsk->dead_skbs); diff --git a/homa_skb.c b/homa_skb.c index 7de18208..308336eb 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -52,7 +52,7 @@ int homa_skb_init(struct homa *homa) if (!homa->page_pools[numa]) { struct homa_page_pool *pool; - pool = kmalloc(sizeof(*pool), GFP_KERNEL); + pool = kmalloc(sizeof(*pool), GFP_ATOMIC); if (!pool) return -ENOMEM; pool->avail = 0; @@ -131,7 +131,7 @@ struct sk_buff *homa_skb_new_tx(int length) */ skb = alloc_skb(HOMA_SKB_EXTRA + HOMA_IPV6_HEADER_LENGTH + sizeof(struct homa_skb_info) + length, - GFP_KERNEL); + GFP_ATOMIC); if (likely(skb)) { skb_reserve(skb, HOMA_SKB_EXTRA + HOMA_IPV6_HEADER_LENGTH); skb_reset_transport_header(skb); @@ -286,7 +286,7 @@ bool homa_skb_page_alloc(struct homa *homa, struct homa_skb_core *skb_core) /* Step 3: can we allocate a new big page? */ INC_METRIC(skb_page_allocs, 1); start = sched_clock(); - skb_core->skb_page = alloc_pages((GFP_KERNEL & ~__GFP_RECLAIM) | __GFP_COMP + skb_core->skb_page = alloc_pages(GFP_ATOMIC | __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY, HOMA_SKB_PAGE_ORDER); if (likely(skb_core->skb_page)) { INC_METRIC(skb_page_alloc_ns, sched_clock() - start); @@ -294,7 +294,7 @@ bool homa_skb_page_alloc(struct homa *homa, struct homa_skb_core *skb_core) } /* Step 4: can we allocate a normal page? 
 	 */
-	skb_core->skb_page = alloc_page(GFP_KERNEL);
+	skb_core->skb_page = alloc_page(GFP_ATOMIC);
 	INC_METRIC(skb_page_alloc_ns, sched_clock() - start);
 	if (likely(skb_core->skb_page)) {
 		skb_core->page_size = PAGE_SIZE;
@@ -600,7 +600,7 @@ void homa_skb_release_pages(struct homa *homa)
 			kfree(homa->skb_pages_to_free);
 		homa->skb_pages_to_free = kmalloc_array(release_max,
 							sizeof(struct page *),
-							GFP_KERNEL);
+							GFP_ATOMIC);
 		homa->pages_to_free_slots = release_max;
 	}
 
diff --git a/homa_sock.c b/homa_sock.c
index 4d9a064c..b8705f06 100644
--- a/homa_sock.c
+++ b/homa_sock.c
@@ -178,7 +178,7 @@ int homa_sock_init(struct homa_sock *hsk, struct homa *homa)
 		INIT_HLIST_HEAD(&bucket->rpcs);
 		bucket->id = i + 1000000;
 	}
-	hsk->buffer_pool = kzalloc(sizeof(*hsk->buffer_pool), GFP_KERNEL);
+	hsk->buffer_pool = kzalloc(sizeof(*hsk->buffer_pool), GFP_ATOMIC);
 	if (!hsk->buffer_pool)
 		result = -ENOMEM;
 	if (homa->hijack_tcp)
diff --git a/homa_stub.h b/homa_stub.h
index 7b47b63e..19a27ab3 100644
--- a/homa_stub.h
+++ b/homa_stub.h
@@ -67,7 +67,7 @@ static inline struct sk_buff *homa_skb_new_tx(int length)
 
 	skb = alloc_skb(HOMA_SKB_EXTRA + HOMA_IPV6_HEADER_LENGTH +
 			sizeof(struct homa_skb_info) + length,
-			GFP_KERNEL);
+			GFP_ATOMIC);
 	if (likely(skb)) {
 		skb_reserve(skb, HOMA_SKB_EXTRA + HOMA_IPV6_HEADER_LENGTH);
 		skb_reset_transport_header(skb);
diff --git a/test/mock.c b/test/mock.c
index 9911a972..df3af7c5 100644
--- a/test/mock.c
+++ b/test/mock.c
@@ -115,11 +115,16 @@ static struct unit_hash *skbs_in_use;
  */
 static struct unit_hash *vmallocs_in_use;
 
-/* The number of locks that have been acquired but not yet released.
- * Should be 0 at the end of each test.
+/* The number of locks (other than spin locks) that have been acquired
+ * but not yet released. Should be 0 at the end of each test.
  */
 static int mock_active_locks;
 
+/* The number of spin locks that have been acquired but not yet released.
+ * Should be 0 at the end of each test.
+ */
+static int mock_active_spin_locks;
+
 /* The number of times rcu_read_lock has been called minus the number
  * of times rcu_read_unlock has been called.
  * Should be 0 at the end of each test.
@@ -775,6 +780,9 @@ void *mock_kmalloc(size_t size, gfp_t flags) if (mock_check_error(&mock_kmalloc_errors)) return NULL; + if (mock_active_spin_locks > 0 && (flags & ~__GFP_ZERO) != GFP_ATOMIC) + FAIL("Incorrect flags 0x%x passed to mock_kmalloc; expected GFP_ATOMIC (0x%x)", + flags, GFP_ATOMIC); block = malloc(size); if (!block) { FAIL("malloc failed"); @@ -972,13 +980,13 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) void _raw_spin_lock(raw_spinlock_t *lock) { - mock_active_locks++; + mock_active_spin_locks++; } void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) { UNIT_HOOK("spin_lock"); - mock_active_locks++; + mock_active_spin_locks++; } void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, @@ -990,14 +998,14 @@ int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) UNIT_HOOK("spin_lock"); if (mock_check_error(&mock_trylock_errors)) return 0; - mock_active_locks++; + mock_active_spin_locks++; return 1; } void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) { UNIT_HOOK("unlock"); - mock_active_locks--; + mock_active_spin_locks--; } int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) @@ -1005,7 +1013,7 @@ int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) UNIT_HOOK("spin_lock"); if (mock_check_error(&mock_spin_lock_held)) return 0; - mock_active_locks++; + mock_active_spin_locks++; return 1; } @@ -1627,7 +1635,7 @@ void mock_sock_init(struct homa_sock *hsk, struct homa *homa, int port) void mock_spin_unlock(spinlock_t *lock) { UNIT_HOOK("unlock"); - mock_active_locks--; + mock_active_spin_locks--; } /** @@ -1723,9 +1731,15 @@ void mock_teardown(void) vmallocs_in_use = NULL; if (mock_active_locks != 0) - FAIL(" %d locks still locked after test", mock_active_locks); + FAIL(" %d (non-spin) locks still locked after test", + mock_active_locks); mock_active_locks = 0; + if (mock_active_spin_locks != 0) + FAIL(" %d spin locks still locked after test", + mock_active_spin_locks); + mock_active_spin_locks = 0; + if (mock_active_rcu_locks != 0) FAIL(" %d rcu_read_locks still active after test", mock_active_rcu_locks); diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 69c98732..2421e5fa 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -335,13 +335,15 @@ TEST_F(homa_outgoing, homa_message_out_fill__zero_length_message) } TEST_F(homa_outgoing, homa_message_out_fill__gso_geometry_hijacking) { - struct homa_rpc *crpc1 = homa_rpc_new_client(&self->hsk, - &self->server_addr); - struct homa_rpc *crpc2 = homa_rpc_new_client(&self->hsk, - &self->server_addr); + struct homa_rpc *crpc1, *crpc2; + crpc1 = homa_rpc_new_client(&self->hsk, &self->server_addr); ASSERT_FALSE(crpc1 == NULL); + homa_rpc_unlock(crpc1); + + crpc2 = homa_rpc_new_client(&self->hsk, &self->server_addr); ASSERT_FALSE(crpc2 == NULL); + mock_set_ipv6(&self->hsk); self->hsk.sock.sk_protocol = IPPROTO_TCP; @@ -350,7 +352,6 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_geometry_hijacking) 2 * UNIT_TEST_DATA_PER_PACKET; ASSERT_EQ(0, -homa_message_out_fill(crpc1, unit_iov_iter((void *) 1000, 10000), 0)); - homa_rpc_unlock(crpc1); EXPECT_SUBSTR("max_seg_data 1400, max_gso_data 2800", unit_log_get()); /* Second try: just barely enough space for 3 packets in GSO. 
*/ @@ -363,13 +364,10 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_geometry_hijacking) } TEST_F(homa_outgoing, homa_message_out_fill__gso_geometry_no_hijacking) { - struct homa_rpc *crpc1 = homa_rpc_new_client(&self->hsk, - &self->server_addr); - struct homa_rpc *crpc2 = homa_rpc_new_client(&self->hsk, - &self->server_addr); + struct homa_rpc *crpc1, *crpc2; + crpc1 = homa_rpc_new_client(&self->hsk, &self->server_addr); ASSERT_FALSE(crpc1 == NULL); - ASSERT_FALSE(crpc2 == NULL); mock_set_ipv6(&self->hsk); /* First try: not quite enough space for 3 packets in GSO. */ @@ -382,6 +380,8 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_geometry_no_hijacking) EXPECT_SUBSTR("max_seg_data 1400, max_gso_data 2800", unit_log_get()); /* Second try: just barely enough space for 3 packets in GSO. */ + crpc2 = homa_rpc_new_client(&self->hsk, &self->server_addr); + ASSERT_FALSE(crpc2 == NULL); mock_net_device.gso_max_size += 1; unit_log_clear(); ASSERT_EQ(0, -homa_message_out_fill(crpc2, diff --git a/timetrace.c b/timetrace.c index ccb1ec49..8552be20 100644 --- a/timetrace.c +++ b/timetrace.c @@ -337,7 +337,7 @@ int tt_proc_open(struct inode *inode, struct file *file) result = -EINVAL; goto done; } - pf = kmalloc(sizeof(*pf), GFP_KERNEL); + pf = kmalloc(sizeof(*pf), GFP_ATOMIC); if (!pf) { result = -ENOMEM; goto done; From 1e251b38585d3d4a4f79041d0bf4e56b7c87e3cd Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 9 Jan 2025 10:12:30 -0800 Subject: [PATCH 141/625] Move mocking definitions from homa_impl.h to mock.h --- homa_impl.h | 74 +------------------------------------------------ test/mock.h | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 78 insertions(+), 76 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 6007c198..9ca0f49c 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -52,79 +52,7 @@ #include "homa_wire.h" #ifdef __UNIT_TEST__ -#undef alloc_pages -#define alloc_pages mock_alloc_pages -struct page *mock_alloc_pages(gfp_t gfp, unsigned int order); - -#define compound_order mock_compound_order -unsigned int mock_compound_order(struct page *page); - -#ifdef cpu_to_node -#undef cpu_to_node -#endif -#define cpu_to_node mock_cpu_to_node -int mock_cpu_to_node(int cpu); - -#undef current -#define current current_task -extern struct task_struct *current_task; - -#define get_page mock_get_page -void mock_get_page(struct page *page); - -#undef kmalloc -#define kmalloc mock_kmalloc -void *mock_kmalloc(size_t size, gfp_t flags); - -#undef kmalloc_array -#define kmalloc_array(count, size, type) mock_kmalloc((count) * (size), type) - -#define kthread_complete_and_exit(...) - -#ifdef page_address -#undef page_address -#endif -#define page_address(page) ((void *)page) - -#define page_ref_count mock_page_refs -int mock_page_refs(struct page *page); - -#define page_to_nid mock_page_to_nid -int mock_page_to_nid(struct page *page); - -#define put_page mock_put_page -void mock_put_page(struct page *page); - -#define rcu_read_lock mock_rcu_read_lock -void mock_rcu_read_lock(void); - -#define rcu_read_unlock mock_rcu_read_unlock -void mock_rcu_read_unlock(void); - -#undef register_net_sysctl -#define register_net_sysctl mock_register_net_sysctl -struct ctl_table_header *mock_register_net_sysctl(struct net *net, - const char *path, - struct ctl_table *table); - -#define signal_pending(...) 
mock_signal_pending -extern int mock_signal_pending; - -#define spin_unlock mock_spin_unlock -void mock_spin_unlock(spinlock_t *lock); - -#undef vmalloc -#define vmalloc mock_vmalloc -void *mock_vmalloc(size_t size); - -#undef DECLARE_PER_CPU -#define DECLARE_PER_CPU(type, name) extern type name[10] - -#undef DEFINE_PER_CPU -#define DEFINE_PER_CPU(type, name) type name[10] - -#undef per_cpu -#define per_cpu(name, core) (name[core]) +#include "mock.h" #endif /* __UNIT_TEST__ */ #ifndef __STRIP__ /* See strip.py */ diff --git a/test/mock.h b/test/mock.h index 36b03756..a56bec45 100644 --- a/test/mock.h +++ b/test/mock.h @@ -1,9 +1,74 @@ /* Copyright (c) 2019-2022 Homa Developers * SPDX-License-Identifier: BSD-1-Clause */ +#ifndef _HOMA_MOCK_H +#define _HOMA_MOCK_H -/* Functions for mocking that are exported to test code. */ +/* Replace various Linux variables and functions with mocked ones. */ +#undef alloc_pages +#define alloc_pages mock_alloc_pages + +#define compound_order mock_compound_order + +#ifdef cpu_to_node +#undef cpu_to_node +#endif +#define cpu_to_node mock_cpu_to_node + +#undef current +#define current current_task + +#undef DECLARE_PER_CPU +#define DECLARE_PER_CPU(type, name) extern type name[10] + +#undef DEFINE_PER_CPU +#define DEFINE_PER_CPU(type, name) type name[10] + +#define get_page mock_get_page + +#undef kmalloc +#define kmalloc mock_kmalloc + +#undef kmalloc_array +#define kmalloc_array(count, size, type) mock_kmalloc((count) * (size), type) + +#define kthread_complete_and_exit(...) + +#ifdef page_address +#undef page_address +#endif +#define page_address(page) ((void *)page) + +#define page_ref_count mock_page_refs + +#define page_to_nid mock_page_to_nid +#undef per_cpu +#define per_cpu(name, core) (name[core]) + +#define put_page mock_put_page + +#define rcu_read_lock mock_rcu_read_lock + +#define rcu_read_unlock mock_rcu_read_unlock + +#undef register_net_sysctl +#define register_net_sysctl mock_register_net_sysctl + +#define signal_pending(...) mock_signal_pending + +#define spin_unlock mock_spin_unlock + +#undef vmalloc +#define vmalloc mock_vmalloc + +/* Forward references: */ +struct homa; +struct homa_rpc; +struct homa_sock; +struct homa_socktab; + +/* Functions for mocking that are exported to test code. 
*/ extern int mock_alloc_page_errors; extern int mock_alloc_skb_errors; extern int mock_bpage_size; @@ -36,6 +101,7 @@ extern int mock_numa_mask; extern int mock_page_nid_mask; extern char mock_printk_output[]; extern int mock_route_errors; +extern int mock_signal_pending; extern int mock_spin_lock_held; extern struct task_struct mock_task; @@ -44,17 +110,23 @@ extern int mock_vmalloc_errors; extern int mock_xmit_log_verbose; extern int mock_xmit_log_homa_info; +extern struct task_struct *current_task; + struct page * - mock_alloc_pages(gfp_t gfp, unsigned order); + mock_alloc_pages(gfp_t gfp, unsigned order); int mock_check_error(int *errorMask); void mock_clear_xmit_prios(void); +unsigned int mock_compound_order(struct page *page); +int mock_cpu_to_node(int core); void mock_data_ready(struct sock *sk); cycles_t mock_get_cycles(void); unsigned int mock_get_mtu(const struct dst_entry *dst); void mock_get_page(struct page *page); +void *mock_kmalloc(size_t size, gfp_t flags); int mock_page_refs(struct page *page); int mock_page_refs(struct page *page); +int mock_page_to_nid(struct page *page); void mock_put_page(struct page *page); void mock_rcu_read_lock(void); void mock_rcu_read_unlock(void); @@ -69,10 +141,12 @@ void mock_spin_unlock(spinlock_t *lock); int mock_skb_count(void); struct sk_buff * mock_skb_new(struct in6_addr *saddr, struct homa_common_hdr *h, - int extra_bytes, int first_value); + int extra_bytes, int first_value); void mock_sock_destroy(struct homa_sock *hsk, struct homa_socktab *socktab); void mock_sock_init(struct homa_sock *hsk, struct homa *homa, int port); void mock_teardown(void); void *mock_vmalloc(size_t size); + +#endif /* _HOMA_MOCK_H */ From 0adc096b79f149698ebe41fa6187574770de558c Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 9 Jan 2025 10:24:25 -0800 Subject: [PATCH 142/625] When creating socket, check for all default ports in use --- homa_impl.h | 8 +++----- homa_sock.c | 18 ++++++++++++------ homa_sock.h | 4 +++- homa_utils.c | 2 +- test/mock.c | 25 +++++++++++++++++-------- test/mock.h | 6 +++++- test/unit_homa_sock.c | 17 ++++++++++++++++- 7 files changed, 57 insertions(+), 23 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 9ca0f49c..f9852675 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -412,12 +412,10 @@ struct homa { atomic_t total_incoming __aligned(L1_CACHE_BYTES); /** - * @next_client_port: A client port number to consider for the - * next Homa socket; increments monotonically. Current value may - * be in the range allocated for servers; must check before using. - * This port may also be in use already; must check. + * @prev_default_port: The most recent port number assigned from + * the range of default ports. */ - __u16 next_client_port __aligned(L1_CACHE_BYTES); + __u16 prev_default_port __aligned(L1_CACHE_BYTES); /** * @port_map: Information about all open sockets. Dynamically diff --git a/homa_sock.c b/homa_sock.c index b8705f06..44208623 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -130,6 +130,7 @@ void homa_socktab_end_scan(struct homa_socktab_scan *scan) int homa_sock_init(struct homa_sock *hsk, struct homa *homa) { struct homa_socktab *socktab = homa->port_map; + int starting_port; int result = 0; int i; @@ -142,17 +143,22 @@ int homa_sock_init(struct homa_sock *hsk, struct homa *homa) hsk->ip_header_length = (hsk->inet.sk.sk_family == AF_INET) ? 
HOMA_IPV4_HEADER_LENGTH : HOMA_IPV6_HEADER_LENGTH; hsk->shutdown = false; + starting_port = homa->prev_default_port; while (1) { - if (homa->next_client_port < HOMA_MIN_DEFAULT_PORT) - homa->next_client_port = HOMA_MIN_DEFAULT_PORT; - if (!homa_sock_find(socktab, homa->next_client_port)) + homa->prev_default_port++; + if (homa->prev_default_port < HOMA_MIN_DEFAULT_PORT) + homa->prev_default_port = HOMA_MIN_DEFAULT_PORT; + if (!homa_sock_find(socktab, homa->prev_default_port)) break; - homa->next_client_port++; + if (homa->prev_default_port == starting_port) { + spin_unlock_bh(&socktab->write_lock); + hsk->shutdown = true; + return -EADDRNOTAVAIL; + } } - hsk->port = homa->next_client_port; + hsk->port = homa->prev_default_port; hsk->inet.inet_num = hsk->port; hsk->inet.inet_sport = htons(hsk->port); - homa->next_client_port++; hsk->socktab_links.sock = hsk; hlist_add_head_rcu(&hsk->socktab_links.hash_links, &socktab->buckets[homa_port_hash(hsk->port)]); diff --git a/homa_sock.h b/homa_sock.h index 079ac7eb..4d29af2e 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -171,7 +171,9 @@ struct homa_sock { */ struct homa *homa; - /** @shutdown: True means the socket is no longer usable. */ + /** @shutdown: True means the socket is no longer usable (either + * shutdown has already been invoked, or the socket was never + * properly initialized). */ bool shutdown; /** diff --git a/homa_utils.c b/homa_utils.c index e765a838..312d5d7c 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -55,7 +55,7 @@ int homa_init(struct homa *homa) homa->throttle_add = 0; homa->throttle_min_bytes = 200; atomic_set(&homa->total_incoming, 0); - homa->next_client_port = HOMA_MIN_DEFAULT_PORT; + homa->prev_default_port = HOMA_MIN_DEFAULT_PORT - 1; homa->port_map = kmalloc(sizeof(*homa->port_map), GFP_KERNEL); if (!homa->port_map) { pr_err("%s couldn't create port_map: kmalloc failure", diff --git a/test/mock.c b/test/mock.c index df3af7c5..7119c8ba 100644 --- a/test/mock.c +++ b/test/mock.c @@ -181,6 +181,9 @@ int mock_page_nid_mask; /* Used to collect printk output. */ char mock_printk_output [5000]; +/* Used instead of HOMA_MIN_DEFAULT_PORT by homa_skb.c. */ +__u16 mock_min_default_port = 0x8000; + struct dst_ops mock_dst_ops = {.mtu = mock_get_mtu}; struct netdev_queue mock_net_queue = {.state = 0}; struct net_device mock_net_device = { @@ -1603,28 +1606,33 @@ int mock_skb_count(void) * @homa: Overall information about the Homa protocol. * @port: Port number to use for the socket, or 0 to * use default. + * Return: 0 for success, otherwise a negative errno. */ -void mock_sock_init(struct homa_sock *hsk, struct homa *homa, int port) +int mock_sock_init(struct homa_sock *hsk, struct homa *homa, int port) { - int saved_port = homa->next_client_port; + int saved_port = homa->prev_default_port; static struct ipv6_pinfo hsk_pinfo; struct sock *sk = &hsk->sock; + int err = 0; memset(hsk, 0, sizeof(*hsk)); sk->sk_data_ready = mock_data_ready; sk->sk_family = mock_ipv6 ? 
AF_INET6 : AF_INET; - if ((port != 0) && (port >= HOMA_MIN_DEFAULT_PORT)) - homa->next_client_port = port; - homa_sock_init(hsk, homa); + if (port != 0 && port >= mock_min_default_port) + homa->prev_default_port = port - 1; + err = homa_sock_init(hsk, homa); if (port != 0) - homa->next_client_port = saved_port; - if (port < HOMA_MIN_DEFAULT_PORT) + homa->prev_default_port = saved_port; + if (err != 0) + return err; + if (port != 0 && port < mock_min_default_port) homa_sock_bind(homa->port_map, hsk, port); hsk->inet.pinet6 = &hsk_pinfo; mock_mtu = UNIT_TEST_DATA_PER_PACKET + hsk->ip_header_length + sizeof(struct homa_data_hdr); mock_net_device.gso_max_size = mock_mtu; - homa_pool_init(hsk, (void *) 0x1000000, 100*HOMA_BPAGE_SIZE); + err = homa_pool_init(hsk, (void *) 0x1000000, 100*HOMA_BPAGE_SIZE); + return err; } /** @@ -1686,6 +1694,7 @@ void mock_teardown(void) mock_compound_order_mask = 0; mock_page_nid_mask = 0; mock_printk_output[0] = 0; + mock_min_default_port = 0x8000; mock_net_device.gso_max_size = 0; mock_net_device.gso_max_segs = 1000; memset(inet_offloads, 0, sizeof(inet_offloads)); diff --git a/test/mock.h b/test/mock.h index a56bec45..6b388d8b 100644 --- a/test/mock.h +++ b/test/mock.h @@ -26,6 +26,9 @@ #define get_page mock_get_page +#undef HOMA_MIN_DEFAULT_PORT +#define HOMA_MIN_DEFAULT_PORT mock_min_default_port + #undef kmalloc #define kmalloc mock_kmalloc @@ -92,6 +95,7 @@ extern char mock_xmit_prios[]; extern int mock_log_rcu_sched; extern int mock_max_grants; extern int mock_max_skb_frags; +extern __u16 mock_min_default_port; extern int mock_mtu; extern struct net_device mock_net_device; @@ -144,7 +148,7 @@ struct sk_buff * int extra_bytes, int first_value); void mock_sock_destroy(struct homa_sock *hsk, struct homa_socktab *socktab); -void mock_sock_init(struct homa_sock *hsk, struct homa *homa, +int mock_sock_init(struct homa_sock *hsk, struct homa *homa, int port); void mock_teardown(void); void *mock_vmalloc(size_t size); diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c index fe02beda..df5170d2 100644 --- a/test/unit_homa_sock.c +++ b/test/unit_homa_sock.c @@ -119,7 +119,7 @@ TEST_F(homa_sock, homa_sock_init__skip_port_in_use) { struct homa_sock hsk2, hsk3; - self->homa.next_client_port = 0xffff; + self->homa.prev_default_port = 0xfffe; mock_sock_init(&hsk2, &self->homa, 0); mock_sock_init(&hsk3, &self->homa, 0); EXPECT_EQ(65535, hsk2.port); @@ -127,6 +127,21 @@ TEST_F(homa_sock, homa_sock_init__skip_port_in_use) homa_sock_destroy(&hsk2); homa_sock_destroy(&hsk3); } +TEST_F(homa_sock, homa_sock_init__all_ports_in_use) +{ + struct homa_sock hsk2, hsk3, hsk4; + + mock_min_default_port = -2; + EXPECT_EQ(0, -mock_sock_init(&hsk2, &self->homa, 0)); + EXPECT_EQ(0, -mock_sock_init(&hsk3, &self->homa, 0)); + EXPECT_EQ(EADDRNOTAVAIL, -mock_sock_init(&hsk4, &self->homa, 0)); + EXPECT_EQ(65534, hsk2.port); + EXPECT_EQ(65535, hsk3.port); + EXPECT_EQ(1, hsk4.shutdown); + homa_sock_destroy(&hsk2); + homa_sock_destroy(&hsk3); + homa_sock_destroy(&hsk4); +} TEST_F(homa_sock, homa_sock_init__ip_header_length) { struct homa_sock hsk_v4, hsk_v6; From 99bc882cea1739e24d50eff6fbfacb34c5e54fed Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 9 Jan 2025 10:56:25 -0800 Subject: [PATCH 143/625] Replace spinlock with mutex for metrics This allows GFP_KERNEL to be used for memory allocation. 
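The rule this change relies on, sketched below with illustrative names
rather than the actual Homa code: a spinlock holder must not sleep, so
it can only use GFP_ATOMIC, which never blocks but fails more readily
under memory pressure; a mutex holder may sleep, so GFP_KERNEL, which
can wait for memory to be reclaimed, becomes legal:

    #include <linux/errno.h>
    #include <linux/mutex.h>
    #include <linux/slab.h>

    struct example_metrics {
            struct mutex mutex;     /* Serializes buffer updates. */
            char *buf;
    };

    static int example_metrics_refresh(struct example_metrics *m,
                                       size_t len)
    {
            char *new_buf;

            mutex_lock(&m->mutex);

            /* Sleeping is allowed while holding a mutex, so a
             * GFP_KERNEL allocation is safe here; under the old
             * spinlock this would have required GFP_ATOMIC.
             */
            new_buf = kmalloc(len, GFP_KERNEL);
            if (!new_buf) {
                    mutex_unlock(&m->mutex);
                    return -ENOMEM;
            }
            kfree(m->buf);
            m->buf = new_buf;
            mutex_unlock(&m->mutex);
            return 0;
    }

The metrics path is not performance-critical, so trading the spinlock
for a sleepable mutex costs little and makes the allocation more robust.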
--- homa_impl.h | 4 ++-- homa_metrics.c | 12 ++++++------ homa_utils.c | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index f9852675..8a3dce62 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -775,10 +775,10 @@ struct homa { __u32 timer_ticks; /** - * @metrics_lock: Used to synchronize accesses to @metrics_active_opens + * @metrics_mutex: Used to synchronize accesses to @metrics_active_opens * and updates to @metrics. */ - spinlock_t metrics_lock; + struct mutex metrics_mutex; /* * @metrics: a human-readable string containing recent values diff --git a/homa_metrics.c b/homa_metrics.c index dc36c7e1..7902ae56 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -28,7 +28,7 @@ void homa_metric_append(struct homa *homa, const char *format, ...) #else homa->metrics_capacity = 4096; #endif - homa->metrics = kmalloc(homa->metrics_capacity, GFP_ATOMIC); + homa->metrics = kmalloc(homa->metrics_capacity, GFP_KERNEL); if (!homa->metrics) { pr_warn("%s couldn't allocate memory\n", __func__); return; @@ -51,7 +51,7 @@ void homa_metric_append(struct homa *homa, const char *format, ...) /* Not enough room; expand buffer capacity. */ homa->metrics_capacity *= 2; - new_buffer = kmalloc(homa->metrics_capacity, GFP_ATOMIC); + new_buffer = kmalloc(homa->metrics_capacity, GFP_KERNEL); if (!new_buffer) { pr_warn("%s couldn't allocate memory\n", __func__); return; @@ -355,11 +355,11 @@ int homa_metrics_open(struct inode *inode, struct file *file) * use this copy for subsequent opens, until the file has been * completely closed. */ - spin_lock(&homa->metrics_lock); + mutex_lock(&homa->metrics_mutex); if (homa->metrics_active_opens == 0) homa_metrics_print(homa); homa->metrics_active_opens++; - spin_unlock(&homa->metrics_lock); + mutex_unlock(&homa->metrics_mutex); return 0; } @@ -418,8 +418,8 @@ int homa_metrics_release(struct inode *inode, struct file *file) { struct homa *homa = global_homa; - spin_lock(&homa->metrics_lock); + mutex_lock(&homa->metrics_mutex); homa->metrics_active_opens--; - spin_unlock(&homa->metrics_lock); + mutex_unlock(&homa->metrics_mutex); return 0; } diff --git a/homa_utils.c b/homa_utils.c index 312d5d7c..4a6aa092 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -135,7 +135,7 @@ int homa_init(struct homa *homa) homa->busy_usecs = 100; homa->gro_busy_usecs = 5; homa->timer_ticks = 0; - spin_lock_init(&homa->metrics_lock); + mutex_init(&homa->metrics_mutex); homa->metrics = NULL; homa->metrics_capacity = 0; homa->metrics_length = 0; From 12389ed96547d447e2fdadc98610591598b99a39 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 9 Jan 2025 11:48:05 -0800 Subject: [PATCH 144/625] Check return value from kmalloc_array in homa_skb_release_pages --- homa_impl.h | 4 ++-- homa_skb.c | 16 ++++++++++++---- man/homa.7 | 4 ++-- test/unit_homa_skb.c | 23 +++++++++++++++++++++++ 4 files changed, 39 insertions(+), 8 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 8a3dce62..21b0892f 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -470,8 +470,8 @@ struct homa { __u64 skb_page_free_time; /** - * @skb_page_pool_min_mb: Don't return pages from a pool to Linux - * if the amount of cached data in the pool has been less than this + * @skb_page_pool_min_kb: Don't return pages from a pool to Linux + * if the amount of unused space in the pool has been less than this * many KBytes at any time in the recent past. Set externally via * sysctl. 
*/ diff --git a/homa_skb.c b/homa_skb.c index 308336eb..5f20405e 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -596,15 +596,23 @@ void homa_skb_release_pages(struct homa *homa) homa->skb_page_free_time = now + 500000000ULL; release_max = homa->skb_page_frees_per_sec / 2; if (homa->pages_to_free_slots < release_max) { - if (homa->skb_pages_to_free) - kfree(homa->skb_pages_to_free); + struct page **old = homa->skb_pages_to_free; + homa->skb_pages_to_free = kmalloc_array(release_max, sizeof(struct page *), GFP_ATOMIC); - homa->pages_to_free_slots = release_max; + if (homa->skb_pages_to_free) { + kfree(old); + homa->pages_to_free_slots = release_max; + } else { + homa->skb_pages_to_free = old; + release_max = homa->pages_to_free_slots; + } } - /* Find the pool with the largest low-water mark. */ + /* Find the pool with the largest number of pages that haven't + * been used recently. + */ max_low_mark = -1; spin_lock_bh(&homa->page_pool_mutex); for (i = 0; i <= homa->max_numa; i++) { diff --git a/man/homa.7 b/man/homa.7 index 6a01a415..e02a1056 100644 --- a/man/homa.7 +++ b/man/homa.7 @@ -545,8 +545,8 @@ pages than needed. .TP .IR skb_page_pool_min_kb When releasing pages from the sk_buff page pools back to Linux, Homa will -not release pages from a pool if the total capacity of free pages in -the pool has been less than this option (specified in Kbytes) at any point +not release pages from a pool if the amount of unused space in +the pool has been less than this (specified in Kbytes) at any point in the recent past. .TP .IR throttle_min_bytes diff --git a/test/unit_homa_skb.c b/test/unit_homa_skb.c index 81558548..3362c71e 100644 --- a/test/unit_homa_skb.c +++ b/test/unit_homa_skb.c @@ -714,6 +714,29 @@ TEST_F(homa_skb, homa_skb_release_pages__allocate_skb_pages_to_free) homa_skb_release_pages(&self->homa); EXPECT_EQ(5, self->homa.pages_to_free_slots); } +TEST_F(homa_skb, homa_skb_release_pages__cant_reallocate_skb_pages_to_free) +{ + struct homa_page_pool *pool; + + EXPECT_EQ(0UL, self->homa.skb_page_free_time); + mock_ns = 1000000; + self->homa.skb_page_free_time = 500000; + self->homa.skb_page_frees_per_sec = 20; + self->homa.skb_page_pool_min_kb = 0; + add_to_pool(&self->homa, 20, 0); + pool = get_skb_core(0)->pool; + pool->low_mark = 15; + + EXPECT_EQ(0, self->homa.pages_to_free_slots); + self->homa.skb_pages_to_free = kmalloc_array(4, sizeof(struct page *), + GFP_ATOMIC); + self->homa.pages_to_free_slots = 4; + + mock_kmalloc_errors = 1; + homa_skb_release_pages(&self->homa); + EXPECT_EQ(16, get_skb_core(0)->pool->avail); + EXPECT_EQ(4, self->homa.pages_to_free_slots); +} TEST_F(homa_skb, homa_skb_release_pages__limited_by_min_kb) { EXPECT_EQ(0UL, self->homa.skb_page_free_time); From 2facfa71a2f01059c69fceacfa4323b6acf46a46 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 10 Jan 2025 11:31:18 -0800 Subject: [PATCH 145/625] Minor updates to reap.txt --- reap.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reap.txt b/reap.txt index 3b45415d..1ab327cf 100644 --- a/reap.txt +++ b/reap.txt @@ -8,7 +8,7 @@ This file discusses issues related to freeing resources for completed RPCs RPC completed, but this can result in severe performance hiccups. For example, a server RPC is freed once the last packet of the response has been transmitted, but this can happen in homa_softirq in response - to an incoming grant, and there may be other short messages waiting + to an incoming acknowledgment, and there may be other short messages waiting to be processed. 
Freeing a long RPC could result in significant delay for a subsequent short RPC. @@ -34,7 +34,7 @@ This file discusses issues related to freeing resources for completed RPCs will reap a few buffers for every incoming data packet. This is undesirable because it will impact Homa's performance. -* In addition, during the conversion to the new input buffering scheme for 2.0, +* In addition, during the conversion to the new input buffering scheme, freeing of packets for incoming messages was moved to homa_copy_to_user, under the assumption that this code wouldn't be on the critical path. However, right now the packet freeing is taking 20-25% of the total From 3aca8106771b49df8393eb4e2fe4750d74a2716f Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 10 Jan 2025 13:40:09 -0800 Subject: [PATCH 146/625] Fix bug in homa_rpc_reap Wasn't resetting rx_frees for each loop; could result in hsk->dead_skbs becoming negative. --- homa_rpc.c | 3 ++- test/unit_homa_rpc.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/homa_rpc.c b/homa_rpc.c index 92d0cfff..cd7cfe04 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -344,7 +344,7 @@ int homa_rpc_reap(struct homa_sock *hsk, int count) int num_skbs, num_rpcs; struct homa_rpc *rpc; int i, batch_size; - int rx_frees = 0; + int rx_frees; int result = 0; INC_METRIC(reaper_calls, 1); @@ -360,6 +360,7 @@ int homa_rpc_reap(struct homa_sock *hsk, int count) count -= batch_size; num_skbs = 0; num_rpcs = 0; + rx_frees = 0; homa_sock_lock(hsk, "homa_rpc_reap"); if (atomic_read(&hsk->protect_count)) { diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index c68dbfc0..49669588 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -506,7 +506,7 @@ TEST_F(homa_rpc, homa_rpc_reap__basics) EXPECT_STREQ("reaped 1234", unit_log_get()); unit_log_clear(); EXPECT_STREQ("1236 1238", dead_rpcs(&self->hsk)); - EXPECT_EQ(2, self->hsk.dead_skbs); + EXPECT_EQ(3, self->hsk.dead_skbs); } TEST_F(homa_rpc, homa_rpc_reap__protected) { From 93679a0917573a9862b4557020a91d9616d84f42 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 10 Jan 2025 15:55:13 -0800 Subject: [PATCH 147/625] Change API for homa_rpc_reap Caller now specifies only whether to reap "a few" or "all"; the exact number for "a few" is determined internally by homa_rpc_reap. --- homa_incoming.c | 5 ++- homa_rpc.c | 32 ++++++++++------- homa_rpc.h | 2 +- homa_sock.c | 2 +- homa_timer.c | 2 +- test/unit_homa_rpc.c | 85 ++++++++++++++++++++++++++++++++++++-------- 6 files changed, 96 insertions(+), 32 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index 3efbbf0a..54f6334d 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -552,7 +552,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) __u64 start = sched_clock(); tt_record("homa_data_pkt calling homa_rpc_reap"); - homa_rpc_reap(hsk, hsk->homa->reap_limit); + homa_rpc_reap(hsk, false); INC_METRIC(data_pkt_reap_ns, sched_clock() - start); } } @@ -1247,8 +1247,7 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, rpc->id); goto found_rpc; } - reaper_result = homa_rpc_reap(hsk, - hsk->homa->reap_limit); + reaper_result = homa_rpc_reap(hsk, false); if (reaper_result == 0) break; diff --git a/homa_rpc.c b/homa_rpc.c index cd7cfe04..8c491af6 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -321,18 +321,20 @@ void homa_rpc_free(struct homa_rpc *rpc) * RPCs for a given socket. 
For a large RPC, it can take a long time to * free all of its packet buffers, so we try to perform this work * off the critical path where it won't delay applications. Each call to - * this function does a small chunk of work. See the file reap.txt for - * more information. - * @hsk: Homa socket that may contain dead RPCs. Must not be locked by the - * caller; this function will lock and release. - * @count: Number of buffers to free during this call. + * this function normally does a small chunk of work (unless reap_all is + * true). See the file reap.txt for more information. + * @hsk: Homa socket that may contain dead RPCs. Must not be locked by the + * caller; this function will lock and release. + * @reap_all: False means do a small chunk of work; there may still be + * unreaped RPCs on return. True means reap all dead rpcs for + * hsk. Will busy-wait if reaping has been disabled for some RPCs. * * Return: A return value of 0 means that we ran out of work to do; calling * again will do no work (there could be unreaped RPCs, but if so, * reaping has been disabled for them). A value greater than * zero means there is still more reaping work to be done. */ -int homa_rpc_reap(struct homa_sock *hsk, int count) +int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) { #ifdef __UNIT_TEST__ #define BATCH_MAX 3 @@ -344,6 +346,7 @@ int homa_rpc_reap(struct homa_sock *hsk, int count) int num_skbs, num_rpcs; struct homa_rpc *rpc; int i, batch_size; + int skbs_to_reap; int rx_frees; int result = 0; @@ -353,11 +356,14 @@ int homa_rpc_reap(struct homa_sock *hsk, int count) /* Each iteration through the following loop will reap * BATCH_MAX skbs. */ - while (count > 0) { - batch_size = count; - if (batch_size > BATCH_MAX) - batch_size = BATCH_MAX; - count -= batch_size; + skbs_to_reap = hsk->homa->reap_limit; + while (skbs_to_reap > 0 && !list_empty(&hsk->dead_rpcs)) { + batch_size = BATCH_MAX; + if (!reap_all) { + if (batch_size > skbs_to_reap) + batch_size = skbs_to_reap; + skbs_to_reap -= batch_size; + } num_skbs = 0; num_rpcs = 0; rx_frees = 0; @@ -369,6 +375,8 @@ int homa_rpc_reap(struct homa_sock *hsk, int count) atomic_read(&hsk->protect_count), hsk->dead_skbs); homa_sock_unlock(hsk); + if (reap_all) + continue; return 0; } @@ -470,7 +478,7 @@ int homa_rpc_reap(struct homa_sock *hsk, int count) tt_record4("reaped %d skbs, %d rpcs; %d skbs remain for port %d", num_skbs + rx_frees, num_rpcs, hsk->dead_skbs, hsk->port); - if (!result) + if (!result && !reap_all) break; } homa_pool_check_waiting(hsk->buffer_pool); diff --git a/homa_rpc.h b/homa_rpc.h index 4022d0b0..390376cf 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -436,7 +436,7 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, const struct in6_addr *source, struct homa_data_hdr *h, int *created); -int homa_rpc_reap(struct homa_sock *hsk, int count); +int homa_rpc_reap(struct homa_sock *hsk, bool reap_all); char *homa_symbol_for_state(struct homa_rpc *rpc); int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors); diff --git a/homa_sock.c b/homa_sock.c index 44208623..49569b5e 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -272,7 +272,7 @@ void homa_sock_shutdown(struct homa_sock *hsk) #ifndef __STRIP__ /* See strip.py */ while (!list_empty(&hsk->dead_rpcs)) { - homa_rpc_reap(hsk, 1000); + homa_rpc_reap(hsk, true); i++; if (i == 5) { tt_record("Freezing because reap seems hung"); diff --git a/homa_timer.c b/homa_timer.c index 586c0817..411fc1f2 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -213,7 +213,7 @@ void 
homa_timer(struct homa *homa) __u64 start = sched_clock(); tt_record("homa_timer calling homa_rpc_reap"); - if (homa_rpc_reap(hsk, hsk->homa->reap_limit) == 0) + if (homa_rpc_reap(hsk, false) == 0) break; INC_METRIC(timer_reap_ns, sched_clock() - start); } diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 49669588..2625d836 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -12,6 +12,17 @@ #define n(x) htons(x) #define N(x) htonl(x) +static struct homa_sock *hook_hsk; +void unprotect_hsk_hook(char *id) +{ + if (strcmp(id, "unlock") != 0) + return; + if (hook_hsk) { + homa_unprotect_rpcs(hook_hsk); + hook_hsk = NULL; + } +} + FIXTURE(homa_rpc) { struct in6_addr client_ip[1]; int client_port; @@ -502,12 +513,36 @@ TEST_F(homa_rpc, homa_rpc_reap__basics) EXPECT_STREQ("1234 1236 1238", dead_rpcs(&self->hsk)); EXPECT_EQ(11, self->hsk.dead_skbs); unit_log_clear(); - EXPECT_EQ(1, homa_rpc_reap(&self->hsk, 7)); + self->homa.reap_limit = 7; + EXPECT_EQ(1, homa_rpc_reap(&self->hsk, false)); EXPECT_STREQ("reaped 1234", unit_log_get()); unit_log_clear(); EXPECT_STREQ("1236 1238", dead_rpcs(&self->hsk)); EXPECT_EQ(3, self->hsk.dead_skbs); } +TEST_F(homa_rpc, homa_rpc_reap__reap_all) +{ + struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 5000, 2000); + struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id+2, 20000, 100); + + ASSERT_NE(NULL, crpc1); + ASSERT_NE(NULL, crpc2); + homa_rpc_free(crpc1); + homa_rpc_free(crpc2); + unit_log_clear(); + EXPECT_STREQ("1234 1236", dead_rpcs(&self->hsk)); + self->homa.reap_limit = 3; + unit_log_clear(); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, true)); + EXPECT_STREQ("reaped 1234; reaped 1236", unit_log_get()); + unit_log_clear(); + EXPECT_STREQ("", dead_rpcs(&self->hsk)); + EXPECT_EQ(0, self->hsk.dead_skbs); +} TEST_F(homa_rpc, homa_rpc_reap__protected) { struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, @@ -518,10 +553,26 @@ TEST_F(homa_rpc, homa_rpc_reap__protected) homa_rpc_free(crpc1); unit_log_clear(); homa_protect_rpcs(&self->hsk); - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 10)); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); homa_unprotect_rpcs(&self->hsk); EXPECT_STREQ("", unit_log_get()); } +TEST_F(homa_rpc, homa_rpc_reap__protected_and_reap_all) +{ + struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 5000, 2000); + + ASSERT_NE(NULL, crpc1); + homa_rpc_free(crpc1); + unit_log_clear(); + homa_protect_rpcs(&self->hsk); + hook_hsk = &self->hsk; + unit_hook_register(unprotect_hsk_hook); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, true)); + EXPECT_STREQ("reaped 1234", unit_log_get()); + EXPECT_EQ(0, self->hsk.dead_skbs); +} TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_flags) { struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, @@ -537,13 +588,14 @@ TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_flags) homa_rpc_free(crpc2); unit_log_clear(); atomic_or(RPC_COPYING_TO_USER, &crpc1->flags); - EXPECT_EQ(1, homa_rpc_reap(&self->hsk, 3)); + self->homa.reap_limit = 3; + EXPECT_EQ(1, homa_rpc_reap(&self->hsk, false)); EXPECT_STREQ("reaped 1236", unit_log_get()); unit_log_clear(); - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 3)); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); EXPECT_STREQ("", unit_log_get()); atomic_andnot(RPC_COPYING_TO_USER, &crpc1->flags); - 
EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 3)); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); EXPECT_STREQ("reaped 1234", unit_log_get()); } TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_active_xmits) @@ -561,11 +613,12 @@ TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_active_xmits) homa_rpc_free(crpc2); unit_log_clear(); atomic_inc(&crpc1->msgout.active_xmits); - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 100)); + self->homa.reap_limit = 100; + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); EXPECT_STREQ("reaped 1236", unit_log_get()); unit_log_clear(); atomic_dec(&crpc1->msgout.active_xmits); - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 100)); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); EXPECT_STREQ("reaped 1234", unit_log_get()); } TEST_F(homa_rpc, homa_rpc_reap__grant_in_progress) @@ -583,13 +636,14 @@ TEST_F(homa_rpc, homa_rpc_reap__grant_in_progress) homa_rpc_free(crpc2); unit_log_clear(); atomic_inc(&crpc1->grants_in_progress); - EXPECT_EQ(1, homa_rpc_reap(&self->hsk, 3)); + self->homa.reap_limit = 3; + EXPECT_EQ(1, homa_rpc_reap(&self->hsk, false)); EXPECT_STREQ("reaped 1236", unit_log_get()); unit_log_clear(); - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 3)); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); EXPECT_STREQ("", unit_log_get()); atomic_dec(&crpc1->grants_in_progress); - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 3)); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); EXPECT_STREQ("reaped 1234", unit_log_get()); } TEST_F(homa_rpc, homa_rpc_reap__hit_limit_in_msgout_packets) @@ -602,7 +656,8 @@ TEST_F(homa_rpc, homa_rpc_reap__hit_limit_in_msgout_packets) homa_rpc_free(crpc); EXPECT_EQ(9, self->hsk.dead_skbs); unit_log_clear(); - homa_rpc_reap(&self->hsk, 5); + self->homa.reap_limit = 5; + homa_rpc_reap(&self->hsk, false); EXPECT_STREQ("1234", dead_rpcs(&self->hsk)); EXPECT_EQ(4, self->hsk.dead_skbs); } @@ -618,7 +673,8 @@ TEST_F(homa_rpc, homa_rpc_reap__release_buffers) homa_rpc_free(crpc); EXPECT_EQ(1, atomic_read(&pool->descriptors[1].refs)); self->hsk.buffer_pool->check_waiting_invoked = 0; - homa_rpc_reap(&self->hsk, 5); + self->homa.reap_limit = 5; + homa_rpc_reap(&self->hsk, false); EXPECT_EQ(0, atomic_read(&pool->descriptors[1].refs)); EXPECT_EQ(1, self->hsk.buffer_pool->check_waiting_invoked); } @@ -636,12 +692,13 @@ TEST_F(homa_rpc, homa_rpc_reap__free_gaps) EXPECT_STREQ("start 1000, end 2000; start 5000, end 6000, time 1000", unit_print_gaps(crpc)); homa_rpc_free(crpc); - homa_rpc_reap(&self->hsk, 5); + self->homa.reap_limit = 5; + homa_rpc_reap(&self->hsk, false); // Test framework will complain if memory not freed. 
} TEST_F(homa_rpc, homa_rpc_reap__nothing_to_reap) { - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 10)); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); } TEST_F(homa_rpc, homa_find_client_rpc) From 32ba8f3002dca8d2932ef500669340b5f599c3fd Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 13 Jan 2025 09:46:24 -0800 Subject: [PATCH 148/625] Add missing RCU locks/unlocks --- homa_offload.c | 2 ++ homa_sock.c | 2 ++ homa_utils.c | 9 +++++++-- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/homa_offload.c b/homa_offload.c index fad07bcc..265c2849 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -89,6 +89,7 @@ void homa_gro_hook_tcp(void) return; pr_notice("Homa setting up TCP hijacking\n"); + rcu_read_lock(); tcp_net_offload = rcu_dereference(inet_offloads[IPPROTO_TCP]); hook_tcp_net_offload = *tcp_net_offload; hook_tcp_net_offload.callbacks.gro_receive = homa_tcp_gro_receive; @@ -100,6 +101,7 @@ void homa_gro_hook_tcp(void) hook_tcp6_net_offload.callbacks.gro_receive = homa_tcp_gro_receive; inet6_offloads[IPPROTO_TCP] = (struct net_offload __rcu *) &hook_tcp6_net_offload; + rcu_read_unlock(); } /** diff --git a/homa_sock.c b/homa_sock.c index 49569b5e..a158d7e0 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -29,11 +29,13 @@ void homa_socktab_destroy(struct homa_socktab *socktab) struct homa_socktab_scan scan; struct homa_sock *hsk; + rcu_read_lock(); for (hsk = homa_socktab_start_scan(socktab, &scan); hsk; hsk = homa_socktab_next(&scan)) { homa_sock_destroy(hsk); } homa_socktab_end_scan(&scan); + rcu_read_unlock(); } /** diff --git a/homa_utils.c b/homa_utils.c index 4a6aa092..c2d7a02e 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -521,17 +521,18 @@ void homa_freeze_peers(struct homa *homa) struct homa_sock *hsk; /* Find a socket to use (any will do). */ + rcu_read_lock(); hsk = homa_socktab_start_scan(homa->port_map, &scan); homa_socktab_end_scan(&scan); if (!hsk) { tt_record("homa_freeze_peers couldn't find a socket"); - return; + goto done; } peers = homa_peertab_get_peers(homa->peers, &num_peers); if (!peers) { tt_record("homa_freeze_peers couldn't find peers to freeze"); - return; + goto done; } freeze.common.type = FREEZE; freeze.common.sport = htons(hsk->port); @@ -547,6 +548,10 @@ void homa_freeze_peers(struct homa *homa) err, tt_addr(peers[i]->addr)); } kfree(peers); + +done: + rcu_read_unlock(); + return; } /** From ccc2a90550936bcbf907866644ce55294973c62c Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 14 Jan 2025 08:53:09 -0800 Subject: [PATCH 149/625] Update notes.txt --- notes.txt | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/notes.txt b/notes.txt index 4d56b453..d69f430a 100755 --- a/notes.txt +++ b/notes.txt @@ -28,31 +28,6 @@ Notes for Homa implementation in Linux: traffic? * Also consider the amount of data that is "stuck" in the NIC? -* Notes on Linux qdiscs: - * Default qdisc is fq_codel - * qdisc_create() is in sch_api.c - * Packet transmission "starts" in __dev_xmit_skb in dev.c. - * sch_direct_xmit is called once it's time to actually transmit a - packet (or list of packets). However, the device driver can return - NETDEV_TX_BUSY, in which case the packet will be (re)queued in the qdisc. - * TCQ_F_NOLOCK seems to apply to the qdisc root lock: individual qdiscs - still get locked. 
- -* Notes on Homa qdisc: - * Keep separate packet queues for TCP and Homa - * Pace packet output to stay within network speed - * If both queues are nonempty, split bandwidth between the queues using - a static formula (have a relative weight for each of TCP and Homa?) - * For Homa, is it OK call ip_queue_xmit for all available output and - let the qdisc queue them up? - * Potential risk: queues might get long, so insertion could be expensive. - Can organize the queue by RPC, not individual packets ... there probably - are not large #'s of ready RPCs at once? - * Keep track of all of the queues for a particular device, and potentially - move packets between queues (e.g. all long packets get transmitted on a - single queue; no short packets get transmitted there). - - * Remedies to consider for the performance problems at 100 Gbps, where one tx channel gets very backed up: * Implement zero-copy on output in order to reduce memory bandwidth From d758eb8bedc01f1f64211ed0d93943ff96904b3d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 14 Jan 2025 16:29:35 -0800 Subject: [PATCH 150/625] Rework synchronization in timetrace.c Switched from spinlock to mutex (spinlock was being used in non-atomic context). Also simplified/improved tt_frozen management. --- homa_plumbing.c | 2 +- homa_timer.c | 2 +- homa_utils.c | 2 +- test/unit_timetrace.c | 12 +++++------ timetrace.c | 46 +++++++++++++++++++++---------------------- timetrace.h | 2 +- 6 files changed, 32 insertions(+), 34 deletions(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index 6e174c20..a20472af 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1295,7 +1295,7 @@ int homa_softirq(struct sk_buff *skb) * it will work even if the RPC and/or socket are unknown. */ if (unlikely(h->type == FREEZE)) { - if (!tt_frozen) { + if (!atomic_read(&tt_frozen)) { homa_rpc_log_active_tt(homa, 0); tt_record4("Freezing because of request on port %d from 0x%x:%d, id %d", ntohs(h->dport), diff --git a/homa_timer.c b/homa_timer.c index 411fc1f2..c6eded5d 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -187,7 +187,7 @@ void homa_timer(struct homa *homa) if (total_grants == prev_grant_count && homa->num_grantable_rpcs > 20) { zero_count++; - if (zero_count > 3 && !tt_frozen && 0) { + if (zero_count > 3 && !atomic_read(&tt_frozen) && 0) { pr_err("%s found no grants going out\n", __func__); homa_rpc_log_active_tt(homa, 0); tt_record("freezing because no grants are going out"); diff --git a/homa_utils.c b/homa_utils.c index c2d7a02e..34c8a8ad 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -707,7 +707,7 @@ void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, char *format) if (type != rpc->hsk->homa->freeze_type) return; rpc->hsk->homa->freeze_type = 0; - if (!tt_frozen) { + if (!atomic_read(&tt_frozen)) { // struct homa_freeze_hdr freeze; int dummy; diff --git a/test/unit_timetrace.c b/test/unit_timetrace.c index b13ddf63..142d749f 100644 --- a/test/unit_timetrace.c +++ b/test/unit_timetrace.c @@ -34,10 +34,10 @@ TEST_F(timetrace, tt_freeze) EXPECT_EQ(0, tt_freeze_count.counter); tt_freeze(); EXPECT_EQ(1, tt_freeze_count.counter); - EXPECT_TRUE(tt_frozen); + EXPECT_TRUE(atomic_read(&tt_frozen)); tt_freeze(); EXPECT_EQ(1, tt_freeze_count.counter); - EXPECT_TRUE(tt_frozen); + EXPECT_TRUE(atomic_read(&tt_frozen)); } TEST_F(timetrace, tt_record__basics) @@ -285,20 +285,20 @@ TEST_F(timetrace, tt_proc_release__unfreeze) tt_freeze(); tt_proc_open(NULL, &self->file); EXPECT_EQ(2, tt_freeze_count.counter); - EXPECT_TRUE(tt_frozen); + 
EXPECT_TRUE(atomic_read(&tt_frozen)); tt_proc_open(NULL, &file2); EXPECT_EQ(3, tt_freeze_count.counter); - EXPECT_TRUE(tt_frozen); + EXPECT_TRUE(atomic_read(&tt_frozen)); tt_proc_release(NULL, &self->file); EXPECT_EQ(2, tt_freeze_count.counter); - EXPECT_TRUE(tt_frozen); + EXPECT_TRUE(atomic_read(&tt_frozen)); EXPECT_NE(NULL, tt_buffers[1]->events[3].format); EXPECT_EQ(2, tt_buffers[1]->next_index); tt_proc_release(NULL, &file2); EXPECT_EQ(0, tt_freeze_count.counter); - EXPECT_FALSE(tt_frozen); + EXPECT_FALSE(atomic_read(&tt_frozen)); EXPECT_EQ(NULL, tt_buffers[1]->events[3].format); EXPECT_EQ(0, tt_buffers[1]->next_index); } diff --git a/timetrace.c b/timetrace.c index 8552be20..050bda5a 100644 --- a/timetrace.c +++ b/timetrace.c @@ -64,7 +64,7 @@ static struct proc_dir_entry *tt_dir_entry; * isn't safe here, because tt_freeze gets called at times when threads * can't sleep. */ -static spinlock_t tt_lock; +static struct mutex tt_mutex; /* No new timetrace entries will be made whenever this is nonzero (counts * the number of active /proc reads, plus 1 more if tt_frozen is true). @@ -75,7 +75,7 @@ atomic_t tt_freeze_count = {.counter = 1}; /* True means that tt_freeze has been called since the last time the * timetrace was read. */ -bool tt_frozen; +atomic_t tt_frozen; /* True means timetrace has been successfully initialized. */ static bool init; @@ -134,9 +134,9 @@ int tt_init(char *proc_file, int *temp) tt_dir_entry = NULL; } - spin_lock_init(&tt_lock); - tt_freeze_count.counter = 0; - tt_frozen = false; + mutex_init(&tt_mutex); + atomic_set(&tt_freeze_count, 0); + atomic_set(&tt_frozen, 0); init = true; #ifdef TT_KERNEL @@ -173,7 +173,7 @@ void tt_destroy(void) { int i; - spin_lock(&tt_lock); + mutex_lock(&tt_mutex); if (init) { init = false; if (tt_dir_entry) @@ -203,7 +203,7 @@ void tt_destroy(void) tt_linux_homa_temp = tt_linux_homa_temp_default; #endif - spin_unlock(&tt_lock); + mutex_unlock(&tt_mutex); } /** @@ -213,16 +213,15 @@ void tt_destroy(void) */ void tt_freeze(void) { - if (tt_frozen) - return; - tt_record("timetrace frozen"); - pr_notice("%s invoked\n", __func__); - spin_lock(&tt_lock); - if (!tt_frozen) { - tt_frozen = true; + /* Need to synchronize here to make sure tt_freeze_count only + * gets incremented once, even under concurrent calls to this + * function. 
+ */ + if (atomic_xchg(&tt_frozen, 1) == 0) { + tt_record("timetrace frozen"); + pr_notice("%s invoked\n", __func__); atomic_inc(&tt_freeze_count); } - spin_unlock(&tt_lock); } /** @@ -332,7 +331,7 @@ int tt_proc_open(struct inode *inode, struct file *file) struct tt_proc_file *pf = NULL; int result = 0; - spin_lock(&tt_lock); + mutex_lock(&tt_mutex); if (!init) { result = -EINVAL; goto done; @@ -356,7 +355,7 @@ int tt_proc_open(struct inode *inode, struct file *file) } done: - spin_unlock(&tt_lock); + mutex_unlock(&tt_mutex); return result; } @@ -383,7 +382,7 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf, */ int copied_to_user = 0; - spin_lock(&tt_lock); + mutex_lock(&tt_mutex); if (!pf || pf->file != file) { pr_err("tt_metrics_read found damaged private_data: 0x%p\n", file->private_data); @@ -477,7 +476,7 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf, } done: - spin_unlock(&tt_lock); + mutex_unlock(&tt_mutex); return copied_to_user; } @@ -517,13 +516,12 @@ int tt_proc_release(struct inode *inode, struct file *file) kfree(pf); file->private_data = NULL; - spin_lock(&tt_lock); + mutex_lock(&tt_mutex); if (init) { - if (tt_frozen && (atomic_read(&tt_freeze_count) == 2)) { + if (atomic_read(&tt_freeze_count) == 2 && + atomic_xchg(&tt_frozen, 0) == 1) atomic_dec(&tt_freeze_count); - tt_frozen = false; - } if (atomic_read(&tt_freeze_count) == 1) { /* We are the last active open of the file; reset all of @@ -539,7 +537,7 @@ int tt_proc_release(struct inode *inode, struct file *file) atomic_dec(&tt_freeze_count); } - spin_unlock(&tt_lock); + mutex_unlock(&tt_mutex); return 0; } diff --git a/timetrace.h b/timetrace.h index 0a202523..5d3ad33c 100644 --- a/timetrace.h +++ b/timetrace.h @@ -119,7 +119,7 @@ loff_t tt_proc_lseek(struct file *file, loff_t offset, int whence); extern struct tt_buffer *tt_buffers[]; extern int tt_buffer_size; extern atomic_t tt_freeze_count; -extern bool tt_frozen; +extern atomic_t tt_frozen; extern int tt_pf_storage; extern bool tt_test_no_khz; From 28735a3300e9b25873b3f0d199b3c7897d38030a Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 24 Jan 2025 11:48:09 -0800 Subject: [PATCH 151/625] Improve Cloudlab config script Add methods get_cpu_type, get_link_speed Eliminate use of Cloudlab node type (more general-purpose) --- cloudlab/bin/config | 101 +++++++++++++++++++++++++++++++------------- 1 file changed, 71 insertions(+), 30 deletions(-) diff --git a/cloudlab/bin/config b/cloudlab/bin/config index 5eedf40d..e049b24e 100755 --- a/cloudlab/bin/config +++ b/cloudlab/bin/config @@ -26,6 +26,12 @@ from switch import * interface = None vlan = None +# Cached result of get_cpu_type. +cpu_type = None + +# Cached result of get_link_speed (integer Mbits/sec.) +link_mbps = None + # Number of this node (e.g. 1 for "node1"). Set by get_node_num. 
 node_num = None
 
@@ -87,6 +93,19 @@ def get_core_mask(core = -1):
         mask_words.pop()
     return result
 
+def get_cpu_type():
+    """
+    Return information about the processor we're running on (the "model name"
+    from /proc/cpuinfo).
+    """
+
+    global cpu_type
+    if cpu_type == None:
+        read_cpu_info()
+    if cpu_type == None:
+        raise Exception("Couldn't find 'model name' line in /proc/cpuinfo")
+    return cpu_type
+
 def get_interfaces():
     """
     Runs ifconfig and parses its output to identify the key network
@@ -108,7 +127,8 @@ def get_interfaces():
             available += ", "
         available += current
         if (('s1f1' in current) or ('s1f0' in current)
-                or ('s0f0' in current) or ('s0f1' in current)):
+                or ('s0f0' in current) or ('s0f1' in current)
+                or (current == 'eno1')):
             interface = current
             continue
         if re.match('^[ ]+ inet 10\.0\.1\.', line):
@@ -122,6 +142,27 @@ def get_interfaces():
     print("Primary network interface is %s, vlan is %s" % (interface, vlan))
     return [interface, vlan]
 
+def get_link_speed():
+    """
+    Return the link speed for the primary NIC, in Mbits/sec.
+    """
+
+    global link_mbps
+    if link_mbps != None:
+        return link_mbps
+    nic = get_interfaces()[0]
+    num_channels = -1
+
+    for line in subprocess.run(["ethtool", nic], stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT, encoding="utf-8",
+            check=True).stdout.splitlines():
+        match = re.match('.*Speed: ([0-9]+)Mb/s', line)
+        if match:
+            link_mbps = int(match.group(1))
+            print("Link speed for %s is %d Mbps" % (nic, link_mbps))
+            return link_mbps
+    raise Exception("Couldn't find link speed in ethtool output")
+
 def get_nic_irqs():
     """
     Returns a list containing the IRQ numbers for NIC channels (entry
@@ -141,6 +182,7 @@ def get_nic_irqs():
         if not i in irqs:
             raise Exception('Couldn\'t find IRQ for NIC channel %d' % (i))
         result.append(irqs[i])
+    f.close()
     return result
 
 def get_node_num():
@@ -162,7 +204,8 @@ def get_node_num():
 
 def get_node_type():
     """
-    Returns the node type for this machine.
+    Returns the node type for this machine (assumes we're running on a
+    Cloudlab machine).
     """
 
     global node_type
@@ -258,7 +301,7 @@ def read_cpu_info():
     Read the file /proc/cpuinfo and store information from it in various
     global arrays above.
     """
-    global cpu_info, num_phys_cores
+    global cpu_info, cpu_type, num_phys_cores
 
     if len(cpu_info) > 0:
         return
@@ -272,6 +315,9 @@ def read_cpu_info():
             name = match.group(1)
             value = match.group(2)
             # print("name '%s' value '%s'" % (name, value))
+            if name == 'model name' and cpu_type == None:
+                cpu_type = value
+                print("CPU type is %s" % (cpu_type))
             if name == 'processor':
                 cpu = int(value)
                 cpu_info.append({name: int(value)})
@@ -336,41 +382,40 @@ def config_homa(mod):
     this node type.
     mod:   the path to the Homa module '.ko' file
     """
-    type = get_node_type()
     print("Installing Homa kernel module from %s" % (mod))
     subprocess.run(["sudo", "rmmod", "homa"], check=False)
     subprocess.run(["sudo", "bash", "-c", "insmod %s" % (mod)], check=True)
     set_sysctl("num_priorities", "8")
-    if type == "xl170":
-        set_sysctl("link_mbps", "25000")
-        set_sysctl("max_nic_queue_ns", "2000")
+    link_mbps = get_link_speed()
+    set_sysctl("link_mbps", str(link_mbps))
+    if link_mbps == 10000:
+        set_sysctl("max_nic_queue_ns", "5000")
+        set_sysctl("unsched_bytes", "30000")
+        set_sysctl("window", "50000")
+        set_sysctl("max_incoming", "400000")
+        set_sysctl("max_gso_size", "10000")
+    elif link_mbps == 25000:
+        set_sysctl("max_nic_queue_ns", "5000")
         set_sysctl("unsched_bytes", "60000")
         set_sysctl("window", "100000")
         set_sysctl("max_incoming", "480000")
         set_sysctl("max_gso_size", "10000")
-    elif type == "c6525-100g":
-        set_sysctl("link_mbps", "100000")
+    elif link_mbps == 100000:
         set_sysctl("max_nic_queue_ns", "5000")
         set_sysctl("unsched_bytes", "60000")
         set_sysctl("window", "200000")
        set_sysctl("max_incoming", "1600000")
         set_sysctl("max_gso_size", "100000")
-    elif type == "c6525-25g":
-        set_sysctl("link_mbps", "25000")
-        set_sysctl("max_nic_queue_ns", "5000")
-        set_sysctl("unsched_bytes", "60000")
-        set_sysctl("window", "100000")
-        set_sysctl("max_incoming", "480000")
-        set_sysctl("max_gso_size", "10000")
     else:
-        raise Exception("Can't configure Homa: unknown node type %s" % (type))
+        raise Exception("Can't configure Homa: no config info available "
+                "for link speed %d Mbps" % (link_mbps))
 
 def config_ecn_threshold(kb):
     """
-    Modify the configuration of this experiment's egress ports at the top-of-rack switch
-    to enable optimal Homa performance.
+    Modify the configuration of this experiment's egress ports at the
+    top-of-rack switch to enable optimal Homa performance.
     """
     s = Switch(True)
     for port in get_exp_ports():
@@ -492,24 +537,18 @@ def config_power():
     """
     Configure the machine's power management for best Homa performance.
     """
-    type = get_node_type()
-    if type == "xl170":
-        # Intel E5-2640v4 processor. For Homa, it's best to leave C-states
-        # enabled. This can cause CPUs to sleep in power-saving mode, but if
-        # C-states are disabled, then so is Turbo mode, and that will hurt
-        # peak peformance.
+    if "Intel" in get_cpu_type():
+        # For Intel processors, it's best to leave C-states enabled. This
+        # can cause CPUs to sleep in power-saving mode, but if C-states are
+        # disabled, then so is Turbo mode, and that will hurt peak performance.
print("Configuring power settings for Intel CPUs") try: subprocess.run(["sudo", "cpupower", "frequency-set", "-g", "performance"], check=True) except subprocess.CalledProcessError: print("*** cpupower error; ignoring for now") - elif (type == "c6525-100g") or (type == "c6525-25g"): - # AMD 7402P (EPYC Rome processor); don't know of any appropriate - # power setting changes - return else: - raise Exception("Can't configure power: unknown node type %s" % (type)) + print("Skipping power settings (non-Intel CPU type)") def config_reset_switch_all_ports(): """ @@ -695,5 +734,7 @@ while i < len(sys.argv): config_switch_all_ports() elif arg == "switch_ports": config_switch_ports() + elif arg == "test": + get_cpu_type() else: raise Exception("Unknown feature '%s'" % (arg)) From cd6c7693027611b0e4f0dfccc402caa838dda4d2 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 28 Jan 2025 16:54:44 -0800 Subject: [PATCH 152/625] Add homa_make_header_avl function Invoke this in both homa_gro_receive and homa_softirq (previously homa_gro_receive wasn't invoking pskb_may_pull). --- homa_impl.h | 17 +++++++++++++++++ homa_offload.c | 10 ++++++---- homa_plumbing.c | 8 ++------ 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 21b0892f..f54cacdd 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -1003,6 +1003,23 @@ static inline __u32 tt_addr(const struct in6_addr x) : ntohl(x.in6_u.u6_addr32[1])); } +/** + * homa_make_header_avl() - Invokes pskb_may_pull to make sure that all the + * Homa header information for a packet is in the linear part of the skb + * where it can be addressed using skb_transport_header. + * @skb: Packet for which header is needed. + * Return: The result of pskb_may_pull (true for success) + */ +static inline bool homa_make_header_avl(struct sk_buff *skb) +{ + int pull_length; + + pull_length = skb_transport_header(skb) - skb->data + HOMA_MAX_HEADER; + if (pull_length > skb->len) + pull_length = skb->len; + return pskb_may_pull(skb, pull_length); +} + #ifdef __UNIT_TEST__ void unit_log_printf(const char *separator, const char *format, ...) __printf(2, 3); diff --git a/homa_offload.c b/homa_offload.c index 265c2849..67da24b1 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -295,6 +295,9 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, __u32 hash; int busy; + if (!homa_make_header_avl(skb)) + tt_record("homa_gro_receive couldn't pull enough data from packet"); + h_new = (struct homa_data_hdr *)skb_transport_header(skb); offload_core = &per_cpu(homa_offload_core, raw_smp_processor_id()); busy = (now - offload_core->last_gro) < homa->gro_busy_ns; @@ -306,11 +309,10 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, priority = ((struct iphdr *)skb_network_header(skb))->tos >> 5; saddr = ntohl(ip_hdr(skb)->saddr); } + tt_record4("homa_gro_receive transport %d, len %d, data_len %d, delta %d", + skb->network_header, skb->len, skb->data_len, + skb_transport_header(skb) - skb->data); -// The test below is overly conservative except for data packets. 
-// if (!pskb_may_pull(skb, 64)) -// tt_record("homa_gro_receive can't pull enough data " -// "from packet for trace"); if (h_new->common.type == DATA) { if (h_new->seg.offset == (__force __be32)-1) { tt_record2("homa_gro_receive replaced offset %d with %d", diff --git a/homa_plumbing.c b/homa_plumbing.c index a20472af..dc0ed0a7 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1236,7 +1236,6 @@ int homa_softirq(struct sk_buff *skb) struct homa *homa = global_homa; struct homa_common_hdr *h; int header_offset; - int pull_length; __u64 start; start = sched_clock(); @@ -1262,16 +1261,13 @@ int homa_softirq(struct sk_buff *skb) * header hasn't yet been removed (this happens for GRO packets * on the frag_list, since they aren't handled explicitly by IP. */ - header_offset = skb_transport_header(skb) - skb->data; - pull_length = HOMA_MAX_HEADER + header_offset; - if (pull_length > skb->len) - pull_length = skb->len; - if (!pskb_may_pull(skb, pull_length)) { + if (!homa_make_header_avl(skb)) { if (homa->verbose) pr_notice("Homa can't handle fragmented packet (no space for header); discarding\n"); UNIT_LOG("", "pskb discard"); goto discard; } + header_offset = skb_transport_header(skb) - skb->data; if (header_offset) __skb_pull(skb, header_offset); From 9a25dcdc59b8123bf7b62320bc067f8b58ca7de7 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 29 Jan 2025 13:13:59 -0800 Subject: [PATCH 153/625] Update notes.txt --- notes.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/notes.txt b/notes.txt index d69f430a..590afb59 100755 --- a/notes.txt +++ b/notes.txt @@ -28,6 +28,13 @@ Notes for Homa implementation in Linux: traffic? * Also consider the amount of data that is "stuck" in the NIC? +* Optimizations for skb freeing: + * In GRO, merge page frags out of skbs and return skbs to napi with + napi_reuse_skb (return GRO_MERGED_FREE?). See also napi_get_frags (used + by the driver?). + * Apparently TCP has a faster way of eventually freeing the merged skb + (return things to the allocating core): see tcp_eat_recv_skb? + * Remedies to consider for the performance problems at 100 Gbps, where one tx channel gets very backed up: * Implement zero-copy on output in order to reduce memory bandwidth From 39728229036d1bd8316ea12aa32991a7e45f639d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 24 Jan 2025 13:06:24 -0800 Subject: [PATCH 154/625] Add .txt files to manifest for upstreaming --- Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c1590b4a..c3f36464 100644 --- a/Makefile +++ b/Makefile @@ -54,10 +54,14 @@ CP_HDRS := homa_impl.h \ homa_stub.h \ homa_wire.h CP_SRCS := $(patsubst %.o,%.c,$(filter-out timetrace.o, $(HOMA_OBJS))) -CP_TARGETS := $(patsubst %,$(HOMA_TARGET)/%,$(CP_HDRS) $(CP_SRCS)) +CP_EXTRAS := reap.txt \ + sync.txt +CP_TARGETS := $(patsubst %,$(HOMA_TARGET)/%,$(CP_HDRS) $(CP_SRCS) $(CP_EXTRAS)) net-next: $(CP_TARGETS) $(LINUX_SRC_DIR)/include/uapi/linux/homa.h $(HOMA_TARGET)/%: % util/strip.py util/strip.py $< > $@ +$(HOMA_TARGET)/%.txt: %.txt + cp $< $@ $(LINUX_SRC_DIR)/include/uapi/linux/homa.h: homa.h util/strip.py util/strip.py $< > $@ From 5583a68c2e28f312665690510973baf942089aba Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 24 Jan 2025 13:06:51 -0800 Subject: [PATCH 155/625] Replace __u64 with u64 in non-uapi code Same with __s64 and __u32. 
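For context, a hedged illustration of the kernel convention being applied here (the
struct and field in the first snippet are hypothetical, chosen only to show the
pattern): the double-underscore types exist so that headers exported to user space
have well-defined types, while kernel-internal code is expected to use the plain
u64/u32/s64 forms:

    /* uapi header: visible to user space, so it keeps __u64. */
    struct example_uapi_args {
            __u64 id;                /* hypothetical field, for illustration */
    };

    /* Kernel-internal code, as in the diff below: plain u64. */
    u64 time = sched_clock();

homa.h itself is uapi and is therefore untouched; only in-kernel files appear in
the diffstat below.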
--- homa_grant.c | 10 +- homa_impl.h | 20 ++-- homa_incoming.c | 26 ++--- homa_metrics.c | 2 +- homa_metrics.h | 210 +++++++++++++++++++------------------- homa_offload.c | 18 ++-- homa_offload.h | 6 +- homa_outgoing.c | 18 ++-- homa_peer.c | 14 +-- homa_peer.h | 8 +- homa_plumbing.c | 16 +-- homa_pool.c | 12 +-- homa_pool.h | 8 +- homa_rpc.c | 8 +- homa_rpc.h | 24 ++--- homa_skb.c | 8 +- homa_sock.c | 6 +- homa_sock.h | 10 +- homa_timer.c | 6 +- homa_utils.c | 8 +- homa_wire.h | 2 +- reap.txt | 23 +++-- test/mock.c | 20 ++-- test/mock.h | 4 +- test/unit_homa_grant.c | 6 +- test/unit_homa_incoming.c | 4 +- test/unit_homa_offload.c | 8 +- test/unit_homa_outgoing.c | 4 +- test/unit_homa_peer.c | 2 +- test/unit_homa_plumbing.c | 6 +- test/unit_homa_pool.c | 20 ++-- test/unit_homa_rpc.c | 4 +- test/unit_homa_sock.c | 2 +- test/unit_homa_timer.c | 4 +- timetrace.c | 40 ++++---- timetrace.h | 38 +++---- util/cp_node.cc | 4 +- 37 files changed, 318 insertions(+), 311 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 5a51099b..5d389d20 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -81,7 +81,7 @@ void homa_grant_add_rpc(struct homa_rpc *rpc) /* Message not yet tracked; add it in priority order to * the peer's list. */ - __u64 time = sched_clock(); + u64 time = sched_clock(); INC_METRIC(grantable_rpcs_integral, homa->num_grantable_rpcs * (time - homa->last_grantable_change)); @@ -165,7 +165,7 @@ void homa_grant_remove_rpc(struct homa_rpc *rpc) struct homa *homa = rpc->hsk->homa; struct homa_peer *peer = rpc->peer; struct homa_rpc *candidate; - __u64 time = sched_clock(); + u64 time = sched_clock(); struct homa_rpc *head; if (list_empty(&rpc->grantable_links)) @@ -395,7 +395,7 @@ void homa_grant_recalc(struct homa *homa, int locked) */ struct homa_rpc *active_rpcs[HOMA_MAX_GRANTS]; int i, active, try_again; - __u64 start; + u64 start; tt_record("homa_grant_recalc starting"); INC_METRIC(grant_recalc_calls, 1); @@ -567,7 +567,7 @@ void homa_grant_find_oldest(struct homa *homa) int max_incoming = homa->grant_window + 2 * homa->fifo_grant_increment; struct homa_rpc *rpc, *oldest; struct homa_peer *peer; - __u64 oldest_birth; + u64 oldest_birth; oldest = NULL; oldest_birth = ~0; @@ -653,7 +653,7 @@ int homa_grantable_lock_slow(struct homa *homa, int recalc) __acquires(&homa->grantable_lock) { int starting_count = atomic_read(&homa->grant_recalc_count); - __u64 start = sched_clock(); + u64 start = sched_clock(); int result = 0; tt_record("beginning wait for grantable lock"); diff --git a/homa_impl.h b/homa_impl.h index f54cacdd..5745f8a7 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -254,7 +254,7 @@ struct homa { * @grantable_lock_time: sched_clock() time when grantable_lock * was last locked. */ - __u64 grantable_lock_time; + u64 grantable_lock_time; /** * @grant_recalc_count: Incremented every time homa_grant_recalc @@ -286,7 +286,7 @@ struct homa { * increment or decrement of num_grantable_rpcs; used for computing * statistics. */ - __u64 last_grantable_change; + u64 last_grantable_change; /** * @max_grantable_rpcs: The largest value that has been seen for @@ -366,7 +366,7 @@ struct homa { * @pacer_wake_time: time (in sched_clock units) when the pacer last * woke up (if the pacer is running) or 0 if the pacer is sleeping. */ - __u64 pacer_wake_time; + u64 pacer_wake_time; /** * @throttle_lock: Used to synchronize access to @throttled_rpcs. To @@ -387,7 +387,7 @@ struct homa { * @throttle_add: The time (in sched_clock() units) when the most * recent RPC was added to @throttled_rpcs. 
*/ - __u64 throttle_add; + u64 throttle_add; /** * @throttle_min_bytes: If a packet has fewer bytes than this, then it @@ -467,7 +467,7 @@ struct homa { * @skb_page_free_time: Time (in sched_clock() units) when the * next sk_buff page should be freed. Could be in the past. */ - __u64 skb_page_free_time; + u64 skb_page_free_time; /** * @skb_page_pool_min_kb: Don't return pages from a pool to Linux @@ -670,7 +670,7 @@ struct homa { * of the value, to ensure that we don't underestimate NIC queue * length and queue too many packets. */ - __u32 ns_per_mbyte; + u32 ns_per_mbyte; /** * @verbose: Nonzero enables additional logging. Set externally via @@ -772,7 +772,7 @@ struct homa { * @timer_ticks: number of times that homa_timer has been invoked * (may wraparound, which is safe). */ - __u32 timer_ticks; + u32 timer_ticks; /** * @metrics_mutex: Used to synchronize accesses to @metrics_active_opens @@ -996,7 +996,7 @@ static inline bool is_homa_pkt(struct sk_buff *skb) * @x: Address (either IPv6 or IPv4-mapped IPv6) * Return: see above */ -static inline __u32 tt_addr(const struct in6_addr x) +static inline u32 tt_addr(const struct in6_addr x) { return ipv6_addr_v4mapped(&x) ? ntohl(x.in6_u.u6_addr32[3]) : (x.in6_u.u6_addr32[3] ? ntohl(x.in6_u.u6_addr32[3]) @@ -1111,7 +1111,7 @@ void homa_prios_changed(struct homa *homa); int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); int homa_register_interests(struct homa_interest *interest, - struct homa_sock *hsk, int flags, __u64 id); + struct homa_sock *hsk, int flags, u64 id); void homa_remove_from_throttled(struct homa_rpc *rpc); void homa_resend_data(struct homa_rpc *rpc, int start, int end, int priority); @@ -1150,7 +1150,7 @@ int homa_unsched_priority(struct homa *homa, struct homa_peer *peer, int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors); struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, - __u64 id); + u64 id); int homa_xmit_control(enum homa_packet_type type, void *contents, size_t length, struct homa_rpc *rpc); int __homa_xmit_control(void *contents, size_t length, diff --git a/homa_incoming.c b/homa_incoming.c index 54f6334d..1d09b4f1 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -232,7 +232,7 @@ int homa_copy_to_user(struct homa_rpc *rpc) int end_offset = 0; #endif /* See strip.py */ int error = 0; - __u64 start; + u64 start; int n = 0; /* Number of filled entries in skbs. */ int i; @@ -366,7 +366,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) #endif /* __UNIT_TEST__ */ const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); struct homa_data_hdr *h = (struct homa_data_hdr *)skb->data; - __u64 id = homa_local_id(h->common.sender_id); + u64 id = homa_local_id(h->common.sender_id); int dport = ntohs(h->common.dport); /* Used to collect acks from data packets so we can process them @@ -549,7 +549,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) * nor homa_timer can keep up with reaping dead * RPCs. See reap.txt for details. 
*/ - __u64 start = sched_clock(); + u64 start = sched_clock(); tt_record("homa_data_pkt calling homa_rpc_reap"); homa_rpc_reap(hsk, false); @@ -817,7 +817,7 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, { struct homa_common_hdr *h = (struct homa_common_hdr *)skb->data; const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); - __u64 id = homa_local_id(h->sender_id); + u64 id = homa_local_id(h->sender_id); struct homa_peer *peer; struct homa_ack_hdr ack; @@ -911,7 +911,7 @@ void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, struct homa_rpc *homa_choose_fifo_grant(struct homa *homa) { struct homa_rpc *rpc, *oldest; - __u64 oldest_birth; + u64 oldest_birth; int granted; oldest = NULL; @@ -1102,7 +1102,7 @@ void homa_abort_sock_rpcs(struct homa_sock *hsk, int error) * interest. */ int homa_register_interests(struct homa_interest *interest, - struct homa_sock *hsk, int flags, __u64 id) + struct homa_sock *hsk, int flags, u64 id) { struct homa_rpc *rpc = NULL; int locked = 1; @@ -1210,10 +1210,10 @@ int homa_register_interests(struct homa_interest *interest, * errno value. The RPC will be locked; the caller must unlock. */ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, - __u64 id) + u64 id) __acquires(&rpc->bucket_lock) { - __u64 poll_start, poll_end, now; + u64 poll_start, poll_end, now; int error, blocked = 0, polled = 0; struct homa_rpc *result = NULL; struct homa_interest interest; @@ -1270,7 +1270,7 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, poll_start = now; poll_end = now + (1000 * hsk->homa->poll_usecs); while (1) { - __u64 blocked; + u64 blocked; rpc = homa_interest_get_rpc(&interest); if (rpc) { @@ -1301,8 +1301,8 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, set_current_state(TASK_INTERRUPTIBLE); rpc = homa_interest_get_rpc(&interest); if (!rpc && !hsk->shutdown) { - __u64 end; - __u64 start = sched_clock(); + u64 end; + u64 start = sched_clock(); tt_record1("homa_wait_for_message sleeping, pid %d", current->pid); @@ -1403,7 +1403,7 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, struct homa_interest *homa_choose_interest(struct homa *homa, struct list_head *head, int offset) { - __u64 busy_time = sched_clock() - homa->busy_ns; + u64 busy_time = sched_clock() - homa->busy_ns; struct homa_interest *backup = NULL; struct homa_interest *interest; struct list_head *pos; @@ -1520,7 +1520,7 @@ void homa_rpc_handoff(struct homa_rpc *rpc) */ void homa_incoming_sysctl_changed(struct homa *homa) { - __u64 tmp; + u64 tmp; if (homa->grant_fifo_fraction > 500) homa->grant_fifo_fraction = 500; diff --git a/homa_metrics.c b/homa_metrics.c index 7902ae56..45bf5b47 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -81,7 +81,7 @@ char *homa_metrics_print(struct homa *homa) sched_clock()); for (core = 0; core < nr_cpu_ids; core++) { struct homa_metrics *m = &per_cpu(homa_metrics, core); - __s64 delta; + s64 delta; M("core %15d Core id for following metrics\n", core); diff --git a/homa_metrics.h b/homa_metrics.h index ed70018b..22695aec 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -26,7 +26,7 @@ struct homa_metrics { * inclusive. */ #define HOMA_NUM_SMALL_COUNTS 64 - __u64 small_msg_bytes[HOMA_NUM_SMALL_COUNTS]; + u64 small_msg_bytes[HOMA_NUM_SMALL_COUNTS]; /** * @medium_msg_bytes: entry i holds the total number of bytes @@ -35,319 +35,319 @@ struct homa_metrics { * (small_msg_counts covers this range). 
*/ #define HOMA_NUM_MEDIUM_COUNTS 128 - __u64 medium_msg_bytes[HOMA_NUM_MEDIUM_COUNTS]; + u64 medium_msg_bytes[HOMA_NUM_MEDIUM_COUNTS]; /** * @large_msg_count: the total number of messages received whose * length is too large to appear in medium_msg_bytes. */ - __u64 large_msg_count; + u64 large_msg_count; /** * @large_msg_bytes: the total number of bytes received in * messages too large to be counted by medium_msg_bytes. */ - __u64 large_msg_bytes; + u64 large_msg_bytes; /** * @sent_msg_bytes: The total number of bytes in outbound * messages. */ - __u64 sent_msg_bytes; + u64 sent_msg_bytes; /** * @packets_sent: total number of packets sent for each packet type * (entry 0 corresponds to DATA, and so on). */ - __u64 packets_sent[BOGUS - DATA]; + u64 packets_sent[BOGUS - DATA]; /** * @packets_received: total number of packets received for each * packet type (entry 0 corresponds to DATA, and so on). */ - __u64 packets_received[BOGUS - DATA]; + u64 packets_received[BOGUS - DATA]; /** @priority_bytes: total bytes sent at each priority level. */ - __u64 priority_bytes[HOMA_MAX_PRIORITIES]; + u64 priority_bytes[HOMA_MAX_PRIORITIES]; /** @priority_packets: total packets sent at each priority level. */ - __u64 priority_packets[HOMA_MAX_PRIORITIES]; + u64 priority_packets[HOMA_MAX_PRIORITIES]; /** * @skb_allocs: total number of calls to homa_skb_new_tx. */ - __u64 skb_allocs; + u64 skb_allocs; /** @skb_alloc_ns: total time spent in homa_skb_new_tx. */ - __u64 skb_alloc_ns; + u64 skb_alloc_ns; /** * @skb_frees: total number of sk_buffs for data packets that have * been freed (counts normal paths only). */ - __u64 skb_frees; + u64 skb_frees; /** @skb_free_ns: total time spent freeing sk_buffs. */ - __u64 skb_free_ns; + u64 skb_free_ns; /** * @skb_page_allocs: total number of calls to homa_skb_page_alloc. */ - __u64 skb_page_allocs; + u64 skb_page_allocs; /** @skb_page_alloc_ns: total time spent in homa_skb_page_alloc. */ - __u64 skb_page_alloc_ns; + u64 skb_page_alloc_ns; /** * @requests_received: total number of request messages received. */ - __u64 requests_received; + u64 requests_received; /** * @requests_queued: total number of requests that were added to * @homa->ready_requests (no thread was waiting). */ - __u64 requests_queued; + u64 requests_queued; /** * @responses_received: total number of response messages received. */ - __u64 responses_received; + u64 responses_received; /** * @responses_queued: total number of responses that were added to * @homa->ready_responses (no thread was waiting). */ - __u64 responses_queued; + u64 responses_queued; /** * @fast_wakeups: total number of times that a message arrived for * a receiving thread that was polling in homa_wait_for_message. */ - __u64 fast_wakeups; + u64 fast_wakeups; /** * @slow_wakeups: total number of times that a receiving thread * had to be put to sleep (no message arrived while it was polling). */ - __u64 slow_wakeups; + u64 slow_wakeups; /** * @handoffs_thread_waiting: total number of times that an RPC * was handed off to a waiting thread (vs. being queued). */ - __u64 handoffs_thread_waiting; + u64 handoffs_thread_waiting; /** * @handoffs_alt_thread: total number of times that a thread other * than the first on the list was chosen for a handoff (because the * first thread was on a busy core). */ - __u64 handoffs_alt_thread; + u64 handoffs_alt_thread; /** * @poll_ns: total time spent in the polling loop in * homa_wait_for_message. 
 	 */
-	__u64 poll_ns;
+	u64 poll_ns;

 	/**
 	 * @softirq_calls: total number of calls to homa_softirq (i.e.,
 	 * total number of GRO packets processed, each of which could contain
 	 * multiple Homa packets).
 	 */
-	__u64 softirq_calls;
+	u64 softirq_calls;

 	/**
 	 * @softirq_ns: total time spent executing homa_softirq when
 	 * invoked under Linux's SoftIRQ handler.
 	 */
-	__u64 softirq_ns;
+	u64 softirq_ns;

 	/**
 	 * @bypass_softirq_ns: total time spent executing homa_softirq when
 	 * invoked during GRO, bypassing the SoftIRQ mechanism.
 	 */
-	__u64 bypass_softirq_ns;
+	u64 bypass_softirq_ns;

 	/**
 	 * @linux_softirq_ns: total time spent executing all softirq
 	 * activities, as measured by the linux softirq module. Only
 	 * available with modified Linux kernels.
 	 */
-	__u64 linux_softirq_ns;
+	u64 linux_softirq_ns;

 	/**
 	 * @napi_ns: total time spent executing all NAPI activities, as
 	 * measured by the linux softirq module. Only available with modified
 	 * Linux kernels.
 	 */
-	__u64 napi_ns;
+	u64 napi_ns;

 	/**
 	 * @send_ns: total time spent executing the homa_sendmsg kernel
 	 * call handler to send requests.
 	 */
-	__u64 send_ns;
+	u64 send_ns;

 	/**
 	 * @send_calls: total number of invocations of homa_sendmsg
 	 * for requests.
 	 */
-	__u64 send_calls;
+	u64 send_calls;

 	/**
 	 * @recv_ns: total time spent executing homa_recvmsg (including
 	 * time when the thread is blocked).
 	 */
-	__u64 recv_ns;
+	u64 recv_ns;

 	/** @recv_calls: total number of invocations of homa_recvmsg. */
-	__u64 recv_calls;
+	u64 recv_calls;

 	/**
 	 * @blocked_ns: total time spent by threads in blocked state
 	 * while executing the homa_recvmsg kernel call handler.
 	 */
-	__u64 blocked_ns;
+	u64 blocked_ns;

 	/**
 	 * @reply_ns: total time spent executing the homa_sendmsg kernel
 	 * call handler to send responses.
 	 */
-	__u64 reply_ns;
+	u64 reply_ns;

 	/**
 	 * @reply_calls: total number of invocations of homa_sendmsg
 	 * for responses.
 	 */
-	__u64 reply_calls;
+	u64 reply_calls;

 	/**
 	 * @abort_ns: total time spent executing the homa_ioc_abort
 	 * kernel call handler.
 	 */
-	__u64 abort_ns;
+	u64 abort_ns;

 	/**
 	 * @abort_calls: total number of invocations of the homa_ioc_abort
 	 * kernel call.
 	 */
-	__u64 abort_calls;
+	u64 abort_calls;

 	/**
 	 * @so_set_buf_ns: total time spent executing the homa_ioc_set_buf
 	 * kernel call handler.
 	 */
-	__u64 so_set_buf_ns;
+	u64 so_set_buf_ns;

 	/**
 	 * @so_set_buf_calls: total number of invocations of the homa_ioc_set_buf
 	 * kernel call.
 	 */
-	__u64 so_set_buf_calls;
+	u64 so_set_buf_calls;

 	/**
 	 * @grantable_lock_ns: total time spent with homa->grantable_lock
 	 * locked.
 	 */
-	__u64 grantable_lock_ns;
+	u64 grantable_lock_ns;

 	/** @timer_ns: total time spent in homa_timer. */
-	__u64 timer_ns;
+	u64 timer_ns;

 	/**
 	 * @timer_reap_ns: total time spent by homa_timer to reap dead
 	 * RPCs. This time is included in @timer_ns.
 	 */
-	__u64 timer_reap_ns;
+	u64 timer_reap_ns;

 	/**
 	 * @data_pkt_reap_ns: total time spent by homa_data_pkt to reap
 	 * dead RPCs.
 	 */
-	__u64 data_pkt_reap_ns;
+	u64 data_pkt_reap_ns;

 	/**
 	 * @pacer_ns: total time spent executing in homa_pacer_main
 	 * (not including blocked time).
 	 */
-	__u64 pacer_ns;
+	u64 pacer_ns;

 	/**
 	 * @pacer_lost_ns: unnecessary delays in transmitting packets
 	 * (i.e. wasted output bandwidth) because the pacer was slow or got
 	 * descheduled.
 	 */
-	__u64 pacer_lost_ns;
+	u64 pacer_lost_ns;

 	/**
 	 * @pacer_bytes: total number of bytes transmitted when
 	 * @homa->throttled_rpcs is nonempty.
 	 */
-	__u64 pacer_bytes;
+	u64 pacer_bytes;

 	/**
 	 * @pacer_skipped_rpcs: total number of times that the pacer had to
 	 * abort because it couldn't lock an RPC.
*/ - __u64 pacer_skipped_rpcs; + u64 pacer_skipped_rpcs; /** * @pacer_needed_help: total number of times that homa_check_pacer * found that the pacer was running behind, so it actually invoked * homa_pacer_xmit. */ - __u64 pacer_needed_help; + u64 pacer_needed_help; /** * @throttled_ns: total amount of time that @homa->throttled_rpcs * is nonempty. */ - __u64 throttled_ns; + u64 throttled_ns; /** * @resent_packets: total number of data packets issued in response to * RESEND packets. */ - __u64 resent_packets; + u64 resent_packets; /** * @peer_hash_links: total # of link traversals in homa_peer_find. */ - __u64 peer_hash_links; + u64 peer_hash_links; /** * @peer_new_entries: total # of new entries created in Homa's * peer table (this value doesn't increment if the desired peer is * found in the entry in its hash chain). */ - __u64 peer_new_entries; + u64 peer_new_entries; /** * @peer_kmalloc_errors: total number of times homa_peer_find * returned an error because it couldn't allocate memory for a new * peer. */ - __u64 peer_kmalloc_errors; + u64 peer_kmalloc_errors; /** * @peer_route_errors: total number of times homa_peer_find * returned an error because it couldn't create a route to the peer. */ - __u64 peer_route_errors; + u64 peer_route_errors; /** * @control_xmit_errors: total number of times ip_queue_xmit * failed when transmitting a control packet. */ - __u64 control_xmit_errors; + u64 control_xmit_errors; /** * @data_xmit_errors: total number of times ip_queue_xmit * failed when transmitting a data packet. */ - __u64 data_xmit_errors; + u64 data_xmit_errors; /** * @unknown_rpcs: total number of times an incoming packet was @@ -355,292 +355,292 @@ struct homa_metrics { * count grant packets received by servers (since these are * fairly common). */ - __u64 unknown_rpcs; + u64 unknown_rpcs; /** * @server_cant_create_rpcs: total number of times a server discarded * an incoming packet because it couldn't create a homa_rpc object. */ - __u64 server_cant_create_rpcs; + u64 server_cant_create_rpcs; /** * @unknown_packet_types: total number of times a packet was discarded * because its type wasn't one of the supported values. */ - __u64 unknown_packet_types; + u64 unknown_packet_types; /** * @short_packets: total number of times a packet was discarded * because it was too short to hold all the required information. */ - __u64 short_packets; + u64 short_packets; /** * @packet_discards: total number of times a normal (non-retransmitted) * packet was discarded because all its data had already been received. */ - __u64 packet_discards; + u64 packet_discards; /** * @resent_discards: total number of times a retransmitted packet * was discarded because its data had already been received. */ - __u64 resent_discards; + u64 resent_discards; /** * @resent_packets_used: total number of times a resent packet was * actually incorporated into the message at the target (i.e. it * wasn't redundant). */ - __u64 resent_packets_used; + u64 resent_packets_used; /** * @rpc_timeouts: total number of times an RPC (either client or * server) was aborted because the peer was nonresponsive. */ - __u64 rpc_timeouts; + u64 rpc_timeouts; /** * @server_rpc_discards: total number of times an RPC was aborted on * the server side because of a timeout. */ - __u64 server_rpc_discards; + u64 server_rpc_discards; /** * @server_rpcs_unknown: total number of times an RPC was aborted on * the server side because it is no longer known to the client. 
*/ - __u64 server_rpcs_unknown; + u64 server_rpcs_unknown; /** * @client_lock_misses: total number of times that Homa had to wait * to acquire a client bucket lock. */ - __u64 client_lock_misses; + u64 client_lock_misses; /** * @client_lock_miss_ns: total time spent waiting for client * bucket lock misses. */ - __u64 client_lock_miss_ns; + u64 client_lock_miss_ns; /** * @server_lock_misses: total number of times that Homa had to wait * to acquire a server bucket lock. */ - __u64 server_lock_misses; + u64 server_lock_misses; /** * @server_lock_miss_ns: total time spent waiting for server * bucket lock misses. */ - __u64 server_lock_miss_ns; + u64 server_lock_miss_ns; /** * @socket_lock_miss_ns: total time spent waiting for socket * lock misses. */ - __u64 socket_lock_miss_ns; + u64 socket_lock_miss_ns; /** * @socket_lock_misses: total number of times that Homa had to wait * to acquire a socket lock. */ - __u64 socket_lock_misses; + u64 socket_lock_misses; /** * @throttle_lock_miss_ns: total time spent waiting for throttle * lock misses. */ - __u64 throttle_lock_miss_ns; + u64 throttle_lock_miss_ns; /** * @throttle_lock_misses: total number of times that Homa had to wait * to acquire the throttle lock. */ - __u64 throttle_lock_misses; + u64 throttle_lock_misses; /** * @peer_ack_lock_miss_ns: total time spent waiting for peer lock misses. */ - __u64 peer_ack_lock_miss_ns; + u64 peer_ack_lock_miss_ns; /** * @peer_ack_lock_misses: total number of times that Homa had to wait * to acquire the lock used for managing acks for a peer. */ - __u64 peer_ack_lock_misses; + u64 peer_ack_lock_misses; /** * @grantable_lock_miss_ns: total time spent waiting for grantable * lock misses. */ - __u64 grantable_lock_miss_ns; + u64 grantable_lock_miss_ns; /** * @grantable_lock_misses: total number of times that Homa had to wait * to acquire the grantable lock. */ - __u64 grantable_lock_misses; + u64 grantable_lock_misses; /** * @grantable_rpcs_integral: cumulative sum of time_delta*grantable, * where time_delta is in nanoseconds and grantable is the value of * homa->num_grantable_rpcs over that time period. */ - __u64 grantable_rpcs_integral; + u64 grantable_rpcs_integral; /** * @grant_recalc_calls: cumulative number of times homa_grant_recalc * has been invoked. */ - __u64 grant_recalc_calls; + u64 grant_recalc_calls; /** @grant_recalc_ns: total time spent in homa_grant_recalc. */ - __u64 grant_recalc_ns; + u64 grant_recalc_ns; /** * @grant_recalc_loops: cumulative number of times homa_grant_recalc * has looped back to recalculate again. */ - __u64 grant_recalc_loops; + u64 grant_recalc_loops; /** * @grant_recalc_skips: cumulative number of times that * homa_grant_recalc skipped its work because another thread * already did it. */ - __u64 grant_recalc_skips; + u64 grant_recalc_skips; /** * @grant_priority_bumps: cumulative number of times the grant priority * of an RPC has increased above its next-higher-priority neighbor. */ - __u64 grant_priority_bumps; + u64 grant_priority_bumps; /** * @fifo_grants: total number of times that grants were sent to * the oldest message. */ - __u64 fifo_grants; + u64 fifo_grants; /** * @fifo_grants_no_incoming: total number of times that, when a * FIFO grant was issued, the message had no outstanding grants * (everything granted had been received). */ - __u64 fifo_grants_no_incoming; + u64 fifo_grants_no_incoming; /** * @disabled_reaps: total number of times that the reaper couldn't * run at all because it was disabled.
*/ - __u64 disabled_reaps; + u64 disabled_reaps; /** * @disabled_rpc_reaps: total number of times that the reaper skipped * an RPC because reaping was disabled for that particular RPC */ - __u64 disabled_rpc_reaps; + u64 disabled_rpc_reaps; /** * @reaper_calls: total number of times that the reaper was invoked * and was not disabled. */ - __u64 reaper_calls; + u64 reaper_calls; /** * @reaper_dead_skbs: incremented by hsk->dead_skbs each time that * reaper_calls is incremented. */ - __u64 reaper_dead_skbs; + u64 reaper_dead_skbs; /** * @forced_reaps: total number of times that homa_wait_for_message * invoked the reaper because dead_skbs was too high. */ - __u64 forced_reaps; + u64 forced_reaps; /** * @throttle_list_adds: total number of calls to homa_add_to_throttled. */ - __u64 throttle_list_adds; + u64 throttle_list_adds; /** * @throttle_list_checks: number of list elements examined in * calls to homa_add_to_throttled. */ - __u64 throttle_list_checks; + u64 throttle_list_checks; /** * @ack_overflows: total number of times that homa_peer_add_ack * found insufficient space for the new id and hence had to send an * ACK message. */ - __u64 ack_overflows; + u64 ack_overflows; /** * @ignored_need_acks: total number of times that a NEED_ACK packet * was ignored because the RPC's result hadn't been fully received. */ - __u64 ignored_need_acks; + u64 ignored_need_acks; /** * @bpage_reuses: total number of times that, when an owned page * reached the end, it could be reused because all existing * allocations had been released. */ - __u64 bpage_reuses; + u64 bpage_reuses; /** * @buffer_alloc_failures: total number of times that * homa_pool_allocate was unable to allocate buffer space for * an incoming message. */ - __u64 buffer_alloc_failures; + u64 buffer_alloc_failures; /** * @linux_pkt_alloc_bytes: total bytes allocated in new packet buffers * by the NIC driver because of packet cache underflows. */ - __u64 linux_pkt_alloc_bytes; + u64 linux_pkt_alloc_bytes; /** * @dropped_data_no_bufs: total bytes of incoming data dropped because * there was no application buffer space available. */ - __u64 dropped_data_no_bufs; + u64 dropped_data_no_bufs; /** * @gen3_handoffs: total number of handoffs from GRO to SoftIRQ made * by Gen3 load balancer. */ - __u64 gen3_handoffs; + u64 gen3_handoffs; /** * @gen3_alt_handoffs: total number of GRO->SoftIRQ handoffs that * didn't choose the primary SoftIRQ core because it was busy with * app threads. */ - __u64 gen3_alt_handoffs; + u64 gen3_alt_handoffs; /** * @gro_grant_bypasses: total number of GRANT packets passed directly * to homa_softirq by homa_gro_receive, bypassing the normal SoftIRQ * mechanism (triggered by HOMA_GRO_FAST_GRANTS). */ - __u64 gro_grant_bypasses; + u64 gro_grant_bypasses; /** * @gro_data_bypasses: total number of DATA packets passed directly * to homa_softirq by homa_gro_receive, bypassing the normal SoftIRQ * mechanism (triggered by HOMA_GRO_SHORT_BYPASS). */ - __u64 gro_data_bypasses; + u64 gro_data_bypasses; /** @temp: For temporary use during testing. */ #define NUM_TEMP_METRICS 10 - __u64 temp[NUM_TEMP_METRICS]; + u64 temp[NUM_TEMP_METRICS]; }; DECLARE_PER_CPU(struct homa_metrics, homa_metrics); diff --git a/homa_offload.c b/homa_offload.c index 67da24b1..291913e3 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -282,17 +282,17 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, * gro_list by the caller, so it will be considered for merges * in the future. 
*/ - __u64 saved_softirq_metric, softirq_ns; + u64 saved_softirq_metric, softirq_ns; struct homa_offload_core *offload_core; struct homa *homa = global_homa; struct sk_buff *result = NULL; struct homa_data_hdr *h_new; - __u64 *softirq_ns_metric; + u64 *softirq_ns_metric; struct sk_buff *held_skb; - __u64 now = sched_clock(); + u64 now = sched_clock(); int priority; - __u32 saddr; - __u32 hash; + u32 saddr; + u32 hash; int busy; if (!homa_make_header_avl(skb)) @@ -486,7 +486,7 @@ void homa_gro_gen2(struct homa *homa, struct sk_buff *skb) int this_core = raw_smp_processor_id(); struct homa_offload_core *offload_core; int candidate = this_core; - __u64 now = sched_clock(); + u64 now = sched_clock(); int i; for (i = CORES_TO_CHECK; i > 0; i--) { @@ -540,7 +540,7 @@ void homa_gro_gen3(struct homa *homa, struct sk_buff *skb) */ struct homa_data_hdr *h = (struct homa_data_hdr *)skb_transport_header(skb); - __u64 now, busy_time; + u64 now, busy_time; int *candidates; int i, core; @@ -600,8 +600,8 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset) homa_gro_gen2(homa, skb); } else if (homa->gro_policy & HOMA_GRO_IDLE) { int i, core, best; - __u64 best_time = ~0; - __u64 last_active; + u64 best_time = ~0; + u64 last_active; /* Pick a specific core to handle SoftIRQ processing for this * group of packets. The goal here is to spread load so that no diff --git a/homa_offload.h b/homa_offload.h index cdc7b795..b0f21a8c 100644 --- a/homa_offload.h +++ b/homa_offload.h @@ -17,14 +17,14 @@ struct homa_offload_core { * there was system activity, such as NAPI or SoftIRQ, on this * core. Used for load balancing. */ - __u64 last_active; + u64 last_active; /** * @last_gro: the last time (in sched_clock() units) that * homa_gro_receive returned on this core. Used to determine * whether GRO is keeping a core busy. */ - __u64 last_gro; + u64 last_gro; /** * @softirq_backlog: the number of batches of packets that have @@ -55,7 +55,7 @@ struct homa_offload_core { * by sending or receiving messages). Used for load balancing * (see balance.txt). */ - __u64 last_app_active; + u64 last_app_active; /** * @held_skb: last packet buffer known to be available for diff --git a/homa_outgoing.c b/homa_outgoing.c index 2e035047..1ee8a63d 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -107,7 +107,7 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, struct homa_data_hdr *h; struct sk_buff *skb; int err, gso_size; - __u64 segs; + u64 segs; segs = length + max_seg_data - 1; do_div(segs, max_seg_data); @@ -213,7 +213,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) struct sk_buff **last_link; struct dst_entry *dst; - __u64 segs_per_gso; + u64 segs_per_gso; int overlap_xmit; /* Bytes of the message that haven't yet been copied into skbs.
*/ @@ -733,7 +733,7 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end, */ void homa_outgoing_sysctl_changed(struct homa *homa) { - __u64 tmp; + u64 tmp; tmp = 8 * 1000ULL * 1000ULL * 1000ULL; @@ -761,7 +761,7 @@ void homa_outgoing_sysctl_changed(struct homa *homa) */ int homa_check_nic_queue(struct homa *homa, struct sk_buff *skb, bool force) { - __u64 idle, new_idle, clock, ns_for_packet; + u64 idle, new_idle, clock, ns_for_packet; int bytes; bytes = homa_get_skb_info(skb)->wire_bytes; @@ -779,7 +779,7 @@ int homa_check_nic_queue(struct homa *homa, struct sk_buff *skb, bool force) #ifndef __STRIP__ /* See strip.py */ if (idle < clock) { if (homa->pacer_wake_time) { - __u64 lost = (homa->pacer_wake_time > idle) + u64 lost = (homa->pacer_wake_time > idle) ? clock - homa->pacer_wake_time : clock - idle; INC_METRIC(pacer_lost_ns, lost); @@ -880,7 +880,7 @@ void homa_pacer_xmit(struct homa *homa) * homa_pacer_main about interfering with softirq handlers). */ for (i = 0; i < 5; i++) { - __u64 idle_time, now; + u64 idle_time, now; /* If the NIC queue is too long, wait until it gets shorter. */ now = sched_clock(); @@ -912,7 +912,7 @@ void homa_pacer_xmit(struct homa *homa) homa->pacer_fifo_count -= homa->pacer_fifo_fraction; if (homa->pacer_fifo_count <= 0) { struct homa_rpc *cur; - __u64 oldest = ~0; + u64 oldest = ~0; homa->pacer_fifo_count += 1000; rpc = NULL; @@ -1006,7 +1006,7 @@ void homa_add_to_throttled(struct homa_rpc *rpc) struct homa_rpc *candidate; int bytes_left; int checks = 0; - __u64 now; + u64 now; if (!list_empty(&rpc->throttled_links)) return; @@ -1069,7 +1069,7 @@ void homa_remove_from_throttled(struct homa_rpc *rpc) void homa_log_throttled(struct homa *homa) { struct homa_rpc *rpc; - __s64 bytes = 0; + s64 bytes = 0; int rpcs = 0; pr_notice("Printing throttled list\n"); diff --git a/homa_peer.c b/homa_peer.c index e2304f48..5f53ad53 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -115,7 +115,7 @@ struct homa_peer **homa_peertab_get_peers(struct homa_peertab *peertab, * dates no later than this will be freed. Specify ~0 to * free all entries. */ -void homa_peertab_gc_dsts(struct homa_peertab *peertab, __u64 now) +void homa_peertab_gc_dsts(struct homa_peertab *peertab, u64 now) { while (!list_empty(&peertab->dead_dsts)) { struct homa_dead_dst *dead = @@ -153,14 +153,14 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab, struct dst_entry *dst; // Should use siphash or jhash here: - __u32 bucket = hash_32((__force __u32)addr->in6_u.u6_addr32[0], + u32 bucket = hash_32((__force u32)addr->in6_u.u6_addr32[0], HOMA_PEERTAB_BUCKET_BITS); - bucket ^= hash_32((__force __u32)addr->in6_u.u6_addr32[1], + bucket ^= hash_32((__force u32)addr->in6_u.u6_addr32[1], HOMA_PEERTAB_BUCKET_BITS); - bucket ^= hash_32((__force __u32)addr->in6_u.u6_addr32[2], + bucket ^= hash_32((__force u32)addr->in6_u.u6_addr32[2], HOMA_PEERTAB_BUCKET_BITS); - bucket ^= hash_32((__force __u32)addr->in6_u.u6_addr32[3], + bucket ^= hash_32((__force u32)addr->in6_u.u6_addr32[3], HOMA_PEERTAB_BUCKET_BITS); hlist_for_each_entry_rcu(peer, &peertab->buckets[bucket], peertab_links) { @@ -230,7 +230,7 @@ void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, { struct homa_dead_dst *save_dead; struct dst_entry *dst; - __u64 now; + u64 now; /* Need to keep around the current entry for a while in case * someone is using it. 
If we can't do that, then don't update @@ -366,7 +366,7 @@ void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, int c2, void homa_peer_lock_slow(struct homa_peer *peer) __acquires(&peer->ack_lock) { - __u64 start = sched_clock(); + u64 start = sched_clock(); tt_record("beginning wait for peer lock"); spin_lock_bh(&peer->ack_lock); diff --git a/homa_peer.h b/homa_peer.h index 5a33d955..aba37914 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -26,7 +26,7 @@ struct homa_dead_dst { * @gc_time: Time (in units of sched_clock()) when it is safe * to free @dst. */ - __u64 gc_time; + u64 gc_time; /** @dst_links: Used to link together entries in peertab->dead_dsts. */ struct list_head dst_links; @@ -161,14 +161,14 @@ struct homa_peer { * @least_recent_ticks: the @resend_timer_ticks value for * @least_recent_rpc. */ - __u32 least_recent_ticks; + u32 least_recent_ticks; /** * @current_ticks: the value of @homa->timer_ticks the last time * that @least_recent_rpc and @least_recent_ticks were computed. * Used to detect the start of a new homa_timer pass. */ - __u32 current_ticks; + u32 current_ticks; /** * @resend_rpc: the value of @least_recent_rpc computed in the @@ -215,7 +215,7 @@ struct dst_entry void homa_peer_lock_slow(struct homa_peer *peer); void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7); -void homa_peertab_gc_dsts(struct homa_peertab *peertab, __u64 now); +void homa_peertab_gc_dsts(struct homa_peertab *peertab, u64 now); /** * homa_peer_lock() - Acquire the lock for a peer's @unacked_lock. If the lock diff --git a/homa_plumbing.c b/homa_plumbing.c index dc0ed0a7..3102cf5d 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -794,7 +794,7 @@ int homa_ioc_abort(struct sock *sk, int *karg) int homa_ioctl(struct sock *sk, int cmd, int *karg) { int result; - __u64 start = sched_clock(); + u64 start = sched_clock(); switch (cmd) { case HOMAIOCABORT: @@ -850,7 +850,7 @@ int homa_setsockopt(struct sock *sk, int level, int optname, { struct homa_sock *hsk = homa_sk(sk); struct homa_rcvbuf_args args; - __u64 start = sched_clock(); + u64 start = sched_clock(); int ret; if (level != IPPROTO_HOMA || optname != SO_HOMA_RCVBUF) @@ -926,10 +926,10 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) struct homa_sock *hsk = homa_sk(sk); struct homa_sendmsg_args args; union sockaddr_in_union *addr; - __u64 start = sched_clock(); + u64 start = sched_clock(); struct homa_rpc *rpc = NULL; int result = 0; - __u64 finish; + u64 finish; per_cpu(homa_offload_core, raw_smp_processor_id()).last_app_active = start; @@ -1066,9 +1066,9 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, { struct homa_sock *hsk = homa_sk(sk); struct homa_recvmsg_args control; - __u64 start = sched_clock(); + u64 start = sched_clock(); struct homa_rpc *rpc; - __u64 finish; + u64 finish; int result; INC_METRIC(recv_calls, 1); @@ -1236,7 +1236,7 @@ int homa_softirq(struct sk_buff *skb) struct homa *homa = global_homa; struct homa_common_hdr *h; int header_offset; - __u64 start; + u64 start; start = sched_clock(); INC_METRIC(softirq_calls, 1); @@ -1480,7 +1480,7 @@ __poll_t homa_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { struct sock *sk = sock->sk; - __u32 mask; + u32 mask; sock_poll_wait(file, sock, wait); mask = POLLOUT | POLLWRNORM; diff --git a/homa_pool.c b/homa_pool.c index bee403af..3982b340 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -49,7 +49,7 @@ static void 
set_bpages_needed(struct homa_pool *pool) * Return: Either zero (for success) or a negative errno for failure. */ int homa_pool_init(struct homa_sock *hsk, void __user *region, - __u64 region_size) + u64 region_size) { struct homa_pool *pool = hsk->buffer_pool; int i, result; @@ -151,12 +151,12 @@ void homa_pool_get_rcvbuf(struct homa_sock *hsk, * set). Otherwise the pages are left unowned. * Return: 0 for success, -1 if there wasn't enough free space in the pool. */ -int homa_pool_get_pages(struct homa_pool *pool, int num_pages, __u32 *pages, +int homa_pool_get_pages(struct homa_pool *pool, int num_pages, u32 *pages, int set_owner) { int core_num = raw_smp_processor_id(); struct homa_pool_core *core; - __u64 now = sched_clock(); + u64 now = sched_clock(); int alloced = 0; int limit = 0; @@ -257,7 +257,7 @@ int homa_pool_allocate(struct homa_rpc *rpc) { struct homa_pool *pool = rpc->hsk->buffer_pool; int full_pages, partial, i, core_id; - __u32 pages[HOMA_MAX_BPAGES]; + u32 pages[HOMA_MAX_BPAGES]; struct homa_pool_core *core; struct homa_bpage *bpage; struct homa_rpc *other; @@ -407,7 +407,7 @@ void __user *homa_pool_get_buffer(struct homa_rpc *rpc, int offset, * Return: 0 for success, otherwise a negative errno. */ int homa_pool_release_buffers(struct homa_pool *pool, int num_buffers, - __u32 *buffers) + u32 *buffers) { int result = 0; int i; @@ -415,7 +415,7 @@ int homa_pool_release_buffers(struct homa_pool *pool, int num_buffers, if (!pool->region) return result; for (i = 0; i < num_buffers; i++) { - __u32 bpage_index = buffers[i] >> HOMA_BPAGE_SHIFT; + u32 bpage_index = buffers[i] >> HOMA_BPAGE_SHIFT; struct homa_bpage *bpage = &pool->descriptors[bpage_index]; if (bpage_index < pool->num_bpages) { diff --git a/homa_pool.h b/homa_pool.h index 94cb648c..1aa7a66a 100644 --- a/homa_pool.h +++ b/homa_pool.h @@ -42,7 +42,7 @@ struct homa_bpage { * which it's OK to steal this page from its current * owner (if @refs is 1). */ - __u64 expiration; + u64 expiration; }; }; }; @@ -155,12 +155,12 @@ void homa_pool_destroy(struct homa_pool *pool); void __user *homa_pool_get_buffer(struct homa_rpc *rpc, int offset, int *available); int homa_pool_get_pages(struct homa_pool *pool, int num_pages, - __u32 *pages, int leave_locked); + u32 *pages, int leave_locked); void homa_pool_get_rcvbuf(struct homa_sock *hsk, struct homa_rcvbuf_args *args); int homa_pool_init(struct homa_sock *hsk, void *buf_region, - __u64 region_size); + u64 region_size); int homa_pool_release_buffers(struct homa_pool *pool, - int num_buffers, __u32 *buffers); + int num_buffers, u32 *buffers); #endif /* _HOMA_POOL_H */ diff --git a/homa_rpc.c b/homa_rpc.c index 8c491af6..52c26900 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -111,7 +111,7 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, struct homa_data_hdr *h, int *created) __acquires(&srpc->bucket->lock) { - __u64 id = homa_local_id(h->common.sender_id); + u64 id = homa_local_id(h->common.sender_id); struct homa_rpc_bucket *bucket; struct homa_rpc *srpc = NULL; int err; @@ -213,7 +213,7 @@ void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, struct homa_ack *ack) { __u16 server_port = ntohs(ack->server_port); - __u64 id = homa_local_id(ack->client_id); + u64 id = homa_local_id(ack->client_id); struct homa_sock *hsk2 = hsk; struct homa_rpc *rpc; @@ -495,7 +495,7 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) * The RPC will be locked; the caller must eventually unlock it * by invoking homa_rpc_unlock. 
*/ -struct homa_rpc *homa_find_client_rpc(struct homa_sock *hsk, __u64 id) +struct homa_rpc *homa_find_client_rpc(struct homa_sock *hsk, u64 id) __acquires(&crpc->bucket->lock) { struct homa_rpc_bucket *bucket = homa_client_rpc_bucket(hsk, id); @@ -522,7 +522,7 @@ struct homa_rpc *homa_find_client_rpc(struct homa_sock *hsk, __u64 id) * unlock it by invoking homa_rpc_unlock. */ struct homa_rpc *homa_find_server_rpc(struct homa_sock *hsk, - const struct in6_addr *saddr, __u64 id) + const struct in6_addr *saddr, u64 id) __acquires(&srpc->bucket->lock) { struct homa_rpc_bucket *bucket = homa_server_rpc_bucket(hsk, id); diff --git a/homa_rpc.h b/homa_rpc.h index 390376cf..48f0fd0a 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -90,7 +90,7 @@ struct homa_message_out { * @init_ns: Time in sched_clock units when this structure was * initialized. Used to find the oldest outgoing message. */ - __u64 init_ns; + u64 init_ns; }; /** @@ -108,7 +108,7 @@ struct homa_gap { * @time: time (in sched_clock units) when the gap was first detected. * As of 7/2024 this isn't used for anything. */ - __u64 time; + u64 time; /** @links: for linking into list in homa_message_in. */ struct list_head links; @@ -180,20 +180,20 @@ struct homa_message_in { * @birth: sched_clock() time when this RPC was added to the grantable * list. Invalid if RPC isn't in the grantable list. */ - __u64 birth; + u64 birth; /** * @num_bpages: The number of entries in @bpage_offsets used for this * message (0 means buffers not allocated yet). */ - __u32 num_bpages; + u32 num_bpages; /** * @bpage_offsets: Describes buffer space allocated for this message. * Each entry is an offset from the start of the buffer region. * All but the last pointer refer to areas of size HOMA_BPAGE_SIZE. */ - __u32 bpage_offsets[HOMA_MAX_BPAGES]; + u32 bpage_offsets[HOMA_MAX_BPAGES]; }; /** @@ -299,14 +299,14 @@ struct homa_rpc { * from its port. The low-order bit indicates whether we are * server (1) or client (0) for this RPC. */ - __u64 id; + u64 id; /** * @completion_cookie: Only used on clients. Contains identifying * information about the RPC provided by the application; returned to * the application with the RPC's result. */ - __u64 completion_cookie; + u64 completion_cookie; /** * @error: Only used on clients. If nonzero, then the RPC has @@ -390,7 +390,7 @@ struct homa_rpc { * @resend_timer_ticks: Value of homa->timer_ticks the last time * we sent a RESEND for this RPC. */ - __u32 resend_timer_ticks; + u32 resend_timer_ticks; /** * @done_timer_ticks: The value of homa->timer_ticks the first @@ -398,7 +398,7 @@ struct homa_rpc { * packets have been transmitted), so we're ready for an ack. * Zero means we haven't reached that point yet. */ - __u32 done_timer_ticks; + u32 done_timer_ticks; /** * @magic: when the RPC is alive, this holds a distinct value that @@ -418,10 +418,10 @@ struct homa_rpc { void homa_check_rpc(struct homa_rpc *rpc); struct homa_rpc - *homa_find_client_rpc(struct homa_sock *hsk, __u64 id); + *homa_find_client_rpc(struct homa_sock *hsk, u64 id); struct homa_rpc *homa_find_server_rpc(struct homa_sock *hsk, - const struct in6_addr *saddr, __u64 id); + const struct in6_addr *saddr, u64 id); void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, struct homa_ack *ack); void homa_rpc_free(struct homa_rpc *rpc); @@ -522,7 +522,7 @@ static inline void homa_unprotect_rpcs(struct homa_sock *hsk) * @id: Id of the RPC in question. 
* Return: true if we are the client for RPC id, false otherwise */ -static inline bool homa_is_client(__u64 id) +static inline bool homa_is_client(u64 id) { return (id & 1) == 0; } diff --git a/homa_skb.c b/homa_skb.c index 5f20405e..706e84ee 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -123,7 +123,7 @@ void homa_skb_cleanup(struct homa *homa) */ struct sk_buff *homa_skb_new_tx(int length) { - __u64 start = sched_clock(); + u64 start = sched_clock(); struct sk_buff *skb; /* Note: allocate space for an IPv6 header, which is larger than @@ -244,7 +244,7 @@ void *homa_skb_extend_frags(struct homa *homa, struct sk_buff *skb, int *length) bool homa_skb_page_alloc(struct homa *homa, struct homa_skb_core *skb_core) { struct homa_page_pool *pool; - __u64 start; + u64 start; if (skb_core->skb_page) { if (page_ref_count(skb_core->skb_page) == 1) { @@ -455,7 +455,7 @@ void homa_skb_free_many_tx(struct homa *homa, struct sk_buff **skbs, int count) #define MAX_PAGES_AT_ONCE 50 #endif struct page *pages_to_cache[MAX_PAGES_AT_ONCE]; - __u64 start = sched_clock(); + u64 start = sched_clock(); int num_pages = 0; int i, j; @@ -587,7 +587,7 @@ void homa_skb_release_pages(struct homa *homa) { int i, max_low_mark, min_pages, release, release_max; struct homa_page_pool *max_pool; - __u64 now = sched_clock(); + u64 now = sched_clock(); if (now < homa->skb_page_free_time) return; diff --git a/homa_sock.c b/homa_sock.c index a158d7e0..1a8102e6 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -387,7 +387,7 @@ struct homa_sock *homa_sock_find(struct homa_socktab *socktab, __u16 port) void homa_sock_lock_slow(struct homa_sock *hsk) __acquires(&hsk->lock) { - __u64 start = sched_clock(); + u64 start = sched_clock(); tt_record("beginning wait for socket lock"); spin_lock_bh(&hsk->lock); @@ -405,10 +405,10 @@ void homa_sock_lock_slow(struct homa_sock *hsk) * @id: ID of the particular RPC being locked (multiple RPCs may * share a single bucket lock). */ -void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, __u64 id) +void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, u64 id) __acquires(&bucket->lock) { - __u64 start = sched_clock(); + u64 start = sched_clock(); tt_record2("beginning wait for rpc lock, id %d (bucket %d)", id, bucket->id); diff --git a/homa_sock.h b/homa_sock.h index 4d29af2e..da5ad367 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -281,7 +281,7 @@ struct homa_v6_sock { }; void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, - __u64 id); + u64 id); int homa_sock_bind(struct homa_socktab *socktab, struct homa_sock *hsk, __u16 port); void homa_sock_destroy(struct homa_sock *hsk); @@ -347,7 +347,7 @@ static inline int homa_port_hash(__u16 port) * Return: The bucket in which this RPC will appear, if the RPC exists. */ static inline struct homa_rpc_bucket *homa_client_rpc_bucket(struct homa_sock *hsk, - __u64 id) + u64 id) { /* We can use a really simple hash function here because RPC ids * are allocated sequentially. @@ -365,7 +365,7 @@ static inline struct homa_rpc_bucket *homa_client_rpc_bucket(struct homa_sock *h * Return: The bucket in which this RPC will appear, if the RPC exists. */ static inline struct homa_rpc_bucket *homa_server_rpc_bucket(struct homa_sock *hsk, - __u64 id) + u64 id) { /* Each client allocates RPC ids sequentially, so they will * naturally distribute themselves across the hash space. @@ -384,7 +384,7 @@ static inline struct homa_rpc_bucket *homa_server_rpc_bucket(struct homa_sock *h * but used occasionally for diagnostics and debugging. 
*/ static inline void homa_bucket_lock(struct homa_rpc_bucket *bucket, - __u64 id, const char *locker) + u64 id, const char *locker) { if (!spin_trylock_bh(&bucket->lock)) homa_bucket_lock_slow(bucket, id); @@ -395,7 +395,7 @@ static inline void homa_bucket_lock(struct homa_rpc_bucket *bucket, * @bucket: Bucket to unlock. * @id: ID of the RPC that was using the lock. */ -static inline void homa_bucket_unlock(struct homa_rpc_bucket *bucket, __u64 id) +static inline void homa_bucket_unlock(struct homa_rpc_bucket *bucket, u64 id) __releases(&bucket->lock) { spin_unlock_bh(&bucket->lock); diff --git a/homa_timer.c b/homa_timer.c index c6eded5d..0e7d20f1 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -156,7 +156,7 @@ void homa_check_rpc(struct homa_rpc *rpc) void homa_timer(struct homa *homa) { struct homa_socktab_scan scan; - static __u64 prev_grant_count; + static u64 prev_grant_count; int total_incoming_rpcs = 0; int sum_incoming_rec = 0; struct homa_sock *hsk; @@ -164,7 +164,7 @@ void homa_timer(struct homa *homa) struct homa_rpc *rpc; int sum_incoming = 0; cycles_t start, end; - __u64 total_grants; + u64 total_grants; int total_rpcs = 0; int rpc_count = 0; int core; @@ -210,7 +210,7 @@ void homa_timer(struct homa *homa) * isn't keeping up with RPC reaping, so we'll help * out. See reap.txt for more info. */ - __u64 start = sched_clock(); + u64 start = sched_clock(); tt_record("homa_timer calling homa_rpc_reap"); if (homa_rpc_reap(hsk, false) == 0) diff --git a/homa_utils.c b/homa_utils.c index 34c8a8ad..e057b21c 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -199,7 +199,7 @@ char *homa_print_ipv4_addr(__be32 addr) #define NUM_BUFS_IPV4 4 #define BUF_SIZE_IPV4 30 static char buffers[NUM_BUFS_IPV4][BUF_SIZE_IPV4]; - __u32 a2 = ntohl(addr); + u32 a2 = ntohl(addr); static int next_buf; char *buffer; @@ -242,7 +242,7 @@ char *homa_print_ipv6_addr(const struct in6_addr *addr) } else if ((addr->s6_addr32[0] == 0) && (addr->s6_addr32[1] == 0) && (addr->s6_addr32[2] == htonl(0x0000ffff))) { - __u32 a2 = ntohl(addr->s6_addr32[3]); + u32 a2 = ntohl(addr->s6_addr32[3]); snprintf(buffer, BUF_SIZE, "%u.%u.%u.%u", (a2 >> 24) & 0xff, (a2 >> 16) & 0xff, (a2 >> 8) & 0xff, a2 & 0xff); @@ -665,7 +665,7 @@ void homa_prios_changed(struct homa *homa) */ void homa_spin(int ns) { - __u64 end; + u64 end; end = sched_clock() + ns; while (sched_clock() < end) @@ -683,7 +683,7 @@ void homa_spin(int ns) void homa_throttle_lock_slow(struct homa *homa) __acquires(&homa->throttle_lock) { - __u64 start = sched_clock(); + u64 start = sched_clock(); tt_record("beginning wait for throttle lock"); spin_lock_bh(&homa->throttle_lock); diff --git a/homa_wire.h b/homa_wire.h index 68836deb..46b8bb97 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -478,7 +478,7 @@ _Static_assert(sizeof(struct homa_ack_hdr) <= HOMA_MAX_HEADER, * @sender_id: RPC id from an incoming packet, such as h->common.sender_id * Return: see above */ -static inline __u64 homa_local_id(__be64 sender_id) +static inline u64 homa_local_id(__be64 sender_id) { /* If the client bit was set on the sender side, it needs to be * removed here, and conversely. diff --git a/reap.txt b/reap.txt index 1ab327cf..24131686 100644 --- a/reap.txt +++ b/reap.txt @@ -5,12 +5,10 @@ This file discusses issues related to freeing resources for completed RPCs quite expensive for RPCs with long messages. * The natural time to reap is when homa_rpc_free is invoked to mark an - RPC completed, but this can result in severe performance hiccups.
- For example, a server RPC is freed once the last packet of the response - has been transmitted, but this can happen in homa_softirq in response - to an incoming acknowledgment, and there may be other short messages - waiting to be processed. Freeing a long RPC could result in significant - delay for a subsequent short RPC. + RPC completed, but this can result in severe performance hiccups. However, + this can happen in homa_softirq at a time when there are short messages + waiting to be processed. Freeing a long RPC could result in significant + delay for a subsequent short RPC. * Thus Homa doesn't reap immediately in homa_rpc_free. Instead, dead RPCs are queued up and reaping occurs later, at a more convenient time where @@ -30,17 +28,17 @@ This file discusses issues related to freeing resources for completed RPCs reap to get down to that limit. However, it seems possible that there may be cases where a single thread cannot keep up with all the reaping to be done. - * If homa_timer can't keep up, then as a last resort, homa_pkt_dispatch + * If homa_timer can't keep up, then as a last resort, homa_dispatch_pkts will reap a few buffers for every incoming data packet. This is undesirable because it will impact Homa's performance. -* In addition, during the conversion to the new input buffering scheme, - freeing of packets for incoming messages was moved to homa_copy_to_user, - under the assumption that this code wouldn't be on the critical path. - However, right now the packet freeing is taking 20-25% of the total - time in that function, and with faster networks it's quite possible that - this code will indeed be on the critical path. So, it probably shouldn't - be doing packet freeing after all. +* During the conversion to the new input buffering scheme, freeing of packets + for incoming messages was moved to homa_copy_to_user, under the assumption + that this code wouldn't be on the critical path. However, right now the + packet freeing is taking 20-25% of the total time in that function, and + with faster networks it's quite possible that this code will indeed be on + the critical path. So, it may eventually be necessary to remove + packet freeing from homa_copy_to_user. * Here are some approaches that have been tried and eventually abandoned: * Occasionally when data packets arrive, reap if too much dead info has diff --git a/test/mock.c b/test/mock.c index 7119c8ba..51609326 100644 --- a/test/mock.c +++ b/test/mock.c @@ -138,10 +138,10 @@ static int mock_active_rcu_locks; cycles_t mock_cycles; /* Used as the return value for calls to sched_clock. */ -__u64 mock_ns; +u64 mock_ns; /* Add this value to mock_ns every time sched_clock is invoked. */ -__u64 mock_ns_tick; +u64 mock_ns_tick; /* Indicates whether we should be simulating IPv6 or IPv4 in the * current test. Can be overridden by a test.
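For background on the mock clock above: mock_ns is the value that the mocked sched_clock() returns, and mock_ns_tick is added on every call, so a unit test can dictate exactly how time advances. A minimal sketch of how a test might use these variables (hypothetical test fragment, not code from this patch; sched_clock() here is the mock shown in the next hunk, which adds mock_ns_tick to mock_ns before returning it):

	/* Pretend the current time is 5 us and make each sched_clock()
	 * call advance time by 100 ns, so that time-based logic such as
	 * bpage lease expiration can be exercised deterministically.
	 */
	mock_ns = 5000;
	mock_ns_tick = 100;
	EXPECT_EQ(5100, sched_clock());
	EXPECT_EQ(5200, sched_clock());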
@@ -270,7 +270,7 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *iter) } while (bytes_left > 0) { struct iovec *iov = (struct iovec *) iter_iov(iter); - __u64 int_base = (__u64) iov->iov_base; + u64 int_base = (u64) iov->iov_base; size_t chunk_bytes = iov->iov_len; if (chunk_bytes > bytes_left) @@ -326,7 +326,7 @@ unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n) { - __u64 int_from = (__u64) from; + u64 int_from = (u64) from; if (mock_check_error(&mock_copy_data_errors)) return 1; @@ -407,7 +407,7 @@ void __icmp_send(struct sk_buff *skb, int type, int code, __be32 info, unit_log_printf("; ", "icmp_send type %d, code %d", type, code); } -void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, +void icmp6_send(struct sk_buff *skb, u8 type, u8 code, u32 info, const struct in6_addr *force_saddr, const struct inet6_skb_parm *parm) { @@ -602,7 +602,7 @@ unsigned int ip6_mtu(const struct dst_entry *dst) } int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, - __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority) + u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority) { char buffer[200]; const char *prefix = " "; @@ -1047,7 +1047,7 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) {} -__u64 sched_clock(void) +u64 sched_clock(void) { mock_ns += mock_ns_tick; return mock_ns; @@ -1098,7 +1098,7 @@ int skb_copy_datagram_iter(const struct sk_buff *from, int offset, } while (bytes_left > 0) { struct iovec *iov = (struct iovec *) iter_iov(iter); - __u64 int_base = (__u64) iov->iov_base; + u64 int_base = (u64) iov->iov_base; size_t chunk_bytes = iov->iov_len; if (chunk_bytes > bytes_left) @@ -1494,8 +1494,8 @@ void mock_set_ipv6(struct homa_sock *hsk) * skb, initialized to zero. * @extra_bytes: How much additional data to add to the buffer after * the header. - * @first_value: Determines the data contents: the first __u32 will have - * this value, and each successive __u32 will increment by 4. + * @first_value: Determines the data contents: the first u32 will have + * this value, and each successive u32 will increment by 4. * * Return: A packet buffer containing the information described above. * The caller owns this buffer and is responsible for freeing it. 
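The data-fill convention described above makes corruption easy to spot: every 32-bit word encodes its own byte offset plus @first_value. A sketch of that pattern (hypothetical helper written for illustration, not the actual mock.c implementation):

	static void fill_pattern(u32 *data, int length, int first_value)
	{
		int i;

		/* Word i holds first_value plus its byte offset, so a
		 * missing or misplaced chunk shows up as a mismatch.
		 */
		for (i = 0; i < length / 4; i++)
			data[i] = first_value + 4 * i;
	}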
diff --git a/test/mock.h b/test/mock.h index 6b388d8b..c8703ad6 100644 --- a/test/mock.h +++ b/test/mock.h @@ -99,8 +99,8 @@ extern __u16 mock_min_default_port; extern int mock_mtu; extern struct net_device mock_net_device; -extern __u64 mock_ns; -extern __u64 mock_ns_tick; +extern u64 mock_ns; +extern u64 mock_ns_tick; extern int mock_numa_mask; extern int mock_page_nid_mask; extern char mock_printk_output[]; diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index c99fda4b..148cd6e4 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -40,8 +40,8 @@ FIXTURE(homa_grant) { int client_port; struct in6_addr server_ip[5]; int server_port; - __u64 client_id; - __u64 server_id; + u64 client_id; + u64 server_id; union sockaddr_in_union server_addr; struct homa homa; struct homa_sock hsk; @@ -98,7 +98,7 @@ FIXTURE_TEARDOWN(homa_grant) } static struct homa_rpc *test_rpc(FIXTURE_DATA(homa_grant) *self, - __u64 id, struct in6_addr *server_ip, int size) + u64 id, struct in6_addr *server_ip, int size) { struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, server_ip, self->server_port, diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 42aa7daf..1da25210 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -124,8 +124,8 @@ FIXTURE(homa_incoming) { int client_port; struct in6_addr server_ip[2]; int server_port; - __u64 client_id; - __u64 server_id; + u64 client_id; + u64 server_id; union sockaddr_in_union server_addr; struct homa homa; struct homa_sock hsk; diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index 9a655451..9287b82b 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -255,8 +255,8 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS) struct in6_addr server_ip = unit_get_in_addr("1.2.3.4"); struct sk_buff *skb, *skb2, *skb3, *skb4, *result; int client_port = 40000; - __u64 client_id = 1234; - __u64 server_id = 1235; + u64 client_id = 1234; + u64 server_id = 1235; struct homa_rpc *srpc; int server_port = 99; struct homa_data_hdr h; @@ -322,8 +322,8 @@ TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization) struct sk_buff *skb, *skb2, *skb3, *result; struct homa_grant_hdr h; int client_port = 40000; - __u64 client_id = 1234; - __u64 server_id = 1235; + u64 client_id = 1234; + u64 server_id = 1235; struct homa_rpc *srpc; srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 2421e5fa..65b95083 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -38,8 +38,8 @@ FIXTURE(homa_outgoing) { int client_port; struct in6_addr server_ip[1]; int server_port; - __u64 client_id; - __u64 server_id; + u64 client_id; + u64 server_id; struct homa homa; struct homa_sock hsk; union sockaddr_in_union server_addr; diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index 44984c55..358b17c6 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -310,7 +310,7 @@ TEST_F(homa_peer, homa_peer_get_dst_ipv6) { struct dst_entry *dst; char buffer[30]; - __u32 addr; + u32 addr; // Make sure the test uses IPv6. 
mock_ipv6 = true; diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index c379e175..61ea9005 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -26,8 +26,8 @@ FIXTURE(homa_plumbing) { int client_port; struct in6_addr server_ip[1]; int server_port; - __u64 client_id; - __u64 server_id; + u64 client_id; + u64 server_id; struct homa homa; struct homa_sock hsk; union sockaddr_in_union client_addr; @@ -646,7 +646,7 @@ TEST_F(homa_plumbing, homa_recvmsg__MSG_DONT_WAIT) TEST_F(homa_plumbing, homa_recvmsg__normal_completion_ipv4) { struct homa_rpc *crpc; - __u32 pages[2]; + u32 pages[2]; // Make sure the test uses IPv4. mock_ipv6 = false; diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index 4fbbcc18..5d9e19ec 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -128,7 +128,7 @@ TEST_F(homa_pool, homa_pool_get_rcvbuf) TEST_F(homa_pool, homa_pool_get_pages__basics) { struct homa_pool *pool = self->hsk.buffer_pool; - __u32 pages[10]; + u32 pages[10]; EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); EXPECT_EQ(0, pages[0]); @@ -141,7 +141,7 @@ TEST_F(homa_pool, homa_pool_get_pages__basics) TEST_F(homa_pool, homa_pool_get_pages__not_enough_space) { struct homa_pool *pool = self->hsk.buffer_pool; - __u32 pages[10]; + u32 pages[10]; atomic_set(&pool->free_bpages, 1); EXPECT_EQ(-1, homa_pool_get_pages(pool, 2, pages, 0)); @@ -151,7 +151,7 @@ TEST_F(homa_pool, homa_pool_get_pages__not_enough_space) TEST_F(homa_pool, homa_pool_get_pages__set_limit) { struct homa_pool *pool = self->hsk.buffer_pool; - __u32 pages[10]; + u32 pages[10]; atomic_set(&pool->free_bpages, 62); pool->cores[raw_smp_processor_id()].next_candidate = 49; @@ -162,7 +162,7 @@ TEST_F(homa_pool, homa_pool_get_pages__set_limit) TEST_F(homa_pool, homa_pool_get_pages__set_limit_with_MIN_EXTRA) { struct homa_pool *pool = self->hsk.buffer_pool; - __u32 pages[10]; + u32 pages[10]; atomic_set(&pool->free_bpages, 92); pool->cores[raw_smp_processor_id()].next_candidate = 13; @@ -173,7 +173,7 @@ TEST_F(homa_pool, homa_pool_get_pages__set_limit_with_MIN_EXTRA) TEST_F(homa_pool, homa_pool_get_pages__skip_unusable_bpages) { struct homa_pool *pool = self->hsk.buffer_pool; - __u32 pages[10]; + u32 pages[10]; mock_ns = 1000; atomic_set(&pool->descriptors[0].refs, 2); @@ -191,7 +191,7 @@ TEST_F(homa_pool, homa_pool_get_pages__skip_unusable_bpages) TEST_F(homa_pool, homa_pool_get_pages__cant_lock_pages) { struct homa_pool *pool = self->hsk.buffer_pool; - __u32 pages[10]; + u32 pages[10]; mock_ns = 1000; mock_trylock_errors = 3; @@ -202,7 +202,7 @@ TEST_F(homa_pool, homa_pool_get_pages__cant_lock_pages) TEST_F(homa_pool, homa_pool_get_pages__state_changes_while_locking) { struct homa_pool *pool = self->hsk.buffer_pool; - __u32 pages[10]; + u32 pages[10]; mock_ns = 1000; unit_hook_register(steal_bpages_hook); @@ -213,7 +213,7 @@ TEST_F(homa_pool, homa_pool_get_pages__state_changes_while_locking) TEST_F(homa_pool, homa_pool_get_pages__steal_expired_page) { struct homa_pool *pool = self->hsk.buffer_pool; - __u32 pages[10]; + u32 pages[10]; pool->descriptors[0].owner = 5; mock_ns = 5000; @@ -228,7 +228,7 @@ TEST_F(homa_pool, homa_pool_get_pages__steal_expired_page) TEST_F(homa_pool, homa_pool_get_pages__set_owner) { struct homa_pool *pool = self->hsk.buffer_pool; - __u32 pages[10]; + u32 pages[10]; self->homa.bpage_lease_usecs = 1; mock_ns = 5000; @@ -505,7 +505,7 @@ TEST_F(homa_pool, homa_pool_release_buffers__basics) } TEST_F(homa_pool, homa_pool_release_buffers__bogus_offset) { - __u32 buffer 
= self->hsk.buffer_pool->num_bpages << HOMA_BPAGE_SHIFT; + u32 buffer = self->hsk.buffer_pool->num_bpages << HOMA_BPAGE_SHIFT; EXPECT_EQ(EINVAL, -homa_pool_release_buffers(self->hsk.buffer_pool, 1, &buffer)); diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 2625d836..942bbd78 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -28,8 +28,8 @@ FIXTURE(homa_rpc) { int client_port; struct in6_addr server_ip[1]; int server_port; - __u64 client_id; - __u64 server_id; + u64 client_id; + u64 server_id; struct homa homa; struct homa_sock hsk; union sockaddr_in_union server_addr; diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c index df5170d2..166aa571 100644 --- a/test/unit_homa_sock.c +++ b/test/unit_homa_sock.c @@ -28,7 +28,7 @@ FIXTURE(homa_sock) { int client_port; struct in6_addr server_ip[1]; int server_port; - __u64 client_id; + u64 client_id; }; FIXTURE_SETUP(homa_sock) { diff --git a/test/unit_homa_timer.c b/test/unit_homa_timer.c index 740de2ab..316df670 100644 --- a/test/unit_homa_timer.c +++ b/test/unit_homa_timer.c @@ -14,8 +14,8 @@ FIXTURE(homa_timer) { int client_port; struct in6_addr server_ip[1]; int server_port; - __u64 client_id; - __u64 server_id; + u64 client_id; + u64 server_id; union sockaddr_in_union server_addr; struct homa homa; struct homa_sock hsk; diff --git a/timetrace.c b/timetrace.c index 050bda5a..79eb1d0a 100644 --- a/timetrace.c +++ b/timetrace.c @@ -21,23 +21,23 @@ extern atomic_t *tt_linux_freeze_count; extern atomic_t tt_linux_freeze_no_homa; extern int *tt_linux_homa_temp; extern int tt_linux_homa_temp_default[16]; -extern void (*tt_linux_inc_metrics)(int metric, __u64 count); -extern void (*tt_linux_record)(struct tt_buffer *buffer, __u64 timestamp, - const char *format, __u32 arg0, __u32 arg1, - __u32 arg2, __u32 arg3); -extern void tt_linux_skip_metrics(int metric, __u64 count); +extern void (*tt_linux_inc_metrics)(int metric, u64 count); +extern void (*tt_linux_record)(struct tt_buffer *buffer, u64 timestamp, + const char *format, u32 arg0, u32 arg1, + u32 arg2, u32 arg3); +extern void tt_linux_skip_metrics(int metric, u64 count); extern void (*tt_linux_printk)(void); extern void (*tt_linux_dbg1)(char *msg, ...); extern void (*tt_linux_dbg2)(char *msg, ...); extern void (*tt_linux_dbg3)(char *msg, ...); extern void tt_linux_nop(void); -extern void homa_trace(__u64 u0, __u64 u1, int i0, int i1); +extern void homa_trace(u64 u0, u64 u1, int i0, int i1); -extern void ltt_record_nop(struct tt_buffer *buffer, __u64 timestamp, - const char *format, __u32 arg0, __u32 arg1, - __u32 arg2, __u32 arg3); +extern void ltt_record_nop(struct tt_buffer *buffer, u64 timestamp, + const char *format, u32 arg0, u32 arg1, + u32 arg2, u32 arg3); #endif -void tt_inc_metric(int metric, __u64 count); +void tt_inc_metric(int metric, u64 count); /* Separate buffers for each core: this eliminates the need for * synchronization in tt_record, which improves performance significantly. @@ -243,9 +243,9 @@ void tt_freeze(void) * @arg2: Argument to use when printing a message about this event. * @arg3: Argument to use when printing a message about this event. 
*/ -void tt_record_buf(struct tt_buffer *buffer, __u64 timestamp, - const char *format, __u32 arg0, __u32 arg1, __u32 arg2, - __u32 arg3) +void tt_record_buf(struct tt_buffer *buffer, u64 timestamp, + const char *format, u32 arg0, u32 arg1, u32 arg2, + u32 arg3) { struct tt_event *event; @@ -288,7 +288,7 @@ void tt_record_buf(struct tt_buffer *buffer, __u64 timestamp, void tt_find_oldest(int *pos) { struct tt_buffer *buffer; - __u64 start_time = 0; + u64 start_time = 0; int i; for (i = 0; i < nr_cpu_ids; i++) { @@ -401,7 +401,7 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf, struct tt_event *event; int entry_length, chunk_size, available, i, failed_to_copy; int current_core = -1; - __u64 earliest_time = ~0; + u64 earliest_time = ~0; /* Check all the traces to find the earliest available event. */ for (i = 0; i < nr_cpu_ids; i++) { @@ -595,7 +595,7 @@ void tt_print_file(char *path) /* Each iteration of this loop printk's one event. */ while (true) { - __u64 earliest_time = ~0; + u64 earliest_time = ~0; struct tt_event *event; int current_core = -1; int i; @@ -694,7 +694,7 @@ void tt_printk(void) /* Each iteration of this loop printk's one event. */ while (true) { - __u64 earliest_time = ~0; + u64 earliest_time = ~0; struct tt_event *event; int current_core = -1; char msg[200]; @@ -754,7 +754,7 @@ void tt_get_messages(char *buffer, size_t length) /* Each iteration of this loop prints one event. */ while (true) { - __u64 earliest_time = ~0; + u64 earliest_time = ~0; struct tt_event *event; int current_core = -1; int i, result; @@ -833,7 +833,7 @@ void tt_dbg3(char *msg, ...) * to increment. * @count: Amount by which to increment the metric. */ -void tt_inc_metric(int metric, __u64 count) +void tt_inc_metric(int metric, u64 count) { /* Maps from the metric argument to an offset within homa_metrics. * This level of indirection is needed so that the kernel doesn't @@ -846,7 +846,7 @@ void tt_inc_metric(int metric, __u64 count) offsetof(struct homa_metrics, linux_softirq_ns), offsetof(struct homa_metrics, linux_pkt_alloc_bytes), }; - __u64 *metric_addr = (__u64 *)(((char *)homa_metrics_per_cpu()) + u64 *metric_addr = (u64 *)(((char *)homa_metrics_per_cpu()) + offsets[metric]); *metric_addr += count; } diff --git a/timetrace.h b/timetrace.h index 5d3ad33c..d6af807c 100644 --- a/timetrace.h +++ b/timetrace.h @@ -30,7 +30,7 @@ struct tt_event { /** * Time when this event occurred (in tt_rdtsc units). */ - __u64 timestamp; + u64 timestamp; /** * Format string describing the event. NULL means that this * entry has never been occupied. @@ -41,10 +41,10 @@ struct tt_event { * Up to 4 additional arguments that may be referenced by * @format when printing out this event. */ - __u32 arg0; - __u32 arg1; - __u32 arg2; - __u32 arg3; + u32 arg0; + u32 arg1; + u32 arg2; + u32 arg3; }; /* The number of events in a tt_buffer, as a power of 2. */ @@ -97,9 +97,9 @@ struct tt_proc_file { void tt_destroy(void); void tt_freeze(void); int tt_init(char *proc_file, int *temp); -void tt_record_buf(struct tt_buffer *buffer, __u64 timestamp, - const char *format, __u32 arg0, __u32 arg1, - __u32 arg2, __u32 arg3); +void tt_record_buf(struct tt_buffer *buffer, u64 timestamp, + const char *format, u32 arg0, u32 arg1, + u32 arg2, u32 arg3); /* Private methods and variables: exposed so they can be accessed * by unit tests. @@ -134,12 +134,12 @@ extern void *tt_debug_ptr[100]; * (accessed via the RDTSC instruction).
* Return: see above */ -static inline __u64 tt_rdtsc(void) +static inline u64 tt_rdtsc(void) { - __u32 lo, hi; + u32 lo, hi; __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi)); - return (((__u64)hi << 32) | lo); + return (((u64)hi << 32) | lo); } /* @@ -159,8 +159,8 @@ static inline __u64 tt_rdtsc(void) * @arg2 Argument to use when printing a message about this event. * @arg3 Argument to use when printing a message about this event. */ -static inline void tt_record4(const char *format, __u32 arg0, __u32 arg1, - __u32 arg2, __u32 arg3) +static inline void tt_record4(const char *format, u32 arg0, u32 arg1, + u32 arg2, u32 arg3) { #if ENABLE_TIME_TRACE tt_record_buf(tt_buffers[raw_smp_processor_id()], get_cycles(), format, @@ -168,8 +168,8 @@ static inline void tt_record4(const char *format, __u32 arg0, __u32 arg1, #endif } -static inline void tt_record3(const char *format, __u32 arg0, __u32 arg1, - __u32 arg2) +static inline void tt_record3(const char *format, u32 arg0, u32 arg1, + u32 arg2) { #if ENABLE_TIME_TRACE tt_record_buf(tt_buffers[raw_smp_processor_id()], get_cycles(), format, @@ -177,7 +177,7 @@ static inline void tt_record3(const char *format, __u32 arg0, __u32 arg1, #endif } -static inline void tt_record2(const char *format, __u32 arg0, __u32 arg1) +static inline void tt_record2(const char *format, u32 arg0, u32 arg1) { #if ENABLE_TIME_TRACE tt_record_buf(tt_buffers[raw_smp_processor_id()], get_cycles(), format, @@ -185,7 +185,7 @@ static inline void tt_record2(const char *format, __u32 arg0, __u32 arg1) #endif } -static inline void tt_record1(const char *format, __u32 arg0) +static inline void tt_record1(const char *format, u32 arg0) { #if ENABLE_TIME_TRACE tt_record_buf(tt_buffers[raw_smp_processor_id()], get_cycles(), format, @@ -201,7 +201,7 @@ static inline void tt_record(const char *format) #endif } -static inline __u32 tt_hi(void *p) +static inline u32 tt_hi(void *p) { #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wshift-count-overflow" @@ -209,7 +209,7 @@ static inline __u32 tt_hi(void *p) #pragma GCC diagnostic pop } -static inline __u32 tt_lo(void *p) +static inline u32 tt_lo(void *p) { return ((uintptr_t)p) & 0xffffffff; } diff --git a/util/cp_node.cc b/util/cp_node.cc index 1f6e9192..f90874fd 100644 --- a/util/cp_node.cc +++ b/util/cp_node.cc @@ -3082,8 +3082,8 @@ int dump_times_cmd(std::vector &words) for (client *client: clients) { if (!exp.empty() && (client->experiment != exp)) continue; - __u32 start = client->total_responses % NUM_CLIENT_STATS; - __u32 i = start; + uint32_t start = client->total_responses % NUM_CLIENT_STATS; + uint32_t i = start; while (1) { if (client->actual_rtts[i] != 0) { fprintf(f, "%8d %12.2f\n", From 0506790245e7620c16ef64a9ca9bcf88600c6619 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 24 Jan 2025 13:18:25 -0800 Subject: [PATCH 156/625] Use '____cacheline_aligned_in_smp', not '__aligned(L1_CACHE_BYTES)' --- homa_impl.h | 14 +++++++------- homa_pool.h | 45 ++++++++++++++++++--------------------------- 2 files changed, 25 insertions(+), 34 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 5745f8a7..cc6aa86a 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -242,13 +242,13 @@ struct homa { * it could be a severe underestimate if there is competing traffic * from, say, TCP. Access only with atomic ops. 
*/ - atomic64_t link_idle_time __aligned(L1_CACHE_BYTES); + atomic64_t link_idle_time ____cacheline_aligned_in_smp; /** * @grantable_lock: Used to synchronize access to grant-related * fields below, from @grantable_peers to @last_grantable_change. */ - spinlock_t grantable_lock __aligned(L1_CACHE_BYTES); + spinlock_t grantable_lock ____cacheline_aligned_in_smp; /** * @grantable_lock_time: sched_clock() time when grantable_lock @@ -347,7 +347,7 @@ struct homa { * @pacer_mutex: Ensures that only one instance of homa_pacer_xmit * runs at a time. Only used in "try" mode: never block on this. */ - spinlock_t pacer_mutex __aligned(L1_CACHE_BYTES); + spinlock_t pacer_mutex ____cacheline_aligned_in_smp; /** * @pacer_fifo_fraction: The fraction of time (in thousandths) when @@ -409,19 +409,19 @@ struct homa { * a peer sends more bytes than granted (see synchronization note in * homa_send_grants for why we have to allow this possibility). */ - atomic_t total_incoming __aligned(L1_CACHE_BYTES); + atomic_t total_incoming ____cacheline_aligned_in_smp; /** * @prev_default_port: The most recent port number assigned from * the range of default ports. */ - __u16 prev_default_port __aligned(L1_CACHE_BYTES); + __u16 prev_default_port ____cacheline_aligned_in_smp; /** * @port_map: Information about all open sockets. Dynamically * allocated; must be kfreed. */ - struct homa_socktab *port_map __aligned(L1_CACHE_BYTES); + struct homa_socktab *port_map ____cacheline_aligned_in_smp; /** * @peers: Info about all the other hosts we have communicated with. @@ -433,7 +433,7 @@ struct homa { * @page_pool_mutex: Synchronizes access to any/all of the page_pools * used for outgoing sk_buff data. */ - spinlock_t page_pool_mutex __aligned(L1_CACHE_BYTES); + spinlock_t page_pool_mutex ____cacheline_aligned_in_smp; /** * @page_pools: One page pool for each NUMA node on the machine. diff --git a/homa_pool.h b/homa_pool.h index 1aa7a66a..cb2fe6e6 100644 --- a/homa_pool.h +++ b/homa_pool.h @@ -59,35 +59,26 @@ _Static_assert(sizeof(struct homa_bpage) == L1_CACHE_BYTES, * out of which that core is allocating small chunks). */ struct homa_pool_core { - union { - /** - * @cache_line: Ensures that each object is exactly one - * cache line long. - */ - char cache_line[L1_CACHE_BYTES]; - struct { - /** - * @page_hint: Index of bpage in pool->descriptors, - * which may be owned by this core. If so, we'll use it - * for allocating partial pages. - */ - int page_hint; + /** + * @page_hint: Index of bpage in pool->descriptors, + * which may be owned by this core. If so, we'll use it + * for allocating partial pages. + */ + int page_hint ____cacheline_aligned_in_smp; - /** - * @allocated: if the page given by @page_hint is - * owned by this core, this variable gives the number of - * (initial) bytes that have already been allocated - * from the page. - */ - int allocated; + /** + * @allocated: if the page given by @page_hint is + * owned by this core, this variable gives the number of + * (initial) bytes that have already been allocated + * from the page. + */ + int allocated; - /** - * @next_candidate: when searching for free bpages, - * check this index next. - */ - int next_candidate; - }; - }; + /** + * @next_candidate: when searching for free bpages, + * check this index next. 
+ */ + int next_candidate; }; #ifndef __STRIP__ /* See strip.py */ From cf63d80bacf72fe595132a5c67073b589706c2e5 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 24 Jan 2025 14:45:26 -0800 Subject: [PATCH 157/625] Use alloc_percpu_gfp for pool->cores --- homa_pool.c | 19 ++++++++++--------- homa_pool.h | 11 ++++------- test/mock.h | 12 ++++++++++++ 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/homa_pool.c b/homa_pool.c index 3982b340..b01ac436 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -86,17 +86,18 @@ int homa_pool_init(struct homa_sock *hsk, void __user *region, pool->bpages_needed = INT_MAX; /* Allocate and initialize core-specific data. */ - pool->cores = kmalloc_array(nr_cpu_ids, sizeof(struct homa_pool_core), - GFP_ATOMIC); + pool->cores = alloc_percpu_gfp(struct homa_pool_core, GFP_ATOMIC); if (!pool->cores) { result = -ENOMEM; goto error; } pool->num_cores = nr_cpu_ids; for (i = 0; i < pool->num_cores; i++) { - pool->cores[i].page_hint = 0; - pool->cores[i].allocated = 0; - pool->cores[i].next_candidate = 0; + struct homa_pool_core *core = per_cpu_ptr(pool->cores, i); + + core->page_hint = 0; + core->allocated = 0; + core->next_candidate = 0; } pool->check_waiting_invoked = 0; @@ -104,7 +105,7 @@ int homa_pool_init(struct homa_sock *hsk, void __user *region, error: kfree(pool->descriptors); - kfree(pool->cores); + free_percpu(pool->cores); pool->region = NULL; return result; } @@ -119,7 +120,7 @@ void homa_pool_destroy(struct homa_pool *pool) if (!pool->region) return; kfree(pool->descriptors); - kfree(pool->cores); + free_percpu(pool->cores); pool->region = NULL; } @@ -160,7 +161,7 @@ int homa_pool_get_pages(struct homa_pool *pool, int num_pages, u32 *pages, int alloced = 0; int limit = 0; - core = &pool->cores[core_num]; + core = this_cpu_ptr(pool->cores); if (atomic_sub_return(num_pages, &pool->free_bpages) < 0) { atomic_add(num_pages, &pool->free_bpages); return -1; @@ -283,7 +284,7 @@ int homa_pool_allocate(struct homa_rpc *rpc) if (unlikely(partial == 0)) goto success; core_id = raw_smp_processor_id(); - core = &pool->cores[core_id]; + core = this_cpu_ptr(pool->cores); bpage = &pool->descriptors[core->page_hint]; if (!spin_trylock_bh(&bpage->lock)) { tt_record("beginning wait for bpage lock"); diff --git a/homa_pool.h b/homa_pool.h index cb2fe6e6..f3b05320 100644 --- a/homa_pool.h +++ b/homa_pool.h @@ -6,6 +6,8 @@ #ifndef _HOMA_POOL_H #define _HOMA_POOL_H +#include + #include "homa_rpc.h" /** @@ -64,7 +66,7 @@ struct homa_pool_core { * which may be owned by this core. If so, we'll use it * for allocating partial pages. */ - int page_hint ____cacheline_aligned_in_smp; + int page_hint; /** * @allocated: if the page given by @page_hint is @@ -81,11 +83,6 @@ struct homa_pool_core { int next_candidate; }; -#ifndef __STRIP__ /* See strip.py */ -_Static_assert(sizeof(struct homa_pool_core) == L1_CACHE_BYTES, - "homa_pool_core overflowed a cache line"); -#endif /* See strip.py */ - /** * struct homa_pool - Describes a pool of buffer space for incoming * messages for a particular socket; managed by homa_pool.c. The pool is @@ -128,7 +125,7 @@ struct homa_pool { int bpages_needed; /** @cores: core-specific info; dynamically allocated. */ - struct homa_pool_core *cores; + struct homa_pool_core __percpu *cores; /** @num_cores: number of elements in @cores. 
 */
	int num_cores;

diff --git a/test/mock.h b/test/mock.h
index c8703ad6..89c917b8 100644
--- a/test/mock.h
+++ b/test/mock.h
@@ -8,6 +8,9 @@
 #undef alloc_pages
 #define alloc_pages mock_alloc_pages
 
+#undef alloc_percpu_gfp
+#define alloc_percpu_gfp(type, flags) mock_kmalloc(10 * sizeof(type), flags)
+
 #define compound_order mock_compound_order
 
 #ifdef cpu_to_node
@@ -24,6 +27,9 @@
 #undef DEFINE_PER_CPU
 #define DEFINE_PER_CPU(type, name) type name[10]
 
+#undef free_percpu
+#define free_percpu(name) kfree(name)
+
 #define get_page mock_get_page
 
 #undef HOMA_MIN_DEFAULT_PORT
@@ -49,6 +55,9 @@
 #undef per_cpu
 #define per_cpu(name, core) (name[core])
 
+#undef per_cpu_ptr
+#define per_cpu_ptr(name, core) (&name[core])
+
 #define put_page mock_put_page
 
 #define rcu_read_lock mock_rcu_read_lock
@@ -62,6 +71,9 @@
 
 #define spin_unlock mock_spin_unlock
 
+#undef this_cpu_ptr
+#define this_cpu_ptr(name) (&name[pcpu_hot.cpu_number])
+
 #undef vmalloc
 #define vmalloc mock_vmalloc
 
From 068432cfe63cfbbb15ca8120b7f3c4381060a150 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 24 Jan 2025 15:16:46 -0800
Subject: [PATCH 158/625] Extract bool homa_bpage_available from homa_pool_get_pages

---
 homa_pool.c | 25 ++++++++++++++++++-------
 homa_pool.h | 1 +
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/homa_pool.c b/homa_pool.c
index b01ac436..8920bf59 100644
--- a/homa_pool.c
+++ b/homa_pool.c
@@ -139,6 +139,21 @@ void homa_pool_get_rcvbuf(struct homa_sock *hsk,
 	homa_sock_unlock(hsk);
 }
 
+/**
+ * homa_bpage_available() - Check whether a bpage is available for use.
+ * @bpage: Bpage to check
+ * @now: Current time (sched_clock() units)
+ * Return: True if the bpage is free or if it can be stolen, otherwise
+ * false.
+ */
+bool homa_bpage_available(struct homa_bpage *bpage, u64 now)
+{
+	int ref_count = atomic_read(&bpage->refs);
+
+	return ref_count == 0 || (ref_count == 1 && bpage->owner >= 0 &&
+			bpage->expiration <= now);
+}
+
 /**
  * homa_pool_get_pages() - Allocate one or more full pages from the pool.
  * @pool: Pool from which to allocate pages
@@ -172,7 +187,7 @@ int homa_pool_get_pages(struct homa_pool *pool, int num_pages, u32 *pages,
 	 */
 	while (alloced != num_pages) {
 		struct homa_bpage *bpage;
-		int cur, ref_count;
+		int cur;
 
 		/* If we don't need to use all of the bpages in the pool,
 		 * then try to use only the ones with low indexes. This
@@ -214,15 +229,11 @@ int homa_pool_get_pages(struct homa_pool *pool, int num_pages, u32 *pages,
 		 * (must check again in case someone else snuck in and
 		 * grabbed the page).
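 		 * Both the initial unlocked check and the re-check
 		 * under the lock now go through homa_bpage_available,
 		 * so the two tests cannot drift apart.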
*/ - ref_count = atomic_read(&bpage->refs); - if (ref_count >= 2 || (ref_count == 1 && (bpage->owner < 0 || - bpage->expiration > now))) + if (!homa_bpage_available(bpage, now)) continue; if (!spin_trylock_bh(&bpage->lock)) continue; - ref_count = atomic_read(&bpage->refs); - if (ref_count >= 2 || (ref_count == 1 && (bpage->owner < 0 || - bpage->expiration > now))) { + if (!homa_bpage_available(bpage, now)) { spin_unlock_bh(&bpage->lock); continue; } diff --git a/homa_pool.h b/homa_pool.h index f3b05320..b919eeee 100644 --- a/homa_pool.h +++ b/homa_pool.h @@ -137,6 +137,7 @@ struct homa_pool { int check_waiting_invoked; }; +bool homa_bpage_available(struct homa_bpage *bpage, u64 now); int homa_pool_allocate(struct homa_rpc *rpc); void homa_pool_check_waiting(struct homa_pool *pool); void homa_pool_destroy(struct homa_pool *pool); From 932c37a200b960122d986d3c896cbf9903a31d40 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 24 Jan 2025 15:36:42 -0800 Subject: [PATCH 159/625] Check incoming messages against HOMA_MAX_MESSAGE_LENGTH --- homa_incoming.c | 3 +++ test/unit_homa_incoming.c | 11 +++++++++++ 2 files changed, 14 insertions(+) diff --git a/homa_incoming.c b/homa_incoming.c index 1d09b4f1..bd9f87e9 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -23,6 +23,9 @@ int homa_message_in_init(struct homa_rpc *rpc, int length, int unsched) { int err; + if (length > HOMA_MAX_MESSAGE_LENGTH) + return -EINVAL; + rpc->msgin.length = length; skb_queue_head_init(&rpc->msgin.packets); rpc->msgin.recv_end = 0; diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 1da25210..261f5a1b 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -191,6 +191,17 @@ TEST_F(homa_incoming, homa_message_in_init__basics) EXPECT_EQ(128, crpc->msgin.granted); EXPECT_EQ(1, crpc->msgin.num_bpages); } +TEST_F(homa_incoming, homa_message_in_init__message_too_long) +{ + struct homa_rpc *srpc; + int created; + + self->data.message_length = htonl(HOMA_MAX_MESSAGE_LENGTH+1); + srpc = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + &created); + ASSERT_TRUE(IS_ERR(srpc)); + EXPECT_EQ(EINVAL, -PTR_ERR(srpc)); +} TEST_F(homa_incoming, homa_message_in_init__pool_doesnt_exist) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, From 89db69bb36010245a7c48599e7cfb88d7ec55e77 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 24 Jan 2025 15:40:36 -0800 Subject: [PATCH 160/625] Add a bit more info to homa_sock_lock comment --- homa_sock.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/homa_sock.h b/homa_sock.h index da5ad367..a4664027 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -301,8 +301,9 @@ struct homa_sock *homa_socktab_start_scan(struct homa_socktab *socktab, * homa_sock_lock() - Acquire the lock for a socket. If the socket * isn't immediately available, record stats on the waiting time. * @hsk: Socket to lock. - * @locker: Static string identifying where the socket was locked; - * used to track down deadlocks. + * @locker: Static string identifying where the socket was locked. + * Not normally used, but can be helpful when tracking down + * deadlocks. 
*/ static inline void homa_sock_lock(struct homa_sock *hsk, const char *locker) __acquires(&hsk->lock) From efdcd397be6994d2b0c62f83593109f59891fa16 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 24 Jan 2025 16:23:36 -0800 Subject: [PATCH 161/625] Rename homa_rpc_free to homa_rpc_end --- homa_grant.c | 2 +- homa_impl.h | 2 +- homa_incoming.c | 10 ++--- homa_plumbing.c | 10 ++--- homa_rpc.c | 20 +++++---- homa_rpc.h | 2 +- homa_sock.c | 2 +- homa_sock.h | 2 +- reap.txt | 13 +----- test/unit_homa_incoming.c | 18 ++++---- test/unit_homa_outgoing.c | 6 +-- test/unit_homa_plumbing.c | 2 +- test/unit_homa_rpc.c | 90 +++++++++++++++++++-------------------- test/unit_homa_timer.c | 2 +- test/utils.c | 6 +-- util/tthoma.py | 18 ++++---- 16 files changed, 99 insertions(+), 106 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 5d389d20..d9a8ed37 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -620,7 +620,7 @@ void homa_grant_free_rpc(struct homa_rpc *rpc) * calling homa_grant_recalc. This creates a risk that the * RPC could be reaped before the lock is reacquired. * However, this function is only called from a specific - * place in homa_rpc_free where the RPC hasn't yet been put + * place in homa_rpc_end where the RPC hasn't yet been put * on the reap list, so there is no way it can be reaped * until we return. */ diff --git a/homa_impl.h b/homa_impl.h index cc6aa86a..990b1644 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -1120,7 +1120,7 @@ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, void homa_rpc_abort(struct homa_rpc *crpc, int error); void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, struct homa_ack *ack); -void homa_rpc_free(struct homa_rpc *rpc); +void homa_rpc_end(struct homa_rpc *rpc); void homa_rpc_handoff(struct homa_rpc *rpc); int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); int homa_setsockopt(struct sock *sk, int level, int optname, diff --git a/homa_incoming.c b/homa_incoming.c index bd9f87e9..e1b14138 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -773,11 +773,11 @@ void homa_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc) rpc->state); } else { if (rpc->hsk->homa->verbose) - pr_notice("Freeing rpc id %llu from client %s:%d: unknown to client", + pr_notice("Ending rpc id %llu from client %s:%d: unknown to client", rpc->id, homa_print_ipv6_addr(&rpc->peer->addr), rpc->dport); - homa_rpc_free(rpc); + homa_rpc_end(rpc); INC_METRIC(server_rpcs_unknown, 1); } done: @@ -885,7 +885,7 @@ void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, if (rpc) { tt_record1("homa_ack_pkt freeing rpc id %d", rpc->id); - homa_rpc_free(rpc); + homa_rpc_end(rpc); homa_rpc_unlock(rpc); } @@ -992,7 +992,7 @@ void homa_rpc_abort(struct homa_rpc *rpc, int error) INC_METRIC(server_rpc_discards, 1); tt_record3("aborting server RPC: peer 0x%x, id %d, error %d", tt_addr(rpc->peer->addr), rpc->id, error); - homa_rpc_free(rpc); + homa_rpc_end(rpc); return; } tt_record3("aborting client RPC: peer 0x%x, id %d, error %d", @@ -1079,7 +1079,7 @@ void homa_abort_sock_rpcs(struct homa_sock *hsk, int error) if (error) homa_rpc_abort(rpc, error); else - homa_rpc_free(rpc); + homa_rpc_end(rpc); homa_rpc_unlock(rpc); } homa_unprotect_rpcs(hsk); diff --git a/homa_plumbing.c b/homa_plumbing.c index 3102cf5d..3d6ce000 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -775,7 +775,7 @@ int homa_ioc_abort(struct sock *sk, int *karg) if (!rpc) return -EINVAL; if (args.error == 0) - homa_rpc_free(rpc); + homa_rpc_end(rpc); else 
homa_rpc_abort(rpc, -args.error); homa_rpc_unlock(rpc); /* Locked by homa_find_client_rpc. */ @@ -1043,7 +1043,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) error: if (rpc) { - homa_rpc_free(rpc); + homa_rpc_end(rpc); homa_rpc_unlock(rpc); /* Locked by homa_find_server_rpc. */ } tt_record2("homa_sendmsg returning error %d for id %d", @@ -1156,7 +1156,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, } /* This indicates that the application now owns the buffers, so - * we won't free them in homa_rpc_free. + * we won't free them in homa_rpc_end. */ rpc->msgin.num_bpages = 0; @@ -1165,10 +1165,10 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, */ if (homa_is_client(rpc->id)) { homa_peer_add_ack(rpc); - homa_rpc_free(rpc); + homa_rpc_end(rpc); } else { if (result < 0) - homa_rpc_free(rpc); + homa_rpc_end(rpc); else rpc->state = RPC_IN_SERVICE; } diff --git a/homa_rpc.c b/homa_rpc.c index 52c26900..cf0ed189 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -230,7 +230,7 @@ void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, rpc = homa_find_server_rpc(hsk2, saddr, id); if (rpc) { tt_record1("homa_rpc_acked freeing id %d", rpc->id); - homa_rpc_free(rpc); + homa_rpc_end(rpc); homa_rpc_unlock(rpc); /* Locked by homa_find_server_rpc. */ } @@ -240,12 +240,14 @@ void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, } /** - * homa_rpc_free() - Destructor for homa_rpc; will arrange for all resources - * associated with the RPC to be released (eventually). + * homa_rpc_end() - Stop all activity on an RPC and begin the process of + * releasing its resources; this process will continue in the background + * until homa_rpc_reap eventually completes it. * @rpc: Structure to clean up, or NULL. Must be locked. Its socket must - * not be locked. + * not be locked. Once this function returns the caller should not + * use the RPC except to unlock it. */ -void homa_rpc_free(struct homa_rpc *rpc) +void homa_rpc_end(struct homa_rpc *rpc) __acquires(&rpc->hsk->lock) __releases(&rpc->hsk->lock) { @@ -265,8 +267,8 @@ void homa_rpc_free(struct homa_rpc *rpc) */ if (!rpc || rpc->state == RPC_DEAD) return; - UNIT_LOG("; ", "homa_rpc_free invoked"); - tt_record1("homa_rpc_free invoked for id %d", rpc->id); + UNIT_LOG("; ", "homa_rpc_end invoked"); + tt_record1("homa_rpc_end invoked for id %d", rpc->id); rpc->state = RPC_DEAD; /* The following line must occur before the socket is locked or @@ -277,7 +279,7 @@ void homa_rpc_free(struct homa_rpc *rpc) homa_grant_free_rpc(rpc); /* Unlink from all lists, so no-one will ever find this RPC again. */ - homa_sock_lock(rpc->hsk, "homa_rpc_free"); + homa_sock_lock(rpc->hsk, "homa_rpc_end"); __hlist_del(&rpc->hash_links); list_del_rcu(&rpc->active_links); list_add_tail_rcu(&rpc->dead_links, &rpc->hsk->dead_rpcs); @@ -446,7 +448,7 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) UNIT_LOG("; ", "reaped %llu", rpc->id); /* Lock and unlock the RPC before freeing it. This * is needed to deal with races where the code - * that invoked homa_rpc_free hasn't unlocked the + * that invoked homa_rpc_end hasn't unlocked the * RPC yet. 
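 		 * Acquiring and then immediately releasing the lock
 		 * acts as a barrier: once it succeeds, any critical
 		 * section entered before the RPC was queued here must
 		 * have finished.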
*/ homa_rpc_lock(rpc, "homa_rpc_reap"); diff --git a/homa_rpc.h b/homa_rpc.h index 48f0fd0a..5840bc19 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -424,7 +424,7 @@ struct homa_rpc const struct in6_addr *saddr, u64 id); void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, struct homa_ack *ack); -void homa_rpc_free(struct homa_rpc *rpc); +void homa_rpc_end(struct homa_rpc *rpc); void homa_rpc_log(struct homa_rpc *rpc); void homa_rpc_log_active(struct homa *homa, uint64_t id); void homa_rpc_log_active_tt(struct homa *homa, int freeze_count); diff --git a/homa_sock.c b/homa_sock.c index 1a8102e6..42311055 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -261,7 +261,7 @@ void homa_sock_shutdown(struct homa_sock *hsk) list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { homa_rpc_lock(rpc, "homa_sock_shutdown"); - homa_rpc_free(rpc); + homa_rpc_end(rpc); homa_rpc_unlock(rpc); } diff --git a/homa_sock.h b/homa_sock.h index a4664027..b1963c58 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -206,7 +206,7 @@ struct homa_sock { struct list_head active_rpcs; /** - * @dead_rpcs: Contains RPCs for which homa_rpc_free has been + * @dead_rpcs: Contains RPCs for which homa_rpc_end has been * called, but their packet buffers haven't yet been freed. */ struct list_head dead_rpcs; diff --git a/reap.txt b/reap.txt index 24131686..a5956039 100644 --- a/reap.txt +++ b/reap.txt @@ -4,22 +4,13 @@ This file discusses issues related to freeing resources for completed RPCs * Most of the cost of reaping comes from freeing skbuffs; this can be quite expensive for RPCs with long messages. -* The natural time to reap is when homa_rpc_free is invoked to mark an -<<<<<<< HEAD - RPC completed, but this can result in severe performance hiccups. For - example, a server RPC is freed once the last packet of the response - has been transmitted, but this can happen in homa_softirq in response - to an incoming acknowledgment, and there may be other short messages waiting - to be processed. Freeing a long RPC could result in significant delay - for a subsequent short RPC. -======= +* The natural time to reap is when homa_rpc_end is invoked to mark an RPC completed, but this can result in severe performance hiccups. However, this can happen in homa_softirq at a time when there are short messages waiting to be processed. Freeing a long RPC could result in significant delay for a subsequent short RPC. ->>>>>>> bd01983 (Replace __u64 with u64 in non-uapi code) -* Thus Homa doesn't reap immediately in homa_rpc_free. Instead, dead RPCs +* Thus Homa doesn't reap immediately in homa_rpc_end. Instead, dead RPCs are queued up and reaping occurs later, at a more convenient time where it is less likely to impact latency. The challenge is to figure out how to do this so that (a) we keep up with dead RPCs and (b) we minimize diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 261f5a1b..5eff862a 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -68,7 +68,7 @@ void handoff_hook3(char *id) hook3_count++; homa_rpc_handoff(hook_rpc); - homa_rpc_free(hook_rpc); + homa_rpc_end(hook_rpc); } /* The following hook function frees an RPC. 
*/ @@ -77,7 +77,7 @@ void delete_hook(char *id) if (strcmp(id, "schedule") != 0) return; if (delete_count == 0) - homa_rpc_free(hook_rpc); + homa_rpc_end(hook_rpc); delete_count--; } @@ -87,7 +87,7 @@ void lock_delete_hook(char *id) if (strcmp(id, "spin_lock") != 0) return; if (lock_delete_count == 0) - homa_rpc_free(hook_rpc); + homa_rpc_end(hook_rpc); lock_delete_count--; } @@ -97,7 +97,7 @@ void lock_delete_hook(char *id) void match_free_hook(char *id) { if (strcmp(id, "found_rpc") == 0) - homa_rpc_free(hook_rpc); + homa_rpc_end(hook_rpc); } /* The following hook function shuts down a socket. */ @@ -696,7 +696,7 @@ TEST_F(homa_incoming, homa_copy_to_user__rpc_freed) self->server_ip, self->server_port, self->client_id, 1000, 4000); ASSERT_NE(NULL, crpc); - homa_rpc_free(crpc); + homa_rpc_end(crpc); unit_log_clear(); mock_copy_to_user_dont_copy = -1; @@ -1178,7 +1178,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__forced_reap) struct homa_rpc *srpc; mock_ns_tick = 10; - homa_rpc_free(dead); + homa_rpc_end(dead); EXPECT_EQ(31, self->hsk.dead_skbs); srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, self->server_id, @@ -2021,7 +2021,7 @@ TEST_F(homa_incoming, homa_abort_rpcs__ignore_dead_rpcs) self->server_port, self->client_id, 5000, 1600); ASSERT_NE(NULL, crpc); - homa_rpc_free(crpc); + homa_rpc_end(crpc); EXPECT_EQ(RPC_DEAD, crpc->state); unit_log_clear(); homa_abort_rpcs(&self->homa, self->server_ip, 0, -ENOTCONN); @@ -2081,7 +2081,7 @@ TEST_F(homa_incoming, homa_abort_sock_rpcs__rpc_already_dead) self->server_port, self->client_id, 5000, 1600); ASSERT_NE(NULL, crpc); - homa_rpc_free(crpc); + homa_rpc_end(crpc); EXPECT_EQ(RPC_DEAD, crpc->state); unit_log_clear(); homa_abort_sock_rpcs(&self->hsk, -ENOTCONN); @@ -2343,7 +2343,7 @@ TEST_F(homa_incoming, homa_wait_for_message__rpc_arrives_while_sleeping) UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id+2, 20000, 20000); self->homa.reap_limit = 5; - homa_rpc_free(crpc2); + homa_rpc_end(crpc2); EXPECT_EQ(31, self->hsk.dead_skbs); unit_log_clear(); diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 65b95083..2c0ff0bc 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -17,7 +17,7 @@ static void unlock_hook(char *id) if (strcmp(id, "unlock") != 0) return; if (hook_rpc) { - homa_rpc_free(hook_rpc); + homa_rpc_end(hook_rpc); hook_rpc = NULL; } } @@ -28,7 +28,7 @@ void lock_free_hook(char *id) if (strcmp(id, "spin_lock") != 0) return; if (hook_rpc) { - homa_rpc_free(hook_rpc); + homa_rpc_end(hook_rpc); hook_rpc = NULL; } } @@ -785,7 +785,7 @@ TEST_F(homa_outgoing, homa_xmit_data__rpc_freed) unit_hook_register(lock_free_hook); hook_rpc = crpc; homa_xmit_data(crpc, false); - EXPECT_STREQ("xmit DATA 1400@0; homa_rpc_free invoked", + EXPECT_STREQ("xmit DATA 1400@0; homa_rpc_end invoked", unit_log_get()); EXPECT_EQ(1400, crpc->msgout.next_xmit_offset); } diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 61ea9005..f596d7ea 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -16,7 +16,7 @@ static void unlock_hook(char *id) if (strcmp(id, "unlock") != 0) return; if (hook_rpc) { - homa_rpc_free(hook_rpc); + homa_rpc_end(hook_rpc); hook_rpc = 0; } } diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 942bbd78..cf1e3dab 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -95,7 +95,7 @@ TEST_F(homa_rpc, homa_rpc_new_client__normal) &self->server_addr); 
ASSERT_FALSE(IS_ERR(crpc)); - homa_rpc_free(crpc); + homa_rpc_end(crpc); homa_rpc_unlock(crpc); } TEST_F(homa_rpc, homa_rpc_new_client__malloc_error) @@ -142,7 +142,7 @@ TEST_F(homa_rpc, homa_rpc_new_server__normal) EXPECT_EQ(RPC_INCOMING, srpc->state); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); EXPECT_EQ(1, created); - homa_rpc_free(srpc); + homa_rpc_end(srpc); } TEST_F(homa_rpc, homa_rpc_new_server__already_exists) { @@ -218,7 +218,7 @@ TEST_F(homa_rpc, homa_rpc_new_server__allocate_buffers) ASSERT_FALSE(IS_ERR(srpc)); homa_rpc_unlock(srpc); EXPECT_EQ(3, srpc->msgin.num_bpages); - homa_rpc_free(srpc); + homa_rpc_end(srpc); } TEST_F(homa_rpc, homa_rpc_new_server__no_buffer_pool) { @@ -245,7 +245,7 @@ TEST_F(homa_rpc, homa_rpc_new_server__handoff_rpc) EXPECT_EQ(RPC_INCOMING, srpc->state); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); EXPECT_EQ(1, unit_list_length(&self->hsk.ready_requests)); - homa_rpc_free(srpc); + homa_rpc_end(srpc); } TEST_F(homa_rpc, homa_rpc_new_server__dont_handoff_no_buffers) { @@ -259,7 +259,7 @@ TEST_F(homa_rpc, homa_rpc_new_server__dont_handoff_no_buffers) ASSERT_FALSE(IS_ERR(srpc)); homa_rpc_unlock(srpc); EXPECT_EQ(0, unit_list_length(&self->hsk.ready_requests)); - homa_rpc_free(srpc); + homa_rpc_end(srpc); } TEST_F(homa_rpc, homa_rpc_new_server__dont_handoff_rpc) { @@ -275,7 +275,7 @@ TEST_F(homa_rpc, homa_rpc_new_server__dont_handoff_rpc) EXPECT_EQ(RPC_INCOMING, srpc->state); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); EXPECT_EQ(0, unit_list_length(&self->hsk.ready_requests)); - homa_rpc_free(srpc); + homa_rpc_end(srpc); } TEST_F(homa_rpc, homa_bucket_lock_slow) @@ -286,7 +286,7 @@ TEST_F(homa_rpc, homa_bucket_lock_slow) mock_ns_tick = 10; crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); ASSERT_FALSE(IS_ERR(crpc)); - homa_rpc_free(crpc); + homa_rpc_end(crpc); homa_rpc_unlock(crpc); srpc = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, &created); @@ -380,7 +380,7 @@ TEST_F(homa_rpc, homa_rpc_acked__no_such_rpc) homa_sock_destroy(&hsk); } -TEST_F(homa_rpc, homa_rpc_free__basics) +TEST_F(homa_rpc, homa_rpc_end__basics) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, @@ -390,13 +390,13 @@ TEST_F(homa_rpc, homa_rpc_free__basics) ASSERT_NE(NULL, crpc); unit_log_clear(); mock_log_rcu_sched = 1; - homa_rpc_free(crpc); + homa_rpc_end(crpc); EXPECT_EQ(0, self->homa.num_grantable_rpcs); EXPECT_EQ(NULL, homa_find_client_rpc(&self->hsk, crpc->id)); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); EXPECT_EQ(1, unit_list_length(&self->hsk.dead_rpcs)); } -TEST_F(homa_rpc, homa_rpc_free__already_dead) +TEST_F(homa_rpc, homa_rpc_end__already_dead) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, @@ -404,14 +404,14 @@ TEST_F(homa_rpc, homa_rpc_free__already_dead) ASSERT_NE(NULL, crpc); unit_log_clear(); - homa_rpc_free(crpc); - EXPECT_STREQ("homa_rpc_free invoked", + homa_rpc_end(crpc); + EXPECT_STREQ("homa_rpc_end invoked", unit_log_get()); unit_log_clear(); - homa_rpc_free(crpc); + homa_rpc_end(crpc); EXPECT_STREQ("", unit_log_get()); } -TEST_F(homa_rpc, homa_rpc_free__state_ready) +TEST_F(homa_rpc, homa_rpc_end__state_ready) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, @@ -419,10 +419,10 @@ TEST_F(homa_rpc, homa_rpc_free__state_ready) ASSERT_NE(NULL, crpc); EXPECT_EQ(1, unit_list_length(&self->hsk.ready_responses)); - homa_rpc_free(crpc); + 
homa_rpc_end(crpc); EXPECT_EQ(0, unit_list_length(&self->hsk.ready_responses)); } -TEST_F(homa_rpc, homa_rpc_free__wakeup_interest) +TEST_F(homa_rpc, homa_rpc_end__wakeup_interest) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, @@ -434,12 +434,12 @@ TEST_F(homa_rpc, homa_rpc_free__wakeup_interest) interest.reg_rpc = crpc; crpc->interest = &interest; unit_log_clear(); - homa_rpc_free(crpc); + homa_rpc_end(crpc); EXPECT_EQ(NULL, interest.reg_rpc); - EXPECT_STREQ("homa_rpc_free invoked; " + EXPECT_STREQ("homa_rpc_end invoked; " "wake_up_process pid -1", unit_log_get()); } -TEST_F(homa_rpc, homa_rpc_free__free_gaps) +TEST_F(homa_rpc, homa_rpc_end__free_gaps) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, @@ -457,28 +457,28 @@ TEST_F(homa_rpc, homa_rpc_free__free_gaps) EXPECT_STREQ("start 0, end 1400; start 2800, end 4200", unit_print_gaps(crpc)); - homa_rpc_free(crpc); + homa_rpc_end(crpc); /* (Test infrastructure will complain if gaps aren't freed) */ } -TEST_F(homa_rpc, homa_rpc_free__dead_buffs) +TEST_F(homa_rpc, homa_rpc_end__dead_buffs) { struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 10000, 1000); ASSERT_NE(NULL, crpc1); - homa_rpc_free(crpc1); + homa_rpc_end(crpc1); EXPECT_EQ(9, self->homa.max_dead_buffs); EXPECT_EQ(9, self->hsk.dead_skbs); struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id+2, 5000, 1000); ASSERT_NE(NULL, crpc2); - homa_rpc_free(crpc2); + homa_rpc_end(crpc2); EXPECT_EQ(14, self->homa.max_dead_buffs); EXPECT_EQ(14, self->hsk.dead_skbs); } -TEST_F(homa_rpc, homa_rpc_free__remove_from_throttled_list) +TEST_F(homa_rpc, homa_rpc_end__remove_from_throttled_list) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, @@ -487,7 +487,7 @@ TEST_F(homa_rpc, homa_rpc_free__remove_from_throttled_list) homa_add_to_throttled(crpc); EXPECT_EQ(1, unit_list_length(&self->homa.throttled_rpcs)); unit_log_clear(); - homa_rpc_free(crpc); + homa_rpc_end(crpc); EXPECT_EQ(0, unit_list_length(&self->homa.throttled_rpcs)); } @@ -506,9 +506,9 @@ TEST_F(homa_rpc, homa_rpc_reap__basics) ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); ASSERT_NE(NULL, crpc3); - homa_rpc_free(crpc1); - homa_rpc_free(crpc2); - homa_rpc_free(crpc3); + homa_rpc_end(crpc1); + homa_rpc_end(crpc2); + homa_rpc_end(crpc3); unit_log_clear(); EXPECT_STREQ("1234 1236 1238", dead_rpcs(&self->hsk)); EXPECT_EQ(11, self->hsk.dead_skbs); @@ -531,8 +531,8 @@ TEST_F(homa_rpc, homa_rpc_reap__reap_all) ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); - homa_rpc_free(crpc1); - homa_rpc_free(crpc2); + homa_rpc_end(crpc1); + homa_rpc_end(crpc2); unit_log_clear(); EXPECT_STREQ("1234 1236", dead_rpcs(&self->hsk)); self->homa.reap_limit = 3; @@ -550,7 +550,7 @@ TEST_F(homa_rpc, homa_rpc_reap__protected) self->server_port, self->client_id, 5000, 2000); ASSERT_NE(NULL, crpc1); - homa_rpc_free(crpc1); + homa_rpc_end(crpc1); unit_log_clear(); homa_protect_rpcs(&self->hsk); EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); @@ -564,7 +564,7 @@ TEST_F(homa_rpc, homa_rpc_reap__protected_and_reap_all) self->server_port, self->client_id, 5000, 2000); ASSERT_NE(NULL, crpc1); - homa_rpc_free(crpc1); + homa_rpc_end(crpc1); unit_log_clear(); homa_protect_rpcs(&self->hsk); hook_hsk = &self->hsk; @@ -584,8 +584,8 @@ TEST_F(homa_rpc, 
homa_rpc_reap__skip_rpc_because_of_flags) ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); - homa_rpc_free(crpc1); - homa_rpc_free(crpc2); + homa_rpc_end(crpc1); + homa_rpc_end(crpc2); unit_log_clear(); atomic_or(RPC_COPYING_TO_USER, &crpc1->flags); self->homa.reap_limit = 3; @@ -609,8 +609,8 @@ TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_active_xmits) ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); - homa_rpc_free(crpc1); - homa_rpc_free(crpc2); + homa_rpc_end(crpc1); + homa_rpc_end(crpc2); unit_log_clear(); atomic_inc(&crpc1->msgout.active_xmits); self->homa.reap_limit = 100; @@ -632,8 +632,8 @@ TEST_F(homa_rpc, homa_rpc_reap__grant_in_progress) ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); - homa_rpc_free(crpc1); - homa_rpc_free(crpc2); + homa_rpc_end(crpc1); + homa_rpc_end(crpc2); unit_log_clear(); atomic_inc(&crpc1->grants_in_progress); self->homa.reap_limit = 3; @@ -653,7 +653,7 @@ TEST_F(homa_rpc, homa_rpc_reap__hit_limit_in_msgout_packets) self->server_port, self->client_id, 10000, 100); ASSERT_NE(NULL, crpc); - homa_rpc_free(crpc); + homa_rpc_end(crpc); EXPECT_EQ(9, self->hsk.dead_skbs); unit_log_clear(); self->homa.reap_limit = 5; @@ -670,7 +670,7 @@ TEST_F(homa_rpc, homa_rpc_reap__release_buffers) ASSERT_NE(NULL, crpc); EXPECT_EQ(1, atomic_read(&pool->descriptors[1].refs)); - homa_rpc_free(crpc); + homa_rpc_end(crpc); EXPECT_EQ(1, atomic_read(&pool->descriptors[1].refs)); self->hsk.buffer_pool->check_waiting_invoked = 0; self->homa.reap_limit = 5; @@ -691,7 +691,7 @@ TEST_F(homa_rpc, homa_rpc_reap__free_gaps) EXPECT_STREQ("start 1000, end 2000; start 5000, end 6000, time 1000", unit_print_gaps(crpc)); - homa_rpc_free(crpc); + homa_rpc_end(crpc); self->homa.reap_limit = 5; homa_rpc_reap(&self->hsk, false); // Test framework will complain if memory not freed. @@ -732,10 +732,10 @@ TEST_F(homa_rpc, homa_find_client_rpc) EXPECT_EQ(crpc4, homa_find_client_rpc(&self->hsk, crpc4->id)); homa_rpc_unlock(crpc4); EXPECT_EQ(NULL, homa_find_client_rpc(&self->hsk, 15)); - homa_rpc_free(crpc1); - homa_rpc_free(crpc2); - homa_rpc_free(crpc3); - homa_rpc_free(crpc4); + homa_rpc_end(crpc1); + homa_rpc_end(crpc2); + homa_rpc_end(crpc3); + homa_rpc_end(crpc4); } TEST_F(homa_rpc, homa_find_server_rpc) diff --git a/test/unit_homa_timer.c b/test/unit_homa_timer.c index 316df670..c0c6bced 100644 --- a/test/unit_homa_timer.c +++ b/test/unit_homa_timer.c @@ -272,7 +272,7 @@ TEST_F(homa_timer, homa_timer__reap_dead_rpcs) self->server_port, self->client_id, 40000, 1000); ASSERT_NE(NULL, dead); - homa_rpc_free(dead); + homa_rpc_end(dead); EXPECT_EQ(31, self->hsk.dead_skbs); // First call to homa_timer: not enough dead skbs. 
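A note on the new names, since the surrounding files repeat the same
mechanical substitution: per reap.txt and homa_rpc.c above, homa_rpc_end
only marks an RPC dead and queues it on hsk->dead_rpcs, while
homa_rpc_reap frees the (possibly large) packet buffers later, off the
critical path. A minimal sketch of that two-phase shape, using
hypothetical toy_* types rather than the real Homa declarations:

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/types.h>

/* Hypothetical stand-ins, for illustration only. */
struct toy_rpc {
	bool dead;			/* set by toy_rpc_end */
	struct list_head dead_links;	/* chains onto a dead list */
};

/* Phase 1: cheap enough to run in softirq context; nothing is freed. */
static void toy_rpc_end(struct toy_rpc *rpc, struct list_head *dead_rpcs)
{
	if (rpc->dead)
		return;			/* idempotent, like homa_rpc_end */
	rpc->dead = true;
	list_add_tail(&rpc->dead_links, dead_rpcs);
}

/* Phase 2: deferred reaping, a bounded batch at a time, so freeing a
 * long message cannot delay processing of short ones.
 */
static void toy_rpc_reap(struct list_head *dead_rpcs, int limit)
{
	struct toy_rpc *rpc, *tmp;

	list_for_each_entry_safe(rpc, tmp, dead_rpcs, dead_links) {
		if (limit-- <= 0)
			break;
		list_del(&rpc->dead_links);
		kfree(rpc);
	}
}
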
diff --git a/test/utils.c b/test/utils.c index 81d397b4..e2979fd1 100644 --- a/test/utils.c +++ b/test/utils.c @@ -47,7 +47,7 @@ struct homa_rpc *unit_client_rpc(struct homa_sock *hsk, if (IS_ERR(crpc)) return NULL; if (homa_message_out_fill(crpc, unit_iov_iter(NULL, req_length), 0)) { - homa_rpc_free(crpc); + homa_rpc_end(crpc); return NULL; } homa_rpc_unlock(crpc); @@ -92,7 +92,7 @@ struct homa_rpc *unit_client_rpc(struct homa_sock *hsk, if (state == UNIT_RCVD_MSG) return crpc; FAIL("%s received unexpected state %d", __func__, state); - homa_rpc_free(crpc); + homa_rpc_end(crpc); return NULL; } @@ -405,7 +405,7 @@ struct homa_rpc *unit_server_rpc(struct homa_sock *hsk, FAIL("%s received unexpected state %d", __func__, state); error: - homa_rpc_free(srpc); + homa_rpc_end(srpc); return NULL; } diff --git a/util/tthoma.py b/util/tthoma.py index fa7ebb06..1dc26a6f 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -85,7 +85,7 @@ # ends when last data packet is transmitted by the NIC. # Missing if tx isn't live during the trace. # unsched: # of bytes of unscheduled data in the incoming message -# free Time when the RPC was passed to homa_rpc_free +# end: Time when the RPC was passed to homa_rpc_end # # The following fields will be present if homa_rpc_log_active_tt was invoked # when the timetraces were frozen; they reflect the RPC's state at the end @@ -1426,14 +1426,14 @@ def __bpages_alloced(self, trace, time, core, match, interests): 'regexp': 'RPC id ([0-9]+) has ([0-9]+) bpages allocated' }) - def __rpc_free(self, trace, time, core, match, interests): + def __rpc_end(self, trace, time, core, match, interests): id = int(match.group(1)) for interest in interests: - interest.tt_rpc_free(trace, time, core, id) + interest.tt_rpc_end(trace, time, core, id) patterns.append({ - 'name': 'rpc_free', - 'regexp': 'homa_rpc_free invoked for id ([0-9]+)' + 'name': 'rpc_end', + 'regexp': 'homa_rpc_end invoked for id ([0-9]+)' }) def __grant_recalc_start(self, trace, time, core, match, interests): @@ -5585,8 +5585,8 @@ def tx_end(self, rpc): return None ceiling = None - if 'free' in rpc: - ceiling = rpc['free'] + if 'end' in rpc: + ceiling = rpc['end'] if not (rpc['id'] & 1): if rpc['gro_data']: ceiling = rpc['gro_data'][0][0] @@ -5755,9 +5755,9 @@ def tt_unsched(self, trace, t, core, id, num_bytes): if num_bytes > max_unsched: max_unsched = num_bytes - def tt_rpc_free(self, trace, t, core, id): + def tt_rpc_end(self, trace, t, core, id): global rpcs - rpcs[id]['free'] = t + rpcs[id]['end'] = t def tt_rpc_incoming(self, trace, t, core, id, peer, received, length): global rpcs, max_unsched From 6e8b79bfa1a54acb7ef0897c117114aa277643b0 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 24 Jan 2025 16:34:16 -0800 Subject: [PATCH 162/625] Use skb_queue_purge in homa_rpc_reap instead of hand-coding --- homa_rpc.c | 14 ++++---------- test/mock.c | 7 +++++++ 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/homa_rpc.c b/homa_rpc.c index cf0ed189..63fda72f 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -412,16 +412,10 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) * immediately in rare situations where there are * buffers left. 
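 		 * (The queue length is sampled before skb_queue_purge
 		 * below so that rx_frees still counts the buffers being
 		 * released.)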
*/ - if (rpc->msgin.length >= 0) { - while (1) { - struct sk_buff *skb; - - skb = skb_dequeue(&rpc->msgin.packets); - if (!skb) - break; - kfree_skb(skb); - rx_frees++; - } + if (rpc->msgin.length >= 0 && + !skb_queue_empty_lockless(&rpc->msgin.packets)) { + rx_frees += skb_queue_len(&rpc->msgin.packets); + skb_queue_purge(&rpc->msgin.packets); } /* If we get here, it means all packets have been diff --git a/test/mock.c b/test/mock.c index 51609326..bbb58f25 100644 --- a/test/mock.c +++ b/test/mock.c @@ -1152,6 +1152,13 @@ void *skb_put(struct sk_buff *skb, unsigned int len) return result; } +void skb_queue_purge_reason(struct sk_buff_head *list, + enum skb_drop_reason reason) +{ + while (skb_queue_len(list) > 0) + kfree_skb(__skb_dequeue(list)); +} + struct sk_buff *skb_segment(struct sk_buff *head_skb, netdev_features_t features) { From 4cbf499f6d1640b447a560b28b8736095c186d5c Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Sun, 26 Jan 2025 20:15:37 -0800 Subject: [PATCH 163/625] Eliminate spurious use of rcu for hsk->dead_rpcs --- homa_rpc.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/homa_rpc.c b/homa_rpc.c index 63fda72f..6c9d30a6 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -121,7 +121,7 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, */ bucket = homa_server_rpc_bucket(hsk, id); homa_bucket_lock(bucket, id, "homa_rpc_new_server"); - hlist_for_each_entry_rcu(srpc, &bucket->rpcs, hash_links) { + hlist_for_each_entry(srpc, &bucket->rpcs, hash_links) { if (srpc->id == id && srpc->dport == ntohs(h->common.sport) && ipv6_addr_equal(&srpc->peer->addr, source)) { @@ -282,7 +282,7 @@ void homa_rpc_end(struct homa_rpc *rpc) homa_sock_lock(rpc->hsk, "homa_rpc_end"); __hlist_del(&rpc->hash_links); list_del_rcu(&rpc->active_links); - list_add_tail_rcu(&rpc->dead_links, &rpc->hsk->dead_rpcs); + list_add_tail(&rpc->dead_links, &rpc->hsk->dead_rpcs); __list_del_entry(&rpc->ready_links); __list_del_entry(&rpc->buf_links); if (rpc->interest) { @@ -347,6 +347,7 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) struct sk_buff *skbs[BATCH_MAX]; int num_skbs, num_rpcs; struct homa_rpc *rpc; + struct homa_rpc *tmp; int i, batch_size; int skbs_to_reap; int rx_frees; @@ -383,7 +384,7 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) } /* Collect buffers and freeable RPCs. 
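	 * The _safe variant matters here: entries are unlinked from
	 * dead_rpcs inside the loop, and (unlike list_del_rcu) plain
	 * list_del poisons the links of a removed entry.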
*/ - list_for_each_entry_rcu(rpc, &hsk->dead_rpcs, dead_links) { + list_for_each_entry_safe(rpc, tmp, &hsk->dead_rpcs, dead_links) { if ((atomic_read(&rpc->flags) & RPC_CANT_REAP) || atomic_read(&rpc->grants_in_progress) != 0 || atomic_read(&rpc->msgout.active_xmits) != 0) { @@ -423,7 +424,7 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) */ rpcs[num_rpcs] = rpc; num_rpcs++; - list_del_rcu(&rpc->dead_links); + list_del(&rpc->dead_links); if (num_rpcs >= batch_size) goto release; } @@ -498,7 +499,7 @@ struct homa_rpc *homa_find_client_rpc(struct homa_sock *hsk, u64 id) struct homa_rpc *crpc; homa_bucket_lock(bucket, id, __func__); - hlist_for_each_entry_rcu(crpc, &bucket->rpcs, hash_links) { + hlist_for_each_entry(crpc, &bucket->rpcs, hash_links) { if (crpc->id == id) return crpc; } @@ -525,7 +526,7 @@ struct homa_rpc *homa_find_server_rpc(struct homa_sock *hsk, struct homa_rpc *srpc; homa_bucket_lock(bucket, id, __func__); - hlist_for_each_entry_rcu(srpc, &bucket->rpcs, hash_links) { + hlist_for_each_entry(srpc, &bucket->rpcs, hash_links) { if (srpc->id == id && ipv6_addr_equal(&srpc->peer->addr, saddr)) return srpc; } From 58bb282af41c033f98da9f59d08176f2e0cc90aa Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Sun, 26 Jan 2025 20:52:40 -0800 Subject: [PATCH 164/625] Eliminate use of RCU for homa->throttled_rpcs * Not necessary, unclear that it would have worked. * Needed to add return value from homa_pacer_xmit. --- homa_impl.h | 5 ++-- homa_outgoing.c | 57 +++++++++++++++++---------------------- test/unit_homa_outgoing.c | 32 +++++++++++++++------- 3 files changed, 49 insertions(+), 45 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 990b1644..8dcfea01 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -378,8 +378,7 @@ struct homa { /** * @throttled_rpcs: Contains all homa_rpcs that have bytes ready * for transmission, but which couldn't be sent without exceeding - * the queue limits for transmission. Manipulate only with "_rcu" - * functions. + * the queue limits for transmission. */ struct list_head throttled_rpcs; @@ -1099,7 +1098,7 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, void homa_outgoing_sysctl_changed(struct homa *homa); int homa_pacer_main(void *transport); void homa_pacer_stop(struct homa *homa); -void homa_pacer_xmit(struct homa *homa); +bool homa_pacer_xmit(struct homa *homa); __poll_t homa_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); char *homa_print_ipv4_addr(__be32 addr); diff --git a/homa_outgoing.c b/homa_outgoing.c index 1ee8a63d..c3e564f8 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -813,6 +813,7 @@ int homa_check_nic_queue(struct homa *homa, struct sk_buff *skb, bool force) int homa_pacer_main(void *transport) { struct homa *homa = (struct homa *)transport; + bool work_left; homa->pacer_wake_time = sched_clock(); while (1) { @@ -820,7 +821,7 @@ int homa_pacer_main(void *transport) homa->pacer_wake_time = 0; break; } - homa_pacer_xmit(homa); + work_left = homa_pacer_xmit(homa); /* Sleep this thread if the throttled list is empty. Even * if the throttled list isn't empty, call the scheduler @@ -829,17 +830,15 @@ int homa_pacer_main(void *transport) * incoming packets from being handled). 
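		 * The TASK_INTERRUPTIBLE / __set_current_state pairing
		 * below is the standard kernel sleep pattern: the state
		 * is published before the final decision to sleep, so a
		 * concurrent wake_up_process from homa_add_to_throttled
		 * cannot be lost between that decision and schedule().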
*/ set_current_state(TASK_INTERRUPTIBLE); + if (work_left) + __set_current_state(TASK_RUNNING); #ifndef __STRIP__ /* See strip.py */ - if (list_first_or_null_rcu(&homa->throttled_rpcs, - struct homa_rpc, throttled_links) == NULL) + else { tt_record("pacer sleeping"); - else -#else /* See strip.py */ - if (list_first_or_null_rcu(&homa->throttled_rpcs, - struct homa_rpc, - throttled_links) != NULL) + INC_METRIC(throttled_ns, sched_clock() - + homa->throttle_add); + } #endif /* See strip.py */ - __set_current_state(TASK_RUNNING); INC_METRIC(pacer_ns, sched_clock() - homa->pacer_wake_time); homa->pacer_wake_time = 0; schedule(); @@ -862,17 +861,21 @@ int homa_pacer_main(void *transport) * likelihood that we keep the link busy. Those other invocations are not * guaranteed to happen, so the pacer thread provides a backstop. * @homa: Overall data about the Homa protocol implementation. + * Return: False if there are no throttled RPCs at the time this + * function returns, true if there are throttled RPCs or + * if the answer is unknown at the time of return. */ -void homa_pacer_xmit(struct homa *homa) +bool homa_pacer_xmit(struct homa *homa) { struct homa_rpc *rpc; + bool result = true; int i; /* Make sure only one instance of this function executes at a * time. */ if (!spin_trylock_bh(&homa->pacer_mutex)) - return; + return true; /* Each iteration through the following loop sends one packet. We * limit the number of passes through this loop in order to cap the @@ -916,7 +919,7 @@ void homa_pacer_xmit(struct homa *homa) homa->pacer_fifo_count += 1000; rpc = NULL; - list_for_each_entry_rcu(cur, &homa->throttled_rpcs, + list_for_each_entry(cur, &homa->throttled_rpcs, throttled_links) { if (cur->msgout.init_ns < oldest) { rpc = cur; @@ -924,11 +927,12 @@ void homa_pacer_xmit(struct homa *homa) } } } else { - rpc = list_first_or_null_rcu(&homa->throttled_rpcs, - struct homa_rpc, - throttled_links); + rpc = list_first_entry_or_null(&homa->throttled_rpcs, + struct homa_rpc, + throttled_links); } if (!rpc) { + result = false; homa_throttle_unlock(homa); break; } @@ -957,27 +961,16 @@ void homa_pacer_xmit(struct homa *homa) if (!list_empty(&rpc->throttled_links)) { tt_record2("pacer removing id %d from throttled list, offset %d", rpc->id, rpc->msgout.next_xmit_offset); - list_del_rcu(&rpc->throttled_links); - if (list_empty(&homa->throttled_rpcs)) - INC_METRIC(throttled_ns, sched_clock() - - homa->throttle_add); - - /* Note: this reinitialization is only safe - * because the pacer only looks at the first - * element of the list, rather than traversing - * it (and besides, we know the pacer isn't - * active concurrently, since this code *is* - * the pacer). It would not be safe under more - * general usage patterns. 
- */ - INIT_LIST_HEAD_RCU(&rpc->throttled_links); + list_del_init(&rpc->throttled_links); } + result = !list_empty(&homa->throttled_rpcs); homa_throttle_unlock(homa); } homa_rpc_unlock(rpc); } done: spin_unlock_bh(&homa->pacer_mutex); + return result; } /** @@ -1016,7 +1009,7 @@ void homa_add_to_throttled(struct homa_rpc *rpc) homa->throttle_add = now; bytes_left = rpc->msgout.length - rpc->msgout.next_xmit_offset; homa_throttle_lock(homa); - list_for_each_entry_rcu(candidate, &homa->throttled_rpcs, + list_for_each_entry(candidate, &homa->throttled_rpcs, throttled_links) { int bytes_left_cand; @@ -1028,12 +1021,12 @@ void homa_add_to_throttled(struct homa_rpc *rpc) bytes_left_cand = candidate->msgout.length - candidate->msgout.next_xmit_offset; if (bytes_left_cand > bytes_left) { - list_add_tail_rcu(&rpc->throttled_links, + list_add_tail(&rpc->throttled_links, &candidate->throttled_links); goto done; } } - list_add_tail_rcu(&rpc->throttled_links, &homa->throttled_rpcs); + list_add_tail(&rpc->throttled_links, &homa->throttled_rpcs); done: homa_throttle_unlock(homa); wake_up_process(homa->pacer_kthread); diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 2c0ff0bc..9dc9b9ed 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -1108,7 +1108,7 @@ TEST_F(homa_outgoing, homa_pacer_xmit__basics) self->homa.max_nic_queue_ns = 2000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; unit_log_clear(); - homa_pacer_xmit(&self->homa); + EXPECT_EQ(1, homa_pacer_xmit(&self->homa)); EXPECT_STREQ("xmit DATA 1400@0; xmit DATA 1400@1400", unit_log_get()); unit_log_clear(); @@ -1143,7 +1143,7 @@ TEST_F(homa_outgoing, homa_pacer_xmit__xmit_fifo) self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; unit_log_clear(); mock_xmit_log_verbose = 1; - homa_pacer_xmit(&self->homa); + EXPECT_EQ(1, homa_pacer_xmit(&self->homa)); EXPECT_SUBSTR("id 4, message_length 10000, offset 0, data_length 1400", unit_log_get()); unit_log_clear(); @@ -1156,7 +1156,7 @@ TEST_F(homa_outgoing, homa_pacer_xmit__xmit_fifo) /* Second attempt: pacer_fifo_count reaches zero. 
*/ atomic64_set(&self->homa.link_idle_time, 10000); unit_log_clear(); - homa_pacer_xmit(&self->homa); + EXPECT_EQ(1, homa_pacer_xmit(&self->homa)); EXPECT_SUBSTR("id 2, message_length 20000, offset 0, data_length 1400", unit_log_get()); unit_log_clear(); @@ -1178,7 +1178,7 @@ TEST_F(homa_outgoing, homa_pacer_xmit__pacer_busy) self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; mock_trylock_errors = 1; unit_log_clear(); - homa_pacer_xmit(&self->homa); + EXPECT_EQ(1, homa_pacer_xmit(&self->homa)); EXPECT_STREQ("", unit_log_get()); unit_log_clear(); unit_log_throttled(&self->homa); @@ -1189,7 +1189,7 @@ TEST_F(homa_outgoing, homa_pacer_xmit__queue_empty) self->homa.max_nic_queue_ns = 2000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; unit_log_clear(); - homa_pacer_xmit(&self->homa); + EXPECT_EQ(0, homa_pacer_xmit(&self->homa)); unit_log_throttled(&self->homa); EXPECT_STREQ("", unit_log_get()); } @@ -1206,7 +1206,7 @@ TEST_F(homa_outgoing, homa_pacer_xmit__nic_queue_fills) atomic64_set(&self->homa.link_idle_time, 12000); self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; unit_log_clear(); - homa_pacer_xmit(&self->homa); + EXPECT_EQ(1, homa_pacer_xmit(&self->homa)); EXPECT_STREQ("xmit DATA 1400@0", unit_log_get()); unit_log_clear(); unit_log_throttled(&self->homa); @@ -1224,12 +1224,12 @@ TEST_F(homa_outgoing, homa_pacer_xmit__rpc_locked) self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; unit_log_clear(); mock_trylock_errors = ~1; - homa_pacer_xmit(&self->homa); + EXPECT_EQ(1, homa_pacer_xmit(&self->homa)); EXPECT_STREQ("", unit_log_get()); EXPECT_EQ(1, homa_metrics_per_cpu()->pacer_skipped_rpcs); unit_log_clear(); mock_trylock_errors = 0; - homa_pacer_xmit(&self->homa); + EXPECT_EQ(1, homa_pacer_xmit(&self->homa)); EXPECT_STREQ("xmit DATA 1400@0; xmit DATA 1400@1400", unit_log_get()); } @@ -1242,20 +1242,32 @@ TEST_F(homa_outgoing, homa_pacer_xmit__remove_from_queue) struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 4, - 10000, 1000); + 2000, 1000); homa_add_to_throttled(crpc1); homa_add_to_throttled(crpc2); self->homa.max_nic_queue_ns = 2000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; unit_log_clear(); - homa_pacer_xmit(&self->homa); + + /* First call completes id 2, but id 4 is still in the queue. */ + EXPECT_EQ(1, homa_pacer_xmit(&self->homa)); EXPECT_STREQ("xmit DATA 1000@0; xmit DATA 1400@0", unit_log_get()); unit_log_clear(); unit_log_throttled(&self->homa); EXPECT_STREQ("request id 4, next_offset 1400", unit_log_get()); EXPECT_TRUE(list_empty(&crpc1->throttled_links)); + + /* Second call completes id 4, queue now empty. */ + unit_log_clear(); + self->homa.max_nic_queue_ns = 10000; + EXPECT_EQ(0, homa_pacer_xmit(&self->homa)); + EXPECT_STREQ("xmit DATA 600@1400", unit_log_get()); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("", unit_log_get()); + EXPECT_TRUE(list_empty(&crpc2->throttled_links)); } /* Don't know how to unit test homa_pacer_stop... 
*/ From 5416b5c34613433110ae40d365f5cd954362ebf2 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Sun, 26 Jan 2025 21:07:19 -0800 Subject: [PATCH 165/625] Call rcu_read_lock/unlock in homa_peer_find --- homa_peer.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/homa_peer.c b/homa_peer.c index 5f53ad53..404ba2bc 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -146,9 +146,6 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab, const struct in6_addr *addr, struct inet_sock *inet) { - /* Note: this function uses RCU operators to ensure safety even - * if a concurrent call is adding a new entry. - */ struct homa_peer *peer; struct dst_entry *dst; @@ -162,12 +159,22 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab, HOMA_PEERTAB_BUCKET_BITS); bucket ^= hash_32((__force u32)addr->in6_u.u6_addr32[3], HOMA_PEERTAB_BUCKET_BITS); + + /* Use RCU operators to ensure safety even if a concurrent call is + * adding a new entry. The calls to rcu_read_lock and rcu_read_unlock + * shouldn't actually be needed, since we don't need to protect + * against concurrent deletion. + */ + rcu_read_lock(); hlist_for_each_entry_rcu(peer, &peertab->buckets[bucket], peertab_links) { - if (ipv6_addr_equal(&peer->addr, addr)) + if (ipv6_addr_equal(&peer->addr, addr)) { + rcu_read_unlock(); return peer; + } INC_METRIC(peer_hash_links, 1); } + rcu_read_unlock(); /* No existing entry; create a new one. * @@ -176,8 +183,8 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab, * created by a concurrent invocation of this function). */ spin_lock_bh(&peertab->write_lock); - hlist_for_each_entry_rcu(peer, &peertab->buckets[bucket], - peertab_links) { + hlist_for_each_entry(peer, &peertab->buckets[bucket], + peertab_links) { if (ipv6_addr_equal(&peer->addr, addr)) goto done; } From 0eeca4fc25db53ef3871582e0aad3c71951bb054 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Sun, 26 Jan 2025 21:10:48 -0800 Subject: [PATCH 166/625] Eliminate extraneous use of RCU in homa_pool_allocate --- homa_pool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/homa_pool.c b/homa_pool.c index 8920bf59..645b0f81 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -367,7 +367,7 @@ int homa_pool_allocate(struct homa_rpc *rpc) goto queued; } } - list_add_tail_rcu(&rpc->buf_links, &pool->hsk->waiting_for_bufs); + list_add_tail(&rpc->buf_links, &pool->hsk->waiting_for_bufs); queued: set_bpages_needed(pool); From d087aecbe27ddbcc4fbd9e2cb98a016e9344a061 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Sun, 26 Jan 2025 21:40:24 -0800 Subject: [PATCH 167/625] Remove locker argument from locking functions Functions affected: homa_rpc_lock, homa_sock_lock, homa_rpc_try_lock, and homa_bucket_lock. This should no longer be necessary now that I know about CONFIG_PROVE_LOCKING. 
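
(For context: CONFIG_PROVE_LOCKING enables lockdep, which records every
lock acquisition site automatically and identifies those sites in its
reports, so the hand-maintained locker strings no longer add value.)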
--- homa_grant.c | 4 ++-- homa_incoming.c | 20 ++++++++++---------- homa_outgoing.c | 10 +++++----- homa_plumbing.c | 2 +- homa_pool.c | 8 ++++---- homa_rpc.c | 18 +++++++++--------- homa_rpc.h | 12 ++++-------- homa_sock.c | 19 +++++++++---------- homa_sock.h | 23 +++++------------------ homa_timer.c | 2 +- test/unit_homa_grant.c | 26 +++++++++++++------------- test/unit_homa_sock.c | 4 ++-- 12 files changed, 65 insertions(+), 83 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index d9a8ed37..c80d4a08 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -469,7 +469,7 @@ void homa_grant_recalc(struct homa *homa, int locked) for (i = 0; i < active; i++) { struct homa_rpc *rpc = active_rpcs[i]; - homa_rpc_lock(rpc, "homa_grant_recalc"); + homa_rpc_lock(rpc); homa_grant_send(rpc, homa); try_again += homa_grant_update_incoming(rpc, homa); if (rpc->msgin.granted >= rpc->msgin.length) { @@ -626,7 +626,7 @@ void homa_grant_free_rpc(struct homa_rpc *rpc) */ homa_rpc_unlock(rpc); homa_grant_recalc(homa, 1); - homa_rpc_lock(rpc, "homa_grant_free_rpc"); + homa_rpc_lock(rpc); } else { homa_grantable_unlock(homa); } diff --git a/homa_incoming.c b/homa_incoming.c index e1b14138..0786625d 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -342,7 +342,7 @@ int homa_copy_to_user(struct homa_rpc *rpc) n, rpc->id); n = 0; atomic_or(APP_NEEDS_LOCK, &rpc->flags); - homa_rpc_lock(rpc, "homa_copy_to_user"); + homa_rpc_lock(rpc); atomic_andnot(APP_NEEDS_LOCK | RPC_COPYING_TO_USER, &rpc->flags); if (error) @@ -613,7 +613,7 @@ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) if (skb_queue_len(&rpc->msgin.packets) != 0 && !(atomic_read(&rpc->flags) & RPC_PKTS_READY)) { atomic_or(RPC_PKTS_READY, &rpc->flags); - homa_sock_lock(rpc->hsk, "homa_data_pkt"); + homa_sock_lock(rpc->hsk); homa_rpc_handoff(rpc); homa_sock_unlock(rpc->hsk); } @@ -961,7 +961,7 @@ struct homa_rpc *homa_choose_fifo_grant(struct homa *homa) * the RPC, just skip it (waiting could deadlock), and it * will eventually get updated elsewhere. */ - if (homa_rpc_try_lock(oldest, "homa_choose_fifo_grant")) { + if (homa_rpc_try_lock(oldest)) { homa_grant_update_incoming(oldest, homa); homa_rpc_unlock(oldest); } @@ -998,7 +998,7 @@ void homa_rpc_abort(struct homa_rpc *rpc, int error) tt_record3("aborting client RPC: peer 0x%x, id %d, error %d", tt_addr(rpc->peer->addr), rpc->id, error); rpc->error = error; - homa_sock_lock(rpc->hsk, "homa_rpc_abort"); + homa_sock_lock(rpc->hsk); if (!rpc->hsk->shutdown) homa_rpc_handoff(rpc); homa_sock_unlock(rpc->hsk); @@ -1036,7 +1036,7 @@ void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, continue; if (port && rpc->dport != port) continue; - homa_rpc_lock(rpc, "rpc_abort_rpcs"); + homa_rpc_lock(rpc); homa_rpc_abort(rpc, error); homa_rpc_unlock(rpc); } @@ -1068,7 +1068,7 @@ void homa_abort_sock_rpcs(struct homa_sock *hsk, int error) list_for_each_entry_safe(rpc, tmp, &hsk->active_rpcs, active_links) { if (!homa_is_client(rpc->id)) continue; - homa_rpc_lock(rpc, "homa_abort_sock_rpcs"); + homa_rpc_lock(rpc); if (rpc->state == RPC_DEAD) { homa_rpc_unlock(rpc); continue; @@ -1126,7 +1126,7 @@ int homa_register_interests(struct homa_interest *interest, /* Need both the RPC lock (acquired above) and the socket lock to * avoid races. 
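	 * (For example, the hsk->shutdown check below must stay
	 * consistent with any concurrent homa_sock_shutdown.)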
*/ - homa_sock_lock(hsk, "homa_register_interests"); + homa_sock_lock(hsk); if (hsk->shutdown) { homa_sock_unlock(hsk); if (rpc) @@ -1190,7 +1190,7 @@ int homa_register_interests(struct homa_interest *interest, homa_sock_unlock(hsk); if (!locked) { atomic_or(APP_NEEDS_LOCK, &rpc->flags); - homa_rpc_lock(rpc, "homa_register_interests"); + homa_rpc_lock(rpc); atomic_andnot(APP_NEEDS_LOCK, &rpc->flags); locked = 1; } @@ -1332,7 +1332,7 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, if (interest.reg_rpc || !list_empty(&interest.request_links) || !list_empty(&interest.response_links)) { - homa_sock_lock(hsk, "homa_wait_for_message"); + homa_sock_lock(hsk); if (interest.reg_rpc) interest.reg_rpc->interest = NULL; if (!list_empty(&interest.request_links)) @@ -1352,7 +1352,7 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, rpc->id, current->pid); if (!interest.locked) { atomic_or(APP_NEEDS_LOCK, &rpc->flags); - homa_rpc_lock(rpc, "homa_wait_for_message"); + homa_rpc_lock(rpc); atomic_andnot(APP_NEEDS_LOCK | RPC_HANDING_OFF, &rpc->flags); } else { diff --git a/homa_outgoing.c b/homa_outgoing.c index c3e564f8..e0f26a6b 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -292,12 +292,12 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) max_seg_data); if (unlikely(IS_ERR(skb))) { err = PTR_ERR(skb); - homa_rpc_lock(rpc, "homa_message_out_fill"); + homa_rpc_lock(rpc); goto error; } bytes_left -= skb_data_bytes; - homa_rpc_lock(rpc, "homa_message_out_fill2"); + homa_rpc_lock(rpc); if (rpc->state == RPC_DEAD) { /* RPC was freed while we were copying. */ err = -EINVAL; @@ -552,7 +552,7 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) txq->dql.num_queued, txq->dql.adj_limit); #endif /* See strip.py */ force = false; - homa_rpc_lock(rpc, "homa_xmit_data"); + homa_rpc_lock(rpc); if (rpc->state == RPC_DEAD) break; } @@ -936,7 +936,7 @@ bool homa_pacer_xmit(struct homa *homa) homa_throttle_unlock(homa); break; } - if (!homa_rpc_try_lock(rpc, "homa_pacer_xmit")) { + if (!homa_rpc_try_lock(rpc)) { homa_throttle_unlock(homa); INC_METRIC(pacer_skipped_rpcs, 1); break; @@ -1069,7 +1069,7 @@ void homa_log_throttled(struct homa *homa) homa_throttle_lock(homa); list_for_each_entry_rcu(rpc, &homa->throttled_rpcs, throttled_links) { rpcs++; - if (!homa_rpc_try_lock(rpc, "homa_log_throttled")) { + if (!homa_rpc_try_lock(rpc)) { pr_notice("Skipping throttled RPC: locked\n"); continue; } diff --git a/homa_plumbing.c b/homa_plumbing.c index 3d6ce000..2018c04f 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -868,7 +868,7 @@ int homa_setsockopt(struct sock *sk, int level, int optname, sizeof(args))) return -EFAULT; - homa_sock_lock(hsk, "homa_setsockopt SO_HOMA_RCV_BUF"); + homa_sock_lock(hsk); ret = homa_pool_init(hsk, u64_to_user_ptr(args.start), args.length); homa_sock_unlock(hsk); INC_METRIC(so_set_buf_calls, 1); diff --git a/homa_pool.c b/homa_pool.c index 645b0f81..d7439f31 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -133,7 +133,7 @@ void homa_pool_destroy(struct homa_pool *pool) void homa_pool_get_rcvbuf(struct homa_sock *hsk, struct homa_rcvbuf_args *args) { - homa_sock_lock(hsk, "homa_pool_get_rcvbuf"); + homa_sock_lock(hsk); args->start = (uintptr_t)hsk->buffer_pool->region; args->length = hsk->buffer_pool->num_bpages << HOMA_BPAGE_SHIFT; homa_sock_unlock(hsk); @@ -360,7 +360,7 @@ int homa_pool_allocate(struct homa_rpc *rpc) tt_record4("Buffer allocation failed, port %d, id %d, length %d, free_bpages %d", 
pool->hsk->port, rpc->id, rpc->msgin.length, atomic_read(&pool->free_bpages)); - homa_sock_lock(pool->hsk, "homa_pool_allocate"); + homa_sock_lock(pool->hsk); list_for_each_entry(other, &pool->hsk->waiting_for_bufs, buf_links) { if (other->msgin.length > rpc->msgin.length) { list_add_tail(&rpc->buf_links, &other->buf_links); @@ -462,7 +462,7 @@ void homa_pool_check_waiting(struct homa_pool *pool) while (atomic_read(&pool->free_bpages) >= pool->bpages_needed) { struct homa_rpc *rpc; - homa_sock_lock(pool->hsk, "buffer pool"); + homa_sock_lock(pool->hsk); if (list_empty(&pool->hsk->waiting_for_bufs)) { pool->bpages_needed = INT_MAX; homa_sock_unlock(pool->hsk); @@ -470,7 +470,7 @@ void homa_pool_check_waiting(struct homa_pool *pool) } rpc = list_first_entry(&pool->hsk->waiting_for_bufs, struct homa_rpc, buf_links); - if (!homa_rpc_try_lock(rpc, "homa_pool_check_waiting")) { + if (!homa_rpc_try_lock(rpc)) { /* Can't just spin on the RPC lock because we're * holding the socket lock (see sync.txt). Instead, * release the socket lock and try the entire diff --git a/homa_rpc.c b/homa_rpc.c index 6c9d30a6..aefada36 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -71,8 +71,8 @@ struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk, * to be performed without holding locks. Also, can't hold spin * locks while doing things that could block, such as memory allocation. */ - homa_bucket_lock(bucket, crpc->id, "homa_rpc_new_client"); - homa_sock_lock(hsk, "homa_rpc_new_client"); + homa_bucket_lock(bucket, crpc->id); + homa_sock_lock(hsk); if (hsk->shutdown) { homa_sock_unlock(hsk); homa_rpc_unlock(crpc); @@ -120,7 +120,7 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, * the desired RPC. */ bucket = homa_server_rpc_bucket(hsk, id); - homa_bucket_lock(bucket, id, "homa_rpc_new_server"); + homa_bucket_lock(bucket, id); hlist_for_each_entry(srpc, &bucket->rpcs, hash_links) { if (srpc->id == id && srpc->dport == ntohs(h->common.sport) && @@ -176,7 +176,7 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, goto error; /* Initialize fields that require socket to be locked. */ - homa_sock_lock(hsk, "homa_rpc_new_server"); + homa_sock_lock(hsk); if (hsk->shutdown) { homa_sock_unlock(hsk); err = -ESHUTDOWN; @@ -279,7 +279,7 @@ void homa_rpc_end(struct homa_rpc *rpc) homa_grant_free_rpc(rpc); /* Unlink from all lists, so no-one will ever find this RPC again. */ - homa_sock_lock(rpc->hsk, "homa_rpc_end"); + homa_sock_lock(rpc->hsk); __hlist_del(&rpc->hash_links); list_del_rcu(&rpc->active_links); list_add_tail(&rpc->dead_links, &rpc->hsk->dead_rpcs); @@ -371,7 +371,7 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) num_rpcs = 0; rx_frees = 0; - homa_sock_lock(hsk, "homa_rpc_reap"); + homa_sock_lock(hsk); if (atomic_read(&hsk->protect_count)) { INC_METRIC(disabled_reaps, 1); tt_record2("homa_rpc_reap returning: protect_count %d, dead_skbs %d", @@ -446,7 +446,7 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) * that invoked homa_rpc_end hasn't unlocked the * RPC yet. 
*/ - homa_rpc_lock(rpc, "homa_rpc_reap"); + homa_rpc_lock(rpc); homa_rpc_unlock(rpc); if (unlikely(rpc->msgin.num_bpages)) @@ -498,7 +498,7 @@ struct homa_rpc *homa_find_client_rpc(struct homa_sock *hsk, u64 id) struct homa_rpc_bucket *bucket = homa_client_rpc_bucket(hsk, id); struct homa_rpc *crpc; - homa_bucket_lock(bucket, id, __func__); + homa_bucket_lock(bucket, id); hlist_for_each_entry(crpc, &bucket->rpcs, hash_links) { if (crpc->id == id) return crpc; @@ -525,7 +525,7 @@ struct homa_rpc *homa_find_server_rpc(struct homa_sock *hsk, struct homa_rpc_bucket *bucket = homa_server_rpc_bucket(hsk, id); struct homa_rpc *srpc; - homa_bucket_lock(bucket, id, __func__); + homa_bucket_lock(bucket, id); hlist_for_each_entry(srpc, &bucket->rpcs, hash_links) { if (srpc->id == id && ipv6_addr_equal(&srpc->peer->addr, saddr)) return srpc; diff --git a/homa_rpc.h b/homa_rpc.h index 5840bc19..874b22f2 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -451,23 +451,19 @@ int homa_validate_incoming(struct homa *homa, int verbose, * One approach is to use homa_protect_rpcs. Don't use this function * unless you are very sure what you are doing! See sync.txt for * more info on locking. - * @locker: Static string identifying the locking code. Normally ignored, - * but used occasionally for diagnostics and debugging. */ -static inline void homa_rpc_lock(struct homa_rpc *rpc, const char *locker) +static inline void homa_rpc_lock(struct homa_rpc *rpc) { - homa_bucket_lock(rpc->bucket, rpc->id, locker); + homa_bucket_lock(rpc->bucket, rpc->id); } /** * homa_rpc_try_lock() - Acquire the lock for an RPC if it is available. * @rpc: RPC to lock. - * @locker: Static string identifying the locking code. Normally ignored, - * but used when debugging deadlocks. * Return: Nonzero if lock was successfully acquired, zero if it is * currently owned by someone else. 
*/ -static inline int homa_rpc_try_lock(struct homa_rpc *rpc, const char *locker) +static inline int homa_rpc_try_lock(struct homa_rpc *rpc) { if (!spin_trylock_bh(&rpc->bucket->lock)) return 0; @@ -498,7 +494,7 @@ static inline int homa_protect_rpcs(struct homa_sock *hsk) { int result; - homa_sock_lock(hsk, __func__); + homa_sock_lock(hsk); result = !hsk->shutdown; if (result) atomic_inc(&hsk->protect_count); diff --git a/homa_sock.c b/homa_sock.c index 42311055..029075cf 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -139,7 +139,6 @@ int homa_sock_init(struct homa_sock *hsk, struct homa *homa) spin_lock_bh(&socktab->write_lock); atomic_set(&hsk->protect_count, 0); spin_lock_init(&hsk->lock); - hsk->last_locker = "none"; atomic_set(&hsk->protect_count, 0); hsk->homa = homa; hsk->ip_header_length = (hsk->inet.sk.sk_family == AF_INET) @@ -235,7 +234,7 @@ void homa_sock_shutdown(struct homa_sock *hsk) int i = 0; #endif /* See strip.py */ - homa_sock_lock(hsk, "homa_socket_shutdown"); + homa_sock_lock(hsk); if (hsk->shutdown) { homa_sock_unlock(hsk); return; @@ -260,12 +259,12 @@ void homa_sock_shutdown(struct homa_sock *hsk) homa_sock_unlock(hsk); list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { - homa_rpc_lock(rpc, "homa_sock_shutdown"); + homa_rpc_lock(rpc); homa_rpc_end(rpc); homa_rpc_unlock(rpc); } - homa_sock_lock(hsk, "homa_socket_shutdown #2"); + homa_sock_lock(hsk); list_for_each_entry(interest, &hsk->request_interests, request_links) wake_up_process(interest->thread); list_for_each_entry(interest, &hsk->response_interests, response_links) @@ -325,7 +324,7 @@ int homa_sock_bind(struct homa_socktab *socktab, struct homa_sock *hsk, return result; if (port >= HOMA_MIN_DEFAULT_PORT) return -EINVAL; - homa_sock_lock(hsk, "homa_sock_bind"); + homa_sock_lock(hsk); spin_lock_bh(&socktab->write_lock); if (hsk->shutdown) { result = -ESHUTDOWN; @@ -402,19 +401,19 @@ void homa_sock_lock_slow(struct homa_sock *hsk) * lock isn't immediately available. It waits for the lock, but also records * statistics about the waiting time. * @bucket: The hash table bucket to lock. - * @id: ID of the particular RPC being locked (multiple RPCs may - * share a single bucket lock). + * @id: Id of the RPC on whose behalf the bucket is being locked. + * Used only for metrics. */ void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, u64 id) __acquires(&bucket->lock) { u64 start = sched_clock(); - tt_record2("beginning wait for rpc lock, id %d (bucket %d)", + tt_record2("beginning wait for rpc lock, id %d, (bucket %d)", id, bucket->id); spin_lock_bh(&bucket->lock); - tt_record2("ending wait for bucket lock, id %d (bucket %d)", - id, bucket->id); + tt_record2("ending wait for bucket lock, id %d, (bucket %d)", + id, bucket->id); if (homa_is_client(id)) { INC_METRIC(client_lock_misses, 1); INC_METRIC(client_lock_miss_ns, sched_clock() - start); diff --git a/homa_sock.h b/homa_sock.h index b1963c58..1b17518c 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -152,12 +152,6 @@ struct homa_sock { */ spinlock_t lock; - /** - * @last_locker: identifies the code that most recently acquired - * @lock successfully. Occasionally used for debugging. - */ - char *last_locker; - /** * @protect_count: counts the number of calls to homa_protect_rpcs * for which there have not yet been calls to homa_unprotect_rpcs. @@ -301,16 +295,12 @@ struct homa_sock *homa_socktab_start_scan(struct homa_socktab *socktab, * homa_sock_lock() - Acquire the lock for a socket. 
If the socket * isn't immediately available, record stats on the waiting time. * @hsk: Socket to lock. - * @locker: Static string identifying where the socket was locked. - * Not normally used, but can be helpful when tracking down - * deadlocks. */ -static inline void homa_sock_lock(struct homa_sock *hsk, const char *locker) +static inline void homa_sock_lock(struct homa_sock *hsk) __acquires(&hsk->lock) { if (!spin_trylock_bh(&hsk->lock)) homa_sock_lock_slow(hsk); -// hsk->last_locker = locker; } /** @@ -378,14 +368,11 @@ static inline struct homa_rpc_bucket *homa_server_rpc_bucket(struct homa_sock *h /** * homa_bucket_lock() - Acquire the lock for an RPC hash table bucket. - * @bucket: Bucket to lock - * @id: ID of the RPC that is requesting the lock. Normally ignored, - * but used occasionally for diagnostics and debugging. - * @locker: Static string identifying the locking code. Normally ignored, - * but used occasionally for diagnostics and debugging. + * @bucket: Bucket to lock. + * @id: Id of the RPC on whose behalf the bucket is being locked. + * Used only for metrics. */ -static inline void homa_bucket_lock(struct homa_rpc_bucket *bucket, - u64 id, const char *locker) +static inline void homa_bucket_lock(struct homa_rpc_bucket *bucket, u64 id) { if (!spin_trylock_bh(&bucket->lock)) homa_bucket_lock_slow(bucket, id); diff --git a/homa_timer.c b/homa_timer.c index 0e7d20f1..39cf158e 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -225,7 +225,7 @@ void homa_timer(struct homa *homa) continue; list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { total_rpcs++; - homa_rpc_lock(rpc, "homa_timer"); + homa_rpc_lock(rpc); if (rpc->state == RPC_IN_SERVICE) { rpc->silent_ticks = 0; homa_rpc_unlock(rpc); diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 148cd6e4..e4ce7dff 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -516,7 +516,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__msgin_not_initialized) rpc->msgin.bytes_remaining = 500; rpc->msgin.granted = 2000; rpc->msgin.rec_incoming = 0; - homa_rpc_lock(rpc, "test"); + homa_rpc_lock(rpc); homa_grant_check_rpc(rpc); EXPECT_EQ(0, rpc->msgin.rec_incoming); EXPECT_EQ(0, atomic_read(&self->homa.total_incoming)); @@ -529,7 +529,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__rpc_dead) int old_state; homa_message_in_init(rpc, 2000, 0); - homa_rpc_lock(rpc, "test"); + homa_rpc_lock(rpc); homa_grant_check_rpc(rpc); EXPECT_EQ(2000, rpc->msgin.rec_incoming); EXPECT_EQ(2000, atomic_read(&self->homa.total_incoming)); @@ -537,7 +537,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__rpc_dead) old_state = rpc->state; rpc->state = RPC_DEAD; rpc->msgin.bytes_remaining = 0; - homa_rpc_lock(rpc, "test"); + homa_rpc_lock(rpc); homa_grant_check_rpc(rpc); rpc->state = old_state; EXPECT_EQ(2000, rpc->msgin.rec_incoming); @@ -553,13 +553,13 @@ TEST_F(homa_grant, homa_grant_check_rpc__message_doesnt_need_grants) rpc->msgin.granted = 2000; rpc->msgin.bytes_remaining = 500; - homa_rpc_lock(rpc, "test"); + homa_rpc_lock(rpc); homa_grant_check_rpc(rpc); EXPECT_EQ(500, rpc->msgin.rec_incoming); EXPECT_EQ(500, atomic_read(&self->homa.total_incoming)); rpc->msgin.bytes_remaining = 0; - homa_rpc_lock(rpc, "test"); + homa_rpc_lock(rpc); homa_grant_check_rpc(rpc); EXPECT_EQ(0, rpc->msgin.rec_incoming); EXPECT_EQ(0, atomic_read(&self->homa.total_incoming)); @@ -573,7 +573,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__add_new_message_to_grantables) homa_message_in_init(rpc, 20000, 0); rpc->msgin.bytes_remaining = 12000; - homa_rpc_lock(rpc, 
"test"); + homa_rpc_lock(rpc); homa_grant_check_rpc(rpc); EXPECT_EQ(18000, rpc->msgin.granted); EXPECT_EQ(10000, rpc->msgin.rec_incoming); @@ -594,7 +594,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__new_message_bumps_existing) rpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 104, 1000, 25000); homa_message_in_init(rpc3, 20000, 0); - homa_rpc_lock(rpc3, "test"); + homa_rpc_lock(rpc3); homa_grant_check_rpc(rpc3); EXPECT_EQ(10000, rpc3->msgin.granted); EXPECT_EQ(10000, rpc3->msgin.rec_incoming); @@ -617,7 +617,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__new_message_cant_be_granted) rpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 104, 1000, 30000); homa_message_in_init(rpc3, 30000, 0); - homa_rpc_lock(rpc3, "test"); + homa_rpc_lock(rpc3); homa_grant_check_rpc(rpc3); EXPECT_EQ(0, rpc3->msgin.granted); EXPECT_EQ(0, rpc3->msgin.rec_incoming); @@ -640,7 +640,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__upgrade_priority_from_negative_rank) EXPECT_EQ(0, rpc3->msgin.granted); rpc3->msgin.bytes_remaining = 15000; - homa_rpc_lock(rpc3, "test"); + homa_rpc_lock(rpc3); homa_grant_check_rpc(rpc3); EXPECT_EQ(35000, rpc3->msgin.granted); EXPECT_EQ(10000, rpc3->msgin.rec_incoming); @@ -664,7 +664,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__upgrade_priority_from_positive_rank) rpc3->msgin.bytes_remaining = 25000; unit_log_clear(); - homa_rpc_lock(rpc3, "test"); + homa_rpc_lock(rpc3); homa_grant_check_rpc(rpc3); EXPECT_EQ(25000, rpc3->msgin.granted); EXPECT_EQ(10000, rpc3->msgin.rec_incoming); @@ -685,7 +685,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__send_new_grant) rpc->msgin.bytes_remaining = 35000; unit_log_clear(); - homa_rpc_lock(rpc, "test"); + homa_rpc_lock(rpc); homa_grant_check_rpc(rpc); EXPECT_EQ(15000, rpc->msgin.granted); EXPECT_EQ(10000, rpc->msgin.rec_incoming); @@ -706,7 +706,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__remove_from_grantable) rpc->msgin.granted = 30000; rpc->msgin.rec_incoming = 10000; unit_log_clear(); - homa_rpc_lock(rpc, "test"); + homa_rpc_lock(rpc); homa_grant_check_rpc(rpc); EXPECT_EQ(40000, rpc->msgin.granted); EXPECT_EQ(10000, rpc->msgin.rec_incoming); @@ -732,7 +732,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__recalc_because_of_headroom) rpc1->msgin.granted = 12000; rpc1->msgin.rec_incoming = 10000; unit_log_clear(); - homa_rpc_lock(rpc1, "test"); + homa_rpc_lock(rpc1); homa_grant_check_rpc(rpc1); EXPECT_EQ(20000, rpc1->msgin.granted); EXPECT_EQ(4000, rpc1->msgin.rec_incoming); diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c index 166aa571..e88d2879 100644 --- a/test/unit_homa_sock.c +++ b/test/unit_homa_sock.c @@ -396,13 +396,13 @@ TEST_F(homa_sock, homa_sock_lock_slow) { mock_ns_tick = 100; - homa_sock_lock(&self->hsk, "unit test"); + homa_sock_lock(&self->hsk); EXPECT_EQ(0, homa_metrics_per_cpu()->socket_lock_misses); EXPECT_EQ(0, homa_metrics_per_cpu()->socket_lock_miss_ns); homa_sock_unlock(&self->hsk); mock_trylock_errors = 1; - homa_sock_lock(&self->hsk, "unit test"); + homa_sock_lock(&self->hsk); EXPECT_EQ(1, homa_metrics_per_cpu()->socket_lock_misses); EXPECT_EQ(100, homa_metrics_per_cpu()->socket_lock_miss_ns); homa_sock_unlock(&self->hsk); From f59723ebeff1a7f20deabb18692eff539cc43596 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 27 Jan 2025 09:22:35 -0800 Subject: [PATCH 168/625] Use __GFP_ZERO in kmalloc calls This replaces code that explicitly initialized fields to zero. 
--- homa_peer.c | 8 +------- homa_pool.c | 14 +++----------- homa_rpc.c | 22 ++-------------------- homa_skb.c | 5 +---- 4 files changed, 7 insertions(+), 42 deletions(-) diff --git a/homa_peer.c b/homa_peer.c index 404ba2bc..05daf424 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -188,7 +188,7 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab, if (ipv6_addr_equal(&peer->addr, addr)) goto done; } - peer = kmalloc(sizeof(*peer), GFP_ATOMIC); + peer = kmalloc(sizeof(*peer), GFP_ATOMIC | __GFP_ZERO); if (!peer) { peer = (struct homa_peer *)ERR_PTR(-ENOMEM); INC_METRIC(peer_kmalloc_errors, 1); @@ -210,13 +210,7 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab, INIT_LIST_HEAD(&peer->grantable_rpcs); INIT_LIST_HEAD(&peer->grantable_links); hlist_add_head_rcu(&peer->peertab_links, &peertab->buckets[bucket]); - peer->outstanding_resends = 0; - peer->most_recent_resend = 0; - peer->least_recent_rpc = NULL; - peer->least_recent_ticks = 0; peer->current_ticks = -1; - peer->resend_rpc = NULL; - peer->num_acks = 0; spin_lock_init(&peer->ack_lock); INC_METRIC(peer_new_entries, 1); diff --git a/homa_pool.c b/homa_pool.c index d7439f31..3e435990 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -69,7 +69,7 @@ int homa_pool_init(struct homa_sock *hsk, void __user *region, } pool->descriptors = kmalloc_array(pool->num_bpages, sizeof(struct homa_bpage), - GFP_ATOMIC); + GFP_ATOMIC | __GFP_ZERO); if (!pool->descriptors) { result = -ENOMEM; goto error; @@ -78,27 +78,19 @@ int homa_pool_init(struct homa_sock *hsk, void __user *region, struct homa_bpage *bp = &pool->descriptors[i]; spin_lock_init(&bp->lock); - atomic_set(&bp->refs, 0); bp->owner = -1; - bp->expiration = 0; } atomic_set(&pool->free_bpages, pool->num_bpages); pool->bpages_needed = INT_MAX; /* Allocate and initialize core-specific data. 
*/ - pool->cores = alloc_percpu_gfp(struct homa_pool_core, GFP_ATOMIC); + pool->cores = alloc_percpu_gfp(struct homa_pool_core, + GFP_ATOMIC | __GFP_ZERO); if (!pool->cores) { result = -ENOMEM; goto error; } pool->num_cores = nr_cpu_ids; - for (i = 0; i < pool->num_cores; i++) { - struct homa_pool_core *core = per_cpu_ptr(pool->cores, i); - - core->page_hint = 0; - core->allocated = 0; - core->next_candidate = 0; - } pool->check_waiting_invoked = 0; return 0; diff --git a/homa_rpc.c b/homa_rpc.c index aefada36..82841f24 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -28,7 +28,7 @@ struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk, struct homa_rpc *crpc; int err; - crpc = kmalloc(sizeof(*crpc), GFP_KERNEL); + crpc = kmalloc(sizeof(*crpc), GFP_KERNEL | __GFP_ZERO); if (unlikely(!crpc)) return ERR_PTR(-ENOMEM); @@ -38,8 +38,6 @@ struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk, bucket = homa_client_rpc_bucket(hsk, crpc->id); crpc->bucket = bucket; crpc->state = RPC_OUTGOING; - atomic_set(&crpc->flags, 0); - atomic_set(&crpc->grants_in_progress, 0); crpc->peer = homa_peer_find(hsk->homa->peers, &dest_addr_as_ipv6, &hsk->inet); if (IS_ERR(crpc->peer)) { @@ -48,21 +46,14 @@ struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk, goto error; } crpc->dport = ntohs(dest->in6.sin6_port); - crpc->completion_cookie = 0; - crpc->error = 0; crpc->msgin.length = -1; - crpc->msgin.num_bpages = 0; - memset(&crpc->msgout, 0, sizeof(crpc->msgout)); crpc->msgout.length = -1; INIT_LIST_HEAD(&crpc->ready_links); INIT_LIST_HEAD(&crpc->buf_links); INIT_LIST_HEAD(&crpc->dead_links); - crpc->interest = NULL; INIT_LIST_HEAD(&crpc->grantable_links); INIT_LIST_HEAD(&crpc->throttled_links); - crpc->silent_ticks = 0; crpc->resend_timer_ticks = hsk->homa->timer_ticks; - crpc->done_timer_ticks = 0; crpc->magic = HOMA_RPC_MAGIC; crpc->start_ns = sched_clock(); @@ -134,7 +125,7 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, } /* Initialize fields that don't require the socket lock. 
*/ - srpc = kmalloc(sizeof(*srpc), GFP_ATOMIC); + srpc = kmalloc(sizeof(*srpc), GFP_ATOMIC | __GFP_ZERO); if (!srpc) { err = -ENOMEM; goto error; @@ -142,8 +133,6 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, srpc->hsk = hsk; srpc->bucket = bucket; srpc->state = RPC_INCOMING; - atomic_set(&srpc->flags, 0); - atomic_set(&srpc->grants_in_progress, 0); srpc->peer = homa_peer_find(hsk->homa->peers, source, &hsk->inet); if (IS_ERR(srpc->peer)) { err = PTR_ERR(srpc->peer); @@ -151,21 +140,14 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, } srpc->dport = ntohs(h->common.sport); srpc->id = id; - srpc->completion_cookie = 0; - srpc->error = 0; srpc->msgin.length = -1; - srpc->msgin.num_bpages = 0; - memset(&srpc->msgout, 0, sizeof(srpc->msgout)); srpc->msgout.length = -1; INIT_LIST_HEAD(&srpc->ready_links); INIT_LIST_HEAD(&srpc->buf_links); INIT_LIST_HEAD(&srpc->dead_links); - srpc->interest = NULL; INIT_LIST_HEAD(&srpc->grantable_links); INIT_LIST_HEAD(&srpc->throttled_links); - srpc->silent_ticks = 0; srpc->resend_timer_ticks = hsk->homa->timer_ticks; - srpc->done_timer_ticks = 0; srpc->magic = HOMA_RPC_MAGIC; srpc->start_ns = sched_clock(); tt_record2("Incoming message for id %d has %d unscheduled bytes", diff --git a/homa_skb.c b/homa_skb.c index 706e84ee..affca847 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -52,12 +52,9 @@ int homa_skb_init(struct homa *homa) if (!homa->page_pools[numa]) { struct homa_page_pool *pool; - pool = kmalloc(sizeof(*pool), GFP_ATOMIC); + pool = kmalloc(sizeof(*pool), GFP_ATOMIC | __GFP_ZERO); if (!pool) return -ENOMEM; - pool->avail = 0; - pool->low_mark = 0; - memset(pool->pages, 0, sizeof(pool->pages)); homa->page_pools[numa] = pool; } skb_core->pool = homa->page_pools[numa]; From 967613ddd1f814110c1f198820374a5c0ef19597 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 27 Jan 2025 11:32:17 -0800 Subject: [PATCH 169/625] Use smp_processor_id instead of raw_smp_processor_id --- homa_impl.h | 9 ++++-- homa_offload.c | 14 ++++----- homa_pool.c | 4 +-- homa_skb.c | 17 +++++++---- test/mock.c | 63 ++++++++++++++++++++++++++++------------ test/mock.h | 15 ++++++++++ test/unit_homa_offload.c | 4 +-- test/unit_homa_pool.c | 48 +++++++++++++++--------------- test/unit_homa_skb.c | 34 +++++++++++----------- 9 files changed, 131 insertions(+), 77 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 8dcfea01..1ff9e1cc 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -58,7 +58,7 @@ #ifndef __STRIP__ /* See strip.py */ /* Null out things that confuse VSCode Intellisense */ #ifdef __VSCODE__ -#define raw_smp_processor_id() 1 +#define smp_processor_id() 1 #define BUG() #define BUG_ON(...) #define set_current_state(...) @@ -135,7 +135,8 @@ struct homa_interest { /** * @core: Core on which @thread was executing when it registered - * its interest. Used for load balancing (see balance.txt). + * its interest. This is a hint used for load balancing + * (see balance.txt). */ int core; @@ -170,6 +171,10 @@ static inline void homa_interest_init(struct homa_interest *interest) atomic_set(&interest->rpc_ready, 0); interest->rpc = NULL; interest->locked = 0; + + /* Safe (and necessary) to use raw_smp_processor_id: this is only + * a hint. 
+ */ interest->core = raw_smp_processor_id(); interest->reg_rpc = NULL; INIT_LIST_HEAD(&interest->request_links); diff --git a/homa_offload.c b/homa_offload.c index 291913e3..8bc4f36f 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -299,7 +299,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, tt_record("homa_gro_receive couldn't pull enough data from packet"); h_new = (struct homa_data_hdr *)skb_transport_header(skb); - offload_core = &per_cpu(homa_offload_core, raw_smp_processor_id()); + offload_core = &per_cpu(homa_offload_core, smp_processor_id()); busy = (now - offload_core->last_gro) < homa->gro_busy_ns; offload_core->last_active = now; if (skb_is_ipv6(skb)) { @@ -440,7 +440,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, offload_core->held_skb = skb; offload_core->held_bucket = hash; if (likely(homa->gro_policy & HOMA_GRO_SAME_CORE)) - homa_set_softirq_cpu(skb, raw_smp_processor_id()); + homa_set_softirq_cpu(skb, smp_processor_id()); done: homa_check_pacer(homa, 1); @@ -483,7 +483,7 @@ void homa_gro_gen2(struct homa *homa, struct sk_buff *skb) */ struct homa_data_hdr *h = (struct homa_data_hdr *)skb_transport_header(skb); - int this_core = raw_smp_processor_id(); + int this_core = smp_processor_id(); struct homa_offload_core *offload_core; int candidate = this_core; u64 now = sched_clock(); @@ -545,7 +545,7 @@ void homa_gro_gen3(struct homa *homa, struct sk_buff *skb) int i, core; candidates = per_cpu(homa_offload_core, - raw_smp_processor_id()).gen3_softirq_cores; + smp_processor_id()).gen3_softirq_cores; now = sched_clock(); busy_time = now - homa->busy_ns; @@ -593,7 +593,7 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset) // ntohl(h->seg.offset), // NAPI_GRO_CB(skb)->count); - per_cpu(homa_offload_core, raw_smp_processor_id()).held_skb = NULL; + per_cpu(homa_offload_core, smp_processor_id()).held_skb = NULL; if (homa->gro_policy & HOMA_GRO_GEN3) { homa_gro_gen3(homa, skb); } else if (homa->gro_policy & HOMA_GRO_GEN2) { @@ -610,7 +610,7 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset) * hasn't done NAPI or SoftIRQ processing for Homa in the * longest time. */ - best = raw_smp_processor_id(); + best = smp_processor_id(); core = best; for (i = 0; i < CORES_TO_CHECK; i++) { core++; @@ -630,7 +630,7 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset) /* Use the next core (in circular order) to handle the * SoftIRQ processing. 
*/ - int target = raw_smp_processor_id() + 1; + int target = smp_processor_id() + 1; if (unlikely(target >= nr_cpu_ids)) target = 0; diff --git a/homa_pool.c b/homa_pool.c index 3e435990..941561a2 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -162,7 +162,7 @@ bool homa_bpage_available(struct homa_bpage *bpage, u64 now) int homa_pool_get_pages(struct homa_pool *pool, int num_pages, u32 *pages, int set_owner) { - int core_num = raw_smp_processor_id(); + int core_num = smp_processor_id(); struct homa_pool_core *core; u64 now = sched_clock(); int alloced = 0; @@ -286,7 +286,7 @@ int homa_pool_allocate(struct homa_rpc *rpc) partial = rpc->msgin.length & (HOMA_BPAGE_SIZE - 1); if (unlikely(partial == 0)) goto success; - core_id = raw_smp_processor_id(); + core_id = smp_processor_id(); core = this_cpu_ptr(pool->cores); bpage = &pool->descriptors[core->page_hint]; if (!spin_trylock_bh(&bpage->lock)) { diff --git a/homa_skb.c b/homa_skb.c index affca847..8b129016 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -151,7 +151,7 @@ struct sk_buff *homa_skb_new_tx(int length) void homa_skb_stash_pages(struct homa *homa, int length) { struct homa_skb_core *skb_core = &per_cpu(homa_skb_core, - raw_smp_processor_id()); + smp_processor_id()); struct homa_page_pool *pool = skb_core->pool; int pages_needed = HOMA_MAX_STASHED(length); @@ -188,8 +188,10 @@ void *homa_skb_extend_frags(struct homa *homa, struct sk_buff *skb, int *length) skb_frag_t *frag; char *result; + preempt_disable(); + /* Can we just extend the skb's last fragment? */ - skb_core = &per_cpu(homa_skb_core, raw_smp_processor_id()); + skb_core = &per_cpu(homa_skb_core, smp_processor_id()); if (shinfo->nr_frags > 0) { frag = &shinfo->frags[shinfo->nr_frags - 1]; if (skb_frag_page(frag) == skb_core->skb_page && @@ -206,15 +208,17 @@ void *homa_skb_extend_frags(struct homa *homa, struct sk_buff *skb, int *length) skb_core->page_inuse; skb_core->page_inuse += actual_size; skb_len_add(skb, actual_size); - return result; + goto done; } } /* Need to add a new fragment to the skb. */ skb_core->page_inuse = ALIGN(skb_core->page_inuse, SMP_CACHE_BYTES); if (skb_core->page_inuse >= skb_core->page_size) { - if (!homa_skb_page_alloc(homa, skb_core)) - return NULL; + if (!homa_skb_page_alloc(homa, skb_core)) { + result = NULL; + goto done; + } } if ((skb_core->page_size - skb_core->page_inuse) < actual_size) actual_size = skb_core->page_size - skb_core->page_inuse; @@ -228,6 +232,9 @@ void *homa_skb_extend_frags(struct homa *homa, struct sk_buff *skb, int *length) result = page_address(skb_frag_page(frag)) + skb_core->page_inuse; skb_core->page_inuse += actual_size; skb_len_add(skb, actual_size); + +done: + preempt_enable(); return result; } diff --git a/test/mock.c b/test/mock.c index bbb58f25..879a7c20 100644 --- a/test/mock.c +++ b/test/mock.c @@ -131,6 +131,11 @@ static int mock_active_spin_locks; */ static int mock_active_rcu_locks; +/* The number of times preempt_disable() has been invoked, minus the + * number of times preempt_enable has been invoked. + */ +static int mock_preempt_disables; + /* Used as the return value for calls to get_cycles. A value of ~0 means * return actual clock time. Shouldn't be used much anymore (get_cycles * shouldn't be used). 
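A sketch of the discipline the new counter checks (the per-CPU variable and function below are hypothetical): unlike raw_smp_processor_id(), smp_processor_id() is only legal while preemption is disabled, so code that touches per-CPU state brackets the access with preempt_disable()/preempt_enable(), and mock_teardown can then verify that every disable was balanced by an enable before the test ended.

    #include <linux/percpu.h>
    #include <linux/preempt.h>

    /* Hypothetical per-CPU counter, mirroring the homa_skb_core usage. */
    DEFINE_PER_CPU(int, example_count);

    static void example_increment(void)
    {
            preempt_disable();      /* thread can no longer migrate */
            per_cpu(example_count, smp_processor_id())++;
            preempt_enable();       /* must balance the disable */
    }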
@@ -229,7 +234,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, return NULL; skb = malloc(sizeof(struct sk_buff)); if (skb == NULL) - FAIL("skb malloc failed in %s", __func__); + FAIL(" skb malloc failed in %s", __func__); memset(skb, 0, sizeof(*skb)); if (!skbs_in_use) skbs_in_use = unit_hash_new(); @@ -239,7 +244,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, skb->head = malloc(size + shinfo_size); memset(skb->head, 0, size + shinfo_size); if (skb->head == NULL) - FAIL("data malloc failed in %s", __func__); + FAIL(" data malloc failed in %s", __func__); skb->data = skb->head; skb_reset_tail_pointer(skb); skb->end = skb->tail + size; @@ -354,7 +359,7 @@ void dst_release(struct dst_entry *dst) if (atomic_read(&dst->__rcuref.refcnt) > 0) return; if (!routes_in_use || unit_hash_get(routes_in_use, dst) == NULL) { - FAIL("%s on unknown route", __func__); + FAIL(" %s on unknown route", __func__); return; } unit_hash_erase(routes_in_use, dst); @@ -583,7 +588,7 @@ struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, return ERR_PTR(-EHOSTUNREACH); route = malloc(sizeof(struct rtable)); if (!route) { - FAIL("malloc failed"); + FAIL(" malloc failed"); return ERR_PTR(-ENOMEM); } atomic_set(&route->dst.__rcuref.refcnt, 1); @@ -682,7 +687,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, return ERR_PTR(-EHOSTUNREACH); route = malloc(sizeof(struct rtable)); if (!route) { - FAIL("malloc failed"); + FAIL(" malloc failed"); return ERR_PTR(-ENOMEM); } atomic_set(&route->dst.__rcuref.refcnt, 1); @@ -713,7 +718,7 @@ struct file *filp_open(const char *, int, umode_t) void __fortify_panic(const u8 reason, const size_t avail, const size_t size) { - FAIL("__fortify_panic invoked"); + FAIL(" __fortify_panic invoked"); /* API prohibits return. 
*/ while (1) ; @@ -735,7 +740,7 @@ void kfree(const void *block) if (block == NULL) return; if (!kmallocs_in_use || unit_hash_get(kmallocs_in_use, block) == NULL) { - FAIL("%s on unknown block %p", __func__, block); + FAIL(" %s on unknown block %p", __func__, block); return; } unit_hash_erase(kmallocs_in_use, block); @@ -756,7 +761,7 @@ void __kfree_skb(struct sk_buff *skb) return; skb_dst_drop(skb); if (!skbs_in_use || unit_hash_get(skbs_in_use, skb) == NULL) { - FAIL("kfree_skb on unknown sk_buff"); + FAIL(" kfree_skb on unknown sk_buff"); return; } unit_hash_erase(skbs_in_use, skb); @@ -784,11 +789,11 @@ void *mock_kmalloc(size_t size, gfp_t flags) if (mock_check_error(&mock_kmalloc_errors)) return NULL; if (mock_active_spin_locks > 0 && (flags & ~__GFP_ZERO) != GFP_ATOMIC) - FAIL("Incorrect flags 0x%x passed to mock_kmalloc; expected GFP_ATOMIC (0x%x)", + FAIL(" Incorrect flags 0x%x passed to mock_kmalloc; expected GFP_ATOMIC (0x%x)", flags, GFP_ATOMIC); block = malloc(size); if (!block) { - FAIL("malloc failed"); + FAIL(" malloc failed"); return NULL; } if (flags & __GFP_ZERO) @@ -935,7 +940,7 @@ struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *entry = malloc(40); if (!entry) { - FAIL("malloc failed"); + FAIL(" malloc failed"); return ERR_PTR(-ENOMEM); } if (!proc_files_in_use) @@ -961,7 +966,7 @@ void proc_remove(struct proc_dir_entry *de) return; if (!proc_files_in_use || unit_hash_get(proc_files_in_use, de) == NULL) { - FAIL("%s on unknown dir_entry", __func__); + FAIL(" %s on unknown dir_entry", __func__); return; } unit_hash_erase(proc_files_in_use, de); @@ -1129,7 +1134,7 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) void *skb_pull(struct sk_buff *skb, unsigned int len) { if ((skb_tail_pointer(skb) - skb->data) < len) - FAIL("sk_buff underflow during %s", __func__); + FAIL(" sk_buff underflow during %s", __func__); skb->len -= len; return skb->data += len; } @@ -1139,7 +1144,7 @@ void *skb_push(struct sk_buff *skb, unsigned int len) skb->data -= len; skb->len += len; if (unlikely(skb->data < skb->head)) - FAIL("sk_buff underflow during %s", __func__); + FAIL(" sk_buff underflow during %s", __func__); return skb->data; } @@ -1242,7 +1247,7 @@ void unregister_net_sysctl_table(struct ctl_table_header *header) void vfree(const void *block) { if (!vmallocs_in_use || unit_hash_get(vmallocs_in_use, block) == NULL) { - FAIL("%s on unknown block", __func__); + FAIL(" %s on unknown block", __func__); return; } unit_hash_erase(vmallocs_in_use, block); @@ -1391,7 +1396,7 @@ void mock_get_page(struct page *page) int64_t ref_count = (int64_t) unit_hash_get(pages_in_use, page); if (ref_count == 0) - FAIL("unallocated page passed to %s", __func__); + FAIL(" unallocated page passed to %s", __func__); else unit_hash_set(pages_in_use, page, (void *) (ref_count+1)); } @@ -1420,12 +1425,29 @@ int mock_page_to_nid(struct page *page) return result; } +void mock_preempt_disable() +{ + mock_preempt_disables++; +} + +void mock_preempt_enable() +{ + if (mock_preempt_disables == 0) + FAIL(" preempt_enable invoked without preempt_disable"); + mock_preempt_disables--; +} + +int mock_processor_id() +{ + return pcpu_hot.cpu_number; +} + void mock_put_page(struct page *page) { int64_t ref_count = (int64_t) unit_hash_get(pages_in_use, page); if (ref_count == 0) - FAIL("unallocated page passed to %s", __func__); + FAIL(" unallocated page passed to %s", __func__); else { ref_count--; if (ref_count == 0) { @@ -1761,6 +1783,11 @@ void 
mock_teardown(void) mock_active_rcu_locks); mock_active_rcu_locks = 0; + if (mock_preempt_disables != 0) + FAIL(" %d preempt_disables still active after test", + mock_preempt_disables); + mock_preempt_disables = 0; + memset(homa_metrics, 0, sizeof(homa_metrics)); unit_hook_clear(); @@ -1779,7 +1806,7 @@ void *mock_vmalloc(size_t size) return NULL; block = malloc(size); if (!block) { - FAIL("malloc failed"); + FAIL(" malloc failed"); return NULL; } if (!vmallocs_in_use) diff --git a/test/mock.h b/test/mock.h index 89c917b8..278f62b5 100644 --- a/test/mock.h +++ b/test/mock.h @@ -24,6 +24,9 @@ #undef DECLARE_PER_CPU #define DECLARE_PER_CPU(type, name) extern type name[10] +#undef debug_smp_processor_id +#define debug_smp_processor_id() (pcpu_hot.cpu_number) + #undef DEFINE_PER_CPU #define DEFINE_PER_CPU(type, name) type name[10] @@ -58,6 +61,12 @@ #undef per_cpu_ptr #define per_cpu_ptr(name, core) (&name[core]) +#undef preempt_disable +#define preempt_disable() mock_preempt_disable() + +#undef preempt_enable +#define preempt_enable() mock_preempt_enable() + #define put_page mock_put_page #define rcu_read_lock mock_rcu_read_lock @@ -69,6 +78,9 @@ #define signal_pending(...) mock_signal_pending +#undef smp_processor_id +#define smp_processor_id() mock_processor_id() + #define spin_unlock mock_spin_unlock #undef this_cpu_ptr @@ -143,6 +155,9 @@ void *mock_kmalloc(size_t size, gfp_t flags); int mock_page_refs(struct page *page); int mock_page_to_nid(struct page *page); void mock_preempt_disable(void); void mock_preempt_enable(void); int mock_processor_id(void); void mock_put_page(struct page *page); void mock_rcu_read_lock(void); void mock_rcu_read_unlock(void); diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index 9287b82b..ec74d447 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -9,7 +9,7 @@ #include "mock.h" #include "utils.h" -#define cur_offload_core (&per_cpu(homa_offload_core, raw_smp_processor_id())) +#define cur_offload_core (&per_cpu(homa_offload_core, smp_processor_id())) static struct sk_buff *tcp_gro_receive(struct list_head *held_list, struct sk_buff *skb) @@ -617,7 +617,7 @@ TEST_F(homa_offload, homa_gro_gen3__all_cores_busy_so_pick_first) TEST_F(homa_offload, homa_gro_complete__clear_held_skb) { struct homa_offload_core *offload_core = &per_cpu(homa_offload_core, - raw_smp_processor_id()); + smp_processor_id()); offload_core->held_skb = self->skb2; homa_gro_complete(self->skb, 0); diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index 5d9e19ec..dce67405 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -61,7 +61,7 @@ static void change_owner_hook(char *id) return; if (!cur_pool) return; - cur_pool->descriptors[cur_pool->cores[raw_smp_processor_id()] + cur_pool->descriptors[cur_pool->cores[smp_processor_id()] .page_hint].owner = -1; } @@ -135,7 +135,7 @@ TEST_F(homa_pool, homa_pool_get_pages__basics) EXPECT_EQ(1, pages[1]); EXPECT_EQ(1, atomic_read(&pool->descriptors[1].refs)); EXPECT_EQ(-1, pool->descriptors[1].owner); - EXPECT_EQ(2, pool->cores[raw_smp_processor_id()].next_candidate); + EXPECT_EQ(2, pool->cores[smp_processor_id()].next_candidate); EXPECT_EQ(98, atomic_read(&pool->free_bpages)); } TEST_F(homa_pool, homa_pool_get_pages__not_enough_space) @@ -154,7 +154,7 @@ TEST_F(homa_pool, homa_pool_get_pages__set_limit) u32 pages[10]; atomic_set(&pool->free_bpages, 62); - pool->cores[raw_smp_processor_id()].next_candidate = 49; + 
pool->cores[smp_processor_id()].next_candidate = 49; EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); EXPECT_EQ(49, pages[0]); EXPECT_EQ(0, pages[1]); @@ -165,7 +165,7 @@ TEST_F(homa_pool, homa_pool_get_pages__set_limit_with_MIN_EXTRA) u32 pages[10]; atomic_set(&pool->free_bpages, 92); - pool->cores[raw_smp_processor_id()].next_candidate = 13; + pool->cores[smp_processor_id()].next_candidate = 13; EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); EXPECT_EQ(13, pages[0]); EXPECT_EQ(0, pages[1]); @@ -251,9 +251,9 @@ TEST_F(homa_pool, homa_pool_allocate__basics) EXPECT_EQ(0, crpc->msgin.bpage_offsets[0]); EXPECT_EQ(-1, pool->descriptors[0].owner); EXPECT_EQ(2*HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[2]); - EXPECT_EQ(2, pool->cores[raw_smp_processor_id()].page_hint); + EXPECT_EQ(2, pool->cores[smp_processor_id()].page_hint); EXPECT_EQ(150000 - 2*HOMA_BPAGE_SIZE, - pool->cores[raw_smp_processor_id()].allocated); + pool->cores[smp_processor_id()].allocated); } TEST_F(homa_pool, homa_pool_no_buffer_pool) { @@ -301,14 +301,14 @@ TEST_F(homa_pool, homa_pool_allocate__owned_page_locked_and_page_stolen) struct homa_pool *pool = self->hsk.buffer_pool; struct homa_rpc *crpc; - pool->cores[raw_smp_processor_id()].next_candidate = 2; + pool->cores[smp_processor_id()].next_candidate = 2; atomic_set(&pool->free_bpages, 40); crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, 4000, 98, 1000, 2000); ASSERT_NE(NULL, crpc); // First allocation just sets up a partially-allocated bpage. - EXPECT_EQ(2, pool->cores[raw_smp_processor_id()].page_hint); + EXPECT_EQ(2, pool->cores[smp_processor_id()].page_hint); // Try a second allocation; the lock hook steals the partial bpage, // so a new one has to be allocated. @@ -318,8 +318,8 @@ TEST_F(homa_pool, homa_pool_allocate__owned_page_locked_and_page_stolen) EXPECT_EQ(0, homa_pool_allocate(crpc)); EXPECT_EQ(1, crpc->msgin.num_bpages); EXPECT_EQ(3*HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[0]); - EXPECT_EQ(3, pool->cores[raw_smp_processor_id()].page_hint); - EXPECT_EQ(2000, pool->cores[raw_smp_processor_id()].allocated); + EXPECT_EQ(3, pool->cores[smp_processor_id()].page_hint); + EXPECT_EQ(2000, pool->cores[smp_processor_id()].allocated); EXPECT_EQ(1, -pool->descriptors[2].owner); EXPECT_EQ(1, pool->descriptors[3].owner); EXPECT_EQ(38, atomic_read(&pool->free_bpages)); @@ -329,19 +329,19 @@ TEST_F(homa_pool, homa_pool_allocate__page_wrap_around) struct homa_pool *pool = self->hsk.buffer_pool; struct homa_rpc *crpc; - pool->cores[raw_smp_processor_id()].page_hint = 2; - pool->cores[raw_smp_processor_id()].allocated = HOMA_BPAGE_SIZE-1900; + pool->cores[smp_processor_id()].page_hint = 2; + pool->cores[smp_processor_id()].allocated = HOMA_BPAGE_SIZE-1900; atomic_set(&pool->descriptors[2].refs, 1); - pool->descriptors[2].owner = raw_smp_processor_id(); + pool->descriptors[2].owner = smp_processor_id(); crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, 4000, 98, 1000, 2000); ASSERT_NE(NULL, crpc); - EXPECT_EQ(2, pool->cores[raw_smp_processor_id()].page_hint); + EXPECT_EQ(2, pool->cores[smp_processor_id()].page_hint); EXPECT_EQ(1, crpc->msgin.num_bpages); EXPECT_EQ(2*HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[0]); - EXPECT_EQ(2000, pool->cores[raw_smp_processor_id()].allocated); - EXPECT_EQ(raw_smp_processor_id(), pool->descriptors[2].owner); + EXPECT_EQ(2000, pool->cores[smp_processor_id()].allocated); + EXPECT_EQ(smp_processor_id(), pool->descriptors[2].owner); EXPECT_EQ(1, 
homa_metrics_per_cpu()->bpage_reuses); } TEST_F(homa_pool, homa_pool_allocate__owned_page_overflow) @@ -349,20 +349,20 @@ TEST_F(homa_pool, homa_pool_allocate__owned_page_overflow) struct homa_pool *pool = self->hsk.buffer_pool; struct homa_rpc *crpc; - pool->cores[raw_smp_processor_id()].next_candidate = 2; + pool->cores[smp_processor_id()].next_candidate = 2; atomic_set(&pool->free_bpages, 50); crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, 4000, 98, 1000, 2000); ASSERT_NE(NULL, crpc); - EXPECT_EQ(2, pool->cores[raw_smp_processor_id()].page_hint); + EXPECT_EQ(2, pool->cores[smp_processor_id()].page_hint); crpc->msgin.num_bpages = 0; - pool->cores[raw_smp_processor_id()].allocated = HOMA_BPAGE_SIZE-1900; + pool->cores[smp_processor_id()].allocated = HOMA_BPAGE_SIZE-1900; EXPECT_EQ(0, homa_pool_allocate(crpc)); EXPECT_EQ(1, crpc->msgin.num_bpages); EXPECT_EQ(3*HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[0]); - EXPECT_EQ(3, pool->cores[raw_smp_processor_id()].page_hint); - EXPECT_EQ(2000, pool->cores[raw_smp_processor_id()].allocated); + EXPECT_EQ(3, pool->cores[smp_processor_id()].page_hint); + EXPECT_EQ(2000, pool->cores[smp_processor_id()].allocated); EXPECT_EQ(-1, pool->descriptors[2].owner); EXPECT_EQ(1, atomic_read(&pool->descriptors[2].refs)); EXPECT_EQ(1, pool->descriptors[3].owner); @@ -373,7 +373,7 @@ TEST_F(homa_pool, homa_pool_allocate__reuse_owned_page) struct homa_pool *pool = self->hsk.buffer_pool; struct homa_rpc *crpc1, *crpc2; - pool->cores[raw_smp_processor_id()].next_candidate = 2; + pool->cores[smp_processor_id()].next_candidate = 2; crpc1 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, 4000, 98, 1000, 2000); ASSERT_NE(NULL, crpc1); @@ -386,8 +386,8 @@ TEST_F(homa_pool, homa_pool_allocate__reuse_owned_page) EXPECT_EQ(1, crpc2->msgin.num_bpages); EXPECT_EQ(2*HOMA_BPAGE_SIZE + 2000, crpc2->msgin.bpage_offsets[0]); EXPECT_EQ(3, atomic_read(&pool->descriptors[2].refs)); - EXPECT_EQ(2, pool->cores[raw_smp_processor_id()].page_hint); - EXPECT_EQ(5000, pool->cores[raw_smp_processor_id()].allocated); + EXPECT_EQ(2, pool->cores[smp_processor_id()].page_hint); + EXPECT_EQ(5000, pool->cores[smp_processor_id()].allocated); } TEST_F(homa_pool, homa_pool_allocate__cant_allocate_partial_bpage) { diff --git a/test/unit_homa_skb.c b/test/unit_homa_skb.c index 3362c71e..932c4882 100644 --- a/test/unit_homa_skb.c +++ b/test/unit_homa_skb.c @@ -18,7 +18,7 @@ static inline struct homa_skb_core *get_skb_core(int core) */ static struct sk_buff *test_skb(struct homa *homa) { - struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); struct sk_buff *skb = homa_skb_new_tx(100); int32_t data[1000]; char *src; @@ -144,7 +144,7 @@ TEST_F(homa_skb, homa_skb_cleanup) TEST_F(homa_skb, homa_skb_stash_pages) { - int id = raw_smp_processor_id(); + int id = smp_processor_id(); struct homa_skb_core *skb_core; skb_core = get_skb_core(id); @@ -172,7 +172,7 @@ TEST_F(homa_skb, homa_skb_stash_pages) TEST_F(homa_skb, homa_skb_extend_frags__basics) { - struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); char *p1, *p2, *p3; int length = 100; @@ -195,7 +195,7 @@ TEST_F(homa_skb, homa_skb_extend_frags__basics) } TEST_F(homa_skb, homa_skb_extend_frags__merge_but_reduce_length) { - struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); + struct 
homa_skb_core *skb_core = get_skb_core(smp_processor_id()); int length = 1000; char *p1, *p2; @@ -213,7 +213,7 @@ TEST_F(homa_skb, homa_skb_extend_frags__merge_but_reduce_length) } TEST_F(homa_skb, homa_skb_extend_frags__cant_merge_allocate_new_page) { - struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); struct sk_buff *skb2 = alloc_skb_fclone(200, GFP_KERNEL); char *p1, *p2, *p3; int length; @@ -245,7 +245,7 @@ TEST_F(homa_skb, homa_skb_extend_frags__cant_merge_allocate_new_page) } TEST_F(homa_skb, homa_skb_extend_frags__cant_merge_use_same_page_reduce_length) { - struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); struct sk_buff *skb2 = alloc_skb_fclone(200, GFP_KERNEL); char *p1, *p2, *p3; int length; @@ -291,7 +291,7 @@ TEST_F(homa_skb, homa_skb_page_alloc__free_previous_page) } TEST_F(homa_skb, homa_skb_page_alloc__reuse_existing_page) { - struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); struct sk_buff *skb = homa_skb_new_tx(100); struct page *page; int length = 100; @@ -308,9 +308,9 @@ TEST_F(homa_skb, homa_skb_page_alloc__reuse_existing_page) } TEST_F(homa_skb, homa_skb_page_alloc__from_stash) { - struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); - add_to_pool(&self->homa, 5, raw_smp_processor_id()); + add_to_pool(&self->homa, 5, smp_processor_id()); homa_skb_stash_pages(&self->homa, 3*HOMA_SKB_PAGE_SIZE - 100); EXPECT_TRUE(homa_skb_page_alloc(&self->homa, skb_core)); EXPECT_NE(NULL, skb_core->skb_page); @@ -320,9 +320,9 @@ TEST_F(homa_skb, homa_skb_page_alloc__from_stash) } TEST_F(homa_skb, homa_skb_page_alloc__from_pool) { - struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); - add_to_pool(&self->homa, 5, raw_smp_processor_id()); + add_to_pool(&self->homa, 5, smp_processor_id()); EXPECT_EQ(5, skb_core->pool->avail); EXPECT_EQ(0, skb_core->num_stashed_pages); EXPECT_TRUE(homa_skb_page_alloc(&self->homa, skb_core)); @@ -331,9 +331,9 @@ TEST_F(homa_skb, homa_skb_page_alloc__from_pool) } TEST_F(homa_skb, homa_skb_page_alloc__pool_page_taken_while_locking) { - struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); - add_to_pool(&self->homa, 1, raw_smp_processor_id()); + add_to_pool(&self->homa, 1, smp_processor_id()); EXPECT_EQ(1, skb_core->pool->avail); EXPECT_EQ(0, skb_core->num_stashed_pages); hook_pool = skb_core->pool; @@ -346,7 +346,7 @@ TEST_F(homa_skb, homa_skb_page_alloc__pool_page_taken_while_locking) } TEST_F(homa_skb, homa_skb_page_alloc__new_large_page) { - struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); mock_ns_tick = 100; EXPECT_EQ(0, skb_core->pool->avail); @@ -382,7 +382,7 @@ TEST_F(homa_skb, homa_skb_page_alloc__no_pages_available) TEST_F(homa_skb, homa_skb_append_to_frag__basics) { - struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); struct skb_shared_info *shinfo = skb_shinfo(self->skb); char *p; @@ -415,7 +415,7 @@ TEST_F(homa_skb, homa_skb_append_to_frag__no_memory) 
TEST_F(homa_skb, homa_skb_append_from_iter__basics) { - struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); struct iov_iter *iter = unit_iov_iter((void *) 1000, 5000); struct skb_shared_info *shinfo = skb_shinfo(self->skb); @@ -466,7 +466,7 @@ TEST_F(homa_skb, homa_skb_append_from_skb__header_only) } TEST_F(homa_skb, homa_skb_append_from_skb__error_copying_header) { - struct homa_skb_core *skb_core = get_skb_core(raw_smp_processor_id()); + struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); struct sk_buff *src_skb = test_skb(&self->homa); struct sk_buff *dst_skb = homa_skb_new_tx(100); From b65476d2779eadb9afea49b68f709263015f0a12 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 27 Jan 2025 13:55:23 -0800 Subject: [PATCH 170/625] Refactored homa_peertab_get_peers * Use hlist_for_each_entry_rcu instead of hlist_for_each_entry_safe * Fix potential array overflow if new entries allocated while the function runs. * Don't include in __STRIP__ version --- homa_peer.c | 65 ++++++++++++++++++++++++++++--------------- homa_peer.h | 2 ++ test/ccutils.cc | 6 ++++ test/mock.c | 2 ++ test/unit_homa_peer.c | 52 ++++++++++++++++++++++++++++++++++ 5 files changed, 104 insertions(+), 23 deletions(-) diff --git a/homa_peer.c b/homa_peer.c index 05daf424..d54c0f7d 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -60,6 +60,7 @@ void homa_peertab_destroy(struct homa_peertab *peertab) homa_peertab_gc_dsts(peertab, ~0); } +#ifndef __STRIP__ /* See strip.py */ /** * homa_peertab_get_peers() - Return information about all of the peers * currently known @@ -67,45 +68,63 @@ void homa_peertab_destroy(struct homa_peertab *peertab) * @num_peers: Modified to hold the number of peers returned. * Return: kmalloced array holding pointers to all known peers. The * caller must free this. If there is an error, or if there - * are no peers, NULL is returned. + * are no peers, NULL is returned. Note: if a large number + * of new peers are created while this function executes, + * then the results may not be complete. */ struct homa_peer **homa_peertab_get_peers(struct homa_peertab *peertab, int *num_peers) { + int i, slots, next_slot; struct homa_peer **result; - struct hlist_node *next; struct homa_peer *peer; - int i, count; - *num_peers = 0; - if (!peertab->buckets) - return NULL; + /* Note: RCU must be used in the iterators below to ensure safety + * with concurrent insertions. Technically, rcu_read_lock and + * rcu_read_unlock shouldn't be necessary because we don't have to + * worry about concurrent deletions. But without them, some sanity + * checkers will complain. + */ + rcu_read_lock(); - /* Figure out how many peers there are. */ - count = 0; - for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) { - hlist_for_each_entry_safe(peer, next, &peertab->buckets[i], - peertab_links) - count++; + /* Figure out how large an array to allocate. */ + slots = 0; + next_slot = 0; + result = NULL; + if (peertab->buckets) { + for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) { + hlist_for_each_entry_rcu(peer, &peertab->buckets[i], + peertab_links) + slots++; + } } + if (slots == 0) + goto done; - if (count == 0) - return NULL; - - result = kmalloc_array(count, sizeof(peer), GFP_ATOMIC); + /* Allocate extra space in case new peers got created while we + * were counting. 
+ */ + slots += 10; + result = kmalloc_array(slots, sizeof(peer), GFP_ATOMIC); if (!result) - return NULL; - *num_peers = count; - count = 0; + goto done; for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) { - hlist_for_each_entry_safe(peer, next, &peertab->buckets[i], - peertab_links) { - result[count] = peer; - count++; + hlist_for_each_entry_rcu(peer, &peertab->buckets[i], + peertab_links) { + result[next_slot] = peer; + next_slot++; + + /* We might not have allocated enough extra space. */ + if (next_slot >= slots) + goto done; } } +done: + rcu_read_unlock(); + *num_peers = next_slot; return result; } +#endif /* See strip.py */ /** * homa_peertab_gc_dsts() - Invoked to free unused dst_entries, if it is diff --git a/homa_peer.h b/homa_peer.h index aba37914..c257d55b 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -198,9 +198,11 @@ struct homa_peer { void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, struct homa_sock *hsk); void homa_peertab_destroy(struct homa_peertab *peertab); +#ifndef __STRIP__ /* See strip.py */ struct homa_peer ** homa_peertab_get_peers(struct homa_peertab *peertab, int *num_peers); +#endif /* See strip.py */ int homa_peertab_init(struct homa_peertab *peertab); void homa_peer_add_ack(struct homa_rpc *rpc); struct homa_peer diff --git a/test/ccutils.cc b/test/ccutils.cc index 55d0f004..fb9877e2 100644 --- a/test/ccutils.cc +++ b/test/ccutils.cc @@ -113,8 +113,14 @@ int unit_hash_size(struct unit_hash *hash) */ void unit_hook(char *id) { + static bool hook_active = false; + + if (hook_active) + return; + hook_active = true; for (hook_func& func: hooks) func(id); + hook_active = false; } /** diff --git a/test/mock.c b/test/mock.c index 879a7c20..796440fa 100644 --- a/test/mock.c +++ b/test/mock.c @@ -786,6 +786,7 @@ void *mock_kmalloc(size_t size, gfp_t flags) { void *block; + UNIT_HOOK("kmalloc"); if (mock_check_error(&mock_kmalloc_errors)) return NULL; if (mock_active_spin_locks > 0 && (flags & ~__GFP_ZERO) != GFP_ATOMIC) @@ -1802,6 +1803,7 @@ void *mock_vmalloc(size_t size) { void *block; + UNIT_HOOK("kmalloc"); if (mock_check_error(&mock_vmalloc_errors)) return NULL; block = malloc(size); diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index 358b17c6..e13eaf32 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -13,6 +13,26 @@ struct in6_addr ip1111[1]; struct in6_addr ip2222[1]; struct in6_addr ip3333[1]; +static int hook_new_peer_count; +static struct homa_peertab *hook_peertab; +static struct homa_sock *hook_hsk; + +static void kmalloc_hook(char *id) +{ + int i; + + if (strcmp(id, "kmalloc") != 0) + return; + for (i = 0; i < hook_new_peer_count; i++) { + char addr_string[20]; + struct in6_addr addr; + + snprintf(addr_string, sizeof(addr_string), "10.0.0.%d", i); + addr = unit_get_in_addr(addr_string); + homa_peer_find(hook_peertab, &addr, &hook_hsk->inet); + } +} + FIXTURE(homa_peer) { struct homa homa; struct homa_sock hsk; @@ -181,6 +201,38 @@ TEST_F(homa_peer, homa_peertab_get_peers__multiple_peers) || (peers[2] == peer3)); kfree(peers); } +TEST_F(homa_peer, homa_peertab_get_peers__a_few_new_peers_created_concurrently) +{ + struct homa_peer **peers; + int num_peers = 45; + + homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); + homa_peer_find(&self->peertab, ip2222, &self->hsk.inet); + unit_hook_register(kmalloc_hook); + hook_hsk = &self->hsk; + hook_peertab = &self->peertab; + hook_new_peer_count = 3; + peers = homa_peertab_get_peers(&self->peertab, &num_peers); + ASSERT_NE(NULL, peers); + EXPECT_EQ(5, 
num_peers); + kfree(peers); +} +TEST_F(homa_peer, homa_peertab_get_peers__many_new_peers_created_concurrently) +{ + struct homa_peer **peers; + int num_peers = 45; + + homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); + homa_peer_find(&self->peertab, ip2222, &self->hsk.inet); + unit_hook_register(kmalloc_hook); + hook_hsk = &self->hsk; + hook_peertab = &self->peertab; + hook_new_peer_count = 20; + peers = homa_peertab_get_peers(&self->peertab, &num_peers); + ASSERT_NE(NULL, peers); + EXPECT_EQ(12, num_peers); + kfree(peers); +} TEST_F(homa_peer, homa_peer_find__conflicting_creates) { From f376ceb2bc6af08dff2b2f385a169aab0bb6edf1 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 27 Jan 2025 14:18:41 -0800 Subject: [PATCH 171/625] Add annotation to homa_peertab_gc_dsts requiring write_lock --- homa_peer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/homa_peer.c b/homa_peer.c index d54c0f7d..d9f9c1d3 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -49,6 +49,7 @@ void homa_peertab_destroy(struct homa_peertab *peertab) if (!peertab->buckets) return; + spin_lock_bh(&peertab->write_lock); for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) { hlist_for_each_entry_safe(peer, next, &peertab->buckets[i], peertab_links) { @@ -58,6 +59,7 @@ void homa_peertab_destroy(struct homa_peertab *peertab) } vfree(peertab->buckets); homa_peertab_gc_dsts(peertab, ~0); + spin_unlock_bh(&peertab->write_lock); } #ifndef __STRIP__ /* See strip.py */ @@ -135,6 +137,7 @@ struct homa_peer **homa_peertab_get_peers(struct homa_peertab *peertab, * free all entries. */ void homa_peertab_gc_dsts(struct homa_peertab *peertab, u64 now) + __must_hold(&peertab->write_lock) { while (!list_empty(&peertab->dead_dsts)) { struct homa_dead_dst *dead = From 17e8ce8c46a2b544c67ffe5484085574adab83d9 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 27 Jan 2025 15:33:33 -0800 Subject: [PATCH 172/625] Remove "lock_slow" methods from stripped version. --- homa_impl.h | 14 +++++++++++++- homa_peer.c | 2 ++ homa_peer.h | 14 ++++++++++++++ homa_sock.c | 2 ++ homa_sock.h | 29 +++++++++++++++++++++++++++++ homa_utils.c | 2 ++ test/unit_homa_pool.c | 4 ++++ 7 files changed, 66 insertions(+), 1 deletion(-) diff --git a/homa_impl.h b/homa_impl.h index 1ff9e1cc..ba32991f 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -72,11 +72,11 @@ struct homa; #ifndef __STRIP__ /* See strip.py */ #include "timetrace.h" -#endif /* See strip.py */ #include "homa_metrics.h" /* Declarations used in this file, so they can't be made at the end. */ void homa_throttle_lock_slow(struct homa *homa); +#endif /* See strip.py */ #define sizeof32(type) ((int)(sizeof(type))) @@ -905,6 +905,7 @@ static inline void homa_set_doff(struct homa_data_hdr *h, int size) h->common.doff = size << 2; } +#ifndef __STRIP__ /* See strip.py */ /** * homa_throttle_lock() - Acquire the throttle lock. If the lock * isn't immediately available, record stats on the waiting time. @@ -916,6 +917,17 @@ static inline void homa_throttle_lock(struct homa *homa) if (!spin_trylock_bh(&homa->throttle_lock)) homa_throttle_lock_slow(homa); } +#else /* See strip.py */ +/** + * homa_throttle_lock() - Acquire the throttle lock. + * @homa: Overall data about the Homa protocol implementation. + */ +static inline void homa_throttle_lock(struct homa *homa) + __acquires(&homa->throttle_lock) +{ + spin_lock_bh(&homa->throttle_lock); +} +#endif /* See strip.py */ /** * homa_throttle_unlock() - Release the throttle lock.
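Each lock wrapper touched by this commit follows the same two-variant shape as homa_throttle_lock above; here is a condensed sketch of that pattern (the struct and both functions are hypothetical, though the strip.py markers are the real ones):

    #include <linux/spinlock.h>

    struct example {
            spinlock_t lock;
    };

    void example_lock_slow(struct example *e);  /* hypothetical slow path */

    #ifndef __STRIP__ /* See strip.py */
    /* Instrumented build: trylock fast path; the slow path records
     * lock-miss metrics and wait time.
     */
    static inline void example_lock(struct example *e)
            __acquires(&e->lock)
    {
            if (!spin_trylock_bh(&e->lock))
                    example_lock_slow(e);
    }
    #else /* See strip.py */
    /* Stripped build: plain spinlock, no instrumentation. */
    static inline void example_lock(struct example *e)
            __acquires(&e->lock)
    {
            spin_lock_bh(&e->lock);
    }
    #endif /* See strip.py */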
diff --git a/homa_peer.c b/homa_peer.c index d9f9c1d3..6164c330 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -353,6 +353,7 @@ struct dst_entry *homa_peer_get_dst(struct homa_peer *peer, &peer->flow.u.ip6, NULL); } +#ifndef __STRIP__ /* See strip.py */ /** * homa_peer_set_cutoffs() - Set the cutoffs for unscheduled priorities in * a peer object. This is a convenience function used primarily by unit tests. @@ -397,6 +398,7 @@ void homa_peer_lock_slow(struct homa_peer *peer) INC_METRIC(peer_ack_lock_misses, 1); INC_METRIC(peer_ack_lock_miss_ns, sched_clock() - start); } +#endif /* See strip.py */ /** * homa_peer_add_ack() - Add a given RPC to the list of unacked diff --git a/homa_peer.h b/homa_peer.h index c257d55b..a634ce21 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -214,11 +214,14 @@ int homa_peer_get_acks(struct homa_peer *peer, int count, struct dst_entry *homa_peer_get_dst(struct homa_peer *peer, struct inet_sock *inet); +#ifndef __STRIP__ /* See strip.py */ void homa_peer_lock_slow(struct homa_peer *peer); void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7); +#endif /* See strip.py */ void homa_peertab_gc_dsts(struct homa_peertab *peertab, u64 now); +#ifndef __STRIP__ /* See strip.py */ /** * homa_peer_lock() - Acquire the lock for a peer's @unacked_lock. If the lock * isn't immediately available, record stats on the waiting time. @@ -230,6 +233,17 @@ static inline void homa_peer_lock(struct homa_peer *peer) if (!spin_trylock_bh(&peer->ack_lock)) homa_peer_lock_slow(peer); } +#else /* See strip.py */ +/** + * homa_peer_lock() - Acquire the lock for a peer's @ack_lock. + * @peer: Peer to lock. + */ +static inline void homa_peer_lock(struct homa_peer *peer) + __acquires(&peer->ack_lock) +{ + spin_lock_bh(&peer->ack_lock); +} +#endif /* See strip.py */ /** * homa_peer_unlock() - Release the lock for a peer's @unacked_lock. diff --git a/homa_sock.c b/homa_sock.c index 029075cf..3e36c21b 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -376,6 +376,7 @@ struct homa_sock *homa_sock_find(struct homa_socktab *socktab, __u16 port) return result; } +#ifndef __STRIP__ /* See strip.py */ /** * homa_sock_lock_slow() - This function implements the slow path for * acquiring a socket lock. It is invoked when a socket lock isn't immediately @@ -422,3 +423,4 @@ void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, u64 id) INC_METRIC(server_lock_miss_ns, sched_clock() - start); } } +#endif /* See strip.py */ diff --git a/homa_sock.h b/homa_sock.h index 1b17518c..f3768b92 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -9,7 +9,9 @@ struct homa; struct homa_pool; +#ifndef __STRIP__ /* See strip.py */ void homa_sock_lock_slow(struct homa_sock *hsk); +#endif /* See strip.py */ /** * define HOMA_SOCKTAB_BUCKETS - Number of hash buckets in a homa_socktab. @@ -274,8 +276,10 @@ struct homa_v6_sock { struct ipv6_pinfo inet6; }; +#ifndef __STRIP__ /* See strip.py */ void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, u64 id); +#endif /* See strip.py */ int homa_sock_bind(struct homa_socktab *socktab, struct homa_sock *hsk, __u16 port); void homa_sock_destroy(struct homa_sock *hsk); @@ -291,6 +295,7 @@ struct homa_sock *homa_socktab_next(struct homa_socktab_scan *scan); struct homa_sock *homa_socktab_start_scan(struct homa_socktab *socktab, struct homa_socktab_scan *scan); +#ifndef __STRIP__ /* See strip.py */ /** * homa_sock_lock() - Acquire the lock for a socket. If the socket * isn't immediately available, record stats on the waiting time. 
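For context, every "lock_slow" helper that these hunks wrap in #ifndef __STRIP__ has the same two-level shape: an inline fast path that tries a non-blocking acquire, plus an out-of-line slow path that blocks and records contention metrics. A minimal sketch with made-up names (an illustration, not the actual Homa helpers):

static void example_lock_slow(spinlock_t *lock)
{
	u64 start = sched_clock();

	spin_lock_bh(lock);	/* contended: wait for the lock */
	/* The instrumented build records the miss and the wait time
	 * here, e.g. INC_METRIC(..._lock_misses, 1) and
	 * sched_clock() - start.
	 */
	(void)start;
}

static inline void example_lock(spinlock_t *lock)
{
	if (!spin_trylock_bh(lock))	/* uncontended fast path */
		example_lock_slow(lock);
}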
@@ -302,6 +307,17 @@ static inline void homa_sock_lock(struct homa_sock *hsk) if (!spin_trylock_bh(&hsk->lock)) homa_sock_lock_slow(hsk); } +#else /* See strip.py */ +/** + * homa_sock_lock() - Acquire the lock for a socket. + * @hsk: Socket to lock. + */ +static inline void homa_sock_lock(struct homa_sock *hsk) + __acquires(&hsk->lock) +{ + spin_lock_bh(&hsk->lock); +} +#endif /* See strip.py */ /** * homa_sock_unlock() - Release the lock for a socket. @@ -366,6 +382,7 @@ static inline struct homa_rpc_bucket *homa_server_rpc_bucket(struct homa_sock *h & (HOMA_SERVER_RPC_BUCKETS - 1)]; } +#ifndef __STRIP__ /* See strip.py */ /** * homa_bucket_lock() - Acquire the lock for an RPC hash table bucket. * @bucket: Bucket to lock. @@ -377,6 +394,18 @@ static inline void homa_bucket_lock(struct homa_rpc_bucket *bucket, u64 id) if (!spin_trylock_bh(&bucket->lock)) homa_bucket_lock_slow(bucket, id); } +#else /* See strip.py */ +/** + * homa_bucket_lock() - Acquire the lock for an RPC hash table bucket. + * @bucket: Bucket to lock. + * @id: Id of the RPC on whose behalf the bucket is being locked. + * Used only for metrics. + */ +static inline void homa_bucket_lock(struct homa_rpc_bucket *bucket, u64 id) +{ + spin_lock_bh(&bucket->lock); +} +#endif /* See strip.py */ /** * homa_bucket_unlock() - Release the lock for an RPC hash table bucket. diff --git a/homa_utils.c b/homa_utils.c index e057b21c..d56ee76c 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -673,6 +673,7 @@ void homa_spin(int ns) ; } +#ifndef __STRIP__ /* See strip.py */ /** * homa_throttle_lock_slow() - This function implements the slow path for * acquiring the throttle lock. It is invoked when the lock isn't immediately @@ -723,3 +724,4 @@ void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, char *format) homa_freeze_peers(rpc->hsk->homa); } } +#endif /* See strip.py */ diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index dce67405..08ac886d 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -573,7 +573,11 @@ TEST_F(homa_pool, homa_pool_check_waiting__rpc_initially_locked) ASSERT_NE(NULL, crpc); EXPECT_EQ(0, crpc->msgin.num_bpages); +#ifndef __STRIP__ /* See strip.py */ mock_trylock_errors = 0xa; +#else /* See strip.py */ + mock_trylock_errors = 0x3; +#endif /* See strip.py */ unit_log_clear(); atomic_set(&pool->free_bpages, 1); homa_pool_check_waiting(pool); From 806cada1264da650b718bc29136f7af8d3b0d493 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 27 Jan 2025 15:58:30 -0800 Subject: [PATCH 173/625] Remove homa_peer fields that aren't used in stripped version --- homa_incoming.c | 2 ++ homa_peer.c | 8 +++++--- homa_peer.h | 4 ++++ test/unit_homa_timer.c | 2 ++ 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index 0786625d..bd7c098a 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -469,7 +469,9 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) h->common.type == BUSY || h->common.type == NEED_ACK) rpc->silent_ticks = 0; +#ifndef __STRIP__ /* See strip.py */ rpc->peer->outstanding_resends = 0; +#endif /* See strip.py */ } switch (h->common.type) { diff --git a/homa_peer.c b/homa_peer.c index 6164c330..d2cff024 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -225,14 +225,16 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab, goto done; } peer->dst = dst; - peer->unsched_cutoffs[HOMA_MAX_PRIORITIES - 1] = 0; +#ifndef __STRIP__ /* See strip.py */ + peer->unsched_cutoffs[HOMA_MAX_PRIORITIES - 1] = 0; 
peer->unsched_cutoffs[HOMA_MAX_PRIORITIES - 2] = INT_MAX; - peer->cutoff_version = 0; - peer->last_update_jiffies = 0; INIT_LIST_HEAD(&peer->grantable_rpcs); INIT_LIST_HEAD(&peer->grantable_links); +#endif /* See strip.py */ hlist_add_head_rcu(&peer->peertab_links, &peertab->buckets[bucket]); +#ifndef __STRIP__ /* See strip.py */ peer->current_ticks = -1; +#endif /* See strip.py */ spin_lock_init(&peer->ack_lock); INC_METRIC(peer_new_entries, 1); diff --git a/homa_peer.h b/homa_peer.h index a634ce21..3ac0f5da 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -93,6 +93,7 @@ struct homa_peer { */ struct dst_entry *dst; +#ifndef __STRIP__ /* See strip.py */ /** * @unsched_cutoffs: priorities to use for unscheduled packets * sent to this host, as specified in the most recent CUTOFFS @@ -130,6 +131,7 @@ struct homa_peer { * empty list pointing to itself. */ struct list_head grantable_links; +#endif /* See strip.py */ /** * @peertab_links: Links this object into a bucket of its @@ -137,6 +139,7 @@ struct homa_peer { */ struct hlist_node peertab_links; +#ifndef __STRIP__ /* See strip.py */ /** * @outstanding_resends: the number of resend requests we have * sent to this server (spaced @homa.resend_interval apart) since @@ -176,6 +179,7 @@ struct homa_peer { * in the current pass, if it still needs one. */ struct homa_rpc *resend_rpc; +#endif /* See strip.py */ /** * @num_acks: the number of (initial) entries in @acks that diff --git a/test/unit_homa_timer.c b/test/unit_homa_timer.c index c0c6bced..e8f8577b 100644 --- a/test/unit_homa_timer.c +++ b/test/unit_homa_timer.c @@ -260,7 +260,9 @@ TEST_F(homa_timer, homa_timer__basics) /* Timeout the peer. */ unit_log_clear(); +#ifndef __STRIP__ /* See strip.py */ crpc->peer->outstanding_resends = self->homa.timeout_resends; +#endif /* See strip.py */ homa_timer(&self->homa); EXPECT_EQ(1, homa_metrics_per_cpu()->rpc_timeouts); EXPECT_EQ(ETIMEDOUT, -crpc->error); From d0faea54a6ff218b3f5a361c87ebf3255fb6ed48 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 27 Jan 2025 16:36:42 -0800 Subject: [PATCH 174/625] Reorder fields in homa_rpc_bucket to save space --- homa_sock.c | 4 ++-- homa_sock.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/homa_sock.c b/homa_sock.c index 3e36c21b..7ef3c147 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -175,15 +175,15 @@ int homa_sock_init(struct homa_sock *hsk, struct homa *homa) struct homa_rpc_bucket *bucket = &hsk->client_rpc_buckets[i]; spin_lock_init(&bucket->lock); - INIT_HLIST_HEAD(&bucket->rpcs); bucket->id = i; + INIT_HLIST_HEAD(&bucket->rpcs); } for (i = 0; i < HOMA_SERVER_RPC_BUCKETS; i++) { struct homa_rpc_bucket *bucket = &hsk->server_rpc_buckets[i]; spin_lock_init(&bucket->lock); - INIT_HLIST_HEAD(&bucket->rpcs); bucket->id = i + 1000000; + INIT_HLIST_HEAD(&bucket->rpcs); } hsk->buffer_pool = kzalloc(sizeof(*hsk->buffer_pool), GFP_ATOMIC); if (!hsk->buffer_pool) diff --git a/homa_sock.h b/homa_sock.h index f3768b92..5153c2db 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -102,9 +102,6 @@ struct homa_rpc_bucket { */ spinlock_t lock; - /** @rpcs: list of RPCs that hash to this bucket. */ - struct hlist_head rpcs; - /** * @id: identifier for this bucket, used in error messages etc. * It's the index of the bucket within its hash table bucket @@ -112,6 +109,9 @@ struct homa_rpc_bucket { * client RPCs. */ int id; + + /** @rpcs: list of RPCs that hash to this bucket. 
*/ + struct hlist_head rpcs; }; /** From 75bf7f6da47e790159c236e89244572fc46057b1 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 28 Jan 2025 10:50:57 -0800 Subject: [PATCH 175/625] Refactor homa_socktab_start_scan etc. * Take a reference on the current socket; this eliminates the need for homa_socktab_links or homa_socktab::active_scans. * Scan callers no longer need to think about RCU. * Also cleaned up RCU usage for hsk->active_rpcs. --- homa_incoming.c | 15 +++--- homa_rpc.h | 2 +- homa_sock.c | 109 +++++++++++++++++++----------------------- homa_sock.h | 55 ++++++--------------- homa_timer.c | 11 ++--- test/mock.c | 23 ++++++++- test/mock.h | 7 +++ test/unit_homa_sock.c | 82 ++++++------------------------- test/utils.c | 3 +- 9 files changed, 119 insertions(+), 188 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index bd7c098a..270188f5 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -1019,10 +1019,9 @@ void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, int port, int error) { struct homa_socktab_scan scan; - struct homa_rpc *rpc, *tmp; + struct homa_rpc *rpc; struct homa_sock *hsk; - rcu_read_lock(); for (hsk = homa_socktab_start_scan(homa->port_map, &scan); hsk; hsk = homa_socktab_next(&scan)) { /* Skip the (expensive) lock acquisition if there's no @@ -1032,8 +1031,8 @@ void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, continue; if (!homa_protect_rpcs(hsk)) continue; - list_for_each_entry_safe(rpc, tmp, &hsk->active_rpcs, - active_links) { + rcu_read_lock(); + list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { if (!ipv6_addr_equal(&rpc->peer->addr, addr)) continue; if (port && rpc->dport != port) @@ -1042,10 +1041,10 @@ void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, homa_rpc_abort(rpc, error); homa_rpc_unlock(rpc); } + rcu_read_unlock(); homa_unprotect_rpcs(hsk); } homa_socktab_end_scan(&scan); - rcu_read_unlock(); } /** @@ -1060,14 +1059,15 @@ void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, */ void homa_abort_sock_rpcs(struct homa_sock *hsk, int error) { - struct homa_rpc *rpc, *tmp; + struct homa_rpc *rpc; rcu_read_lock(); if (list_empty(&hsk->active_rpcs)) goto done; if (!homa_protect_rpcs(hsk)) goto done; - list_for_each_entry_safe(rpc, tmp, &hsk->active_rpcs, active_links) { + rcu_read_lock(); + list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { if (!homa_is_client(rpc->id)) continue; homa_rpc_lock(rpc); @@ -1084,6 +1084,7 @@ void homa_abort_sock_rpcs(struct homa_sock *hsk, int error) homa_rpc_end(rpc); homa_rpc_unlock(rpc); } + rcu_read_unlock(); homa_unprotect_rpcs(hsk); done: rcu_read_unlock(); diff --git a/homa_rpc.h b/homa_rpc.h index 874b22f2..757e7b39 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -354,7 +354,7 @@ struct homa_rpc { * The next field will be LIST_POISON1 if this RPC hasn't yet been * linked into @hsk->active_rpcs. Access with RCU. */ - struct list_head active_links; + struct list_head __rcu active_links; /** @dead_links: For linking this object into @hsk->dead_rpcs. 
*/ struct list_head dead_links; diff --git a/homa_sock.c b/homa_sock.c index 7ef3c147..c7ea84fc 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -17,7 +17,6 @@ void homa_socktab_init(struct homa_socktab *socktab) spin_lock_init(&socktab->write_lock); for (i = 0; i < HOMA_SOCKTAB_BUCKETS; i++) INIT_HLIST_HEAD(&socktab->buckets[i]); - INIT_LIST_HEAD(&socktab->active_scans); } /** @@ -29,13 +28,11 @@ void homa_socktab_destroy(struct homa_socktab *socktab) struct homa_socktab_scan scan; struct homa_sock *hsk; - rcu_read_lock(); for (hsk = homa_socktab_start_scan(socktab, &scan); hsk; hsk = homa_socktab_next(&scan)) { homa_sock_destroy(hsk); } homa_socktab_end_scan(&scan); - rcu_read_unlock(); } /** @@ -43,35 +40,27 @@ void homa_socktab_destroy(struct homa_socktab *socktab) * in a socktab. * @socktab: Socktab to scan. * @scan: Will hold the current state of the scan; any existing - * contents are discarded. + * contents are discarded. The caller must eventually pass this + * to homa_socktab_end_scan. * * Return: The first socket in the table, or NULL if the table is - * empty. + * empty. If non-NULL, a reference is held on the socket to + * prevent its deletion. * * Each call to homa_socktab_next will return the next socket in the table. * All sockets that are present in the table at the time this function is * invoked will eventually be returned, as long as they are not removed - * from the table. It is safe to remove sockets from the table and/or - * delete them while the scan is in progress. If a socket is removed from - * the table during the scan, it may or may not be returned by - * homa_socktab_next. New entries added during the scan may or may not be - * returned. The caller must hold an RCU read lock when invoking the - * scan-related methods here, as well as when manipulating sockets returned - * during the scan. It is safe to release and reacquire the RCU read lock - * during a scan, as long as no socket is held when the read lock is - * released and homa_socktab_next isn't invoked until the RCU read lock - * is reacquired. + * from the table. It is safe to remove sockets from the table while the + * scan is in progress. If a socket is removed from the table during the scan, + * it may or may not be returned by homa_socktab_next. New entries added + * during the scan may or may not be returned. */ struct homa_sock *homa_socktab_start_scan(struct homa_socktab *socktab, struct homa_socktab_scan *scan) { scan->socktab = socktab; + scan->hsk = NULL; scan->current_bucket = -1; - scan->next = NULL; - - spin_lock_bh(&socktab->write_lock); - list_add_tail_rcu(&scan->scan_links, &socktab->active_scans); - spin_unlock_bh(&socktab->write_lock); return homa_socktab_next(scan); } @@ -81,32 +70,39 @@ struct homa_sock *homa_socktab_start_scan(struct homa_socktab *socktab, * @scan: State of the scan. * * Return: The next socket in the table, or NULL if the iteration has - * returned all of the sockets in the table. Sockets are not - * returned in any particular order. It's possible that the - * returned socket has been destroyed. + * returned all of the sockets in the table. If non-NULL, a + * reference is held on the socket to prevent its deletion. + * Sockets are not returned in any particular order. It's + * possible that the returned socket has been destroyed. 
*/ struct homa_sock *homa_socktab_next(struct homa_socktab_scan *scan) { - struct homa_socktab_links *links; - struct homa_sock *hsk; + struct hlist_head __rcu *bucket; + struct hlist_node *next; - while (1) { - while (!scan->next) { - struct hlist_head *bucket; - - scan->current_bucket++; - if (scan->current_bucket >= HOMA_SOCKTAB_BUCKETS) - return NULL; - bucket = &scan->socktab->buckets[scan->current_bucket]; - scan->next = (struct homa_socktab_links *) - rcu_dereference(hlist_first_rcu(bucket)); - } - links = scan->next; - hsk = links->sock; - scan->next = (struct homa_socktab_links *) - rcu_dereference(hlist_next_rcu(&links->hash_links)); - return hsk; + rcu_read_lock(); + if (scan->hsk) { + sock_put(&scan->hsk->sock); + next = rcu_dereference(hlist_next_rcu(&scan->hsk->socktab_links)); + if (next) + goto success; } + while (scan->current_bucket < HOMA_SOCKTAB_BUCKETS - 1) { + scan->current_bucket++; + bucket = &scan->socktab->buckets[scan->current_bucket]; + next = rcu_dereference(hlist_first_rcu(bucket)); + if (next) + goto success; + } + scan->hsk = NULL; + rcu_read_unlock(); + return NULL; + +success: + scan->hsk = hlist_entry(next, struct homa_sock, socktab_links); + sock_hold(&scan->hsk->sock); + rcu_read_unlock(); + return scan->hsk; } /** @@ -116,9 +112,10 @@ struct homa_sock *homa_socktab_next(struct homa_socktab_scan *scan) */ void homa_socktab_end_scan(struct homa_socktab_scan *scan) { - spin_lock_bh(&scan->socktab->write_lock); - list_del(&scan->scan_links); - spin_unlock_bh(&scan->socktab->write_lock); + if (scan->hsk) { + sock_put(&scan->hsk->sock); + scan->hsk = NULL; + } } /** @@ -160,8 +157,7 @@ int homa_sock_init(struct homa_sock *hsk, struct homa *homa) hsk->port = homa->prev_default_port; hsk->inet.inet_num = hsk->port; hsk->inet.inet_sport = htons(hsk->port); - hsk->socktab_links.sock = hsk; - hlist_add_head_rcu(&hsk->socktab_links.hash_links, + hlist_add_head_rcu(&hsk->socktab_links, &socktab->buckets[homa_port_hash(hsk->port)]); INIT_LIST_HEAD(&hsk->active_rpcs); INIT_LIST_HEAD(&hsk->dead_rpcs); @@ -202,19 +198,12 @@ int homa_sock_init(struct homa_sock *hsk, struct homa *homa) void homa_sock_unlink(struct homa_sock *hsk) { struct homa_socktab *socktab = hsk->homa->port_map; - struct homa_socktab_scan *scan; /* If any scans refer to this socket, advance them to refer to * the next socket instead. 
*/ spin_lock_bh(&socktab->write_lock); - list_for_each_entry(scan, &socktab->active_scans, scan_links) { - if (!scan->next || scan->next->sock != hsk) - continue; - scan->next = (struct homa_socktab_links *) - rcu_dereference(hlist_next_rcu(&scan->next->hash_links)); - } - hlist_del_rcu(&hsk->socktab_links.hash_links); + hlist_del_rcu(&hsk->socktab_links); spin_unlock_bh(&socktab->write_lock); } @@ -337,11 +326,11 @@ int homa_sock_bind(struct homa_socktab *socktab, struct homa_sock *hsk, result = -EADDRINUSE; goto done; } - hlist_del_rcu(&hsk->socktab_links.hash_links); + hlist_del_rcu(&hsk->socktab_links); hsk->port = port; hsk->inet.inet_num = port; hsk->inet.inet_sport = htons(hsk->port); - hlist_add_head_rcu(&hsk->socktab_links.hash_links, + hlist_add_head_rcu(&hsk->socktab_links, &socktab->buckets[homa_port_hash(port)]); done: spin_unlock_bh(&socktab->write_lock); @@ -361,13 +350,11 @@ int homa_sock_bind(struct homa_socktab *socktab, struct homa_sock *hsk, */ struct homa_sock *homa_sock_find(struct homa_socktab *socktab, __u16 port) { - struct homa_socktab_links *link; + struct homa_sock *hsk; struct homa_sock *result = NULL; - hlist_for_each_entry_rcu(link, &socktab->buckets[homa_port_hash(port)], - hash_links) { - struct homa_sock *hsk = link->sock; - + hlist_for_each_entry_rcu(hsk, &socktab->buckets[homa_port_hash(port)], + socktab_links) { if (hsk->port == port) { result = hsk; break; diff --git a/homa_sock.h b/homa_sock.h index 5153c2db..1e137989 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -36,56 +36,32 @@ struct homa_socktab { /** * @buckets: Heads of chains for hash table buckets. Chains - * consist of homa_socktab_link objects. + * consist of homa_sock objects. */ - struct hlist_head buckets[HOMA_SOCKTAB_BUCKETS]; - - /** - * @active_scans: List of homa_socktab_scan structs for all scans - * currently underway on this homa_socktab. - */ - struct list_head active_scans; -}; - -/** - * struct homa_socktab_links - Used to link homa_socks into the hash chains - * of a homa_socktab. - */ -struct homa_socktab_links { - /** hash_links: links this element into the hash chain. */ - struct hlist_node hash_links; - - /** @sock: Homa socket structure. */ - struct homa_sock *sock; + struct hlist_head __rcu buckets[HOMA_SOCKTAB_BUCKETS]; }; /** * struct homa_socktab_scan - Records the state of an iteration over all - * the entries in a homa_socktab, in a way that permits RCU-safe deletion - * of entries. + * the entries in a homa_socktab, in a way that is safe against concurrent + * reclamation of sockets. */ struct homa_socktab_scan { /** @socktab: The table that is being scanned. */ struct homa_socktab *socktab; /** - * @current_bucket: the index of the bucket in socktab->buckets - * currently being scanned. If >= HOMA_SOCKTAB_BUCKETS, the scan - * is complete. + * @hsk: Points to the current socket in the iteration, or NULL if + * we're at the beginning or end of the iteration. If non-NULL then + * we are holding a reference to this socket. */ - int current_bucket; - - /** - * @next: the next socket to return from homa_socktab_next (this - * socket has not yet been returned). NULL means there are no - * more sockets in the current bucket. - */ - struct homa_socktab_links *next; + struct homa_sock *hsk; /** - * @scan_links: Used to link this scan into @socktab->scans. + * @current_bucket: The index of the bucket in socktab->buckets + * currently being scanned. 
*/ - struct list_head scan_links; + int current_bucket; }; /** @@ -184,11 +160,8 @@ struct homa_sock { */ int ip_header_length; - /** - * @socktab_links: Links this socket into the homa_socktab - * based on @port. - */ - struct homa_socktab_links socktab_links; + /** @socktab_links: Links this socket into a homa_socktab bucket. */ + struct hlist_node __rcu socktab_links; /** * @active_rpcs: List of all existing RPCs related to this socket, @@ -199,7 +172,7 @@ struct homa_sock { * The list is sorted, with the oldest RPC first. Manipulate with * RCU so timer can access without locking. */ - struct list_head active_rpcs; + struct list_head __rcu active_rpcs; /** * @dead_rpcs: Contains RPCs for which homa_rpc_end has been diff --git a/homa_timer.c b/homa_timer.c index 39cf158e..fb169654 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -199,10 +199,7 @@ void homa_timer(struct homa *homa) } prev_grant_count = total_grants; - /* Scan all existing RPCs in all sockets. The rcu_read_lock - * below prevents sockets from being deleted during the scan. - */ - rcu_read_lock(); + /* Scan all existing RPCs in all sockets. */ for (hsk = homa_socktab_start_scan(homa->port_map, &scan); hsk; hsk = homa_socktab_next(&scan)) { while (hsk->dead_skbs >= homa->dead_buffs_limit) { @@ -223,6 +220,7 @@ void homa_timer(struct homa *homa) if (!homa_protect_rpcs(hsk)) continue; + rcu_read_lock(); list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { total_rpcs++; homa_rpc_lock(rpc); @@ -243,8 +241,7 @@ void homa_timer(struct homa *homa) rpc_count++; if (rpc_count >= 10) { /* Give other kernel threads a chance to run - * on this core. Must release the RCU read lock - * while doing this. + * on this core. */ rcu_read_unlock(); schedule(); @@ -252,10 +249,10 @@ void homa_timer(struct homa *homa) rpc_count = 0; } } + rcu_read_unlock(); homa_unprotect_rpcs(hsk); } homa_socktab_end_scan(&scan); - rcu_read_unlock(); tt_record4("homa_timer found %d incoming RPCs, incoming sum %d, rec_sum %d, homa->total_incoming %d", total_incoming_rpcs, sum_incoming, sum_incoming_rec, atomic_read(&homa->total_incoming)); diff --git a/test/mock.c b/test/mock.c index 796440fa..5d0171b4 100644 --- a/test/mock.c +++ b/test/mock.c @@ -131,6 +131,11 @@ static int mock_active_spin_locks; */ static int mock_active_rcu_locks; +/* Number of calss to sock_hold that haven't been matched with calls + * to sock_put. + */ +int mock_sock_holds; + /* The number of times preempt_disable() has been invoked, minus the * number of times preempt_enable has been invoked. */ @@ -1629,6 +1634,18 @@ int mock_skb_count(void) return unit_hash_size(skbs_in_use); } +void mock_sock_hold(struct sock *sk) +{ + mock_sock_holds++; +} + +void mock_sock_put(struct sock *sk) +{ + if (mock_sock_holds == 0) + FAIL("sock_put invoked when there were no active sock_holds"); + mock_sock_holds--; +} + /** * mock_sock_init() - Constructor for sockets; initializes the Homa-specific * part, and mocks out the non-Homa-specific parts. 
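The effect of this refactor on callers such as homa_timer and homa_abort_rpcs above is that a scan no longer needs an enclosing rcu_read_lock: the iterator itself holds a reference on the current socket between calls. A typical scan now looks like this (a sketch distilled from those callers; example_scan is hypothetical):

static void example_scan(struct homa_socktab *socktab)
{
	struct homa_socktab_scan scan;
	struct homa_sock *hsk;

	for (hsk = homa_socktab_start_scan(socktab, &scan); hsk;
	     hsk = homa_socktab_next(&scan)) {
		/* A reference is held on hsk, so the socket can't be
		 * freed here even though no RCU read lock is held.
		 */
	}
	homa_socktab_end_scan(&scan);	/* drops the last reference, if any */
}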
@@ -1784,13 +1801,17 @@ void mock_teardown(void) mock_active_rcu_locks); mock_active_rcu_locks = 0; + if (mock_sock_holds != 0) + FAIL(" %d sock_holds still active after test", + mock_sock_holds); + mock_sock_holds = 0; + if (mock_preempt_disables != 0) FAIL(" %d preempt_disables still active after test", mock_preempt_disables); mock_preempt_disables = 0; memset(homa_metrics, 0, sizeof(homa_metrics)); - unit_hook_clear(); } diff --git a/test/mock.h b/test/mock.h index 278f62b5..bc0e1339 100644 --- a/test/mock.h +++ b/test/mock.h @@ -81,6 +81,10 @@ #undef smp_processor_id #define smp_processor_id() mock_processor_id() +#define sock_hold(sock) mock_sock_hold(sock) + +#define sock_put(sock) mock_sock_put(sock) + #define spin_unlock mock_spin_unlock #undef this_cpu_ptr @@ -130,6 +134,7 @@ extern int mock_page_nid_mask; extern char mock_printk_output[]; extern int mock_route_errors; extern int mock_signal_pending; +extern int mock_sock_holds; extern int mock_spin_lock_held; extern struct task_struct mock_task; @@ -175,8 +180,10 @@ struct sk_buff * int extra_bytes, int first_value); void mock_sock_destroy(struct homa_sock *hsk, struct homa_socktab *socktab); +void mock_sock_hold(struct sock *sk); int mock_sock_init(struct homa_sock *hsk, struct homa *homa, int port); +void mock_sock_put(struct sock *sk); void mock_teardown(void); void *mock_vmalloc(size_t size); diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c index e88d2879..102c0047 100644 --- a/test/unit_homa_sock.c +++ b/test/unit_homa_sock.c @@ -11,16 +11,6 @@ #define n(x) htons(x) #define N(x) htonl(x) -int num_active_scans(struct homa_socktab *socktab) -{ - struct homa_socktab_scan *scan; - int count = 0; - - list_for_each_entry(scan, &socktab->active_scans, scan_links) - count++; - return count; -} - FIXTURE(homa_sock) { struct homa homa; struct homa_sock hsk; @@ -63,11 +53,11 @@ TEST_F(homa_sock, homa_socktab_start_scan) EXPECT_EQ(&self->hsk, homa_socktab_start_scan(self->homa.port_map, &scan)); EXPECT_EQ(100, scan.current_bucket); - EXPECT_EQ(1, num_active_scans(self->homa.port_map)); + EXPECT_EQ(1, mock_sock_holds); homa_socktab_end_scan(&scan); } -TEST_F(homa_sock, homa_socktab_next__basics) +TEST_F(homa_sock, homa_socktab_next) { struct homa_sock hsk1, hsk2, hsk3, hsk4, *hsk; struct homa_socktab_scan scan; @@ -81,14 +71,19 @@ TEST_F(homa_sock, homa_socktab_next__basics) mock_sock_init(&hsk4, &self->homa, first_port+5); hsk = homa_socktab_start_scan(self->homa.port_map, &scan); EXPECT_EQ(first_port+2*HOMA_SOCKTAB_BUCKETS, hsk->port); + EXPECT_EQ(1, mock_sock_holds); hsk = homa_socktab_next(&scan); EXPECT_EQ(first_port+HOMA_SOCKTAB_BUCKETS, hsk->port); + EXPECT_EQ(1, mock_sock_holds); hsk = homa_socktab_next(&scan); EXPECT_EQ(first_port, hsk->port); + EXPECT_EQ(1, mock_sock_holds); hsk = homa_socktab_next(&scan); EXPECT_EQ(first_port+5, hsk->port); + EXPECT_EQ(1, mock_sock_holds); hsk = homa_socktab_next(&scan); EXPECT_EQ(NULL, hsk); + EXPECT_EQ(0, mock_sock_holds); homa_sock_destroy(&hsk1); homa_sock_destroy(&hsk2); homa_sock_destroy(&hsk3); @@ -106,13 +101,15 @@ TEST_F(homa_sock, homa_socktab_end_scan) homa_socktab_start_scan(self->homa.port_map, &scan1); homa_socktab_start_scan(self->homa.port_map, &scan2); homa_socktab_start_scan(self->homa.port_map, &scan3); - EXPECT_EQ(3, num_active_scans(self->homa.port_map)); - homa_socktab_end_scan(&scan2); - EXPECT_EQ(2, num_active_scans(self->homa.port_map)); + EXPECT_EQ(3, mock_sock_holds); + homa_socktab_next(&scan2); + EXPECT_EQ(2, mock_sock_holds); 
homa_socktab_end_scan(&scan1); - EXPECT_EQ(1, num_active_scans(self->homa.port_map)); + EXPECT_EQ(1, mock_sock_holds); + homa_socktab_end_scan(&scan2); + EXPECT_EQ(1, mock_sock_holds); homa_socktab_end_scan(&scan3); - EXPECT_EQ(0, num_active_scans(self->homa.port_map)); + EXPECT_EQ(0, mock_sock_holds); } TEST_F(homa_sock, homa_sock_init__skip_port_in_use) @@ -177,57 +174,6 @@ TEST_F(homa_sock, homa_sock_init__hijack_tcp) homa_sock_destroy(&no_hijack); } -TEST_F(homa_sock, homa_sock_unlink__update_scans) -{ - struct homa_sock hsk1, hsk2, hsk3, hsk4, *hska, *hskb; - struct homa_socktab_scan scana, scanb; - int first_port = 34000; - - homa_destroy(&self->homa); - homa_init(&self->homa); - mock_sock_init(&hsk1, &self->homa, first_port); - mock_sock_init(&hsk2, &self->homa, first_port+HOMA_SOCKTAB_BUCKETS); - mock_sock_init(&hsk3, &self->homa, first_port+2*HOMA_SOCKTAB_BUCKETS); - mock_sock_init(&hsk4, &self->homa, first_port+3*HOMA_SOCKTAB_BUCKETS); - - /* Set scana to first socket in hash list. */ - hska = homa_socktab_start_scan(self->homa.port_map, &scana); - EXPECT_NE(NULL, hska); - EXPECT_EQ(first_port + 3*HOMA_SOCKTAB_BUCKETS, hska->port); - - /* Set scanb to second socket in hash list. */ - homa_socktab_start_scan(self->homa.port_map, &scanb); - hskb = homa_socktab_next(&scanb); - EXPECT_NE(NULL, hskb); - EXPECT_EQ(first_port + 2*HOMA_SOCKTAB_BUCKETS, hskb->port); - - /* Delete third socket. */ - homa_sock_destroy(&hsk2); - EXPECT_NE(NULL, scana.next); - EXPECT_EQ(first_port + 2*HOMA_SOCKTAB_BUCKETS, scana.next->sock->port); - EXPECT_NE(NULL, scanb.next); - EXPECT_EQ(first_port, scanb.next->sock->port); - - /* Delete second socket. */ - homa_sock_destroy(&hsk3); - EXPECT_NE(NULL, scana.next); - EXPECT_EQ(first_port, scana.next->sock->port); - EXPECT_NE(NULL, scanb.next); - EXPECT_EQ(first_port, scanb.next->sock->port); - - /* Delete last socket. */ - homa_sock_destroy(&hsk1); - EXPECT_EQ(NULL, scana.next); - EXPECT_EQ(NULL, scanb.next); - - /* Delete first socket. */ - homa_sock_destroy(&hsk4); - EXPECT_EQ(NULL, scana.next); - EXPECT_EQ(NULL, scanb.next); - - homa_socktab_end_scan(&scana); - homa_socktab_end_scan(&scanb); -} TEST_F(homa_sock, homa_sock_unlink__remove_from_map) { struct homa_sock hsk2, hsk3; diff --git a/test/utils.c b/test/utils.c index e2979fd1..ffc44361 100644 --- a/test/utils.c +++ b/test/utils.c @@ -458,6 +458,5 @@ char *unit_ack_string(struct homa_ack *ack) */ void unit_homa_destroy(struct homa *homa) { - if (homa->port_map && !list_empty(&homa->port_map->active_scans)) - FAIL("struct homa deleted with active socktab scans"); + /* Currently nothing to check. 
*/ } \ No newline at end of file From bd86c4c32d8f6f50b6aa9cbf2eb541f5b103b417 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 28 Jan 2025 11:31:57 -0800 Subject: [PATCH 176/625] Change homa_sock_find to take a reference on the returned socket --- homa_incoming.c | 1 + homa_rpc.c | 4 +++- homa_sock.c | 17 +++++++++++------ test/unit_homa_sock.c | 12 ++++++++++++ 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index 270188f5..75ed5e3d 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -560,6 +560,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) homa_rpc_reap(hsk, false); INC_METRIC(data_pkt_reap_ns, sched_clock() - start); } + sock_put(&hsk->sock); } /** diff --git a/homa_rpc.c b/homa_rpc.c index 82841f24..17e2ce50 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -200,7 +200,7 @@ void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, struct homa_rpc *rpc; UNIT_LOG("; ", "ack %llu", id); - if (hsk2->port != server_port) { + if (hsk->port != server_port) { /* Without RCU, sockets other than hsk can be deleted * out from under us. */ @@ -215,6 +215,8 @@ void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, homa_rpc_end(rpc); homa_rpc_unlock(rpc); /* Locked by homa_find_server_rpc. */ } + if (hsk->port != server_port) + sock_put(&hsk2->sock); done: if (hsk->port != server_port) diff --git a/homa_sock.c b/homa_sock.c index c7ea84fc..65073f53 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -129,6 +129,7 @@ void homa_socktab_end_scan(struct homa_socktab_scan *scan) int homa_sock_init(struct homa_sock *hsk, struct homa *homa) { struct homa_socktab *socktab = homa->port_map; + struct homa_sock *other; int starting_port; int result = 0; int i; @@ -146,8 +147,10 @@ int homa_sock_init(struct homa_sock *hsk, struct homa *homa) homa->prev_default_port++; if (homa->prev_default_port < HOMA_MIN_DEFAULT_PORT) homa->prev_default_port = HOMA_MIN_DEFAULT_PORT; - if (!homa_sock_find(socktab, homa->prev_default_port)) + other = homa_sock_find(socktab, homa->prev_default_port); + if (!other) break; + sock_put(&other->sock); if (homa->prev_default_port == starting_port) { spin_unlock_bh(&socktab->write_lock); hsk->shutdown = true; @@ -322,6 +325,7 @@ int homa_sock_bind(struct homa_socktab *socktab, struct homa_sock *hsk, owner = homa_sock_find(socktab, port); if (owner) { + sock_put(&owner->sock); if (owner != hsk) result = -EADDRINUSE; goto done; @@ -342,24 +346,25 @@ int homa_sock_bind(struct homa_socktab *socktab, struct homa_sock *hsk, * homa_sock_find() - Returns the socket associated with a given port. * @socktab: Hash table in which to perform lookup. * @port: The port of interest. - * Return: The socket that owns @port, or NULL if none. - * - * Note: this function uses RCU list-searching facilities, but it doesn't - * call rcu_read_lock. The caller should do that, if the caller cares (this - * way, the caller's use of the socket will also be protected). + * Return: The socket that owns @port, or NULL if none. If non-NULL + * then this method has taken a reference on the socket and + * the caller must call sock_put to release it. 
*/ struct homa_sock *homa_sock_find(struct homa_socktab *socktab, __u16 port) { struct homa_sock *hsk; struct homa_sock *result = NULL; + rcu_read_lock(); hlist_for_each_entry_rcu(hsk, &socktab->buckets[homa_port_hash(port)], socktab_links) { if (hsk->port == port) { result = hsk; + sock_hold(&hsk->sock); break; } } + rcu_read_unlock(); return result; } diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c index 102c0047..38c0c084 100644 --- a/test/unit_homa_sock.c +++ b/test/unit_homa_sock.c @@ -187,11 +187,14 @@ TEST_F(homa_sock, homa_sock_unlink__remove_from_map) EXPECT_EQ(&hsk2, homa_sock_find(self->homa.port_map, client2)); EXPECT_EQ(&hsk3, homa_sock_find(self->homa.port_map, client3)); + sock_put(&hsk2.sock); + sock_put(&hsk3.sock); homa_sock_shutdown(&hsk2); EXPECT_EQ(NULL, homa_sock_find(self->homa.port_map, client2)); EXPECT_EQ(&hsk3, homa_sock_find(self->homa.port_map, client3)); + sock_put(&hsk3.sock); homa_sock_shutdown(&hsk3); @@ -208,6 +211,7 @@ TEST_F(homa_sock, homa_sock_shutdown__unlink_socket) EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk, 100)); client = hsk.port; EXPECT_EQ(&hsk, homa_sock_find(self->homa.port_map, client)); + sock_put(&hsk.sock); homa_sock_shutdown(&hsk); EXPECT_EQ(NULL, homa_sock_find(self->homa.port_map, client)); @@ -281,10 +285,12 @@ TEST_F(homa_sock, homa_sock_bind) 110)); EXPECT_EQ(&self->hsk, homa_sock_find(self->homa.port_map, 110)); + sock_put(&self->hsk.sock); EXPECT_EQ(0, -homa_sock_bind(self->homa.port_map, &self->hsk, 120)); EXPECT_EQ(NULL, homa_sock_find(self->homa.port_map, 110)); EXPECT_EQ(&self->hsk, homa_sock_find(self->homa.port_map, 120)); + sock_put(&self->hsk.sock); homa_sock_destroy(&hsk2); } TEST_F(homa_sock, homa_sock_bind__socket_shutdown) @@ -302,8 +308,10 @@ TEST_F(homa_sock, homa_sock_find__basics) EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk2, 100)); EXPECT_EQ(&self->hsk, homa_sock_find(self->homa.port_map, self->hsk.port)); + sock_put(&self->hsk.sock); EXPECT_EQ(&hsk2, homa_sock_find(self->homa.port_map, hsk2.port)); + sock_put(&hsk2.sock); EXPECT_EQ(NULL, homa_sock_find(self->homa.port_map, hsk2.port + 1)); homa_sock_destroy(&hsk2); @@ -326,12 +334,16 @@ TEST_F(homa_sock, homa_sock_find__long_hash_chain) EXPECT_EQ(&self->hsk, homa_sock_find(self->homa.port_map, 13)); + sock_put(&self->hsk.sock); EXPECT_EQ(&hsk2, homa_sock_find(self->homa.port_map, 2*HOMA_SOCKTAB_BUCKETS + 13)); + sock_put(&hsk2.sock); EXPECT_EQ(&hsk3, homa_sock_find(self->homa.port_map, 3*HOMA_SOCKTAB_BUCKETS + 13)); + sock_put(&hsk3.sock); EXPECT_EQ(&hsk4, homa_sock_find(self->homa.port_map, 5*HOMA_SOCKTAB_BUCKETS + 13)); + sock_put(&hsk4.sock); homa_sock_destroy(&hsk2); homa_sock_destroy(&hsk3); From 02f2a4c2450e5eeb33a3134eb5398b00167ba6af Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 29 Jan 2025 09:10:08 -0800 Subject: [PATCH 177/625] Minor updates to sync.txt --- sync.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sync.txt b/sync.txt index 42a00394..dcb11d54 100644 --- a/sync.txt +++ b/sync.txt @@ -39,9 +39,10 @@ This file describes the synchronization strategy used for Homa. occur while operations are underway that hold RPC locks but not the socket lock. This creates several potential problems: * A socket might be deleted and its memory reclaimed while an RPC still - has access to it. Home assumes that Linux will prevent socket deletion + has access to it. Homa assumes that Linux will prevent socket deletion while the kernel call is executing. 
In situations outside kernel call - handling, Homa uses rcu_read_lock to prevent socket deletion. + handling, Homa uses rcu_read_lock and/or socket references to prevent + socket deletion. * A socket might be shut down while there are active operations on RPCs. For example, a new RPC creation might be underway when a socket is shut down, which could add the new RPC after all of its RPCs From 21c4f43698a1313c33af144023785086d118308d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 29 Jan 2025 09:10:22 -0800 Subject: [PATCH 178/625] More RCU cleanup --- homa_incoming.c | 7 ++----- homa_offload.c | 18 +++++++++--------- homa_rpc.c | 7 +------ homa_sock.c | 2 ++ 4 files changed, 14 insertions(+), 20 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index 75ed5e3d..731a2d32 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -1062,11 +1062,10 @@ void homa_abort_sock_rpcs(struct homa_sock *hsk, int error) { struct homa_rpc *rpc; - rcu_read_lock(); if (list_empty(&hsk->active_rpcs)) - goto done; + return; if (!homa_protect_rpcs(hsk)) - goto done; + return; rcu_read_lock(); list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { if (!homa_is_client(rpc->id)) @@ -1087,8 +1086,6 @@ void homa_abort_sock_rpcs(struct homa_sock *hsk, int error) } rcu_read_unlock(); homa_unprotect_rpcs(hsk); -done: - rcu_read_unlock(); } /** diff --git a/homa_offload.c b/homa_offload.c index 8bc4f36f..4ec5d6f7 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -170,17 +170,17 @@ static void homa_set_softirq_cpu(struct sk_buff *skb, int cpu) struct rps_sock_flow_table *sock_flow_table; int hash; + rcu_read_lock(); sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); - if (!sock_flow_table) - return; - hash = cpu + net_hotdata.rps_cpu_mask + 1; - if (sock_flow_table->ents[hash] != hash) { - rcu_read_lock(); - sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); - sock_flow_table->ents[hash] = hash; - rcu_read_unlock(); + if (sock_flow_table) { + hash = cpu + net_hotdata.rps_cpu_mask + 1; + if (sock_flow_table->ents[hash] != hash) { + sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); + sock_flow_table->ents[hash] = hash; + } + __skb_set_sw_hash(skb, hash, false); } - __skb_set_sw_hash(skb, hash, false); + rcu_read_unlock(); } /** diff --git a/homa_rpc.c b/homa_rpc.c index 17e2ce50..846fde31 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -204,10 +204,9 @@ void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, /* Without RCU, sockets other than hsk can be deleted * out from under us. 
*/ - rcu_read_lock(); hsk2 = homa_sock_find(hsk->homa->port_map, server_port); if (!hsk2) - goto done; + return; } rpc = homa_find_server_rpc(hsk2, saddr, id); if (rpc) { @@ -217,10 +216,6 @@ void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, } if (hsk->port != server_port) sock_put(&hsk2->sock); - -done: - if (hsk->port != server_port) - rcu_read_unlock(); } /** diff --git a/homa_sock.c b/homa_sock.c index 65073f53..29383eb0 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -250,11 +250,13 @@ void homa_sock_shutdown(struct homa_sock *hsk) homa_sock_unlink(hsk); homa_sock_unlock(hsk); + rcu_read_lock(); list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { homa_rpc_lock(rpc); homa_rpc_end(rpc); homa_rpc_unlock(rpc); } + rcu_read_unlock(); homa_sock_lock(hsk); list_for_each_entry(interest, &hsk->request_interests, request_links) From e905b85ffdfb7176f7587865cce0caa0bd73dd6d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 29 Jan 2025 15:35:40 -0800 Subject: [PATCH 179/625] Add homa_rpc_hold and homa_rpc_put, plus refs field in homa_rpc Replaces several flags, such as RPC_COPYING_FROM_USER and RPC_COPYING_TO_USER, plus fields like active_xmits, with a general mechanism. --- homa_grant.c | 4 +-- homa_impl.h | 9 ++++--- homa_incoming.c | 31 ++++++++++++--------- homa_outgoing.c | 11 ++++---- homa_rpc.c | 4 +-- homa_rpc.h | 57 +++++++++++++++++++++------------------ test/mock.c | 27 ++++++++++++++++++- test/mock.h | 6 +++++ test/unit_homa_grant.c | 1 - test/unit_homa_incoming.c | 30 ++++++++++++++++----- test/unit_homa_rpc.c | 54 +++---------------------------------- 11 files changed, 120 insertions(+), 114 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index c80d4a08..3486aab4 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -430,7 +430,7 @@ void homa_grant_recalc(struct homa *homa, int locked) int extra_levels; active_rpcs[i] = rpc; - atomic_inc(&rpc->grants_in_progress); + homa_rpc_hold(rpc); atomic_set(&rpc->msgin.rank, i); atomic_set(&homa->active_remaining[i], rpc->msgin.bytes_remaining); @@ -478,8 +478,8 @@ void homa_grant_recalc(struct homa *homa, int locked) homa_grant_remove_rpc(rpc); homa_grantable_unlock(homa); } + homa_rpc_put(rpc); homa_rpc_unlock(rpc); - atomic_dec(&rpc->grants_in_progress); } if (try_again == 0) diff --git a/homa_impl.h b/homa_impl.h index ba32991f..9d556884 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -198,7 +198,8 @@ enum homa_freeze_type { * homa_interest_get_rpc() - Return the ready RPC stored in an interest, * if there is one. * @interest: Struct to check - * Return: the ready RPC, or NULL if none. + * Return: the ready RPC, or NULL if none. If an RPC is returned, a + * reference has been taken on it; caller must call homa_rpc_put(). */ static inline struct homa_rpc *homa_interest_get_rpc(struct homa_interest *interest) { @@ -209,10 +210,10 @@ static inline struct homa_rpc *homa_interest_get_rpc(struct homa_interest *inter /** * homa_interest_set_rpc() - Hand off a ready RPC to an interest from a - * waiting receiver thread. Note: interest->locked must be set before - * calling this function. + * waiting receiver thread. * @interest: Belongs to a thread that is waiting for an incoming message. - * @rpc: Ready rpc to assign to @interest. + * @rpc: Ready rpc to assign to @interest. Caller must have taken a + * reference by calling homa_rpc_hold(). * @locked: 1 means @rpc is locked, 0 means unlocked. 
*/ static inline void homa_interest_set_rpc(struct homa_interest *interest, diff --git a/homa_incoming.c b/homa_incoming.c index 731a2d32..6eb4c84c 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -268,7 +268,7 @@ int homa_copy_to_user(struct homa_rpc *rpc) * run out of packets); copy any available packets out to * user space. */ - atomic_or(RPC_COPYING_TO_USER, &rpc->flags); + homa_rpc_hold(rpc); homa_rpc_unlock(rpc); tt_record1("starting copy to user space for id %d", @@ -343,8 +343,8 @@ int homa_copy_to_user(struct homa_rpc *rpc) n = 0; atomic_or(APP_NEEDS_LOCK, &rpc->flags); homa_rpc_lock(rpc); - atomic_andnot(APP_NEEDS_LOCK | RPC_COPYING_TO_USER, - &rpc->flags); + atomic_andnot(APP_NEEDS_LOCK, &rpc->flags); + homa_rpc_put(rpc); if (error) break; } @@ -1183,11 +1183,12 @@ int homa_register_interests(struct homa_interest *interest, hsk->sock.sk_data_ready(&hsk->sock); } - /* This flag is needed to keep the RPC from being reaped during the - * gap between when we release the socket lock and we acquire the - * RPC lock. + /* Must take a reference on the RPC before storing in interest + * (match the behavior of homa_rpc_handoff). This also prevents + * the RPC from being reaped during the gap between when we release + * the socket lock and when we acquire the RPC lock. */ - atomic_or(RPC_HANDING_OFF, &rpc->flags); + homa_rpc_hold(rpc); homa_sock_unlock(hsk); if (!locked) { atomic_or(APP_NEEDS_LOCK, &rpc->flags); @@ -1195,7 +1196,6 @@ int homa_register_interests(struct homa_interest *interest, atomic_andnot(APP_NEEDS_LOCK, &rpc->flags); locked = 1; } - atomic_andnot(RPC_HANDING_OFF, &rpc->flags); homa_interest_set_rpc(interest, rpc, locked); return 0; } @@ -1359,6 +1359,12 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, } else { atomic_andnot(RPC_HANDING_OFF, &rpc->flags); } + + /* Once the RPC has been locked it's safe to drop + * the reference that was set before storing the RPC + * in interest. + */ + homa_rpc_put(rpc); if (!rpc->error) rpc->error = homa_copy_to_user(rpc); if (rpc->state == RPC_DEAD) { @@ -1483,11 +1489,10 @@ void homa_rpc_handoff(struct homa_rpc *rpc) return; thread_waiting: - /* We found a waiting thread. The following 3 lines must be here, - * before clearing the interest, in order to avoid a race with - * homa_wait_for_message (which won't acquire the socket lock if - * the interest is clear). + /* We found a waiting thread. Take a reference on the RPC to keep * it from being freed before homa_wait_for_message picks it up. */ + homa_rpc_hold(rpc); atomic_or(RPC_HANDING_OFF, &rpc->flags); interest->locked = 0; INC_METRIC(handoffs_thread_waiting, 1); @@ -1504,7 +1509,7 @@ void homa_rpc_handoff(struct homa_rpc *rpc) /* Clear the interest. This serves two purposes. First, it saves * the waking thread from acquiring the socket lock again (which * reduces contention on that lock). Second, it ensures that - * no-one else attempts to give this interest a different RPC. + * no-one else attempts to give this interest to a different RPC. 
*/ if (interest->reg_rpc) { interest->reg_rpc->interest = NULL; diff --git a/homa_outgoing.c b/homa_outgoing.c index e0f26a6b..1cbea602 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -23,7 +23,6 @@ void homa_message_out_init(struct homa_rpc *rpc, int length) rpc->msgout.packets = NULL; rpc->msgout.next_xmit = &rpc->msgout.packets; rpc->msgout.next_xmit_offset = 0; - atomic_set(&rpc->msgout.active_xmits, 0); rpc->msgout.unscheduled = rpc->hsk->homa->unsched_bytes; if (rpc->msgout.unscheduled > length) rpc->msgout.unscheduled = length; @@ -222,6 +221,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) int gso_size; int err; + homa_rpc_hold(rpc); homa_message_out_init(rpc, iter->count); if (unlikely(rpc->msgout.length > HOMA_MAX_MESSAGE_LENGTH || rpc->msgout.length == 0)) { @@ -265,7 +265,6 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) overlap_xmit = rpc->msgout.length > 2 * max_gso_data; rpc->msgout.granted = rpc->msgout.unscheduled; - atomic_or(RPC_COPYING_FROM_USER, &rpc->flags); homa_skb_stash_pages(rpc->hsk->homa, rpc->msgout.length); /* Each iteration of the loop below creates one GSO packet. */ @@ -317,14 +316,14 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) } tt_record2("finished copy from user space for id %d, length %d", rpc->id, rpc->msgout.length); - atomic_andnot(RPC_COPYING_FROM_USER, &rpc->flags); INC_METRIC(sent_msg_bytes, rpc->msgout.length); + homa_rpc_put(rpc); if (!overlap_xmit && xmit) homa_xmit_data(rpc, false); return 0; error: - atomic_andnot(RPC_COPYING_FROM_USER, &rpc->flags); + homa_rpc_put(rpc); return err; } @@ -509,7 +508,7 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) struct netdev_queue *txq; #endif /* See strip.py */ - atomic_inc(&rpc->msgout.active_xmits); + homa_rpc_hold(rpc); while (*rpc->msgout.next_xmit) { int priority; struct sk_buff *skb = *rpc->msgout.next_xmit; @@ -556,7 +555,7 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) if (rpc->state == RPC_DEAD) break; } - atomic_dec(&rpc->msgout.active_xmits); + homa_rpc_put(rpc); } /** diff --git a/homa_rpc.c b/homa_rpc.c index 846fde31..c0d83cc8 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -364,9 +364,7 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) /* Collect buffers and freeable RPCs. */ list_for_each_entry_safe(rpc, tmp, &hsk->dead_rpcs, dead_links) { - if ((atomic_read(&rpc->flags) & RPC_CANT_REAP) || - atomic_read(&rpc->grants_in_progress) != 0 || - atomic_read(&rpc->msgout.active_xmits) != 0) { + if (atomic_read(&rpc->refs) != 0) { INC_METRIC(disabled_rpc_reaps, 1); continue; } diff --git a/homa_rpc.h b/homa_rpc.h index 757e7b39..8cb18f3a 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -59,13 +59,6 @@ struct homa_message_out { */ int next_xmit_offset; - /** - * @active_xmits: The number of threads that are currently - * transmitting data packets for this RPC; can't reap the RPC - * until this count becomes zero. - */ - atomic_t active_xmits; - /** * @unscheduled: Initial bytes of message that we'll send * without waiting for grants. @@ -251,13 +244,8 @@ struct homa_rpc { /* Valid bits for @flags: * RPC_PKTS_READY - The RPC has input packets ready to be * copied to user space. - * RPC_COPYING_FROM_USER - Data is being copied from user space into - * the RPC; the RPC must not be reaped. - * RPC_COPYING_TO_USER - Data is being copied from this RPC to - * user space; the RPC must not be reaped. 
- * RPC_HANDING_OFF - This RPC is in the process of being - * handed off to a waiting thread; it must - * not be reaped. + * RPC_HANDING_OFF - The RPC has been handed off to a waiting + * thread but not yet received by that thread. * APP_NEEDS_LOCK - Means that code in the application thread * needs the RPC lock (e.g. so it can start * copying data to user space) so others @@ -269,21 +257,14 @@ struct homa_rpc { * high network speeds). */ #define RPC_PKTS_READY 1 -#define RPC_COPYING_FROM_USER 2 -#define RPC_COPYING_TO_USER 4 -#define RPC_HANDING_OFF 8 -#define APP_NEEDS_LOCK 16 - -#define RPC_CANT_REAP (RPC_COPYING_FROM_USER | RPC_COPYING_TO_USER \ - | RPC_HANDING_OFF) +#define RPC_HANDING_OFF 2 +#define APP_NEEDS_LOCK 4 /** - * @grants_in_progress: Count of active grant sends for this RPC; - * it's not safe to reap the RPC unless this value is zero. - * This variable is needed so that grantable_lock can be released - * while sending grants, to reduce contention. + * @refs: Number of unmatched calls to homa_rpc_hold; it's not safe + * to free the RPC until this is zero. */ - atomic_t grants_in_progress; + atomic_t refs; /** * @peer: Information about the other machine (the server, if @@ -512,6 +493,30 @@ static inline void homa_unprotect_rpcs(struct homa_sock *hsk) atomic_dec(&hsk->protect_count); } +#ifndef __UNIT_TEST__ +/** + * homa_rpc_hold() - Increment the reference count on an RPC, which will + * prevent it from being freed until homa_rpc_put() is called. Used in + * situations where a pointer to the RPC needs to be retained during a + * period where it is unprotected by locks. + * @rpc: RPC on which to take a reference. + */ +static inline void homa_rpc_hold(struct homa_rpc *rpc) +{ + atomic_inc(&rpc->refs); +} + +/** + * homa_rpc_put() - Release a reference on an RPC (cancels the effect of + * a previous call to homa_rpc_hold). + * @rpc: RPC to release. + */ +static inline void homa_rpc_put(struct homa_rpc *rpc) +{ + atomic_dec(&rpc->refs); +} +#endif /* __UNIT_TEST__ */ + /** * homa_is_client(): returns true if we are the client for a particular RPC, * false if we are the server. diff --git a/test/mock.c b/test/mock.c index 5d0171b4..227c00d3 100644 --- a/test/mock.c +++ b/test/mock.c @@ -131,11 +131,16 @@ static int mock_active_spin_locks; */ static int mock_active_rcu_locks; -/* Number of calss to sock_hold that haven't been matched with calls +/* Number of calls to sock_hold that haven't been matched with calls * to sock_put. */ int mock_sock_holds; +/* Number of calls to homa_rpc_hold that haven't been matched with calls + * to homa_rpc_put. + */ +int mock_rpc_holds; + /* The number of times preempt_disable() has been invoked, minus the * number of times preempt_enable has been invoked. */ @@ -1495,6 +1500,20 @@ struct ctl_table_header *mock_register_net_sysctl(struct net *net, return (struct ctl_table_header *)11111; } +void mock_rpc_hold(struct homa_rpc *rpc) +{ + mock_rpc_holds++; + atomic_inc(&rpc->refs); +} + +void mock_rpc_put(struct homa_rpc *rpc) +{ + if (atomic_read(&rpc->refs) == 0) + FAIL("homa_rpc_put invoked when RPC has no active holds"); + mock_rpc_holds--; + atomic_dec(&rpc->refs); +} + /** * mock_set_core() - Set internal state that indicates the "current core". * @num: Integer identifier for a core. 
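The usual pattern for the new refs field, visible in homa_copy_to_user and homa_xmit_data above, is to bracket a window where the RPC lock must be dropped while a pointer to the RPC is retained. A condensed sketch (the wrapper function is hypothetical; the calls are those added by this patch):

static void example_unlocked_work(struct homa_rpc *rpc)
{
	homa_rpc_hold(rpc);	/* keeps homa_rpc_reap away while unlocked */
	homa_rpc_unlock(rpc);
	/* ... lengthy work without the lock, such as copying
	 * message data to or from user space ...
	 */
	homa_rpc_lock(rpc);
	homa_rpc_put(rpc);	/* reaping skips any RPC whose refs != 0 */
}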
@@ -1806,12 +1825,18 @@ void mock_teardown(void) mock_sock_holds); mock_sock_holds = 0; + if (mock_rpc_holds != 0) + FAIL(" %d homa_rpc_holds still active after test", + mock_rpc_holds); + mock_rpc_holds = 0; + if (mock_preempt_disables != 0) FAIL(" %d preempt_disables still active after test", mock_preempt_disables); mock_preempt_disables = 0; memset(homa_metrics, 0, sizeof(homa_metrics)); + unit_hook_clear(); } diff --git a/test/mock.h b/test/mock.h index bc0e1339..b0865621 100644 --- a/test/mock.h +++ b/test/mock.h @@ -38,6 +38,10 @@ #undef HOMA_MIN_DEFAULT_PORT #define HOMA_MIN_DEFAULT_PORT mock_min_default_port +#define homa_rpc_hold mock_rpc_hold + +#define homa_rpc_put mock_rpc_put + #undef kmalloc #define kmalloc mock_kmalloc @@ -170,6 +174,8 @@ struct ctl_table_header * mock_register_net_sysctl(struct net *net, const char *path, struct ctl_table *table); +void mock_rpc_hold(struct homa_rpc *rpc); +void mock_rpc_put(struct homa_rpc *rpc); void mock_set_core(int num); void mock_set_ipv6(struct homa_sock *hsk); void mock_spin_lock(spinlock_t *lock); diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index e4ce7dff..e9154850 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -764,7 +764,6 @@ TEST_F(homa_grant, homa_grant_recalc__basics) EXPECT_EQ(10000, rpc1->msgin.granted); EXPECT_EQ(20000, atomic_read(&self->homa.active_remaining[0])); EXPECT_EQ(1, atomic_read(&self->homa.grant_recalc_count)); - EXPECT_EQ(0, atomic_read(&rpc1->grants_in_progress)); EXPECT_EQ(1, atomic_read(&rpc3->msgin.rank)); EXPECT_EQ(1, rpc3->msgin.priority); diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 5eff862a..797a1293 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -2149,6 +2149,8 @@ TEST_F(homa_incoming, homa_register_interests__return_response_by_id) 0, self->client_id); EXPECT_EQ(0, result); EXPECT_EQ(crpc, homa_interest_get_rpc(&self->interest)); + EXPECT_EQ(1, atomic_read(&crpc->refs)); + homa_rpc_put(crpc); homa_rpc_unlock(crpc); } TEST_F(homa_incoming, homa_register_interests__socket_shutdown) @@ -2173,6 +2175,8 @@ TEST_F(homa_incoming, homa_register_interests__specified_id_has_packets) HOMA_RECVMSG_REQUEST, crpc->id); EXPECT_EQ(0, result); EXPECT_EQ(crpc, homa_interest_get_rpc(&self->interest)); + EXPECT_EQ(1, atomic_read(&crpc->refs)); + homa_rpc_put(crpc); homa_rpc_unlock(crpc); } TEST_F(homa_incoming, homa_register_interests__specified_id_has_error) @@ -2189,6 +2193,8 @@ TEST_F(homa_incoming, homa_register_interests__specified_id_has_error) HOMA_RECVMSG_REQUEST|HOMA_RECVMSG_NONBLOCKING, crpc->id); EXPECT_EQ(0, result); EXPECT_EQ(crpc, homa_interest_get_rpc(&self->interest)); + EXPECT_EQ(1, atomic_read(&crpc->refs)); + homa_rpc_put(crpc); homa_rpc_unlock(crpc); } TEST_F(homa_incoming, homa_register_interests__specified_id_not_ready) @@ -2218,6 +2224,8 @@ TEST_F(homa_incoming, homa_register_interests__return_queued_response) EXPECT_EQ(crpc, homa_interest_get_rpc(&self->interest)); EXPECT_TRUE(list_empty(&self->interest.request_links)); EXPECT_TRUE(list_empty(&self->interest.response_links)); + EXPECT_EQ(1, atomic_read(&crpc->refs)); + homa_rpc_put(crpc); homa_rpc_unlock(crpc); } TEST_F(homa_incoming, homa_register_interests__return_queued_request) @@ -2234,6 +2242,8 @@ TEST_F(homa_incoming, homa_register_interests__return_queued_request) EXPECT_EQ(srpc, homa_interest_get_rpc(&self->interest)); EXPECT_TRUE(list_empty(&self->interest.request_links)); EXPECT_TRUE(list_empty(&self->interest.response_links)); + EXPECT_EQ(1, 
atomic_read(&srpc->refs)); + homa_rpc_put(srpc); homa_rpc_unlock(srpc); } TEST_F(homa_incoming, homa_register_interests__call_sk_data_ready) @@ -2253,6 +2263,8 @@ TEST_F(homa_incoming, homa_register_interests__call_sk_data_ready) EXPECT_EQ(0, result); EXPECT_EQ(srpc1, homa_interest_get_rpc(&self->interest)); EXPECT_STREQ("sk->sk_data_ready invoked", unit_log_get()); + EXPECT_EQ(1, atomic_read(&srpc1->refs)); + homa_rpc_put(srpc1); homa_rpc_unlock(srpc1); // Second time shouldn't call sk_data_ready (no more RPCs). @@ -2263,6 +2275,8 @@ TEST_F(homa_incoming, homa_register_interests__call_sk_data_ready) EXPECT_EQ(0, result); EXPECT_EQ(srpc2, homa_interest_get_rpc(&self->interest)); EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(1, atomic_read(&srpc2->refs)); + homa_rpc_put(srpc2); homa_rpc_unlock(srpc2); } @@ -2388,7 +2402,7 @@ TEST_F(homa_incoming, homa_wait_for_message__handoff_rpc_then_delete_after_givin ASSERT_NE(NULL, crpc); // Prevent the RPC from being reaped during the test. - atomic_or(RPC_COPYING_TO_USER, &crpc->flags); + homa_rpc_hold(crpc); hook_rpc = crpc; hook3_count = 0; @@ -2397,9 +2411,8 @@ TEST_F(homa_incoming, homa_wait_for_message__handoff_rpc_then_delete_after_givin rpc = homa_wait_for_message(&self->hsk, HOMA_RECVMSG_NONBLOCKING|HOMA_RECVMSG_RESPONSE, 0); EXPECT_EQ(EAGAIN, -PTR_ERR(rpc)); - EXPECT_EQ(RPC_COPYING_TO_USER, atomic_read(&crpc->flags)); EXPECT_EQ(RPC_DEAD, crpc->state); - atomic_andnot(RPC_COPYING_TO_USER, &crpc->flags); + homa_rpc_put(crpc); } TEST_F(homa_incoming, homa_wait_for_message__explicit_rpc_deleted_while_sleeping) { @@ -2445,8 +2458,7 @@ TEST_F(homa_incoming, homa_wait_for_message__copy_to_user) rpc = homa_wait_for_message(&self->hsk, HOMA_RECVMSG_RESPONSE|HOMA_RECVMSG_NONBLOCKING, 0); EXPECT_EQ(EAGAIN, -PTR_ERR(rpc)); - EXPECT_EQ(0, atomic_read(&crpc->flags) - & (RPC_PKTS_READY|RPC_COPYING_TO_USER)); + EXPECT_EQ(0, atomic_read(&crpc->flags) & RPC_PKTS_READY); } TEST_F(homa_incoming, homa_wait_for_message__rpc_freed_after_matching) { @@ -2509,8 +2521,7 @@ TEST_F(homa_incoming, homa_wait_for_message__message_complete) HOMA_RECVMSG_RESPONSE|HOMA_RECVMSG_NONBLOCKING, 0); ASSERT_FALSE(IS_ERR(rpc)); EXPECT_EQ(crpc, rpc); - EXPECT_EQ(0, atomic_read(&crpc->flags) - & (RPC_PKTS_READY|RPC_COPYING_TO_USER)); + EXPECT_EQ(0, atomic_read(&crpc->flags) & RPC_PKTS_READY); homa_rpc_unlock(rpc); } TEST_F(homa_incoming, homa_wait_for_message__signal) @@ -2654,6 +2665,7 @@ TEST_F(homa_incoming, homa_rpc_handoff__interest_on_rpc) homa_rpc_handoff(crpc); crpc->interest = NULL; EXPECT_EQ(crpc, homa_interest_get_rpc(&interest)); + homa_rpc_put(crpc); EXPECT_EQ(NULL, interest.reg_rpc); EXPECT_EQ(NULL, crpc->interest); EXPECT_STREQ("wake_up_process pid 0", unit_log_get()); @@ -2675,6 +2687,7 @@ TEST_F(homa_incoming, homa_rpc_handoff__response_interests) list_add_tail(&interest.response_links, &self->hsk.response_interests); homa_rpc_handoff(crpc); EXPECT_EQ(crpc, homa_interest_get_rpc(&interest)); + homa_rpc_put(crpc); EXPECT_EQ(0, unit_list_length(&self->hsk.response_interests)); EXPECT_STREQ("wake_up_process pid 0", unit_log_get()); atomic_andnot(RPC_HANDING_OFF, &crpc->flags); @@ -2705,6 +2718,7 @@ TEST_F(homa_incoming, homa_rpc_handoff__request_interests) list_add_tail(&interest.request_links, &self->hsk.request_interests); homa_rpc_handoff(srpc); EXPECT_EQ(srpc, homa_interest_get_rpc(&interest)); + homa_rpc_put(srpc); EXPECT_EQ(0, unit_list_length(&self->hsk.request_interests)); EXPECT_STREQ("wake_up_process pid 0", unit_log_get()); atomic_andnot(RPC_HANDING_OFF, 
&srpc->flags); @@ -2745,6 +2759,7 @@ TEST_F(homa_incoming, homa_rpc_handoff__detach_interest) homa_rpc_handoff(crpc); crpc->interest = NULL; EXPECT_EQ(crpc, homa_interest_get_rpc(&interest)); + homa_rpc_put(crpc); EXPECT_EQ(NULL, interest.reg_rpc); EXPECT_EQ(NULL, crpc->interest); EXPECT_EQ(0, unit_list_length(&self->hsk.response_interests)); @@ -2770,6 +2785,7 @@ TEST_F(homa_incoming, homa_rpc_handoff__update_last_app_active) mock_ns = 10000; per_cpu(homa_offload_core, 2).last_app_active = 444; homa_rpc_handoff(crpc); + homa_rpc_put(crpc); EXPECT_STREQ("wake_up_process pid 0", unit_log_get()); EXPECT_EQ(10000, per_cpu(homa_offload_core, 2).last_app_active); atomic_andnot(RPC_HANDING_OFF, &crpc->flags); diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index cf1e3dab..db9cc198 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -573,7 +573,7 @@ TEST_F(homa_rpc, homa_rpc_reap__protected_and_reap_all) EXPECT_STREQ("reaped 1234", unit_log_get()); EXPECT_EQ(0, self->hsk.dead_skbs); } -TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_flags) +TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_refs) { struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, @@ -587,62 +587,14 @@ TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_flags) homa_rpc_end(crpc1); homa_rpc_end(crpc2); unit_log_clear(); - atomic_or(RPC_COPYING_TO_USER, &crpc1->flags); + homa_rpc_hold(crpc1); self->homa.reap_limit = 3; EXPECT_EQ(1, homa_rpc_reap(&self->hsk, false)); EXPECT_STREQ("reaped 1236", unit_log_get()); unit_log_clear(); EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); EXPECT_STREQ("", unit_log_get()); - atomic_andnot(RPC_COPYING_TO_USER, &crpc1->flags); - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); - EXPECT_STREQ("reaped 1234", unit_log_get()); -} -TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_active_xmits) -{ - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 2000); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, 1000, 2000); - - ASSERT_NE(NULL, crpc1); - ASSERT_NE(NULL, crpc2); - homa_rpc_end(crpc1); - homa_rpc_end(crpc2); - unit_log_clear(); - atomic_inc(&crpc1->msgout.active_xmits); - self->homa.reap_limit = 100; - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); - EXPECT_STREQ("reaped 1236", unit_log_get()); - unit_log_clear(); - atomic_dec(&crpc1->msgout.active_xmits); - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); - EXPECT_STREQ("reaped 1234", unit_log_get()); -} -TEST_F(homa_rpc, homa_rpc_reap__grant_in_progress) -{ - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id, 1000, 2000); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, 1000, 2000); - - ASSERT_NE(NULL, crpc1); - ASSERT_NE(NULL, crpc2); - homa_rpc_end(crpc1); - homa_rpc_end(crpc2); - unit_log_clear(); - atomic_inc(&crpc1->grants_in_progress); - self->homa.reap_limit = 3; - EXPECT_EQ(1, homa_rpc_reap(&self->hsk, false)); - EXPECT_STREQ("reaped 1236", unit_log_get()); - unit_log_clear(); - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); - EXPECT_STREQ("", unit_log_get()); - atomic_dec(&crpc1->grants_in_progress); + homa_rpc_put(crpc1); EXPECT_EQ(0, homa_rpc_reap(&self->hsk, 
false));
 EXPECT_STREQ("reaped 1234", unit_log_get());
 }

From 0712eb281a2b2360ab3453868aae447b0f146edd Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Wed, 29 Jan 2025 16:03:34 -0800
Subject: [PATCH 180/625] Modify homa_ack_pkt so that it returns with RPC locked

---
 homa_incoming.c | 33 ++++++++++++++++-----------------
 test/unit_homa_incoming.c | 37 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 52 insertions(+), 18 deletions(-)

diff --git a/homa_incoming.c b/homa_incoming.c
index 6eb4c84c..96ebb9e8 100644
--- a/homa_incoming.c
+++ b/homa_incoming.c
@@ -519,18 +519,6 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa)
 case ACK:
 INC_METRIC(packets_received[ACK - DATA], 1);
 homa_ack_pkt(skb, hsk, rpc);
- rpc = NULL;
-
- /* It isn't safe to process more packets once we've
- * released the RPC lock (this should never happen).
- */
- while (next) {
- WARN_ONCE(next, "%s found extra packets after ACK\n",
- __func__);
- skb = next;
- next = skb->next;
- kfree_skb(skb);
- }
 break;
 default:
 INC_METRIC(unknown_packet_types, 1);
@@ -875,8 +863,7 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk,
 * This function now owns the packet.
 * @hsk: Socket on which the packet was received.
 * @rpc: The RPC named in the packet header, or NULL if no such
- * RPC exists. The RPC has been locked by the caller but will
- * be unlocked here.
+ * RPC exists. The RPC will be dead, but still locked, on return.
 */
 void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk,
 struct homa_rpc *rpc)
@@ -889,12 +876,24 @@ void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk,
 if (rpc) {
 tt_record1("homa_ack_pkt freeing rpc id %d", rpc->id);
 homa_rpc_end(rpc);
- homa_rpc_unlock(rpc);
 }
 
 count = ntohs(h->num_acks);
- for (i = 0; i < count; i++)
- homa_rpc_acked(hsk, &saddr, &h->acks[i]);
+ if (count > 0) {
+ if (rpc) {
+ /* Must temporarily release rpc's lock because
+ * homa_rpc_acked needs to acquire RPC locks.
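+ * Holding a reference keeps the RPC from
+ * being reaped while its lock is released.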
+ */
+ homa_rpc_hold(rpc);
+ homa_rpc_unlock(rpc);
+ }
+ for (i = 0; i < count; i++)
+ homa_rpc_acked(hsk, &saddr, &h->acks[i]);
+ if (rpc) {
+ homa_rpc_lock(rpc);
+ homa_rpc_put(rpc);
+ }
+ }
 tt_record3("ACK received for id %d, peer 0x%x, with %d other acks",
 homa_local_id(h->common.sender_id), tt_addr(saddr), count);
 kfree_skb(skb);
diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c
index 797a1293..52d73074 100644
--- a/test/unit_homa_incoming.c
+++ b/test/unit_homa_incoming.c
@@ -1814,7 +1814,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_doesnt_exist)
 unit_log_get());
 }
 
-TEST_F(homa_incoming, homa_ack_pkt__target_rpc_exists)
+TEST_F(homa_incoming, homa_ack_pkt__target_rpc_exists_no_extras)
 {
 struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_OUTGOING,
 self->client_ip, self->server_ip, self->client_port,
@@ -1835,6 +1835,41 @@ TEST_F(homa_incoming, homa_ack_pkt__target_rpc_exists)
 EXPECT_EQ(0, unit_list_length(&self->hsk2.active_rpcs));
 EXPECT_EQ(1, homa_metrics_per_cpu()->packets_received[ACK - DATA]);
 }
+TEST_F(homa_incoming, homa_ack_pkt__target_rpc_exists_plus_extras)
+{
+ struct homa_rpc *srpc1 = unit_server_rpc(&self->hsk2, UNIT_OUTGOING,
+ self->client_ip, self->server_ip, self->client_port,
+ self->server_id, 100, 5000);
+ struct homa_rpc *srpc2 = unit_server_rpc(&self->hsk2, UNIT_OUTGOING,
+ self->client_ip, self->server_ip, self->client_port,
+ self->server_id+2, 100, 5000);
+ struct homa_rpc *srpc3 = unit_server_rpc(&self->hsk2, UNIT_OUTGOING,
+ self->client_ip, self->server_ip, self->client_port,
+ self->server_id+4, 100, 5000);
+ struct homa_ack_hdr h = {.common = {
+ .sport = htons(self->client_port),
+ .dport = htons(self->hsk2.port),
+ .sender_id = cpu_to_be64(self->client_id),
+ .type = ACK},
+ .num_acks = htons(2)};
+
+ ASSERT_NE(NULL, srpc1);
+ ASSERT_NE(NULL, srpc2);
+ ASSERT_NE(NULL, srpc3);
+ EXPECT_EQ(3, unit_list_length(&self->hsk2.active_rpcs));
+ unit_log_clear();
+ mock_xmit_log_verbose = 1;
+ h.acks[0] = (struct homa_ack) {.server_port = htons(self->server_port),
+ .client_id = cpu_to_be64(self->server_id+1)};
+ h.acks[1] = (struct homa_ack) {.server_port = htons(self->server_port),
+ .client_id = cpu_to_be64(self->server_id+3)};
+ homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0),
+ &self->homa);
+ EXPECT_EQ(0, unit_list_length(&self->hsk2.active_rpcs));
+ EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc1));
+ EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc2));
+ EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc3));
+}
 TEST_F(homa_incoming, homa_ack_pkt__target_rpc_doesnt_exist)
 {
 struct homa_rpc *srpc1 = unit_server_rpc(&self->hsk2, UNIT_OUTGOING,

From 98f97786dc7318e3a6059a00562829d7162e21b1 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Wed, 29 Jan 2025 16:47:57 -0800
Subject: [PATCH 181/625] Add comment

---
 homa_incoming.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/homa_incoming.c b/homa_incoming.c
index 96ebb9e8..b47fd6a3 100644
--- a/homa_incoming.c
+++ b/homa_incoming.c
@@ -418,6 +418,11 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa)
 if (flags & APP_NEEDS_LOCK) {
 homa_rpc_unlock(rpc);
 tt_record2("softirq released lock for id %d, flags 0x%x", rpc->id, flags);
+
+ /* We're going to try to reacquire the RPC
+ * lock almost immediately below; give the
+ * app thread a chance to get to it first.
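+ * (APP_NEEDS_LOCK means an application thread
+ * is waiting for this lock, e.g. so it can
+ * copy data to user space.)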
+ */
 homa_spin(200);
 rpc = NULL;
 }

From 8ecdc377e56fb4945635cdcad941c60f26273320 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Thu, 30 Jan 2025 09:54:37 -0800
Subject: [PATCH 182/625] Fix checkpatch.pl issues

---
 homa_impl.h | 3 +--
 homa_rpc.c  | 7 ++++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/homa_impl.h b/homa_impl.h
index 9d556884..dd1fa6a0 100644
--- a/homa_impl.h
+++ b/homa_impl.h
@@ -217,8 +217,7 @@ static inline struct homa_rpc *homa_interest_get_rpc(struct homa_interest *inter
 * @locked: 1 means @rpc is locked, 0 means unlocked.
 */
 static inline void homa_interest_set_rpc(struct homa_interest *interest,
- struct homa_rpc *rpc,
- int locked)
+ struct homa_rpc *rpc, int locked)
 {
 interest->rpc = rpc;
 interest->locked = locked;
diff --git a/homa_rpc.c b/homa_rpc.c
index c0d83cc8..94a6680f 100644
--- a/homa_rpc.c
+++ b/homa_rpc.c
@@ -276,9 +276,10 @@ void homa_rpc_end(struct homa_rpc *rpc)
 if (rpc->msgin.length >= 0) {
 rpc->hsk->dead_skbs += skb_queue_len(&rpc->msgin.packets);
 while (1) {
- struct homa_gap *gap = list_first_entry_or_null(&rpc->msgin.gaps,
- struct homa_gap,
- links);
+ struct homa_gap *gap;
+
+ gap = list_first_entry_or_null(&rpc->msgin.gaps,
+ struct homa_gap, links);
 if (!gap)
 break;
 list_del(&gap->links);

From 954d16e4d781e9422065b7984e15b494bf3860ca Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 3 Feb 2025 16:23:49 -0800
Subject: [PATCH 183/625] Create homa_devel.c

Segregate utility functions that are needed for debugging and
performance evaluation but aren't going to get upstreamed to Linux.
---
 Makefile | 3 +-
 homa_devel.c | 551 ++++++++++++++++++++++++++++++++++++++++++++++++++
 homa_devel.h | 28 +++
 homa_rpc.c | 28 ---
 homa_utils.c | 473 -------------------------------------------
 test/Makefile | 3 +-
 6 files changed, 583 insertions(+), 503 deletions(-)
 create mode 100644 homa_devel.c
 create mode 100644 homa_devel.h

diff --git a/Makefile b/Makefile
index c3f36464..10fe0d40 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,7 @@
 # Makefile to build Homa as a Linux module.
 
-HOMA_OBJS := homa_grant.o \
+HOMA_OBJS := homa_devel.o \
+ homa_grant.o \
 homa_incoming.o \
diff --git a/homa_devel.c b/homa_devel.c
new file mode 100644
index 00000000..bbfa17ff
--- /dev/null
+++ b/homa_devel.c
@@ -0,0 +1,551 @@
+// SPDX-License-Identifier: BSD-2-Clause
+
+/* This file contains functions that are useful to have in Homa during
+ * development, but aren't needed in production versions.
+ */
+
+#include "homa_impl.h"
+#include "homa_devel.h"
+#include "homa_peer.h"
+#include "homa_rpc.h"
+#ifndef __STRIP__ /* See strip.py */
+#include "homa_skb.h"
+#else /* See strip.py */
+#include "homa_stub.h"
+#endif /* See strip.py */
+#include "homa_wire.h"
+
+/**
+ * homa_print_ipv4_addr() - Convert an IPV4 address to the standard string
+ * representation.
+ * @addr: Address to convert, in network byte order.
+ *
+ * Return: The converted value. Values are stored in static memory, so
+ * the caller need not free. This also means that storage is
+ * eventually reused (there are enough buffers to accommodate
+ * multiple "active" values).
+ *
+ * Note: Homa uses this function, rather than the %pI4 format specifier
+ * for snprintf et al., because the kernel's version of snprintf isn't
+ * available in Homa's unit test environment.
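+ * (NUM_BUFS_IPV4 buffers are rotated, so up to four values
+ * can be in use before a buffer is overwritten.)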
+ */ +char *homa_print_ipv4_addr(__be32 addr) +{ +#define NUM_BUFS_IPV4 4 +#define BUF_SIZE_IPV4 30 + static char buffers[NUM_BUFS_IPV4][BUF_SIZE_IPV4]; + u32 a2 = ntohl(addr); + static int next_buf; + char *buffer; + + buffer = buffers[next_buf]; + next_buf++; + if (next_buf >= NUM_BUFS_IPV4) + next_buf = 0; + snprintf(buffer, BUF_SIZE_IPV4, "%u.%u.%u.%u", (a2 >> 24) & 0xff, + (a2 >> 16) & 0xff, (a2 >> 8) & 0xff, a2 & 0xff); + return buffer; +} + +/** + * homa_print_ipv6_addr() - Convert an IPv6 address to a human-readable string + * representation. IPv4-mapped addresses are printed in IPv4 syntax. + * @addr: Address to convert, in network byte order. + * + * Return: The converted value. Values are stored in static memory, so + * the caller need not free. This also means that storage is + * eventually reused (there are enough buffers to accommodate + * multiple "active" values). + */ +char *homa_print_ipv6_addr(const struct in6_addr *addr) +{ +#define NUM_BUFS BIT(2) +#define BUF_SIZE 64 + static char buffers[NUM_BUFS][BUF_SIZE]; + static int next_buf; + char *buffer; + + buffer = buffers[next_buf]; + next_buf++; + if (next_buf >= NUM_BUFS) + next_buf = 0; +#ifdef __UNIT_TEST__ + struct in6_addr zero = {}; + + if (ipv6_addr_equal(addr, &zero)) { + snprintf(buffer, BUF_SIZE, "0.0.0.0"); + } else if ((addr->s6_addr32[0] == 0) && + (addr->s6_addr32[1] == 0) && + (addr->s6_addr32[2] == htonl(0x0000ffff))) { + u32 a2 = ntohl(addr->s6_addr32[3]); + + snprintf(buffer, BUF_SIZE, "%u.%u.%u.%u", (a2 >> 24) & 0xff, + (a2 >> 16) & 0xff, (a2 >> 8) & 0xff, a2 & 0xff); + } else { + const char *inet_ntop(int af, const void *src, char *dst, + size_t size); + inet_ntop(AF_INET6, addr, buffer + 1, BUF_SIZE); + buffer[0] = '['; + strcat(buffer, "]"); + } +#else + snprintf(buffer, BUF_SIZE, "%pI6", addr); +#endif + return buffer; +} + +/** + * homa_print_packet() - Print a human-readable string describing the + * information in a Homa packet. + * @skb: Packet whose information should be printed. + * @buffer: Buffer in which to generate the string. + * @buf_len: Number of bytes available at @buffer. 
+ * + * Return: @buffer + */ +char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) +{ + struct homa_common_hdr *common; + char header[HOMA_MAX_HEADER]; + struct in6_addr saddr; + int used = 0; + + if (!skb) { + snprintf(buffer, buf_len, "skb is NULL!"); + buffer[buf_len - 1] = 0; + return buffer; + } + + homa_skb_get(skb, &header, 0, sizeof(header)); + common = (struct homa_common_hdr *)header; + saddr = skb_canonical_ipv6_saddr(skb); + used = homa_snprintf(buffer, buf_len, used, + "%s from %s:%u, dport %d, id %llu", + homa_symbol_for_type(common->type), + homa_print_ipv6_addr(&saddr), + ntohs(common->sport), ntohs(common->dport), + be64_to_cpu(common->sender_id)); + switch (common->type) { + case DATA: { + struct homa_skb_info *homa_info = homa_get_skb_info(skb); + struct homa_data_hdr *h = (struct homa_data_hdr *)header; + int data_left, i, seg_length, pos, offset; + + if (skb_shinfo(skb)->gso_segs == 0) { + seg_length = homa_data_len(skb); + data_left = 0; + } else { + seg_length = homa_info->seg_length; + if (seg_length > homa_info->data_bytes) + seg_length = homa_info->data_bytes; + data_left = homa_info->data_bytes - seg_length; + } + offset = ntohl(h->seg.offset); + if (offset == -1) + offset = ntohl(h->common.sequence); +#ifndef __STRIP__ /* See strip.py */ + used = homa_snprintf(buffer, buf_len, used, + ", message_length %d, offset %d, data_length %d, incoming %d", + ntohl(h->message_length), offset, + seg_length, ntohl(h->incoming)); + if (ntohs(h->cutoff_version) != 0) + used = homa_snprintf(buffer, buf_len, used, + ", cutoff_version %d", + ntohs(h->cutoff_version)); +#else /* See strip.py */ + used = homa_snprintf(buffer, buf_len, used, + ", message_length %d, offset %d, data_length %d", + ntohl(h->message_length), offset, + seg_length); +#endif /* See strip.py */ + if (h->retransmit) + used = homa_snprintf(buffer, buf_len, used, + ", RETRANSMIT"); + if (skb_shinfo(skb)->gso_type == 0xd) + used = homa_snprintf(buffer, buf_len, used, + ", TSO disabled"); + if (skb_shinfo(skb)->gso_segs <= 1) + break; + pos = skb_transport_offset(skb) + sizeof32(*h) + seg_length; + used = homa_snprintf(buffer, buf_len, used, ", extra segs"); + for (i = skb_shinfo(skb)->gso_segs - 1; i > 0; i--) { + if (homa_info->seg_length < skb_shinfo(skb)->gso_size) { + struct homa_seg_hdr seg; + + homa_skb_get(skb, &seg, pos, sizeof(seg)); + offset = ntohl(seg.offset); + } else { + offset += seg_length; + } + if (seg_length > data_left) + seg_length = data_left; + used = homa_snprintf(buffer, buf_len, used, + " %d@%d", seg_length, offset); + data_left -= seg_length; + pos += skb_shinfo(skb)->gso_size; + }; + break; + } +#ifndef __STRIP__ /* See strip.py */ + case GRANT: { + struct homa_grant_hdr *h = (struct homa_grant_hdr *)header; + char *resend = (h->resend_all) ? ", resend_all" : ""; + + used = homa_snprintf(buffer, buf_len, used, + ", offset %d, grant_prio %u%s", + ntohl(h->offset), h->priority, resend); + break; + } +#endif /* See strip.py */ + case RESEND: { + struct homa_resend_hdr *h = (struct homa_resend_hdr *)header; + +#ifndef __STRIP__ /* See strip.py */ + used = homa_snprintf(buffer, buf_len, used, + ", offset %d, length %d, resend_prio %u", + ntohl(h->offset), ntohl(h->length), + h->priority); +#else /* See strip.py */ + used = homa_snprintf(buffer, buf_len, used, + ", offset %d, length %d", + ntohl(h->offset), ntohl(h->length)); +#endif /* See strip.py */ + break; + } + case UNKNOWN: + /* Nothing to add here. */ + break; + case BUSY: + /* Nothing to add here. 
*/ + break; +#ifndef __STRIP__ /* See strip.py */ + case CUTOFFS: { + struct homa_cutoffs_hdr *h = (struct homa_cutoffs_hdr *)header; + + used = homa_snprintf(buffer, buf_len, used, + ", cutoffs %d %d %d %d %d %d %d %d, version %u", + ntohl(h->unsched_cutoffs[0]), + ntohl(h->unsched_cutoffs[1]), + ntohl(h->unsched_cutoffs[2]), + ntohl(h->unsched_cutoffs[3]), + ntohl(h->unsched_cutoffs[4]), + ntohl(h->unsched_cutoffs[5]), + ntohl(h->unsched_cutoffs[6]), + ntohl(h->unsched_cutoffs[7]), + ntohs(h->cutoff_version)); + break; + } + case FREEZE: + /* Nothing to add here. */ + break; +#endif /* See strip.py */ + case NEED_ACK: + /* Nothing to add here. */ + break; + case ACK: { + struct homa_ack_hdr *h = (struct homa_ack_hdr *)header; + int i, count; + + count = ntohs(h->num_acks); + used = homa_snprintf(buffer, buf_len, used, ", acks"); + for (i = 0; i < count; i++) { + used = homa_snprintf(buffer, buf_len, used, + " [sp %d, id %llu]", + ntohs(h->acks[i].server_port), + be64_to_cpu(h->acks[i].client_id)); + } + break; + } + } + + buffer[buf_len - 1] = 0; + return buffer; +} + +/** + * homa_print_packet_short() - Print a human-readable string describing the + * information in a Homa packet. This function generates a shorter + * description than homa_print_packet. + * @skb: Packet whose information should be printed. + * @buffer: Buffer in which to generate the string. + * @buf_len: Number of bytes available at @buffer. + * + * Return: @buffer + */ +char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len) +{ + struct homa_common_hdr *common; + char header[HOMA_MAX_HEADER]; + + common = (struct homa_common_hdr *)header; + homa_skb_get(skb, header, 0, HOMA_MAX_HEADER); + switch (common->type) { + case DATA: { + struct homa_data_hdr *h = (struct homa_data_hdr *)header; + struct homa_skb_info *homa_info = homa_get_skb_info(skb); + int data_left, used, i, seg_length, pos, offset; + + if (skb_shinfo(skb)->gso_segs == 0) { + seg_length = homa_data_len(skb); + data_left = 0; + } else { + seg_length = homa_info->seg_length; + data_left = homa_info->data_bytes - seg_length; + } + offset = ntohl(h->seg.offset); + if (offset == -1) + offset = ntohl(h->common.sequence); + + pos = skb_transport_offset(skb) + sizeof32(*h) + seg_length; + used = homa_snprintf(buffer, buf_len, 0, "DATA%s %d@%d", + h->retransmit ? " retrans" : "", + seg_length, offset); + for (i = skb_shinfo(skb)->gso_segs - 1; i > 0; i--) { + if (homa_info->seg_length < skb_shinfo(skb)->gso_size) { + struct homa_seg_hdr seg; + + homa_skb_get(skb, &seg, pos, sizeof(seg)); + offset = ntohl(seg.offset); + } else { + offset += seg_length; + } + if (seg_length > data_left) + seg_length = data_left; + used = homa_snprintf(buffer, buf_len, used, + " %d@%d", seg_length, offset); + data_left -= seg_length; + pos += skb_shinfo(skb)->gso_size; + } + break; + } +#ifndef __STRIP__ /* See strip.py */ + case GRANT: { + struct homa_grant_hdr *h = (struct homa_grant_hdr *)header; + char *resend = h->resend_all ? 
" resend_all" : ""; + + snprintf(buffer, buf_len, "GRANT %d@%d%s", ntohl(h->offset), + h->priority, resend); + break; + } +#endif /* See strip.py */ + case RESEND: { + struct homa_resend_hdr *h = (struct homa_resend_hdr *)header; + +#ifndef __STRIP__ /* See strip.py */ + snprintf(buffer, buf_len, "RESEND %d-%d@%d", ntohl(h->offset), + ntohl(h->offset) + ntohl(h->length) - 1, + h->priority); +#else /* See strip.py */ + snprintf(buffer, buf_len, "RESEND %d-%d", ntohl(h->offset), + ntohl(h->offset) + ntohl(h->length) - 1); +#endif /* See strip.py */ + break; + } + case UNKNOWN: + snprintf(buffer, buf_len, "UNKNOWN"); + break; + case BUSY: + snprintf(buffer, buf_len, "BUSY"); + break; +#ifndef __STRIP__ /* See strip.py */ + case CUTOFFS: + snprintf(buffer, buf_len, "CUTOFFS"); + break; + case FREEZE: + snprintf(buffer, buf_len, "FREEZE"); + break; +#endif /* See strip.py */ + case NEED_ACK: + snprintf(buffer, buf_len, "NEED_ACK"); + break; + case ACK: + snprintf(buffer, buf_len, "ACK"); + break; + default: + snprintf(buffer, buf_len, "unknown packet type 0x%x", + common->type); + break; + } + return buffer; +} + +#ifndef __STRIP__ /* See strip.py */ +/** + * homa_freeze_peers() - Send FREEZE packets to all known peers. + * @homa: Provides info about peers. + */ +void homa_freeze_peers(struct homa *homa) +{ + struct homa_socktab_scan scan; + struct homa_freeze_hdr freeze; + struct homa_peer **peers; + int num_peers, i, err; + struct homa_sock *hsk; + + /* Find a socket to use (any will do). */ + rcu_read_lock(); + hsk = homa_socktab_start_scan(homa->port_map, &scan); + homa_socktab_end_scan(&scan); + if (!hsk) { + tt_record("homa_freeze_peers couldn't find a socket"); + goto done; + } + + peers = homa_peertab_get_peers(homa->peers, &num_peers); + if (!peers) { + tt_record("homa_freeze_peers couldn't find peers to freeze"); + goto done; + } + freeze.common.type = FREEZE; + freeze.common.sport = htons(hsk->port); + freeze.common.dport = 0; + freeze.common.flags = HOMA_TCP_FLAGS; + freeze.common.urgent = htons(HOMA_TCP_URGENT); + freeze.common.sender_id = 0; + for (i = 0; i < num_peers; i++) { + tt_record1("Sending freeze to 0x%x", tt_addr(peers[i]->addr)); + err = __homa_xmit_control(&freeze, sizeof(freeze), peers[i], hsk); + if (err != 0) + tt_record2("homa_freeze_peers got error %d in xmit to 0x%x\n", + err, tt_addr(peers[i]->addr)); + } + kfree(peers); + +done: + rcu_read_unlock(); + return; +} +#endif /* See strip.py */ + +/** + * homa_snprintf() - This function makes it easy to use a series of calls + * to snprintf to gradually append information to a fixed-size buffer. + * If the buffer fills, the function can continue to be called, but nothing + * more will get added to the buffer. + * @buffer: Characters accumulate here. + * @size: Total space available in @buffer. + * @used: Number of bytes currently occupied in the buffer, not including + * a terminating null character; this is typically the result of + * the previous call to this function. + * @format: Format string suitable for passing to printf-like functions, + * followed by values for the various substitutions requested + * in @format + * @ ... + * + * Return: The number of characters now occupied in @buffer, not + * including the terminating null character. + */ +int homa_snprintf(char *buffer, int size, int used, const char *format, ...) 
+{ + int new_chars; + va_list ap; + + va_start(ap, format); + + if (used >= (size - 1)) + return used; + + new_chars = vsnprintf(buffer + used, size - used, format, ap); + if (new_chars < 0) + return used; + if (new_chars >= (size - used)) + return size - 1; + return used + new_chars; +} + +/** + * homa_symbol_for_state() - Returns a printable string describing an + * RPC state. + * @rpc: RPC whose state should be returned in printable form. + * + * Return: A static string holding the current state of @rpc. + */ +char *homa_symbol_for_state(struct homa_rpc *rpc) +{ + static char buffer[20]; + + switch (rpc->state) { + case RPC_OUTGOING: + return "OUTGOING"; + case RPC_INCOMING: + return "INCOMING"; + case RPC_IN_SERVICE: + return "IN_SERVICE"; + case RPC_DEAD: + return "DEAD"; + } + + /* See safety comment in homa_symbol_for_type. */ + snprintf(buffer, sizeof(buffer) - 1, "unknown(%u)", rpc->state); + buffer[sizeof(buffer) - 1] = 0; + return buffer; +} + +/** + * homa_symbol_for_type() - Returns a printable string describing a packet type. + * @type: A value from those defined by &homa_packet_type. + * + * Return: A static string holding the packet type corresponding to @type. + */ +char *homa_symbol_for_type(uint8_t type) +{ + switch (type) { + case DATA: + return "DATA"; +#ifndef __STRIP__ /* See strip.py */ + case GRANT: + return "GRANT"; +#endif /* See strip.py */ + case RESEND: + return "RESEND"; + case UNKNOWN: + return "UNKNOWN"; + case BUSY: + return "BUSY"; +#ifndef __STRIP__ /* See strip.py */ + case CUTOFFS: + return "CUTOFFS"; + case FREEZE: + return "FREEZE"; +#endif /* See strip.py */ + case NEED_ACK: + return "NEED_ACK"; + case ACK: + return "ACK"; + } + return "??"; +} + +#ifndef __STRIP__ /* See strip.py */ +/** + * homa_freeze() - Freezes the timetrace if a particular kind of freeze + * has been requested through sysctl. + * @rpc: If we freeze our timetrace, we'll also send a freeze request + * to the peer for this RPC. + * @type: Condition that just occurred. If this doesn't match the + * externally set "freeze_type" value, then we don't freeze. + * @format: Format string used to generate a time trace record describing + * the reason for the freeze; must include "id %d, peer 0x%x" + */ +void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, char *format) +{ + if (type != rpc->hsk->homa->freeze_type) + return; + rpc->hsk->homa->freeze_type = 0; + if (!atomic_read(&tt_frozen)) { +// struct homa_freeze_hdr freeze; + int dummy; + + pr_notice("freezing in %s with freeze_type %d\n", __func__, + type); + tt_record1("homa_freeze calling homa_rpc_log_active with freeze_type %d", type); + homa_rpc_log_active_tt(rpc->hsk->homa, 0); + homa_validate_incoming(rpc->hsk->homa, 1, &dummy); + pr_notice("%s\n", format); + tt_record2(format, rpc->id, tt_addr(rpc->peer->addr)); + tt_freeze(); +// homa_xmit_control(FREEZE, &freeze, sizeof(freeze), rpc); + homa_freeze_peers(rpc->hsk->homa); + } +} +#endif /* See strip.py */ diff --git a/homa_devel.h b/homa_devel.h new file mode 100644 index 00000000..006b8e01 --- /dev/null +++ b/homa_devel.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ + +/* This file defines functions that are useful during Homa development; + * they are not present in the upstreamed version of Homa in Linux. 
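+ * (See util/strip.py for the mechanism that removes
+ * development-only code when Homa is prepared for upstreaming.)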
+ */ + +#ifndef _HOMA_DEVEL_H +#define _HOMA_DEVEL_H + +#include "homa_impl.h" +struct homa_rpc; + +#ifndef __STRIP__ /* See strip.py */ +void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, + char *format); +void homa_freeze_peers(struct homa *homa); +#endif /* See strip.py */ +char *homa_print_ipv4_addr(__be32 addr); +char *homa_print_ipv6_addr(const struct in6_addr *addr); +char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len); +char *homa_print_packet_short(struct sk_buff *skb, char *buffer, + int buf_len); +int homa_snprintf(char *buffer, int size, int used, + const char *format, ...) __printf(4, 5); +char *homa_symbol_for_type(uint8_t type); +char *homa_symbol_for_state(struct homa_rpc *rpc); + +#endif /* _HOMA_DEVEL_H */ diff --git a/homa_rpc.c b/homa_rpc.c index 94a6680f..3c1bb183 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -735,31 +735,3 @@ int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors) actual - total_incoming, total_incoming, actual); return actual - total_incoming; } - -/** - * homa_symbol_for_state() - Returns a printable string describing an - * RPC state. - * @rpc: RPC whose state should be returned in printable form. - * - * Return: A static string holding the current state of @rpc. - */ -char *homa_symbol_for_state(struct homa_rpc *rpc) -{ - static char buffer[20]; - - switch (rpc->state) { - case RPC_OUTGOING: - return "OUTGOING"; - case RPC_INCOMING: - return "INCOMING"; - case RPC_IN_SERVICE: - return "IN_SERVICE"; - case RPC_DEAD: - return "DEAD"; - } - - /* See safety comment in homa_symbol_for_type. */ - snprintf(buffer, sizeof(buffer) - 1, "unknown(%u)", rpc->state); - buffer[sizeof(buffer) - 1] = 0; - return buffer; -} diff --git a/homa_utils.c b/homa_utils.c index d56ee76c..7a0338ac 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -180,447 +180,6 @@ void homa_destroy(struct homa *homa) homa->metrics = NULL; } -/** - * homa_print_ipv4_addr() - Convert an IPV4 address to the standard string - * representation. - * @addr: Address to convert, in network byte order. - * - * Return: The converted value. Values are stored in static memory, so - * the caller need not free. This also means that storage is - * eventually reused (there are enough buffers to accommodate - * multiple "active" values). - * - * Note: Homa uses this function, rather than the %pI4 format specifier - * for snprintf et al., because the kernel's version of snprintf isn't - * available in Homa's unit test environment. - */ -char *homa_print_ipv4_addr(__be32 addr) -{ -#define NUM_BUFS_IPV4 4 -#define BUF_SIZE_IPV4 30 - static char buffers[NUM_BUFS_IPV4][BUF_SIZE_IPV4]; - u32 a2 = ntohl(addr); - static int next_buf; - char *buffer; - - buffer = buffers[next_buf]; - next_buf++; - if (next_buf >= NUM_BUFS_IPV4) - next_buf = 0; - snprintf(buffer, BUF_SIZE_IPV4, "%u.%u.%u.%u", (a2 >> 24) & 0xff, - (a2 >> 16) & 0xff, (a2 >> 8) & 0xff, a2 & 0xff); - return buffer; -} - -/** - * homa_print_ipv6_addr() - Convert an IPv6 address to a human-readable string - * representation. IPv4-mapped addresses are printed in IPv4 syntax. - * @addr: Address to convert, in network byte order. - * - * Return: The converted value. Values are stored in static memory, so - * the caller need not free. This also means that storage is - * eventually reused (there are enough buffers to accommodate - * multiple "active" values). 
- */ -char *homa_print_ipv6_addr(const struct in6_addr *addr) -{ -#define NUM_BUFS BIT(2) -#define BUF_SIZE 64 - static char buffers[NUM_BUFS][BUF_SIZE]; - static int next_buf; - char *buffer; - - buffer = buffers[next_buf]; - next_buf++; - if (next_buf >= NUM_BUFS) - next_buf = 0; -#ifdef __UNIT_TEST__ - struct in6_addr zero = {}; - - if (ipv6_addr_equal(addr, &zero)) { - snprintf(buffer, BUF_SIZE, "0.0.0.0"); - } else if ((addr->s6_addr32[0] == 0) && - (addr->s6_addr32[1] == 0) && - (addr->s6_addr32[2] == htonl(0x0000ffff))) { - u32 a2 = ntohl(addr->s6_addr32[3]); - - snprintf(buffer, BUF_SIZE, "%u.%u.%u.%u", (a2 >> 24) & 0xff, - (a2 >> 16) & 0xff, (a2 >> 8) & 0xff, a2 & 0xff); - } else { - const char *inet_ntop(int af, const void *src, char *dst, - size_t size); - inet_ntop(AF_INET6, addr, buffer + 1, BUF_SIZE); - buffer[0] = '['; - strcat(buffer, "]"); - } -#else - snprintf(buffer, BUF_SIZE, "%pI6", addr); -#endif - return buffer; -} - -/** - * homa_print_packet() - Print a human-readable string describing the - * information in a Homa packet. - * @skb: Packet whose information should be printed. - * @buffer: Buffer in which to generate the string. - * @buf_len: Number of bytes available at @buffer. - * - * Return: @buffer - */ -char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) -{ - struct homa_common_hdr *common; - char header[HOMA_MAX_HEADER]; - struct in6_addr saddr; - int used = 0; - - if (!skb) { - snprintf(buffer, buf_len, "skb is NULL!"); - buffer[buf_len - 1] = 0; - return buffer; - } - - homa_skb_get(skb, &header, 0, sizeof(header)); - common = (struct homa_common_hdr *)header; - saddr = skb_canonical_ipv6_saddr(skb); - used = homa_snprintf(buffer, buf_len, used, - "%s from %s:%u, dport %d, id %llu", - homa_symbol_for_type(common->type), - homa_print_ipv6_addr(&saddr), - ntohs(common->sport), ntohs(common->dport), - be64_to_cpu(common->sender_id)); - switch (common->type) { - case DATA: { - struct homa_skb_info *homa_info = homa_get_skb_info(skb); - struct homa_data_hdr *h = (struct homa_data_hdr *)header; - int data_left, i, seg_length, pos, offset; - - if (skb_shinfo(skb)->gso_segs == 0) { - seg_length = homa_data_len(skb); - data_left = 0; - } else { - seg_length = homa_info->seg_length; - if (seg_length > homa_info->data_bytes) - seg_length = homa_info->data_bytes; - data_left = homa_info->data_bytes - seg_length; - } - offset = ntohl(h->seg.offset); - if (offset == -1) - offset = ntohl(h->common.sequence); - used = homa_snprintf(buffer, buf_len, used, - ", message_length %d, offset %d, data_length %d, incoming %d", - ntohl(h->message_length), offset, - seg_length, ntohl(h->incoming)); - if (ntohs(h->cutoff_version) != 0) - used = homa_snprintf(buffer, buf_len, used, - ", cutoff_version %d", - ntohs(h->cutoff_version)); - if (h->retransmit) - used = homa_snprintf(buffer, buf_len, used, - ", RETRANSMIT"); - if (skb_shinfo(skb)->gso_type == 0xd) - used = homa_snprintf(buffer, buf_len, used, - ", TSO disabled"); - if (skb_shinfo(skb)->gso_segs <= 1) - break; - pos = skb_transport_offset(skb) + sizeof32(*h) + seg_length; - used = homa_snprintf(buffer, buf_len, used, ", extra segs"); - for (i = skb_shinfo(skb)->gso_segs - 1; i > 0; i--) { - if (homa_info->seg_length < skb_shinfo(skb)->gso_size) { - struct homa_seg_hdr seg; - - homa_skb_get(skb, &seg, pos, sizeof(seg)); - offset = ntohl(seg.offset); - } else { - offset += seg_length; - } - if (seg_length > data_left) - seg_length = data_left; - used = homa_snprintf(buffer, buf_len, used, - " %d@%d", 
seg_length, offset); - data_left -= seg_length; - pos += skb_shinfo(skb)->gso_size; - }; - break; - } - case GRANT: { - struct homa_grant_hdr *h = (struct homa_grant_hdr *)header; - char *resend = (h->resend_all) ? ", resend_all" : ""; - - used = homa_snprintf(buffer, buf_len, used, - ", offset %d, grant_prio %u%s", - ntohl(h->offset), h->priority, resend); - break; - } - case RESEND: { - struct homa_resend_hdr *h = (struct homa_resend_hdr *)header; - - used = homa_snprintf(buffer, buf_len, used, - ", offset %d, length %d, resend_prio %u", - ntohl(h->offset), ntohl(h->length), - h->priority); - break; - } - case UNKNOWN: - /* Nothing to add here. */ - break; - case BUSY: - /* Nothing to add here. */ - break; - case CUTOFFS: { - struct homa_cutoffs_hdr *h = (struct homa_cutoffs_hdr *)header; - - used = homa_snprintf(buffer, buf_len, used, - ", cutoffs %d %d %d %d %d %d %d %d, version %u", - ntohl(h->unsched_cutoffs[0]), - ntohl(h->unsched_cutoffs[1]), - ntohl(h->unsched_cutoffs[2]), - ntohl(h->unsched_cutoffs[3]), - ntohl(h->unsched_cutoffs[4]), - ntohl(h->unsched_cutoffs[5]), - ntohl(h->unsched_cutoffs[6]), - ntohl(h->unsched_cutoffs[7]), - ntohs(h->cutoff_version)); - break; - } - case FREEZE: - /* Nothing to add here. */ - break; - case NEED_ACK: - /* Nothing to add here. */ - break; - case ACK: { - struct homa_ack_hdr *h = (struct homa_ack_hdr *)header; - int i, count; - - count = ntohs(h->num_acks); - used = homa_snprintf(buffer, buf_len, used, ", acks"); - for (i = 0; i < count; i++) { - used = homa_snprintf(buffer, buf_len, used, - " [sp %d, id %llu]", - ntohs(h->acks[i].server_port), - be64_to_cpu(h->acks[i].client_id)); - } - break; - } - } - - buffer[buf_len - 1] = 0; - return buffer; -} - -/** - * homa_print_packet_short() - Print a human-readable string describing the - * information in a Homa packet. This function generates a shorter - * description than homa_print_packet. - * @skb: Packet whose information should be printed. - * @buffer: Buffer in which to generate the string. - * @buf_len: Number of bytes available at @buffer. - * - * Return: @buffer - */ -char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len) -{ - struct homa_common_hdr *common; - char header[HOMA_MAX_HEADER]; - - common = (struct homa_common_hdr *)header; - homa_skb_get(skb, header, 0, HOMA_MAX_HEADER); - switch (common->type) { - case DATA: { - struct homa_data_hdr *h = (struct homa_data_hdr *)header; - struct homa_skb_info *homa_info = homa_get_skb_info(skb); - int data_left, used, i, seg_length, pos, offset; - - if (skb_shinfo(skb)->gso_segs == 0) { - seg_length = homa_data_len(skb); - data_left = 0; - } else { - seg_length = homa_info->seg_length; - data_left = homa_info->data_bytes - seg_length; - } - offset = ntohl(h->seg.offset); - if (offset == -1) - offset = ntohl(h->common.sequence); - - pos = skb_transport_offset(skb) + sizeof32(*h) + seg_length; - used = homa_snprintf(buffer, buf_len, 0, "DATA%s %d@%d", - h->retransmit ? 
" retrans" : "", - seg_length, offset); - for (i = skb_shinfo(skb)->gso_segs - 1; i > 0; i--) { - if (homa_info->seg_length < skb_shinfo(skb)->gso_size) { - struct homa_seg_hdr seg; - - homa_skb_get(skb, &seg, pos, sizeof(seg)); - offset = ntohl(seg.offset); - } else { - offset += seg_length; - } - if (seg_length > data_left) - seg_length = data_left; - used = homa_snprintf(buffer, buf_len, used, - " %d@%d", seg_length, offset); - data_left -= seg_length; - pos += skb_shinfo(skb)->gso_size; - } - break; - } - case GRANT: { - struct homa_grant_hdr *h = (struct homa_grant_hdr *)header; - char *resend = h->resend_all ? " resend_all" : ""; - - snprintf(buffer, buf_len, "GRANT %d@%d%s", ntohl(h->offset), - h->priority, resend); - break; - } - case RESEND: { - struct homa_resend_hdr *h = (struct homa_resend_hdr *)header; - - snprintf(buffer, buf_len, "RESEND %d-%d@%d", ntohl(h->offset), - ntohl(h->offset) + ntohl(h->length) - 1, - h->priority); - break; - } - case UNKNOWN: - snprintf(buffer, buf_len, "UNKNOWN"); - break; - case BUSY: - snprintf(buffer, buf_len, "BUSY"); - break; - case CUTOFFS: - snprintf(buffer, buf_len, "CUTOFFS"); - break; - case FREEZE: - snprintf(buffer, buf_len, "FREEZE"); - break; - case NEED_ACK: - snprintf(buffer, buf_len, "NEED_ACK"); - break; - case ACK: - snprintf(buffer, buf_len, "ACK"); - break; - default: - snprintf(buffer, buf_len, "unknown packet type 0x%x", - common->type); - break; - } - return buffer; -} - -/** - * homa_freeze_peers() - Send FREEZE packets to all known peers. - * @homa: Provides info about peers. - */ -void homa_freeze_peers(struct homa *homa) -{ - struct homa_socktab_scan scan; - struct homa_freeze_hdr freeze; - struct homa_peer **peers; - int num_peers, i, err; - struct homa_sock *hsk; - - /* Find a socket to use (any will do). */ - rcu_read_lock(); - hsk = homa_socktab_start_scan(homa->port_map, &scan); - homa_socktab_end_scan(&scan); - if (!hsk) { - tt_record("homa_freeze_peers couldn't find a socket"); - goto done; - } - - peers = homa_peertab_get_peers(homa->peers, &num_peers); - if (!peers) { - tt_record("homa_freeze_peers couldn't find peers to freeze"); - goto done; - } - freeze.common.type = FREEZE; - freeze.common.sport = htons(hsk->port); - freeze.common.dport = 0; - freeze.common.flags = HOMA_TCP_FLAGS; - freeze.common.urgent = htons(HOMA_TCP_URGENT); - freeze.common.sender_id = 0; - for (i = 0; i < num_peers; i++) { - tt_record1("Sending freeze to 0x%x", tt_addr(peers[i]->addr)); - err = __homa_xmit_control(&freeze, sizeof(freeze), peers[i], hsk); - if (err != 0) - tt_record2("homa_freeze_peers got error %d in xmit to 0x%x\n", - err, tt_addr(peers[i]->addr)); - } - kfree(peers); - -done: - rcu_read_unlock(); - return; -} - -/** - * homa_snprintf() - This function makes it easy to use a series of calls - * to snprintf to gradually append information to a fixed-size buffer. - * If the buffer fills, the function can continue to be called, but nothing - * more will get added to the buffer. - * @buffer: Characters accumulate here. - * @size: Total space available in @buffer. - * @used: Number of bytes currently occupied in the buffer, not including - * a terminating null character; this is typically the result of - * the previous call to this function. - * @format: Format string suitable for passing to printf-like functions, - * followed by values for the various substitutions requested - * in @format - * @ ... - * - * Return: The number of characters now occupied in @buffer, not - * including the terminating null character. 
- */ -int homa_snprintf(char *buffer, int size, int used, const char *format, ...) -{ - int new_chars; - va_list ap; - - va_start(ap, format); - - if (used >= (size - 1)) - return used; - - new_chars = vsnprintf(buffer + used, size - used, format, ap); - if (new_chars < 0) - return used; - if (new_chars >= (size - used)) - return size - 1; - return used + new_chars; -} - -/** - * homa_symbol_for_type() - Returns a printable string describing a packet type. - * @type: A value from those defined by &homa_packet_type. - * - * Return: A static string holding the packet type corresponding to @type. - */ -char *homa_symbol_for_type(uint8_t type) -{ - switch (type) { - case DATA: - return "DATA"; - case GRANT: - return "GRANT"; - case RESEND: - return "RESEND"; - case UNKNOWN: - return "UNKNOWN"; - case BUSY: - return "BUSY"; - case CUTOFFS: - return "CUTOFFS"; - case FREEZE: - return "FREEZE"; - case NEED_ACK: - return "NEED_ACK"; - case ACK: - return "ACK"; - } - return "??"; -} - /** * homa_prios_changed() - This function is called whenever configuration * information related to priorities, such as @homa->unsched_cutoffs or @@ -692,36 +251,4 @@ void homa_throttle_lock_slow(struct homa *homa) INC_METRIC(throttle_lock_misses, 1); INC_METRIC(throttle_lock_miss_ns, sched_clock() - start); } - -/** - * homa_freeze() - Freezes the timetrace if a particular kind of freeze - * has been requested through sysctl. - * @rpc: If we freeze our timetrace, we'll also send a freeze request - * to the peer for this RPC. - * @type: Condition that just occurred. If this doesn't match the - * externally set "freeze_type" value, then we don't freeze. - * @format: Format string used to generate a time trace record describing - * the reason for the freeze; must include "id %d, peer 0x%x" - */ -void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, char *format) -{ - if (type != rpc->hsk->homa->freeze_type) - return; - rpc->hsk->homa->freeze_type = 0; - if (!atomic_read(&tt_frozen)) { -// struct homa_freeze_hdr freeze; - int dummy; - - pr_notice("freezing in %s with freeze_type %d\n", __func__, - type); - tt_record1("homa_freeze calling homa_rpc_log_active with freeze_type %d", type); - homa_rpc_log_active_tt(rpc->hsk->homa, 0); - homa_validate_incoming(rpc->hsk->homa, 1, &dummy); - pr_notice("%s\n", format); - tt_record2(format, rpc->id, tt_addr(rpc->peer->addr)); - tt_freeze(); -// homa_xmit_control(FREEZE, &freeze, sizeof(freeze), rpc); - homa_freeze_peers(rpc->hsk->homa); - } -} #endif /* See strip.py */ diff --git a/test/Makefile b/test/Makefile index ed0e9802..bbe263c1 100644 --- a/test/Makefile +++ b/test/Makefile @@ -51,7 +51,8 @@ TEST_SRCS := unit_homa_grant.c \ unit_timetrace.c TEST_OBJS := $(patsubst %.c,%.o,$(TEST_SRCS)) -HOMA_SRCS := homa_grant.c \ +HOMA_SRCS := homa_devel.c \ + homa_grant.c \ homa_incoming.c \ homa_metrics.c \ homa_offload.c \ From 4115d2e09b2a096bdb435d72221b0dd5a99de618 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 31 Jan 2025 14:54:21 -0800 Subject: [PATCH 184/625] Refactor stripping mechanism Stripping is now completely automated based on #ifdefs and strip.py; no need for manual stripping of source files. The '__STRIP__=y' option to make can be used to perform almost complete stripping at compile time. 
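
To illustrate the convention (a sketch, not the contents of any actual
file): development-only code is bracketed with markers that strip.py
recognizes, and code that needs a different form in the upstream
version supplies it in an #else clause with the same markers:

    #ifndef __STRIP__ /* See strip.py */
    /* Development version: record a timetrace event. */
    tt_record1("reaped rpc id %d", id);
    #else /* See strip.py */
    /* Upstream version: timetracing is stripped out. */
    #endif /* See strip.py */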
--- Makefile | 21 +++-- Makefile.upstream | 14 +++ cloudlab/bin/config | 17 ++++ homa.h | 6 +- homa_api.c | 2 + homa_devel.h | 28 +++++- homa_grant.c | 2 + homa_impl.h | 101 ++++++++++++--------- homa_incoming.c | 157 +++++++++++++++++++++++++++++--- homa_offload.c | 6 ++ homa_offload.h | 4 + homa_outgoing.c | 175 +++++++++++++++++++++++++++++++++--- homa_peer.c | 8 +- homa_peer.h | 2 - homa_plumbing.c | 86 ++++++++++++++++-- homa_pool.c | 9 ++ homa_rpc.c | 31 +++++-- homa_rpc.h | 13 ++- homa_sock.c | 2 + homa_sock.h | 6 +- homa_stub.h | 14 ++- homa_timer.c | 61 +++++++++++-- homa_utils.c | 57 ++++++------ homa_wire.h | 75 ++++++++++++++++ notes.txt | 3 + test/Makefile | 25 ++++-- test/mock.c | 16 ++-- test/unit_homa_grant.c | 19 ++-- test/unit_homa_incoming.c | 183 +++++++++++++++++++++++++++++++------- test/unit_homa_offload.c | 27 +++--- test/unit_homa_outgoing.c | 125 ++++++++++++++++++++++---- test/unit_homa_peer.c | 20 +++++ test/unit_homa_plumbing.c | 25 ++++-- test/unit_homa_pool.c | 8 ++ test/unit_homa_rpc.c | 33 ++++--- test/unit_homa_sock.c | 4 + test/unit_homa_timer.c | 51 ++++++++++- test/unit_homa_utils.c | 8 ++ test/utils.c | 50 +++++------ test/utils.h | 2 + timetrace.c | 4 + util/cp_node.cc | 2 + util/cperf.py | 85 ++++++++++++------ util/strip.py | 95 +++++++++++--------- 44 files changed, 1350 insertions(+), 332 deletions(-) create mode 100644 Makefile.upstream diff --git a/Makefile b/Makefile index 10fe0d40..cd8ad375 100644 --- a/Makefile +++ b/Makefile @@ -1,16 +1,12 @@ # Makefile to build Homa as a Linux module. HOMA_OBJS := homa_devel.o \ - homa_grant.o \ homa_incoming.o \ - homa_metrics.o \ - homa_offload.o \ homa_outgoing.o \ homa_peer.o \ homa_pool.o \ homa_plumbing.o \ homa_rpc.o \ - homa_skb.o \ homa_sock.o \ homa_timer.o \ homa_utils.o \ @@ -21,9 +17,17 @@ ifneq ($(KERNELRELEASE),) obj-m += homa.o homa-y = $(HOMA_OBJS) +ifneq ($(__STRIP__),) +MY_CFLAGS += -D__STRIP__ +else +HOMA_OBJS += homa_grant.o \ + homa_metrics.o \ + homa_offload.o \ + homa_skb.o +endif + MY_CFLAGS += -g -ccflags-y += ${MY_CFLAGS} -CC += ${MY_CFLAGS} +ccflags-y += $(MY_CFLAGS) else @@ -56,13 +60,16 @@ CP_HDRS := homa_impl.h \ homa_wire.h CP_SRCS := $(patsubst %.o,%.c,$(filter-out timetrace.o, $(HOMA_OBJS))) CP_EXTRAS := reap.txt \ - sync.txt + sync.txt \ + Makefile CP_TARGETS := $(patsubst %,$(HOMA_TARGET)/%,$(CP_HDRS) $(CP_SRCS) $(CP_EXTRAS)) net-next: $(CP_TARGETS) $(LINUX_SRC_DIR)/include/uapi/linux/homa.h $(HOMA_TARGET)/%: % util/strip.py util/strip.py $< > $@ $(HOMA_TARGET)/%.txt: %.txt cp $< $@ +$(HOMA_TARGET)/Makefile: Makefile.upstream + cp $< $@ $(LINUX_SRC_DIR)/include/uapi/linux/homa.h: homa.h util/strip.py util/strip.py $< > $@ diff --git a/Makefile.upstream b/Makefile.upstream new file mode 100644 index 00000000..3eb192a6 --- /dev/null +++ b/Makefile.upstream @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: BSD-2-Clause +# +# Makefile for the Linux implementation of the Homa transport protocol. + +obj-$(CONFIG_HOMA) := homa.o +homa-y:= homa_incoming.o \ + homa_outgoing.o \ + homa_peer.o \ + homa_pool.o \ + homa_plumbing.o \ + homa_rpc.o \ + homa_sock.o \ + homa_timer.o \ + homa_utils.o diff --git a/cloudlab/bin/config b/cloudlab/bin/config index e049b24e..6e23ee6c 100755 --- a/cloudlab/bin/config +++ b/cloudlab/bin/config @@ -64,6 +64,9 @@ num_phys_cores = 0 xl170_default_cores = [0, 10, 7, 19, 8, 18, 6, 16, 4, 14, 11, 1, 7, 17, 15, 5, 13, 3, 2, 12] +# False means that the installed version of Homa doesn't support sysctls. 
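+# config_homa() probes for sysctl support below and sets this to False
+# if the probe fails.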
+sysctl_avl = True + def get_core_mask(core = -1): """ Returns a string representing a bit mask containing one bit for each @@ -373,6 +376,10 @@ def set_sysctl(name, value): """ Set a Homa sysctl configuration option as given by name and value. """ + global sysctl_avl + + if not sysctl_avl: + return subprocess.run(["sudo", "sysctl", ".net.homa.%s=%s" % (name, value)], check=True) @@ -387,6 +394,16 @@ def config_homa(mod): subprocess.run(["sudo", "bash", "-c", "insmod %s" % (mod)], check=True) + # See if Homa supports sysctls (if it has been stripped down for Linux + # upstreaming, it might not). + + result = subprocess.run(["sysctl", ".net.homa.num_priorities"], + capture_output=True) + if result.returncode != 0: + global sysctl_avl + print("Homa doesn't appear to support sysctls") + sysctl_avl = False + set_sysctl("num_priorities", "8") link_mbps = get_link_speed() set_sysctl ("link_mbps", str(link_mbps)) diff --git a/homa.h b/homa.h index c27f7c09..4a893d03 100644 --- a/homa.h +++ b/homa.h @@ -130,6 +130,7 @@ _Static_assert(sizeof(struct homa_recvmsg_args) <= 88, #define HOMA_RECVMSG_NONBLOCKING 0x04 #define HOMA_RECVMSG_VALID_FLAGS 0x07 +#ifndef __STRIP__ /* See strip.py */ /** * struct homa_abort_args - Structure that passes arguments and results * between user space and the HOMAIOCABORT ioctl. @@ -152,6 +153,7 @@ struct homa_abort_args { _Static_assert(sizeof(struct homa_abort_args) >= 32, "homa_abort_args shrunk"); _Static_assert(sizeof(struct homa_abort_args) <= 32, "homa_abort_args grew"); #endif +#endif /* See strip.py */ /** define SO_HOMA_RCVBUF: setsockopt option for specifying buffer region. */ #define SO_HOMA_RCVBUF 10 @@ -180,11 +182,14 @@ struct homa_rcvbuf_args { * SIOCPROTOPRIVATE range of 0x89e0 through 0x89ef. */ +#ifndef __STRIP__ /* See strip.py */ #define HOMAIOCABORT _IOWR(0x89, 0xe3, struct homa_abort_args) +#endif /* See strip.py */ #define HOMAIOCFREEZE _IO(0x89, 0xef) #ifndef __STRIP__ /* See strip.py */ int homa_abort(int sockfd, __u64 id, int error); +#endif /* See strip.py */ int homa_send(int sockfd, const void *message_buf, size_t length, const struct sockaddr *dest_addr, __u32 addrlen, __u64 *id, __u64 completion_cookie); @@ -197,6 +202,5 @@ ssize_t homa_reply(int sockfd, const void *message_buf, ssize_t homa_replyv(int sockfd, const struct iovec *iov, int iovcnt, const struct sockaddr *dest_addr, __u32 addrlen, __u64 id); -#endif /* See strip.py */ #endif /* _UAPI_LINUX_HOMA_H */ diff --git a/homa_api.c b/homa_api.c index 1a9d903b..8fc125f9 100644 --- a/homa_api.c +++ b/homa_api.c @@ -185,6 +185,7 @@ int homa_sendv(int sockfd, const struct iovec *iov, int iovcnt, return result; } +#ifndef __STRIP__ /* See strip.py */ /** * homa_abort() - Terminate the execution of an RPC. * @sockfd: File descriptor for the socket associated with the RPC. @@ -206,3 +207,4 @@ int homa_abort(int sockfd, __u64 id, int error) return ioctl(sockfd, HOMAIOCABORT, &args); } +#endif /* See strip.py */ diff --git a/homa_devel.h b/homa_devel.h index 006b8e01..efa19604 100644 --- a/homa_devel.h +++ b/homa_devel.h @@ -7,9 +7,35 @@ #ifndef _HOMA_DEVEL_H #define _HOMA_DEVEL_H -#include "homa_impl.h" +struct homa; struct homa_rpc; +/** + * enum homa_freeze_type - The @type argument to homa_freeze must be + * one of these values. 
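+ * (A freeze is triggered only when the type matches the
+ * freeze_type value that has been set externally via sysctl.)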
+ */ +enum homa_freeze_type { + RESTART_RPC = 1, + PEER_TIMEOUT = 2, + SLOW_RPC = 3, + SOCKET_CLOSE = 4, + PACKET_LOST = 5, + NEED_ACK_MISSING_DATA = 6, +}; + +/** + * tt_addr() - Given an address, return a 4-byte id that will (hopefully) + * provide a unique identifier for the address in a timetrace record. + * @x: Address (either IPv6 or IPv4-mapped IPv6) + * Return: see above + */ +static inline u32 tt_addr(const struct in6_addr x) +{ + return ipv6_addr_v4mapped(&x) ? ntohl(x.in6_u.u6_addr32[3]) + : (x.in6_u.u6_addr32[3] ? ntohl(x.in6_u.u6_addr32[3]) + : ntohl(x.in6_u.u6_addr32[1])); +} + #ifndef __STRIP__ /* See strip.py */ void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, char *format); diff --git a/homa_grant.c b/homa_grant.c index 3486aab4..fa8d5bef 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -674,6 +674,7 @@ int homa_grantable_lock_slow(struct homa *homa, int recalc) return result; } +#ifndef __STRIP__ /* See strip.py */ /** * homa_grant_log_tt() - Generate timetrace records describing all of * the active RPCs (those we are currently granting to). @@ -693,3 +694,4 @@ void homa_grant_log_tt(struct homa *homa) } homa_grantable_unlock(homa); } +#endif /* See strip.py */ diff --git a/homa_impl.h b/homa_impl.h index dd1fa6a0..6a05ef17 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -43,11 +43,13 @@ #include #include -#ifndef __STRIP__ /* See strip.py --alt */ -#include +#ifndef __UPSTREAM__ /* See strip.py */ #include "homa.h" +#include +#include "homa_devel.h" +#include "homa_strip.h" #else /* See strip.py */ -#include +#include #endif /* See strip.py */ #include "homa_wire.h" @@ -80,12 +82,14 @@ void homa_throttle_lock_slow(struct homa *homa); #define sizeof32(type) ((int)(sizeof(type))) +#ifndef __STRIP__ /* See strip.py */ /** * define HOMA_MAX_GRANTS - Used to size various data structures for grant * management; the max_overcommit sysctl parameter must never be greater than * this. */ #define HOMA_MAX_GRANTS 10 +#endif /* See strip.py */ /** * union sockaddr_in_union - Holds either an IPv4 or IPv6 address (smaller @@ -181,19 +185,6 @@ static inline void homa_interest_init(struct homa_interest *interest) INIT_LIST_HEAD(&interest->response_links); } -/** - * enum homa_freeze_type - The @type argument to homa_freeze must be - * one of these values. - */ -enum homa_freeze_type { - RESTART_RPC = 1, - PEER_TIMEOUT = 2, - SLOW_RPC = 3, - SOCKET_CLOSE = 4, - PACKET_LOST = 5, - NEED_ACK_MISSING_DATA = 6, -}; - /** * homa_interest_get_rpc() - Return the ready RPC stored in an interest, * if there is one. @@ -249,6 +240,7 @@ struct homa { */ atomic64_t link_idle_time ____cacheline_aligned_in_smp; +#ifndef __STRIP__ /* See strip.py */ /** * @grantable_lock: Used to synchronize access to grant-related * fields below, from @grantable_peers to @last_grantable_change. @@ -347,6 +339,7 @@ struct homa { * to the old message. */ int grant_nonfifo_left; +#endif /* See strip.py */ /** * @pacer_mutex: Ensures that only one instance of homa_pacer_xmit @@ -405,6 +398,7 @@ struct homa { */ int throttle_min_bytes; +#ifndef __STRIP__ /* See strip.py */ /** * @total_incoming: the total number of bytes that we expect to receive * (across all messages) even if we don't send out any more grants @@ -414,6 +408,7 @@ struct homa { * homa_send_grants for why we have to allow this possibility). 
*/ atomic_t total_incoming ____cacheline_aligned_in_smp; +#endif /* See strip.py */ /** * @prev_default_port: The most recent port number assigned from @@ -433,6 +428,7 @@ struct homa { */ struct homa_peertab *peers; +#ifndef __STRIP__ /* See strip.py */ /** * @page_pool_mutex: Synchronizes access to any/all of the page_pools * used for outgoing sk_buff data. @@ -444,10 +440,12 @@ struct homa { * If there are no cores for node, then this value is NULL. */ struct homa_page_pool *page_pools[MAX_NUMNODES]; +#endif /* See strip.py */ /** @max_numa: Highest NUMA node id in use by any core. */ int max_numa; +#ifndef __STRIP__ /* See strip.py */ /** * @skb_page_frees_per_sec: Rate at which to return pages from sk_buff * page pools back to Linux. This is the total rate across all pools. @@ -502,6 +500,7 @@ struct homa { * same as @window_param. */ int window_param; +#endif /* See strip.py */ /** * @link_mbps: The raw bandwidth of the network uplink, in @@ -509,6 +508,7 @@ struct homa { */ int link_mbps; +#ifndef __STRIP__ /* See strip.py */ /** * @poll_usecs: Amount of time (in microseconds) that a thread * will spend busy-waiting for an incoming messages before @@ -593,6 +593,7 @@ struct homa { * at a time. Set externally via sysctl. */ int max_rpcs_per_peer; +#endif /* See strip.py */ /** * @resend_ticks: When an RPC's @silent_ticks reaches this value, @@ -676,11 +677,13 @@ struct homa { */ u32 ns_per_mbyte; +#ifndef __STRIP__ /* See strip.py */ /** * @verbose: Nonzero enables additional logging. Set externally via * sysctl. */ int verbose; +#endif /* See strip.py */ /** * @max_gso_size: Maximum number of bytes that will be included @@ -696,6 +699,7 @@ struct homa { */ int gso_force_software; +#ifndef __STRIP__ /* See strip.py */ /** * @hijack_tcp: Non-zero means encapsulate outgoing Homa packets * as TCP packets (i.e. use TCP as the IP protocol). This makes TSO @@ -771,6 +775,7 @@ struct homa { /** @gro_busy_ns: Same as busy_usecs except in sched_clock() units. */ int gro_busy_ns; +#endif /* See strip.py */ /** * @timer_ticks: number of times that homa_timer has been invoked @@ -778,6 +783,7 @@ struct homa { */ u32 timer_ticks; +#ifndef __STRIP__ /* See strip.py */ /** * @metrics_mutex: Used to synchronize accesses to @metrics_active_opens * and updates to @metrics. @@ -806,6 +812,7 @@ struct homa { * currently exist for the metrics file in /proc. */ int metrics_active_opens; +#endif /* See strip.py */ /** * @flags: a collection of bits that can be set using sysctl @@ -813,11 +820,13 @@ struct homa { */ int flags; +#ifndef __STRIP__ /* See strip.py */ /** * @freeze_type: determines conditions under which the time trace * should be frozen. Set externally via sysctl. */ enum homa_freeze_type freeze_type; +#endif /* See strip.py */ /** * @bpage_lease_usecs: how long a core can own a bpage (microseconds) @@ -832,7 +841,7 @@ struct homa { */ int next_id; -#ifndef __STRIP__ /* See strip.py */ +#ifndef __UPSTREAM__ /* See strip.py */ /** * @temp: the values in this array can be read and written with sysctl. 
* They have no officially defined purpose, and are available for @@ -1000,23 +1009,13 @@ static inline bool is_homa_pkt(struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); +#ifndef __STRIP__ /* See strip.py */ return ((iph->protocol == IPPROTO_HOMA) || ((iph->protocol == IPPROTO_TCP) && (tcp_hdr(skb)->urg_ptr == htons(HOMA_TCP_URGENT)))); -} - -#ifndef __STRIP__ /* See strip.py --alt */ -/** - * tt_addr() - Given an address, return a 4-byte id that will (hopefully) - * provide a unique identifier for the address in a timetrace record. - * @x: Address (either IPv6 or IPv4-mapped IPv6) - * Return: see above - */ -static inline u32 tt_addr(const struct in6_addr x) -{ - return ipv6_addr_v4mapped(&x) ? ntohl(x.in6_u.u6_addr32[3]) - : (x.in6_u.u6_addr32[3] ? ntohl(x.in6_u.u6_addr32[3]) - : ntohl(x.in6_u.u6_addr32[1])); +#else /* See strip.py */ + return (iph->protocol == IPPROTO_HOMA); +#endif /* See strip.py */ } /** @@ -1046,7 +1045,6 @@ void unit_hook(char *id); #define UNIT_LOG(...) #define UNIT_HOOK(...) #endif /* __UNIT_TEST__ */ -#endif /* See strip.py */ extern struct homa *global_homa; @@ -1062,17 +1060,22 @@ int homa_bind(struct socket *sk, struct sockaddr *addr, int addr_len); int homa_check_nic_queue(struct homa *homa, struct sk_buff *skb, bool force); +#ifndef __STRIP__ /* See strip.py */ struct homa_rpc *homa_choose_fifo_grant(struct homa *homa); +#endif /* See strip.py */ struct homa_interest *homa_choose_interest(struct homa *homa, struct list_head *head, int offset); void homa_close(struct sock *sock, long timeout); int homa_copy_to_user(struct homa_rpc *rpc); +#ifndef __STRIP__ /* See strip.py */ void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk); +#endif /* See strip.py */ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc); void homa_destroy(struct homa *homa); int homa_disconnect(struct sock *sk, int flags); void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa); +#ifndef __STRIP__ /* See strip.py */ #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) int homa_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -1080,15 +1083,13 @@ int homa_dointvec(struct ctl_table *table, int write, int homa_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); #endif +#endif /* See strip.py */ int homa_err_handler_v4(struct sk_buff *skb, u32 info); int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info); int homa_fill_data_interleaved(struct homa_rpc *rpc, struct sk_buff *skb, struct iov_iter *iter); -void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, - char *format); -void homa_freeze_peers(struct homa *homa); struct homa_gap *homa_gap_new(struct list_head *next, int start, int end); void homa_gap_retry(struct homa_rpc *rpc); int homa_get_port(struct sock *sk, unsigned short snum); @@ -1097,13 +1098,19 @@ int homa_getsockopt(struct sock *sk, int level, int optname, int homa_hash(struct sock *sk); enum hrtimer_restart homa_hrtimer(struct hrtimer *timer); int homa_init(struct homa *homa); +#ifndef __STRIP__ /* See strip.py */ void homa_incoming_sysctl_changed(struct homa *homa); int homa_ioc_abort(struct sock *sk, int *karg); +#endif /* See strip.py */ int homa_ioctl(struct sock *sk, int cmd, int *karg); int homa_load(void); +#ifndef __STRIP__ /* See strip.py */ void homa_log_throttled(struct homa *homa); int homa_message_in_init(struct homa_rpc *rpc, int length, int unsched); +#else /* See 
strip.py */
+int homa_message_in_init(struct homa_rpc *rpc, int length);
+#endif /* See strip.py */
 int homa_message_out_fill(struct homa_rpc *rpc,
 			  struct iov_iter *iter, int xmit);
 void homa_message_out_init(struct homa_rpc *rpc, int length);
@@ -1112,25 +1119,28 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk,
 struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc,
 				     struct iov_iter *iter, int offset,
 				     int length, int max_seg_data);
+#ifndef __STRIP__ /* See strip.py */
 void homa_outgoing_sysctl_changed(struct homa *homa);
+#endif /* See strip.py */
 int homa_pacer_main(void *transport);
 void homa_pacer_stop(struct homa *homa);
 bool homa_pacer_xmit(struct homa *homa);
 __poll_t homa_poll(struct file *file, struct socket *sock,
 		   struct poll_table_struct *wait);
-char *homa_print_ipv4_addr(__be32 addr);
-char *homa_print_ipv6_addr(const struct in6_addr *addr);
-char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len);
-char *homa_print_packet_short(struct sk_buff *skb, char *buffer,
-			      int buf_len);
+#ifndef __STRIP__ /* See strip.py */
 void homa_prios_changed(struct homa *homa);
+#endif /* See strip.py */
 int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		 int flags, int *addr_len);
 int homa_register_interests(struct homa_interest *interest,
 			    struct homa_sock *hsk, int flags, u64 id);
 void homa_remove_from_throttled(struct homa_rpc *rpc);
+#ifndef __STRIP__ /* See strip.py */
 void homa_resend_data(struct homa_rpc *rpc, int start, int end,
 		      int priority);
+#else /* See strip.py */
+void homa_resend_data(struct homa_rpc *rpc, int start, int end);
+#endif /* See strip.py */
 void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc,
 		     struct homa_sock *hsk);
 void homa_rpc_abort(struct homa_rpc *crpc, int error);
@@ -1142,11 +1152,9 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
 int homa_setsockopt(struct sock *sk, int level, int optname,
 		    sockptr_t optval, unsigned int optlen);
 int homa_shutdown(struct socket *sock, int how);
-int homa_snprintf(char *buffer, int size, int used,
-		  const char *format, ...)
__printf(4, 5); int homa_softirq(struct sk_buff *skb); void homa_spin(int ns); -char *homa_symbol_for_type(uint8_t type); +#ifndef __STRIP__ /* See strip.py */ #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) int homa_sysctl_softirq_cores(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -1156,15 +1164,18 @@ int homa_sysctl_softirq_cores(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); #endif +#endif /* See strip.py */ void homa_timer(struct homa *homa); int homa_timer_main(void *transport); void homa_unhash(struct sock *sk); void homa_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc); void homa_unload(void); +#ifndef __STRIP__ /* See strip.py */ int homa_unsched_priority(struct homa *homa, struct homa_peer *peer, int length); int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors); +#endif /* See strip.py */ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, u64 id); int homa_xmit_control(enum homa_packet_type type, void *contents, @@ -1172,8 +1183,12 @@ int homa_xmit_control(enum homa_packet_type type, void *contents, int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, struct homa_sock *hsk); void homa_xmit_data(struct homa_rpc *rpc, bool force); +#ifndef __STRIP__ /* See strip.py */ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority); +#else /* See strip.py */ +void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc); +#endif /* See strip.py */ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk); /** diff --git a/homa_incoming.c b/homa_incoming.c index b47fd6a3..58ee3a2c 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -1,15 +1,22 @@ // SPDX-License-Identifier: BSD-2-Clause +#ifndef __STRIP__ /* See strip.py */ /* This file contains functions that handle incoming Homa messages, including * both receiving information for those messages and sending grants. */ +#else /* See strip.py */ +/* This file contains functions that handle incoming Homa messages. */ +#endif /* See strip.py */ #include "homa_impl.h" +#ifndef __STRIP__ /* See strip.py */ #include "homa_grant.h" #include "homa_offload.h" +#endif /* See strip.py */ #include "homa_peer.h" #include "homa_pool.h" +#ifndef __STRIP__ /* See strip.py */ /** * homa_message_in_init() - Constructor for homa_message_in. * @rpc: RPC whose msgin structure should be initialized. @@ -20,6 +27,16 @@ * if rpc->msgin could not be initialized. */ int homa_message_in_init(struct homa_rpc *rpc, int length, int unsched) +#else /* See strip.py */ +/** + * homa_message_in_init() - Constructor for homa_message_in. + * @rpc: RPC whose msgin structure should be initialized. + * @length: Total number of bytes in message. + * Return: Zero for successful initialization, or a negative errno + * if rpc->msgin could not be initialized. + */ +int homa_message_in_init(struct homa_rpc *rpc, int length) +#endif /* See strip.py */ { int err; @@ -31,15 +48,18 @@ int homa_message_in_init(struct homa_rpc *rpc, int length, int unsched) rpc->msgin.recv_end = 0; INIT_LIST_HEAD(&rpc->msgin.gaps); rpc->msgin.bytes_remaining = length; +#ifndef __STRIP__ /* See strip.py */ rpc->msgin.granted = (unsched > length) ? 
length : unsched; rpc->msgin.rec_incoming = 0; atomic_set(&rpc->msgin.rank, -1); rpc->msgin.priority = 0; +#endif /* See strip.py */ rpc->msgin.resend_all = 0; rpc->msgin.num_bpages = 0; err = homa_pool_allocate(rpc); if (err != 0) return err; +#ifndef __STRIP__ /* See strip.py */ if (rpc->msgin.num_bpages == 0) { /* The RPC is now queued waiting for buffer space, so we're * going to discard all of its packets. @@ -54,6 +74,7 @@ int homa_message_in_init(struct homa_rpc *rpc, int length, int unsched) INC_METRIC(large_msg_count, 1); INC_METRIC(large_msg_bytes, length); } +#endif /* See strip.py */ return 0; } @@ -92,7 +113,9 @@ void homa_gap_retry(struct homa_rpc *rpc) list_for_each_entry(gap, &rpc->msgin.gaps, links) { resend.offset = htonl(gap->start); resend.length = htonl(gap->end - gap->start); +#ifndef __STRIP__ /* See strip.py */ resend.priority = rpc->hsk->homa->num_priorities - 1; +#endif /* See strip.py */ tt_record3("homa_gap_retry sending RESEND for id %d, start %d, end %d", rpc->id, gap->start, gap->end); homa_xmit_control(RESEND, &resend, sizeof(resend), rpc); @@ -194,18 +217,22 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) } discard: +#ifndef __STRIP__ /* See strip.py */ if (h->retransmit) INC_METRIC(resent_discards, 1); else INC_METRIC(packet_discards, 1); +#endif /* See strip.py */ tt_record4("homa_add_packet discarding packet for id %d, offset %d, length %d, retransmit %d", rpc->id, start, length, h->retransmit); kfree_skb(skb); return; keep: +#ifndef __STRIP__ /* See strip.py */ if (h->retransmit) INC_METRIC(resent_packets_used, 1); +#endif /* See strip.py */ __skb_queue_tail(&rpc->msgin.packets, skb); rpc->msgin.bytes_remaining -= length; } @@ -230,12 +257,14 @@ int homa_copy_to_user(struct homa_rpc *rpc) #define MAX_SKBS 20 #endif /* __UNIT_TEST__ */ struct sk_buff *skbs[MAX_SKBS]; -#ifndef __STRIP__ /* See strip.py */ +#ifndef __UPSTREAM__ /* See strip.py */ int start_offset = 0; int end_offset = 0; #endif /* See strip.py */ int error = 0; +#ifndef __STRIP__ /* See strip.py */ u64 start; +#endif /* See strip.py */ int n = 0; /* Number of filled entries in skbs. 
*/ int i; @@ -313,7 +342,7 @@ int homa_copy_to_user(struct homa_rpc *rpc) goto free_skbs; copied += chunk_size; } -#ifndef __STRIP__ /* See strip.py */ +#ifndef __UPSTREAM__ /* See strip.py */ if (end_offset == 0) { start_offset = offset; } else if (end_offset != offset) { @@ -326,14 +355,16 @@ int homa_copy_to_user(struct homa_rpc *rpc) } free_skbs: -#ifndef __STRIP__ /* See strip.py */ +#ifndef __UPSTREAM__ /* See strip.py */ if (end_offset != 0) { tt_record3("copied out bytes %d-%d for id %d", start_offset, end_offset, rpc->id); end_offset = 0; } #endif /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ start = sched_clock(); +#endif /* See strip.py */ for (i = 0; i < n; i++) kfree_skb(skbs[i]); INC_METRIC(skb_free_ns, sched_clock() - start); @@ -348,9 +379,11 @@ int homa_copy_to_user(struct homa_rpc *rpc) if (error) break; } +#ifndef __STRIP__ /* See strip.py */ if (error) tt_record2("homa_copy_to_user returning error %d for id %d", -error, rpc->id); +#endif /* See strip.py */ return error; } @@ -456,27 +489,33 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) } } if (unlikely(!rpc)) { +#ifndef __STRIP__ /* See strip.py */ if (h->common.type != CUTOFFS && h->common.type != NEED_ACK && +#else /* See strip.py */ + if (h->common.type != NEED_ACK && +#endif /* See strip.py */ h->common.type != ACK && h->common.type != RESEND) { tt_record4("Discarding packet for unknown RPC, id %u, type %d, peer 0x%x:%d", id, h->common.type, tt_addr(saddr), ntohs(h->common.sport)); +#ifndef __STRIP__ /* See strip.py */ if (h->common.type != GRANT || homa_is_client(id)) INC_METRIC(unknown_rpcs, 1); +#endif /* See strip.py */ goto discard; } } else { if (h->common.type == DATA || +#ifndef __STRIP__ /* See strip.py */ h->common.type == GRANT || +#endif /* See strip.py */ h->common.type == BUSY || h->common.type == NEED_ACK) rpc->silent_ticks = 0; -#ifndef __STRIP__ /* See strip.py */ rpc->peer->outstanding_resends = 0; -#endif /* See strip.py */ } switch (h->common.type) { @@ -493,10 +532,12 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) homa_data_pkt(skb, rpc); INC_METRIC(packets_received[DATA - DATA], 1); break; +#ifndef __STRIP__ /* See strip.py */ case GRANT: INC_METRIC(packets_received[GRANT - DATA], 1); homa_grant_pkt(skb, rpc); break; +#endif /* See strip.py */ case RESEND: INC_METRIC(packets_received[RESEND - DATA], 1); homa_resend_pkt(skb, rpc, hsk); @@ -513,10 +554,12 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) * silent_ticks, which happened above. */ goto discard; +#ifndef __STRIP__ /* See strip.py */ case CUTOFFS: INC_METRIC(packets_received[CUTOFFS - DATA], 1); homa_cutoffs_pkt(skb, hsk); break; +#endif /* See strip.py */ case NEED_ACK: INC_METRIC(packets_received[NEED_ACK - DATA], 1); homa_need_ack_pkt(skb, hsk, rpc); @@ -535,7 +578,11 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) kfree_skb(skb); } if (rpc) +#ifndef __STRIP__ /* See strip.py */ homa_grant_check_rpc(rpc); /* Unlocks rpc. */ +#else /* See strip.py */ + homa_rpc_unlock(rpc); +#endif /* See strip.py */ while (num_acks > 0) { num_acks--; @@ -547,7 +594,9 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) * nor homa_timer can keep up with reaping dead * RPCs. See reap.txt for details. 
*/ +#ifndef __STRIP__ /* See strip.py */ u64 start = sched_clock(); +#endif /* See strip.py */ tt_record("homa_data_pkt calling homa_rpc_reap"); homa_rpc_reap(hsk, false); @@ -566,7 +615,9 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) { struct homa_data_hdr *h = (struct homa_data_hdr *)skb->data; +#ifndef __STRIP__ /* See strip.py */ struct homa *homa = rpc->hsk->homa; +#endif /* See strip.py */ tt_record4("incoming data packet, id %d, peer 0x%x, offset %d/%d", homa_local_id(h->common.sender_id), @@ -578,10 +629,16 @@ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) goto discard; INC_METRIC(responses_received, 1); rpc->state = RPC_INCOMING; +#ifndef __STRIP__ /* See strip.py */ tt_record2("Incoming message for id %d has %d unscheduled bytes", rpc->id, ntohl(h->incoming)); +#endif /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ if (homa_message_in_init(rpc, ntohl(h->message_length), ntohl(h->incoming)) != 0) +#else /* See strip.py */ + if (homa_message_in_init(rpc, ntohl(h->message_length)) != 0) +#endif /* See strip.py */ goto discard; } else if (rpc->state != RPC_INCOMING) { /* Must be server; note that homa_rpc_new_server already @@ -597,9 +654,14 @@ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) * exceed available cache space, resulting in poor * performance. */ +#ifndef __STRIP__ /* See strip.py */ tt_record4("Dropping packet because no buffer space available: id %d, offset %d, length %d, old incoming %d", rpc->id, ntohl(h->seg.offset), homa_data_len(skb), rpc->msgin.granted); +#else /* See strip.py */ + tt_record3("Dropping packet because no buffer space available: id %d, offset %d, length %d", + rpc->id, ntohl(h->seg.offset), homa_data_len(skb)); +#endif /* See strip.py */ INC_METRIC(dropped_data_no_bufs, homa_data_len(skb)); goto discard; } @@ -614,6 +676,7 @@ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) homa_sock_unlock(rpc->hsk); } +#ifndef __STRIP__ /* See strip.py */ if (ntohs(h->cutoff_version) != homa->cutoff_version) { /* The sender has out-of-date cutoffs. Note: we may need * to resend CUTOFFS packets if one gets lost, but we don't @@ -636,6 +699,7 @@ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) rpc->peer->last_update_jiffies = jiffies; } } +#endif /* See strip.py */ return; discard: @@ -643,6 +707,7 @@ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) UNIT_LOG("; ", "homa_data_pkt discarded packet"); } +#ifndef __STRIP__ /* See strip.py */ /** * homa_grant_pkt() - Handler for incoming GRANT packets * @skb: Incoming packet; size already verified large enough for header. 
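To make the convention in these hunks concrete, here is a minimal sketch (illustrative only, not part of the patch) of the two stages: development-only code is bracketed with #ifndef __STRIP__, an optional #else arm carries the simplified replacement, and strip.py presumably emits only the appropriate arm when generating the upstream version:

	/* As maintained in this repository: */
	#ifndef __STRIP__ /* See strip.py */
	int homa_message_in_init(struct homa_rpc *rpc, int length, int unsched);
	#else /* See strip.py */
	int homa_message_in_init(struct homa_rpc *rpc, int length);
	#endif /* See strip.py */

	/* After stripping, only the simplified declaration remains: */
	int homa_message_in_init(struct homa_rpc *rpc, int length);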
@@ -672,6 +737,7 @@ void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc) } kfree_skb(skb); } +#endif /* See strip.py */ /** * homa_resend_pkt() - Handler for incoming RESEND packets @@ -686,20 +752,23 @@ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, struct homa_sock *hsk) { struct homa_resend_hdr *h = (struct homa_resend_hdr *)skb->data; -#ifndef __STRIP__ /* See strip.py */ - const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); -#endif /* See strip.py */ struct homa_busy_hdr busy; if (!rpc) { tt_record4("resend request for unknown id %d, peer 0x%x:%d, offset %d; responding with UNKNOWN", - homa_local_id(h->common.sender_id), tt_addr(saddr), + homa_local_id(h->common.sender_id), + tt_addr(skb_canonical_ipv6_saddr(skb)), ntohs(h->common.sport), ntohl(h->offset)); homa_xmit_unknown(skb, hsk); goto done; } +#ifndef __STRIP__ /* See strip.py */ tt_record4("resend request for id %llu, offset %d, length %d, prio %d", rpc->id, ntohl(h->offset), ntohl(h->length), h->priority); +#else /* See strip.py */ + tt_record3("resend request for id %llu, offset %d, length %d", + rpc->id, ntohl(h->offset), ntohl(h->length)); +#endif /* See strip.py */ if (!homa_is_client(rpc->id) && rpc->state != RPC_OUTGOING) { /* We are the server for this RPC and don't yet have a @@ -710,6 +779,7 @@ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, homa_xmit_control(BUSY, &busy, sizeof(busy), rpc); goto done; } +#ifndef __STRIP__ /* See strip.py */ if (rpc->msgout.next_xmit_offset < rpc->msgout.granted) { /* We have chosen not to transmit data from this message; * send BUSY instead. @@ -729,6 +799,16 @@ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, ntohl(h->offset) + ntohl(h->length), h->priority); } +#else /* See strip.py */ + if (ntohl(h->length) == 0) + /* This RESEND is from a server just trying to determine + * whether the client still cares about the RPC; return + * BUSY so the server doesn't time us out. 
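+		 * (When length is 0 the homa_resend_data call below covers
+		 * an empty byte range, so nothing is actually retransmitted.)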
+ */ + homa_xmit_control(BUSY, &busy, sizeof(busy), rpc); + homa_resend_data(rpc, ntohl(h->offset), + ntohl(h->offset) + ntohl(h->length)); +#endif /* See strip.py */ done: kfree_skb(skb); @@ -752,27 +832,35 @@ void homa_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc) tt_record4("Restarting id %d to server 0x%x:%d, lost %d bytes", rpc->id, tt_addr(rpc->peer->addr), rpc->dport, rpc->msgout.next_xmit_offset); +#ifndef __STRIP__ /* See strip.py */ homa_freeze(rpc, RESTART_RPC, "Freezing because of RPC restart, id %d, peer 0x%x"); homa_resend_data(rpc, 0, rpc->msgout.next_xmit_offset, homa_unsched_priority(rpc->hsk->homa, rpc->peer, rpc->msgout.length)); +#else /* See strip.py */ + homa_resend_data(rpc, 0, rpc->msgout.next_xmit_offset); +#endif /* See strip.py */ goto done; } +#ifndef __STRIP__ /* See strip.py */ pr_err("Received unknown for RPC id %llu, peer %s:%d in bogus state %d; discarding unknown\n", rpc->id, homa_print_ipv6_addr(&rpc->peer->addr), rpc->dport, rpc->state); +#endif /* See strip.py */ tt_record4("Discarding unknown for RPC id %d, peer 0x%x:%d: bad state %d", rpc->id, tt_addr(rpc->peer->addr), rpc->dport, rpc->state); } else { +#ifndef __STRIP__ /* See strip.py */ if (rpc->hsk->homa->verbose) pr_notice("Ending rpc id %llu from client %s:%d: unknown to client", rpc->id, homa_print_ipv6_addr(&rpc->peer->addr), rpc->dport); +#endif /* See strip.py */ homa_rpc_end(rpc); INC_METRIC(server_rpcs_unknown, 1); } @@ -780,6 +868,7 @@ void homa_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc) kfree_skb(skb); } +#ifndef __STRIP__ /* See strip.py */ /** * homa_cutoffs_pkt() - Handler for incoming CUTOFFS packets * @skb: Incoming packet; size already verified large enough for header. @@ -802,6 +891,7 @@ void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk) } kfree_skb(skb); } +#endif /* See strip.py */ /** * homa_need_ack_pkt() - Handler for incoming NEED_ACK packets @@ -848,8 +938,10 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, ack.common.type = ACK; ack.common.sport = h->dport; ack.common.dport = h->sport; +#ifndef __STRIP__ /* See strip.py */ ack.common.flags = HOMA_TCP_FLAGS; ack.common.urgent = htons(HOMA_TCP_URGENT); +#endif /* See strip.py */ ack.common.sender_id = cpu_to_be64(id); ack.num_acks = htons(homa_peer_get_acks(peer, HOMA_MAX_ACKS_PER_PKT, @@ -904,6 +996,7 @@ void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, kfree_skb(skb); } +#ifndef __STRIP__ /* See strip.py */ /** * homa_choose_fifo_grant() - This function is invoked occasionally to give * a high-priority grant to the oldest incoming message. We do this in @@ -984,6 +1077,7 @@ struct homa_rpc *homa_choose_fifo_grant(struct homa *homa) } return oldest; } +#endif /* See strip.py */ /** * homa_rpc_abort() - Terminate an RPC. 
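As a concrete illustration of the restart path above (the numbers are invented): if a client has pushed rpc->msgout.next_xmit_offset to 200000 bytes of a 500000-byte request when the server responds with UNKNOWN, the stripped variant calls

	homa_resend_data(rpc, 0, 200000);

retransmitting everything sent so far, since the server evidently has no record of the RPC.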
@@ -1221,11 +1315,16 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, u64 id) __acquires(&rpc->bucket_lock) { +#ifndef __STRIP__ /* See strip.py */ u64 poll_start, poll_end, now; - int error, blocked = 0, polled = 0; +#endif /* See strip.py */ struct homa_rpc *result = NULL; struct homa_interest interest; struct homa_rpc *rpc = NULL; +#ifndef __STRIP__ /* See strip.py */ + int blocked = 0, polled = 0; +#endif /* See strip.py */ + int error; /* Each iteration of this loop finds an RPC, but it might not be * in a state where we can return it (e.g., there might be packets @@ -1267,6 +1366,7 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, goto found_rpc; } +#ifndef __STRIP__ /* See strip.py */ // tt_record4("Preparing to poll, socket %d, flags 0x%x, pid %d, poll_usecs %d", // hsk->port, flags, current->pid, // hsk->homa->poll_usecs); @@ -1303,20 +1403,27 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, tt_record2("Poll ended unsuccessfully on socket %d, pid %d", hsk->port, current->pid); INC_METRIC(poll_ns, now - poll_start); +#endif /* See strip.py */ /* Now it's time to sleep. */ +#ifndef __STRIP__ /* See strip.py */ per_cpu(homa_offload_core, interest.core).last_app_active = now; +#endif /* See strip.py */ set_current_state(TASK_INTERRUPTIBLE); rpc = homa_interest_get_rpc(&interest); if (!rpc && !hsk->shutdown) { +#ifndef __STRIP__ /* See strip.py */ u64 end; u64 start = sched_clock(); +#endif /* See strip.py */ tt_record1("homa_wait_for_message sleeping, pid %d", current->pid); schedule(); - end = sched_clock(); +#ifndef __STRIP__ /* See strip.py */ blocked = 1; + end = sched_clock(); +#endif /* See strip.py */ INC_METRIC(blocked_ns, end - start); } __set_current_state(TASK_RUNNING); @@ -1394,13 +1501,16 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, } done: +#ifndef __STRIP__ /* See strip.py */ if (blocked) INC_METRIC(slow_wakeups, 1); else if (polled) INC_METRIC(fast_wakeups, 1); +#endif /* See strip.py */ return rpc; } +#ifndef __STRIP__ /* See strip.py */ /** * homa_choose_interest() - Given a list of interests for an incoming * message, choose the best one to handle it (if any). @@ -1414,9 +1524,24 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, * interest whose thread is running on a core that isn't * currently busy doing Homa transport work. */ +#else /* See strip.py */ +/** + * homa_choose_interest() - Given a list of interests for an incoming + * message, choose the best one to handle it (if any). + * @homa: Overall information about the Homa transport. + * @head: Head pointers for the list of interest: either + * hsk->request_interests or hsk->response_interests. + * @offset: Offset of "next" pointers in the list elements (either + * offsetof(request_links) or offsetof(response_links). + * Return: An interest to use for the incoming message, or NULL if none + * is available. (future patch sets will fill in additional + * functionality in this function). + */ +#endif /* See strip.py */ struct homa_interest *homa_choose_interest(struct homa *homa, struct list_head *head, int offset) { +#ifndef __STRIP__ /* See strip.py */ u64 busy_time = sched_clock() - homa->busy_ns; struct homa_interest *backup = NULL; struct homa_interest *interest; @@ -1436,6 +1561,12 @@ struct homa_interest *homa_choose_interest(struct homa *homa, /* All interested threads are on busy cores; return the first. 
*/ return backup; +#else /* See strip.py */ + if (list_empty(head)) + return NULL; + else + return (struct homa_interest *)(((char *)head->next) - offset); +#endif /* See strip.py */ } /** @@ -1504,11 +1635,13 @@ void homa_rpc_handoff(struct homa_rpc *rpc) rpc->id, interest->thread->pid, task_cpu(interest->thread)); homa_interest_set_rpc(interest, rpc, 0); +#ifndef __STRIP__ /* See strip.py */ /* Update the last_app_active time for the thread's core, so Homa * will try to avoid doing any work there. */ per_cpu(homa_offload_core, interest->core).last_app_active = sched_clock(); +#endif /* See strip.py */ /* Clear the interest. This serves two purposes. First, it saves * the waking thread from acquiring the socket lock again, which @@ -1526,6 +1659,7 @@ void homa_rpc_handoff(struct homa_rpc *rpc) wake_up_process(interest->thread); } +#ifndef __STRIP__ /* See strip.py */ /** * homa_incoming_sysctl_changed() - Invoked whenever a sysctl value is changed; * any input-related parameters that depend on sysctl-settable values. @@ -1549,3 +1683,4 @@ void homa_incoming_sysctl_changed(struct homa *homa) homa->busy_ns = homa->busy_usecs * 1000; homa->gro_busy_ns = homa->gro_busy_usecs * 1000; } +#endif /* See strip.py */ diff --git a/homa_offload.c b/homa_offload.c index 4ec5d6f7..56947685 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -19,6 +19,7 @@ static const struct net_offload homa_offload = { }, }; +#ifndef __STRIP__ /* See strip.py */ /* Pointers to TCP's net_offload structures. NULL means homa_gro_hook_tcp * hasn't been called yet. */ @@ -31,6 +32,7 @@ static const struct net_offload *tcp6_net_offload; */ static struct net_offload hook_tcp_net_offload; static struct net_offload hook_tcp6_net_offload; +#endif /* See strip.py */ /** * homa_offload_init() - Invoked to enable GRO and GSO. Typically invoked @@ -78,6 +80,8 @@ int homa_offload_end(void) return res1 ? 
res1 : res2;
 }
 
+#ifndef __STRIP__ /* See strip.py */
+
 /**
  * homa_gro_hook_tcp() - Arranges for TCP gro_receive calls to be
  * mediated by this file, so that Homa-over-TCP packets can be retrieved
@@ -156,6 +160,8 @@ struct sk_buff *homa_tcp_gro_receive(struct list_head *held_list,
 	}
 	return homa_gro_receive(held_list, skb);
 }
+
+#endif /* See strip.py */
 
 /**
  * homa_set_softirq_cpu() - Arrange for SoftIRQ processing of a packet to
diff --git a/homa_offload.h b/homa_offload.h
index b0f21a8c..f5d1d106 100644
--- a/homa_offload.h
+++ b/homa_offload.h
@@ -76,8 +76,10 @@ DECLARE_PER_CPU(struct homa_offload_core, homa_offload_core);
 int homa_gro_complete(struct sk_buff *skb, int thoff);
 void homa_gro_gen2(struct homa *homa, struct sk_buff *skb);
 void homa_gro_gen3(struct homa *homa, struct sk_buff *skb);
+#ifndef __STRIP__ /* See strip.py */
 void homa_gro_hook_tcp(void);
 void homa_gro_unhook_tcp(void);
+#endif /* See strip.py */
 struct sk_buff *homa_gro_receive(struct list_head *gro_list,
 				 struct sk_buff *skb);
 struct sk_buff *homa_gso_segment(struct sk_buff *skb,
@@ -85,7 +87,9 @@ struct sk_buff *homa_gso_segment(struct sk_buff *skb,
 int homa_offload_end(void);
 int homa_offload_init(void);
 void homa_send_ipis(void);
+#ifndef __STRIP__ /* See strip.py */
 struct sk_buff *homa_tcp_gro_receive(struct list_head *held_list,
 				     struct sk_buff *skb);
+#endif /* See strip.py */
 
 #endif /* _HOMA_OFFLOAD_H */
diff --git a/homa_outgoing.c b/homa_outgoing.c
index 1cbea602..9e309eb3 100644
--- a/homa_outgoing.c
+++ b/homa_outgoing.c
@@ -7,9 +7,15 @@
 #include "homa_impl.h"
 #include "homa_peer.h"
 #include "homa_rpc.h"
+#ifndef __STRIP__ /* See strip.py */
 #include "homa_skb.h"
+#endif /* See strip.py */
 #include "homa_wire.h"
 
+#ifdef __STRIP__ /* See strip.py */
+#include "homa_stub.h"
+#endif /* See strip.py */
+
 /**
  * homa_message_out_init() - Initialize rpc->msgout.
  * @rpc: RPC whose output message should be initialized.
@@ -23,13 +29,16 @@ void homa_message_out_init(struct homa_rpc *rpc, int length)
 	rpc->msgout.packets = NULL;
 	rpc->msgout.next_xmit = &rpc->msgout.packets;
 	rpc->msgout.next_xmit_offset = 0;
+#ifndef __STRIP__ /* See strip.py */
 	rpc->msgout.unscheduled = rpc->hsk->homa->unsched_bytes;
 	if (rpc->msgout.unscheduled > length)
 		rpc->msgout.unscheduled = length;
 	rpc->msgout.sched_priority = 0;
+#endif /* See strip.py */
 	rpc->msgout.init_ns = sched_clock();
 }
 
+#ifndef __STRIP__ /* See strip.py */
 /**
  * homa_fill_data_interleaved() - This function is invoked to fill in the
  * part of a data packet after the initial header, when GSO is being used
@@ -43,6 +52,21 @@ void homa_message_out_init(struct homa_rpc *rpc, int length)
  * space.
  * Return: Either a negative errno or 0 (for success).
  */
+#else /* See strip.py */
+/**
+ * homa_fill_data_interleaved() - This function is invoked to fill in the
+ * part of a data packet after the initial header, when GSO is being used.
+ * homa_seg_hdrs must be interleaved with the data to provide the correct
+ * offset for each segment.
+ * @rpc:      RPC whose output message is being created.
+ * @skb:      The packet being filled. The initial homa_data_hdr was
+ *            created and initialized by the caller and the
+ *            homa_skb_info has been filled in with the packet geometry.
+ * @iter:     Describes location(s) of (remaining) message data in user
+ *            space.
+ * Return: Either a negative errno or 0 (for success).
+ */ +#endif /* See strip.py */ int homa_fill_data_interleaved(struct homa_rpc *rpc, struct sk_buff *skb, struct iov_iter *iter) { @@ -112,7 +136,12 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, do_div(segs, max_seg_data); /* Initialize the overall skb. */ +#ifndef __STRIP__ /* See strip.py */ skb = homa_skb_new_tx(sizeof32(struct homa_data_hdr)); +#else /* See strip.py */ + skb = homa_skb_new_tx(sizeof32(struct homa_data_hdr) + length + + segs * sizeof32(struct homa_seg_hdr)); +#endif /* See strip.py */ if (!skb) return ERR_PTR(-ENOMEM); @@ -125,17 +154,29 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, h->common.sequence = htonl(offset); h->common.type = DATA; homa_set_doff(h, sizeof(struct homa_data_hdr)); +#ifndef __STRIP__ /* See strip.py */ h->common.flags = HOMA_TCP_FLAGS; +#endif /* See strip.py */ h->common.checksum = 0; +#ifndef __STRIP__ /* See strip.py */ h->common.urgent = htons(HOMA_TCP_URGENT); +#endif /* See strip.py */ h->common.sender_id = cpu_to_be64(rpc->id); h->message_length = htonl(rpc->msgout.length); +#ifndef __STRIP__ /* See strip.py */ h->incoming = htonl(rpc->msgout.unscheduled); +#endif /* See strip.py */ h->ack.client_id = 0; homa_peer_get_acks(rpc->peer, 1, &h->ack); +#ifndef __STRIP__ /* See strip.py */ h->cutoff_version = rpc->peer->cutoff_version; +#endif /* See strip.py */ h->retransmit = 0; +#ifndef __STRIP__ /* See strip.py */ h->seg.offset = htonl(-1); +#else /* See strip.py */ + h->seg.offset = htonl(offset); +#endif /* See strip.py */ homa_info = homa_get_skb_info(skb); homa_info->next_skb = NULL; @@ -145,10 +186,16 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, homa_info->seg_length = max_seg_data; homa_info->offset = offset; +#ifndef __STRIP__ /* See strip.py */ if (segs > 1 && rpc->hsk->sock.sk_protocol != IPPROTO_TCP) { +#else /* See strip.py */ + if (segs > 1) { +#endif /* See strip.py */ homa_set_doff(h, sizeof(struct homa_data_hdr) - sizeof32(struct homa_seg_hdr)); +#ifndef __STRIP__ /* See strip.py */ h->seg.offset = htonl(offset); +#endif /* See strip.py */ gso_size = max_seg_data + sizeof(struct homa_seg_hdr); err = homa_fill_data_interleaved(rpc, skb, iter); } else { @@ -240,6 +287,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) if (gso_size > rpc->hsk->homa->max_gso_size) gso_size = rpc->hsk->homa->max_gso_size; +#ifndef __STRIP__ /* See strip.py */ /* Round gso_size down to an even # of mtus; calculation depends * on whether we're doing TCP hijacking (need more space in TSO packet * if no hijacking). @@ -257,6 +305,14 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) do_div(segs_per_gso, max_seg_data + sizeof(struct homa_seg_hdr)); } +#else /* See strip.py */ + /* Round gso_size down to an even # of mtus. */ + segs_per_gso = gso_size - rpc->hsk->ip_header_length - + sizeof(struct homa_data_hdr) + + sizeof(struct homa_seg_hdr); + do_div(segs_per_gso, max_seg_data + + sizeof(struct homa_seg_hdr)); +#endif /* See strip.py */ if (segs_per_gso == 0) segs_per_gso = 1; max_gso_data = segs_per_gso * max_seg_data; @@ -264,12 +320,19 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) mtu, max_seg_data, max_gso_data); overlap_xmit = rpc->msgout.length > 2 * max_gso_data; +#ifndef __STRIP__ /* See strip.py */ rpc->msgout.granted = rpc->msgout.unscheduled; +#endif /* See strip.py */ homa_skb_stash_pages(rpc->hsk->homa, rpc->msgout.length); /* Each iteration of the loop below creates one GSO packet. 
*/ +#ifndef __STRIP__ /* See strip.py */ tt_record3("starting copy from user space for id %d, length %d, unscheduled %d", rpc->id, rpc->msgout.length, rpc->msgout.unscheduled); +#else /* See strip.py */ + tt_record2("starting copy from user space for id %d, length %d", + rpc->id, rpc->msgout.length); +#endif /* See strip.py */ last_link = &rpc->msgout.packets; for (bytes_left = rpc->msgout.length; bytes_left > 0; ) { int skb_data_bytes, offset; @@ -278,6 +341,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) homa_rpc_unlock(rpc); skb_data_bytes = max_gso_data; offset = rpc->msgout.length - bytes_left; +#ifndef __STRIP__ /* See strip.py */ if (offset < rpc->msgout.unscheduled && (offset + skb_data_bytes) > rpc->msgout.unscheduled) { /* Insert a packet boundary at the unscheduled limit, @@ -285,6 +349,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) */ skb_data_bytes = rpc->msgout.unscheduled - offset; } +#endif /* See strip.py */ if (skb_data_bytes > bytes_left) skb_data_bytes = bytes_left; skb = homa_new_data_packet(rpc, iter, offset, skb_data_bytes, @@ -309,7 +374,11 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) rpc->msgout.num_skbs++; rpc->msgout.copied_from_user = rpc->msgout.length - bytes_left; if (overlap_xmit && list_empty(&rpc->throttled_links) && +#ifndef __STRIP__ /* See strip.py */ xmit && offset < rpc->msgout.granted) { +#else /* See strip.py */ + xmit) { +#endif /* See strip.py */ tt_record1("waking up pacer for id %d", rpc->id); homa_add_to_throttled(rpc); } @@ -349,8 +418,10 @@ int homa_xmit_control(enum homa_packet_type type, void *contents, h->type = type; h->sport = htons(rpc->hsk->port); h->dport = htons(rpc->dport); +#ifndef __STRIP__ /* See strip.py */ h->flags = HOMA_TCP_FLAGS; h->urgent = htons(HOMA_TCP_URGENT); +#endif /* See strip.py */ h->sender_id = cpu_to_be64(rpc->id); return __homa_xmit_control(contents, length, rpc->peer, rpc->hsk); } @@ -376,9 +447,12 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, #endif /* See strip.py */ struct homa_common_hdr *h; struct dst_entry *dst; - int result, priority; struct sk_buff *skb; int extra_bytes; +#ifndef __STRIP__ /* See strip.py */ + int priority; +#endif /* See strip.py */ + int result; dst = homa_get_dst(peer, hsk); skb = homa_skb_new_tx(HOMA_MAX_HEADER); @@ -395,9 +469,12 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, UNIT_LOG(",", "padded control packet with %d bytes", extra_bytes); } +#ifndef __STRIP__ /* See strip.py */ priority = hsk->homa->num_priorities - 1; +#endif /* See strip.py */ skb->ooo_okay = 1; skb_get(skb); +#ifndef __STRIP__ /* See strip.py */ if (hsk->inet.sk.sk_family == AF_INET6) { result = ip6_xmit(&hsk->inet.sk, skb, &peer->flow.u.ip6, 0, NULL, hsk->homa->priority_map[priority] << 4, @@ -407,6 +484,13 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, hsk->inet.tos = hsk->homa->priority_map[priority] << 5; result = ip_queue_xmit(&hsk->inet.sk, skb, &peer->flow); } +#else /* See strip.py */ + if (hsk->inet.sk.sk_family == AF_INET6) + result = ip6_xmit(&hsk->inet.sk, skb, &peer->flow.u.ip6, 0, + NULL, 0, 0); + else + result = ip_queue_xmit(&hsk->inet.sk, skb, &peer->flow); +#endif /* See strip.py */ if (unlikely(result != 0)) { INC_METRIC(control_xmit_errors, 1); @@ -442,7 +526,7 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, #ifndef __STRIP__ /* See 
strip.py */ txq = netdev_get_tx_queue(skb->dev, skb->queue_mapping); if (netif_tx_queue_stopped(txq)) - tt_record4("__homa_xmit_control found stopped txq for id %d, qid %d, num_queued %d, limit %d", + tt_record4("__homa_xmit_control found stopped txq for id %d, qid %u, num_queued %u, limit %d", be64_to_cpu(h->sender_id), skb->queue_mapping, txq->dql.num_queued, txq->dql.adj_limit); #endif /* See strip.py */ @@ -466,18 +550,22 @@ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk) struct homa_unknown_hdr unknown; struct homa_peer *peer; +#ifndef __STRIP__ /* See strip.py */ if (hsk->homa->verbose) pr_notice("sending UNKNOWN to peer %s:%d for id %llu", homa_print_ipv6_addr(&saddr), ntohs(h->sport), homa_local_id(h->sender_id)); +#endif /* See strip.py */ tt_record3("sending unknown to 0x%x:%d for id %llu", tt_addr(saddr), ntohs(h->sport), homa_local_id(h->sender_id)); unknown.common.sport = h->dport; unknown.common.dport = h->sport; unknown.common.type = UNKNOWN; +#ifndef __STRIP__ /* See strip.py */ unknown.common.flags = HOMA_TCP_FLAGS; unknown.common.urgent = htons(HOMA_TCP_URGENT); +#endif /* See strip.py */ unknown.common.sender_id = cpu_to_be64(homa_local_id(h->sender_id)); peer = homa_peer_find(hsk->homa->peers, &saddr, &hsk->inet); if (!IS_ERR(peer)) @@ -510,15 +598,19 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) homa_rpc_hold(rpc); while (*rpc->msgout.next_xmit) { +#ifndef __STRIP__ /* See strip.py */ int priority; +#endif /* See strip.py */ struct sk_buff *skb = *rpc->msgout.next_xmit; +#ifndef __STRIP__ /* See strip.py */ if (rpc->msgout.next_xmit_offset >= rpc->msgout.granted) { tt_record3("homa_xmit_data stopping at offset %d for id %u: granted is %d", rpc->msgout.next_xmit_offset, rpc->id, rpc->msgout.granted); break; } +#endif /* See strip.py */ if ((rpc->msgout.length - rpc->msgout.next_xmit_offset) >= homa->throttle_min_bytes) { @@ -530,25 +622,29 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) } } +#ifndef __STRIP__ /* See strip.py */ if (rpc->msgout.next_xmit_offset < rpc->msgout.unscheduled) { priority = homa_unsched_priority(homa, rpc->peer, rpc->msgout.length); } else { priority = rpc->msgout.sched_priority; } +#endif /* See strip.py */ rpc->msgout.next_xmit = &(homa_get_skb_info(skb)->next_skb); rpc->msgout.next_xmit_offset += homa_get_skb_info(skb)->data_bytes; homa_rpc_unlock(rpc); skb_get(skb); - __homa_xmit_data(skb, rpc, priority); #ifndef __STRIP__ /* See strip.py */ + __homa_xmit_data(skb, rpc, priority); txq = netdev_get_tx_queue(skb->dev, skb->queue_mapping); if (netif_tx_queue_stopped(txq)) tt_record4("homa_xmit_data found stopped txq for id %d, qid %d, num_queued %d, limit %d", rpc->id, skb->queue_mapping, txq->dql.num_queued, txq->dql.adj_limit); +#else /* See strip.py */ + __homa_xmit_data(skb, rpc); #endif /* See strip.py */ force = false; homa_rpc_lock(rpc); @@ -558,6 +654,7 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) homa_rpc_put(rpc); } +#ifndef __STRIP__ /* See strip.py */ /** * __homa_xmit_data() - Handles packet transmission stuff that is common * to homa_xmit_data and homa_resend_data. @@ -567,11 +664,19 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) * @priority: Priority level at which to transmit the packet. 
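 *		In the transmit calls below this is mapped onto wire bits
 *		via priority_map: shifted into the IPv6 traffic class
 *		(<< 4) or the IPv4 TOS field (<< 5).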
*/ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority) -{ -#ifndef __STRIP__ /* See strip.py */ - struct homa_skb_info *homa_info = homa_get_skb_info(skb); +#else /* See strip.py */ +/** + * __homa_xmit_data() - Handles packet transmission stuff that is common + * to homa_xmit_data and homa_resend_data. + * @skb: Packet to be sent. The packet will be freed after transmission + * (and also if errors prevented transmission). + * @rpc: Information about the RPC that the packet belongs to. + */ +void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc) #endif /* See strip.py */ +{ struct dst_entry *dst; +#ifndef __STRIP__ /* See strip.py */ int err; /* Update info that may have changed since the message was initially @@ -579,6 +684,7 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority) */ ((struct homa_data_hdr *)skb_transport_header(skb))->cutoff_version = rpc->peer->cutoff_version; +#endif /* See strip.py */ dst = homa_get_dst(rpc->peer, rpc->hsk); dst_hold(dst); @@ -592,30 +698,42 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority) tt_record4("calling ip6_xmit: wire_bytes %d, peer 0x%x, id %d, offset %d", homa_get_skb_info(skb)->wire_bytes, tt_addr(rpc->peer->addr), rpc->id, - homa_info->offset); + homa_get_skb_info(skb)->offset); +#ifndef __STRIP__ /* See strip.py */ err = ip6_xmit(&rpc->hsk->inet.sk, skb, &rpc->peer->flow.u.ip6, 0, NULL, rpc->hsk->homa->priority_map[priority] << 4, 0); +#else /* See strip.py */ + ip6_xmit(&rpc->hsk->inet.sk, skb, &rpc->peer->flow.u.ip6, + 0, NULL, 0, 0); +#endif /* See strip.py */ } else { tt_record4("calling ip_queue_xmit: wire_bytes %d, peer 0x%x, id %d, offset %d", homa_get_skb_info(skb)->wire_bytes, tt_addr(rpc->peer->addr), rpc->id, - homa_info->offset); + homa_get_skb_info(skb)->offset); +#ifndef __STRIP__ /* See strip.py */ rpc->hsk->inet.tos = rpc->hsk->homa->priority_map[priority] << 5; err = ip_queue_xmit(&rpc->hsk->inet.sk, skb, &rpc->peer->flow); +#else /* See strip.py */ + ip_queue_xmit(&rpc->hsk->inet.sk, skb, &rpc->peer->flow); +#endif /* See strip.py */ } tt_record4("Finished queueing packet: rpc id %llu, offset %d, len %d, qid %d", - rpc->id, homa_info->offset, + rpc->id, homa_get_skb_info(skb)->offset, homa_get_skb_info(skb)->data_bytes, skb->queue_mapping); +#ifndef __STRIP__ /* See strip.py */ if (err) INC_METRIC(data_xmit_errors, 1); +#endif /* See strip.py */ INC_METRIC(packets_sent[0], 1); INC_METRIC(priority_bytes[priority], skb->len); INC_METRIC(priority_packets[priority], 1); } +#ifndef __STRIP__ /* See strip.py */ /** * homa_resend_data() - This function is invoked as part of handling RESEND * requests. It retransmits the packet(s) containing a given range of bytes @@ -628,6 +746,18 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority) */ void homa_resend_data(struct homa_rpc *rpc, int start, int end, int priority) +#else /* See strip.py */ +/** + * homa_resend_data() - This function is invoked as part of handling RESEND + * requests. It retransmits the packet(s) containing a given range of bytes + * from a message. + * @rpc: RPC for which data should be resent. + * @start: Offset within @rpc->msgout of the first byte to retransmit. + * @end: Offset within @rpc->msgout of the byte just after the last one + * to retransmit. 
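+ *		For example, start = 10000 and end = 20000 retransmits the
+ *		packet(s) covering bytes 10000 through 19999.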
+ */ +void homa_resend_data(struct homa_rpc *rpc, int start, int end) +#endif /* See strip.py */ { struct homa_skb_info *homa_info; struct sk_buff *skb; @@ -674,12 +804,19 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end, continue; /* This segment must be retransmitted. */ +#ifndef __STRIP__ /* See strip.py */ new_skb = homa_skb_new_tx(sizeof(struct homa_data_hdr) - sizeof(struct homa_seg_hdr)); +#else /* See strip.py */ + new_skb = homa_skb_new_tx(sizeof(struct homa_data_hdr) + + seg_length); +#endif /* See strip.py */ if (unlikely(!new_skb)) { +#ifndef __STRIP__ /* See strip.py */ if (rpc->hsk->homa->verbose) pr_notice("%s couldn't allocate skb\n", __func__); +#endif /* See strip.py */ UNIT_LOG("; ", "skb allocation error"); goto resend_done; } @@ -688,12 +825,14 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end, h->common.sequence = htonl(offset); h->seg.offset = htonl(offset); h->retransmit = 1; +#ifndef __STRIP__ /* See strip.py */ if ((offset + seg_length) <= rpc->msgout.granted) h->incoming = htonl(rpc->msgout.granted); else if ((offset + seg_length) > rpc->msgout.length) h->incoming = htonl(rpc->msgout.length); else h->incoming = htonl(offset + seg_length); +#endif /* See strip.py */ err = homa_skb_append_from_skb(rpc->hsk->homa, new_skb, skb, seg_offset, seg_length); @@ -716,7 +855,11 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end, tt_record3("retransmitting offset %d, length %d, id %d", offset, seg_length, rpc->id); homa_check_nic_queue(rpc->hsk->homa, new_skb, true); +#ifndef __STRIP__ /* See strip.py */ __homa_xmit_data(new_skb, rpc, priority); +#else /* See strip.py */ + __homa_xmit_data(new_skb, rpc); +#endif /* See strip.py */ INC_METRIC(resent_packets, 1); } } @@ -725,6 +868,7 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end, return; } +#ifndef __STRIP__ /* See strip.py */ /** * homa_outgoing_sysctl_changed() - Invoked whenever a sysctl value is changed; * any output-related parameters that depend on sysctl-settable values. @@ -741,6 +885,7 @@ void homa_outgoing_sysctl_changed(struct homa *homa) do_div(tmp, homa->link_mbps); homa->ns_per_mbyte = tmp; } +#endif /* See strip.py */ /** * homa_check_nic_queue() - This function is invoked before passing a packet @@ -773,9 +918,9 @@ int homa_check_nic_queue(struct homa *homa, struct sk_buff *skb, bool force) if ((clock + homa->max_nic_queue_ns) < idle && !force && !(homa->flags & HOMA_FLAG_DONT_THROTTLE)) return 0; +#ifndef __STRIP__ /* See strip.py */ if (!list_empty(&homa->throttled_rpcs)) INC_METRIC(pacer_bytes, bytes); -#ifndef __STRIP__ /* See strip.py */ if (idle < clock) { if (homa->pacer_wake_time) { u64 lost = (homa->pacer_wake_time > idle) @@ -951,8 +1096,12 @@ bool homa_pacer_xmit(struct homa *homa) /* Note: rpc->state could be RPC_DEAD here, but the code * below should work anyway. */ +#ifndef __STRIP__ /* See strip.py */ if (!*rpc->msgout.next_xmit || rpc->msgout.next_xmit_offset >= rpc->msgout.granted) { +#else /* See strip.py */ + if (!*rpc->msgout.next_xmit) { +#endif /* See strip.py */ /* Nothing more to transmit from this message (right * now), so remove it from the throttled list. 
*/ @@ -1003,8 +1152,10 @@ void homa_add_to_throttled(struct homa_rpc *rpc) if (!list_empty(&rpc->throttled_links)) return; now = sched_clock(); +#ifndef __STRIP__ /* See strip.py */ if (!list_empty(&homa->throttled_rpcs)) INC_METRIC(throttled_ns, now - homa->throttle_add); +#endif /* See strip.py */ homa->throttle_add = now; bytes_left = rpc->msgout.length - rpc->msgout.next_xmit_offset; homa_throttle_lock(homa); @@ -1045,14 +1196,17 @@ void homa_remove_from_throttled(struct homa_rpc *rpc) UNIT_LOG("; ", "removing id %llu from throttled list", rpc->id); homa_throttle_lock(rpc->hsk->homa); list_del(&rpc->throttled_links); +#ifndef __STRIP__ /* See strip.py */ if (list_empty(&rpc->hsk->homa->throttled_rpcs)) INC_METRIC(throttled_ns, sched_clock() - rpc->hsk->homa->throttle_add); +#endif /* See strip.py */ homa_throttle_unlock(rpc->hsk->homa); INIT_LIST_HEAD(&rpc->throttled_links); } } +#ifndef __STRIP__ /* See strip.py */ /** * homa_log_throttled() - Print information to the system log about the * RPCs on the throttled list. @@ -1083,3 +1237,4 @@ void homa_log_throttled(struct homa *homa) pr_notice("Finished printing throttle list: %d rpcs, %lld bytes\n", rpcs, bytes); } +#endif /* See strip.py */ diff --git a/homa_peer.c b/homa_peer.c index d2cff024..d23103af 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -232,9 +232,7 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab, INIT_LIST_HEAD(&peer->grantable_links); #endif /* See strip.py */ hlist_add_head_rcu(&peer->peertab_links, &peertab->buckets[bucket]); -#ifndef __STRIP__ /* See strip.py */ peer->current_ticks = -1; -#endif /* See strip.py */ spin_lock_init(&peer->ack_lock); INC_METRIC(peer_new_entries, 1); @@ -267,11 +265,13 @@ void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, dst = homa_peer_get_dst(peer, &hsk->inet); if (IS_ERR(dst)) { +#ifndef __STRIP__ /* See strip.py */ /* Retain the existing dst if we can't create a new one. */ if (hsk->homa->verbose) pr_notice("%s couldn't recreate dst: error %ld", __func__, PTR_ERR(dst)); INC_METRIC(peer_route_errors, 1); +#endif /* See strip.py */ kfree(save_dead); return; } @@ -279,13 +279,14 @@ void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, spin_lock_bh(&peertab->write_lock); now = sched_clock(); save_dead->dst = peer->dst; - save_dead->gc_time = now + 100000000; + save_dead->gc_time = now + 100000000; /* 100 ms */ list_add_tail(&save_dead->dst_links, &peertab->dead_dsts); homa_peertab_gc_dsts(peertab, now); peer->dst = dst; spin_unlock_bh(&peertab->write_lock); } +#ifndef __STRIP__ /* See strip.py */ /** * homa_unsched_priority() - Returns the priority level to use for * unscheduled packets of a message. @@ -306,6 +307,7 @@ int homa_unsched_priority(struct homa *homa, struct homa_peer *peer, } /* Can't ever get here */ } +#endif /* See strip.py */ /** * homa_peer_get_dst() - Find an appropriate dst structure (either IPv4 diff --git a/homa_peer.h b/homa_peer.h index 3ac0f5da..f85dff11 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -139,7 +139,6 @@ struct homa_peer { */ struct hlist_node peertab_links; -#ifndef __STRIP__ /* See strip.py */ /** * @outstanding_resends: the number of resend requests we have * sent to this server (spaced @homa.resend_interval apart) since @@ -179,7 +178,6 @@ struct homa_peer { * in the current pass, if it still needs one. 
*/ struct homa_rpc *resend_rpc; -#endif /* See strip.py */ /** * @num_acks: the number of (initial) entries in @acks that diff --git a/homa_plumbing.c b/homa_plumbing.c index 2018c04f..2b98d0c0 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -5,7 +5,9 @@ */ #include "homa_impl.h" +#ifndef __STRIP__ /* See strip.py */ #include "homa_offload.h" +#endif /* See strip.py */ #include "homa_peer.h" #include "homa_pool.h" @@ -35,10 +37,12 @@ static bool exiting; /* Thread that runs timer code to detect lost packets and crashed peers. */ static struct task_struct *timer_kthread; +#ifndef __STRIP__ /* See strip.py */ /* Set via sysctl to request that a particular action be taken. The value * written determines the action. */ static int action; +#endif /* See strip.py */ /* This structure defines functions that handle various operations on * Homa sockets. These functions are relatively generic: they are called @@ -175,6 +179,7 @@ static struct inet6_protocol homav6_protocol = { .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, }; +#ifndef __STRIP__ /* See strip.py */ /* Describes file operations implemented for /proc/net/homa_metrics. */ static const struct proc_ops homa_metrics_pops = { .proc_open = homa_metrics_open, @@ -493,8 +498,10 @@ static struct ctl_table homa_ctl_table[] = { {} #endif }; +#endif /* See strip.py */ /* Sizes of the headers for each Homa packet type, in bytes. */ +#ifndef __STRIP__ /* See strip.py */ static __u16 header_lengths[] = { sizeof32(struct homa_data_hdr), sizeof32(struct homa_grant_hdr), @@ -506,9 +513,24 @@ static __u16 header_lengths[] = { sizeof32(struct homa_need_ack_hdr), sizeof32(struct homa_ack_hdr) }; +#else /* See strip.py */ +static __u16 header_lengths[] = { + sizeof32(struct homa_data_hdr), + 0, + sizeof32(struct homa_resend_hdr), + sizeof32(struct homa_unknown_hdr), + sizeof32(struct homa_busy_hdr), + 0, + 0, + sizeof32(struct homa_need_ack_hdr), + sizeof32(struct homa_ack_hdr) +}; +#endif /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ /* Used to remove sysctl values when the module is unloaded. 
*/ static struct ctl_table_header *homa_ctl_header; +#endif /* See strip.py */ static DECLARE_COMPLETION(timer_thread_done); @@ -523,11 +545,10 @@ int __init homa_load(void) pr_notice("Homa module loading\n"); #ifndef __STRIP__ /* See strip.py */ - pr_notice("Homa structure sizes: homa_data_hdr %u, homa_seg_hdr %u, ack %u, homa_grant_hdr %u, peer %u, ip_hdr %u flowi %u ipv6_hdr %u, flowi6 %u tcp_sock %u homa_rpc %u sk_buff %u rcvmsg_control %u union sockaddr_in_union %u HOMA_MAX_BPAGES %u NR_CPUS %u nr_cpu_ids %u, MAX_NUMNODES %d\n", + pr_notice("Homa structure sizes: homa_data_hdr %u, homa_seg_hdr %u, ack %u, peer %u, ip_hdr %u flowi %u ipv6_hdr %u, flowi6 %u tcp_sock %u homa_rpc %u sk_buff %u rcvmsg_control %u union sockaddr_in_union %u HOMA_MAX_BPAGES %u NR_CPUS %u nr_cpu_ids %u, MAX_NUMNODES %d\n", sizeof32(struct homa_data_hdr), sizeof32(struct homa_seg_hdr), sizeof32(struct homa_ack), - sizeof32(struct homa_grant_hdr), sizeof32(struct homa_peer), sizeof32(struct iphdr), sizeof32(struct flowi), @@ -576,6 +597,7 @@ int __init homa_load(void) status = homa_init(homa); if (status) goto homa_init_err; +#ifndef __STRIP__ /* See strip.py */ metrics_dir_entry = proc_create("homa_metrics", 0444, init_net.proc_net, &homa_metrics_pops); if (!metrics_dir_entry) { @@ -597,6 +619,7 @@ int __init homa_load(void) pr_err("Homa couldn't init offloads\n"); goto offload_err; } +#endif /* See strip.py */ timer_kthread = kthread_run(homa_timer_main, homa, "homa_timer"); if (IS_ERR(timer_kthread)) { @@ -607,20 +630,24 @@ int __init homa_load(void) goto timer_err; } - homa_gro_hook_tcp(); #ifndef __STRIP__ /* See strip.py */ + homa_gro_hook_tcp(); +#endif /* See strip.py */ +#ifndef __UPSTREAM__ /* See strip.py */ tt_init("timetrace", homa->temp); #endif /* See strip.py */ return 0; timer_err: +#ifndef __STRIP__ /* See strip.py */ homa_offload_end(); offload_err: unregister_net_sysctl_table(homa_ctl_header); sysctl_err: proc_remove(metrics_dir_entry); metrics_err: +#endif /* See strip.py */ homa_destroy(homa); homa_init_err: inet6_del_protocol(&homav6_protocol, IPPROTO_HOMA); @@ -647,18 +674,21 @@ void __exit homa_unload(void) pr_notice("Homa module unloading\n"); exiting = true; -#ifndef __STRIP__ /* See strip.py */ +#ifndef __UPSTREAM__ /* See strip.py */ tt_destroy(); #endif /* See strip.py */ - +#ifndef __STRIP__ /* See strip.py */ homa_gro_unhook_tcp(); +#endif /* See strip.py */ if (timer_kthread) wake_up_process(timer_kthread); + wait_for_completion(&timer_thread_done); +#ifndef __STRIP__ /* See strip.py */ if (homa_offload_end() != 0) pr_err("Homa couldn't stop offloads\n"); - wait_for_completion(&timer_thread_done); unregister_net_sysctl_table(homa_ctl_header); proc_remove(metrics_dir_entry); +#endif /* See strip.py */ homa_destroy(homa); inet_del_protocol(&homa_protocol, IPPROTO_HOMA); inet_unregister_protosw(&homa_protosw); @@ -713,8 +743,10 @@ void homa_close(struct sock *sk, long timeout) homa_sock_destroy(hsk); sk_common_release(sk); tt_record1("closed socket, port %d", hsk->port); +#ifndef __STRIP__ /* See strip.py */ if (hsk->homa->freeze_type == SOCKET_CLOSE) tt_freeze(); +#endif /* See strip.py */ } /** @@ -746,6 +778,7 @@ int homa_disconnect(struct sock *sk, int flags) return -EINVAL; } +#ifndef __STRIP__ /* See strip.py */ /** * homa_ioc_abort() - The top-level function for the ioctl that implements * the homa_abort user-level API. @@ -781,6 +814,7 @@ int homa_ioc_abort(struct sock *sk, int *karg) homa_rpc_unlock(rpc); /* Locked by homa_find_client_rpc. 
*/ return ret; } +#endif /* See strip.py */ /** * homa_ioctl() - Implements the ioctl system call for Homa sockets. @@ -793,6 +827,7 @@ int homa_ioc_abort(struct sock *sk, int *karg) */ int homa_ioctl(struct sock *sk, int cmd, int *karg) { +#ifndef __STRIP__ /* See strip.py */ int result; u64 start = sched_clock(); @@ -814,6 +849,9 @@ int homa_ioctl(struct sock *sk, int cmd, int *karg) break; } return result; +#else /* See strip.py */ + return -EINVAL; +#endif /* See strip.py */ } /** @@ -850,7 +888,9 @@ int homa_setsockopt(struct sock *sk, int level, int optname, { struct homa_sock *hsk = homa_sk(sk); struct homa_rcvbuf_args args; +#ifndef __STRIP__ /* See strip.py */ u64 start = sched_clock(); +#endif /* See strip.py */ int ret; if (level != IPPROTO_HOMA || optname != SO_HOMA_RCVBUF) @@ -926,13 +966,17 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) struct homa_sock *hsk = homa_sk(sk); struct homa_sendmsg_args args; union sockaddr_in_union *addr; +#ifndef __STRIP__ /* See strip.py */ u64 start = sched_clock(); +#endif /* See strip.py */ struct homa_rpc *rpc = NULL; int result = 0; +#ifndef __STRIP__ /* See strip.py */ u64 finish; per_cpu(homa_offload_core, raw_smp_processor_id()).last_app_active = start; +#endif /* See strip.py */ addr = (union sockaddr_in_union *)msg->msg_name; if (!addr) { @@ -990,7 +1034,9 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) result = -EFAULT; goto error; } +#ifndef __STRIP__ /* See strip.py */ finish = sched_clock(); +#endif /* See strip.py */ INC_METRIC(send_ns, finish - start); } else { /* This is a response message. */ @@ -1035,7 +1081,9 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) if (result && rpc->state != RPC_DEAD) goto error; homa_rpc_unlock(rpc); /* Locked by homa_find_server_rpc. */ +#ifndef __STRIP__ /* See strip.py */ finish = sched_clock(); +#endif /* See strip.py */ INC_METRIC(reply_ns, finish - start); } tt_record1("homa_sendmsg finished, id %d", args.id); @@ -1066,13 +1114,19 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, { struct homa_sock *hsk = homa_sk(sk); struct homa_recvmsg_args control; +#ifndef __STRIP__ /* See strip.py */ u64 start = sched_clock(); +#endif /* See strip.py */ struct homa_rpc *rpc; +#ifndef __STRIP__ /* See strip.py */ u64 finish; +#endif /* See strip.py */ int result; INC_METRIC(recv_calls, 1); +#ifndef __STRIP__ /* See strip.py */ per_cpu(homa_offload_core, raw_smp_processor_id()).last_app_active = start; +#endif /* See strip.py */ if (unlikely(!msg->msg_control)) { /* This test isn't strictly necessary, but it provides a * hook for testing kernel call times. @@ -1112,6 +1166,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, } result = rpc->error ? rpc->error : rpc->msgin.length; +#ifndef __STRIP__ /* See strip.py */ /* Generate time traces on both ends for long elapsed times (used * for performance debugging). */ @@ -1130,6 +1185,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, "Freezing because of long elapsed time for RPC id %d, peer 0x%x"); } } +#endif /* See strip.py */ /* Collect result information. 
*/ control.id = rpc->id; @@ -1182,7 +1238,9 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, result = -EFAULT; } +#ifndef __STRIP__ /* See strip.py */ finish = sched_clock(); +#endif /* See strip.py */ tt_record3("homa_recvmsg returning id %d, length %d, bpage0 %d", control.id, result, control.bpage_offsets[0] >> HOMA_BPAGE_SHIFT); @@ -1236,11 +1294,13 @@ int homa_softirq(struct sk_buff *skb) struct homa *homa = global_homa; struct homa_common_hdr *h; int header_offset; +#ifndef __STRIP__ /* See strip.py */ u64 start; start = sched_clock(); - INC_METRIC(softirq_calls, 1); per_cpu(homa_offload_core, raw_smp_processor_id()).last_active = start; +#endif /* See strip.py */ + INC_METRIC(softirq_calls, 1); /* skb may actually contain many distinct packets, linked through * skb_shinfo(skb)->frag_list by the Homa GRO mechanism. Make a @@ -1262,8 +1322,10 @@ int homa_softirq(struct sk_buff *skb) * on the frag_list, since they aren't handled explicitly by IP. */ if (!homa_make_header_avl(skb)) { +#ifndef __STRIP__ /* See strip.py */ if (homa->verbose) pr_notice("Homa can't handle fragmented packet (no space for header); discarding\n"); +#endif /* See strip.py */ UNIT_LOG("", "pskb discard"); goto discard; } @@ -1276,6 +1338,7 @@ int homa_softirq(struct sk_buff *skb) if (unlikely(skb->len < sizeof(struct homa_common_hdr) || h->type < DATA || h->type >= BOGUS || skb->len < header_lengths[h->type - DATA])) { +#ifndef __STRIP__ /* See strip.py */ const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); if (homa->verbose) @@ -1283,10 +1346,12 @@ int homa_softirq(struct sk_buff *skb) homa_symbol_for_type(h->type), homa_print_ipv6_addr(&saddr), skb->len - header_offset); +#endif /* See strip.py */ INC_METRIC(short_packets, 1); goto discard; } +#ifndef __STRIP__ /* See strip.py */ /* Check for FREEZE here, rather than in homa_incoming.c, so * it will work even if the RPC and/or socket are unknown. */ @@ -1302,6 +1367,7 @@ int homa_softirq(struct sk_buff *skb) } goto discard; } +#endif /* See strip.py */ /* Process the packet now if it is a control packet or * if it contains an entire short message. @@ -1367,7 +1433,9 @@ int homa_softirq(struct sk_buff *skb) packets = other_pkts; } +#ifndef __STRIP__ /* See strip.py */ atomic_dec(&per_cpu(homa_offload_core, raw_smp_processor_id()).softirq_backlog); +#endif /* See strip.py */ INC_METRIC(softirq_ns, sched_clock() - start); return 0; } @@ -1494,6 +1562,7 @@ __poll_t homa_poll(struct file *file, struct socket *sock, return (__poll_t)mask; } +#ifndef __STRIP__ /* See strip.py */ /** * homa_dointvec() - This function is a wrapper around proc_dointvec. 
It is * invoked to read and write sysctl values and also update other values @@ -1546,10 +1615,8 @@ int homa_dointvec(const struct ctl_table *table, int write, if (action == 2) { homa_rpc_log_active(homa, 0); } else if (action == 3) { -#ifndef __STRIP__ /* See strip.py */ tt_record("Freezing because of sysctl"); tt_freeze(); -#endif /* See strip.py */ } else if (action == 4) { homa_log_throttled(homa); } else if (action == 5) { @@ -1657,6 +1724,7 @@ int homa_sysctl_softirq_cores(const struct ctl_table *table, int write, kfree(values); return result; } +#endif /* See strip.py */ /** * homa_hrtimer() - This function is invoked by the hrtimer mechanism to diff --git a/homa_pool.c b/homa_pool.c index 941561a2..88ee1726 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" +#ifndef __STRIP__ /* See strip.py */ #include "homa_grant.h" +#endif /* See strip.py */ #include "homa_pool.h" /* This file contains functions that manage user-space buffer pools. */ @@ -483,6 +485,7 @@ void homa_pool_check_waiting(struct homa_pool *pool) atomic_read(&pool->free_bpages), pool->bpages_needed); homa_pool_allocate(rpc); +#ifndef __STRIP__ /* See strip.py */ if (rpc->msgin.num_bpages > 0) { /* Allocation succeeded; "wake up" the RPC. */ rpc->msgin.resend_all = 1; @@ -490,5 +493,11 @@ void homa_pool_check_waiting(struct homa_pool *pool) } else { homa_rpc_unlock(rpc); } +#else /* See strip.py */ + if (rpc->msgin.num_bpages > 0) + /* Allocation succeeded; "wake up" the RPC. */ + rpc->msgin.resend_all = 1; + homa_rpc_unlock(rpc); +#endif /* See strip.py */ } } diff --git a/homa_rpc.c b/homa_rpc.c index 3c1bb183..d968eafd 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -5,8 +5,14 @@ #include "homa_impl.h" #include "homa_peer.h" #include "homa_pool.h" +#ifndef __STRIP__ /* See strip.py */ #include "homa_grant.h" #include "homa_skb.h" +#endif /* See strip.py */ + +#ifdef __STRIP__ /* See strip.py */ +#include "homa_stub.h" +#endif /* See strip.py */ /** * homa_rpc_new_client() - Allocate and construct a client RPC (one that is used @@ -51,7 +57,9 @@ struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk, INIT_LIST_HEAD(&crpc->ready_links); INIT_LIST_HEAD(&crpc->buf_links); INIT_LIST_HEAD(&crpc->dead_links); +#ifndef __STRIP__ /* See strip.py */ INIT_LIST_HEAD(&crpc->grantable_links); +#endif /* See strip.py */ INIT_LIST_HEAD(&crpc->throttled_links); crpc->resend_timer_ticks = hsk->homa->timer_ticks; crpc->magic = HOMA_RPC_MAGIC; @@ -145,15 +153,23 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, INIT_LIST_HEAD(&srpc->ready_links); INIT_LIST_HEAD(&srpc->buf_links); INIT_LIST_HEAD(&srpc->dead_links); +#ifndef __STRIP__ /* See strip.py */ INIT_LIST_HEAD(&srpc->grantable_links); +#endif /* See strip.py */ INIT_LIST_HEAD(&srpc->throttled_links); srpc->resend_timer_ticks = hsk->homa->timer_ticks; srpc->magic = HOMA_RPC_MAGIC; srpc->start_ns = sched_clock(); +#ifndef __STRIP__ /* See strip.py */ tt_record2("Incoming message for id %d has %d unscheduled bytes", srpc->id, ntohl(h->incoming)); +#endif /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ err = homa_message_in_init(srpc, ntohl(h->message_length), ntohl(h->incoming)); +#else /* See strip.py */ + err = homa_message_in_init(srpc, ntohl(h->message_length)); +#endif /* See strip.py */ if (err != 0) goto error; @@ -250,12 +266,13 @@ void homa_rpc_end(struct homa_rpc *rpc) tt_record1("homa_rpc_end invoked for id %d", rpc->id); rpc->state = RPC_DEAD; - /* The following line must occur 
before the socket is locked or - * RPC is added to dead_rpcs. This is necessary because homa_grant_free - * releases the RPC lock and reacquires it (see comment in - * homa_grant_free for more info). +#ifndef __STRIP__ /* See strip.py */ + /* The following line must occur before the socket is locked. This is + * necessary because homa_grant_free_rpc releases the RPC lock and + * reacquires it. */ homa_grant_free_rpc(rpc); +#endif /* See strip.py */ /* Unlink from all lists, so no-one will ever find this RPC again. */ homa_sock_lock(rpc->hsk); @@ -512,6 +529,7 @@ struct homa_rpc *homa_find_server_rpc(struct homa_sock *hsk, return NULL; } +#ifndef __STRIP__ /* See strip.py */ /** * homa_rpc_log() - Log info about a particular RPC; this is functionality * pulled out of homa_rpc_log_active because its indentation got too deep. @@ -582,7 +600,6 @@ void homa_rpc_log_active(struct homa *homa, uint64_t id) */ void homa_rpc_log_tt(struct homa_rpc *rpc) { -#ifndef __STRIP__ /* See strip.py */ if (rpc->state == RPC_INCOMING) { int received = rpc->msgin.length - rpc->msgin.bytes_remaining; @@ -616,7 +633,6 @@ void homa_rpc_log_tt(struct homa_rpc *rpc) } else { tt_record2("RPC id %d is in state %d", rpc->id, rpc->state); } -#endif /* See strip.py */ } /** @@ -710,7 +726,7 @@ int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors) continue; total_incoming += rpc->msgin.rec_incoming; if (verbose) - tt_record3("homa_validate_incoming: RPC id %d, ncoming %d, rec_incoming %d", + tt_record3("homa_validate_incoming: RPC id %d, incoming %d, rec_incoming %d", rpc->id, incoming, rpc->msgin.rec_incoming); if (rpc->msgin.granted >= rpc->msgin.length) @@ -735,3 +751,4 @@ int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors) actual - total_incoming, total_incoming, actual); return actual - total_incoming; } +#endif /* See strip.py */ diff --git a/homa_rpc.h b/homa_rpc.h index 8cb18f3a..752962de 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -59,6 +59,7 @@ struct homa_message_out { */ int next_xmit_offset; +#ifndef __STRIP__ /* See strip.py */ /** * @unscheduled: Initial bytes of message that we'll send * without waiting for grants. @@ -78,6 +79,7 @@ struct homa_message_out { * packets. */ __u8 sched_priority; +#endif /* See strip.py */ /** * @init_ns: Time in sched_clock units when this structure was @@ -142,6 +144,7 @@ struct homa_message_in { */ int bytes_remaining; +#ifndef __STRIP__ /* See strip.py */ /** * @granted: Total # of bytes (starting from offset 0) that the sender * may transmit without additional grants, includes unscheduled bytes. @@ -165,15 +168,18 @@ struct homa_message_in { /** @priority: Priority level to include in future GRANTS. */ int priority; +#endif /* See strip.py */ /** @resend_all: if nonzero, set resend_all in the next grant packet. */ __u8 resend_all; +#ifndef __STRIP__ /* See strip.py */ /** * @birth: sched_clock() time when this RPC was added to the grantable * list. Invalid if RPC isn't in the grantable list. */ u64 birth; +#endif /* See strip.py */ /** * @num_bpages: The number of entries in @bpage_offsets used for this @@ -346,12 +352,14 @@ struct homa_rpc { */ struct homa_interest *interest; +#ifndef __STRIP__ /* See strip.py */ /** * @grantable_links: Used to link this RPC into peer->grantable_rpcs. * If this RPC isn't in peer->grantable_rpcs, this is an empty * list pointing to itself. */ struct list_head grantable_links; +#endif /* See strip.py */ /** * @throttled_links: Used to link this RPC into homa->throttled_rpcs.
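The homa_rpc.c and homa_rpc.h hunks above all lean on the repo's strip.py convention: every grant-, metrics-, and timetrace-only region is bracketed by matched markers tagged "See strip.py" so that a stripped, upstream-oriented variant of each file can be generated mechanically. As a rough standalone sketch of the pattern (hypothetical function and output; strip.py's exact behavior is assumed here, not shown in this patch):

        /* strip_sketch.c: hypothetical illustration of the marker
         * convention used throughout this patch.
         */
        #include <stdio.h>

        void log_rpc(int id)
        {
        #ifndef __STRIP__ /* See strip.py */
                /* Research-only path: kept in this repo, removed when
                 * the stripped variant is generated.
                 */
                printf("rpc %d (full build, with diagnostics)\n", id);
        #else /* See strip.py */
                /* Replacement used only in the stripped variant. */
                printf("rpc %d\n", id);
        #endif /* See strip.py */
        }

The matched "/* See strip.py */" tags on both the opening #ifndef/#ifdef and the closing #endif are presumably what lets the script pair regions reliably instead of parsing arbitrary preprocessor nesting.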
@@ -406,10 +414,12 @@ struct homa_rpc void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, struct homa_ack *ack); void homa_rpc_end(struct homa_rpc *rpc); +#ifndef __STRIP__ /* See strip.py */ void homa_rpc_log(struct homa_rpc *rpc); void homa_rpc_log_active(struct homa *homa, uint64_t id); void homa_rpc_log_active_tt(struct homa *homa, int freeze_count); void homa_rpc_log_tt(struct homa_rpc *rpc); +#endif /* See strip.py */ struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk, const union sockaddr_in_union *dest); @@ -418,9 +428,10 @@ struct homa_rpc const struct in6_addr *source, struct homa_data_hdr *h, int *created); int homa_rpc_reap(struct homa_sock *hsk, bool reap_all); -char *homa_symbol_for_state(struct homa_rpc *rpc); +#ifndef __STRIP__ /* See strip.py */ int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors); +#endif /* See strip.py */ /** * homa_rpc_lock() - Acquire the lock for an RPC. diff --git a/homa_sock.c b/homa_sock.c index 29383eb0..7eecdc77 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -187,8 +187,10 @@ int homa_sock_init(struct homa_sock *hsk, struct homa *homa) hsk->buffer_pool = kzalloc(sizeof(*hsk->buffer_pool), GFP_ATOMIC); if (!hsk->buffer_pool) result = -ENOMEM; +#ifndef __STRIP__ /* See strip.py */ if (homa->hijack_tcp) hsk->sock.sk_protocol = IPPROTO_TCP; +#endif /* See strip.py */ spin_unlock_bh(&socktab->write_lock); return result; } diff --git a/homa_sock.h b/homa_sock.h index 1e137989..d1fcf2d2 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -143,9 +143,11 @@ struct homa_sock { */ struct homa *homa; - /** @shutdown: True means the socket is no longer usable (either + /** + * @shutdown: True means the socket is no longer usable (either * shutdown has already been invoked, or the socket was never - * properly initialized). */ + * properly initialized). + */ bool shutdown; /** diff --git a/homa_stub.h b/homa_stub.h index 19a27ab3..3bfe7b8b 100644 --- a/homa_stub.h +++ b/homa_stub.h @@ -11,6 +11,17 @@ #include "homa_impl.h" +static inline int homa_skb_init(struct homa *homa) +{ + return 0; +} + +static inline void homa_skb_cleanup(struct homa *homa) +{} + +static inline void homa_skb_release_pages(struct homa *homa) +{} + static inline int homa_skb_append_from_iter(struct homa *homa, struct sk_buff *skb, struct iov_iter *iter, int length) @@ -66,8 +77,7 @@ static inline struct sk_buff *homa_skb_new_tx(int length) struct sk_buff *skb; skb = alloc_skb(HOMA_SKB_EXTRA + HOMA_IPV6_HEADER_LENGTH + - sizeof(struct homa_skb_info) + length, - GFP_ATOMIC); + sizeof(struct homa_skb_info) + length, GFP_ATOMIC); if (likely(skb)) { skb_reserve(skb, HOMA_SKB_EXTRA + HOMA_IPV6_HEADER_LENGTH); skb_reset_transport_header(skb); diff --git a/homa_timer.c b/homa_timer.c index fb169654..0b050fad 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -7,7 +7,13 @@ #include "homa_impl.h" #include "homa_peer.h" #include "homa_rpc.h" +#ifndef __STRIP__ /* See strip.py */ #include "homa_skb.h" +#endif /* See strip.py */ + +#ifdef __STRIP__ /* See strip.py */ +#include "homa_stub.h" +#endif /* See strip.py */ /** * homa_check_rpc() - Invoked for each RPC during each timer pass; does @@ -21,7 +27,9 @@ void homa_check_rpc(struct homa_rpc *rpc) { struct homa *homa = rpc->hsk->homa; struct homa_resend_hdr resend; +#ifndef __STRIP__ /* See strip.py */ const char *us, *them; +#endif /* See strip.py */ /* See if we need to request an ack for this RPC. 
*/ if (!homa_is_client(rpc->id) && rpc->state == RPC_OUTGOING && @@ -45,6 +53,7 @@ void homa_check_rpc(struct homa_rpc *rpc) } if (rpc->state == RPC_INCOMING) { +#ifndef __STRIP__ /* See strip.py */ if ((rpc->msgin.length - rpc->msgin.bytes_remaining) >= rpc->msgin.granted) { /* We've received everything that we've granted, so we @@ -53,6 +62,7 @@ void homa_check_rpc(struct homa_rpc *rpc) rpc->silent_ticks = 0; return; } +#endif /* See strip.py */ if (rpc->msgin.num_bpages == 0) { /* Waiting for buffer space, so no problem. */ rpc->silent_ticks = 0; @@ -67,7 +77,11 @@ void homa_check_rpc(struct homa_rpc *rpc) } if (rpc->state == RPC_OUTGOING) { +#ifndef __STRIP__ /* See strip.py */ if (rpc->msgout.next_xmit_offset < rpc->msgout.granted) { +#else /* See strip.py */ + if (rpc->msgout.next_xmit_offset < rpc->msgout.length) { +#endif /* See strip.py */ /* There are granted bytes that we haven't transmitted, * so no need to be concerned; the ball is in our court. */ @@ -82,6 +96,7 @@ void homa_check_rpc(struct homa_rpc *rpc) INC_METRIC(rpc_timeouts, 1); tt_record3("RPC id %d, peer 0x%x, aborted because of timeout, state %d", rpc->id, tt_addr(rpc->peer->addr), rpc->state); +#ifndef __STRIP__ /* See strip.py */ homa_rpc_log_active_tt(homa, 0); tt_record1("Freezing because of RPC abort (id %d)", rpc->id); homa_freeze_peers(homa); @@ -91,6 +106,7 @@ void homa_check_rpc(struct homa_rpc *rpc) rpc->id, homa_print_ipv6_addr(&rpc->peer->addr), rpc->state); +#endif /* See strip.py */ homa_rpc_abort(rpc, -ETIMEDOUT); return; } @@ -112,40 +128,54 @@ void homa_check_rpc(struct homa_rpc *rpc) } else { homa_gap_retry(rpc); resend.offset = htonl(rpc->msgin.recv_end); +#ifndef __STRIP__ /* See strip.py */ resend.length = htonl(rpc->msgin.granted - rpc->msgin.recv_end); +#else /* See strip.py */ + resend.length = htonl(rpc->msgin.length - rpc->msgin.recv_end); +#endif /* See strip.py */ if (resend.length == 0) return; } +#ifndef __STRIP__ /* See strip.py */ resend.priority = homa->num_priorities - 1; +#endif /* See strip.py */ homa_xmit_control(RESEND, &resend, sizeof(resend), rpc); -#ifndef __STRIP__ /* See strip.py */ if (homa_is_client(rpc->id)) { +#ifndef __STRIP__ /* See strip.py */ us = "client"; them = "server"; +#endif /* See strip.py */ tt_record4("Sent RESEND for client RPC id %llu, server 0x%x:%d, offset %d", rpc->id, tt_addr(rpc->peer->addr), rpc->dport, rpc->msgin.recv_end); +#ifndef __STRIP__ /* See strip.py */ tt_record4("length %d, granted %d, rem %d, rec_incoming %d", rpc->msgin.length, rpc->msgin.granted, rpc->msgin.bytes_remaining, rpc->msgin.rec_incoming); +#endif /* See strip.py */ } else { +#ifndef __STRIP__ /* See strip.py */ us = "server"; them = "client"; +#endif /* See strip.py */ tt_record4("Sent RESEND for server RPC id %llu, client 0x%x:%d offset %d", rpc->id, tt_addr(rpc->peer->addr), rpc->dport, rpc->msgin.recv_end); +#ifndef __STRIP__ /* See strip.py */ tt_record4("length %d, granted %d, rem %d, rec_incoming %d", rpc->msgin.length, rpc->msgin.granted, rpc->msgin.bytes_remaining, rpc->msgin.rec_incoming); - } #endif /* See strip.py */ + } +#ifndef __STRIP__ /* See strip.py */ if (homa->verbose) pr_notice("Homa %s RESEND to %s %s:%d for id %llu, offset %d, length %d\n", us, them, homa_print_ipv6_addr(&rpc->peer->addr), rpc->dport, rpc->id, rpc->msgin.recv_end, rpc->msgin.granted - rpc->msgin.recv_end); +#endif /* See strip.py */ } /** @@ -156,22 +186,32 @@ void homa_check_rpc(struct homa_rpc *rpc) void homa_timer(struct homa *homa) { struct homa_socktab_scan scan; +#ifndef __STRIP__ 
/* See strip.py */ static u64 prev_grant_count; int total_incoming_rpcs = 0; int sum_incoming_rec = 0; +#endif /* See strip.py */ struct homa_sock *hsk; +#ifndef __STRIP__ /* See strip.py */ static int zero_count; +#endif /* See strip.py */ struct homa_rpc *rpc; +#ifndef __STRIP__ /* See strip.py */ int sum_incoming = 0; - cycles_t start, end; u64 total_grants; +#endif /* See strip.py */ int total_rpcs = 0; int rpc_count = 0; +#ifndef __STRIP__ /* See strip.py */ + cycles_t start; + cycles_t end; int core; +#endif /* See strip.py */ - start = sched_clock(); homa->timer_ticks++; +#ifndef __STRIP__ /* See strip.py */ + start = sched_clock(); total_grants = 0; for (core = 0; core < nr_cpu_ids; core++) { struct homa_metrics *m = homa_metrics_per_cpu(); @@ -198,6 +238,7 @@ void homa_timer(struct homa *homa) zero_count = 0; } prev_grant_count = total_grants; +#endif /* See strip.py */ /* Scan all existing RPCs in all sockets. */ for (hsk = homa_socktab_start_scan(homa->port_map, &scan); @@ -207,12 +248,14 @@ void homa_timer(struct homa *homa) * isn't keeping up with RPC reaping, so we'll help * out. See reap.txt for more info. */ - u64 start = sched_clock(); +#ifndef __STRIP__ /* See strip.py */ + u64 rpc_start = sched_clock(); +#endif /* See strip.py */ tt_record("homa_timer calling homa_rpc_reap"); if (homa_rpc_reap(hsk, false) == 0) break; - INC_METRIC(timer_reap_ns, sched_clock() - start); + INC_METRIC(timer_reap_ns, sched_clock() - rpc_start); } if (list_empty(&hsk->active_rpcs) || hsk->shutdown) @@ -228,12 +271,14 @@ void homa_timer(struct homa *homa) rpc->silent_ticks = 0; homa_rpc_unlock(rpc); continue; +#ifndef __STRIP__ /* See strip.py */ } else if (rpc->state == RPC_INCOMING) { total_incoming_rpcs += 1; sum_incoming_rec += rpc->msgin.rec_incoming; sum_incoming += rpc->msgin.granted - (rpc->msgin.length - rpc->msgin.bytes_remaining); +#endif /* See strip.py */ } rpc->silent_ticks++; homa_check_rpc(rpc); @@ -253,10 +298,14 @@ void homa_timer(struct homa *homa) homa_unprotect_rpcs(hsk); } homa_socktab_end_scan(&scan); +#ifndef __STRIP__ /* See strip.py */ tt_record4("homa_timer found %d incoming RPCs, incoming sum %d, rec_sum %d, homa->total_incoming %d", total_incoming_rpcs, sum_incoming, sum_incoming_rec, atomic_read(&homa->total_incoming)); +#endif /* See strip.py */ homa_skb_release_pages(homa); +#ifndef __STRIP__ /* See strip.py */ end = sched_clock(); INC_METRIC(timer_ns, end - start); +#endif /* See strip.py */ } diff --git a/homa_utils.c b/homa_utils.c index 7a0338ac..0d61d44f 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -7,7 +7,13 @@ #include "homa_impl.h" #include "homa_peer.h" #include "homa_rpc.h" +#ifndef __STRIP__ /* See strip.py */ #include "homa_skb.h" +#endif /* See strip.py */ + +#ifdef __STRIP__ /* See strip.py */ +#include "homa_stub.h" +#endif /* See strip.py */ struct completion homa_pacer_kthread_done; @@ -21,40 +27,30 @@ struct completion homa_pacer_kthread_done; */ int homa_init(struct homa *homa) { - int i, err; + int err; +#ifndef __STRIP__ /* See strip.py */ + int i; _Static_assert(HOMA_MAX_PRIORITIES >= 8, "homa_init assumes at least 8 priority levels"); +#endif /* See strip.py */ - homa->pacer_kthread = NULL; + memset(homa, 0, sizeof(*homa)); init_completion(&homa_pacer_kthread_done); atomic64_set(&homa->next_outgoing_id, 2); atomic64_set(&homa->link_idle_time, sched_clock()); +#ifndef __STRIP__ /* See strip.py */ spin_lock_init(&homa->grantable_lock); - homa->grantable_lock_time = 0; - atomic_set(&homa->grant_recalc_count, 0); 
INIT_LIST_HEAD(&homa->grantable_peers); INIT_LIST_HEAD(&homa->grantable_rpcs); - homa->num_grantable_rpcs = 0; homa->last_grantable_change = sched_clock(); - homa->max_grantable_rpcs = 0; - homa->oldest_rpc = NULL; - homa->num_active_rpcs = 0; - for (i = 0; i < HOMA_MAX_GRANTS; i++) { - homa->active_rpcs[i] = NULL; - atomic_set(&homa->active_remaining[i], 0); - } - homa->grant_nonfifo = 0; - homa->grant_nonfifo_left = 0; +#endif /* See strip.py */ spin_lock_init(&homa->pacer_mutex); homa->pacer_fifo_fraction = 50; homa->pacer_fifo_count = 1; - homa->pacer_wake_time = 0; spin_lock_init(&homa->throttle_lock); INIT_LIST_HEAD_RCU(&homa->throttled_rpcs); - homa->throttle_add = 0; homa->throttle_min_bytes = 200; - atomic_set(&homa->total_incoming, 0); homa->prev_default_port = HOMA_MIN_DEFAULT_PORT - 1; homa->port_map = kmalloc(sizeof(*homa->port_map), GFP_KERNEL); if (!homa->port_map) { @@ -74,17 +70,22 @@ int homa_init(struct homa *homa) __func__, -err); return err; } +#ifndef __STRIP__ /* See strip.py */ err = homa_skb_init(homa); if (err) { pr_err("Couldn't initialize skb management (errno %d)\n", -err); return err; } +#endif /* See strip.py */ /* Wild guesses to initialize configuration values... */ +#ifndef __STRIP__ /* See strip.py */ homa->unsched_bytes = 40000; homa->window_param = 100000; +#endif /* See strip.py */ homa->link_mbps = 25000; +#ifndef __STRIP__ /* See strip.py */ homa->poll_usecs = 50; homa->num_priorities = HOMA_MAX_PRIORITIES; for (i = 0; i < HOMA_MAX_PRIORITIES; i++) @@ -107,6 +108,7 @@ int homa_init(struct homa *homa) homa->max_overcommit = 8; homa->max_incoming = 400000; homa->max_rpcs_per_peer = 1; +#endif /* See strip.py */ homa->resend_ticks = 5; homa->resend_interval = 5; homa->timeout_ticks = 100; @@ -114,7 +116,6 @@ int homa_init(struct homa *homa) homa->request_ack_ticks = 2; homa->reap_limit = 10; homa->dead_buffs_limit = 5000; - homa->max_dead_buffs = 0; homa->pacer_kthread = kthread_run(homa_pacer_main, homa, "homa_pacer"); if (IS_ERR(homa->pacer_kthread)) { @@ -125,27 +126,23 @@ int homa_init(struct homa *homa) } homa->pacer_exit = false; homa->max_nic_queue_ns = 5000; - homa->ns_per_mbyte = 0; +#ifndef __STRIP__ /* See strip.py */ homa->verbose = 0; +#endif /* See strip.py */ homa->max_gso_size = 10000; - homa->gso_force_software = 0; - homa->hijack_tcp = 0; +#ifndef __STRIP__ /* See strip.py */ homa->max_gro_skbs = 20; homa->gro_policy = HOMA_GRO_NORMAL; homa->busy_usecs = 100; homa->gro_busy_usecs = 5; - homa->timer_ticks = 0; mutex_init(&homa->metrics_mutex); homa->metrics = NULL; - homa->metrics_capacity = 0; - homa->metrics_length = 0; - homa->metrics_active_opens = 0; - homa->flags = 0; - homa->freeze_type = 0; +#endif /* See strip.py */ homa->bpage_lease_usecs = 10000; - homa->next_id = 0; +#ifndef __STRIP__ /* See strip.py */ homa_outgoing_sysctl_changed(homa); homa_incoming_sysctl_changed(homa); +#endif /* See strip.py */ return 0; } @@ -175,11 +172,14 @@ void homa_destroy(struct homa *homa) kfree(homa->peers); homa->peers = NULL; } +#ifndef __STRIP__ /* See strip.py */ homa_skb_cleanup(homa); kfree(homa->metrics); homa->metrics = NULL; +#endif /* See strip.py */ } +#ifndef __STRIP__ /* See strip.py */ /** * homa_prios_changed() - This function is called whenever configuration * information related to priorities, such as @homa->unsched_cutoffs or @@ -217,6 +217,7 @@ void homa_prios_changed(struct homa *homa) } homa->cutoff_version++; } +#endif /* See strip.py */ /** * homa_spin() - Delay (without sleeping) for a given time interval. 
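The homa_init() hunk above replaces a long list of explicit zero assignments with a single memset(homa, 0, sizeof(*homa)), leaving only the non-zero defaults as explicit statements. A minimal sketch of that zero-then-override idiom (hypothetical struct and fields, not Homa's):

        /* init_sketch.c: zero the whole struct, then set only the
         * non-zero defaults.
         */
        #include <string.h>

        struct cfg {
                int resend_ticks;
                int timeout_ticks;
                int verbose;    /* stays 0; covered by the memset */
        };

        static void cfg_init(struct cfg *c)
        {
                /* One memset replaces many "field = 0" lines and also
                 * zeroes any field added later, so new members can't be
                 * left uninitialized by accident.
                 */
                memset(c, 0, sizeof(*c));

                /* Only non-zero defaults need explicit assignments. */
                c->resend_ticks = 5;
                c->timeout_ticks = 100;
        }

Besides shrinking the function, this means the stripped and full builds can share one initializer without listing every conditionally compiled field.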
diff --git a/homa_wire.h b/homa_wire.h index 46b8bb97..c9f77575 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -13,12 +13,16 @@ */ enum homa_packet_type { DATA = 0x10, +#ifndef __STRIP__ /* See strip.py */ GRANT = 0x11, +#endif /* See strip.py */ RESEND = 0x12, UNKNOWN = 0x13, BUSY = 0x14, +#ifndef __STRIP__ /* See strip.py */ CUTOFFS = 0x15, FREEZE = 0x16, +#endif /* See strip.py */ NEED_ACK = 0x17, ACK = 0x18, BOGUS = 0x19, /* Used only in unit tests. */ @@ -62,12 +66,14 @@ enum homa_packet_type { */ #define HOMA_MAX_HEADER 90 +#ifndef __STRIP__ /* See strip.py */ /** * define HOMA_MAX_PRIORITIES - The maximum number of priority levels that * Homa can use (the actual number can be restricted to less than this at * runtime). Changing this value will affect packet formats. */ #define HOMA_MAX_PRIORITIES 8 +#endif /* See strip.py */ /** * struct homa_common_hdr - Wire format for the first bytes in every Homa @@ -116,6 +122,7 @@ struct homa_common_hdr { */ __u8 doff; +#ifndef __STRIP__ /* See strip.py */ /** * @flags: Holds TCP flags such as URG, ACK, etc. The special value * HOMA_TCP_FLAGS is stored here to distinguish Homa-over-TCP packets @@ -125,6 +132,9 @@ struct homa_common_hdr { */ __u8 flags; #define HOMA_TCP_FLAGS 6 +#else /* See strip.py */ + __u8 reserved1; +#endif /* See strip.py */ /** * @window: Corresponds to the window field in TCP headers. Not used @@ -138,6 +148,7 @@ struct homa_common_hdr { */ __be16 checksum; +#ifndef __STRIP__ /* See strip.py */ /** * @urgent: occupies the same bytes as the urgent pointer in a TCP * header. When Homa packets are transmitted over TCP, this has the @@ -146,6 +157,9 @@ struct homa_common_hdr { */ __be16 urgent; #define HOMA_TCP_URGENT 0xb97d +#else /* See strip.py */ + __be16 reserved2; +#endif /* See strip.py */ /** * @sender_id: the identifier of this RPC as used on the sender (i.e., @@ -176,6 +190,7 @@ struct homa_ack { __be16 server_port; } __packed; +#ifndef __STRIP__ /* See strip.py */ /* struct homa_data_hdr - Contains data for part or all of a Homa message. * An incoming packet consists of a homa_data_hdr followed by message data. * An outgoing packet can have this simple format as well, or it can be @@ -219,8 +234,48 @@ struct homa_ack { * throughout the segment data; TSO/GSO will include a different homa_seg_hdr * in each generated packet. */ +#else /* See strip.py */ +/* struct homa_data_hdr - Contains data for part or all of a Homa message. + * An incoming packet consists of a homa_data_hdr followed by message data. + * An outgoing packet can have this simple format as well, or it can be + * structured as a GSO packet with the following format: + * + * |-----------------------| + * | | + * | data_header | + * | | + * |---------------------- | + * | | + * | | + * | segment data | + * | | + * | | + * |-----------------------| + * | seg_header | + * |-----------------------| + * | | + * | | + * | segment data | + * | | + * | | + * |-----------------------| + * | seg_header | + * |-----------------------| + * | | + * | | + * | segment data | + * | | + * | | + * |-----------------------| + * + * TSO will not adjust @homa_common_hdr.sequence in the segments, so Homa + * sprinkles correct offsets (in homa_seg_hdrs) throughout the segment data; + * TSO/GSO will include a different homa_seg_hdr in each generated packet. + */ +#endif /* See strip.py */ struct homa_seg_hdr { +#ifndef __STRIP__ /* See strip.py */ /** * @offset: Offset within message of the first byte of data in * this segment. 
If this field is -1 it means that the packet was @@ -229,6 +284,12 @@ struct homa_seg_hdr { * and updates this value from @common.sequence if needed, so the * value will always be valid once the packet reaches homa_softirq. */ +#else /* See strip.py */ + /** + * @offset: Offset within message of the first byte of data in + * this segment. + */ +#endif /* See strip.py */ __be32 offset; } __packed; @@ -238,6 +299,7 @@ struct homa_data_hdr { /** @message_length: Total #bytes in the message. */ __be32 message_length; +#ifndef __STRIP__ /* See strip.py */ /** * @incoming: The receiver can expect the sender to send all of the * bytes in the message up to at least this offset (exclusive), @@ -246,6 +308,9 @@ struct homa_data_hdr { * transmits unilaterally (e.g., to round up to a full GSO batch). */ __be32 incoming; +#else /* See strip.py */ + __be32 reserved1; +#endif /* See strip.py */ /** @ack: If the @client_id field of this is nonzero, provides info * about an RPC that the recipient can now safely free. Note: in @@ -256,6 +321,7 @@ struct homa_data_hdr { */ struct homa_ack ack; +#ifndef __STRIP__ /* See strip.py */ /** * @cutoff_version: The cutoff_version from the most recent * CUTOFFS packet that the source of this packet has received @@ -263,6 +329,9 @@ struct homa_data_hdr { * yet received a CUTOFFS packet. */ __be16 cutoff_version; +#else /* See strip.py */ + __be16 reserved2; +#endif /* See strip.py */ /** * @retransmit: 1 means this packet was sent in response to a RESEND @@ -296,6 +365,7 @@ static inline int homa_data_len(struct sk_buff *skb) sizeof(struct homa_data_hdr); } +#ifndef __STRIP__ /* See strip.py */ /** * struct homa_grant_hdr - Wire format for GRANT packets, which are sent by * the receiver back to the sender to indicate that the sender may transmit @@ -329,6 +399,7 @@ struct homa_grant_hdr { } __packed; _Static_assert(sizeof(struct homa_grant_hdr) <= HOMA_MAX_HEADER, "homa_grant_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); +#endif /* See strip.py */ /** * struct homa_resend_hdr - Wire format for RESEND packets. @@ -357,6 +428,7 @@ struct homa_resend_hdr { */ __be32 length; +#ifndef __STRIP__ /* See strip.py */ /** * @priority: Packet priority to use. * @@ -364,6 +436,7 @@ struct homa_resend_hdr { * priority. */ __u8 priority; +#endif /* See strip.py */ } __packed; _Static_assert(sizeof(struct homa_resend_hdr) <= HOMA_MAX_HEADER, "homa_resend_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); @@ -397,6 +470,7 @@ struct homa_busy_hdr { _Static_assert(sizeof(struct homa_busy_hdr) <= HOMA_MAX_HEADER, "homa_busy_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); +#ifndef __STRIP__ /* See strip.py */ /** * struct homa_cutoffs_hdr - Wire format for CUTOFFS packets. * @@ -436,6 +510,7 @@ struct homa_freeze_hdr { } __packed; _Static_assert(sizeof(struct homa_freeze_hdr) <= HOMA_MAX_HEADER, "homa_freeze_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); +#endif /* See strip.py */ /** * struct homa_need_ack_hdr - Wire format for NEED_ACK packets. 
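A recurring move in the homa_wire.h hunks above: fields that only the full build interprets (flags, urgent, incoming, cutoff_version) become same-size reserved fields under __STRIP__, so both builds keep an identical wire layout. A self-contained illustration of the idea (hypothetical header, not Homa's actual format):

        /* wire_sketch.c: two variants of one header that must stay
         * byte-for-byte compatible on the wire.
         */
        #include <stddef.h>
        #include <stdint.h>

        struct hdr_full {
                uint8_t  type;
                uint8_t  flags;      /* interpreted only by full build */
                uint16_t urgent;     /* ditto */
        };

        struct hdr_stripped {
                uint8_t  type;
                uint8_t  reserved1;  /* same size/offset as flags */
                uint16_t reserved2;  /* same size/offset as urgent */
        };

        _Static_assert(sizeof(struct hdr_full) ==
                       sizeof(struct hdr_stripped),
                       "variants must be wire-compatible");
        _Static_assert(offsetof(struct hdr_full, urgent) ==
                       offsetof(struct hdr_stripped, reserved2),
                       "field offsets must match");

The asserts follow the same style as the _Static_assert checks homa_wire.h already applies against HOMA_MAX_HEADER.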
diff --git a/notes.txt b/notes.txt index 590afb59..81a90eeb 100755 --- a/notes.txt +++ b/notes.txt @@ -411,3 +411,6 @@ Notes for Homa implementation in Linux: ip_input.c: ip_rcv_finish ip_input.c: dst_input homa_plumbing.c: homa_softirq + +#ifndef __STRIP__ /* See strip.py */ +#endif /* See strip.py */ diff --git a/test/Makefile b/test/Makefile index bbe263c1..7b1785ee 100644 --- a/test/Makefile +++ b/test/Makefile @@ -29,43 +29,50 @@ CCINCLUDES := \ DEFS := -D__KERNEL__ \ -D__UNIT_TEST__ \ -D KBUILD_MODNAME='"homa"' +ifneq ($(__STRIP__),) +DEFS += -D__STRIP__ +endif WARNS := -Wall -Wundef -Wno-trigraphs -Wno-sign-compare \ -Wno-strict-aliasing -Werror CFLAGS := $(WARNS) -Wstrict-prototypes -MD -g $(CINCLUDES) $(DEFS) CCFLAGS := -std=c++11 $(WARNS) -MD -g $(CCINCLUDES) $(DEFS) -fsanitize=address -TEST_SRCS := unit_homa_grant.c \ - unit_homa_incoming.c \ - unit_homa_offload.c \ - unit_homa_metrics.c \ +TEST_SRCS := unit_homa_incoming.c \ unit_homa_outgoing.c \ unit_homa_peer.c \ unit_homa_pool.c \ unit_homa_plumbing.c \ unit_homa_rpc.c \ - unit_homa_skb.c \ unit_homa_sock.c \ unit_homa_timer.c \ unit_homa_utils.c \ unit_timetrace.c +ifeq ($(__STRIP__),) +TEST_SRCS += unit_homa_grant.c \ + unit_homa_offload.c \ + unit_homa_metrics.c \ + unit_homa_skb.c +endif TEST_OBJS := $(patsubst %.c,%.o,$(TEST_SRCS)) HOMA_SRCS := homa_devel.c \ - homa_grant.c \ homa_incoming.c \ - homa_metrics.c \ - homa_offload.c \ homa_outgoing.c \ homa_peer.c \ homa_pool.c \ homa_plumbing.c \ homa_rpc.c \ - homa_skb.c \ homa_sock.c \ homa_timer.c \ homa_utils.c \ timetrace.c +ifeq ($(__STRIP__),) +HOMA_SRCS += homa_grant.c \ + homa_metrics.c \ + homa_offload.c \ + homa_skb.c +endif HOMA_OBJS := $(patsubst %.c,%.o,$(HOMA_SRCS)) OTHER_SRCS := ccutils.cc \ diff --git a/test/mock.c b/test/mock.c index 227c00d3..800d5581 100644 --- a/test/mock.c +++ b/test/mock.c @@ -7,7 +7,9 @@ #include "homa_impl.h" #include "homa_pool.h" +#ifndef __STRIP__ /* See strip.py */ #include "homa_skb.h" +#endif /* See strip.py */ #include "ccutils.h" #include "mock.h" #include "utils.h" @@ -380,11 +382,7 @@ void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) {} -#if KERNEL_VERSION(5, 18, 0) > LINUX_VERSION_CODE - void get_random_bytes(void *buf, int nbytes) -#else - void get_random_bytes(void *buf, size_t nbytes) -#endif +void get_random_bytes(void *buf, size_t nbytes) { memset(buf, 0, nbytes); } @@ -1341,6 +1339,7 @@ void mock_clear_xmit_prios(void) mock_xmit_prios[0] = 0; } +#ifndef __STRIP__ /* See strip.py */ /** * mock_compound_order() - Replacement for compound_order function. 
*/ @@ -1355,6 +1354,7 @@ unsigned int mock_compound_order(struct page *page) mock_compound_order_mask >>= 1; return result; } +#endif /* See strip.py */ /** * mock_cpu_to_node() - Replaces cpu_to_node to determine NUMA node for @@ -1566,9 +1566,11 @@ struct sk_buff *mock_skb_new(struct in6_addr *saddr, struct homa_common_hdr *h, case DATA: header_size = sizeof(struct homa_data_hdr); break; +#ifndef __STRIP__ /* See strip.py */ case GRANT: header_size = sizeof(struct homa_grant_hdr); break; +#endif /* See strip.py */ case RESEND: header_size = sizeof(struct homa_resend_hdr); break; @@ -1578,12 +1580,14 @@ struct sk_buff *mock_skb_new(struct in6_addr *saddr, struct homa_common_hdr *h, case BUSY: header_size = sizeof(struct homa_busy_hdr); break; +#ifndef __STRIP__ /* See strip.py */ case CUTOFFS: header_size = sizeof(struct homa_cutoffs_hdr); break; case FREEZE: header_size = sizeof(struct homa_freeze_hdr); break; +#endif /* See strip.py */ case NEED_ACK: header_size = sizeof(struct homa_need_ack_hdr); break; @@ -1835,7 +1839,9 @@ void mock_teardown(void) mock_preempt_disables); mock_preempt_disables = 0; +#ifndef __STRIP__ /* See strip.py */ memset(homa_metrics, 0, sizeof(homa_metrics)); +#endif /* See strip.py */ unit_hook_clear(); } diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index e9154850..aaf08668 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -78,16 +78,15 @@ FIXTURE_SETUP(homa_grant) self->server_addr.in6.sin6_family = self->hsk.inet.sk.sk_family; self->server_addr.in6.sin6_addr = self->server_ip[0]; self->server_addr.in6.sin6_port = htons(self->server_port); - self->data = (struct homa_data_hdr){.common = { - .sport = htons(self->client_port), - .dport = htons(self->server_port), - .type = DATA, - .sender_id = cpu_to_be64(self->client_id)}, - .message_length = htonl(10000), - .incoming = htonl(10000), .cutoff_version = 0, - .ack = {0, 0}, - .retransmit = 0, - .seg = {.offset = 0}}; + memset(&self->data, 0, sizeof(self->data)); + self->data.common = (struct homa_common_hdr){ + .sport = htons(self->client_port), + .dport = htons(self->server_port), + .type = DATA, + .sender_id = cpu_to_be64(self->client_id) + }; + self->data.message_length = htonl(10000); + self->data.incoming = htonl(10000); unit_log_clear(); self->incoming_delta = 0; } diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 52d73074..299c724e 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" +#ifndef __STRIP__ /* See strip.py */ #include "homa_offload.h" +#endif /* See strip.py */ #include "homa_peer.h" #include "homa_pool.h" #define KSELFTEST_NOT_MAIN 1 @@ -108,6 +110,7 @@ void shutdown_hook(char *id) homa_sock_shutdown(hook_hsk); } +#ifndef __STRIP__ /* See strip.py */ /* The following hook function updates hook_rpc->msgin.granted. 
*/ int unlock_count; void unlock_hook(char *id) @@ -118,6 +121,16 @@ void unlock_hook(char *id) hook_rpc->msgin.granted = hook_granted; unlock_count--; } +#endif /* See strip.py */ + +#ifdef __STRIP__ /* See strip.py */ +int mock_message_in_init(struct homa_rpc *rpc, int length, int unsched) +{ + return homa_message_in_init(rpc, length); +} +#define homa_message_in_init(rpc, length, unsched) \ + mock_message_in_init(rpc, length, unsched) +#endif /* See strip.py */ FIXTURE(homa_incoming) { struct in6_addr client_ip[5]; @@ -147,28 +160,33 @@ FIXTURE_SETUP(homa_incoming) self->client_id = 1234; self->server_id = 1235; homa_init(&self->homa); +#ifndef __STRIP__ /* See strip.py */ self->homa.num_priorities = 1; self->homa.poll_usecs = 0; +#endif /* See strip.py */ self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; self->homa.pacer_fifo_fraction = 0; +#ifndef __STRIP__ /* See strip.py */ self->homa.grant_fifo_fraction = 0; self->homa.unsched_bytes = 10000; self->homa.window_param = 10000; +#endif /* See strip.py */ mock_sock_init(&self->hsk, &self->homa, 0); mock_sock_init(&self->hsk2, &self->homa, self->server_port); self->server_addr.in6.sin6_family = self->hsk.inet.sk.sk_family; self->server_addr.in6.sin6_addr = self->server_ip[0]; self->server_addr.in6.sin6_port = htons(self->server_port); - self->data = (struct homa_data_hdr){.common = { - .sport = htons(self->client_port), - .dport = htons(self->server_port), - .type = DATA, - .sender_id = cpu_to_be64(self->client_id)}, - .message_length = htonl(10000), - .incoming = htonl(10000), .cutoff_version = 0, - .ack = {0, 0}, - .retransmit = 0, - .seg = {.offset = 0}}; + memset(&self->data, 0, sizeof(self->data)); + self->data.common = (struct homa_common_hdr){ + .sport = htons(self->client_port), + .dport = htons(self->server_port), + .type = DATA, + .sender_id = cpu_to_be64(self->client_id) + }; + self->data.message_length = htonl(10000); +#ifndef __STRIP__ /* See strip.py */ + self->data.incoming = htonl(10000); +#endif /* See strip.py */ unit_log_clear(); delete_count = 0; lock_delete_count = 0; @@ -179,6 +197,7 @@ FIXTURE_TEARDOWN(homa_incoming) unit_teardown(); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_incoming, homa_message_in_init__basics) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -191,6 +210,7 @@ TEST_F(homa_incoming, homa_message_in_init__basics) EXPECT_EQ(128, crpc->msgin.granted); EXPECT_EQ(1, crpc->msgin.num_bpages); } +#endif /* See strip.py */ TEST_F(homa_incoming, homa_message_in_init__message_too_long) { struct homa_rpc *srpc; @@ -221,8 +241,11 @@ TEST_F(homa_incoming, homa_message_in_init__no_buffers_available) atomic_set(&self->hsk.buffer_pool->free_bpages, 0); EXPECT_EQ(0, homa_message_in_init(crpc, HOMA_BPAGE_SIZE*2, 10000)); EXPECT_EQ(0, crpc->msgin.num_bpages); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(0, crpc->msgin.granted); +#endif /* See strip.py */ } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_incoming, homa_message_in_init__update_metrics) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -242,6 +265,7 @@ TEST_F(homa_incoming, homa_message_in_init__update_metrics) EXPECT_EQ(0, homa_metrics_per_cpu()->medium_msg_bytes[15]); EXPECT_EQ(1900000, homa_metrics_per_cpu()->large_msg_bytes); } +#endif /* See strip.py */ TEST_F(homa_incoming, homa_gap_retry) { @@ -252,14 +276,23 @@ TEST_F(homa_incoming, homa_gap_retry) homa_gap_new(&srpc->msgin.gaps, 1000, 2000); homa_gap_new(&srpc->msgin.gaps, 4000, 6000); homa_gap_new(&srpc->msgin.gaps, 7000, 8000); +#ifndef __STRIP__ /* See strip.py */ 
self->homa.num_priorities = 8; +#endif /* See strip.py */ unit_log_clear(); homa_gap_retry(srpc); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("xmit RESEND 1000-1999@7; " "xmit RESEND 4000-5999@7; " "xmit RESEND 7000-7999@7", unit_log_get()); +#else /* See strip.py */ + EXPECT_STREQ("xmit RESEND 1000-1999; " + "xmit RESEND 4000-5999; " + "xmit RESEND 7000-7999", + unit_log_get()); +#endif /* See strip.py */ } TEST_F(homa_incoming, homa_add_packet__basics) @@ -629,6 +662,7 @@ TEST_F(homa_incoming, homa_add_packet__scan_multiple_gaps) EXPECT_EQ(3, skb_queue_len(&crpc->msgin.packets)); EXPECT_STREQ("start 0, end 1400", unit_print_gaps(crpc)); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_incoming, homa_add_packet__metrics) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -657,6 +691,7 @@ TEST_F(homa_incoming, homa_add_packet__metrics) EXPECT_EQ(1, skb_queue_len(&crpc->msgin.packets)); EXPECT_EQ(1, homa_metrics_per_cpu()->resent_packets_used); } +#endif /* See strip.py */ TEST_F(homa_incoming, homa_copy_to_user__basics) { @@ -938,7 +973,9 @@ TEST_F(homa_incoming, homa_dispatch_pkts__cant_create_server_rpc) 1400, 0), &self->homa); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); EXPECT_EQ(0, mock_skb_count()); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->server_cant_create_rpcs); +#endif /* See strip.py */ } TEST_F(homa_incoming, homa_dispatch_pkts__existing_server_rpc) { @@ -965,8 +1002,12 @@ TEST_F(homa_incoming, homa_dispatch_pkts__non_data_packet_for_existing_server_rp .type = RESEND, .sender_id = cpu_to_be64(self->client_id)}, .offset = 0, +#ifndef __STRIP__ /* See strip.py */ .length = 1000, .priority = 3}; +#else /* See strip.py */ + .length = 1000}; +#endif /* See strip.py */ ASSERT_NE(NULL, srpc); unit_log_clear(); @@ -974,6 +1015,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__non_data_packet_for_existing_server_rp &self->homa); EXPECT_STREQ("xmit BUSY", unit_log_get()); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_incoming, homa_dispatch_pkts__existing_client_rpc) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -981,19 +1023,15 @@ TEST_F(homa_incoming, homa_dispatch_pkts__existing_client_rpc) self->server_port, self->client_id, 20000, 1600); ASSERT_NE(NULL, crpc); - EXPECT_EQ(10000, crpc->msgout.granted); + EXPECT_EQ(RPC_OUTGOING, crpc->state); unit_log_clear(); - struct homa_grant_hdr h = {{.sport = htons(self->server_port), - .dport = htons(self->hsk.port), - .sender_id = cpu_to_be64(self->server_id), - .type = GRANT}, - .offset = htonl(12600), - .priority = 3, - .resend_all = 0}; - homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), - &self->homa); - EXPECT_EQ(12600, crpc->msgout.granted); + crpc->msgout.next_xmit_offset = crpc->msgout.length; + self->data.message_length = htonl(1600); + homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, + 1400, 0), crpc); + EXPECT_EQ(RPC_INCOMING, crpc->state); + EXPECT_EQ(200, crpc->msgin.bytes_remaining); } TEST_F(homa_incoming, homa_dispatch_pkts__unknown_client_rpc) { @@ -1040,18 +1078,24 @@ TEST_F(homa_incoming, homa_dispatch_pkts__cutoffs_for_unknown_client_rpc) EXPECT_EQ(9, peer->unsched_cutoffs[1]); EXPECT_EQ(3, peer->unsched_cutoffs[7]); } +#endif /* See strip.py */ TEST_F(homa_incoming, homa_dispatch_pkts__resend_for_unknown_server_rpc) { struct homa_resend_hdr h = {{.sport = htons(self->client_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(99990), .type = RESEND}, +#ifndef __STRIP__ /* See strip.py */ .offset = 0, 
.length = 2000, .priority = 5}; +#else /* See strip.py */ + .offset = 0, .length = 2000}; +#endif /* See strip.py */ homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_STREQ("xmit UNKNOWN", unit_log_get()); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_incoming, homa_dispatch_pkts__reset_counters) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -1064,7 +1108,9 @@ TEST_F(homa_incoming, homa_dispatch_pkts__reset_counters) .offset = htonl(12600), .priority = 3, .resend_all = 0}; ASSERT_NE(NULL, crpc); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(10000, crpc->msgout.granted); +#endif /* See strip.py */ unit_log_clear(); crpc->silent_ticks = 5; crpc->peer->outstanding_resends = 2; @@ -1082,6 +1128,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__reset_counters) EXPECT_EQ(5, crpc->silent_ticks); EXPECT_EQ(0, crpc->peer->outstanding_resends); } +#endif /* See strip.py */ TEST_F(homa_incoming, homa_dispatch_pkts__multiple_ack_packets) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_OUTGOING, @@ -1114,14 +1161,18 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_type) self->server_port, self->client_id, 20000, 1600); ASSERT_NE(NULL, crpc); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(10000, crpc->msgout.granted); +#endif /* See strip.py */ unit_log_clear(); struct homa_common_hdr h = {.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = 99}; homa_dispatch_pkts(mock_skb_new(self->client_ip, &h, 0, 0), &self->homa); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->unknown_packet_types); +#endif /* See strip.py */ } TEST_F(homa_incoming, homa_dispatch_pkts__handle_ack) { @@ -1160,6 +1211,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__too_many_acks) EXPECT_STREQ("sk->sk_data_ready invoked; ack 1237; ack 1235", unit_log_get()); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_incoming, homa_dispatch_pkts__invoke_homa_grant_check_rpc) { self->data.incoming = htonl(1000); @@ -1170,6 +1222,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__invoke_homa_grant_check_rpc) unit_log_grantables(&self->homa); EXPECT_SUBSTR("id 1235", unit_log_get()); } +#endif /* See strip.py */ TEST_F(homa_incoming, homa_dispatch_pkts__forced_reap) { struct homa_rpc *dead = unit_client_rpc(&self->hsk, @@ -1179,7 +1232,11 @@ TEST_F(homa_incoming, homa_dispatch_pkts__forced_reap) mock_ns_tick = 10; homa_rpc_end(dead); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(31, self->hsk.dead_skbs); +#else /* See strip.py */ + EXPECT_EQ(30, self->hsk.dead_skbs); +#endif /* See strip.py */ srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, self->server_id, 10000, 5000); @@ -1190,16 +1247,28 @@ TEST_F(homa_incoming, homa_dispatch_pkts__forced_reap) self->data.common.dport = htons(self->hsk.port); homa_dispatch_pkts(mock_skb_new(self->client_ip, &self->data.common, 1400, 0), &self->homa); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(31, self->hsk.dead_skbs); +#else /* See strip.py */ + EXPECT_EQ(30, self->hsk.dead_skbs); +#endif /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(0, homa_metrics_per_cpu()->data_pkt_reap_ns); +#endif /* See strip.py */ /* Second packet: must reap. 
*/ self->homa.dead_buffs_limit = 15; self->homa.reap_limit = 10; homa_dispatch_pkts(mock_skb_new(self->client_ip, &self->data.common, 1400, 0), &self->homa); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(21, self->hsk.dead_skbs); +#else /* See strip.py */ + EXPECT_EQ(20, self->hsk.dead_skbs); +#endif /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ EXPECT_NE(0, homa_metrics_per_cpu()->data_pkt_reap_ns); +#endif /* See strip.py */ } TEST_F(homa_incoming, homa_data_pkt__basics) @@ -1218,8 +1287,10 @@ TEST_F(homa_incoming, homa_data_pkt__basics) EXPECT_EQ(1, unit_list_length(&self->hsk.ready_responses)); EXPECT_EQ(200, crpc->msgin.bytes_remaining); EXPECT_EQ(1, skb_queue_len(&crpc->msgin.packets)); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1600, crpc->msgin.granted); EXPECT_EQ(1, homa_metrics_per_cpu()->responses_received); +#endif /* See strip.py */ } TEST_F(homa_incoming, homa_data_pkt__wrong_client_rpc_state) { @@ -1248,7 +1319,9 @@ TEST_F(homa_incoming, homa_data_pkt__initialize_msgin) homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, 1400, 0), crpc); EXPECT_EQ(200, crpc->msgin.bytes_remaining); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1600, crpc->msgin.granted); +#endif /* See strip.py */ } TEST_F(homa_incoming, homa_data_pkt__no_buffer_pool) { @@ -1288,7 +1361,9 @@ TEST_F(homa_incoming, homa_data_pkt__no_buffers) atomic_set(&self->hsk.buffer_pool->free_bpages, 0); homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, 1400, 0), crpc); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1400, homa_metrics_per_cpu()->dropped_data_no_bufs); +#endif /* See strip.py */ EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); } TEST_F(homa_incoming, homa_data_pkt__update_delta) @@ -1302,7 +1377,9 @@ TEST_F(homa_incoming, homa_data_pkt__update_delta) /* Total incoming goes up on first packet (count unscheduled bytes). 
*/ self->data.message_length = htonl(5000); +#ifndef __STRIP__ /* See strip.py */ self->data.incoming = htonl(4000); +#endif /* See strip.py */ homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, 1400, 0), crpc); @@ -1347,6 +1424,7 @@ TEST_F(homa_incoming, homa_data_pkt__handoff) 200, 0), crpc); EXPECT_STREQ("", unit_log_get()); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_incoming, homa_data_pkt__send_cutoffs) { self->homa.cutoff_version = 2; @@ -1476,6 +1554,7 @@ TEST_F(homa_incoming, homa_grant_pkt__grant_past_end_of_message) &self->homa); EXPECT_EQ(20000, crpc->msgout.granted); } +#endif /* See strip.py */ TEST_F(homa_incoming, homa_resend_pkt__unknown_rpc) { @@ -1484,8 +1563,7 @@ TEST_F(homa_incoming, homa_resend_pkt__unknown_rpc) .sender_id = cpu_to_be64(self->client_id), .type = RESEND}, .offset = htonl(100), - .length = htonl(200), - .priority = 3}; + .length = htonl(200)}; homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), &self->homa); @@ -1498,8 +1576,7 @@ TEST_F(homa_incoming, homa_resend_pkt__rpc_in_service_server_sends_busy) .sender_id = cpu_to_be64(self->client_id), .type = RESEND}, .offset = htonl(0), - .length = htonl(200), - .priority = 3}; + .length = htonl(200)}; struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_IN_SERVICE, self->client_ip, self->server_ip, self->client_port, self->server_id, 2000, 20000); @@ -1521,14 +1598,15 @@ TEST_F(homa_incoming, homa_resend_pkt__rpc_incoming_server_sends_busy) .sender_id = cpu_to_be64(self->client_id), .type = RESEND}, .offset = htonl(1400), - .length = htonl(200), - .priority = 3}; + .length = htonl(200)}; struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, self->server_id, 2000, 20000); ASSERT_NE(NULL, srpc); +#ifndef __STRIP__ /* See strip.py */ srpc->msgin.granted = 1400; +#endif /* See strip.py */ unit_log_clear(); homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), @@ -1546,8 +1624,7 @@ TEST_F(homa_incoming, homa_resend_pkt__client_not_outgoing) .sender_id = cpu_to_be64(self->server_id), .type = RESEND}, .offset = htonl(100), - .length = htonl(200), - .priority = 3}; + .length = htonl(200)}; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, 2000, 3000); @@ -1559,6 +1636,7 @@ TEST_F(homa_incoming, homa_resend_pkt__client_not_outgoing) &self->homa); EXPECT_STREQ("xmit DATA retrans 1400@0", unit_log_get()); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_incoming, homa_resend_pkt__send_busy_instead_of_data) { struct homa_resend_hdr h = {{.sport = htons(self->server_port), @@ -1566,8 +1644,7 @@ TEST_F(homa_incoming, homa_resend_pkt__send_busy_instead_of_data) .sender_id = cpu_to_be64(self->server_id), .type = RESEND}, .offset = htonl(100), - .length = htonl(200), - .priority = 3}; + .length = htonl(200)}; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 2000, 100); @@ -1579,6 +1656,7 @@ TEST_F(homa_incoming, homa_resend_pkt__send_busy_instead_of_data) &self->homa); EXPECT_SUBSTR("xmit BUSY", unit_log_get()); } +#endif /* See strip.py */ TEST_F(homa_incoming, homa_resend_pkt__client_send_data) { struct homa_resend_hdr h = {{.sport = htons(self->server_port), @@ -1586,8 +1664,12 @@ TEST_F(homa_incoming, homa_resend_pkt__client_send_data) .sender_id = cpu_to_be64(self->server_id), .type = RESEND}, .offset = htonl(100), 
+#ifndef __STRIP__ /* See strip.py */ .length = htonl(200), .priority = 3}; +#else /* See strip.py */ + .length = htonl(200)}; +#endif /* See strip.py */ struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 2000, 100); @@ -1600,7 +1682,9 @@ TEST_F(homa_incoming, homa_resend_pkt__client_send_data) homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), &self->homa); EXPECT_SUBSTR("xmit DATA retrans 1400@0", unit_log_get()); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("3", mock_xmit_prios); +#endif /* See strip.py */ } TEST_F(homa_incoming, homa_resend_pkt__server_send_data) { @@ -1609,8 +1693,12 @@ TEST_F(homa_incoming, homa_resend_pkt__server_send_data) .sender_id = cpu_to_be64(self->client_id), .type = RESEND}, .offset = htonl(100), +#ifndef __STRIP__ /* See strip.py */ .length = htonl(2000), .priority = 4}; +#else /* See strip.py */ + .length = htonl(2000)}; +#endif /* See strip.py */ struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, self->server_id, 100, 20000); @@ -1624,7 +1712,9 @@ TEST_F(homa_incoming, homa_resend_pkt__server_send_data) &self->homa); EXPECT_STREQ("xmit DATA retrans 1400@0; " "xmit DATA retrans 1400@1400", unit_log_get()); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("4 4", mock_xmit_prios); +#endif /* See strip.py */ } TEST_F(homa_incoming, homa_unknown_pkt__client_resend_all) @@ -1644,9 +1734,15 @@ TEST_F(homa_incoming, homa_unknown_pkt__client_resend_all) mock_xmit_log_verbose = 1; homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), &self->homa); +#ifndef __STRIP__ /* See strip.py */ EXPECT_SUBSTR("xmit DATA from 0.0.0.0:32768, dport 99, id 1234, message_length 2000, offset 0, data_length 1400, incoming 2000, RETRANSMIT; " "xmit DATA from 0.0.0.0:32768, dport 99, id 1234, message_length 2000, offset 1400, data_length 600, incoming 2000, RETRANSMIT", unit_log_get()); +#else /* See strip.py */ + EXPECT_SUBSTR("xmit DATA from 0.0.0.0:32768, dport 99, id 1234, message_length 2000, offset 0, data_length 1400, RETRANSMIT; " + "xmit DATA from 0.0.0.0:32768, dport 99, id 1234, message_length 2000, offset 1400, data_length 600, RETRANSMIT", + unit_log_get()); +#endif /* See strip.py */ EXPECT_EQ(-1, crpc->msgin.length); } TEST_F(homa_incoming, homa_unknown_pkt__client_resend_part) @@ -1660,15 +1756,22 @@ TEST_F(homa_incoming, homa_unknown_pkt__client_resend_part) self->server_port, self->client_id, 2000, 2000); ASSERT_NE(NULL, crpc); +#ifndef __STRIP__ /* See strip.py */ crpc->msgout.granted = 1400; +#endif /* See strip.py */ homa_xmit_data(crpc, false); unit_log_clear(); mock_xmit_log_verbose = 1; homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), &self->homa); +#ifndef __STRIP__ /* See strip.py */ EXPECT_SUBSTR("xmit DATA from 0.0.0.0:32768, dport 99, id 1234, message_length 2000, offset 0, data_length 1400, incoming 1400, RETRANSMIT", unit_log_get()); +#else /* See strip.py */ + EXPECT_SUBSTR("xmit DATA from 0.0.0.0:32768, dport 99, id 1234, message_length 2000, offset 0, data_length 1400, RETRANSMIT", + unit_log_get()); +#endif /* See strip.py */ EXPECT_EQ(-1, crpc->msgin.length); } TEST_F(homa_incoming, homa_unknown_pkt__free_server_rpc) @@ -1689,6 +1792,7 @@ TEST_F(homa_incoming, homa_unknown_pkt__free_server_rpc) EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc)); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_incoming, homa_cutoffs_pkt_basics) { struct 
homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -1732,6 +1836,7 @@ TEST_F(homa_incoming, homa_cutoffs__cant_find_peer) ASSERT_FALSE(IS_ERR(peer)); EXPECT_EQ(0, peer->cutoff_version); } +#endif /* See strip.py */ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_fully_received) { @@ -1751,8 +1856,10 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_fully_received) &self->homa); EXPECT_STREQ("xmit ACK from 0.0.0.0:32768, dport 99, id 1234, acks", unit_log_get()); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->packets_received[ NEED_ACK - DATA]); +#endif /* See strip.py */ } TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_not_fully_received) { @@ -1771,8 +1878,10 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_not_fully_received) homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), &self->homa); EXPECT_STREQ("", unit_log_get()); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->packets_received[ NEED_ACK - DATA]); +#endif /* See strip.py */ } TEST_F(homa_incoming, homa_need_ack_pkt__rpc_not_incoming) { @@ -1791,8 +1900,10 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_not_incoming) homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), &self->homa); EXPECT_STREQ("", unit_log_get()); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->packets_received[ NEED_ACK - DATA]); +#endif /* See strip.py */ } TEST_F(homa_incoming, homa_need_ack_pkt__rpc_doesnt_exist) { @@ -1833,7 +1944,9 @@ TEST_F(homa_incoming, homa_ack_pkt__target_rpc_exists_no_extras) homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_EQ(0, unit_list_length(&self->hsk2.active_rpcs)); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->packets_received[ACK - DATA]); +#endif /* See strip.py */ } TEST_F(homa_incoming, homa_ack_pkt__target_rpc_exists_plus_extras) { @@ -2393,7 +2506,11 @@ TEST_F(homa_incoming, homa_wait_for_message__rpc_arrives_while_sleeping) self->server_port, self->client_id+2, 20000, 20000); self->homa.reap_limit = 5; homa_rpc_end(crpc2); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(31, self->hsk.dead_skbs); +#else /* See strip.py */ + EXPECT_EQ(30, self->hsk.dead_skbs); +#endif /* See strip.py */ unit_log_clear(); hook_rpc = crpc1; @@ -2576,6 +2693,7 @@ TEST_F(homa_incoming, homa_choose_interest__empty_list) EXPECT_EQ(NULL, result); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_incoming, homa_choose_interest__find_idle_core) { struct homa_interest interest1, interest2, interest3; @@ -2630,6 +2748,7 @@ TEST_F(homa_incoming, homa_choose_interest__all_cores_busy) ASSERT_NE(NULL, result); EXPECT_EQ(1, result->core); } +#endif /* See strip.py */ TEST_F(homa_incoming, homa_rpc_handoff__handoff_already_in_progress) { @@ -2801,6 +2920,7 @@ TEST_F(homa_incoming, homa_rpc_handoff__detach_interest) EXPECT_EQ(0, unit_list_length(&self->hsk.request_interests)); atomic_andnot(RPC_HANDING_OFF, &crpc->flags); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_incoming, homa_rpc_handoff__update_last_app_active) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -2867,3 +2987,4 @@ TEST_F(homa_incoming, homa_incoming_sysctl_changed__convert_usec_to_ns) EXPECT_EQ(53000, self->homa.busy_ns); EXPECT_EQ(140000, self->homa.gro_busy_ns); } +#endif /* See strip.py */ diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index ec74d447..b780132e 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -47,17 +47,17 
@@ FIXTURE_SETUP(homa_offload) global_homa = &self->homa; mock_sock_init(&self->hsk, &self->homa, 99); self->ip = unit_get_in_addr("196.168.0.1"); - self->header = (struct homa_data_hdr){.common = { - .sport = htons(40000), .dport = htons(99), - .type = DATA, - .flags = HOMA_TCP_FLAGS, - .urgent = HOMA_TCP_URGENT, - .sender_id = cpu_to_be64(1000)}, - .message_length = htonl(10000), - .incoming = htonl(10000), .cutoff_version = 0, - .ack = {0, 0}, - .retransmit = 0, - .seg = {.offset = htonl(2000)}}; + memset(&self->header, 0, sizeof(self->header)); + self->header.common = (struct homa_common_hdr){ + .sport = htons(40000), .dport = htons(99), + .type = DATA, + .flags = HOMA_TCP_FLAGS, + .urgent = HOMA_TCP_URGENT, + .sender_id = cpu_to_be64(1000) + }; + self->header.message_length = htonl(10000); + self->header.incoming = htonl(10000); + self->header.seg.offset = htonl(2000); for (i = 0; i < GRO_HASH_BUCKETS; i++) { INIT_LIST_HEAD(&self->napi.gro_hash[i].list); self->napi.gro_hash[i].count = 0; @@ -261,16 +261,13 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS) int server_port = 99; struct homa_data_hdr h; + memset(&h, 0, sizeof(h)); h.common.sport = htons(40000); h.common.dport = htons(server_port); h.common.type = DATA; h.common.sender_id = cpu_to_be64(client_id); h.message_length = htonl(10000); h.incoming = htonl(10000); - h.cutoff_version = 0; - h.ack.client_id = 0; - h.ack.server_port = 0; - h.retransmit = 0; h.seg.offset = htonl(2000); srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 9dc9b9ed..9bb36088 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -3,7 +3,11 @@ #include "homa_impl.h" #include "homa_peer.h" #include "homa_rpc.h" +#ifndef __STRIP__ /* See strip.py */ #include "homa_skb.h" +#else /* See strip.py */ +#include "homa_stub.h" +#endif /* See strip.py */ #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" @@ -33,6 +37,16 @@ void lock_free_hook(char *id) } } +#ifdef __STRIP__ /* See strip.py */ +void mock_resend_data(struct homa_rpc *rpc, int start, int end, + int priority) +{ + homa_resend_data(rpc, start, end); +} +#define homa_resend_data(rpc, start, end, priority) \ + mock_resend_data(rpc, start, end, priority); +#endif /* See strip.py */ + FIXTURE(homa_outgoing) { struct in6_addr client_ip[1]; int client_port; @@ -58,8 +72,10 @@ FIXTURE_SETUP(homa_outgoing) atomic64_set(&self->homa.link_idle_time, 10000); self->homa.ns_per_mbyte = 1000000; self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; +#ifndef __STRIP__ /* See strip.py */ self->homa.unsched_bytes = 10000; self->homa.window_param = 10000; +#endif /* See strip.py */ mock_sock_init(&self->hsk, &self->homa, self->client_port); self->server_addr.in6.sin6_family = AF_INET; self->server_addr.in6.sin6_addr = self->server_ip[0]; @@ -74,6 +90,7 @@ FIXTURE_TEARDOWN(homa_outgoing) unit_teardown(); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_outgoing, set_priority__priority_mapping) { struct homa_grant_hdr h; @@ -91,6 +108,7 @@ TEST_F(homa_outgoing, set_priority__priority_mapping) EXPECT_EQ(0, homa_xmit_control(GRANT, &h, sizeof(h), srpc)); EXPECT_STREQ("7 3", mock_xmit_prios); } +#endif /* See strip.py */ TEST_F(homa_outgoing, homa_fill_data_interleaved) { @@ -110,8 +128,13 @@ TEST_F(homa_outgoing, homa_fill_data_interleaved) "_copy_from_iter 1500 bytes at 4000; " "_copy_from_iter 500 bytes at 5500", unit_log_get()); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("DATA from 
0.0.0.0:40000, dport 99, id 2, message_length 10000, offset 10000, data_length 1500, incoming 10000, extra segs 1500@11500 1500@13000 500@14500", homa_print_packet(skb, buffer, sizeof(buffer))); +#else /* See strip.py */ + EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, message_length 10000, offset 10000, data_length 1500, extra segs 1500@11500 1500@13000 500@14500", + homa_print_packet(skb, buffer, sizeof(buffer))); +#endif /* See strip.py */ EXPECT_EQ(5000 + sizeof32(struct homa_data_hdr) + 3*sizeof32(struct homa_seg_hdr), skb->len); kfree_skb(skb); @@ -147,8 +170,13 @@ TEST_F(homa_outgoing, homa_new_data_packet__one_segment) skb = homa_new_data_packet(crpc, iter, 5000, 500, 2000); EXPECT_STREQ("_copy_from_iter 500 bytes at 1000", unit_log_get()); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, message_length 500, offset 5000, data_length 500, incoming 500", homa_print_packet(skb, buffer, sizeof(buffer))); +#else /* See strip.py */ + EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, message_length 500, offset 5000, data_length 500", + homa_print_packet(skb, buffer, sizeof(buffer))); +#endif /* See strip.py */ EXPECT_EQ(0, skb_shinfo(skb)->gso_segs); kfree_skb(skb); @@ -186,8 +214,13 @@ TEST_F(homa_outgoing, homa_new_data_packet__multiple_segments_homa_fill_data_int "_copy_from_iter 1500 bytes at 2500; " "_copy_from_iter 1500 bytes at 4000; " "_copy_from_iter 500 bytes at 5500", unit_log_get()); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, message_length 10000, offset 10000, data_length 1500, incoming 10000, extra segs 1500@11500 1500@13000 500@14500", homa_print_packet(skb, buffer, sizeof(buffer))); +#else /* See strip.py */ + EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, message_length 10000, offset 10000, data_length 1500, extra segs 1500@11500 1500@13000 500@14500", + homa_print_packet(skb, buffer, sizeof(buffer))); +#endif /* See strip.py */ EXPECT_EQ(4*(sizeof(struct homa_data_hdr) + crpc->hsk->ip_header_length + HOMA_ETH_OVERHEAD) + 5000, @@ -195,6 +228,7 @@ TEST_F(homa_outgoing, homa_new_data_packet__multiple_segments_homa_fill_data_int EXPECT_EQ(5000, homa_get_skb_info(skb)->data_bytes); kfree_skb(skb); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_outgoing, homa_new_data_packet__error_in_homa_fill_data_interleaved) { struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); @@ -250,6 +284,7 @@ TEST_F(homa_outgoing, homa_new_data_packet__error_copying_data_hijacking_path) EXPECT_TRUE(IS_ERR(skb)); EXPECT_EQ(EFAULT, -PTR_ERR(skb)); } +#endif /* See strip.py */ TEST_F(homa_outgoing, homa_new_data_packet__gso_information) { struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); @@ -297,7 +332,9 @@ TEST_F(homa_outgoing, homa_message_out_fill__basics) ASSERT_EQ(0, -homa_message_out_fill(crpc, unit_iov_iter((void *) 1000, 3000), 0)); homa_rpc_unlock(crpc); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(3000, crpc->msgout.granted); +#endif /* See strip.py */ EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); EXPECT_STREQ("mtu 1496, max_seg_data 1400, max_gso_data 1400; " "_copy_from_iter 1400 bytes at 1000; " @@ -305,10 +342,17 @@ TEST_F(homa_outgoing, homa_message_out_fill__basics) "_copy_from_iter 200 bytes at 3800", unit_log_get()); unit_log_clear(); unit_log_message_out_packets(&crpc->msgout, 1); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, message_length 3000, offset 0, data_length 1400, incoming 3000; " 
"DATA from 0.0.0.0:40000, dport 99, id 2, message_length 3000, offset 1400, data_length 1400, incoming 3000; " "DATA from 0.0.0.0:40000, dport 99, id 2, message_length 3000, offset 2800, data_length 200, incoming 3000", unit_log_get()); +#else /* See strip.py */ + EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, message_length 3000, offset 0, data_length 1400; " + "DATA from 0.0.0.0:40000, dport 99, id 2, message_length 3000, offset 1400, data_length 1400; " + "DATA from 0.0.0.0:40000, dport 99, id 2, message_length 3000, offset 2800, data_length 200", + unit_log_get()); +#endif /* See strip.py */ EXPECT_EQ(3, crpc->msgout.num_skbs); EXPECT_EQ(3000, crpc->msgout.copied_from_user); } @@ -333,6 +377,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__zero_length_message) unit_iov_iter((void *) 1000, 0), 0)); homa_rpc_unlock(crpc); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_outgoing, homa_message_out_fill__gso_geometry_hijacking) { struct homa_rpc *crpc1, *crpc2; @@ -362,6 +407,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_geometry_hijacking) homa_rpc_unlock(crpc2); EXPECT_SUBSTR("max_seg_data 1400, max_gso_data 4200", unit_log_get()); } +#endif /* See strip.py */ TEST_F(homa_outgoing, homa_message_out_fill__gso_geometry_no_hijacking) { struct homa_rpc *crpc1, *crpc2; @@ -527,7 +573,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__too_short_for_pipelining) TEST_F(homa_outgoing, homa_xmit_control__server_request) { - struct homa_grant_hdr h; + struct homa_busy_hdr h; struct homa_rpc *srpc; homa_sock_bind(self->homa.port_map, &self->hsk, self->server_port); @@ -537,19 +583,17 @@ TEST_F(homa_outgoing, homa_xmit_control__server_request) ASSERT_NE(NULL, srpc); unit_log_clear(); - h.offset = htonl(12345); - h.priority = 4; - h.resend_all = 0; - h.common.sender_id = cpu_to_be64(self->client_id); mock_xmit_log_verbose = 1; - EXPECT_EQ(0, homa_xmit_control(GRANT, &h, sizeof(h), srpc)); - EXPECT_STREQ("xmit GRANT from 0.0.0.0:99, dport 40000, id 1235, offset 12345, grant_prio 4", + EXPECT_EQ(0, homa_xmit_control(BUSY, &h, sizeof(h), srpc)); + EXPECT_STREQ("xmit BUSY from 0.0.0.0:99, dport 40000, id 1235", unit_log_get()); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("7", mock_xmit_prios); +#endif /* See strip.py */ } TEST_F(homa_outgoing, homa_xmit_control__client_response) { - struct homa_grant_hdr h; + struct homa_busy_hdr h; struct homa_rpc *crpc; crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, @@ -558,19 +602,18 @@ TEST_F(homa_outgoing, homa_xmit_control__client_response) ASSERT_NE(NULL, crpc); unit_log_clear(); - h.offset = htonl(12345); - h.priority = 4; - h.resend_all = 0; mock_xmit_log_verbose = 1; - EXPECT_EQ(0, homa_xmit_control(GRANT, &h, sizeof(h), crpc)); - EXPECT_STREQ("xmit GRANT from 0.0.0.0:40000, dport 99, id 1234, offset 12345, grant_prio 4", + EXPECT_EQ(0, homa_xmit_control(BUSY, &h, sizeof(h), crpc)); + EXPECT_STREQ("xmit BUSY from 0.0.0.0:40000, dport 99, id 1234", unit_log_get()); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("7", mock_xmit_prios); +#endif /* See strip.py */ } TEST_F(homa_outgoing, __homa_xmit_control__cant_alloc_skb) { - struct homa_grant_hdr h; + struct homa_busy_hdr h; struct homa_rpc *srpc; srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, @@ -578,10 +621,7 @@ TEST_F(homa_outgoing, __homa_xmit_control__cant_alloc_skb) ASSERT_NE(NULL, srpc); unit_log_clear(); - h.common.type = GRANT; - h.offset = htonl(12345); - h.priority = 4; - h.resend_all = 0; + h.common.type = BUSY; 
mock_xmit_log_verbose = 1; mock_alloc_skb_errors = 1; EXPECT_EQ(ENOBUFS, -__homa_xmit_control(&h, sizeof(h), srpc->peer, @@ -602,6 +642,7 @@ TEST_F(homa_outgoing, __homa_xmit_control__pad_packet) "xmit unknown packet type 0x0", unit_log_get()); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_outgoing, __homa_xmit_control__ipv4_error) { struct homa_grant_hdr h; @@ -658,7 +699,6 @@ TEST_F(homa_outgoing, homa_xmit_unknown) .sender_id = cpu_to_be64(99990), .type = GRANT}, .offset = htonl(11200), - .priority = 3, .resend_all = 0}; struct sk_buff *skb; @@ -669,6 +709,7 @@ TEST_F(homa_outgoing, homa_xmit_unknown) unit_log_get()); kfree_skb(skb); } +#endif /* See strip.py */ TEST_F(homa_outgoing, homa_xmit_data__basics) { @@ -676,24 +717,36 @@ TEST_F(homa_outgoing, homa_xmit_data__basics) UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 6000, 1000); +#ifndef __STRIP__ /* See strip.py */ crpc->msgout.sched_priority = 2; crpc->msgout.unscheduled = 2000; crpc->msgout.granted = 5000; homa_peer_set_cutoffs(crpc->peer, INT_MAX, 0, 0, 0, 0, INT_MAX, 7000, 0); +#endif /* See strip.py */ unit_log_clear(); mock_clear_xmit_prios(); homa_xmit_data(crpc, false); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("xmit DATA 1400@0; " "xmit DATA 1400@1400; " "xmit DATA 1400@2800; " "xmit DATA 1400@4200", unit_log_get()); EXPECT_STREQ("6 6 2 2", mock_xmit_prios); EXPECT_EQ(5600, crpc->msgout.next_xmit_offset); +#else /* See strip.py */ + EXPECT_STREQ("xmit DATA 1400@0; " + "xmit DATA 1400@1400; " + "xmit DATA 1400@2800; " + "xmit DATA 1400@4200; " + "xmit DATA 400@5600", unit_log_get()); + EXPECT_EQ(6000, crpc->msgout.next_xmit_offset); +#endif /* See strip.py */ unit_log_clear(); unit_log_throttled(&self->homa); EXPECT_STREQ("", unit_log_get()); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_outgoing, homa_xmit_data__stop_because_no_more_granted) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -708,6 +761,7 @@ TEST_F(homa_outgoing, homa_xmit_data__stop_because_no_more_granted) unit_log_throttled(&self->homa); EXPECT_STREQ("", unit_log_get()); } +#endif /* See strip.py */ TEST_F(homa_outgoing, homa_xmit_data__below_throttle_min) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -778,8 +832,10 @@ TEST_F(homa_outgoing, homa_xmit_data__rpc_freed) UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 6000, 1000); +#ifndef __STRIP__ /* See strip.py */ crpc->msgout.unscheduled = 2000; crpc->msgout.granted = 5000; +#endif /* See strip.py */ unit_log_clear(); unit_hook_register(lock_free_hook); @@ -790,6 +846,7 @@ TEST_F(homa_outgoing, homa_xmit_data__rpc_freed) EXPECT_EQ(1400, crpc->msgout.next_xmit_offset); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_outgoing, __homa_xmit_data__update_cutoff_version) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -803,6 +860,7 @@ TEST_F(homa_outgoing, __homa_xmit_data__update_cutoff_version) __homa_xmit_data(crpc->msgout.packets, crpc, 4); EXPECT_SUBSTR("cutoff_version 123", unit_log_get()); } +#endif /* See strip.py */ TEST_F(homa_outgoing, __homa_xmit_data__fill_dst) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -816,11 +874,16 @@ TEST_F(homa_outgoing, __homa_xmit_data__fill_dst) old_refcount = atomic_read(&dst->__rcuref.refcnt); skb_get(crpc->msgout.packets); +#ifndef __STRIP__ /* See strip.py */ __homa_xmit_data(crpc->msgout.packets, crpc, 6); +#else /* See strip.py */ + __homa_xmit_data(crpc->msgout.packets, crpc); +#endif /* See strip.py */ EXPECT_STREQ("xmit 
DATA 1000@0", unit_log_get()); EXPECT_EQ(dst, skb_dst(crpc->msgout.packets)); EXPECT_EQ(old_refcount+1, atomic_read(&dst->__rcuref.refcnt)); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_outgoing, __homa_xmit_data__ipv4_transmit_error) { struct homa_rpc *crpc; @@ -857,6 +920,7 @@ TEST_F(homa_outgoing, __homa_xmit_data__ipv6_transmit_error) __homa_xmit_data(crpc->msgout.packets, crpc, 5); EXPECT_EQ(1, homa_metrics_per_cpu()->data_xmit_errors); } +#endif /* See strip.py */ TEST_F(homa_outgoing, homa_resend_data__basics) { @@ -874,11 +938,20 @@ TEST_F(homa_outgoing, homa_resend_data__basics) skb_push(crpc->msgout.packets, 8); homa_resend_data(crpc, 7000, 10000, 2); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("xmit DATA from 0.0.0.0:40000, dport 99, id 1234, message_length 16000, offset 7000, data_length 1400, incoming 10000, RETRANSMIT; " "xmit DATA from 0.0.0.0:40000, dport 99, id 1234, message_length 16000, offset 8400, data_length 1400, incoming 10000, RETRANSMIT; " "xmit DATA from 0.0.0.0:40000, dport 99, id 1234, message_length 16000, offset 9800, data_length 200, incoming 10000, RETRANSMIT", unit_log_get()); +#else /* See strip.py */ + EXPECT_STREQ("xmit DATA from 0.0.0.0:40000, dport 99, id 1234, message_length 16000, offset 7000, data_length 1400, RETRANSMIT; " + "xmit DATA from 0.0.0.0:40000, dport 99, id 1234, message_length 16000, offset 8400, data_length 1400, RETRANSMIT; " + "xmit DATA from 0.0.0.0:40000, dport 99, id 1234, message_length 16000, offset 9800, data_length 1400, RETRANSMIT", + unit_log_get()); +#endif /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("2 2 2", mock_xmit_prios); +#endif /* See strip.py */ unit_log_clear(); mock_clear_xmit_prios(); @@ -891,7 +964,9 @@ TEST_F(homa_outgoing, homa_resend_data__basics) mock_xmit_log_verbose = 0; homa_resend_data(crpc, 2800, 4200, 3); EXPECT_STREQ("xmit DATA retrans 1400@2800", unit_log_get()); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("3", mock_xmit_prios); +#endif /* See strip.py */ unit_log_clear(); mock_clear_xmit_prios(); @@ -899,7 +974,9 @@ TEST_F(homa_outgoing, homa_resend_data__basics) homa_resend_data(crpc, 4199, 4201, 7); EXPECT_STREQ("xmit DATA retrans 1400@2800; " "xmit DATA retrans 1400@4200", unit_log_get()); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("7 7", mock_xmit_prios); +#endif /* See strip.py */ unit_log_clear(); mock_xmit_log_verbose = 0; @@ -931,6 +1008,7 @@ TEST_F(homa_outgoing, homa_resend_data__cant_allocate_skb) homa_resend_data(crpc, 7000, 10000, 2); EXPECT_STREQ("skb allocation error", unit_log_get()); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_outgoing, homa_resend_data__set_incoming) { struct homa_rpc *crpc; @@ -968,6 +1046,7 @@ TEST_F(homa_outgoing, homa_resend_data__error_copying_data) EXPECT_STREQ("homa_resend_data got error 22 while copying data", unit_log_get()); } +#endif /* See strip.py */ TEST_F(homa_outgoing, homa_resend_data__set_homa_info) { struct homa_rpc *crpc; @@ -985,6 +1064,7 @@ TEST_F(homa_outgoing, homa_resend_data__set_homa_info) unit_log_get()); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_outgoing, homa_outgoing_sysctl_changed) { self->homa.link_mbps = 10000; @@ -999,6 +1079,7 @@ TEST_F(homa_outgoing, homa_outgoing_sysctl_changed) homa_outgoing_sysctl_changed(&self->homa); EXPECT_EQ(202000, self->homa.ns_per_mbyte); } +#endif /* See strip.py */ TEST_F(homa_outgoing, homa_check_nic_queue__basics) { @@ -1065,8 +1146,10 @@ TEST_F(homa_outgoing, homa_check_nic_queue__pacer_metrics) EXPECT_EQ(1, 
homa_check_nic_queue(&self->homa, crpc->msgout.packets, true)); EXPECT_EQ(10500, atomic64_read(&self->homa.link_idle_time)); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(500, homa_metrics_per_cpu()->pacer_bytes); EXPECT_EQ(200, homa_metrics_per_cpu()->pacer_lost_ns); +#endif /* See strip.py */ } TEST_F(homa_outgoing, homa_check_nic_queue__queue_empty) { @@ -1226,7 +1309,9 @@ TEST_F(homa_outgoing, homa_pacer_xmit__rpc_locked) mock_trylock_errors = ~1; EXPECT_EQ(1, homa_pacer_xmit(&self->homa)); EXPECT_STREQ("", unit_log_get()); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->pacer_skipped_rpcs); +#endif /* See strip.py */ unit_log_clear(); mock_trylock_errors = 0; EXPECT_EQ(1, homa_pacer_xmit(&self->homa)); @@ -1319,6 +1404,7 @@ TEST_F(homa_outgoing, homa_add_to_throttled__basics) "request id 8, next_offset 0; " "request id 6, next_offset 0", unit_log_get()); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_outgoing, homa_add_to_throttled__inc_metrics) { struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, @@ -1343,6 +1429,7 @@ TEST_F(homa_outgoing, homa_add_to_throttled__inc_metrics) EXPECT_EQ(3, homa_metrics_per_cpu()->throttle_list_adds); EXPECT_EQ(3, homa_metrics_per_cpu()->throttle_list_checks); } +#endif /* See strip.py */ TEST_F(homa_outgoing, homa_remove_from_throttled) { diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index e13eaf32..3810052b 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -13,6 +13,7 @@ struct in6_addr ip1111[1]; struct in6_addr ip2222[1]; struct in6_addr ip3333[1]; +#ifndef __STRIP__ /* See strip.py */ static int hook_new_peer_count; static struct homa_peertab *hook_peertab; static struct homa_sock *hook_hsk; @@ -32,6 +33,7 @@ static void kmalloc_hook(char *id) homa_peer_find(hook_peertab, &addr, &hook_hsk->inet); } } +#endif /* See strip.py */ FIXTURE(homa_peer) { struct homa homa; @@ -70,12 +72,14 @@ static int dead_count(struct homa_peertab *peertab) return count; } +#ifndef __STRIP__ /* See strip.py */ static void peer_spinlock_hook(char *id) { if (strcmp(id, "spin_lock") != 0) return; mock_ns += 1000; } +#endif /* See strip.py */ TEST_F(homa_peer, homa_peer_find__basics) { @@ -84,8 +88,10 @@ TEST_F(homa_peer, homa_peer_find__basics) peer = homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); ASSERT_NE(NULL, peer); EXPECT_EQ_IP(*ip1111, peer->addr); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(INT_MAX, peer->unsched_cutoffs[HOMA_MAX_PRIORITIES-2]); EXPECT_EQ(0, peer->cutoff_version); +#endif /* See strip.py */ peer2 = homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); EXPECT_EQ(peer, peer2); @@ -93,7 +99,9 @@ TEST_F(homa_peer, homa_peer_find__basics) peer2 = homa_peer_find(&self->peertab, ip2222, &self->hsk.inet); EXPECT_NE(peer, peer2); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(2, homa_metrics_per_cpu()->peer_new_entries); +#endif /* See strip.py */ } static struct _test_data_homa_peer *test_data; @@ -143,6 +151,7 @@ TEST_F(homa_peer, homa_peertab_gc_dsts) EXPECT_EQ(0, dead_count(&self->peertab)); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_peer, homa_peertab_get_peers__not_init) { struct homa_peertab peertab; @@ -233,6 +242,7 @@ TEST_F(homa_peer, homa_peertab_get_peers__many_new_peers_created_concurrently) EXPECT_EQ(12, num_peers); kfree(peers); } +#endif /* See strip.py */ TEST_F(homa_peer, homa_peer_find__conflicting_creates) { @@ -253,7 +263,9 @@ TEST_F(homa_peer, homa_peer_find__kmalloc_error) peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); 
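+	/* The simulated allocation failure must surface as ERR_PTR(-ENOMEM),
+	 * not as a NULL peer. */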
EXPECT_EQ(ENOMEM, -PTR_ERR(peer)); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->peer_kmalloc_errors); +#endif /* See strip.py */ } TEST_F(homa_peer, homa_peer_find__route_error) { @@ -263,7 +275,9 @@ TEST_F(homa_peer, homa_peer_find__route_error) peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(peer)); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->peer_route_errors); +#endif /* See strip.py */ } TEST_F(homa_peer, homa_dst_refresh__basics) @@ -308,7 +322,9 @@ TEST_F(homa_peer, homa_dst_refresh__routing_error) mock_route_errors = 1; homa_dst_refresh(self->homa.peers, peer, &self->hsk); EXPECT_EQ(old_dst, peer->dst); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->peer_route_errors); +#endif /* See strip.py */ EXPECT_EQ(0, dead_count(self->homa.peers)); } TEST_F(homa_peer, homa_dst_refresh__free_old_dsts) @@ -328,6 +344,7 @@ TEST_F(homa_peer, homa_dst_refresh__free_old_dsts) EXPECT_EQ(1, dead_count(self->homa.peers)); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_peer, homa_unsched_priority) { struct homa_peer peer; @@ -338,6 +355,7 @@ TEST_F(homa_peer, homa_unsched_priority) EXPECT_EQ(4, homa_unsched_priority(&self->homa, &peer, 200)); EXPECT_EQ(3, homa_unsched_priority(&self->homa, &peer, 201)); } +#endif /* See strip.py */ TEST_F(homa_peer, homa_peer_get_dst_ipv4) { @@ -383,6 +401,7 @@ TEST_F(homa_peer, homa_peer_get_dst_ipv6) homa_print_ipv6_addr(&peer->flow.u.ip6.daddr)); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_peer, homa_peer_lock_slow) { struct homa_peer *peer = homa_peer_find(&self->peertab, ip3333, @@ -402,6 +421,7 @@ TEST_F(homa_peer, homa_peer_lock_slow) EXPECT_EQ(1000, homa_metrics_per_cpu()->peer_ack_lock_miss_ns); homa_peer_unlock(peer); } +#endif /* See strip.py */ TEST_F(homa_peer, homa_peer_add_ack) { diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index f596d7ea..dcd01a12 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -67,14 +67,17 @@ FIXTURE_SETUP(homa_plumbing) ipv6_to_ipv4(self->server_addr.in6.sin6_addr); } homa_sock_bind(self->homa.port_map, &self->hsk, self->server_port); + memset(&self->data, 0, sizeof(self->data)); self->data = (struct homa_data_hdr){.common = { - .sport = htons(self->client_port), - .dport = htons(self->server_port), - .type = DATA, - .sender_id = cpu_to_be64(self->client_id)}, - .message_length = htonl(10000), - .incoming = htonl(10000), .retransmit = 0, - .seg = {.offset = 0}}; + .sport = htons(self->client_port), + .dport = htons(self->server_port), + .type = DATA, + .sender_id = cpu_to_be64(self->client_id)}, + .message_length = htonl(10000), +#ifndef __STRIP__ /* See strip.py */ + .incoming = htonl(10000), +#endif /* See strip.py */ + }; self->recvmsg_args.id = 0; self->recvmsg_hdr.msg_name = &self->addr; self->recvmsg_hdr.msg_namelen = 0; @@ -206,6 +209,7 @@ TEST_F(homa_plumbing, homa_bind__ipv4_ok) EXPECT_EQ(345, self->hsk.port); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_plumbing, homa_ioc_abort__basics) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -248,6 +252,7 @@ TEST_F(homa_plumbing, homa_ioc_abort__nonexistent_rpc) EXPECT_EQ(EINVAL, -homa_ioc_abort(&self->hsk.inet.sk, (int *) &args)); } +#endif /* See strip.py */ TEST_F(homa_plumbing, homa_socket__success) { @@ -312,7 +317,9 @@ TEST_F(homa_plumbing, homa_setsockopt__success) sizeof(struct homa_rcvbuf_args))); EXPECT_EQ(args.start, (uintptr_t)self->hsk.buffer_pool->region); 
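+	/* The pool's start address and size must reflect the homa_rcvbuf_args
+	 * passed to setsockopt above. */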
EXPECT_EQ(64, self->hsk.buffer_pool->num_bpages); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->so_set_buf_calls); +#endif /* See strip.py */ } @@ -835,7 +842,9 @@ TEST_F(homa_plumbing, homa_softirq__packet_too_short) skb->len -= 1; homa_softirq(skb); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->short_packets); +#endif /* See strip.py */ } TEST_F(homa_plumbing, homa_softirq__bogus_packet_type) { @@ -845,7 +854,9 @@ TEST_F(homa_plumbing, homa_softirq__bogus_packet_type) skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); homa_softirq(skb); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->short_packets); +#endif /* See strip.py */ } TEST_F(homa_plumbing, homa_softirq__process_short_messages_first) { diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index 08ac886d..ce8e4442 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -19,8 +19,10 @@ FIXTURE(homa_pool) { FIXTURE_SETUP(homa_pool) { homa_init(&self->homa); +#ifndef __STRIP__ /* See strip.py */ self->homa.unsched_bytes = 10000; self->homa.window_param = 10000; +#endif /* See strip.py */ mock_sock_init(&self->hsk, &self->homa, 0); self->client_ip = unit_get_in_addr("196.168.0.1"); self->server_ip = unit_get_in_addr("1.2.3.4"); @@ -342,7 +344,9 @@ TEST_F(homa_pool, homa_pool_allocate__page_wrap_around) EXPECT_EQ(2*HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[0]); EXPECT_EQ(2000, pool->cores[smp_processor_id()].allocated); EXPECT_EQ(smp_processor_id(), pool->descriptors[2].owner); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->bpage_reuses); +#endif /* See strip.py */ } TEST_F(homa_pool, homa_pool_allocate__owned_page_overflow) { @@ -432,7 +436,9 @@ TEST_F(homa_pool, homa_pool_allocate__out_of_space) rpc = list_next_entry(rpc, buf_links); EXPECT_EQ(100, rpc->id); EXPECT_TRUE(list_is_last(&rpc->buf_links, &self->hsk.waiting_for_bufs)); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(3, homa_metrics_per_cpu()->buffer_alloc_failures); +#endif /* See strip.py */ EXPECT_EQ(1, pool->bpages_needed); } @@ -611,6 +617,7 @@ TEST_F(homa_pool, homa_pool_check_waiting__reset_bpages_needed) EXPECT_EQ(0, crpc2->msgin.num_bpages); EXPECT_EQ(2, pool->bpages_needed); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_pool, homa_pool_check_waiting__wake_up_waiting_rpc) { struct homa_pool *pool = self->hsk.buffer_pool; @@ -631,6 +638,7 @@ TEST_F(homa_pool, homa_pool_check_waiting__wake_up_waiting_rpc) EXPECT_EQ(2, crpc->msgin.num_bpages); EXPECT_STREQ("xmit GRANT 10000@0 resend_all", unit_log_get()); } +#endif /* See strip.py */ TEST_F(homa_pool, homa_pool_check_waiting__reallocation_fails) { struct homa_pool *pool = self->hsk.buffer_pool; diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index db9cc198..fc2187f6 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -50,19 +50,22 @@ FIXTURE_SETUP(homa_rpc) self->server_addr.in6.sin6_addr = *self->server_ip; self->server_addr.in6.sin6_port = htons(self->server_port); homa_init(&self->homa); +#ifndef __STRIP__ /* See strip.py */ self->homa.unsched_bytes = 10000; self->homa.window_param = 10000; +#endif /* See strip.py */ mock_sock_init(&self->hsk, &self->homa, 0); - self->data = (struct homa_data_hdr){.common = { - .sport = htons(self->client_port), - .dport = htons(self->server_port), - .type = DATA, - .sender_id = self->client_id}, - 
.message_length = htonl(10000), - .incoming = htonl(10000), .cutoff_version = 0, - .ack = {0, 0}, - .retransmit = 0, - .seg = {.offset = 0}}; + memset(&self->data, 0, sizeof(self->data)); + self->data.common = (struct homa_common_hdr){ + .sport = htons(self->client_port), + .dport = htons(self->server_port), + .type = DATA, + .sender_id = self->client_id + }; + self->data.message_length = htonl(10000); +#ifndef __STRIP__ /* See strip.py */ + self->data.incoming = htonl(10000); +#endif /* See strip.py */ self->iovec.iov_base = (void *) 2000; self->iovec.iov_len = 10000; iov_iter_init(&self->iter, WRITE, &self->iovec, 1, self->iovec.iov_len); @@ -278,6 +281,7 @@ TEST_F(homa_rpc, homa_rpc_new_server__dont_handoff_rpc) homa_rpc_end(srpc); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_rpc, homa_bucket_lock_slow) { struct homa_rpc *crpc, *srpc; @@ -306,6 +310,7 @@ TEST_F(homa_rpc, homa_bucket_lock_slow) EXPECT_EQ(1, homa_metrics_per_cpu()->server_lock_misses); EXPECT_EQ(10, homa_metrics_per_cpu()->server_lock_miss_ns); } +#endif /* See strip.py */ TEST_F(homa_rpc, homa_rpc_acked__basics) { @@ -386,12 +391,16 @@ TEST_F(homa_rpc, homa_rpc_end__basics) UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->server_port, self->client_id, 1000, 20000); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, self->homa.num_grantable_rpcs); +#endif /* See strip.py */ ASSERT_NE(NULL, crpc); unit_log_clear(); mock_log_rcu_sched = 1; homa_rpc_end(crpc); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(0, self->homa.num_grantable_rpcs); +#endif /* See strip.py */ EXPECT_EQ(NULL, homa_find_client_rpc(&self->hsk, crpc->id)); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); EXPECT_EQ(1, unit_list_length(&self->hsk.dead_rpcs)); @@ -445,7 +454,11 @@ TEST_F(homa_rpc, homa_rpc_end__free_gaps) UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); +#ifndef __STRIP__ /* See strip.py */ homa_message_in_init(crpc, 10000, 0); +#else /* See strip.py */ + homa_message_in_init(crpc, 10000); +#endif /* See strip.py */ unit_log_clear(); self->data.seg.offset = htonl(1400); homa_add_packet(crpc, mock_skb_new(self->client_ip, diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c index 38c0c084..db096385 100644 --- a/test/unit_homa_sock.c +++ b/test/unit_homa_sock.c @@ -160,6 +160,7 @@ TEST_F(homa_sock, homa_sock_init__kzalloc_failure) EXPECT_EQ(ENOMEM, -homa_sock_init(&sock, &self->homa)); homa_sock_destroy(&sock); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_sock, homa_sock_init__hijack_tcp) { struct homa_sock hijack, no_hijack; @@ -173,6 +174,7 @@ TEST_F(homa_sock, homa_sock_init__hijack_tcp) homa_sock_destroy(&hijack); homa_sock_destroy(&no_hijack); } +#endif /* See strip.py */ TEST_F(homa_sock, homa_sock_unlink__remove_from_map) { @@ -350,6 +352,7 @@ TEST_F(homa_sock, homa_sock_find__long_hash_chain) homa_sock_destroy(&hsk4); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_sock, homa_sock_lock_slow) { mock_ns_tick = 100; @@ -365,3 +368,4 @@ TEST_F(homa_sock, homa_sock_lock_slow) EXPECT_EQ(100, homa_metrics_per_cpu()->socket_lock_miss_ns); homa_sock_unlock(&self->hsk); } +#endif /* See strip.py */ diff --git a/test/unit_homa_timer.c b/test/unit_homa_timer.c index e8f8577b..c8c3409c 100644 --- a/test/unit_homa_timer.c +++ b/test/unit_homa_timer.c @@ -35,8 +35,10 @@ FIXTURE_SETUP(homa_timer) self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; self->homa.resend_ticks = 2; self->homa.timer_ticks = 100; +#ifndef __STRIP__ /* See strip.py */ self->homa.unsched_bytes = 10000; 
self->homa.window_param = 10000; +#endif /* See strip.py */ mock_sock_init(&self->hsk, &self->homa, 0); unit_log_clear(); } @@ -80,6 +82,7 @@ TEST_F(homa_timer, homa_check_rpc__request_ack) EXPECT_EQ(100, srpc->done_timer_ticks); EXPECT_STREQ("xmit NEED_ACK", unit_log_get()); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_timer, homa_check_rpc__all_granted_bytes_received) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -94,6 +97,7 @@ TEST_F(homa_timer, homa_check_rpc__all_granted_bytes_received) EXPECT_EQ(0, crpc->silent_ticks); EXPECT_STREQ("", unit_log_get()); } +#endif /* See strip.py */ TEST_F(homa_timer, homa_check_rpc__no_buffer_space) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -144,11 +148,15 @@ TEST_F(homa_timer, homa_check_rpc__timeout) unit_log_clear(); crpc->silent_ticks = self->homa.timeout_ticks-1; homa_check_rpc(crpc); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(0, homa_metrics_per_cpu()->rpc_timeouts); +#endif /* See strip.py */ EXPECT_EQ(0, crpc->error); crpc->silent_ticks = self->homa.timeout_ticks; homa_check_rpc(crpc); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->rpc_timeouts); +#endif /* See strip.py */ EXPECT_EQ(ETIMEDOUT, -crpc->error); } TEST_F(homa_timer, homa_check_rpc__issue_resend) @@ -158,10 +166,12 @@ TEST_F(homa_timer, homa_check_rpc__issue_resend) self->server_port, self->client_id, 200, 10000); ASSERT_NE(NULL, crpc); - crpc->msgin.granted = 5000; self->homa.resend_ticks = 3; self->homa.resend_interval = 2; +#ifndef __STRIP__ /* See strip.py */ + crpc->msgin.granted = 5000; crpc->msgout.granted = 0; +#endif /* See strip.py */ /* First call: resend_ticks-1. */ crpc->silent_ticks = 2; @@ -173,7 +183,11 @@ TEST_F(homa_timer, homa_check_rpc__issue_resend) crpc->silent_ticks = 3; unit_log_clear(); homa_check_rpc(crpc); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("xmit RESEND 1400-4999@7", unit_log_get()); +#else /* See strip.py */ + EXPECT_STREQ("xmit RESEND 1400-9999", unit_log_get()); +#endif /* See strip.py */ /* Third call: not yet time for next resend. 
*/ crpc->silent_ticks = 4; @@ -185,7 +199,11 @@ TEST_F(homa_timer, homa_check_rpc__issue_resend) crpc->silent_ticks = 5; unit_log_clear(); homa_check_rpc(crpc); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("xmit RESEND 1400-4999@7", unit_log_get()); +#else /* See strip.py */ + EXPECT_STREQ("xmit RESEND 1400-9999", unit_log_get()); +#endif /* See strip.py */ } TEST_F(homa_timer, homa_check_rpc__request_first_bytes_of_message) { @@ -194,7 +212,9 @@ TEST_F(homa_timer, homa_check_rpc__request_first_bytes_of_message) self->server_port, self->client_id, 5000, 10000); ASSERT_NE(NULL, crpc); +#ifndef __STRIP__ /* See strip.py */ crpc->msgout.granted = 5000; +#endif /* See strip.py */ crpc->msgout.next_xmit_offset = 5000; self->homa.resend_ticks = 3; @@ -208,7 +228,11 @@ TEST_F(homa_timer, homa_check_rpc__request_first_bytes_of_message) crpc->silent_ticks = 3; unit_log_clear(); homa_check_rpc(crpc); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("xmit RESEND 0-99@7", unit_log_get()); +#else /* See strip.py */ + EXPECT_STREQ("xmit RESEND 0-99", unit_log_get()); +#endif /* See strip.py */ } TEST_F(homa_timer, homa_check_rpc__call_homa_gap_retry) { @@ -218,8 +242,10 @@ TEST_F(homa_timer, homa_check_rpc__call_homa_gap_retry) ASSERT_NE(NULL, crpc); crpc->silent_ticks = 3; +#ifndef __STRIP__ /* See strip.py */ crpc->msgin.granted = 10000; crpc->msgin.recv_end = 10000; +#endif /* See strip.py */ crpc->msgin.bytes_remaining = 15000; homa_gap_new(&crpc->msgin.gaps, 7000, 8000); self->homa.resend_ticks = 3; @@ -227,7 +253,12 @@ TEST_F(homa_timer, homa_check_rpc__call_homa_gap_retry) unit_log_clear(); homa_check_rpc(crpc); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("xmit RESEND 7000-7999@7", unit_log_get()); +#else /* See strip.py */ + EXPECT_STREQ("xmit RESEND 7000-7999; xmit RESEND 1400-19999", + unit_log_get()); +#endif /* See strip.py */ } TEST_F(homa_timer, homa_timer__basics) @@ -250,7 +281,11 @@ TEST_F(homa_timer, homa_timer__basics) unit_log_clear(); homa_timer(&self->homa); EXPECT_EQ(3, crpc->silent_ticks); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("xmit RESEND 1400-4999@7", unit_log_get()); +#else /* See strip.py */ + EXPECT_STREQ("xmit RESEND 1400-4999", unit_log_get()); +#endif /* See strip.py */ /* Don't send another RESEND (resend_interval not reached). */ unit_log_clear(); @@ -264,7 +299,9 @@ TEST_F(homa_timer, homa_timer__basics) crpc->peer->outstanding_resends = self->homa.timeout_resends; #endif /* See strip.py */ homa_timer(&self->homa); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->rpc_timeouts); +#endif /* See strip.py */ EXPECT_EQ(ETIMEDOUT, -crpc->error); } TEST_F(homa_timer, homa_timer__reap_dead_rpcs) @@ -275,17 +312,29 @@ TEST_F(homa_timer, homa_timer__reap_dead_rpcs) ASSERT_NE(NULL, dead); homa_rpc_end(dead); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(31, self->hsk.dead_skbs); +#else /* See strip.py */ + EXPECT_EQ(30, self->hsk.dead_skbs); +#endif /* See strip.py */ // First call to homa_timer: not enough dead skbs. self->homa.dead_buffs_limit = 32; homa_timer(&self->homa); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(31, self->hsk.dead_skbs); +#else /* See strip.py */ + EXPECT_EQ(30, self->hsk.dead_skbs); +#endif /* See strip.py */ // Second call to homa_timer: must reap. 
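+	// (dead_buffs_limit now falls below the number of dead skbs, so this
+	// call must reap.)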
self->homa.dead_buffs_limit = 15; homa_timer(&self->homa); +#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(11, self->hsk.dead_skbs); +#else /* See strip.py */ + EXPECT_EQ(10, self->hsk.dead_skbs); +#endif /* See strip.py */ } TEST_F(homa_timer, homa_timer__rpc_in_service) { diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index 10c5f1b4..92ad1a2a 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -24,6 +24,7 @@ FIXTURE_TEARDOWN(homa_utils) unit_teardown(); } +#ifndef __STRIP__ /* See strip.py */ /** * set_cutoffs() - A convenience method to allow all of the values in * homa->unsched_cutoffs to be set concisely. @@ -49,6 +50,7 @@ static void set_cutoffs(struct homa *homa, int c0, int c1, int c2, homa->unsched_cutoffs[6] = c6; homa->unsched_cutoffs[7] = c7; } +#endif /* See strip.py */ TEST_F(homa_utils, homa_init__kmalloc_failure_for_port_map) { @@ -71,6 +73,7 @@ TEST_F(homa_utils, homa_init__kmalloc_failure_for_peers) EXPECT_EQ(NULL, homa2.peers); homa_destroy(&homa2); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_utils, homa_init__homa_skb_init_failure) { struct homa homa2; @@ -82,6 +85,7 @@ TEST_F(homa_utils, homa_init__homa_skb_init_failure) mock_printk_output); homa_destroy(&homa2); } +#endif /* See strip.py */ TEST_F(homa_utils, homa_init__cant_create_pacer_thread) { struct homa homa2; @@ -93,6 +97,7 @@ TEST_F(homa_utils, homa_init__cant_create_pacer_thread) homa_destroy(&homa2); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_utils, homa_print_ipv4_addr) { struct in6_addr test_addr1 = unit_get_in_addr("192.168.0.1"); @@ -134,7 +139,9 @@ TEST_F(homa_utils, homa_snprintf) EXPECT_STREQ("Test message with values: 100 and 1000; plus: 123", buffer); } +#endif /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_utils, homa_prios_changed__basics) { set_cutoffs(&self->homa, 90, 80, HOMA_MAX_MESSAGE_LENGTH*2, 60, 50, @@ -166,3 +173,4 @@ TEST_F(homa_utils, homa_prios_changed__share_lowest_priority) EXPECT_EQ(0x7fffffff, self->homa.unsched_cutoffs[0]); EXPECT_EQ(0, self->homa.max_sched_prio); } +#endif /* See strip.py */ diff --git a/test/utils.c b/test/utils.c index ffc44361..7faef956 100644 --- a/test/utils.c +++ b/test/utils.c @@ -58,20 +58,18 @@ struct homa_rpc *unit_client_rpc(struct homa_sock *hsk, return crpc; crpc->msgout.next_xmit_offset = crpc->msgout.length; - struct homa_data_hdr h = { - .common = { - .sport = htons(server_port), - .dport = htons(hsk->port), - .type = DATA, - .sender_id = cpu_to_be64(id ^ 1) - }, - .message_length = htonl(resp_length), - .incoming = htonl(10000), - .ack = {0, 0}, - .cutoff_version = 0, - .retransmit = 0, - .seg = {.offset = 0} + struct homa_data_hdr h; + memset(&h, 0, sizeof(h)); + h.common = (struct homa_common_hdr){ + .sport = htons(server_port), + .dport = htons(hsk->port), + .type = DATA, + .sender_id = cpu_to_be64(id ^ 1) }; + h.message_length = htonl(resp_length); +#ifndef __STRIP__ /* See strip.py */ + h.incoming = htonl(10000); +#endif /* See strip.py */ this_size = (resp_length > UNIT_TEST_DATA_PER_PACKET) ? UNIT_TEST_DATA_PER_PACKET : resp_length; @@ -197,6 +195,7 @@ void unit_log_frag_list(struct sk_buff *skb, int verbose) } } +#ifndef __STRIP__ /* See strip.py */ /** * unit_log_grantables() - Append to the test log information about all of * the messages that are currently grantable. 
@@ -219,6 +218,7 @@ void unit_log_grantables(struct homa *homa) } } } +#endif /* See strip.py */ /** * unit_log_message_out_packets() - Append to the test log a human-readable @@ -352,20 +352,18 @@ struct homa_rpc *unit_server_rpc(struct homa_sock *hsk, int req_length, int resp_length) { int bytes_received, created; - struct homa_data_hdr h = { - .common = { - .sport = htons(client_port), - .dport = htons(hsk->port), - .type = DATA, - .sender_id = cpu_to_be64(id ^ 1) - }, - .message_length = htonl(req_length), - .incoming = htonl(10000), - .ack = {0, 0}, - .cutoff_version = 0, - .retransmit = 0, - .seg = {.offset = 0} + struct homa_data_hdr h; + memset(&h, 0, sizeof(h)); + h.common = (struct homa_common_hdr){ + .sport = htons(client_port), + .dport = htons(hsk->port), + .type = DATA, + .sender_id = cpu_to_be64(id ^ 1) }; + h.message_length = htonl(req_length); +#ifndef __STRIP__ /* See strip.py */ + h.incoming = htonl(10000); +#endif /* See strip.py */ struct homa_rpc *srpc = homa_rpc_new_server(hsk, client_ip, &h, &created); diff --git a/test/utils.h b/test/utils.h index 9f876e5e..ac9b0d42 100644 --- a/test/utils.h +++ b/test/utils.h @@ -45,7 +45,9 @@ extern int unit_list_length(struct list_head *head); extern void unit_log_active_ids(struct homa_sock *hsk); extern void unit_log_filled_skbs(struct sk_buff *skb, int verbose); extern void unit_log_frag_list(struct sk_buff *skb, int verbose); +#ifndef __STRIP__ /* See strip.py */ extern void unit_log_grantables(struct homa *homa); +#endif /* See strip.py */ extern void unit_log_hashed_rpcs(struct homa_sock *hsk); extern void unit_log_message_out_packets( struct homa_message_out *message, int verbose); diff --git a/timetrace.c b/timetrace.c index 79eb1d0a..4a0a6f65 100644 --- a/timetrace.c +++ b/timetrace.c @@ -108,6 +108,8 @@ int tt_init(char *proc_file, int *temp) { int i; + pr_notice("tt_init invoked, init %d, proc_file %s\n", init, proc_file); + if (init) return 0; @@ -835,6 +837,7 @@ void tt_dbg3(char *msg, ...) */ void tt_inc_metric(int metric, u64 count) { +#ifndef __STRIP__ /* See strip.py */ /* Maps from the metric argument to an offset within homa_metrics. * This level of indirection is needed so that the kernel doesn't * have to be recompiled every time a new metric gets added (which @@ -849,4 +852,5 @@ void tt_inc_metric(int metric, u64 count) u64 *metric_addr = (u64 *)(((char *)homa_metrics_per_cpu()) + offsets[metric]); *metric_addr += count; +#endif /* See strip.py */ } diff --git a/util/cp_node.cc b/util/cp_node.cc index f90874fd..404ac2eb 100644 --- a/util/cp_node.cc +++ b/util/cp_node.cc @@ -444,6 +444,7 @@ void log_affinity() */ void kfreeze() { +#ifndef __STRIP__ /* See strip.py */ kfreeze_count++; if (kfreeze_count > 1) return; @@ -457,6 +458,7 @@ void kfreeze() log(NORMAL, "ERROR: HOMAIOCFREEZE ioctl failed: %s\n", strerror(errno)); close(fd); +#endif /* See strip.py */ } /** diff --git a/util/cperf.py b/util/cperf.py index f2f674a2..6077c4b9 100644 --- a/util/cperf.py +++ b/util/cperf.py @@ -65,6 +65,9 @@ # The CloudLab node type for this node (e.g. xl170) node_type = None +# Value of the "--stripped" option. +stripped = False + # Defaults for command-line options; assumes that servers and clients # share nodes. 
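+# (Individual benchmark scripts can override these values via the 'defaults'
+# argument to get_parser.)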
default_defaults = { @@ -276,6 +279,10 @@ def get_parser(description, usage, defaults = {}): metavar='nodes', help='List of node numbers not to use in the experiment; can ' ' contain ranges, such as "3,5-8,12"') + parser.add_argument('--stripped', dest='stripped', type=boolean, + default=True, metavar="T/F", help='Boolean value: true means ' + 'Homa has been stripped for upstreaming, which means some ' + 'facilities are not available (default: false)') parser.add_argument('--tcp-client-max', dest='tcp_client_max', type=int, metavar='count', default=0, help="Maximum number of TCP requests " "that can be outstanding from a client node at once (divided evenly " @@ -569,6 +576,11 @@ def set_sysctl_parameter(name, value, nodes): nodes: specifies ids of the nodes on which to execute the command: should be a range, list, or other object that supports "in" """ + global stripped + if stripped: + vlog("Skipping set of Homa %s parameter to %s on nodes %s (Homa is stripped)" + % (name, value, str(nodes))) + return vlog("Setting Homa parameter %s to %s on nodes %s" % (name, value, str(nodes))) for id in nodes: @@ -615,7 +627,8 @@ def start_servers(exp, ids, options): port_threads protocol """ - global server_nodes + global server_nodes, stripped + stripped = options.stripped log("Starting servers for %s experiment on nodes %s" % (exp, ids)) if len(server_nodes) > 0: do_cmd("stop servers", server_nodes) @@ -654,7 +667,7 @@ def run_experiment(name, clients, options): workload """ - global active_nodes + global active_nodes, stripped exp_nodes = list(set(options.servers + list(clients))) start_nodes(clients, options) nodes = [] @@ -712,9 +725,12 @@ def run_experiment(name, clients, options): if options.protocol == "homa": # Wait a bit so that homa_prio can set priorities appropriately time.sleep(2) - vlog("Recording initial metrics") - for id in exp_nodes: - do_subprocess(["ssh", "node%d" % (id), "metrics.py"]) + if stripped: + vlog("Skipping initial read of metrics (Homa is stripped)") + else: + vlog("Recording initial metrics") + for id in exp_nodes: + do_subprocess(["ssh", "node%d" % (id), "metrics.py"]) if not "no_rtt_files" in options: do_cmd("dump_times /dev/null %s" % (name), clients) do_cmd("log Starting measurements for %s experiment" % (name), @@ -737,15 +753,22 @@ def run_experiment(name, clients, options): if not "no_rtt_files" in options: do_cmd("dump_times rtts %s" % (name), clients) if (options.protocol == "homa") and not "unloaded" in options: - vlog("Recording final metrics from nodes %s" % (exp_nodes)) - for id in exp_nodes: - f = open("%s/%s-%d.metrics" % (options.log_dir, name, id), 'w') - subprocess.run(["ssh", "node%d" % (id), "metrics.py"], stdout=f) - f.close() - shutil.copyfile("%s/%s-%d.metrics" % (options.log_dir, name, options.servers[0]), - "%s/reports/%s-%d.metrics" % (options.log_dir, name, options.servers[0])) - shutil.copyfile("%s/%s-%d.metrics" % (options.log_dir, name, clients[0]), - "%s/reports/%s-%d.metrics" % (options.log_dir, name, clients[0])) + if stripped: + vlog("Skipping final read of metrics (Homa is stripped)") + else: + vlog("Recording final metrics from nodes %s" % (exp_nodes)) + for id in exp_nodes: + f = open("%s/%s-%d.metrics" % (options.log_dir, name, id), 'w') + subprocess.run(["ssh", "node%d" % (id), "metrics.py"], stdout=f) + f.close() + shutil.copyfile("%s/%s-%d.metrics" % + (options.log_dir, name, options.servers[0]), + "%s/reports/%s-%d.metrics" % + (options.log_dir, name, options.servers[0])) + shutil.copyfile("%s/%s-%d.metrics" % + 
(options.log_dir, name, clients[0]), + "%s/reports/%s-%d.metrics" % + (options.log_dir, name, clients[0])) do_cmd("stop senders", clients) if False and "dctcp" in name: do_cmd("tt print cp.tt", clients) @@ -792,7 +815,7 @@ def run_experiments(*args): There may be additional optional values that used if present. """ - global active_nodes + global active_nodes, stripped homa_nodes = [] homa_clients = [] @@ -880,8 +903,11 @@ def run_experiments(*args): # Wait a bit so that homa_prio can set priorities appropriately time.sleep(2) if homa_nodes: - vlog("Initializing metrics") - do_ssh(["metrics.py > /dev/null"], homa_nodes) + if stripped: + vlog("Skipping metrics initialization (Homa is stripped)") + else: + vlog("Initializing metrics") + do_ssh(["metrics.py > /dev/null"], homa_nodes) do_cmd("dump_times /dev/null", all_nodes) do_cmd("log Starting measurements", all_nodes) log("Starting measurements") @@ -897,15 +923,22 @@ def run_experiments(*args): for exp in args: do_cmd("dump_times %s.rtts %s" % (exp.name, exp.name), exp.clients) if homa_nodes: - vlog("Recording final metrics from nodes %s" % (homa_nodes)) - for id in homa_nodes: - f = open("%s/node%d.metrics" % (exp.log_dir, id), 'w') - subprocess.run(["ssh", "node%d" % (id), "metrics.py"], stdout=f) - f.close() - shutil.copyfile("%s/node%d.metrics" % (exp.log_dir, homa_clients[0]), - "%s/reports/node%d.metrics" % (exp.log_dir, homa_clients[0])) - shutil.copyfile("%s/node%d.metrics" % (exp.log_dir, homa_servers[0]), - "%s/reports/node%d.metrics" % (exp.log_dir, homa_servers[0])) + if stripped: + vlog("Skipping final read of metrics (Homa is stripped)") + else: + vlog("Recording final metrics from nodes %s" % (homa_nodes)) + for id in homa_nodes: + f = open("%s/node%d.metrics" % (exp.log_dir, id), 'w') + subprocess.run(["ssh", "node%d" % (id), "metrics.py"], stdout=f) + f.close() + shutil.copyfile("%s/node%d.metrics" % + (exp.log_dir, homa_clients[0]), + "%s/reports/node%d.metrics" % + (exp.log_dir, homa_clients[0])) + shutil.copyfile("%s/node%d.metrics" % + (exp.log_dir, homa_servers[0]), + "%s/reports/node%d.metrics" % + (exp.log_dir, homa_servers[0])) do_cmd("stop senders", all_nodes) do_cmd("stop clients", all_nodes) for exp in args: diff --git a/util/strip.py b/util/strip.py index a9f721a9..2af83593 100755 --- a/util/strip.py +++ b/util/strip.py @@ -7,43 +7,58 @@ a Linux kernel repo, removing information that doesn't belong in the official kernel version (such as calls to tt_record). -Usage: strip.py [--alt] file file file ... destdir +Usage: strip.py file file file ... destdir Each of the files will be read, stripped as appropriate, and copied to a file by the same name in destdir. If there is only a single file and no destdir, then the stripped file is printed on standard output. -In some cases, such as calls to tt_record* and code related to unit tests, -information is removed automatically. In other cases, it is controlled with -#if statments in the following ways: - -* This entire block will be removed in the stripped version: +The following code is removed automatically: + * Calls to timetracing, such as tt_record* + * Blocks conditionalized on '#ifdef __UNIT_TEST__' + * UNIT_LOG and UNIT_HOOK statements + * INC_METRIC statements + +Additional stripping is controlled by #ifdefs. The #ifdefs allow the +code to be used in three ways: +* Normal compilation in a development environment: includes unit testing + and timetracing support, nothing is stripped. The code is compiled as + is. 
+* Upstreaming: source files are run through this program, which produces + a statically-stripped version. +* Compile-time stripping: the code is compiled as is, but "__STRIP__=y" is + set on the make command line (both for compiling Homa and for unit testing). + This omits almost all of the information that must be omitted for + upstreaming, but retains a few debugging facilities like timetracing. + +Here are details about the #ifdefs used for stripping: + +* This entire block will be removed in the stripped version, but it will + be compiled in normal mode: #ifndef __STRIP__ /* See strip.py */ ... #endif /* See strip.py */ +* This entire block will be removed in the stripped version, but it will + be compiled in both normal mode and with compile-time stripping. + #ifndef __UPSTREAM__ /* See strip.py */ + ... + #endif /* See strip.py */ + * The #if and #endif statements will be removed, leaving just the code - in between: + in between. The code will be compiled in compile-time stripping mode #ifdef __STRIP__ /* See strip.py */ ... #endif /* See strip.py */ -* Everything will be removed except the code between #else and #endif: +* Everything will be removed except the code between #else and #endif. + During normal mode the #ifndef block will be compiled; under compile-time + stripping the #else block will be compiled. #ifndef __STRIP__ /* See strip.py */ ... #else /* See strip.py */ ... #endif /* See strip.py */ - -* It is also possible to strip using "alt" mode, with lines like this: - #ifndef __STRIP__ /* See strip.py --alt */ - #ifdef __STRIP__ /* See strip.py --alt */ - If the --alt option was not specified then these lines are handled as - if "--alt" wasn't present in the comments. However, if the --alt option - was specified then these lines are ignored. - -If the --alt option is specified, it means the output is intended for -testing outside the Linux kernel. In this case, the lines should remain. """ from collections import defaultdict @@ -111,12 +126,11 @@ def last_non_blank(s): return s2[-1] return None -def scan(file, alt_mode): +def scan(file): """ Read a file, remove information that shouldn't appear in the Linux kernel version, and return an array of lines representing the stripped file. file: Pathname of file to read - alt_mode: True means the --alt option was specified """ global exit_code @@ -206,9 +220,9 @@ def scan(file, alt_mode): non_comment = pline non_comment = non_comment.strip() - # Strip groups of lines labeled with special '#ifndef __STRIP__' - # Note: don't do brace elimination here: this allows greater control - # to the __STRIP__ code. + # Strip groups of lines labeled with '#ifndef __STRIP__' or + # '#ifndef __UPSTREAM__'. Note: don't do brace elimination here: + # this gives greater control to the __STRIP__ code. 
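+        # For example, given this input:
+        #     #ifndef __STRIP__ /* See strip.py */
+        #     tt_record("got packet");
+        #     #else /* See strip.py */
+        #     counter++;
+        #     #endif /* See strip.py */
+        # the stripped output retains only the counter++; line.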
if in_labeled_skip != None: if line.startswith('#endif /* See strip.py */'): in_labeled_skip = None @@ -220,16 +234,13 @@ def scan(file, alt_mode): if in_labeled_skip == 1: continue if line.startswith('#ifndef __STRIP__ /* See strip.py */') or ( - line.startswith('#ifndef __STRIP__ /* See strip.py --alt */') - and not alt_mode): + line.startswith('#ifndef __UPSTREAM__ /* See strip.py */')): if slines[-1].strip() == '': delete_empty_line = True in_labeled_skip = 1 check_braces = False continue - if line.startswith('#ifdef __STRIP__ /* See strip.py */') or ( - line.startswith('#ifdef __STRIP__ /* See strip.py --alt */') - and not alt_mode): + if line.startswith('#ifdef __STRIP__ /* See strip.py */') : if slines[-1].strip() == '': slines.pop() in_labeled_skip = 0 @@ -243,13 +254,16 @@ def scan(file, alt_mode): delete_empty_line = True continue - # Strip tt_record statements. if skip_statement: if pline[-1] == ';': skip_statement = False check_braces = True continue + + # Strip tt_record and INC_METRIC statements. match = re.match('(//[ \t]*)?tt_record[1-4]?[(]', pline) + if not match: + match = re.match('(//[ \t]*)?INC_METRIC[(]', pline) if match: # If this is the only statement in its block, delete the # outer block statement (if, while, etc.). Don't delete case @@ -277,8 +291,7 @@ def scan(file, alt_mode): continue # Strip UNIT_LOG and UNIT_HOOK statements. - if not alt_mode and (pline.startswith('UNIT_LOG(') or - pline.startswith('UNIT_HOOK(')): + if (pline.startswith('UNIT_LOG(') or pline.startswith('UNIT_HOOK(')): if pline[-1] != ';': skip_statement = True if slines[-1].strip() == '': @@ -286,6 +299,12 @@ def scan(file, alt_mode): check_braces = True continue + # Strip #include "homa_strip.h" statements. + if pline.startswith('#include "homa_strip.h"'): + if slines[-1].strip() == '': + delete_empty_line = True + continue + # Strip '#ifdef __UNIT_TEST__' blocks (keep #else clauses) if in_unit: if line.startswith('#endif /* __UNIT_TEST__ */'): @@ -296,12 +315,12 @@ def scan(file, alt_mode): continue if in_unit == 'if': continue - elif line.startswith('#ifdef __UNIT_TEST__') and not alt_mode: + elif line.startswith('#ifdef __UNIT_TEST__'): in_unit = 'if' if slines[-1].strip() == '': delete_empty_line = True continue - elif line.startswith('#ifndef __UNIT_TEST__') and not alt_mode: + elif line.startswith('#ifndef __UNIT_TEST__'): in_unit = 'else' if slines[-1].strip() == '': delete_empty_line = True @@ -317,7 +336,7 @@ def scan(file, alt_mode): continue if in_version == 'if': continue - elif line.startswith('#if LINUX_VERSION_CODE') and not alt_mode: + elif line.startswith('#if LINUX_VERSION_CODE'): in_version = 'if' if slines[-1].strip() == '': delete_empty_line = True @@ -361,21 +380,17 @@ def scan(file, alt_mode): if __name__ == '__main__': f = sys.stdin - alt_mode = False; - if (len(sys.argv) >= 2) and (sys.argv[1] == '--alt'): - alt_mode = True; - del sys.argv[1] if len(sys.argv) < 2: print('Usage: strip.py [--alt] file [file ... 
destdir]', file=sys.stderr) exit(1) if len(sys.argv) == 2: - for line in scan(sys.argv[1], alt_mode): + for line in scan(sys.argv[1]): print(line, end='') else: for file in sys.argv[1:-1]: dst_file = '%s/%s' % (sys.argv[-1], file) print('Stripping %s into %s' % (file, dst_file)) - slines = scan(file, alt_mode) + slines = scan(file) dst = open(dst_file, 'w') for line in slines: print(line, end='', file=dst) From 0a273b01a024789e15755e9fd9b6545924eb3e1c Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 5 Feb 2025 10:45:42 -0800 Subject: [PATCH 185/625] Fix minor bug in cloudlab/bin/config --- cloudlab/bin/config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudlab/bin/config b/cloudlab/bin/config index 6e23ee6c..1995eff6 100755 --- a/cloudlab/bin/config +++ b/cloudlab/bin/config @@ -156,7 +156,7 @@ def get_link_speed(): nic = get_interfaces()[0] num_channels = -1 - for line in subprocess.run(["ethtool", "eno1"], stdout=subprocess.PIPE, + for line in subprocess.run(["ethtool", nic], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding="utf-8", check=True).stdout.splitlines(): match = re.match('.*Speed: ([0-9]+)Mb/s', line) From 3f5e1bb52cdd58799d548832bc8c8287f10fb50d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 5 Feb 2025 12:06:33 -0800 Subject: [PATCH 186/625] Fix bugs in command-line arg specs for cperf.py --- util/cperf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util/cperf.py b/util/cperf.py index 6077c4b9..fa2fb8fa 100644 --- a/util/cperf.py +++ b/util/cperf.py @@ -280,7 +280,7 @@ def get_parser(description, usage, defaults = {}): help='List of node numbers not to use in the experiment; can ' ' contain ranges, such as "3,5-8,12"') parser.add_argument('--stripped', dest='stripped', type=boolean, - default=True, metavar="T/F", help='Boolean value: true means ' + default=False, metavar="T/F", help='Boolean value: true means ' 'Homa has been stripped for upstreaming, which means some ' 'facilities are not available (default: false)') parser.add_argument('--tcp-client-max', dest='tcp_client_max', type=int, @@ -309,7 +309,7 @@ def get_parser(description, usage, defaults = {}): parser.add_argument('--tt-freeze', dest='tt_freeze', type=boolean, default=True, metavar="T/F", help="Boolean value: if true, " "timetraces will be frozen on all nodes at the end of the " - "Homa benchmark run (default: false)") + "Homa benchmark run (default: true)") parser.add_argument('--unsched', type=int, dest='unsched', metavar='count', default=defaults['unsched'], help='If nonzero, homa_prio will always use this number of ' From 41cbfa2f90c5d9ee9e8fce6020386906263a2d42 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 5 Feb 2025 16:04:07 -0800 Subject: [PATCH 187/625] Reduce spin time in call to homa_spin Also beef up comment to mention experimental data. --- homa_incoming.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index 58ee3a2c..d10e5c5e 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -452,11 +452,16 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) homa_rpc_unlock(rpc); tt_record2("softirq released lock for id %d, flags 0x%x", rpc->id, flags); - /* We're going to try to reacquire the RPC - * lock almost immediately below; give the - * app thread a chance to get to it first. 
+				/* This short spin is needed to ensure that the
+				 * other thread gets the lock before this thread
+				 * grabs it again below (the need for this
+				 * was confirmed experimentally in 2/2025;
+				 * without it, the handoff fails 20-25% of the
+				 * time). Furthermore, the call to homa_spin
+				 * seems to allow the other thread to acquire
+				 * the lock more quickly.
 				 */
-				homa_spin(200);
+				homa_spin(100);
 				rpc = NULL;
 			}
 		}
@@ -508,6 +513,8 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa)
 				goto discard;
 			}
 		} else {
+			tt_record1("homa_dispatch_pkts has rpc lock for id %d",
+				   rpc->id);
 			if (h->common.type == DATA ||
 #ifndef __STRIP__ /* See strip.py */
 			    h->common.type == GRANT ||

From 16b3766066a83df23a62ee60ac7d74d8f2f64a88 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Thu, 6 Feb 2025 09:21:28 -0800
Subject: [PATCH 188/625] Add #ifdefs in mock.c to accommodate different Linux
 versions

---
 test/mock.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/test/mock.c b/test/mock.c
index 800d5581..88a23c58 100644
--- a/test/mock.c
+++ b/test/mock.c
@@ -222,7 +222,9 @@ unsigned int nr_cpu_ids = 8;
 unsigned long page_offset_base;
 unsigned long phys_base;
 unsigned long vmemmap_base;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0)
 kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES];
+#endif
 int __preempt_count;
 struct pcpu_hot pcpu_hot = {.cpu_number = 1};
 char sock_flow_table[RPS_SOCK_FLOW_TABLE_SIZE(1024)];
@@ -358,10 +360,12 @@ void __copy_overflow(int size, unsigned long count)
 	abort();
 }
 
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0)
 int debug_lockdep_rcu_enabled(void)
 {
 	return 0;
 }
+#endif
 
 void dst_release(struct dst_entry *dst)
 {
@@ -861,8 +865,10 @@ bool __list_del_entry_valid_or_report(struct list_head *entry)
 void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
 {}
 
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0)
 void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
 {}
+#endif
 
 int lock_is_held_type(const struct lockdep_map *lock, int read)
 {
@@ -1034,6 +1040,7 @@ int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
 	return 1;
 }
 
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0)
 int rcu_read_lock_held(void)
 {
 	return 0;
@@ -1043,6 +1050,7 @@ int rcu_read_lock_bh_held(void)
 {
 	return 0;
 }
+#endif
 
 bool rcuref_get_slowpath(rcuref_t *ref)
 {

From 3f113145f33d91fcd481a4580aa418430b583a1e Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Thu, 6 Feb 2025 09:22:10 -0800
Subject: [PATCH 189/625] Consolidate homa_grant_update_incoming calls in
 homa_grant_check_rpc

---
 homa_grant.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/homa_grant.c b/homa_grant.c
index fa8d5bef..65cfa365 100644
--- a/homa_grant.c
+++ b/homa_grant.c
@@ -297,8 +297,8 @@ void homa_grant_check_rpc(struct homa_rpc *rpc)
 		goto done;
 	}
 
+	homa_grant_update_incoming(rpc, homa);
 	if (rpc->msgin.granted >= rpc->msgin.length) {
-		homa_grant_update_incoming(rpc, homa);
 		homa_rpc_unlock(rpc);
 		goto done;
 	}
@@ -311,7 +311,6 @@ void homa_grant_check_rpc(struct homa_rpc *rpc)
 	 * granting.
 	 */
 	if (list_empty(&rpc->grantable_links)) {
-		homa_grant_update_incoming(rpc, homa);
 		homa_grantable_lock(homa, 0);
 		homa_grant_add_rpc(rpc);
 		recalc = (homa->num_active_rpcs < homa->max_overcommit ||
@@ -329,7 +328,6 @@ void homa_grant_check_rpc(struct homa_rpc *rpc)
 	/* Not a new message; see if we can upgrade the message's priority.
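 	 * (msgin.rank is this message's slot in homa->active_rpcs; a
 	 * negative rank means the message currently holds no active slot,
 	 * so it can be promoted only if it now has fewer bytes remaining
 	 * than the lowest-priority active message.)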
*/ rank = atomic_read(&rpc->msgin.rank); if (rank < 0) { - homa_grant_update_incoming(rpc, homa); if (rpc->msgin.bytes_remaining < atomic_read(&homa->active_remaining[homa->max_overcommit - 1])) { homa_rpc_unlock(rpc); INC_METRIC(grant_priority_bumps, 1); @@ -342,7 +340,6 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) atomic_set(&homa->active_remaining[rank], rpc->msgin.bytes_remaining); if (rank > 0 && rpc->msgin.bytes_remaining < atomic_read(&homa->active_remaining[rank - 1])) { - homa_grant_update_incoming(rpc, homa); homa_rpc_unlock(rpc); INC_METRIC(grant_priority_bumps, 1); homa_grant_recalc(homa, 0); From fa0b3076dc52cfcc40bb00768539f53cbe338900 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 6 Feb 2025 09:29:15 -0800 Subject: [PATCH 190/625] Improve documentation for homa_rpc_lock --- homa_rpc.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/homa_rpc.h b/homa_rpc.h index 752962de..37203219 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -438,11 +438,10 @@ int homa_validate_incoming(struct homa *homa, int verbose, * @rpc: RPC to lock. Note: this function is only safe under * limited conditions (in most cases homa_bucket_lock should be * used). The caller must ensure that the RPC cannot be reaped - * before the lock is acquired. It cannot do that by acquirin - * the socket lock, since that violates lock ordering constraints. - * One approach is to use homa_protect_rpcs. Don't use this function - * unless you are very sure what you are doing! See sync.txt for - * more info on locking. + * before the lock is acquired, such as by taking a reference on + * the rpc with homa_rpc_hold or calling homa_protect_rpcs. + * Don't use this function unless you are very sure what you are + * doing! See sync.txt for more info on locking. */ static inline void homa_rpc_lock(struct homa_rpc *rpc) { From e2ed94589309395b5e76df7073e493fe6dd33e4a Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 6 Feb 2025 09:53:37 -0800 Subject: [PATCH 191/625] Change homa_grant_check_rpc to return with RPC still locked Handle lock release and reacquisition in homa_grant_recalc. --- homa_grant.c | 70 +++++++++++++++++++----------------------- homa_grant.h | 3 +- homa_incoming.c | 8 ++--- homa_pool.c | 10 ++---- test/unit_homa_grant.c | 51 ++++++++++++------------------ 5 files changed, 58 insertions(+), 84 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 65cfa365..be0d46b8 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -269,12 +269,12 @@ int homa_grant_send(struct homa_rpc *rpc, struct homa *homa) * RPC has changed (such as packets arriving). It checks the state of the * RPC relative to outgoing grants and takes any appropriate actions that * are needed (such as adding the RPC to the grantable list or sending - * grants). - * @rpc: RPC to check. Must be locked by the caller. Note: THIS FUNCTION - * WILL RELEASE THE LOCK before returning. + * grants for this or other RPCs). + * @rpc: RPC to check. Must be locked by the caller. This function may + * release and then reacquire that lock, so caller must not hold + * any locks that would disallow that. 
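+ *       (The drop/reacquire happens when grants must be recomputed:
+ *       homa_grant_recalc may need to acquire other RPCs' locks, which
+ *       isn't safe while this RPC's lock is held.)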
*/ void homa_grant_check_rpc(struct homa_rpc *rpc) - __releases(rpc->bucket_lock) { /* Overall design notes: * The grantable lock has proven to be a performance bottleneck, @@ -292,16 +292,12 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) int rank, recalc; if (rpc->msgin.length < 0 || rpc->state == RPC_DEAD || - rpc->msgin.num_bpages <= 0) { - homa_rpc_unlock(rpc); + rpc->msgin.num_bpages <= 0) goto done; - } homa_grant_update_incoming(rpc, homa); - if (rpc->msgin.granted >= rpc->msgin.length) { - homa_rpc_unlock(rpc); + if (rpc->msgin.granted >= rpc->msgin.length) goto done; - } tt_record4("homa_grant_check_rpc starting for id %d, granted %d, recv_end %d, length %d", rpc->id, rpc->msgin.granted, rpc->msgin.recv_end, @@ -317,9 +313,8 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) rpc->msgin.bytes_remaining < atomic_read(&homa->active_remaining [homa->max_overcommit - 1])); - homa_rpc_unlock(rpc); if (recalc) - homa_grant_recalc(homa, 1); + homa_grant_recalc(homa, 1, rpc); else homa_grantable_unlock(homa); goto done; @@ -329,20 +324,16 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) rank = atomic_read(&rpc->msgin.rank); if (rank < 0) { if (rpc->msgin.bytes_remaining < atomic_read(&homa->active_remaining[homa->max_overcommit - 1])) { - homa_rpc_unlock(rpc); INC_METRIC(grant_priority_bumps, 1); - homa_grant_recalc(homa, 0); - } else { - homa_rpc_unlock(rpc); + homa_grant_recalc(homa, 0, rpc); } goto done; } atomic_set(&homa->active_remaining[rank], rpc->msgin.bytes_remaining); if (rank > 0 && rpc->msgin.bytes_remaining < atomic_read(&homa->active_remaining[rank - 1])) { - homa_rpc_unlock(rpc); INC_METRIC(grant_priority_bumps, 1); - homa_grant_recalc(homa, 0); + homa_grant_recalc(homa, 0, rpc); goto done; } @@ -356,14 +347,12 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) if (rpc->msgin.granted >= rpc->msgin.length) { homa_grantable_lock(homa, 0); homa_grant_remove_rpc(rpc); - homa_rpc_unlock(rpc); - homa_grant_recalc(homa, 1); + homa_grant_recalc(homa, 1, rpc); goto done; } - homa_rpc_unlock(rpc); if (recalc) - homa_grant_recalc(homa, 0); + homa_grant_recalc(homa, 0, rpc); done: tt_record1("homa_grant_check_rpc finished with id %d", rpc->id); } @@ -373,15 +362,20 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) * and what priorities to use for each. If needed, send out grant packets to * ensure that all appropriate grants have been issued. This function is * invoked whenever something happens that could change the contents or order - * of homa->active_rpcs. No RPC locks may be held when this function is invoked, - * because RPC locks will be acquired here. + * of homa->active_rpcs. * @homa: Overall information about the Homa transport. * @locked: Normally this function will acquire (and release) * homa->grantable_lock. If this value is nonzero, it means * the caller has already acquired homa->grantable_lock. In * either case the lock will be released upon return. + * @caller_rpc: An RPC for which the caller holds the lock. This function + * may release this lock in order acquire other RPC locks, + * but if so, it will reacquire the lock before returning. + * The caller must not hold any locks that prevent RPCs + * from being locked (see sync.txt). NULL means no lock held. 
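+ *
+ *              In outline, the handling of @caller_rpc below is:
+ *
+ *                  homa_rpc_hold(caller_rpc);    (can't be reaped now)
+ *                  homa_rpc_unlock(caller_rpc);
+ *                  ... compute and send grants ...
+ *                  homa_rpc_lock(caller_rpc);
+ *                  homa_rpc_put(caller_rpc);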
*/ -void homa_grant_recalc(struct homa *homa, int locked) +void homa_grant_recalc(struct homa *homa, int locked, + struct homa_rpc *caller_rpc) { /* The tricky part of this method is that we need to release * homa->grantable_lock before actually sending grants, because @@ -404,6 +398,11 @@ void homa_grant_recalc(struct homa *homa, int locked) } start = sched_clock(); + if (likely(caller_rpc)) { + homa_rpc_hold(caller_rpc); + homa_rpc_unlock(caller_rpc); + } + /* We may have to recalculate multiple times if grants sent in one * round cause messages to be completely granted, opening up * opportunities to grant to additional messages. @@ -487,6 +486,10 @@ void homa_grant_recalc(struct homa *homa, int locked) break; } } + if (likely(caller_rpc)) { + homa_rpc_lock(caller_rpc); + homa_rpc_put(caller_rpc); + } INC_METRIC(grant_recalc_ns, sched_clock() - start); } @@ -612,21 +615,10 @@ void homa_grant_free_rpc(struct homa_rpc *rpc) if (!list_empty(&rpc->grantable_links)) { homa_grantable_lock(homa, 0); homa_grant_remove_rpc(rpc); - if (atomic_read(&rpc->msgin.rank) >= 0) { - /* Very tricky code below. We have to unlock the RPC before - * calling homa_grant_recalc. This creates a risk that the - * RPC could be reaped before the lock is reacquired. - * However, this function is only called from a specific - * place in homa_rpc_end where the RPC hasn't yet been put - * on the reap list, so there is no way it can be reaped - * until we return. - */ - homa_rpc_unlock(rpc); - homa_grant_recalc(homa, 1); - homa_rpc_lock(rpc); - } else { + if (atomic_read(&rpc->msgin.rank) >= 0) + homa_grant_recalc(homa, 1, rpc); + else homa_grantable_unlock(homa); - } } if (rpc->msgin.rec_incoming != 0) diff --git a/homa_grant.h b/homa_grant.h index 45073e68..248aa835 100644 --- a/homa_grant.h +++ b/homa_grant.h @@ -16,7 +16,8 @@ int homa_grant_outranks(struct homa_rpc *rpc1, int homa_grant_pick_rpcs(struct homa *homa, struct homa_rpc **rpcs, int max_rpcs); void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc); -void homa_grant_recalc(struct homa *homa, int locked); +void homa_grant_recalc(struct homa *homa, int locked, + struct homa_rpc *locked_rpc); void homa_grant_remove_rpc(struct homa_rpc *rpc); int homa_grant_send(struct homa_rpc *rpc, struct homa *homa); int homa_grant_update_incoming(struct homa_rpc *rpc, diff --git a/homa_incoming.c b/homa_incoming.c index d10e5c5e..f8fbffd0 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -584,12 +584,12 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) discard: kfree_skb(skb); } - if (rpc) + if (rpc) { #ifndef __STRIP__ /* See strip.py */ - homa_grant_check_rpc(rpc); /* Unlocks rpc. */ -#else /* See strip.py */ - homa_rpc_unlock(rpc); + homa_grant_check_rpc(rpc); #endif /* See strip.py */ + homa_rpc_unlock(rpc); + } while (num_acks > 0) { num_acks--; diff --git a/homa_pool.c b/homa_pool.c index 88ee1726..3472a56d 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -489,15 +489,9 @@ void homa_pool_check_waiting(struct homa_pool *pool) if (rpc->msgin.num_bpages > 0) { /* Allocation succeeded; "wake up" the RPC. */ rpc->msgin.resend_all = 1; - homa_grant_check_rpc(rpc); /* Unlocks rpc. */ - } else { - homa_rpc_unlock(rpc); + homa_grant_check_rpc(rpc); } -#else /* See strip.py */ - if (rpc->msgin.num_bpages > 0) - /* Allocation succeeded; "wake up" the RPC. 
*/ - rpc->msgin.resend_all = 1; - homa_rpc_unlock(rpc); #endif /* See strip.py */ + homa_rpc_unlock(rpc); } } diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index aaf08668..579b1899 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -515,7 +515,6 @@ TEST_F(homa_grant, homa_grant_check_rpc__msgin_not_initialized) rpc->msgin.bytes_remaining = 500; rpc->msgin.granted = 2000; rpc->msgin.rec_incoming = 0; - homa_rpc_lock(rpc); homa_grant_check_rpc(rpc); EXPECT_EQ(0, rpc->msgin.rec_incoming); EXPECT_EQ(0, atomic_read(&self->homa.total_incoming)); @@ -528,7 +527,6 @@ TEST_F(homa_grant, homa_grant_check_rpc__rpc_dead) int old_state; homa_message_in_init(rpc, 2000, 0); - homa_rpc_lock(rpc); homa_grant_check_rpc(rpc); EXPECT_EQ(2000, rpc->msgin.rec_incoming); EXPECT_EQ(2000, atomic_read(&self->homa.total_incoming)); @@ -536,7 +534,6 @@ TEST_F(homa_grant, homa_grant_check_rpc__rpc_dead) old_state = rpc->state; rpc->state = RPC_DEAD; rpc->msgin.bytes_remaining = 0; - homa_rpc_lock(rpc); homa_grant_check_rpc(rpc); rpc->state = old_state; EXPECT_EQ(2000, rpc->msgin.rec_incoming); @@ -552,13 +549,11 @@ TEST_F(homa_grant, homa_grant_check_rpc__message_doesnt_need_grants) rpc->msgin.granted = 2000; rpc->msgin.bytes_remaining = 500; - homa_rpc_lock(rpc); homa_grant_check_rpc(rpc); EXPECT_EQ(500, rpc->msgin.rec_incoming); EXPECT_EQ(500, atomic_read(&self->homa.total_incoming)); rpc->msgin.bytes_remaining = 0; - homa_rpc_lock(rpc); homa_grant_check_rpc(rpc); EXPECT_EQ(0, rpc->msgin.rec_incoming); EXPECT_EQ(0, atomic_read(&self->homa.total_incoming)); @@ -572,7 +567,6 @@ TEST_F(homa_grant, homa_grant_check_rpc__add_new_message_to_grantables) homa_message_in_init(rpc, 20000, 0); rpc->msgin.bytes_remaining = 12000; - homa_rpc_lock(rpc); homa_grant_check_rpc(rpc); EXPECT_EQ(18000, rpc->msgin.granted); EXPECT_EQ(10000, rpc->msgin.rec_incoming); @@ -586,14 +580,13 @@ TEST_F(homa_grant, homa_grant_check_rpc__new_message_bumps_existing) rpc1 = test_rpc(self, 100, self->server_ip, 20000); rpc2 = test_rpc(self, 102, self->server_ip, 30000); self->homa.max_overcommit = 2; - homa_grant_recalc(&self->homa, 0); + homa_grant_recalc(&self->homa, 0, NULL); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); rpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 104, 1000, 25000); homa_message_in_init(rpc3, 20000, 0); - homa_rpc_lock(rpc3); homa_grant_check_rpc(rpc3); EXPECT_EQ(10000, rpc3->msgin.granted); EXPECT_EQ(10000, rpc3->msgin.rec_incoming); @@ -608,7 +601,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__new_message_cant_be_granted) rpc1 = test_rpc(self, 100, self->server_ip, 20000); rpc2 = test_rpc(self, 102, self->server_ip, 30000); self->homa.max_overcommit = 2; - homa_grant_recalc(&self->homa, 0); + homa_grant_recalc(&self->homa, 0, NULL); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); rpc2->msgin.bytes_remaining = 1000; @@ -616,7 +609,6 @@ TEST_F(homa_grant, homa_grant_check_rpc__new_message_cant_be_granted) rpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 104, 1000, 30000); homa_message_in_init(rpc3, 30000, 0); - homa_rpc_lock(rpc3); homa_grant_check_rpc(rpc3); EXPECT_EQ(0, rpc3->msgin.granted); EXPECT_EQ(0, rpc3->msgin.rec_incoming); @@ -632,14 +624,13 @@ TEST_F(homa_grant, homa_grant_check_rpc__upgrade_priority_from_negative_rank) rpc2 = test_rpc(self, 102, self->server_ip, 30000); rpc3 = 
test_rpc(self, 104, self->server_ip, 40000); self->homa.max_overcommit = 2; - homa_grant_recalc(&self->homa, 0); + homa_grant_recalc(&self->homa, 0, NULL); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(-1, atomic_read(&rpc3->msgin.rank)); EXPECT_EQ(0, rpc3->msgin.granted); rpc3->msgin.bytes_remaining = 15000; - homa_rpc_lock(rpc3); homa_grant_check_rpc(rpc3); EXPECT_EQ(35000, rpc3->msgin.granted); EXPECT_EQ(10000, rpc3->msgin.rec_incoming); @@ -655,7 +646,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__upgrade_priority_from_positive_rank) rpc2 = test_rpc(self, 102, self->server_ip, 30000); rpc3 = test_rpc(self, 104, self->server_ip, 40000); self->homa.max_overcommit = 4; - homa_grant_recalc(&self->homa, 0); + homa_grant_recalc(&self->homa, 0, NULL); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(2, atomic_read(&rpc3->msgin.rank)); @@ -663,7 +654,6 @@ TEST_F(homa_grant, homa_grant_check_rpc__upgrade_priority_from_positive_rank) rpc3->msgin.bytes_remaining = 25000; unit_log_clear(); - homa_rpc_lock(rpc3); homa_grant_check_rpc(rpc3); EXPECT_EQ(25000, rpc3->msgin.granted); EXPECT_EQ(10000, rpc3->msgin.rec_incoming); @@ -677,14 +667,13 @@ TEST_F(homa_grant, homa_grant_check_rpc__send_new_grant) struct homa_rpc *rpc; rpc = test_rpc(self, 100, self->server_ip, 40000); - homa_grant_recalc(&self->homa, 0); + homa_grant_recalc(&self->homa, 0, NULL); EXPECT_EQ(0, atomic_read(&rpc->msgin.rank)); EXPECT_EQ(10000, rpc->msgin.granted); EXPECT_EQ(10000, atomic_read(&self->homa.total_incoming)); rpc->msgin.bytes_remaining = 35000; unit_log_clear(); - homa_rpc_lock(rpc); homa_grant_check_rpc(rpc); EXPECT_EQ(15000, rpc->msgin.granted); EXPECT_EQ(10000, rpc->msgin.rec_incoming); @@ -696,7 +685,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__remove_from_grantable) struct homa_rpc *rpc; rpc = test_rpc(self, 100, self->server_ip, 40000); - homa_grant_recalc(&self->homa, 0); + homa_grant_recalc(&self->homa, 0, NULL); EXPECT_EQ(0, atomic_read(&rpc->msgin.rank)); EXPECT_EQ(10000, rpc->msgin.granted); EXPECT_EQ(10000, atomic_read(&self->homa.total_incoming)); @@ -705,7 +694,6 @@ TEST_F(homa_grant, homa_grant_check_rpc__remove_from_grantable) rpc->msgin.granted = 30000; rpc->msgin.rec_incoming = 10000; unit_log_clear(); - homa_rpc_lock(rpc); homa_grant_check_rpc(rpc); EXPECT_EQ(40000, rpc->msgin.granted); EXPECT_EQ(10000, rpc->msgin.rec_incoming); @@ -722,7 +710,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__recalc_because_of_headroom) rpc1 = test_rpc(self, 100, self->server_ip, 20000); rpc2 = test_rpc(self, 102, self->server_ip, 30000); self->homa.max_incoming = 15000; - homa_grant_recalc(&self->homa, 0); + homa_grant_recalc(&self->homa, 0, NULL); EXPECT_EQ(15000, atomic_read(&self->homa.total_incoming)); EXPECT_EQ(10000, rpc1->msgin.granted); EXPECT_EQ(5000, rpc2->msgin.granted); @@ -731,7 +719,6 @@ TEST_F(homa_grant, homa_grant_check_rpc__recalc_because_of_headroom) rpc1->msgin.granted = 12000; rpc1->msgin.rec_incoming = 10000; unit_log_clear(); - homa_rpc_lock(rpc1); homa_grant_check_rpc(rpc1); EXPECT_EQ(20000, rpc1->msgin.granted); EXPECT_EQ(4000, rpc1->msgin.rec_incoming); @@ -754,7 +741,7 @@ TEST_F(homa_grant, homa_grant_recalc__basics) mock_ns_tick = 10; unit_log_clear(); - homa_grant_recalc(&self->homa, 0); + homa_grant_recalc(&self->homa, 0, NULL); EXPECT_STREQ("xmit GRANT 10000@2; " "xmit GRANT 10000@1; " "xmit GRANT 10000@0", unit_log_get()); @@ -779,7 +766,7 @@ TEST_F(homa_grant, 
homa_grant_recalc__already_locked) homa_grantable_lock(&self->homa, 0); unit_log_clear(); - homa_grant_recalc(&self->homa, 1); + homa_grant_recalc(&self->homa, 1, NULL); EXPECT_STREQ("xmit GRANT 10000@0", unit_log_get()); EXPECT_EQ(10000, rpc->msgin.granted); } @@ -792,7 +779,7 @@ TEST_F(homa_grant, homa_grant_recalc__skip_recalc) mock_trylock_errors = 0xff; unit_log_clear(); - homa_grant_recalc(&self->homa, 0); + homa_grant_recalc(&self->homa, 0, NULL); EXPECT_STREQ("", unit_log_get()); EXPECT_EQ(0, rpc->msgin.granted); EXPECT_EQ(2, atomic_read(&self->homa.grant_recalc_count)); @@ -813,7 +800,7 @@ TEST_F(homa_grant, homa_grant_recalc__clear_existing_active_rpcs) self->homa.max_rpcs_per_peer = 10; self->homa.max_overcommit = 2; - homa_grant_recalc(&self->homa, 0); + homa_grant_recalc(&self->homa, 0, NULL); EXPECT_EQ(-1, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(2, self->homa.num_active_rpcs); } @@ -827,7 +814,7 @@ TEST_F(homa_grant, homa_grant_recalc__use_only_lowest_priorities) self->homa.max_sched_prio = 5; unit_log_clear(); - homa_grant_recalc(&self->homa, 0); + homa_grant_recalc(&self->homa, 0, NULL); EXPECT_STREQ("xmit GRANT 10000@1; xmit GRANT 10000@0", unit_log_get()); EXPECT_EQ(1, rpc1->msgin.priority); EXPECT_EQ(0, rpc2->msgin.priority); @@ -844,7 +831,7 @@ TEST_F(homa_grant, homa_grant_recalc__share_lowest_priority_level) self->homa.max_sched_prio = 2; unit_log_clear(); - homa_grant_recalc(&self->homa, 0); + homa_grant_recalc(&self->homa, 0, NULL); EXPECT_STREQ("xmit GRANT 10000@2; " "xmit GRANT 10000@1; " "xmit GRANT 10000@0; " @@ -866,7 +853,7 @@ TEST_F(homa_grant, homa_grant_recalc__compute_window_size) /* First try: fixed window size. */ homa_grantable_lock(&self->homa, 0); self->homa.window_param = 5000; - homa_grant_recalc(&self->homa, 1); + homa_grant_recalc(&self->homa, 1, NULL); EXPECT_EQ(5000, self->homa.grant_window); EXPECT_EQ(5000, rpc1->msgin.granted); EXPECT_EQ(5000, rpc2->msgin.granted); @@ -874,7 +861,7 @@ TEST_F(homa_grant, homa_grant_recalc__compute_window_size) /* Second try: dynamic window size. 
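	 * (when window_param is 0 the window is computed as
	 * max_incoming / (num_active_rpcs + 1); with the three RPCs
	 * active here that is 100000 / 4 == 25000, which is what the
	 * EXPECT_EQs below check.)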
*/ self->homa.window_param = 0; - homa_grant_recalc(&self->homa, 0); + homa_grant_recalc(&self->homa, 0, NULL); EXPECT_EQ(25000, self->homa.grant_window); EXPECT_EQ(25000, rpc1->msgin.granted); EXPECT_EQ(25000, rpc2->msgin.granted); @@ -891,7 +878,7 @@ TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted) self->homa.max_incoming = 32000; self->homa.max_overcommit = 2; - homa_grant_recalc(&self->homa, 0); + homa_grant_recalc(&self->homa, 0, NULL); EXPECT_EQ(10000, rpc1->msgin.granted); EXPECT_EQ(10000, rpc2->msgin.granted); EXPECT_EQ(10000, rpc3->msgin.granted); @@ -912,7 +899,7 @@ TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted_but_skip_recalc) mock_trylock_errors = 0xfe0; EXPECT_EQ(0, homa_metrics_per_cpu()->grant_recalc_skips); - homa_grant_recalc(&self->homa, 0); + homa_grant_recalc(&self->homa, 0, NULL); EXPECT_EQ(10000, rpc1->msgin.granted); EXPECT_EQ(10000, rpc2->msgin.granted); EXPECT_EQ(0, rpc3->msgin.granted); @@ -1071,7 +1058,7 @@ TEST_F(homa_grant, homa_grant_free_rpc__in_active_list) rpc2 = test_rpc(self, 102, self->server_ip, 30000); rpc3 = test_rpc(self, 104, self->server_ip, 40000); self->homa.max_overcommit = 2; - homa_grant_recalc(&self->homa, 0); + homa_grant_recalc(&self->homa, 0, NULL); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(-1, atomic_read(&rpc3->msgin.rank)); @@ -1093,7 +1080,7 @@ TEST_F(homa_grant, homa_grant_free_rpc__not_in_active_list) rpc2 = test_rpc(self, 102, self->server_ip, 30000); rpc3 = test_rpc(self, 104, self->server_ip, 40000); self->homa.max_overcommit = 2; - homa_grant_recalc(&self->homa, 0); + homa_grant_recalc(&self->homa, 0, NULL); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(-1, atomic_read(&rpc3->msgin.rank)); From a00cd2518b9f088d041f9c3d53a9dbb18ca8f3a5 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 6 Feb 2025 11:10:22 -0800 Subject: [PATCH 192/625] Refactor grantable lock usage in homa_grant.c * Acquire grantable lock in homa_grant_add_rpc, homa_grant_remove_rpc, and homa_grant_recalc, not in homa_grant_check. * Eliminate "locked" argument to homa_grant_recalc. * Factor out homa_grant_recalc calls in homa_grant_check: do them all in one place. --- homa_grant.c | 70 +++++++++++++++++------------------------- homa_grant.h | 3 +- test/mock.c | 7 +++++ test/mock.h | 1 + test/unit_homa_grant.c | 56 +++++++++++++++++---------------- 5 files changed, 68 insertions(+), 69 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index be0d46b8..c8f70766 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -64,8 +64,8 @@ int homa_grant_update_incoming(struct homa_rpc *rpc, struct homa *homa) * homa_grant_add_rpc() - Make sure that an RPC is present in the grantable * list for its peer and in the appropriate position, and that the peer is * present in the overall grantable list for Homa and in the correct - * position. The caller must hold the grantable lock and the RPC's lock. - * @rpc: The RPC to add/reposition. + * position. + * @rpc: The RPC to add/reposition. Must be locked by caller. */ void homa_grant_add_rpc(struct homa_rpc *rpc) { @@ -74,6 +74,8 @@ void homa_grant_add_rpc(struct homa_rpc *rpc) struct homa_peer *peer_cand; struct homa_rpc *candidate; + homa_grantable_lock(homa, 0); + /* Make sure this message is in the right place in the grantable_rpcs * list for its peer. 
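	 * (Note: as of this patch the function acquires
	 * homa->grantable_lock itself, so callers need only hold the
	 * RPC's lock.)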
*/ @@ -151,14 +153,15 @@ void homa_grant_add_rpc(struct homa_rpc *rpc) list_add(&prev_peer->grantable_links, &peer->grantable_links); } done: + homa_grantable_unlock(homa); return; } /** * homa_grant_remove_rpc() - Unlink an RPC from the grantable lists, so it will - * no longer be considered for grants. The caller must hold the grantable lock. + * no longer be considered for grants. * @rpc: RPC to remove from grantable lists. Must currently be in - * a grantable list. + * a grantable list. Must be locked by caller. */ void homa_grant_remove_rpc(struct homa_rpc *rpc) { @@ -171,6 +174,8 @@ void homa_grant_remove_rpc(struct homa_rpc *rpc) if (list_empty(&rpc->grantable_links)) return; + homa_grantable_lock(homa, 0); + if (homa->oldest_rpc == rpc) homa->oldest_rpc = NULL; @@ -184,7 +189,7 @@ void homa_grant_remove_rpc(struct homa_rpc *rpc) tt_record2("Decremented num_grantable_rpcs to %d, id %d", homa->num_grantable_rpcs, rpc->id); if (rpc != head) - return; + goto done; /* The removed RPC was at the front of the peer's list. This means * we may have to adjust the position of the peer in Homa's list, @@ -192,7 +197,7 @@ void homa_grant_remove_rpc(struct homa_rpc *rpc) */ if (list_empty(&peer->grantable_rpcs)) { list_del_init(&peer->grantable_links); - return; + goto done; } /* The peer may have to move down in Homa's list (removal of @@ -211,6 +216,10 @@ void homa_grant_remove_rpc(struct homa_rpc *rpc) __list_del_entry(&peer->grantable_links); list_add(&peer->grantable_links, &next_peer->grantable_links); } + +done: + homa_grantable_unlock(homa); + return; } /** @@ -307,25 +316,21 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) * granting. */ if (list_empty(&rpc->grantable_links)) { - homa_grantable_lock(homa, 0); homa_grant_add_rpc(rpc); recalc = (homa->num_active_rpcs < homa->max_overcommit || rpc->msgin.bytes_remaining < atomic_read(&homa->active_remaining [homa->max_overcommit - 1])); - if (recalc) - homa_grant_recalc(homa, 1, rpc); - else - homa_grantable_unlock(homa); goto done; } /* Not a new message; see if we can upgrade the message's priority. */ rank = atomic_read(&rpc->msgin.rank); if (rank < 0) { - if (rpc->msgin.bytes_remaining < atomic_read(&homa->active_remaining[homa->max_overcommit - 1])) { + if (rpc->msgin.bytes_remaining < + atomic_read(&homa->active_remaining[homa->max_overcommit - 1])) { INC_METRIC(grant_priority_bumps, 1); - homa_grant_recalc(homa, 0, rpc); + recalc = 1; } goto done; } @@ -333,7 +338,7 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) if (rank > 0 && rpc->msgin.bytes_remaining < atomic_read(&homa->active_remaining[rank - 1])) { INC_METRIC(grant_priority_bumps, 1); - homa_grant_recalc(homa, 0, rpc); + recalc = 1; goto done; } @@ -345,15 +350,13 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) /* Is the message now fully granted? */ if (rpc->msgin.granted >= rpc->msgin.length) { - homa_grantable_lock(homa, 0); homa_grant_remove_rpc(rpc); - homa_grant_recalc(homa, 1, rpc); - goto done; + recalc = 1; } - if (recalc) - homa_grant_recalc(homa, 0, rpc); done: + if (recalc) + homa_grant_recalc(homa, rpc); tt_record1("homa_grant_check_rpc finished with id %d", rpc->id); } @@ -364,18 +367,13 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) * invoked whenever something happens that could change the contents or order * of homa->active_rpcs. * @homa: Overall information about the Homa transport. - * @locked: Normally this function will acquire (and release) - * homa->grantable_lock. 
If this value is nonzero, it means - * the caller has already acquired homa->grantable_lock. In - * either case the lock will be released upon return. * @caller_rpc: An RPC for which the caller holds the lock. This function * may release this lock in order acquire other RPC locks, * but if so, it will reacquire the lock before returning. * The caller must not hold any locks that prevent RPCs * from being locked (see sync.txt). NULL means no lock held. */ -void homa_grant_recalc(struct homa *homa, int locked, - struct homa_rpc *caller_rpc) +void homa_grant_recalc(struct homa *homa, struct homa_rpc *caller_rpc) { /* The tricky part of this method is that we need to release * homa->grantable_lock before actually sending grants, because @@ -390,12 +388,6 @@ void homa_grant_recalc(struct homa *homa, int locked, tt_record("homa_grant_recalc starting"); INC_METRIC(grant_recalc_calls, 1); - if (!locked) { - if (!homa_grantable_lock(homa, 1)) { - INC_METRIC(grant_recalc_skips, 1); - return; - } - } start = sched_clock(); if (likely(caller_rpc)) { @@ -408,6 +400,11 @@ void homa_grant_recalc(struct homa *homa, int locked, * opportunities to grant to additional messages. */ while (1) { + if (!homa_grantable_lock(homa, 1)) { + INC_METRIC(grant_recalc_skips, 1); + break; + } + try_again = 0; atomic_inc(&homa->grant_recalc_count); @@ -469,10 +466,8 @@ void homa_grant_recalc(struct homa *homa, int locked, homa_grant_send(rpc, homa); try_again += homa_grant_update_incoming(rpc, homa); if (rpc->msgin.granted >= rpc->msgin.length) { - homa_grantable_lock(homa, 0); try_again += 1; homa_grant_remove_rpc(rpc); - homa_grantable_unlock(homa); } homa_rpc_put(rpc); homa_rpc_unlock(rpc); @@ -481,10 +476,6 @@ void homa_grant_recalc(struct homa *homa, int locked, if (try_again == 0) break; INC_METRIC(grant_recalc_loops, 1); - if (!homa_grantable_lock(homa, 1)) { - INC_METRIC(grant_recalc_skips, 1); - break; - } } if (likely(caller_rpc)) { homa_rpc_lock(caller_rpc); @@ -613,12 +604,9 @@ void homa_grant_free_rpc(struct homa_rpc *rpc) struct homa *homa = rpc->hsk->homa; if (!list_empty(&rpc->grantable_links)) { - homa_grantable_lock(homa, 0); homa_grant_remove_rpc(rpc); if (atomic_read(&rpc->msgin.rank) >= 0) - homa_grant_recalc(homa, 1, rpc); - else - homa_grantable_unlock(homa); + homa_grant_recalc(homa, rpc); } if (rpc->msgin.rec_incoming != 0) diff --git a/homa_grant.h b/homa_grant.h index 248aa835..0088aeb4 100644 --- a/homa_grant.h +++ b/homa_grant.h @@ -16,8 +16,7 @@ int homa_grant_outranks(struct homa_rpc *rpc1, int homa_grant_pick_rpcs(struct homa *homa, struct homa_rpc **rpcs, int max_rpcs); void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc); -void homa_grant_recalc(struct homa *homa, int locked, - struct homa_rpc *locked_rpc); +void homa_grant_recalc(struct homa *homa, struct homa_rpc *locked_rpc); void homa_grant_remove_rpc(struct homa_rpc *rpc); int homa_grant_send(struct homa_rpc *rpc, struct homa *homa); int homa_grant_update_incoming(struct homa_rpc *rpc, diff --git a/test/mock.c b/test/mock.c index 88a23c58..ac1409bd 100644 --- a/test/mock.c +++ b/test/mock.c @@ -127,6 +127,9 @@ static int mock_active_locks; */ static int mock_active_spin_locks; +/* Total number of successful spinlock acquisitions during current test. */ +int mock_total_spin_locks; + /* The number of times rcu_read_lock has been called minus the number * of times rcu_read_unlock has been called. * Should be 0 at the end of each test. 
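(A note on the new counter: every successful acquisition in _raw_spin_lock,
_raw_spin_lock_bh, and _raw_spin_trylock_bh increments mock_total_spin_locks,
and mock_teardown resets it, so a test can assert an exact number of lock
acquisitions. The release_parent_rpc_lock test added below relies on this:
homa_grant_recalc with a NULL RPC performs exactly one acquisition, the
grantable lock, while passing a locked RPC adds two more, the grantable lock
plus reacquiring the caller's RPC lock; hence the expected values 1 and 3.)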
@@ -1004,12 +1007,14 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) void _raw_spin_lock(raw_spinlock_t *lock) { mock_active_spin_locks++; + mock_total_spin_locks++; } void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) { UNIT_HOOK("spin_lock"); mock_active_spin_locks++; + mock_total_spin_locks++; } void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, @@ -1022,6 +1027,7 @@ int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) if (mock_check_error(&mock_trylock_errors)) return 0; mock_active_spin_locks++; + mock_total_spin_locks++; return 1; } @@ -1826,6 +1832,7 @@ void mock_teardown(void) FAIL(" %d spin locks still locked after test", mock_active_spin_locks); mock_active_spin_locks = 0; + mock_total_spin_locks = 0; if (mock_active_rcu_locks != 0) FAIL(" %d rcu_read_locks still active after test", diff --git a/test/mock.h b/test/mock.h index b0865621..d7196206 100644 --- a/test/mock.h +++ b/test/mock.h @@ -142,6 +142,7 @@ extern int mock_sock_holds; extern int mock_spin_lock_held; extern struct task_struct mock_task; +extern int mock_total_spin_locks; extern int mock_trylock_errors; extern int mock_vmalloc_errors; extern int mock_xmit_log_verbose; diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 579b1899..3f95df37 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -580,7 +580,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__new_message_bumps_existing) rpc1 = test_rpc(self, 100, self->server_ip, 20000); rpc2 = test_rpc(self, 102, self->server_ip, 30000); self->homa.max_overcommit = 2; - homa_grant_recalc(&self->homa, 0, NULL); + homa_grant_recalc(&self->homa, NULL); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); @@ -601,7 +601,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__new_message_cant_be_granted) rpc1 = test_rpc(self, 100, self->server_ip, 20000); rpc2 = test_rpc(self, 102, self->server_ip, 30000); self->homa.max_overcommit = 2; - homa_grant_recalc(&self->homa, 0, NULL); + homa_grant_recalc(&self->homa, NULL); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); rpc2->msgin.bytes_remaining = 1000; @@ -624,7 +624,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__upgrade_priority_from_negative_rank) rpc2 = test_rpc(self, 102, self->server_ip, 30000); rpc3 = test_rpc(self, 104, self->server_ip, 40000); self->homa.max_overcommit = 2; - homa_grant_recalc(&self->homa, 0, NULL); + homa_grant_recalc(&self->homa, NULL); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(-1, atomic_read(&rpc3->msgin.rank)); @@ -646,7 +646,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__upgrade_priority_from_positive_rank) rpc2 = test_rpc(self, 102, self->server_ip, 30000); rpc3 = test_rpc(self, 104, self->server_ip, 40000); self->homa.max_overcommit = 4; - homa_grant_recalc(&self->homa, 0, NULL); + homa_grant_recalc(&self->homa, NULL); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(2, atomic_read(&rpc3->msgin.rank)); @@ -667,7 +667,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__send_new_grant) struct homa_rpc *rpc; rpc = test_rpc(self, 100, self->server_ip, 40000); - homa_grant_recalc(&self->homa, 0, NULL); + homa_grant_recalc(&self->homa, NULL); EXPECT_EQ(0, atomic_read(&rpc->msgin.rank)); EXPECT_EQ(10000, rpc->msgin.granted); EXPECT_EQ(10000, atomic_read(&self->homa.total_incoming)); @@ -685,7 +685,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__remove_from_grantable) 
struct homa_rpc *rpc; rpc = test_rpc(self, 100, self->server_ip, 40000); - homa_grant_recalc(&self->homa, 0, NULL); + homa_grant_recalc(&self->homa, NULL); EXPECT_EQ(0, atomic_read(&rpc->msgin.rank)); EXPECT_EQ(10000, rpc->msgin.granted); EXPECT_EQ(10000, atomic_read(&self->homa.total_incoming)); @@ -710,7 +710,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__recalc_because_of_headroom) rpc1 = test_rpc(self, 100, self->server_ip, 20000); rpc2 = test_rpc(self, 102, self->server_ip, 30000); self->homa.max_incoming = 15000; - homa_grant_recalc(&self->homa, 0, NULL); + homa_grant_recalc(&self->homa, NULL); EXPECT_EQ(15000, atomic_read(&self->homa.total_incoming)); EXPECT_EQ(10000, rpc1->msgin.granted); EXPECT_EQ(5000, rpc2->msgin.granted); @@ -741,7 +741,7 @@ TEST_F(homa_grant, homa_grant_recalc__basics) mock_ns_tick = 10; unit_log_clear(); - homa_grant_recalc(&self->homa, 0, NULL); + homa_grant_recalc(&self->homa, NULL); EXPECT_STREQ("xmit GRANT 10000@2; " "xmit GRANT 10000@1; " "xmit GRANT 10000@0", unit_log_get()); @@ -760,15 +760,20 @@ TEST_F(homa_grant, homa_grant_recalc__basics) EXPECT_EQ(-1, atomic_read(&rpc4->msgin.rank)); EXPECT_NE(0, homa_metrics_per_cpu()->grant_recalc_ns); } -TEST_F(homa_grant, homa_grant_recalc__already_locked) +TEST_F(homa_grant, homa_grant_recalc__release_parent_rpc_lock) { - struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); + struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, + self->client_ip, self->server_ip, self->server_port, + 100, 1000, 10000); - homa_grantable_lock(&self->homa, 0); - unit_log_clear(); - homa_grant_recalc(&self->homa, 1, NULL); - EXPECT_STREQ("xmit GRANT 10000@0", unit_log_get()); - EXPECT_EQ(10000, rpc->msgin.granted); + homa_rpc_lock(rpc); + mock_total_spin_locks = 0; + homa_grant_recalc(&self->homa, NULL); + EXPECT_EQ(1, mock_total_spin_locks); + + homa_grant_recalc(&self->homa, rpc); + EXPECT_EQ(3, mock_total_spin_locks); + homa_rpc_unlock(rpc); } TEST_F(homa_grant, homa_grant_recalc__skip_recalc) { @@ -779,7 +784,7 @@ TEST_F(homa_grant, homa_grant_recalc__skip_recalc) mock_trylock_errors = 0xff; unit_log_clear(); - homa_grant_recalc(&self->homa, 0, NULL); + homa_grant_recalc(&self->homa, NULL); EXPECT_STREQ("", unit_log_get()); EXPECT_EQ(0, rpc->msgin.granted); EXPECT_EQ(2, atomic_read(&self->homa.grant_recalc_count)); @@ -800,7 +805,7 @@ TEST_F(homa_grant, homa_grant_recalc__clear_existing_active_rpcs) self->homa.max_rpcs_per_peer = 10; self->homa.max_overcommit = 2; - homa_grant_recalc(&self->homa, 0, NULL); + homa_grant_recalc(&self->homa, NULL); EXPECT_EQ(-1, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(2, self->homa.num_active_rpcs); } @@ -814,7 +819,7 @@ TEST_F(homa_grant, homa_grant_recalc__use_only_lowest_priorities) self->homa.max_sched_prio = 5; unit_log_clear(); - homa_grant_recalc(&self->homa, 0, NULL); + homa_grant_recalc(&self->homa, NULL); EXPECT_STREQ("xmit GRANT 10000@1; xmit GRANT 10000@0", unit_log_get()); EXPECT_EQ(1, rpc1->msgin.priority); EXPECT_EQ(0, rpc2->msgin.priority); @@ -831,7 +836,7 @@ TEST_F(homa_grant, homa_grant_recalc__share_lowest_priority_level) self->homa.max_sched_prio = 2; unit_log_clear(); - homa_grant_recalc(&self->homa, 0, NULL); + homa_grant_recalc(&self->homa, NULL); EXPECT_STREQ("xmit GRANT 10000@2; " "xmit GRANT 10000@1; " "xmit GRANT 10000@0; " @@ -851,9 +856,8 @@ TEST_F(homa_grant, homa_grant_recalc__compute_window_size) self->homa.max_incoming = 100000; /* First try: fixed window size. 
*/ - homa_grantable_lock(&self->homa, 0); self->homa.window_param = 5000; - homa_grant_recalc(&self->homa, 1, NULL); + homa_grant_recalc(&self->homa, NULL); EXPECT_EQ(5000, self->homa.grant_window); EXPECT_EQ(5000, rpc1->msgin.granted); EXPECT_EQ(5000, rpc2->msgin.granted); @@ -861,7 +865,7 @@ TEST_F(homa_grant, homa_grant_recalc__compute_window_size) /* Second try: dynamic window size. */ self->homa.window_param = 0; - homa_grant_recalc(&self->homa, 0, NULL); + homa_grant_recalc(&self->homa, NULL); EXPECT_EQ(25000, self->homa.grant_window); EXPECT_EQ(25000, rpc1->msgin.granted); EXPECT_EQ(25000, rpc2->msgin.granted); @@ -878,7 +882,7 @@ TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted) self->homa.max_incoming = 32000; self->homa.max_overcommit = 2; - homa_grant_recalc(&self->homa, 0, NULL); + homa_grant_recalc(&self->homa, NULL); EXPECT_EQ(10000, rpc1->msgin.granted); EXPECT_EQ(10000, rpc2->msgin.granted); EXPECT_EQ(10000, rpc3->msgin.granted); @@ -899,7 +903,7 @@ TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted_but_skip_recalc) mock_trylock_errors = 0xfe0; EXPECT_EQ(0, homa_metrics_per_cpu()->grant_recalc_skips); - homa_grant_recalc(&self->homa, 0, NULL); + homa_grant_recalc(&self->homa, NULL); EXPECT_EQ(10000, rpc1->msgin.granted); EXPECT_EQ(10000, rpc2->msgin.granted); EXPECT_EQ(0, rpc3->msgin.granted); @@ -1058,7 +1062,7 @@ TEST_F(homa_grant, homa_grant_free_rpc__in_active_list) rpc2 = test_rpc(self, 102, self->server_ip, 30000); rpc3 = test_rpc(self, 104, self->server_ip, 40000); self->homa.max_overcommit = 2; - homa_grant_recalc(&self->homa, 0, NULL); + homa_grant_recalc(&self->homa, NULL); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(-1, atomic_read(&rpc3->msgin.rank)); @@ -1080,7 +1084,7 @@ TEST_F(homa_grant, homa_grant_free_rpc__not_in_active_list) rpc2 = test_rpc(self, 102, self->server_ip, 30000); rpc3 = test_rpc(self, 104, self->server_ip, 40000); self->homa.max_overcommit = 2; - homa_grant_recalc(&self->homa, 0, NULL); + homa_grant_recalc(&self->homa, NULL); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(-1, atomic_read(&rpc3->msgin.rank)); From bb87a673fda0a952a73b8d7052f0d4123b140ebb Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 6 Feb 2025 11:21:59 -0800 Subject: [PATCH 193/625] Remove "parent_rpc" argument to homa_grant_recalc Handle RPC unlocking/relocking in callers. --- homa_grant.c | 37 ++++++++++++++---------------- homa_grant.h | 2 +- test/unit_homa_grant.c | 51 +++++++++++++++--------------------------- 3 files changed, 36 insertions(+), 54 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index c8f70766..1ad87fc3 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -355,8 +355,13 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) } done: - if (recalc) - homa_grant_recalc(homa, rpc); + if (recalc) { + homa_rpc_hold(rpc); + homa_rpc_unlock(rpc); + homa_grant_recalc(homa); + homa_rpc_lock(rpc); + homa_rpc_put(rpc); + } tt_record1("homa_grant_check_rpc finished with id %d", rpc->id); } @@ -365,15 +370,11 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) * and what priorities to use for each. If needed, send out grant packets to * ensure that all appropriate grants have been issued. This function is * invoked whenever something happens that could change the contents or order - * of homa->active_rpcs. + * of homa->active_rpcs. Caller must not hold any RPC locks (this function + * may need to lock RPCs). 
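+ * Callers that do hold an RPC's lock must drop it first, taking a
+ * reference with homa_rpc_hold so the RPC can't be reaped in the
+ * meantime, and reacquire the lock afterwards (this is what
+ * homa_grant_check_rpc and homa_grant_free_rpc now do).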
* @homa: Overall information about the Homa transport. - * @caller_rpc: An RPC for which the caller holds the lock. This function - * may release this lock in order acquire other RPC locks, - * but if so, it will reacquire the lock before returning. - * The caller must not hold any locks that prevent RPCs - * from being locked (see sync.txt). NULL means no lock held. */ -void homa_grant_recalc(struct homa *homa, struct homa_rpc *caller_rpc) +void homa_grant_recalc(struct homa *homa) { /* The tricky part of this method is that we need to release * homa->grantable_lock before actually sending grants, because @@ -390,11 +391,6 @@ void homa_grant_recalc(struct homa *homa, struct homa_rpc *caller_rpc) INC_METRIC(grant_recalc_calls, 1); start = sched_clock(); - if (likely(caller_rpc)) { - homa_rpc_hold(caller_rpc); - homa_rpc_unlock(caller_rpc); - } - /* We may have to recalculate multiple times if grants sent in one * round cause messages to be completely granted, opening up * opportunities to grant to additional messages. @@ -477,10 +473,6 @@ void homa_grant_recalc(struct homa *homa, struct homa_rpc *caller_rpc) break; INC_METRIC(grant_recalc_loops, 1); } - if (likely(caller_rpc)) { - homa_rpc_lock(caller_rpc); - homa_rpc_put(caller_rpc); - } INC_METRIC(grant_recalc_ns, sched_clock() - start); } @@ -605,8 +597,13 @@ void homa_grant_free_rpc(struct homa_rpc *rpc) if (!list_empty(&rpc->grantable_links)) { homa_grant_remove_rpc(rpc); - if (atomic_read(&rpc->msgin.rank) >= 0) - homa_grant_recalc(homa, rpc); + if (atomic_read(&rpc->msgin.rank) >= 0) { + homa_rpc_hold(rpc); + homa_rpc_unlock(rpc); + homa_grant_recalc(homa); + homa_rpc_lock(rpc); + homa_rpc_put(rpc); + } } if (rpc->msgin.rec_incoming != 0) diff --git a/homa_grant.h b/homa_grant.h index 0088aeb4..fc43d43a 100644 --- a/homa_grant.h +++ b/homa_grant.h @@ -16,7 +16,7 @@ int homa_grant_outranks(struct homa_rpc *rpc1, int homa_grant_pick_rpcs(struct homa *homa, struct homa_rpc **rpcs, int max_rpcs); void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc); -void homa_grant_recalc(struct homa *homa, struct homa_rpc *locked_rpc); +void homa_grant_recalc(struct homa *homa); void homa_grant_remove_rpc(struct homa_rpc *rpc); int homa_grant_send(struct homa_rpc *rpc, struct homa *homa); int homa_grant_update_incoming(struct homa_rpc *rpc, diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 3f95df37..d606f699 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -580,7 +580,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__new_message_bumps_existing) rpc1 = test_rpc(self, 100, self->server_ip, 20000); rpc2 = test_rpc(self, 102, self->server_ip, 30000); self->homa.max_overcommit = 2; - homa_grant_recalc(&self->homa, NULL); + homa_grant_recalc(&self->homa); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); @@ -601,7 +601,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__new_message_cant_be_granted) rpc1 = test_rpc(self, 100, self->server_ip, 20000); rpc2 = test_rpc(self, 102, self->server_ip, 30000); self->homa.max_overcommit = 2; - homa_grant_recalc(&self->homa, NULL); + homa_grant_recalc(&self->homa); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); rpc2->msgin.bytes_remaining = 1000; @@ -624,7 +624,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__upgrade_priority_from_negative_rank) rpc2 = test_rpc(self, 102, self->server_ip, 30000); rpc3 = test_rpc(self, 104, self->server_ip, 40000); self->homa.max_overcommit = 2; - 
homa_grant_recalc(&self->homa, NULL); + homa_grant_recalc(&self->homa); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(-1, atomic_read(&rpc3->msgin.rank)); @@ -646,7 +646,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__upgrade_priority_from_positive_rank) rpc2 = test_rpc(self, 102, self->server_ip, 30000); rpc3 = test_rpc(self, 104, self->server_ip, 40000); self->homa.max_overcommit = 4; - homa_grant_recalc(&self->homa, NULL); + homa_grant_recalc(&self->homa); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(2, atomic_read(&rpc3->msgin.rank)); @@ -667,7 +667,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__send_new_grant) struct homa_rpc *rpc; rpc = test_rpc(self, 100, self->server_ip, 40000); - homa_grant_recalc(&self->homa, NULL); + homa_grant_recalc(&self->homa); EXPECT_EQ(0, atomic_read(&rpc->msgin.rank)); EXPECT_EQ(10000, rpc->msgin.granted); EXPECT_EQ(10000, atomic_read(&self->homa.total_incoming)); @@ -685,7 +685,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__remove_from_grantable) struct homa_rpc *rpc; rpc = test_rpc(self, 100, self->server_ip, 40000); - homa_grant_recalc(&self->homa, NULL); + homa_grant_recalc(&self->homa); EXPECT_EQ(0, atomic_read(&rpc->msgin.rank)); EXPECT_EQ(10000, rpc->msgin.granted); EXPECT_EQ(10000, atomic_read(&self->homa.total_incoming)); @@ -710,7 +710,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__recalc_because_of_headroom) rpc1 = test_rpc(self, 100, self->server_ip, 20000); rpc2 = test_rpc(self, 102, self->server_ip, 30000); self->homa.max_incoming = 15000; - homa_grant_recalc(&self->homa, NULL); + homa_grant_recalc(&self->homa); EXPECT_EQ(15000, atomic_read(&self->homa.total_incoming)); EXPECT_EQ(10000, rpc1->msgin.granted); EXPECT_EQ(5000, rpc2->msgin.granted); @@ -741,7 +741,7 @@ TEST_F(homa_grant, homa_grant_recalc__basics) mock_ns_tick = 10; unit_log_clear(); - homa_grant_recalc(&self->homa, NULL); + homa_grant_recalc(&self->homa); EXPECT_STREQ("xmit GRANT 10000@2; " "xmit GRANT 10000@1; " "xmit GRANT 10000@0", unit_log_get()); @@ -760,21 +760,6 @@ TEST_F(homa_grant, homa_grant_recalc__basics) EXPECT_EQ(-1, atomic_read(&rpc4->msgin.rank)); EXPECT_NE(0, homa_metrics_per_cpu()->grant_recalc_ns); } -TEST_F(homa_grant, homa_grant_recalc__release_parent_rpc_lock) -{ - struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->server_port, - 100, 1000, 10000); - - homa_rpc_lock(rpc); - mock_total_spin_locks = 0; - homa_grant_recalc(&self->homa, NULL); - EXPECT_EQ(1, mock_total_spin_locks); - - homa_grant_recalc(&self->homa, rpc); - EXPECT_EQ(3, mock_total_spin_locks); - homa_rpc_unlock(rpc); -} TEST_F(homa_grant, homa_grant_recalc__skip_recalc) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); @@ -784,7 +769,7 @@ TEST_F(homa_grant, homa_grant_recalc__skip_recalc) mock_trylock_errors = 0xff; unit_log_clear(); - homa_grant_recalc(&self->homa, NULL); + homa_grant_recalc(&self->homa); EXPECT_STREQ("", unit_log_get()); EXPECT_EQ(0, rpc->msgin.granted); EXPECT_EQ(2, atomic_read(&self->homa.grant_recalc_count)); @@ -805,7 +790,7 @@ TEST_F(homa_grant, homa_grant_recalc__clear_existing_active_rpcs) self->homa.max_rpcs_per_peer = 10; self->homa.max_overcommit = 2; - homa_grant_recalc(&self->homa, NULL); + homa_grant_recalc(&self->homa); EXPECT_EQ(-1, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(2, self->homa.num_active_rpcs); } @@ -819,7 +804,7 @@ TEST_F(homa_grant, 
homa_grant_recalc__use_only_lowest_priorities) self->homa.max_sched_prio = 5; unit_log_clear(); - homa_grant_recalc(&self->homa, NULL); + homa_grant_recalc(&self->homa); EXPECT_STREQ("xmit GRANT 10000@1; xmit GRANT 10000@0", unit_log_get()); EXPECT_EQ(1, rpc1->msgin.priority); EXPECT_EQ(0, rpc2->msgin.priority); @@ -836,7 +821,7 @@ TEST_F(homa_grant, homa_grant_recalc__share_lowest_priority_level) self->homa.max_sched_prio = 2; unit_log_clear(); - homa_grant_recalc(&self->homa, NULL); + homa_grant_recalc(&self->homa); EXPECT_STREQ("xmit GRANT 10000@2; " "xmit GRANT 10000@1; " "xmit GRANT 10000@0; " @@ -857,7 +842,7 @@ TEST_F(homa_grant, homa_grant_recalc__compute_window_size) /* First try: fixed window size. */ self->homa.window_param = 5000; - homa_grant_recalc(&self->homa, NULL); + homa_grant_recalc(&self->homa); EXPECT_EQ(5000, self->homa.grant_window); EXPECT_EQ(5000, rpc1->msgin.granted); EXPECT_EQ(5000, rpc2->msgin.granted); @@ -865,7 +850,7 @@ TEST_F(homa_grant, homa_grant_recalc__compute_window_size) /* Second try: dynamic window size. */ self->homa.window_param = 0; - homa_grant_recalc(&self->homa, NULL); + homa_grant_recalc(&self->homa); EXPECT_EQ(25000, self->homa.grant_window); EXPECT_EQ(25000, rpc1->msgin.granted); EXPECT_EQ(25000, rpc2->msgin.granted); @@ -882,7 +867,7 @@ TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted) self->homa.max_incoming = 32000; self->homa.max_overcommit = 2; - homa_grant_recalc(&self->homa, NULL); + homa_grant_recalc(&self->homa); EXPECT_EQ(10000, rpc1->msgin.granted); EXPECT_EQ(10000, rpc2->msgin.granted); EXPECT_EQ(10000, rpc3->msgin.granted); @@ -903,7 +888,7 @@ TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted_but_skip_recalc) mock_trylock_errors = 0xfe0; EXPECT_EQ(0, homa_metrics_per_cpu()->grant_recalc_skips); - homa_grant_recalc(&self->homa, NULL); + homa_grant_recalc(&self->homa); EXPECT_EQ(10000, rpc1->msgin.granted); EXPECT_EQ(10000, rpc2->msgin.granted); EXPECT_EQ(0, rpc3->msgin.granted); @@ -1062,7 +1047,7 @@ TEST_F(homa_grant, homa_grant_free_rpc__in_active_list) rpc2 = test_rpc(self, 102, self->server_ip, 30000); rpc3 = test_rpc(self, 104, self->server_ip, 40000); self->homa.max_overcommit = 2; - homa_grant_recalc(&self->homa, NULL); + homa_grant_recalc(&self->homa); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(-1, atomic_read(&rpc3->msgin.rank)); @@ -1084,7 +1069,7 @@ TEST_F(homa_grant, homa_grant_free_rpc__not_in_active_list) rpc2 = test_rpc(self, 102, self->server_ip, 30000); rpc3 = test_rpc(self, 104, self->server_ip, 40000); self->homa.max_overcommit = 2; - homa_grant_recalc(&self->homa, NULL); + homa_grant_recalc(&self->homa); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(-1, atomic_read(&rpc3->msgin.rank)); From 7885d57f7b8c117aa3fcac55b2cbadf7bf04b4da Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 7 Feb 2025 11:30:44 -0800 Subject: [PATCH 194/625] Don't hold RPC lock while sending grants * Extract homa_grant_update_offset from homa_grant_send. 
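
The resulting flow in homa_grant_check_rpc is, in outline (see the
diff below for the exact code):

    if (homa_grant_update_offset(rpc, homa)) {
            /* New offset chosen while the RPC lock is held... */
            homa_rpc_hold(rpc);
            homa_rpc_unlock(rpc);
            /* ...but the GRANT packet is transmitted unlocked. */
            homa_grant_send(rpc);
    }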
---
 homa_grant.c           | 136 ++++++++++++++++++++++++-----------------
 homa_grant.h           |   3 +-
 test/unit_homa_grant.c | 124 +++++++++++++++----------------------
 3 files changed, 131 insertions(+), 132 deletions(-)

diff --git a/homa_grant.c b/homa_grant.c
index 1ad87fc3..a75179e5 100644
--- a/homa_grant.c
+++ b/homa_grant.c
@@ -223,54 +223,62 @@ void homa_grant_remove_rpc(struct homa_rpc *rpc)
 }
 
 /**
- * homa_grant_send() - See if it is appropriate to send a grant to an RPC;
- * if so, create the grant and send it.
- * @rpc: The RPC to check for possible grant. Must be locked by the caller.
+ * homa_grant_update_offset() - Select a new grant offset for a message,
+ * assuming that the message is high enough priority to deserve grants.
+ * @rpc: The RPC to check for possible grant. Need not be locked by
+ *       the caller (if it isn't locked, the worst that will happen
+ *       is the sending of an extraneous grant).
 * @homa: Overall information about the Homa transport.
- * Return: Nonzero if a grant was sent, 0 if not.
+ * Return: Nonzero means that @rpc->msgin.granted was increased (presumably
+ * the caller will now send a GRANT packet). Zero means that @rpc->msgin.granted
+ * can't be increased at this time.
 */
-int homa_grant_send(struct homa_rpc *rpc, struct homa *homa)
+int homa_grant_update_offset(struct homa_rpc *rpc, struct homa *homa)
 {
-	int incoming, increment, available;
-	struct homa_grant_hdr grant;
-
-	/* Compute how many additional bytes to grant. */
-	incoming = rpc->msgin.granted - (rpc->msgin.length -
-			rpc->msgin.bytes_remaining);
-	if (incoming < 0) {
-		rpc->msgin.granted = rpc->msgin.length -
-				rpc->msgin.bytes_remaining;
-		incoming = 0;
-	}
-	increment = homa->grant_window - incoming;
-	if (increment > (rpc->msgin.length - rpc->msgin.granted))
-		increment = rpc->msgin.length - rpc->msgin.granted;
-	available = homa->max_incoming - atomic_read(&homa->total_incoming)
-			+ rpc->msgin.rec_incoming - incoming;
-	if (increment > available)
-		increment = available;
-	if (increment <= 0)
-		return 0;
+	int received, new_grant_offset, incoming_delta, avl_incoming;
 
-	/* Don't increment the grant if the node has been slow to send
+	/* Don't increase the grant if the node has been slow to send
	 * data already granted: no point in wasting grants on this
	 * node.
	 */
	if (rpc->silent_ticks > 1)
		return 0;
-	rpc->msgin.granted += increment;
+	received = rpc->msgin.length - rpc->msgin.bytes_remaining;
+	new_grant_offset = received + homa->grant_window;
+	if (new_grant_offset > rpc->msgin.length)
+		new_grant_offset = rpc->msgin.length;
+	incoming_delta = (new_grant_offset - received) - rpc->msgin.rec_incoming;
+	avl_incoming = homa->max_incoming - atomic_read(&homa->total_incoming);
+	if (avl_incoming < incoming_delta) {
+		new_grant_offset -= incoming_delta - avl_incoming;
+	}
+	if (new_grant_offset <= rpc->msgin.granted)
+		return 0;
+	tt_record4("sending grant for id %llu, offset %d, priority %d, increment %d",
+		   rpc->id, new_grant_offset, rpc->msgin.priority,
+		   new_grant_offset - rpc->msgin.granted);
+	rpc->msgin.granted = new_grant_offset;
+	return 1;
+}
+
+/**
+ * homa_grant_send() - Issue a GRANT packet for the current grant offset
+ * in an RPC.
+ * @rpc: The RPC on whose behalf to send the packet. Need not be locked
+ *       (and better for it not to be locked, since sending the packet
+ *       takes a while).
+ */
+void homa_grant_send(struct homa_rpc *rpc)
+{
+	struct homa_grant_hdr grant;
 
-	/* Send the grant.
*/ grant.offset = htonl(rpc->msgin.granted); grant.priority = rpc->msgin.priority; grant.resend_all = rpc->msgin.resend_all; - rpc->msgin.resend_all = 0; - tt_record4("sending grant for id %llu, offset %d, priority %d, increment %d", - rpc->id, rpc->msgin.granted, rpc->msgin.priority, - increment); + if (grant.resend_all) + rpc->msgin.resend_all = 0; homa_xmit_control(GRANT, &grant, sizeof(grant), rpc); - return 1; } /** @@ -299,12 +307,13 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) */ struct homa *homa = rpc->hsk->homa; int rank, recalc; + int locked = 1; if (rpc->msgin.length < 0 || rpc->state == RPC_DEAD || rpc->msgin.num_bpages <= 0) goto done; - homa_grant_update_incoming(rpc, homa); + recalc = homa_grant_update_incoming(rpc, homa); if (rpc->msgin.granted >= rpc->msgin.length) goto done; @@ -317,7 +326,7 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) */ if (list_empty(&rpc->grantable_links)) { homa_grant_add_rpc(rpc); - recalc = (homa->num_active_rpcs < homa->max_overcommit || + recalc += (homa->num_active_rpcs < homa->max_overcommit || rpc->msgin.bytes_remaining < atomic_read(&homa->active_remaining [homa->max_overcommit - 1])); @@ -345,20 +354,28 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) /* Getting here should be the normal case: see if we can send a new * grant for this message. */ - homa_grant_send(rpc, homa); - recalc = homa_grant_update_incoming(rpc, homa); - - /* Is the message now fully granted? */ - if (rpc->msgin.granted >= rpc->msgin.length) { - homa_grant_remove_rpc(rpc); - recalc = 1; + if (homa_grant_update_offset(rpc, homa)) { + recalc += homa_grant_update_incoming(rpc, homa); + if (rpc->msgin.granted >= rpc->msgin.length) { + homa_grant_remove_rpc(rpc); + recalc = 1; + } + homa_rpc_hold(rpc); + homa_rpc_unlock(rpc); + locked = 0; + homa_grant_send(rpc); } done: if (recalc) { - homa_rpc_hold(rpc); - homa_rpc_unlock(rpc); + if (locked) { + homa_rpc_hold(rpc); + homa_rpc_unlock(rpc); + locked = 0; + } homa_grant_recalc(homa); + } + if (!locked) { homa_rpc_lock(rpc); homa_rpc_put(rpc); } @@ -376,12 +393,8 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) */ void homa_grant_recalc(struct homa *homa) { - /* The tricky part of this method is that we need to release - * homa->grantable_lock before actually sending grants, because - * (a) we need to hold the RPC lock while sending grants, and - * (b) sending grants takes a while, and holding grantable_lock - * would significantly increase contention for it. - * This array hold a copy of homa->active_rpcs. + /* A copy of homa->active_rpcs; needed so we can send grants + * without holding grantable_lock. */ struct homa_rpc *active_rpcs[HOMA_MAX_GRANTS]; int i, active, try_again; @@ -452,21 +465,34 @@ void homa_grant_recalc(struct homa *homa) homa->grant_window = homa->max_incoming / (homa->num_active_rpcs + 1); - /* See comment above, which explains why this is here. */ + /* Release homa->grantable_lock before actually sending grants, + * because sending grants takes a while and holding + * grantable_lock would significantly increase contention for + * it. We don't hold RPC locks while sending grants either, + * for the same reason. 
+ */ homa_grantable_unlock(homa); - for (i = 0; i < active; i++) { struct homa_rpc *rpc = active_rpcs[i]; + if (!homa_grant_update_offset(rpc, homa)) { + homa_rpc_put(rpc); + continue; + } homa_rpc_lock(rpc); - homa_grant_send(rpc, homa); - try_again += homa_grant_update_incoming(rpc, homa); + homa_grant_update_incoming(rpc, homa); if (rpc->msgin.granted >= rpc->msgin.length) { try_again += 1; homa_grant_remove_rpc(rpc); } - homa_rpc_put(rpc); homa_rpc_unlock(rpc); + homa_grant_send(rpc); + + /* Careful not to release reference until after + * grant has been sent; otherwise RPC could be + * reaped. + */ + homa_rpc_put(rpc); } if (try_again == 0) diff --git a/homa_grant.h b/homa_grant.h index fc43d43a..6b001bf0 100644 --- a/homa_grant.h +++ b/homa_grant.h @@ -18,9 +18,10 @@ int homa_grant_pick_rpcs(struct homa *homa, struct homa_rpc **rpcs, void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc); void homa_grant_recalc(struct homa *homa); void homa_grant_remove_rpc(struct homa_rpc *rpc); -int homa_grant_send(struct homa_rpc *rpc, struct homa *homa); +void homa_grant_send(struct homa_rpc *rpc); int homa_grant_update_incoming(struct homa_rpc *rpc, struct homa *homa); +int homa_grant_update_offset(struct homa_rpc *rpc, struct homa *homa); /** * homa_grantable_lock() - Acquire the grantable lock. If the lock diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index d606f699..37d71a2c 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -417,93 +417,71 @@ TEST_F(homa_grant, homa_grant_remove_rpc__reposition_peer_in_homa_list) EXPECT_EQ(3, self->homa.num_grantable_rpcs); } -TEST_F(homa_grant, homa_grant_send__basics) +TEST_F(homa_grant, homa_grant_update_offset__basics) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); - int granted; - rpc->msgin.priority = 3; - unit_log_clear(); - granted = homa_grant_send(rpc, &self->homa); - EXPECT_EQ(1, granted); + EXPECT_EQ(1, homa_grant_update_offset(rpc, &self->homa)); EXPECT_EQ(10000, rpc->msgin.granted); - EXPECT_STREQ("xmit GRANT 10000@3", unit_log_get()); } -TEST_F(homa_grant, homa_grant_send__incoming_negative) +TEST_F(homa_grant, homa_grant_update_offset__rpc_idle) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); - int granted; - - rpc->msgin.bytes_remaining = 5000; - atomic_set(&self->homa.total_incoming, self->homa.max_incoming); - unit_log_clear(); - granted = homa_grant_send(rpc, &self->homa); - EXPECT_EQ(0, granted); - EXPECT_EQ(15000, rpc->msgin.granted); - EXPECT_STREQ("", unit_log_get()); + rpc->silent_ticks = 2; + EXPECT_EQ(0, homa_grant_update_offset(rpc, &self->homa)); + EXPECT_EQ(0, rpc->msgin.granted); } -TEST_F(homa_grant, homa_grant_send__end_of_message) +TEST_F(homa_grant, homa_grant_update_offset__end_of_message) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); - int granted; + /* First call grants remaining bytes in message. */ rpc->msgin.bytes_remaining = 5000; - unit_log_clear(); - granted = homa_grant_send(rpc, &self->homa); - EXPECT_EQ(1, granted); + EXPECT_EQ(1, homa_grant_update_offset(rpc, &self->homa)); + EXPECT_EQ(20000, rpc->msgin.granted); + + /* Second call cannot grant anymore. 
*/ + EXPECT_EQ(0, homa_grant_update_offset(rpc, &self->homa)); EXPECT_EQ(20000, rpc->msgin.granted); - EXPECT_STREQ("xmit GRANT 20000@0", unit_log_get()); } -TEST_F(homa_grant, homa_grant_send__not_enough_available_bytes) +TEST_F(homa_grant, homa_grant_update_offset__insufficient_room_in_incoming) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); - int granted; - rpc->msgin.granted = 3000; - rpc->msgin.rec_incoming = 4000; - atomic_set(&self->homa.total_incoming, self->homa.max_incoming - 4000); - - unit_log_clear(); - granted = homa_grant_send(rpc, &self->homa); - EXPECT_EQ(1, granted); - EXPECT_EQ(8000, rpc->msgin.granted); - EXPECT_STREQ("xmit GRANT 8000@0", unit_log_get()); + rpc->msgin.bytes_remaining = 5000; + atomic_set(&self->homa.total_incoming, 48000); + EXPECT_EQ(1, homa_grant_update_offset(rpc, &self->homa)); + EXPECT_EQ(17000, rpc->msgin.granted); } -TEST_F(homa_grant, homa_grant_send__nothing_available) +TEST_F(homa_grant, homa_grant_update_offset__incoming_overcommitted) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); - int granted; - atomic_set(&self->homa.total_incoming, self->homa.max_incoming); - unit_log_clear(); - granted = homa_grant_send(rpc, &self->homa); - EXPECT_EQ(0, granted); + atomic_set(&self->homa.total_incoming, 51000); + EXPECT_EQ(0, homa_grant_update_offset(rpc, &self->homa)); EXPECT_EQ(0, rpc->msgin.granted); - EXPECT_STREQ("", unit_log_get()); } -TEST_F(homa_grant, homa_grant_send__skip_because_of_silent_ticks) + +TEST_F(homa_grant, homa_grant_send__basics) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); - int granted; - rpc->silent_ticks = 2; + rpc->msgin.granted = 12300; unit_log_clear(); - granted = homa_grant_send(rpc, &self->homa); - EXPECT_EQ(0, granted); + homa_grant_send(rpc); + EXPECT_STREQ("xmit GRANT 12300@0", unit_log_get()); } TEST_F(homa_grant, homa_grant_send__resend_all) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); - int granted; + rpc->msgin.granted = 5000; rpc->msgin.resend_all = 1; unit_log_clear(); - granted = homa_grant_send(rpc, &self->homa); - EXPECT_EQ(1, granted); - EXPECT_EQ(10000, rpc->msgin.granted); + homa_grant_send(rpc); + EXPECT_STREQ("xmit GRANT 5000@0 resend_all", unit_log_get()); EXPECT_EQ(0, rpc->msgin.resend_all); - EXPECT_STREQ("xmit GRANT 10000@0 resend_all", unit_log_get()); } TEST_F(homa_grant, homa_grant_check_rpc__msgin_not_initialized) @@ -703,30 +681,6 @@ TEST_F(homa_grant, homa_grant_check_rpc__remove_from_grantable) EXPECT_EQ(0, self->homa.num_active_rpcs); EXPECT_EQ(-1, atomic_read(&rpc->msgin.rank)); } -TEST_F(homa_grant, homa_grant_check_rpc__recalc_because_of_headroom) -{ - struct homa_rpc *rpc1, *rpc2; - - rpc1 = test_rpc(self, 100, self->server_ip, 20000); - rpc2 = test_rpc(self, 102, self->server_ip, 30000); - self->homa.max_incoming = 15000; - homa_grant_recalc(&self->homa); - EXPECT_EQ(15000, atomic_read(&self->homa.total_incoming)); - EXPECT_EQ(10000, rpc1->msgin.granted); - EXPECT_EQ(5000, rpc2->msgin.granted); - - rpc1->msgin.bytes_remaining = 4000; - rpc1->msgin.granted = 12000; - rpc1->msgin.rec_incoming = 10000; - unit_log_clear(); - homa_grant_check_rpc(rpc1); - EXPECT_EQ(20000, rpc1->msgin.granted); - EXPECT_EQ(4000, rpc1->msgin.rec_incoming); - EXPECT_EQ(10000, rpc2->msgin.granted); - EXPECT_EQ(10000, rpc2->msgin.rec_incoming); - EXPECT_STREQ("xmit GRANT 20000@1; xmit GRANT 10000@0", unit_log_get()); - EXPECT_EQ(14000, atomic_read(&self->homa.total_incoming)); -} TEST_F(homa_grant, 
homa_grant_recalc__basics) { @@ -856,7 +810,25 @@ TEST_F(homa_grant, homa_grant_recalc__compute_window_size) EXPECT_EQ(25000, rpc2->msgin.granted); EXPECT_EQ(25000, rpc3->msgin.granted); } -TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted) +TEST_F(homa_grant, homa_grant_recalc__rpc_cant_be_granted) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; + + rpc1 = test_rpc(self, 100, self->server_ip, 10000); + rpc2 = test_rpc(self, 102, self->server_ip, 10000); + rpc3 = test_rpc(self, 104, self->server_ip, 10000); + rpc4 = test_rpc(self, 106, self->server_ip, 10000); + self->homa.window_param = 5000; + self->homa.max_overcommit = 3; + rpc2->silent_ticks = 3; + + homa_grant_recalc(&self->homa); + EXPECT_EQ(5000, rpc1->msgin.granted); + EXPECT_EQ(0, rpc2->msgin.granted); + EXPECT_EQ(5000, rpc3->msgin.granted); + EXPECT_EQ(0, rpc4->msgin.granted); +} +TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted_so_recalc) { struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; From 303f7a779c9b47651f2c4d5a38357a9516e832c0 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 7 Feb 2025 11:42:12 -0800 Subject: [PATCH 195/625] Add grant_check_calls metric --- homa_grant.c | 1 + homa_metrics.c | 6 ++++-- homa_metrics.h | 6 ++++++ test/unit_homa_grant.c | 2 ++ 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index a75179e5..12b995ae 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -320,6 +320,7 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) tt_record4("homa_grant_check_rpc starting for id %d, granted %d, recv_end %d, length %d", rpc->id, rpc->msgin.granted, rpc->msgin.recv_end, rpc->msgin.length); + INC_METRIC(grant_check_calls, 1); /* This message requires grants; if it is a new message, set up * granting. diff --git a/homa_metrics.c b/homa_metrics.c index 45bf5b47..8f8d8272 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -280,8 +280,10 @@ char *homa_metrics_print(struct homa *homa) m->grantable_lock_miss_ns); M("grantable_rpcs_integral %15llu Integral of homa->num_grantable_rpcs*dt\n", m->grantable_rpcs_integral); - M("grant_recalc_calls %15llu Number of calls to homa_grant_recalc\n", - m->grant_recalc_calls); + M("grant_check_calls %15llu Number of calls to homa_grant_check_rpc\n", + m->grant_check_calls); + M("grant_recalc_calls %15llu Number of calls to homa_grant_recalc\n", + m->grant_recalc_calls); M("grant_recalc_ns %15llu Time spent in homa_grant_recalc\n", m->grant_recalc_ns); M("grant_recalc_skips %15llu Number of times homa_grant_recalc skipped redundant work\n", diff --git a/homa_metrics.h b/homa_metrics.h index 22695aec..dcb87549 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -490,6 +490,12 @@ struct homa_metrics { */ u64 grantable_rpcs_integral; + /** + * @grant_check_calls: cumulative number of times homa_grant_check_rpc + * has been invoked. + */ + u64 grant_check_calls; + /** * @grant_recalc_calls: cumulative number of times homa_grant_recalc * has been invoked. 
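
As an illustrative aside (a userspace sketch, not Homa code: struct
metrics, INC_METRIC, and metrics_print below are stand-ins; the kernel
version keeps one homa_metrics per CPU so that increments need no
synchronization), this is the counter-plus-formatted-print pattern the
two hunks above follow:

    #include <stdio.h>

    /* Stand-in for struct homa_metrics: one counter per event. */
    struct metrics {
            unsigned long long grant_check_calls;
    };

    /* Stand-in for INC_METRIC(): bump a counter at the point of interest. */
    #define INC_METRIC(m, field) ((m)->field += 1)

    /* Stand-in for homa_metrics_print(): name, value, explanation. */
    static void metrics_print(const struct metrics *m)
    {
            printf("grant_check_calls %15llu Number of calls to homa_grant_check_rpc\n",
                   m->grant_check_calls);
    }

    int main(void)
    {
            struct metrics m = {0};

            INC_METRIC(&m, grant_check_calls); /* As in homa_grant_check_rpc. */
            metrics_print(&m);
            return 0;
    }
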
diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c
index 37d71a2c..bd46d75c 100644
--- a/test/unit_homa_grant.c
+++ b/test/unit_homa_grant.c
@@ -496,6 +496,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__msgin_not_initialized)
 	homa_grant_check_rpc(rpc);
 	EXPECT_EQ(0, rpc->msgin.rec_incoming);
 	EXPECT_EQ(0, atomic_read(&self->homa.total_incoming));
+	EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_calls);
 }
 TEST_F(homa_grant, homa_grant_check_rpc__rpc_dead)
 {
@@ -550,6 +551,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__add_new_message_to_grantables)
 	EXPECT_EQ(10000, rpc->msgin.rec_incoming);
 	EXPECT_EQ(0, atomic_read(&rpc->msgin.rank));
 	EXPECT_EQ(10000, atomic_read(&self->homa.total_incoming));
+	EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_calls);
 }
 TEST_F(homa_grant, homa_grant_check_rpc__new_message_bumps_existing)
 {

From e467f6b5b6d77bff0688d27fc4c06c84babf7352 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Tue, 11 Feb 2025 09:01:30 -0800
Subject: [PATCH 196/625] Change output formatting in kselftest_harness.h

When printing unexpected integer values, print in signed form rather
than unsigned.
---
 test/kselftest_harness.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/kselftest_harness.h b/test/kselftest_harness.h
index c2e18a43..d0ae3b9d 100644
--- a/test/kselftest_harness.h
+++ b/test/kselftest_harness.h
@@ -630,7 +630,7 @@ extern int strcmp(const char *s1, const char *s2);
 		if (!(__exp _t __seen)) { \
 			unsigned long long __exp_print = (long long)__exp; \
 			unsigned long long __seen_print = (long long)__seen; \
-			__TH_LOG("    Expected %s (%llu) %s %s (%llu)", \
+			__TH_LOG("    Expected %s (%lld) %s %s (%lld)", \
 				 #_expected, __exp_print, #_t, \
 				 #_seen, __seen_print); \
 			__current_test->passed = 0; \

From a24093f2098bd4689e63c292b2614d282a01a166 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Tue, 11 Feb 2025 11:18:23 -0800
Subject: [PATCH 197/625] Major refactoring of homa_grant.c

* homa_grant_check_rpc no longer expects the rpc to be locked; instead,
  caller must have a reference on it.
* Introduce homa->needy_ranks to keep track of RPCs that are
  under-granted.
* Add homa_grant_check_needy function, so homa_grant_recalc doesn't need
  to be called if homa->active_rpcs doesn't need to change.
* Factored out homa_grant_try_send function.
* Eliminate homa_grant_send (it's now part of homa_grant_try_send).
* Introduce new metrics grant_check_calls, grant_check_needy_calls.
* Various cleanups (e.g., remove unused homa->grantable_rpcs).
---
 homa_grant.c           | 220 +++++++++++--------
 homa_grant.h           |  20 +-
 homa_impl.h            |  73 +++----
 homa_incoming.c        |  16 +-
 homa_metrics.c         |   6 +-
 homa_metrics.h         |   6 +
 homa_pool.c            |   8 +-
 homa_rpc.c             |  10 +-
 homa_rpc.h             |   5 +-
 homa_timer.c           |   7 +-
 homa_utils.c           |   1 -
 test/unit_homa_grant.c | 466 +++++++++++++++++++++++++++++++++--------
 test/unit_homa_pool.c  |   3 +-
 13 files changed, 600 insertions(+), 241 deletions(-)

diff --git a/homa_grant.c b/homa_grant.c
index 12b995ae..85ead5b0 100644
--- a/homa_grant.c
+++ b/homa_grant.c
@@ -32,32 +32,24 @@ int homa_grant_outranks(struct homa_rpc *rpc1, struct homa_rpc *rpc2)
 /**
  * homa_grant_update_incoming() - Figure out how much incoming data there is
  * for an RPC (i.e., data that has been granted but not yet received) and make
- * sure this is properly reflected in homa->total_incoming.
- * @rpc: RPC to check; must be locked by the caller and its msgin must be
- *      properly initialized.
+ * sure this is properly reflected in rpc->msgin.rec_incoming
+ * and homa->total_incoming.
+ * @rpc: RPC to check; need not be locked.
  * @homa: Overall information about the Homa transport.
- * Return: A nonzero return value means that this update caused
- *         homa->total_incoming to drop below homa->max_incoming, when it
- *         had previously been at or above that level. This means that it
- *         may be possible to send out additional grants to some RPCs (doing
- *         this is left to the caller).
 */
-int homa_grant_update_incoming(struct homa_rpc *rpc, struct homa *homa)
+void homa_grant_update_incoming(struct homa_rpc *rpc, struct homa *homa)
 {
-	int incoming = rpc->msgin.granted - (rpc->msgin.length -
-			rpc->msgin.bytes_remaining);
+	int incoming, delta;
 
+	incoming = rpc->msgin.granted - (rpc->msgin.length -
+					 rpc->msgin.bytes_remaining);
 	if (incoming < 0)
 		incoming = 0;
-	if (incoming != rpc->msgin.rec_incoming) {
-		int delta = incoming - rpc->msgin.rec_incoming;
-		int old = atomic_fetch_add(delta, &homa->total_incoming);
-
-		rpc->msgin.rec_incoming = incoming;
-		return (old >= homa->max_incoming &&
-			(old + delta) < homa->max_incoming);
+	delta = incoming - atomic_read(&rpc->msgin.rec_incoming);
+	if (delta != 0) {
+		atomic_add(delta, &rpc->msgin.rec_incoming);
+		atomic_add(delta, &homa->total_incoming);
 	}
-	return 0;
 }
 
 /**
@@ -231,7 +223,8 @@ void homa_grant_remove_rpc(struct homa_rpc *rpc)
 * @homa: Overall information about the Homa transport.
 * Return: Nonzero means that @rpc->msgin.granted was increased (presumably
 * the caller will now send a GRANT packet). Zero means that @rpc->msgin.granted
- * can't be increased at this time.
+ * can't be increased at this time. This function will set a bit in
+ * homa->needy_ranks if available incoming was exhausted.
 */
 int homa_grant_update_offset(struct homa_rpc *rpc, struct homa *homa)
 {
@@ -248,9 +241,15 @@ int homa_grant_update_offset(struct homa_rpc *rpc, struct homa *homa)
 	new_grant_offset = received + homa->grant_window;
 	if (new_grant_offset > rpc->msgin.length)
 		new_grant_offset = rpc->msgin.length;
-	incoming_delta = (new_grant_offset - received) - rpc->msgin.rec_incoming;
+	incoming_delta = (new_grant_offset - received) -
+			 atomic_read(&rpc->msgin.rec_incoming);
 	avl_incoming = homa->max_incoming - atomic_read(&homa->total_incoming);
 	if (avl_incoming < incoming_delta) {
+		atomic_or(homa_grant_needy_bit(atomic_read(&rpc->msgin.rank)),
+			  &homa->needy_ranks);
+		tt_record3("insufficient headroom: needed %d, available %d, used %d",
+			   incoming_delta, avl_incoming,
+			   atomic_read(&homa->total_incoming));
 		new_grant_offset -= incoming_delta - avl_incoming;
 	}
 	if (new_grant_offset <= rpc->msgin.granted)
@@ -263,22 +262,36 @@ int homa_grant_update_offset(struct homa_rpc *rpc, struct homa *homa)
 }
 
 /**
- * homa_grant_send() - Issue a GRANT packet for the current grant offset
- * in an RPC.
- * @rpc: The RPC on whose behalf to send the packet. Need not be locked
- *       (and better for it not to be locked, since sending the packet
- *       takes a while).
+ * homa_grant_try_send() - If an RPC needs granting and there is headroom
+ * under @homa->max_incoming, send a grant.
+ * @rpc: RPC to check. Should not be locked, but caller must own a
+ *       reference.
+ * @homa: Overall info about the Homa transport.
+ * Return: 1 means that homa_grant_recalc now needs to be called (@rpc
+ *         became completely granted and was removed from the grantable list).
*/ -void homa_grant_send(struct homa_rpc *rpc) +int homa_grant_try_send(struct homa_rpc *rpc, struct homa *homa) { struct homa_grant_hdr grant; + atomic_andnot(homa_grant_needy_bit(atomic_read(&rpc->msgin.rank)), + &homa->needy_ranks); + if (!homa_grant_update_offset(rpc, homa)) + return 0; + homa_grant_update_incoming(rpc, homa); + grant.offset = htonl(rpc->msgin.granted); grant.priority = rpc->msgin.priority; grant.resend_all = rpc->msgin.resend_all; if (grant.resend_all) rpc->msgin.resend_all = 0; homa_xmit_control(GRANT, &grant, sizeof(grant), rpc); + + if (rpc->msgin.granted >= rpc->msgin.length) { + homa_grant_remove_rpc(rpc); + return 1; + } + return 0; } /** @@ -287,13 +300,12 @@ void homa_grant_send(struct homa_rpc *rpc) * RPC relative to outgoing grants and takes any appropriate actions that * are needed (such as adding the RPC to the grantable list or sending * grants for this or other RPCs). - * @rpc: RPC to check. Must be locked by the caller. This function may - * release and then reacquire that lock, so caller must not hold - * any locks that would disallow that. + * @rpc: RPC to check. Must not be locked by the caller, but caller + * must own a reference. */ void homa_grant_check_rpc(struct homa_rpc *rpc) { - /* Overall design notes: + /* Overall design note: * The grantable lock has proven to be a performance bottleneck, * particularly as network speeds increase. homa_grant_recalc must * acquire that lock in order to recompute the set of messages @@ -306,31 +318,33 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) * be called, which create a lot of special cases in this function. */ struct homa *homa = rpc->hsk->homa; - int rank, recalc; - int locked = 1; + int rank; if (rpc->msgin.length < 0 || rpc->state == RPC_DEAD || rpc->msgin.num_bpages <= 0) - goto done; - - recalc = homa_grant_update_incoming(rpc, homa); - if (rpc->msgin.granted >= rpc->msgin.length) - goto done; + return; tt_record4("homa_grant_check_rpc starting for id %d, granted %d, recv_end %d, length %d", rpc->id, rpc->msgin.granted, rpc->msgin.recv_end, rpc->msgin.length); INC_METRIC(grant_check_calls, 1); + homa_grant_update_incoming(rpc, homa); + if (rpc->msgin.granted >= rpc->msgin.length) { + if (homa_grant_check_needy(homa)) + goto recalc; + goto done; + } /* This message requires grants; if it is a new message, set up * granting. */ if (list_empty(&rpc->grantable_links)) { homa_grant_add_rpc(rpc); - recalc += (homa->num_active_rpcs < homa->max_overcommit || + if (homa->num_active_rpcs < homa->max_overcommit || rpc->msgin.bytes_remaining < atomic_read(&homa->active_remaining - [homa->max_overcommit - 1])); + [homa->max_overcommit - 1])) + goto recalc; goto done; } @@ -338,9 +352,10 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) rank = atomic_read(&rpc->msgin.rank); if (rank < 0) { if (rpc->msgin.bytes_remaining < - atomic_read(&homa->active_remaining[homa->max_overcommit - 1])) { + atomic_read(&homa->active_remaining[homa->max_overcommit - + 1])) { INC_METRIC(grant_priority_bumps, 1); - recalc = 1; + goto recalc; } goto done; } @@ -348,38 +363,29 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) if (rank > 0 && rpc->msgin.bytes_remaining < atomic_read(&homa->active_remaining[rank - 1])) { INC_METRIC(grant_priority_bumps, 1); - recalc = 1; - goto done; + goto recalc; } - /* Getting here should be the normal case: see if we can send a new - * grant for this message. 
-	 */
-	if (homa_grant_update_offset(rpc, homa)) {
-		recalc += homa_grant_update_incoming(rpc, homa);
-		if (rpc->msgin.granted >= rpc->msgin.length) {
-			homa_grant_remove_rpc(rpc);
-			recalc = 1;
-		}
-		homa_rpc_hold(rpc);
-		homa_rpc_unlock(rpc);
-		locked = 0;
-		homa_grant_send(rpc);
+	if (atomic_read(&homa->needy_ranks) != 0) {
+		/* There are other RPCs that also need grants; process them
+		 * in priority order (and make sure this RPC gets considered
+		 * as well).
+		 */
+		atomic_or(homa_grant_needy_bit(rank), &homa->needy_ranks);
+		if (!homa_grant_check_needy(homa))
+			goto done;
+	} else {
+		/* Ideally this should be the common case: no need to consider
+		 * any other RPCs.
+		 */
+		if (!homa_grant_try_send(rpc, homa))
+			goto done;
 	}
 
+recalc:
+	homa_grant_recalc(homa);
+
 done:
-	if (recalc) {
-		if (locked) {
-			homa_rpc_hold(rpc);
-			homa_rpc_unlock(rpc);
-			locked = 0;
-		}
-		homa_grant_recalc(homa);
-	}
-	if (!locked) {
-		homa_rpc_lock(rpc);
-		homa_rpc_put(rpc);
-	}
 	tt_record1("homa_grant_check_rpc finished with id %d", rpc->id);
 }
 
@@ -401,6 +407,7 @@ void homa_grant_recalc(struct homa *homa)
 	int i, active, try_again;
 	u64 start;
 
+	UNIT_LOG("; ", "homa_grant_recalc");
 	tt_record("homa_grant_recalc starting");
 	INC_METRIC(grant_recalc_calls, 1);
 	start = sched_clock();
@@ -417,6 +424,7 @@ void homa_grant_recalc(struct homa *homa)
 
 	try_again = 0;
 	atomic_inc(&homa->grant_recalc_count);
+	atomic_set(&homa->needy_ranks, 0);
 
 	/* Clear the existing grant calculation. */
 	for (i = 0; i < homa->num_active_rpcs; i++)
@@ -470,29 +478,14 @@ void homa_grant_recalc(struct homa *homa)
	 * because sending grants takes a while and holding
	 * grantable_lock would significantly increase contention for
	 * it. We don't hold RPC locks while sending grants either,
-	 * for the same reason.
+	 * for the same reason (but we do hold a reference, to keep
+	 * the RPC from being reaped).
	 */
	homa_grantable_unlock(homa);
	for (i = 0; i < active; i++) {
		struct homa_rpc *rpc = active_rpcs[i];
 
-		if (!homa_grant_update_offset(rpc, homa)) {
-			homa_rpc_put(rpc);
-			continue;
-		}
-		homa_rpc_lock(rpc);
-		homa_grant_update_incoming(rpc, homa);
-		if (rpc->msgin.granted >= rpc->msgin.length) {
-			try_again += 1;
-			homa_grant_remove_rpc(rpc);
-		}
-		homa_rpc_unlock(rpc);
-		homa_grant_send(rpc);
-
-		/* Careful not to release reference until after
-		 * grant has been sent; otherwise RPC could be
-		 * reaped.
-		 */
+		try_again += homa_grant_try_send(rpc, homa);
		homa_rpc_put(rpc);
	}
 
	if (try_again == 0)
@@ -567,6 +560,53 @@ int homa_grant_pick_rpcs(struct homa *homa, struct homa_rpc **rpcs,
 	return num_rpcs;
 }
 
+/**
+ * homa_grant_check_needy() - See if any of the RPCs in @homa->needy_ranks
+ * can now be granted; if so, issue grants to them.
+ * @homa: Overall information about the Homa transport.
+ * Return: Nonzero means that homa_grant_recalc needs to be called (the
+ *         list of grantable RPCs changed).
+ */
+int homa_grant_check_needy(struct homa *homa)
+{
+	struct homa_rpc *rpc;
+	int result = 0;
+	int rank;
+
+	INC_METRIC(grant_check_needy_calls, 1);
+	while (atomic_read(&homa->total_incoming) < homa->max_incoming) {
+		rank = ffs(atomic_read(&homa->needy_ranks));
+		if (rank == 0)
+			break;
+		rank--;
+		atomic_andnot(homa_grant_needy_bit(rank),
+			      &homa->needy_ranks);
+
+		homa_grantable_lock(homa, 0);
+		if (rank >= homa->num_active_rpcs) {
+			/* active_rpcs changed before lock was acquired;
+			 * no need for us to do anything more (someone else
+			 * has already invoked homa_grant_recalc).
+			 */
+			homa_grantable_unlock(homa);
+			return 0;
+		}
+
+		/* Must take reference on rpc to keep it alive, which can only
+		 * be done safely while holding grantable lock. But, must
+		 * release grantable lock before actually sending grant, in
+		 * order to reduce contention.
+		 */
+		rpc = homa->active_rpcs[rank];
+		homa_rpc_hold(rpc);
+		homa_grantable_unlock(homa);
+
+		result |= homa_grant_try_send(rpc, homa);
+		homa_rpc_put(rpc);
+	}
+	return result;
+}
+
 /**
  * homa_grant_find_oldest() - Recompute the value of homa->oldest_rpc.
  * @homa: Overall data about the Homa protocol implementation. The
@@ -621,6 +661,7 @@ void homa_grant_free_rpc(struct homa_rpc *rpc)
 	__releases(rpc->bucket_lock)
 {
 	struct homa *homa = rpc->hsk->homa;
+	int incoming;
 
 	if (!list_empty(&rpc->grantable_links)) {
 		homa_grant_remove_rpc(rpc);
@@ -633,8 +674,9 @@ void homa_grant_free_rpc(struct homa_rpc *rpc)
 		}
 	}
 
-	if (rpc->msgin.rec_incoming != 0)
-		atomic_sub(rpc->msgin.rec_incoming, &homa->total_incoming);
+	incoming = atomic_read(&rpc->msgin.rec_incoming);
+	if (incoming != 0)
+		atomic_sub(incoming, &homa->total_incoming);
 }
 
 /**
diff --git a/homa_grant.h b/homa_grant.h
index fc43d43a..6b001bf0 100644
--- a/homa_grant.h
+++ b/homa_grant.h
@@ -5,8 +5,11 @@
 #ifndef _HOMA_GRANT_H
 #define _HOMA_GRANT_H
 
+#include "homa_rpc.h"
+
 int homa_grantable_lock_slow(struct homa *homa, int recalc);
 void homa_grant_add_rpc(struct homa_rpc *rpc);
+int homa_grant_check_needy(struct homa *homa);
 void homa_grant_check_rpc(struct homa_rpc *rpc);
 void homa_grant_find_oldest(struct homa *homa);
 void homa_grant_free_rpc(struct homa_rpc *rpc);
@@ -18,9 +21,9 @@ int homa_grant_pick_rpcs(struct homa *homa, struct homa_rpc **rpcs,
 void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc);
 void homa_grant_recalc(struct homa *homa);
 void homa_grant_remove_rpc(struct homa_rpc *rpc);
-void homa_grant_send(struct homa_rpc *rpc);
-int homa_grant_update_incoming(struct homa_rpc *rpc,
+void homa_grant_update_incoming(struct homa_rpc *rpc,
 			       struct homa *homa);
+int homa_grant_try_send(struct homa_rpc *rpc, struct homa *homa);
 int homa_grant_update_offset(struct homa_rpc *rpc, struct homa *homa);
 
 /**
@@ -59,4 +62,17 @@ static inline void homa_grantable_unlock(struct homa *homa)
 	spin_unlock_bh(&homa->grantable_lock);
 }
 
+/**
+ * homa_grant_needy_bit() - Return a bit mask with the bit set in the
+ * position in @homa->needy_ranks for @rank.
+ * @rank: Rank of an RPC (corresponds to position in @homa->active_rpcs).
+ * Return: A value with a 1-bit in the position corresponding to @rank,
+ *         or 0 if rank is -1 or >= HOMA_MAX_PRIORITIES.
+ */
+static inline int homa_grant_needy_bit(int rank)
+{
+	/* Mask off any bit positions at or above HOMA_MAX_PRIORITIES. */
+	return (1 << rank) & ((1 << HOMA_MAX_PRIORITIES) - 1);
+}
+
 #endif /* _HOMA_GRANT_H */
diff --git a/homa_impl.h b/homa_impl.h
index 6a05ef17..640a844d 100644
--- a/homa_impl.h
+++ b/homa_impl.h
@@ -243,7 +243,12 @@ struct homa {
 #ifndef __STRIP__ /* See strip.py */
	/**
	 * @grantable_lock: Used to synchronize access to grant-related
-	 * fields below, from @grantable_peers to @last_grantable_change.
+	 * fields below. In order to reduce contention, this lock is held
+	 * only when making structural changes (e.g. modifying grantable_peers
+	 * or active_rpcs). It is not held when computing new grant offsets
+	 * and/or sending grant packets. Under some race conditions, it is
+	 * possible for RPCs to receive grants out of priority order, or to
+	 * receive duplicate grants.
	 */
	spinlock_t grantable_lock ____cacheline_aligned_in_smp;
 
@@ -269,14 +274,7 @@ struct homa {
	 */
	struct list_head grantable_peers;
 
-	/**
-	 * @grantable_rpcs: Contains all RPCs that have not been fully
-	 * granted. The list is sorted in priority order (fewer ungranted
-	 * bytes -> higher priority).
-	 */
-	struct list_head grantable_rpcs;
-
-	/** @num_grantable_rpcs: The number of RPCs in grantable_rpcs. */
+	/** @num_grantable_rpcs: The number of RPCs in grantable_peers. */
	int num_grantable_rpcs;
 
	/** @last_grantable_change: The sched_clock() time of the most recent
@@ -292,20 +290,6 @@ struct homa {
	 */
	int max_grantable_rpcs;
 
-	/**
-	 * @oldest_rpc: The RPC with incoming data whose start_ns is
-	 * farthest in the past). NULL means either there are no incoming
-	 * RPCs or the oldest needs to be recomputed. Must hold grantable_lock
-	 * to update.
-	 */
-	struct homa_rpc *oldest_rpc;
-
-	/**
-	 * @grant_window: How many bytes of granted but not yet received data
-	 * may exist for an RPC at any given time.
-	 */
-	int grant_window;
-
	/**
	 * @num_active_rpcs: number of entries in @active_rpcs and
	 * @active_remaining that are currently used.
@@ -327,6 +311,20 @@ struct homa {
	 */
	atomic_t active_remaining[HOMA_MAX_GRANTS];
 
+	/**
+	 * @oldest_rpc: The RPC with incoming data whose start_ns is
+	 * farthest in the past. NULL means either there are no incoming
+	 * RPCs or the oldest needs to be recomputed. Must hold grantable_lock
+	 * to update.
+	 */
+	struct homa_rpc *oldest_rpc;
+
+	/**
+	 * @grant_window: How many bytes of granted but not yet received data
+	 * may exist for an RPC at any given time.
+	 */
+	int grant_window;
+
	/**
	 * @grant_nonfifo: How many bytes should be granted using the
	 * normal priority system between grants to the oldest message.
@@ -339,6 +337,23 @@ struct homa {
	 * to the old message.
	 */
	int grant_nonfifo_left;
+
+	/**
+	 * @total_incoming: the total number of bytes that we expect to receive
+	 * (across all messages) even if we don't send out any more grants
+	 * (includes granted but unreceived bytes, plus unreceived unscheduled
+	 * bytes that we know about). This can potentially be negative, if
+	 * a peer sends more bytes than granted (see synchronization note in
+	 * homa_send_grants for why we have to allow this possibility).
+	 */
+	atomic_t total_incoming ____cacheline_aligned_in_smp;
+
+	/**
+	 * @needy_ranks: A bitmask selecting all of the indices in @active_rpcs
+	 * whose RPCs could not be fully granted because @total_incoming
+	 * hit the @max_incoming limit.
+	 */
+	atomic_t needy_ranks;
 #endif /* See strip.py */
 
	/**
@@ -398,18 +413,6 @@ struct homa {
	 */
	int throttle_min_bytes;
 
-#ifndef __STRIP__ /* See strip.py */
-	/**
-	 * @total_incoming: the total number of bytes that we expect to receive
-	 * (across all messages) even if we don't send out any more grants
-	 * (includes granted but unreceived bytes, plus unreceived unscheduled
-	 * bytes that we know about). This can potentially be negative, if
-	 * a peer sends more bytes than granted (see synchronization note in
-	 * homa_send_grants for why we have to allow this possibility).
-	 */
-	atomic_t total_incoming ____cacheline_aligned_in_smp;
-#endif /* See strip.py */
-
	/**
	 * @prev_default_port: The most recent port number assigned from
	 * the range of default ports.
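
As an illustrative aside on the new @needy_ranks field (a userspace
sketch, not Homa code: MAX_RANKS and needy_bit are stand-ins for
HOMA_MAX_PRIORITIES and homa_grant_needy_bit, and C11 atomics stand in
for the kernel's atomic_t), the bitmask-plus-ffs() idiom used by the new
homa_grant_check_needy looks like this:

    #include <stdatomic.h>
    #include <stdio.h>
    #include <strings.h>        /* ffs() */

    #define MAX_RANKS 8

    static atomic_int needy_ranks;

    /* One bit per rank; out-of-range ranks (including -1) map to no bits. */
    static int needy_bit(int rank)
    {
            if (rank < 0 || rank >= MAX_RANKS)
                    return 0;
            return 1 << rank;
    }

    int main(void)
    {
            int rank;

            /* Two RPCs (ranks 0 and 3) could not be fully granted. */
            atomic_fetch_or(&needy_ranks, needy_bit(0));
            atomic_fetch_or(&needy_ranks, needy_bit(3));

            /* Service them highest-priority (lowest rank) first: ffs()
             * returns the 1-based position of the lowest set bit.
             */
            while ((rank = ffs(atomic_load(&needy_ranks))) != 0) {
                    rank--;
                    atomic_fetch_and(&needy_ranks, ~needy_bit(rank));
                    printf("send grant to RPC at rank %d\n", rank);
            }
            return 0;
    }

Because the mask is a single atomic word, RPCs can be marked needy from
any core without taking grantable_lock.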
diff --git a/homa_incoming.c b/homa_incoming.c
index f8fbffd0..c11cc2b0 100644
--- a/homa_incoming.c
+++ b/homa_incoming.c
@@ -30,7 +30,8 @@ int homa_message_in_init(struct homa_rpc *rpc, int length, int unsched)
 #else /* See strip.py */
 /**
  * homa_message_in_init() - Constructor for homa_message_in.
- * @rpc: RPC whose msgin structure should be initialized.
+ * @rpc: RPC whose msgin structure should be initialized. The
+ *       msgin struct is assumed to be zeroed.
 * @length: Total number of bytes in message.
 * Return: Zero for successful initialization, or a negative errno
 *         if rpc->msgin could not be initialized.
@@ -45,17 +46,12 @@ int homa_message_in_init(struct homa_rpc *rpc, int length)
 
 	rpc->msgin.length = length;
 	skb_queue_head_init(&rpc->msgin.packets);
-	rpc->msgin.recv_end = 0;
 	INIT_LIST_HEAD(&rpc->msgin.gaps);
 	rpc->msgin.bytes_remaining = length;
 #ifndef __STRIP__ /* See strip.py */
 	rpc->msgin.granted = (unsched > length) ? length : unsched;
-	rpc->msgin.rec_incoming = 0;
 	atomic_set(&rpc->msgin.rank, -1);
-	rpc->msgin.priority = 0;
 #endif /* See strip.py */
-	rpc->msgin.resend_all = 0;
-	rpc->msgin.num_bpages = 0;
 	err = homa_pool_allocate(rpc);
 	if (err != 0)
 		return err;
@@ -586,9 +582,13 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa)
 	}
 	if (rpc) {
 #ifndef __STRIP__ /* See strip.py */
+		homa_rpc_hold(rpc);
+		homa_rpc_unlock(rpc);
 		homa_grant_check_rpc(rpc);
-#endif /* See strip.py */
+		homa_rpc_put(rpc);
+#else /* See strip.py */
 		homa_rpc_unlock(rpc);
+#endif /* See strip.py */
 	}
 
 	while (num_acks > 0) {
@@ -1004,6 +1004,7 @@ void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk,
 }
 
 #ifndef __STRIP__ /* See strip.py */
+#if 0
 /**
  * homa_choose_fifo_grant() - This function is invoked occasionally to give
  * a high-priority grant to the oldest incoming message. We do this in
@@ -1084,6 +1085,7 @@ struct homa_rpc *homa_choose_fifo_grant(struct homa *homa)
 	}
 	return oldest;
 }
+#endif
 #endif /* See strip.py */
 
 /**
diff --git a/homa_metrics.c b/homa_metrics.c
index 8f8d8272..71416878 100644
--- a/homa_metrics.c
+++ b/homa_metrics.c
@@ -286,10 +286,12 @@ char *homa_metrics_print(struct homa *homa)
 	  m->grant_recalc_calls);
 	M("grant_recalc_ns %15llu Time spent in homa_grant_recalc\n",
 	  m->grant_recalc_ns);
-	M("grant_recalc_skips %15llu Number of times homa_grant_recalc skipped redundant work\n",
-	  m->grant_recalc_skips);
 	M("grant_recalc_loops %15llu Number of times homa_grant_recalc looped back\n",
 	  m->grant_recalc_loops);
+	M("grant_recalc_skips %15llu Number of times homa_grant_recalc skipped redundant work\n",
+	  m->grant_recalc_skips);
+	M("grant_check_needy_calls %15llu Number of calls to homa_grant_check_needy\n",
+	  m->grant_check_needy_calls);
 	M("grant_priority_bumps %15llu Number of times an RPC moved up in the grant priority order\n",
 	  m->grant_priority_bumps);
 	M("fifo_grants %15llu Grants issued using FIFO priority\n",
diff --git a/homa_metrics.h b/homa_metrics.h
index dcb87549..eebb39bc 100644
--- a/homa_metrics.h
+++ b/homa_metrics.h
@@ -518,6 +518,12 @@ struct homa_metrics {
	 */
	u64 grant_recalc_skips;
 
+	/**
+	 * @grant_check_needy_calls: cumulative number of times that
+	 * homa_grant_check_needy has been invoked.
+	 */
+	u64 grant_check_needy_calls;
+
	/**
	 * @grant_priority_bumps: cumulative number of times the grant priority
	 * of an RPC has increased above its next-higher-priority neighbor.
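
The homa_incoming.c hunk above and the homa_pool.c hunk below follow
the same discipline: take a reference, drop the lock, make the slow
call, then release the reference. A minimal userspace sketch of that
discipline (obj, obj_hold, obj_put, and slow_work are illustrative
stand-ins for homa_rpc and its hold/put helpers, not Homa code):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Toy object protected by a lock and kept alive by a refcount. */
    struct obj {
            pthread_mutex_t lock;
            atomic_int refs;
    };

    static void obj_hold(struct obj *o)
    {
            atomic_fetch_add(&o->refs, 1);
    }

    static void obj_put(struct obj *o)
    {
            /* Free on the last reference; until then o stays valid. */
            if (atomic_fetch_sub(&o->refs, 1) == 1)
                    free(o);
    }

    /* Slow operation that must not run with the lock held. */
    static void slow_work(struct obj *o)
    {
            printf("working on %p\n", (void *)o);
    }

    static void process(struct obj *o)  /* Called with o->lock held. */
    {
            /* ...fast updates under the lock... */
            obj_hold(o);                /* Keep o alive past the unlock. */
            pthread_mutex_unlock(&o->lock);
            slow_work(o);               /* Lock-free, but o can't be freed. */
            obj_put(o);
    }

    int main(void)
    {
            struct obj *o = malloc(sizeof(*o));

            pthread_mutex_init(&o->lock, NULL);
            atomic_init(&o->refs, 1);
            pthread_mutex_lock(&o->lock);
            process(o);
            obj_put(o);                 /* Drop the initial reference. */
            return 0;
    }

The reference prevents the object from being reaped while the slow call
runs, without extending the lock hold time.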
diff --git a/homa_pool.c b/homa_pool.c index 3472a56d..e8b37853 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -489,9 +489,15 @@ void homa_pool_check_waiting(struct homa_pool *pool) if (rpc->msgin.num_bpages > 0) { /* Allocation succeeded; "wake up" the RPC. */ rpc->msgin.resend_all = 1; + homa_rpc_hold(rpc); + homa_rpc_unlock(rpc); homa_grant_check_rpc(rpc); + homa_rpc_put(rpc); + } else { + homa_rpc_unlock(rpc); } -#endif /* See strip.py */ +#else /* See strip.py */ homa_rpc_unlock(rpc); +#endif /* See strip.py */ } } diff --git a/homa_rpc.c b/homa_rpc.c index d968eafd..90fb89a5 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -713,7 +713,7 @@ int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors) if (!homa_protect_rpcs(hsk)) continue; list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { - int incoming; + int incoming, rec_incoming; if (rpc->state != RPC_INCOMING) continue; @@ -722,13 +722,13 @@ int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors) - rpc->msgin.bytes_remaining); if (incoming < 0) incoming = 0; - if (rpc->msgin.rec_incoming == 0) + rec_incoming = atomic_read(&rpc->msgin.rec_incoming); + if (rec_incoming == 0) continue; - total_incoming += rpc->msgin.rec_incoming; + total_incoming += rec_incoming; if (verbose) tt_record3("homa_validate_incoming: RPC id %d, incoming %d, rec_incoming %d", - rpc->id, incoming, - rpc->msgin.rec_incoming); + rpc->id, incoming, rec_incoming); if (rpc->msgin.granted >= rpc->msgin.length) continue; if (list_empty(&rpc->grantable_links)) { diff --git a/homa_rpc.h b/homa_rpc.h index 37203219..7c536290 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -157,12 +157,13 @@ struct homa_message_in { * @rec_incoming: Number of bytes in homa->total_incoming currently * contributed ("recorded") from this RPC. */ - int rec_incoming; + atomic_t rec_incoming; /** * @rank: The index of this RPC in homa->active_rpcs and * homa->active_remaining, or -1 if this RPC is not in those arrays. - * Set by homa_grant, read-only to the RPC. + * Lower number means higher priority. Must be atomic because it + * is read without synchronization. 
*/ atomic_t rank; diff --git a/homa_timer.c b/homa_timer.c index 0b050fad..6c2b3a8f 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -152,7 +152,7 @@ void homa_check_rpc(struct homa_rpc *rpc) tt_record4("length %d, granted %d, rem %d, rec_incoming %d", rpc->msgin.length, rpc->msgin.granted, rpc->msgin.bytes_remaining, - rpc->msgin.rec_incoming); + atomic_read(&rpc->msgin.rec_incoming)); #endif /* See strip.py */ } else { #ifndef __STRIP__ /* See strip.py */ @@ -166,7 +166,7 @@ void homa_check_rpc(struct homa_rpc *rpc) tt_record4("length %d, granted %d, rem %d, rec_incoming %d", rpc->msgin.length, rpc->msgin.granted, rpc->msgin.bytes_remaining, - rpc->msgin.rec_incoming); + atomic_read(&rpc->msgin.rec_incoming)); #endif /* See strip.py */ } #ifndef __STRIP__ /* See strip.py */ @@ -274,7 +274,8 @@ void homa_timer(struct homa *homa) #ifndef __STRIP__ /* See strip.py */ } else if (rpc->state == RPC_INCOMING) { total_incoming_rpcs += 1; - sum_incoming_rec += rpc->msgin.rec_incoming; + sum_incoming_rec += + atomic_read(&rpc->msgin.rec_incoming); sum_incoming += rpc->msgin.granted - (rpc->msgin.length - rpc->msgin.bytes_remaining); diff --git a/homa_utils.c b/homa_utils.c index 0d61d44f..50655b04 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -42,7 +42,6 @@ int homa_init(struct homa *homa) #ifndef __STRIP__ /* See strip.py */ spin_lock_init(&homa->grantable_lock); INIT_LIST_HEAD(&homa->grantable_peers); - INIT_LIST_HEAD(&homa->grantable_rpcs); homa->last_grantable_change = sched_clock(); #endif /* See strip.py */ spin_lock_init(&homa->pacer_mutex); diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index bd46d75c..0daf7056 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -35,6 +35,14 @@ static void grantable_spinlock_hook(char *id) mock_ns = 1000; } +static void change_active_hook(char *id) +{ + if (strcmp(id, "spin_lock") != 0) + return; + if (hook_homa != NULL) + hook_homa->num_active_rpcs = 0; +} + FIXTURE(homa_grant) { struct in6_addr client_ip[5]; int client_port; @@ -150,39 +158,29 @@ TEST_F(homa_grant, homa_grant_update_incoming) atomic_set(&self->homa.total_incoming, 1000); rpc->msgin.bytes_remaining = 19000; rpc->msgin.granted = 3000; - rpc->msgin.rec_incoming = 500; - EXPECT_EQ(0, homa_grant_update_incoming(rpc, &self->homa)); + atomic_set(&rpc->msgin.rec_incoming, 500); + homa_grant_update_incoming(rpc, &self->homa); EXPECT_EQ(2500, atomic_read(&self->homa.total_incoming)); - EXPECT_EQ(2000, rpc->msgin.rec_incoming); + EXPECT_EQ(2000, atomic_read(&rpc->msgin.rec_incoming)); /* Case 2: incoming negative. */ atomic_set(&self->homa.total_incoming, 1000); rpc->msgin.bytes_remaining = 16000; rpc->msgin.granted = 3000; - rpc->msgin.rec_incoming = 500; - EXPECT_EQ(0, homa_grant_update_incoming(rpc, &self->homa)); + atomic_set(&rpc->msgin.rec_incoming, 500); + homa_grant_update_incoming(rpc, &self->homa); EXPECT_EQ(500, atomic_read(&self->homa.total_incoming)); - EXPECT_EQ(0, rpc->msgin.rec_incoming); - - /* Case 3: total_incoming decreases below max_incoming. */ - atomic_set(&self->homa.total_incoming, 5000); - self->homa.max_incoming = 5000; - rpc->msgin.bytes_remaining = 17000; - rpc->msgin.granted = 4000; - rpc->msgin.rec_incoming = 2000; - EXPECT_EQ(1, homa_grant_update_incoming(rpc, &self->homa)); - EXPECT_EQ(4000, atomic_read(&self->homa.total_incoming)); - EXPECT_EQ(1000, rpc->msgin.rec_incoming); - - /* Case 4: no change to rec_incoming. */ + EXPECT_EQ(0, atomic_read(&rpc->msgin.rec_incoming)); + + /* Case 3: no change to rec_incoming. 
*/ atomic_set(&self->homa.total_incoming, 1000); self->homa.max_incoming = 1000; rpc->msgin.bytes_remaining = 16000; rpc->msgin.granted = 4500; - rpc->msgin.rec_incoming = 500; - EXPECT_EQ(0, homa_grant_update_incoming(rpc, &self->homa)); + atomic_set(&rpc->msgin.rec_incoming, 500); + homa_grant_update_incoming(rpc, &self->homa); EXPECT_EQ(1000, atomic_read(&self->homa.total_incoming)); - EXPECT_EQ(500, rpc->msgin.rec_incoming); + EXPECT_EQ(500, atomic_read(&rpc->msgin.rec_incoming)); } TEST_F(homa_grant, homa_grant_add_rpc__update_metrics) @@ -423,6 +421,7 @@ TEST_F(homa_grant, homa_grant_update_offset__basics) EXPECT_EQ(1, homa_grant_update_offset(rpc, &self->homa)); EXPECT_EQ(10000, rpc->msgin.granted); + EXPECT_EQ(0, atomic_read(&self->homa.needy_ranks)); } TEST_F(homa_grant, homa_grant_update_offset__rpc_idle) { @@ -440,6 +439,7 @@ TEST_F(homa_grant, homa_grant_update_offset__end_of_message) rpc->msgin.bytes_remaining = 5000; EXPECT_EQ(1, homa_grant_update_offset(rpc, &self->homa)); EXPECT_EQ(20000, rpc->msgin.granted); + EXPECT_EQ(0, atomic_read(&self->homa.needy_ranks)); /* Second call cannot grant anymore. */ EXPECT_EQ(0, homa_grant_update_offset(rpc, &self->homa)); @@ -450,39 +450,78 @@ TEST_F(homa_grant, homa_grant_update_offset__insufficient_room_in_incoming) struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); rpc->msgin.bytes_remaining = 5000; + atomic_set(&rpc->msgin.rank, 5); atomic_set(&self->homa.total_incoming, 48000); EXPECT_EQ(1, homa_grant_update_offset(rpc, &self->homa)); EXPECT_EQ(17000, rpc->msgin.granted); + EXPECT_EQ(0x20, atomic_read(&self->homa.needy_ranks)); } TEST_F(homa_grant, homa_grant_update_offset__incoming_overcommitted) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); + atomic_set(&rpc->msgin.rank, 6); atomic_set(&self->homa.total_incoming, 51000); EXPECT_EQ(0, homa_grant_update_offset(rpc, &self->homa)); EXPECT_EQ(0, rpc->msgin.granted); + EXPECT_EQ(0x40, atomic_read(&self->homa.needy_ranks)); } -TEST_F(homa_grant, homa_grant_send__basics) +TEST_F(homa_grant, homa_grant_try_send__basics) +{ + struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); + + atomic_set(&rpc->msgin.rank, 1); + atomic_set(&self->homa.needy_ranks, 7); + unit_log_clear(); + EXPECT_EQ(0, homa_grant_try_send(rpc, &self->homa)); + EXPECT_EQ(10000, rpc->msgin.granted); + EXPECT_EQ(5, atomic_read(&self->homa.needy_ranks)); + EXPECT_EQ(10000, atomic_read(&self->homa.total_incoming)); + EXPECT_STREQ("xmit GRANT 10000@0", unit_log_get()); +} +TEST_F(homa_grant, homa_grant_try_send__cant_grant) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); - rpc->msgin.granted = 12300; + atomic_set(&rpc->msgin.rank, 1); + atomic_set(&self->homa.needy_ranks, 7); + atomic_set(&self->homa.total_incoming, self->homa.max_incoming); unit_log_clear(); - homa_grant_send(rpc); - EXPECT_STREQ("xmit GRANT 12300@0", unit_log_get()); + EXPECT_EQ(0, homa_grant_try_send(rpc, &self->homa)); + EXPECT_EQ(0, rpc->msgin.granted); + EXPECT_EQ(7, atomic_read(&self->homa.needy_ranks)); + EXPECT_EQ(50000, atomic_read(&self->homa.total_incoming)); + EXPECT_STREQ("", unit_log_get()); } -TEST_F(homa_grant, homa_grant_send__resend_all) +TEST_F(homa_grant, homa_grant_try_send__resend_all) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); - rpc->msgin.granted = 5000; rpc->msgin.resend_all = 1; unit_log_clear(); - homa_grant_send(rpc); - EXPECT_STREQ("xmit GRANT 5000@0 resend_all", unit_log_get()); + EXPECT_EQ(0, homa_grant_try_send(rpc, 
&self->homa)); + EXPECT_STREQ("xmit GRANT 10000@0 resend_all", unit_log_get()); EXPECT_EQ(0, rpc->msgin.resend_all); } +TEST_F(homa_grant, homa_grant_try_send__end_of_message) +{ + struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 5000); + + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("response from 1.2.3.4, id 100, remaining 5000", + unit_log_get()); + + unit_log_clear(); + EXPECT_EQ(1, homa_grant_try_send(rpc, &self->homa)); + EXPECT_EQ(5000, rpc->msgin.granted); + EXPECT_EQ(5000, atomic_read(&self->homa.total_incoming)); + EXPECT_STREQ("xmit GRANT 5000@0", unit_log_get()); + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("", unit_log_get()); +} TEST_F(homa_grant, homa_grant_check_rpc__msgin_not_initialized) { @@ -492,9 +531,9 @@ TEST_F(homa_grant, homa_grant_check_rpc__msgin_not_initialized) rpc->msgin.bytes_remaining = 500; rpc->msgin.granted = 2000; - rpc->msgin.rec_incoming = 0; + atomic_set(&rpc->msgin.rec_incoming, 0); homa_grant_check_rpc(rpc); - EXPECT_EQ(0, rpc->msgin.rec_incoming); + EXPECT_EQ(0, atomic_read(&rpc->msgin.rec_incoming)); EXPECT_EQ(0, atomic_read(&self->homa.total_incoming)); EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_calls); } @@ -507,7 +546,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__rpc_dead) homa_message_in_init(rpc, 2000, 0); homa_grant_check_rpc(rpc); - EXPECT_EQ(2000, rpc->msgin.rec_incoming); + EXPECT_EQ(2000, atomic_read(&rpc->msgin.rec_incoming)); EXPECT_EQ(2000, atomic_read(&self->homa.total_incoming)); old_state = rpc->state; @@ -515,27 +554,71 @@ TEST_F(homa_grant, homa_grant_check_rpc__rpc_dead) rpc->msgin.bytes_remaining = 0; homa_grant_check_rpc(rpc); rpc->state = old_state; - EXPECT_EQ(2000, rpc->msgin.rec_incoming); + EXPECT_EQ(2000, atomic_read(&rpc->msgin.rec_incoming)); EXPECT_EQ(2000, atomic_read(&self->homa.total_incoming)); } TEST_F(homa_grant, homa_grant_check_rpc__message_doesnt_need_grants) { - struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->server_port, - 100, 1000, 2000); + struct homa_rpc *rpc1, *rpc2; - homa_message_in_init(rpc, 2000, 0); - rpc->msgin.granted = 2000; - rpc->msgin.bytes_remaining = 500; + rpc1 = test_rpc(self, 100, self->server_ip, 20000); + atomic_set(&self->homa.total_incoming, self->homa.max_incoming); + homa_grant_recalc(&self->homa); + EXPECT_EQ(0, rpc1->msgin.granted); + EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); + EXPECT_EQ(1, atomic_read(&self->homa.needy_ranks)); + atomic_set(&self->homa.total_incoming, 0); - homa_grant_check_rpc(rpc); - EXPECT_EQ(500, rpc->msgin.rec_incoming); - EXPECT_EQ(500, atomic_read(&self->homa.total_incoming)); + rpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, 102, 1000, + 2000); + homa_message_in_init(rpc2, 2000, 0); + rpc2->msgin.granted = 2000; + rpc2->msgin.bytes_remaining = 500; - rpc->msgin.bytes_remaining = 0; - homa_grant_check_rpc(rpc); - EXPECT_EQ(0, rpc->msgin.rec_incoming); - EXPECT_EQ(0, atomic_read(&self->homa.total_incoming)); + unit_log_clear(); + homa_grant_check_rpc(rpc2); + EXPECT_EQ(500, atomic_read(&rpc2->msgin.rec_incoming)); + EXPECT_EQ(10500, atomic_read(&self->homa.total_incoming)); + EXPECT_EQ(10000, rpc1->msgin.granted); + EXPECT_STREQ("xmit GRANT 10000@0", unit_log_get()); +} +TEST_F(homa_grant, homa_grant_check_rpc__message_doesnt_need_grants_must_recalc) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3; + + /* First RPC is complete. 
*/ + rpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, 100, 1000, + 2000); + homa_message_in_init(rpc1, 2000, 0); + rpc1->msgin.granted = 2000; + rpc1->msgin.bytes_remaining = 0; + + /* Second RPC will be waiting for incoming. */ + rpc2 = test_rpc(self, 100, self->server_ip, 5000); + + /* Third RPC will get granted when homa_grant_check_rpc calls + * homa_grant_recalc. */ + rpc3 = test_rpc(self, 100, self->server_ip, 20000); + + atomic_set(&self->homa.total_incoming, self->homa.max_incoming); + self->homa.max_overcommit = 1; + homa_grant_recalc(&self->homa); + + EXPECT_EQ(0, atomic_read(&rpc2->msgin.rank)); + EXPECT_EQ(0, rpc2->msgin.granted); + EXPECT_EQ(-1, atomic_read(&rpc3->msgin.rank)); + EXPECT_EQ(0, rpc3->msgin.granted); + + atomic_set(&self->homa.total_incoming, 0); + unit_log_clear(); + homa_grant_check_rpc(rpc1); + EXPECT_EQ(5000, rpc2->msgin.granted); + EXPECT_EQ(10000, rpc3->msgin.granted); + EXPECT_EQ(15000, atomic_read(&self->homa.total_incoming)); + EXPECT_STREQ("xmit GRANT 5000@0; homa_grant_recalc; xmit GRANT 10000@0", + unit_log_get()); } TEST_F(homa_grant, homa_grant_check_rpc__add_new_message_to_grantables) { @@ -548,7 +631,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__add_new_message_to_grantables) homa_grant_check_rpc(rpc); EXPECT_EQ(18000, rpc->msgin.granted); - EXPECT_EQ(10000, rpc->msgin.rec_incoming); + EXPECT_EQ(10000, atomic_read(&rpc->msgin.rec_incoming)); EXPECT_EQ(0, atomic_read(&rpc->msgin.rank)); EXPECT_EQ(10000, atomic_read(&self->homa.total_incoming)); EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_calls); @@ -569,7 +652,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__new_message_bumps_existing) homa_message_in_init(rpc3, 20000, 0); homa_grant_check_rpc(rpc3); EXPECT_EQ(10000, rpc3->msgin.granted); - EXPECT_EQ(10000, rpc3->msgin.rec_incoming); + EXPECT_EQ(10000, atomic_read(&rpc3->msgin.rec_incoming)); EXPECT_EQ(1, atomic_read(&rpc3->msgin.rank)); EXPECT_EQ(-1, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); @@ -591,7 +674,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__new_message_cant_be_granted) homa_message_in_init(rpc3, 30000, 0); homa_grant_check_rpc(rpc3); EXPECT_EQ(0, rpc3->msgin.granted); - EXPECT_EQ(0, rpc3->msgin.rec_incoming); + EXPECT_EQ(0, atomic_read(&rpc3->msgin.rec_incoming)); EXPECT_EQ(-1, atomic_read(&rpc3->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); @@ -613,7 +696,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__upgrade_priority_from_negative_rank) rpc3->msgin.bytes_remaining = 15000; homa_grant_check_rpc(rpc3); EXPECT_EQ(35000, rpc3->msgin.granted); - EXPECT_EQ(10000, rpc3->msgin.rec_incoming); + EXPECT_EQ(10000, atomic_read(&rpc3->msgin.rec_incoming)); EXPECT_EQ(0, atomic_read(&rpc3->msgin.rank)); EXPECT_EQ(-1, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc1->msgin.rank)); @@ -636,52 +719,133 @@ TEST_F(homa_grant, homa_grant_check_rpc__upgrade_priority_from_positive_rank) unit_log_clear(); homa_grant_check_rpc(rpc3); EXPECT_EQ(25000, rpc3->msgin.granted); - EXPECT_EQ(10000, rpc3->msgin.rec_incoming); + EXPECT_EQ(10000, atomic_read(&rpc3->msgin.rec_incoming)); EXPECT_EQ(1, atomic_read(&rpc3->msgin.rank)); EXPECT_EQ(2, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); - EXPECT_STREQ("xmit GRANT 25000@1", unit_log_get()); + EXPECT_STREQ("homa_grant_recalc; xmit GRANT 25000@1", unit_log_get()); } -TEST_F(homa_grant, homa_grant_check_rpc__send_new_grant) 
+TEST_F(homa_grant, homa_grant_check_rpc__check_all_needy)
 {
-	struct homa_rpc *rpc;
+	struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4;
 
-	rpc = test_rpc(self, 100, self->server_ip, 40000);
+	rpc1 = test_rpc(self, 100, self->server_ip, 20000);
+	rpc2 = test_rpc(self, 100, self->server_ip, 30000);
+	rpc3 = test_rpc(self, 100, self->server_ip, 40000);
+	rpc4 = test_rpc(self, 100, self->server_ip, 50000);
+	atomic_set(&self->homa.total_incoming, self->homa.max_incoming);
 	homa_grant_recalc(&self->homa);
-	EXPECT_EQ(0, atomic_read(&rpc->msgin.rank));
-	EXPECT_EQ(10000, rpc->msgin.granted);
-	EXPECT_EQ(10000, atomic_read(&self->homa.total_incoming));
+	EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank));
+	EXPECT_EQ(3, atomic_read(&rpc4->msgin.rank));
 
-	rpc->msgin.bytes_remaining = 35000;
+	atomic_set(&self->homa.total_incoming, self->homa.max_incoming - 25000);
+	atomic_set(&self->homa.needy_ranks, 0x9);
 	unit_log_clear();
-	homa_grant_check_rpc(rpc);
-	EXPECT_EQ(15000, rpc->msgin.granted);
-	EXPECT_EQ(10000, rpc->msgin.rec_incoming);
-	EXPECT_EQ(10000, atomic_read(&self->homa.total_incoming));
-	EXPECT_STREQ("xmit GRANT 15000@0", unit_log_get());
+	homa_grant_check_rpc(rpc3);
+	EXPECT_EQ(10000, rpc1->msgin.granted);
+	EXPECT_EQ(0, rpc2->msgin.granted);
+	EXPECT_EQ(10000, rpc3->msgin.granted);
+	EXPECT_EQ(5000, rpc4->msgin.granted);
+	EXPECT_STREQ("xmit GRANT 10000@3; xmit GRANT 10000@1; xmit GRANT 5000@0",
+		     unit_log_get());
 }
-TEST_F(homa_grant, homa_grant_check_rpc__remove_from_grantable)
+TEST_F(homa_grant, homa_grant_check_rpc__recalc_after_checking_needy)
 {
-	struct homa_rpc *rpc;
+	struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4;
 
-	rpc = test_rpc(self, 100, self->server_ip, 40000);
+	rpc1 = test_rpc(self, 100, self->server_ip, 5000);
+	rpc2 = test_rpc(self, 100, self->server_ip, 10000);
+	rpc3 = test_rpc(self, 100, self->server_ip, 20000);
+	rpc4 = test_rpc(self, 100, self->server_ip, 30000);
+	self->homa.max_overcommit = 3;
+	atomic_set(&self->homa.total_incoming, self->homa.max_incoming);
 	homa_grant_recalc(&self->homa);
-	EXPECT_EQ(0, atomic_read(&rpc->msgin.rank));
-	EXPECT_EQ(10000, rpc->msgin.granted);
-	EXPECT_EQ(10000, atomic_read(&self->homa.total_incoming));
+	EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank));
+	EXPECT_EQ(-1, atomic_read(&rpc4->msgin.rank));
 
-	rpc->msgin.bytes_remaining = 10000;
-	rpc->msgin.granted = 30000;
-	rpc->msgin.rec_incoming = 10000;
+	atomic_set(&self->homa.total_incoming, self->homa.max_incoming - 10000);
+	atomic_set(&self->homa.needy_ranks, 0x6);
 	unit_log_clear();
-	homa_grant_check_rpc(rpc);
-	EXPECT_EQ(40000, rpc->msgin.granted);
-	EXPECT_EQ(10000, rpc->msgin.rec_incoming);
-	EXPECT_EQ(10000, atomic_read(&self->homa.total_incoming));
-	EXPECT_STREQ("xmit GRANT 40000@0", unit_log_get());
-	EXPECT_EQ(0, self->homa.num_grantable_rpcs);
-	EXPECT_EQ(0, self->homa.num_active_rpcs);
-	EXPECT_EQ(-1, atomic_read(&rpc->msgin.rank));
+	homa_grant_check_rpc(rpc3);
+	EXPECT_EQ(0, rpc1->msgin.granted);
+	EXPECT_EQ(10000, rpc2->msgin.granted);
+	EXPECT_EQ(0, rpc3->msgin.granted);
+	EXPECT_EQ(0, rpc4->msgin.granted);
+	EXPECT_STREQ("xmit GRANT 10000@1; homa_grant_recalc", unit_log_get());
+	EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank));
+	EXPECT_EQ(-1, atomic_read(&rpc2->msgin.rank));
+	EXPECT_EQ(2, atomic_read(&rpc4->msgin.rank));
+}
+TEST_F(homa_grant, homa_grant_check_rpc__recalc_after_checking_needy2)
+{
+	struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4;
+
+	rpc1 = test_rpc(self, 100, self->server_ip, 10000);
+	rpc2 = test_rpc(self, 100, self->server_ip, 30000);
+	rpc3 = test_rpc(self,
100, self->server_ip, 40000); + rpc4 = test_rpc(self, 100, self->server_ip, 50000); + self->homa.max_overcommit = 3; + self->homa.max_incoming = 0; + homa_grant_recalc(&self->homa); + EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); + EXPECT_EQ(-1, atomic_read(&rpc4->msgin.rank)); + + self->homa.max_incoming = 15000; + unit_log_clear(); + homa_grant_check_rpc(rpc3); + EXPECT_EQ(10000, rpc1->msgin.granted); + EXPECT_EQ(5000, rpc2->msgin.granted); + EXPECT_EQ(0, rpc3->msgin.granted); + EXPECT_EQ(0, rpc4->msgin.granted); + EXPECT_EQ(2, atomic_read(&rpc4->msgin.rank)); +} +TEST_F(homa_grant, homa_grant_check_rpc__grant_to_self) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; + + rpc1 = test_rpc(self, 100, self->server_ip, 20000); + rpc2 = test_rpc(self, 100, self->server_ip, 30000); + rpc3 = test_rpc(self, 100, self->server_ip, 40000); + rpc4 = test_rpc(self, 100, self->server_ip, 50000); + atomic_set(&self->homa.total_incoming, self->homa.max_incoming); + homa_grant_recalc(&self->homa); + EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); + EXPECT_EQ(3, atomic_read(&rpc4->msgin.rank)); + + atomic_set(&self->homa.total_incoming, self->homa.max_incoming - 20000); + atomic_set(&self->homa.needy_ranks, 0); + unit_log_clear(); + homa_grant_check_rpc(rpc3); + EXPECT_EQ(0, rpc1->msgin.granted); + EXPECT_EQ(0, rpc2->msgin.granted); + EXPECT_EQ(10000, rpc3->msgin.granted); + EXPECT_EQ(0, rpc4->msgin.granted); + EXPECT_STREQ("xmit GRANT 10000@1", unit_log_get()); +} +TEST_F(homa_grant, homa_grant_check_rpc__grant_to_self_and_recalc) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; + + rpc1 = test_rpc(self, 100, self->server_ip, 5000); + rpc2 = test_rpc(self, 100, self->server_ip, 6000); + rpc3 = test_rpc(self, 100, self->server_ip, 10000); + rpc4 = test_rpc(self, 100, self->server_ip, 20000); + atomic_set(&self->homa.total_incoming, self->homa.max_incoming); + self->homa.max_overcommit = 3; + homa_grant_recalc(&self->homa); + EXPECT_EQ(2, atomic_read(&rpc3->msgin.rank)); + EXPECT_EQ(-1, atomic_read(&rpc4->msgin.rank)); + + atomic_set(&self->homa.total_incoming, self->homa.max_incoming - 10000); + atomic_set(&self->homa.needy_ranks, 0); + unit_log_clear(); + homa_grant_check_rpc(rpc3); + EXPECT_EQ(0, rpc1->msgin.granted); + EXPECT_EQ(0, rpc2->msgin.granted); + EXPECT_EQ(10000, rpc3->msgin.granted); + EXPECT_EQ(0, rpc4->msgin.granted); + EXPECT_EQ(2, atomic_read(&rpc4->msgin.rank)); + EXPECT_STREQ("xmit GRANT 10000@0; homa_grant_recalc", unit_log_get()); } TEST_F(homa_grant, homa_grant_recalc__basics) @@ -694,11 +858,12 @@ TEST_F(homa_grant, homa_grant_recalc__basics) rpc4 = test_rpc(self, 106, self->server_ip+1, 35000); self->homa.max_incoming = 100000; self->homa.max_overcommit = 3; + atomic_set(&self->homa.needy_ranks, 1); mock_ns_tick = 10; unit_log_clear(); homa_grant_recalc(&self->homa); - EXPECT_STREQ("xmit GRANT 10000@2; " + EXPECT_STREQ("homa_grant_recalc; xmit GRANT 10000@2; " "xmit GRANT 10000@1; " "xmit GRANT 10000@0", unit_log_get()); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); @@ -706,6 +871,7 @@ TEST_F(homa_grant, homa_grant_recalc__basics) EXPECT_EQ(10000, rpc1->msgin.granted); EXPECT_EQ(20000, atomic_read(&self->homa.active_remaining[0])); EXPECT_EQ(1, atomic_read(&self->homa.grant_recalc_count)); + EXPECT_EQ(0, atomic_read(&self->homa.needy_ranks)); EXPECT_EQ(1, atomic_read(&rpc3->msgin.rank)); EXPECT_EQ(1, rpc3->msgin.priority); @@ -726,7 +892,7 @@ TEST_F(homa_grant, homa_grant_recalc__skip_recalc) unit_log_clear(); homa_grant_recalc(&self->homa); - EXPECT_STREQ("", unit_log_get()); + 
EXPECT_STREQ("homa_grant_recalc", unit_log_get()); EXPECT_EQ(0, rpc->msgin.granted); EXPECT_EQ(2, atomic_read(&self->homa.grant_recalc_count)); EXPECT_EQ(1, homa_metrics_per_cpu()->grant_recalc_skips); @@ -761,7 +927,8 @@ TEST_F(homa_grant, homa_grant_recalc__use_only_lowest_priorities) unit_log_clear(); homa_grant_recalc(&self->homa); - EXPECT_STREQ("xmit GRANT 10000@1; xmit GRANT 10000@0", unit_log_get()); + EXPECT_STREQ("homa_grant_recalc; xmit GRANT 10000@1; xmit GRANT 10000@0", + unit_log_get()); EXPECT_EQ(1, rpc1->msgin.priority); EXPECT_EQ(0, rpc2->msgin.priority); } @@ -778,10 +945,11 @@ TEST_F(homa_grant, homa_grant_recalc__share_lowest_priority_level) unit_log_clear(); homa_grant_recalc(&self->homa); - EXPECT_STREQ("xmit GRANT 10000@2; " - "xmit GRANT 10000@1; " - "xmit GRANT 10000@0; " - "xmit GRANT 10000@0", unit_log_get()); + EXPECT_STREQ("homa_grant_recalc; " + "xmit GRANT 10000@2; " + "xmit GRANT 10000@1; " + "xmit GRANT 10000@0; " + "xmit GRANT 10000@0", unit_log_get()); EXPECT_EQ(2, rpc1->msgin.priority); EXPECT_EQ(1, rpc2->msgin.priority); EXPECT_EQ(0, rpc3->msgin.priority); @@ -829,6 +997,7 @@ TEST_F(homa_grant, homa_grant_recalc__rpc_cant_be_granted) EXPECT_EQ(0, rpc2->msgin.granted); EXPECT_EQ(5000, rpc3->msgin.granted); EXPECT_EQ(0, rpc4->msgin.granted); + EXPECT_EQ(0, homa_metrics_per_cpu()->grant_recalc_loops); } TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted_so_recalc) { @@ -846,6 +1015,7 @@ TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted_so_recalc) EXPECT_EQ(10000, rpc2->msgin.granted); EXPECT_EQ(10000, rpc3->msgin.granted); EXPECT_EQ(2000, rpc4->msgin.granted); + EXPECT_EQ(2, homa_metrics_per_cpu()->grant_recalc_loops); } TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted_but_skip_recalc) { @@ -859,7 +1029,7 @@ TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted_but_skip_recalc) self->homa.max_overcommit = 2; unit_hook_register(grantable_spinlock_hook); hook_homa = &self->homa; - mock_trylock_errors = 0xfe0; + mock_trylock_errors = 0xf8; EXPECT_EQ(0, homa_metrics_per_cpu()->grant_recalc_skips); homa_grant_recalc(&self->homa); @@ -867,6 +1037,7 @@ TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted_but_skip_recalc) EXPECT_EQ(10000, rpc2->msgin.granted); EXPECT_EQ(0, rpc3->msgin.granted); EXPECT_EQ(0, rpc4->msgin.granted); + EXPECT_EQ(1, homa_metrics_per_cpu()->grant_recalc_loops); EXPECT_EQ(1, homa_metrics_per_cpu()->grant_recalc_skips); } @@ -962,6 +1133,102 @@ TEST_F(homa_grant, homa_grant_pick_rpcs__first_rpc_of_peer_doesnt_fit) EXPECT_STREQ("200 300 400", rpc_ids(rpcs, count)); } +TEST_F(homa_grant, homa_grant_check_needy__basics) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; + + rpc1 = test_rpc(self, 100, self->server_ip, 20000); + rpc2 = test_rpc(self, 102, self->server_ip, 30000); + rpc3 = test_rpc(self, 104, self->server_ip, 40000); + rpc4 = test_rpc(self, 106, self->server_ip, 50000); + self->homa.max_incoming = 0; + + unit_log_clear(); + homa_grant_recalc(&self->homa); + EXPECT_STREQ("homa_grant_recalc", unit_log_get()); + EXPECT_EQ(0xf, atomic_read(&self->homa.needy_ranks)); + + atomic_set(&self->homa.needy_ranks, 0x5); + self->homa.max_incoming = 50000; + unit_log_clear(); + EXPECT_EQ(0, homa_grant_check_needy(&self->homa)); + EXPECT_STREQ("xmit GRANT 10000@3; xmit GRANT 10000@1", unit_log_get()); + EXPECT_EQ(10000, rpc1->msgin.granted); + EXPECT_EQ(0, rpc2->msgin.granted); + EXPECT_EQ(10000, rpc3->msgin.granted); + EXPECT_EQ(0, rpc4->msgin.granted); + EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_needy_calls); +} 
+TEST_F(homa_grant, homa_grant_check_needy__incoming_exhausted) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; + + rpc1 = test_rpc(self, 100, self->server_ip, 20000); + rpc2 = test_rpc(self, 102, self->server_ip, 30000); + rpc3 = test_rpc(self, 104, self->server_ip, 40000); + rpc4 = test_rpc(self, 106, self->server_ip, 50000); + self->homa.max_incoming = 0; + + unit_log_clear(); + homa_grant_recalc(&self->homa); + EXPECT_STREQ("homa_grant_recalc", unit_log_get()); + EXPECT_EQ(0xf, atomic_read(&self->homa.needy_ranks)); + + self->homa.max_incoming = 15000; + unit_log_clear(); + EXPECT_EQ(0, homa_grant_check_needy(&self->homa)); + EXPECT_EQ(10000, rpc1->msgin.granted); + EXPECT_EQ(5000, rpc2->msgin.granted); + EXPECT_EQ(0, rpc3->msgin.granted); + EXPECT_EQ(0, rpc4->msgin.granted); +} +TEST_F(homa_grant, homa_grant_check_needy__num_active_rpcs_changed) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; + + rpc1 = test_rpc(self, 100, self->server_ip, 20000); + rpc2 = test_rpc(self, 102, self->server_ip, 30000); + rpc3 = test_rpc(self, 104, self->server_ip, 40000); + rpc4 = test_rpc(self, 106, self->server_ip, 50000); + self->homa.max_incoming = 0; + + unit_log_clear(); + homa_grant_recalc(&self->homa); + EXPECT_STREQ("homa_grant_recalc", unit_log_get()); + EXPECT_EQ(0xf, atomic_read(&self->homa.needy_ranks)); + + hook_homa = &self->homa; + unit_hook_register(change_active_hook); + self->homa.max_incoming = 50000; + unit_log_clear(); + EXPECT_EQ(0, homa_grant_check_needy(&self->homa)); + EXPECT_EQ(0, rpc1->msgin.granted); + EXPECT_EQ(0, rpc2->msgin.granted); + EXPECT_EQ(0, rpc3->msgin.granted); + EXPECT_EQ(0, rpc4->msgin.granted); +} +TEST_F(homa_grant, homa_grant_check_needy__recalc_needed) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3; + + rpc1 = test_rpc(self, 100, self->server_ip, 10000); + rpc2 = test_rpc(self, 102, self->server_ip, 20000); + rpc3 = test_rpc(self, 104, self->server_ip, 30000); + self->homa.max_incoming = 0; + + unit_log_clear(); + homa_grant_recalc(&self->homa); + EXPECT_STREQ("homa_grant_recalc", unit_log_get()); + EXPECT_EQ(0x7, atomic_read(&self->homa.needy_ranks)); + + self->homa.max_incoming = 50000; + unit_log_clear(); + EXPECT_EQ(1, homa_grant_check_needy(&self->homa)); + EXPECT_EQ(10000, rpc1->msgin.granted); + EXPECT_EQ(10000, rpc2->msgin.granted); + EXPECT_EQ(10000, rpc3->msgin.granted); +} + TEST_F(homa_grant, homa_grant_find_oldest__basics) { mock_ns_tick = 10; @@ -1009,7 +1276,7 @@ TEST_F(homa_grant, homa_grant_rpc_free__rpc_not_grantable) self->client_ip, self->server_ip, self->server_port, 100, 1000, 2000); atomic_set(&self->homa.total_incoming, 10000); - rpc->msgin.rec_incoming = 3000; + atomic_set(&rpc->msgin.rec_incoming, 3000); homa_grant_free_rpc(rpc); EXPECT_EQ(7000, atomic_read(&self->homa.total_incoming)); } @@ -1026,7 +1293,7 @@ TEST_F(homa_grant, homa_grant_free_rpc__in_active_list) EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(-1, atomic_read(&rpc3->msgin.rank)); EXPECT_EQ(20000, atomic_read(&self->homa.total_incoming)); - EXPECT_EQ(10000, rpc1->msgin.rec_incoming); + EXPECT_EQ(10000, atomic_read(&rpc1->msgin.rec_incoming)); unit_log_clear(); homa_grant_free_rpc(rpc1); @@ -1048,10 +1315,10 @@ TEST_F(homa_grant, homa_grant_free_rpc__not_in_active_list) EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(-1, atomic_read(&rpc3->msgin.rank)); EXPECT_EQ(20000, atomic_read(&self->homa.total_incoming)); - EXPECT_EQ(0, rpc3->msgin.rec_incoming); + EXPECT_EQ(0, atomic_read(&rpc3->msgin.rec_incoming)); 
EXPECT_FALSE(list_empty(&rpc3->grantable_links)); - rpc3->msgin.rec_incoming = 5000; + atomic_set(&rpc3->msgin.rec_incoming, 5000); homa_grant_free_rpc(rpc3); EXPECT_TRUE(list_empty(&rpc3->grantable_links)); EXPECT_EQ(15000, atomic_read(&self->homa.total_incoming)); @@ -1087,3 +1354,16 @@ TEST_F(homa_grant, homa_grantable_lock_slow__recalc_count) EXPECT_EQ(2, homa_metrics_per_cpu()->grantable_lock_misses); homa_grantable_unlock(&self->homa); } + +/* Functions in homa_grant.h: + * -------------------------- + */ + + TEST_F(homa_grant, homa_grant_needy_bit) + { + EXPECT_EQ(0x1, homa_grant_needy_bit(0)); + EXPECT_EQ(0x4, homa_grant_needy_bit(2)); + EXPECT_EQ(0x80, homa_grant_needy_bit(7)); + EXPECT_EQ(0, homa_grant_needy_bit(20)); + EXPECT_EQ(0, homa_grant_needy_bit(-1)); + } \ No newline at end of file diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index ce8e4442..e8658572 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -636,7 +636,8 @@ TEST_F(homa_pool, homa_pool_check_waiting__wake_up_waiting_rpc) atomic_set(&pool->free_bpages, 2); homa_pool_check_waiting(pool); EXPECT_EQ(2, crpc->msgin.num_bpages); - EXPECT_STREQ("xmit GRANT 10000@0 resend_all", unit_log_get()); + EXPECT_STREQ("homa_grant_recalc; xmit GRANT 10000@0 resend_all", + unit_log_get()); } #endif /* See strip.py */ TEST_F(homa_pool, homa_pool_check_waiting__reallocation_fails) From 3b7993c4436c71261b2e4396bb1e9cbb7c7d1080 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 11 Feb 2025 13:35:18 -0800 Subject: [PATCH 198/625] Rename homa_grant_free_rpc -> homa_grant_end_rpc --- homa_grant.c | 7 ++++--- homa_grant.h | 2 +- homa_rpc.c | 2 +- test/unit_homa_grant.c | 6 +++--- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 85ead5b0..4d5d7f62 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -653,11 +653,12 @@ void homa_grant_find_oldest(struct homa *homa) } /** - * homa_grant_free_rpc() - This function is invoked when an RPC is freed; - * it cleans up any state related to grants for that RPC's incoming message. + * homa_grant_end_rpc() - This function is invoked when homa_rpc_end is + * invoked; it cleans up any state related to grants for that RPC's + * incoming message. * @rpc: The RPC to clean up. Must be locked by the caller. */ -void homa_grant_free_rpc(struct homa_rpc *rpc) +void homa_grant_end_rpc(struct homa_rpc *rpc) __releases(rpc->bucket_lock) { struct homa *homa = rpc->hsk->homa; diff --git a/homa_grant.h b/homa_grant.h index 3173e4c8..eadb27fc 100644 --- a/homa_grant.h +++ b/homa_grant.h @@ -11,8 +11,8 @@ int homa_grantable_lock_slow(struct homa *homa, int recalc); void homa_grant_add_rpc(struct homa_rpc *rpc); int homa_grant_check_needy(struct homa *homa); void homa_grant_check_rpc(struct homa_rpc *rpc); +void homa_grant_end_rpc(struct homa_rpc *rpc); void homa_grant_find_oldest(struct homa *homa); -void homa_grant_free_rpc(struct homa_rpc *rpc); void homa_grant_log_tt(struct homa *homa); int homa_grant_outranks(struct homa_rpc *rpc1, struct homa_rpc *rpc2); diff --git a/homa_rpc.c b/homa_rpc.c index 90fb89a5..f0d1b14b 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -271,7 +271,7 @@ void homa_rpc_end(struct homa_rpc *rpc) * necessary because homa__rpc releases the RPC lock and * reacquires it. */ - homa_grant_free_rpc(rpc); + homa_grant_end_rpc(rpc); #endif /* See strip.py */ /* Unlink from all lists, so no-one will ever find this RPC again. 
*/ diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 0daf7056..e408624b 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -1277,7 +1277,7 @@ TEST_F(homa_grant, homa_grant_rpc_free__rpc_not_grantable) 100, 1000, 2000); atomic_set(&self->homa.total_incoming, 10000); atomic_set(&rpc->msgin.rec_incoming, 3000); - homa_grant_free_rpc(rpc); + homa_grant_end_rpc(rpc); EXPECT_EQ(7000, atomic_read(&self->homa.total_incoming)); } TEST_F(homa_grant, homa_grant_free_rpc__in_active_list) @@ -1296,7 +1296,7 @@ TEST_F(homa_grant, homa_grant_free_rpc__in_active_list) EXPECT_EQ(10000, atomic_read(&rpc1->msgin.rec_incoming)); unit_log_clear(); - homa_grant_free_rpc(rpc1); + homa_grant_end_rpc(rpc1); EXPECT_EQ(-1, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(0, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc3->msgin.rank)); @@ -1319,7 +1319,7 @@ TEST_F(homa_grant, homa_grant_free_rpc__not_in_active_list) EXPECT_FALSE(list_empty(&rpc3->grantable_links)); atomic_set(&rpc3->msgin.rec_incoming, 5000); - homa_grant_free_rpc(rpc3); + homa_grant_end_rpc(rpc3); EXPECT_TRUE(list_empty(&rpc3->grantable_links)); EXPECT_EQ(15000, atomic_read(&self->homa.total_incoming)); } From 2f6631b414bf70ac8f8bbfe606053810c2566037 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 11 Feb 2025 13:44:43 -0800 Subject: [PATCH 199/625] Use __skb_queue_purge instead of skb_queue_purge (Locking isn't needed; Homa has its own locks) --- homa_rpc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/homa_rpc.c b/homa_rpc.c index f0d1b14b..c669e727 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -411,7 +411,7 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) if (rpc->msgin.length >= 0 && !skb_queue_empty_lockless(&rpc->msgin.packets)) { rx_frees += skb_queue_len(&rpc->msgin.packets); - skb_queue_purge(&rpc->msgin.packets); + __skb_queue_purge(&rpc->msgin.packets); } /* If we get here, it means all packets have been From 5765e0a65b0b4d0b9d4a880adf58af15297104bb Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 11 Feb 2025 16:32:29 -0800 Subject: [PATCH 200/625] Rename UNKNOWN packet type to RPC_UNKNOWN --- homa_devel.c | 10 +++++----- homa_impl.h | 2 +- homa_incoming.c | 12 ++++++------ homa_outgoing.c | 12 ++++++------ homa_plumbing.c | 4 ++-- homa_sock.h | 6 ++++++ homa_wire.h | 18 +++++++++--------- test/mock.c | 4 ++-- test/unit_homa_incoming.c | 18 +++++++++--------- test/unit_homa_outgoing.c | 2 +- test/unit_homa_plumbing.c | 2 +- 11 files changed, 48 insertions(+), 42 deletions(-) diff --git a/homa_devel.c b/homa_devel.c index bbfa17ff..bfddc25b 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -212,7 +212,7 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) #endif /* See strip.py */ break; } - case UNKNOWN: + case RPC_UNKNOWN: /* Nothing to add here. 
*/ break; case BUSY: @@ -341,8 +341,8 @@ char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len) #endif /* See strip.py */ break; } - case UNKNOWN: - snprintf(buffer, buf_len, "UNKNOWN"); + case RPC_UNKNOWN: + snprintf(buffer, buf_len, "RPC_UNKNOWN"); break; case BUSY: snprintf(buffer, buf_len, "BUSY"); @@ -498,8 +498,8 @@ char *homa_symbol_for_type(uint8_t type) #endif /* See strip.py */ case RESEND: return "RESEND"; - case UNKNOWN: - return "UNKNOWN"; + case RPC_UNKNOWN: + return "RPC_UNKNOWN"; case BUSY: return "BUSY"; #ifndef __STRIP__ /* See strip.py */ diff --git a/homa_impl.h b/homa_impl.h index 640a844d..d205da52 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -1171,7 +1171,7 @@ int homa_sysctl_softirq_cores(const struct ctl_table *table, void homa_timer(struct homa *homa); int homa_timer_main(void *transport); void homa_unhash(struct sock *sk); -void homa_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc); +void homa_rpc_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc); void homa_unload(void); #ifndef __STRIP__ /* See strip.py */ int homa_unsched_priority(struct homa *homa, struct homa_peer *peer, diff --git a/homa_incoming.c b/homa_incoming.c index c11cc2b0..0c4d374d 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -545,9 +545,9 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) INC_METRIC(packets_received[RESEND - DATA], 1); homa_resend_pkt(skb, rpc, hsk); break; - case UNKNOWN: - INC_METRIC(packets_received[UNKNOWN - DATA], 1); - homa_unknown_pkt(skb, rpc); + case RPC_UNKNOWN: + INC_METRIC(packets_received[RPC_UNKNOWN - DATA], 1); + homa_rpc_unknown_pkt(skb, rpc); break; case BUSY: INC_METRIC(packets_received[BUSY - DATA], 1); @@ -762,7 +762,7 @@ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, struct homa_busy_hdr busy; if (!rpc) { - tt_record4("resend request for unknown id %d, peer 0x%x:%d, offset %d; responding with UNKNOWN", + tt_record4("resend request for unknown id %d, peer 0x%x:%d, offset %d; responding with RPC_UNKNOWN", homa_local_id(h->common.sender_id), tt_addr(skb_canonical_ipv6_saddr(skb)), ntohs(h->common.sport), ntohl(h->offset)); @@ -822,12 +822,12 @@ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, } /** - * homa_unknown_pkt() - Handler for incoming UNKNOWN packets. + * homa_rpc_unknown_pkt() - Handler for incoming RPC_UNKNOWN packets. * @skb: Incoming packet; size known to be large enough for the header. * This function now owns the packet. * @rpc: Information about the RPC corresponding to this packet. */ -void homa_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc) +void homa_rpc_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc) { tt_record3("Received unknown for id %llu, peer %x:%d", rpc->id, tt_addr(rpc->peer->addr), rpc->dport); diff --git a/homa_outgoing.c b/homa_outgoing.c index 9e309eb3..11a8b5df 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -538,21 +538,21 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, } /** - * homa_xmit_unknown() - Send an UNKNOWN packet to a peer. + * homa_xmit_unknown() - Send an RPC_UNKNOWN packet to a peer. * @skb: Buffer containing an incoming packet; identifies the peer to - * which the UNKNOWN packet should be sent. - * @hsk: Socket that should be used to send the UNKNOWN packet. + * which the RPC_UNKNOWN packet should be sent. + * @hsk: Socket that should be used to send the RPC_UNKNOWN packet. 
*/ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk) { struct homa_common_hdr *h = (struct homa_common_hdr *)skb->data; struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); - struct homa_unknown_hdr unknown; + struct homa_rpc_unknown_hdr unknown; struct homa_peer *peer; #ifndef __STRIP__ /* See strip.py */ if (hsk->homa->verbose) - pr_notice("sending UNKNOWN to peer %s:%d for id %llu", + pr_notice("sending RPC_UNKNOWN to peer %s:%d for id %llu", homa_print_ipv6_addr(&saddr), ntohs(h->sport), homa_local_id(h->sender_id)); #endif /* See strip.py */ @@ -561,7 +561,7 @@ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk) homa_local_id(h->sender_id)); unknown.common.sport = h->dport; unknown.common.dport = h->sport; - unknown.common.type = UNKNOWN; + unknown.common.type = RPC_UNKNOWN; #ifndef __STRIP__ /* See strip.py */ unknown.common.flags = HOMA_TCP_FLAGS; unknown.common.urgent = htons(HOMA_TCP_URGENT); diff --git a/homa_plumbing.c b/homa_plumbing.c index 2b98d0c0..2778c226 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -506,7 +506,7 @@ static __u16 header_lengths[] = { sizeof32(struct homa_data_hdr), sizeof32(struct homa_grant_hdr), sizeof32(struct homa_resend_hdr), - sizeof32(struct homa_unknown_hdr), + sizeof32(struct homa_rpc_unknown_hdr), sizeof32(struct homa_busy_hdr), sizeof32(struct homa_cutoffs_hdr), sizeof32(struct homa_freeze_hdr), @@ -518,7 +518,7 @@ static __u16 header_lengths[] = { sizeof32(struct homa_data_hdr), 0, sizeof32(struct homa_resend_hdr), - sizeof32(struct homa_unknown_hdr), + sizeof32(struct homa_rpc_unknown_hdr), sizeof32(struct homa_busy_hdr), 0, 0, diff --git a/homa_sock.h b/homa_sock.h index d1fcf2d2..246770b2 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -143,6 +143,12 @@ struct homa_sock { */ struct homa *homa; + /** + * @is_server: True means that this socket can act as both client + * and server; false means the socket is client-only. + */ + bool is_server; + /** * @shutdown: True means the socket is no longer usable (either * shutdown has already been invoked, or the socket was never diff --git a/homa_wire.h b/homa_wire.h index c9f77575..56c42b2e 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -17,7 +17,7 @@ enum homa_packet_type { GRANT = 0x11, #endif /* See strip.py */ RESEND = 0x12, - UNKNOWN = 0x13, + RPC_UNKNOWN = 0x13, BUSY = 0x14, #ifndef __STRIP__ /* See strip.py */ CUTOFFS = 0x15, @@ -423,7 +423,7 @@ struct homa_resend_hdr { * @length: Number of bytes of data to retransmit; this could specify * a range longer than the total message size. Zero is a special case * used by servers; in this case, there is no need to actually resend - * anything; the purpose of this packet is to trigger an UNKNOWN + * anything; the purpose of this packet is to trigger an RPC_UNKNOWN * response if the client no longer cares about this RPC. */ __be32 length; @@ -442,20 +442,20 @@ _Static_assert(sizeof(struct homa_resend_hdr) <= HOMA_MAX_HEADER, "homa_resend_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** - * struct homa_unknown_hdr - Wire format for UNKNOWN packets. + * struct homa_rpc_unknown_hdr - Wire format for RPC_UNKNOWN packets. * - * An UNKNOWN packet is sent by either server or client when it receives a + * An RPC_UNKNOWN packet is sent by either server or client when it receives a * packet for an RPC that is unknown to it. 
When a client receives an - * UNKNOWN packet it will typically restart the RPC from the beginning; - * when a server receives an UNKNOWN packet it will typically discard its + * RPC_UNKNOWN packet it will typically restart the RPC from the beginning; + * when a server receives an RPC_UNKNOWN packet it will typically discard its * state for the RPC. */ -struct homa_unknown_hdr { +struct homa_rpc_unknown_hdr { /** @common: Fields common to all packet types. */ struct homa_common_hdr common; } __packed; -_Static_assert(sizeof(struct homa_unknown_hdr) <= HOMA_MAX_HEADER, - "homa_unknown_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); +_Static_assert(sizeof(struct homa_rpc_unknown_hdr) <= HOMA_MAX_HEADER, + "homa_rpc_unknown_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** * struct homa_busy_hdr - Wire format for BUSY packets. diff --git a/test/mock.c b/test/mock.c index ac1409bd..de443d68 100644 --- a/test/mock.c +++ b/test/mock.c @@ -1588,8 +1588,8 @@ struct sk_buff *mock_skb_new(struct in6_addr *saddr, struct homa_common_hdr *h, case RESEND: header_size = sizeof(struct homa_resend_hdr); break; - case UNKNOWN: - header_size = sizeof(struct homa_unknown_hdr); + case RPC_UNKNOWN: + header_size = sizeof(struct homa_rpc_unknown_hdr); break; case BUSY: header_size = sizeof(struct homa_busy_hdr); diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 299c724e..77a62d46 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -1038,7 +1038,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_client_rpc) struct homa_grant_hdr h = {{.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(99991), - .type = UNKNOWN}}; + .type = RPC_UNKNOWN}}; mock_xmit_log_verbose = 1; homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), @@ -1093,7 +1093,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__resend_for_unknown_server_rpc) homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), &self->homa); - EXPECT_STREQ("xmit UNKNOWN", unit_log_get()); + EXPECT_STREQ("xmit RPC_UNKNOWN", unit_log_get()); } #ifndef __STRIP__ /* See strip.py */ TEST_F(homa_incoming, homa_dispatch_pkts__reset_counters) @@ -1567,7 +1567,7 @@ TEST_F(homa_incoming, homa_resend_pkt__unknown_rpc) homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), &self->homa); - EXPECT_STREQ("xmit UNKNOWN", unit_log_get()); + EXPECT_STREQ("xmit RPC_UNKNOWN", unit_log_get()); } TEST_F(homa_incoming, homa_resend_pkt__rpc_in_service_server_sends_busy) { @@ -1719,10 +1719,10 @@ TEST_F(homa_incoming, homa_resend_pkt__server_send_data) TEST_F(homa_incoming, homa_unknown_pkt__client_resend_all) { - struct homa_unknown_hdr h = {{.sport = htons(self->server_port), + struct homa_rpc_unknown_hdr h = {{.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), - .type = UNKNOWN}}; + .type = RPC_UNKNOWN}}; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 2000, 2000); @@ -1747,10 +1747,10 @@ TEST_F(homa_incoming, homa_unknown_pkt__client_resend_all) } TEST_F(homa_incoming, homa_unknown_pkt__client_resend_part) { - struct homa_unknown_hdr h = {{.sport = htons(self->server_port), + struct homa_rpc_unknown_hdr h = {{.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), - .type = UNKNOWN}}; + .type = RPC_UNKNOWN}}; struct homa_rpc *crpc = 
unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 2000, 2000); @@ -1776,10 +1776,10 @@ TEST_F(homa_incoming, homa_unknown_pkt__client_resend_part) } TEST_F(homa_incoming, homa_unknown_pkt__free_server_rpc) { - struct homa_unknown_hdr h = {{.sport = htons(self->client_port), + struct homa_rpc_unknown_hdr h = {{.sport = htons(self->client_port), .dport = htons(self->hsk2.port), .sender_id = cpu_to_be64(self->client_id), - .type = UNKNOWN}}; + .type = RPC_UNKNOWN}}; struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, self->server_id, 100, 20000); diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 9bb36088..d436c167 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -705,7 +705,7 @@ TEST_F(homa_outgoing, homa_xmit_unknown) mock_xmit_log_verbose = 1; skb = mock_skb_new(self->client_ip, &h.common, 0, 0); homa_xmit_unknown(skb, &self->hsk); - EXPECT_STREQ("xmit UNKNOWN from 0.0.0.0:99, dport 40000, id 99991", + EXPECT_STREQ("xmit RPC_UNKNOWN from 0.0.0.0:99, dport 40000, id 99991", unit_log_get()); kfree_skb(skb); } diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index dcd01a12..9f290e9b 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -890,7 +890,7 @@ TEST_F(homa_plumbing, homa_softirq__process_control_first) struct homa_common_hdr unknown = { .sport = htons(self->client_port), .dport = htons(self->server_port), - .type = UNKNOWN, + .type = RPC_UNKNOWN, .sender_id = cpu_to_be64(self->client_id) }; struct sk_buff *skb, *skb2; From 9cfed9d6a78d469aae1c6d232c2570a6a87b56d4 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 11 Feb 2025 16:35:34 -0800 Subject: [PATCH 201/625] Add hsk->is_server, setsockopt SO_HOMA_SERVER By default, sockets won't accepting incoming RPCs unless they have been bound. --- README.md | 3 ++ homa.h | 6 +++ homa_incoming.c | 4 +- homa_plumbing.c | 89 +++++++++++++++++++++++++++------------ homa_sock.c | 2 + man/homa.7 | 18 ++++++++ test/mock.c | 1 + test/unit_homa_incoming.c | 17 ++++++++ test/unit_homa_plumbing.c | 85 ++++++++++++++++++++++++++++++------- 9 files changed, 182 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 099de49f..08d56ceb 100644 --- a/README.md +++ b/README.md @@ -125,6 +125,9 @@ This repo contains an implementation of the Homa transport protocol as a Linux k sysctl mechanism. For details, see the man page `homa.7`. ## Significant recent improvements +- February 2025: by default, incoming requests for a socket are rejected + unless the socket has been bound. setsockopt can be used with + SO_HOMA_SERVER to enable or disable incoming requests for any socket. - October 2024: Homa now has an official IANA IP protocol number (146). - August 2024: upgraded to Linux 6.10.6. - July 2024: introduced "TCP hijacking", where Homa packets are sent as diff --git a/homa.h b/homa.h index 4a893d03..a5f692b9 100644 --- a/homa.h +++ b/homa.h @@ -158,6 +158,12 @@ _Static_assert(sizeof(struct homa_abort_args) <= 32, "homa_abort_args grew"); /** define SO_HOMA_RCVBUF: setsockopt option for specifying buffer region. */ #define SO_HOMA_RCVBUF 10 +/** + * define SO_HOMA_SERVER: setsockopt option for specifying whether a + * socket will act as server. + * */ +#define SO_HOMA_SERVER 11 + /** struct homa_rcvbuf_args - setsockopt argument for SO_HOMA_RCVBUF. 
*/ struct homa_rcvbuf_args { /** @start: Address of first byte of buffer region in user space. */ diff --git a/homa_incoming.c b/homa_incoming.c index 0c4d374d..7e5b4c9d 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -415,7 +415,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) /* Find the appropriate socket.*/ hsk = homa_sock_find(homa->port_map, dport); - if (!hsk) { + if (!hsk || (!homa_is_client(id) && !hsk->is_server)) { if (skb_is_ipv6(skb)) icmp6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, NULL, IP6CB(skb)); @@ -430,6 +430,8 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) kfree_skb(skb); skb = next; } + if (hsk) + sock_put(&hsk->sock); return; } diff --git a/homa_plumbing.c b/homa_plumbing.c index 2778c226..2732b65f 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -887,32 +887,52 @@ int homa_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct homa_sock *hsk = homa_sk(sk); - struct homa_rcvbuf_args args; -#ifndef __STRIP__ /* See strip.py */ - u64 start = sched_clock(); -#endif /* See strip.py */ int ret; - if (level != IPPROTO_HOMA || optname != SO_HOMA_RCVBUF) + if (level != IPPROTO_HOMA) return -ENOPROTOOPT; - if (optlen != sizeof(struct homa_rcvbuf_args)) - return -EINVAL; - if (copy_from_sockptr(&args, optval, optlen)) - return -EFAULT; + if (optname == SO_HOMA_RCVBUF) { + struct homa_rcvbuf_args args; +#ifndef __STRIP__ /* See strip.py */ + u64 start = sched_clock(); +#endif /* See strip.py */ - /* Do a trivial test to make sure we can at least write the first - * page of the region. - */ - if (copy_to_user(u64_to_user_ptr(args.start), &args, - sizeof(args))) - return -EFAULT; + if (optlen != sizeof(struct homa_rcvbuf_args)) + return -EINVAL; + + if (copy_from_sockptr(&args, optval, optlen)) + return -EFAULT; + + /* Do a trivial test to make sure we can at least write the first + * page of the region. 
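		 * (Editorial note, not part of the patch: this probe makes
		 * the call fail fast with EFAULT on a bogus or read-only
		 * address, before homa_pool_init below has created any
		 * pool state that would otherwise need to be unwound.)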
+ */ + if (copy_to_user(u64_to_user_ptr(args.start), &args, + sizeof(args))) + return -EFAULT; + + homa_sock_lock(hsk); + ret = homa_pool_init(hsk, u64_to_user_ptr(args.start), args.length); + homa_sock_unlock(hsk); + INC_METRIC(so_set_buf_calls, 1); + INC_METRIC(so_set_buf_ns, sched_clock() - start); + } else if (optname == SO_HOMA_SERVER) { + int arg; + + if (optlen != sizeof(arg)) + return -EINVAL; - homa_sock_lock(hsk); - ret = homa_pool_init(hsk, u64_to_user_ptr(args.start), args.length); - homa_sock_unlock(hsk); - INC_METRIC(so_set_buf_calls, 1); - INC_METRIC(so_set_buf_ns, sched_clock() - start); + if (copy_from_sockptr(&arg, optval, optlen)) + return -EFAULT; + + if (arg) + hsk->is_server = true; + else + hsk->is_server = false; + ret = 0; + } else { + ret = -ENOPROTOOPT; + } return ret; } @@ -931,25 +951,40 @@ int homa_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct homa_sock *hsk = homa_sk(sk); - struct homa_rcvbuf_args val; + struct homa_rcvbuf_args rcvbuf_args; + void *result; + int is_server; int len; if (copy_from_sockptr(&len, USER_SOCKPTR(optlen), sizeof(int))) return -EFAULT; - if (level != IPPROTO_HOMA || optname != SO_HOMA_RCVBUF) + if (level != IPPROTO_HOMA) return -ENOPROTOOPT; - if (len < sizeof(val)) - return -EINVAL; + if (optname == SO_HOMA_RCVBUF) { + if (len < sizeof(rcvbuf_args)) + return -EINVAL; - homa_pool_get_rcvbuf(hsk, &val); - len = sizeof(val); + homa_pool_get_rcvbuf(hsk, &rcvbuf_args); + len = sizeof(rcvbuf_args); + result = &rcvbuf_args; + } else if (optname == SO_HOMA_SERVER) { + if (len < sizeof(is_server)) + return -EINVAL; + + is_server = hsk->is_server; + len = sizeof(is_server); + result = &is_server; + } else { + return -ENOPROTOOPT; + } if (copy_to_sockptr(USER_SOCKPTR(optlen), &len, sizeof(int))) return -EFAULT; - if (copy_to_sockptr(USER_SOCKPTR(optval), &val, len)) + if (copy_to_sockptr(USER_SOCKPTR(optval), result, len)) return -EFAULT; + return 0; } diff --git a/homa_sock.c b/homa_sock.c index 7eecdc77..608fd68c 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -141,6 +141,7 @@ int homa_sock_init(struct homa_sock *hsk, struct homa *homa) hsk->homa = homa; hsk->ip_header_length = (hsk->inet.sk.sk_family == AF_INET) ? HOMA_IPV4_HEADER_LENGTH : HOMA_IPV6_HEADER_LENGTH; + hsk->is_server = false; hsk->shutdown = false; starting_port = homa->prev_default_port; while (1) { @@ -340,6 +341,7 @@ int homa_sock_bind(struct homa_socktab *socktab, struct homa_sock *hsk, hsk->inet.inet_sport = htons(hsk->port); hlist_add_head_rcu(&hsk->socktab_links, &socktab->buckets[homa_port_hash(port)]); + hsk->is_server = true; done: spin_unlock_bh(&socktab->write_lock); homa_sock_unlock(hsk); diff --git a/man/homa.7 b/man/homa.7 index e02a1056..a673ed2b 100644 --- a/man/homa.7 +++ b/man/homa.7 @@ -224,6 +224,24 @@ The system call is used to receive messages; see Homa's .BR recvmsg (2) man page for details. +.PP +By default, if +.B bind +has not been invoked for a socket then it can be used only as the client +for outgoing RPCs: incoming requests directed at the socket will be +rejected. Once +.B bind +has been invoked, the socket can act as the server side for incoming +RPCs. In addition, +.B setsockopt +may be invoked with the +.B SO_HOMA_SERVER +option to activate or deactivate any socket for incoming requests. +.B SO_HOMA_SERVER +takes an integer argument, where any nonzero value enables incoming +requests and zero disables them. +The current setting can be retrieved with +.BR getsockopt . 
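.PP
As an editorial illustration (not part of the original page), enabling
and then querying the setting might look like this, assuming
.I fd
is an open Homa socket:
.PP
.nf
    int on = 1;
    int is_server;
    socklen_t len = sizeof(is_server);

    setsockopt(fd, IPPROTO_HOMA, SO_HOMA_SERVER, &on, sizeof(on));
    getsockopt(fd, IPPROTO_HOMA, SO_HOMA_SERVER, &is_server, &len);
.fi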
.SH ABORTING REQUESTS .PP It is possible to abort RPCs that are in progress. This is done with diff --git a/test/mock.c b/test/mock.c index de443d68..75ba8ead 100644 --- a/test/mock.c +++ b/test/mock.c @@ -1705,6 +1705,7 @@ int mock_sock_init(struct homa_sock *hsk, struct homa *homa, int port) if (port != 0 && port >= mock_min_default_port) homa->prev_default_port = port - 1; err = homa_sock_init(hsk, homa); + hsk->is_server = true; if (port != 0) homa->prev_default_port = saved_port; if (err != 0) diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 77a62d46..a2783053 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -939,6 +939,23 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_ipv6) EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); EXPECT_STREQ("icmp6_send type 1, code 4", unit_log_get()); } +TEST_F(homa_incoming, homa_dispatch_pkts__server_not_enabled) +{ + struct sk_buff *skb; + + self->data.common.dport = htons(100); + + // Make sure the test uses IPv4. + mock_ipv6 = false; + homa_sock_destroy(&self->hsk); + mock_sock_init(&self->hsk, &self->homa, 0); + self->hsk.is_server = false; + + skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); + homa_dispatch_pkts(skb, &self->homa); + EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); + EXPECT_STREQ("icmp_send type 3, code 3", unit_log_get()); +} TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_free_many_packets) { struct sk_buff *skb, *skb2, *skb3; diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 9f290e9b..34f7210c 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -166,6 +166,7 @@ TEST_F(homa_plumbing, homa_bind__ipv6_ok) mock_ipv6 = true; homa_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, &self->homa, 0); + self->hsk.is_server = false; addr.in6.sin6_family = AF_INET6; addr.in6.sin6_port = htons(123); @@ -173,6 +174,7 @@ TEST_F(homa_plumbing, homa_bind__ipv6_ok) result = homa_bind(&sock, &addr.sa, sizeof(addr.in6)); EXPECT_EQ(0, -result); EXPECT_EQ(123, self->hsk.port); + EXPECT_EQ(1, self->hsk.is_server); } TEST_F(homa_plumbing, homa_bind__ipv4_address_too_short) { @@ -200,6 +202,7 @@ TEST_F(homa_plumbing, homa_bind__ipv4_ok) mock_ipv6 = false; homa_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, &self->homa, 0); + self->hsk.is_server = false; addr.in4.sin_family = AF_INET; addr.in4.sin_port = htons(345); @@ -207,6 +210,7 @@ TEST_F(homa_plumbing, homa_bind__ipv4_ok) result = homa_bind(&sock, &addr.sa, sizeof(addr.in4)); EXPECT_EQ(0, -result); EXPECT_EQ(345, self->hsk.port); + EXPECT_EQ(1, self->hsk.is_server); } #ifndef __STRIP__ /* See strip.py */ @@ -269,30 +273,30 @@ TEST_F(homa_plumbing, homa_socket__homa_sock_init_failure) EXPECT_EQ(ENOMEM, -homa_socket(&sock.sock)); } -TEST_F(homa_plumbing, homs_setsockopt__bad_level) +TEST_F(homa_plumbing, homa_setsockopt__bad_level) { EXPECT_EQ(ENOPROTOOPT, -homa_setsockopt(&self->hsk.sock, 0, 0, self->optval, sizeof(struct homa_rcvbuf_args))); } -TEST_F(homa_plumbing, homs_setsockopt__bad_optname) +TEST_F(homa_plumbing, homa_setsockopt__bad_optname) { EXPECT_EQ(ENOPROTOOPT, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, 0, self->optval, sizeof(struct homa_rcvbuf_args))); } -TEST_F(homa_plumbing, homs_setsockopt__bad_optlen) +TEST_F(homa_plumbing, homa_setsockopt__recvbuf_bad_optlen) { EXPECT_EQ(EINVAL, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, SO_HOMA_RCVBUF, self->optval, sizeof(struct homa_rcvbuf_args) - 1)); } -TEST_F(homa_plumbing, 
homs_setsockopt__copy_from_sockptr_fails) +TEST_F(homa_plumbing, homa_setsockopt__recvbuf_copy_from_sockptr_fails) { mock_copy_data_errors = 1; EXPECT_EQ(EFAULT, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, SO_HOMA_RCVBUF, self->optval, sizeof(struct homa_rcvbuf_args))); } -TEST_F(homa_plumbing, homa_setsockopt__copy_to_user_fails) +TEST_F(homa_plumbing, homa_setsockopt__recvbuf_copy_to_user_fails) { struct homa_rcvbuf_args args = {0x100000, 5*HOMA_BPAGE_SIZE}; @@ -302,7 +306,7 @@ TEST_F(homa_plumbing, homa_setsockopt__copy_to_user_fails) SO_HOMA_RCVBUF, self->optval, sizeof(struct homa_rcvbuf_args))); } -TEST_F(homa_plumbing, homa_setsockopt__success) +TEST_F(homa_plumbing, homa_setsockopt__recvbuf_success) { struct homa_rcvbuf_args args; char buffer[5000]; @@ -321,9 +325,34 @@ TEST_F(homa_plumbing, homa_setsockopt__success) EXPECT_EQ(1, homa_metrics_per_cpu()->so_set_buf_calls); #endif /* See strip.py */ } +TEST_F(homa_plumbing, homa_setsockopt__server_bad_optlen) +{ + EXPECT_EQ(EINVAL, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_SERVER, self->optval, sizeof(int) - 1)); +} +TEST_F(homa_plumbing, homa_setsockopt__server_copy_from_sockptr_fails) +{ + mock_copy_data_errors = 1; + EXPECT_EQ(EFAULT, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_SERVER, self->optval, sizeof(int))); +} +TEST_F(homa_plumbing, homa_setsockopt__server_success) +{ + int arg = 7; + self->optval.user = &arg; + EXPECT_EQ(0, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_SERVER, self->optval, sizeof(int))); + EXPECT_EQ(1, self->hsk.is_server); -TEST_F(homa_plumbing, homa_getsockopt__success) + arg = 0; + EXPECT_EQ(0, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_SERVER, self->optval, sizeof(int))); + EXPECT_EQ(0, self->hsk.is_server); +} + + +TEST_F(homa_plumbing, homa_getsockopt__recvbuf_success) { struct homa_rcvbuf_args val; int size = sizeof32(val) + 10; @@ -353,21 +382,47 @@ TEST_F(homa_plumbing, homa_getsockopt__bad_level) EXPECT_EQ(ENOPROTOOPT, -homa_getsockopt(&self->hsk.sock, 0, SO_HOMA_RCVBUF, (char *)&val, &size)); } -TEST_F(homa_plumbing, homa_getsockopt__bad_optname) +TEST_F(homa_plumbing, homa_getsockopt__recvbuf_bad_length) { struct homa_rcvbuf_args val; - int size = sizeof32(val); + int size = sizeof32(val) - 1; - EXPECT_EQ(ENOPROTOOPT, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, - SO_HOMA_RCVBUF-1, (char *)&val, &size)); + EXPECT_EQ(EINVAL, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_RCVBUF, (char *)&val, &size)); } -TEST_F(homa_plumbing, homa_getsockopt__bad_length) +TEST_F(homa_plumbing, homa_getsockopt__server_bad_length) { - struct homa_rcvbuf_args val; - int size = sizeof32(val) - 1; + int is_server; + int size = sizeof32(is_server) - 1; EXPECT_EQ(EINVAL, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, - SO_HOMA_RCVBUF, (char *)&val, &size)); + SO_HOMA_SERVER, (char *)&is_server, &size)); +} +TEST_F(homa_plumbing, homa_getsockopt__server_success) +{ + int is_server; + int size = sizeof32(is_server); + + self->hsk.is_server = 1; + EXPECT_EQ(0, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_SERVER, (char *)&is_server, &size)); + EXPECT_EQ(1, is_server); + EXPECT_EQ(sizeof(int), size); + + self->hsk.is_server = 0; + size = 20; + EXPECT_EQ(0, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_SERVER, (char *)&is_server, &size)); + EXPECT_EQ(0, is_server); + EXPECT_EQ(sizeof(int), size); +} +TEST_F(homa_plumbing, homa_getsockopt__bad_optname) +{ + struct homa_rcvbuf_args val; + int size = sizeof32(val); + + 
EXPECT_EQ(ENOPROTOOPT, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, + SO_HOMA_RCVBUF-1, (char *)&val, &size)); } TEST_F(homa_plumbing, homa_getsockopt__cant_copy_out_size) { From 419f45b044203055b0192378387f2761fffe68d8 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 10 Mar 2025 16:05:55 -0700 Subject: [PATCH 202/625] New mock function needed when KASAN is enabled --- test/mock.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/mock.c b/test/mock.c index 75ba8ead..b8b4791b 100644 --- a/test/mock.c +++ b/test/mock.c @@ -1046,6 +1046,11 @@ int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) return 1; } +int rcu_read_lock_any_held(void) +{ + return 1; +} + #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0) int rcu_read_lock_held(void) { From 1c5f6f56012307c43014d868e37378cb2442173f Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 10 Mar 2025 16:06:13 -0700 Subject: [PATCH 203/625] Fix race in homa_rpc.c Not safe to reap RPC unless simultaneously locked and unref-ed. --- homa_rpc.c | 23 +++++++++++++++-------- test/unit_homa_rpc.c | 27 +++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/homa_rpc.c b/homa_rpc.c index c669e727..940b033a 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -382,7 +382,21 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) /* Collect buffers and freeable RPCs. */ list_for_each_entry_safe(rpc, tmp, &hsk->dead_rpcs, dead_links) { - if (atomic_read(&rpc->refs) != 0) { + int refs; + + /* Make sure that all outstanding uses of the RPC have + * completed. We can only be sure if the reference + * count is zero when we're holding the lock. Note: + * it isn't safe to block while locking the RPC here, + * since we hold the socket lock. + */ + if (homa_rpc_try_lock(rpc)) { + refs = atomic_read(&rpc->refs); + homa_rpc_unlock(rpc); + } else { + refs = 1; + } + if (refs != 0) { INC_METRIC(disabled_rpc_reaps, 1); continue; } @@ -436,13 +450,6 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) for (i = 0; i < num_rpcs; i++) { rpc = rpcs[i]; UNIT_LOG("; ", "reaped %llu", rpc->id); - /* Lock and unlock the RPC before freeing it. This - * is needed to deal with races where the code - * that invoked homa_rpc_end hasn't unlocked the - * RPC yet. 
- */ - homa_rpc_lock(rpc); - homa_rpc_unlock(rpc); if (unlikely(rpc->msgin.num_bpages)) homa_pool_release_buffers(rpc->hsk->buffer_pool, diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index fc2187f6..a4a6f33d 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -586,6 +586,30 @@ TEST_F(homa_rpc, homa_rpc_reap__protected_and_reap_all) EXPECT_STREQ("reaped 1234", unit_log_get()); EXPECT_EQ(0, self->hsk.dead_skbs); } +TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_locked) +{ + struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 1000, 2000); + struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id+2, 1000, 2000); + + ASSERT_NE(NULL, crpc1); + ASSERT_NE(NULL, crpc2); + homa_rpc_end(crpc1); + homa_rpc_end(crpc2); + unit_log_clear(); + self->homa.reap_limit = 3; + mock_trylock_errors = 2; + EXPECT_EQ(1, homa_rpc_reap(&self->hsk, false)); + EXPECT_STREQ("reaped 1236", unit_log_get()); + EXPECT_EQ(1, homa_metrics_per_cpu()->disabled_rpc_reaps); + unit_log_clear(); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); + EXPECT_EQ(1, homa_metrics_per_cpu()->disabled_rpc_reaps); + EXPECT_STREQ("reaped 1234", unit_log_get()); +} TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_refs) { struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, @@ -604,12 +628,15 @@ TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_refs) self->homa.reap_limit = 3; EXPECT_EQ(1, homa_rpc_reap(&self->hsk, false)); EXPECT_STREQ("reaped 1236", unit_log_get()); + EXPECT_EQ(1, homa_metrics_per_cpu()->disabled_rpc_reaps); unit_log_clear(); EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); + EXPECT_EQ(2, homa_metrics_per_cpu()->disabled_rpc_reaps); EXPECT_STREQ("", unit_log_get()); homa_rpc_put(crpc1); EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); EXPECT_STREQ("reaped 1234", unit_log_get()); + EXPECT_EQ(2, homa_metrics_per_cpu()->disabled_rpc_reaps); } TEST_F(homa_rpc, homa_rpc_reap__hit_limit_in_msgout_packets) { From 52eb2e042398e68491d1905bb595b8f2e1ff4c3c Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 3 Mar 2025 15:34:59 -0800 Subject: [PATCH 204/625] Refactor waiting * Replace homa_wait_message with homa_wait_private and homa_wait_shared. * Change API: eliminate HOMA_RECVMSG_REQUEST and HOMA_RECVMSG_RESPONSE flags. * Add flags field to struct homa_sendmsg_args. * Refactor struct homa_interest. 
* Create homa_interest.c and homa_interest.h --- Makefile | 2 + README.md | 4 + homa.h | 34 +- homa_api.c | 16 +- homa_impl.h | 207 ++-------- homa_incoming.c | 580 +++++++------------------- homa_interest.c | 173 ++++++++ homa_interest.h | 98 +++++ homa_plumbing.c | 50 ++- homa_receiver.cc | 11 +- homa_rpc.c | 10 +- homa_rpc.h | 26 +- homa_sock.c | 22 +- homa_sock.h | 26 +- man/homa_send.3 | 24 +- man/recvmsg.2 | 43 +- man/sendmsg.2 | 23 +- test/Makefile | 2 + test/mock.c | 113 +++++- test/mock.h | 6 +- test/unit_homa_incoming.c | 829 ++++++++------------------------------ test/unit_homa_interest.c | 313 ++++++++++++++ test/unit_homa_plumbing.c | 51 ++- test/unit_homa_rpc.c | 43 +- test/unit_homa_sock.c | 29 +- test/utils.c | 1 - util/cp_node.cc | 11 +- util/homa_test.cc | 80 +++- util/server.cc | 1 - 29 files changed, 1403 insertions(+), 1425 deletions(-) create mode 100644 homa_interest.c create mode 100644 homa_interest.h create mode 100644 test/unit_homa_interest.c diff --git a/Makefile b/Makefile index cd8ad375..dd8258e2 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,7 @@ HOMA_OBJS := homa_devel.o \ homa_incoming.o \ + homa_interest.o \ homa_outgoing.o \ homa_peer.o \ homa_pool.o \ @@ -52,6 +53,7 @@ check: LINUX_SRC_DIR ?= ../net-next HOMA_TARGET ?= $(LINUX_SRC_DIR)/net/homa CP_HDRS := homa_impl.h \ + homa_interest.h \ homa_peer.h \ homa_pool.h \ homa_rpc.h \ diff --git a/README.md b/README.md index 08d56ceb..ddd6b247 100644 --- a/README.md +++ b/README.md @@ -125,6 +125,10 @@ This repo contains an implementation of the Homa transport protocol as a Linux k sysctl mechanism. For details, see the man page `homa.7`. ## Significant recent improvements +- March 2025: implemented private RPCs, resulting in API changes. + HOMA_RECVMSG_REQUEST and HOMA_RECVMSG_RESPONSE flags no longer exist and + struct homa_sendmsg_args now has a flags field with one defined + flag: HOMA_SENDMSG_PRIVATE. - February 2025: by default, incoming requests for a socket are rejected unless the socket has been bound. setsockopt can be used with SO_HOMA_SERVER to enable or disable incoming requests for any socket. diff --git a/homa.h b/homa.h index a5f692b9..5b170242 100644 --- a/homa.h +++ b/homa.h @@ -63,23 +63,37 @@ struct homa_sendmsg_args { * locate app-specific info about the RPC. */ __u64 completion_cookie; + + /** + * @flags: (in) OR-ed combination of bits that control the operation. + * See below for values. + */ + __u32 flags; + + __u32 reserved; }; #if !defined(__cplusplus) -_Static_assert(sizeof(struct homa_sendmsg_args) >= 16, +_Static_assert(sizeof(struct homa_sendmsg_args) >= 24, "homa_sendmsg_args shrunk"); -_Static_assert(sizeof(struct homa_sendmsg_args) <= 16, +_Static_assert(sizeof(struct homa_sendmsg_args) <= 24, "homa_sendmsg_args grew"); #endif +/* Flag bits for homa_sendmsg_args.flags (see man page for documentation): + */ +#define HOMA_SENDMSG_PRIVATE 0x01 +#define HOMA_SENDMSG_VALID_FLAGS 0x01 + /** * struct homa_recvmsg_args - Provides information needed by Homa's * recvmsg; passed to recvmsg using the msg_control field. */ struct homa_recvmsg_args { /** - * @id: (in/out) Initially specifies the id of the desired RPC, or 0 - * if any RPC is OK; returns the actual id received. + * @id: (in/out) Initial value is 0 to wait for any shared RPC; + * nonzero means wait for that specific (private) RPC. Returns + * the id of the RPC received. 
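 *
 * (Editorial sketch, not part of the patch: to wait for a private
 * RPC, a caller might pass the id that sendmsg returned in
 * homa_sendmsg_args.id, with this struct attached via msg_control:
 *
 *	struct homa_recvmsg_args args = {};
 *	args.id = rpc_id;
 *	msg.msg_control = &args;
 *	msg.msg_controllen = sizeof(args);
 *	recvmsg(fd, &msg, 0);
 *
 * where fd, msg, and rpc_id are assumed to be set up by the caller.)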
*/ __u64 id; @@ -125,10 +139,8 @@ _Static_assert(sizeof(struct homa_recvmsg_args) <= 88, /* Flag bits for homa_recvmsg_args.flags (see man page for documentation): */ -#define HOMA_RECVMSG_REQUEST 0x01 -#define HOMA_RECVMSG_RESPONSE 0x02 -#define HOMA_RECVMSG_NONBLOCKING 0x04 -#define HOMA_RECVMSG_VALID_FLAGS 0x07 +#define HOMA_RECVMSG_NONBLOCKING 0x01 +#define HOMA_RECVMSG_VALID_FLAGS 0x01 #ifndef __STRIP__ /* See strip.py */ /** @@ -198,10 +210,12 @@ int homa_abort(int sockfd, __u64 id, int error); #endif /* See strip.py */ int homa_send(int sockfd, const void *message_buf, size_t length, const struct sockaddr *dest_addr, - __u32 addrlen, __u64 *id, __u64 completion_cookie); + __u32 addrlen, __u64 *id, __u64 completion_cookie, + int flags); int homa_sendv(int sockfd, const struct iovec *iov, int iovcnt, const struct sockaddr *dest_addr, - __u32 addrlen, __u64 *id, __u64 completion_cookie); + __u32 addrlen, __u64 *id, __u64 completion_cookie, + int flags); ssize_t homa_reply(int sockfd, const void *message_buf, size_t length, const struct sockaddr *dest_addr, __u32 addrlen, __u64 id); diff --git a/homa_api.c b/homa_api.c index 8fc125f9..296cba91 100644 --- a/homa_api.c +++ b/homa_api.c @@ -46,6 +46,8 @@ ssize_t homa_reply(int sockfd, const void *message_buf, size_t length, args.id = id; args.completion_cookie = 0; + args.flags = 0; + args.reserved = 0; vec.iov_base = (void *)message_buf; vec.iov_len = length; @@ -90,6 +92,8 @@ ssize_t homa_replyv(int sockfd, const struct iovec *iov, int iovcnt, args.id = id; args.completion_cookie = 0; + args.flags = 0; + args.reserved = 0; hdr.msg_name = (void *)dest_addr; hdr.msg_namelen = addrlen; @@ -113,13 +117,15 @@ ssize_t homa_replyv(int sockfd, const struct iovec *iov, int iovcnt, * here; this can be used later to find the response for * this request. * @completion_cookie: Value to be returned by recvmsg when RPC completes. + * @flags: Flag bits to pass to the sendmsg kernel call, such + * as HOMA_SENDMSG_PRIVATE; see man page for complete info. * * Return: 0 means the request has been accepted for delivery. If an * error occurred, -1 is returned and errno is set appropriately. */ int homa_send(int sockfd, const void *message_buf, size_t length, const struct sockaddr *dest_addr, __u32 addrlen, - __u64 *id, __u64 completion_cookie) + __u64 *id, __u64 completion_cookie, int flags) { struct homa_sendmsg_args args; struct msghdr hdr; @@ -128,6 +134,8 @@ int homa_send(int sockfd, const void *message_buf, size_t length, args.id = 0; args.completion_cookie = completion_cookie; + args.flags = flags; + args.reserved = 0; vec.iov_base = (void *)message_buf; vec.iov_len = length; @@ -158,13 +166,15 @@ int homa_send(int sockfd, const void *message_buf, size_t length, * here; this can be used later to find the response for * this request. * @completion_cookie: Value to be returned by recvmsg when RPC completes. + * @flags: Flag bits to pass to the sendmsg kernel call, such + * as HOMA_SENDMSG_PRIVATE; see man page for complete info. * * Return: 0 means the request has been accepted for delivery. If an * error occurred, -1 is returned and errno is set appropriately. 
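 *
 * (Editorial sketch, not part of the patch: a private request issued
 * through homa_send above might look like the following, assuming fd,
 * req, req_len, and dest are set up by the caller:
 *
 *	__u64 id;
 *
 *	homa_send(fd, req, req_len, (struct sockaddr *)&dest,
 *		  sizeof(dest), &id, 0, HOMA_SENDMSG_PRIVATE);
 *
 * The id stored here can later be passed to recvmsg to wait for the
 * private response.)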
*/ int homa_sendv(int sockfd, const struct iovec *iov, int iovcnt, const struct sockaddr *dest_addr, __u32 addrlen, - __u64 *id, __u64 completion_cookie) + __u64 *id, __u64 completion_cookie, int flags) { struct homa_sendmsg_args args; struct msghdr hdr; @@ -172,6 +182,8 @@ int homa_sendv(int sockfd, const struct iovec *iov, int iovcnt, args.id = 0; args.completion_cookie = completion_cookie; + args.flags = flags; + args.reserved = 0; hdr.msg_name = (void *)dest_addr; hdr.msg_namelen = addrlen; diff --git a/homa_impl.h b/homa_impl.h index d205da52..5e7e4403 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -106,115 +106,6 @@ union sockaddr_in_union { struct sockaddr_in6 in6; }; -/** - * struct homa_interest - Contains various information used while waiting - * for incoming messages (indicates what kinds of messages a particular - * thread is interested in receiving). - */ -struct homa_interest { - /** - * @thread: Thread that would like to receive a message. Will get - * woken up when a suitable message becomes available. - */ - struct task_struct *thread; - - /** - * @rpc_ready: Non-zero means an appropriate incoming message has - * been assigned to this interest, and @rpc and @locked are valid - * (they must be set before setting this variable). - */ - atomic_t rpc_ready; - - /** - * @rpc: If @rpc_ready is non-zero, points to an RPC with a ready - * incoming message that meets the requirements of this interest. - */ - struct homa_rpc *rpc; - - /** - * @locked: Nonzero means that @rpc is locked; only valid if - * @rpc_ready is non-zero. - */ - int locked; - - /** - * @core: Core on which @thread was executing when it registered - * its interest. This is a hint used for load balancing - * (see balance.txt). - */ - int core; - - /** - * @reg_rpc: RPC whose @interest field points here, or - * NULL if none. - */ - struct homa_rpc *reg_rpc; - - /** - * @request_links: For linking this object into - * &homa_sock.request_interests. The interest must not be linked - * on either this list or @response_links if @id is nonzero. - */ - struct list_head request_links; - - /** - * @response_links: For linking this object into - * &homa_sock.request_interests. - */ - struct list_head response_links; -}; - -/** - * homa_interest_init() - Fill in default values for all of the fields - * of a struct homa_interest. - * @interest: Struct to initialize. - */ -static inline void homa_interest_init(struct homa_interest *interest) -{ - interest->thread = current; - atomic_set(&interest->rpc_ready, 0); - interest->rpc = NULL; - interest->locked = 0; - - /* Safe (and necessary) to use raw_smp_processor_id: this is only - * a hint. - */ - interest->core = raw_smp_processor_id(); - interest->reg_rpc = NULL; - INIT_LIST_HEAD(&interest->request_links); - INIT_LIST_HEAD(&interest->response_links); -} - -/** - * homa_interest_get_rpc() - Return the ready RPC stored in an interest, - * if there is one. - * @interest: Struct to check - * Return: the ready RPC, or NULL if none. If an RPC is returned, a - * reference has been taken on it; caller must call homa_rpc_put(). - */ -static inline struct homa_rpc *homa_interest_get_rpc(struct homa_interest *interest) -{ - if (atomic_read(&interest->rpc_ready)) - return interest->rpc; - return NULL; -} - -/** - * homa_interest_set_rpc() - Hand off a ready RPC to an interest from a - * waiting receiver thread. - * @interest: Belongs to a thread that is waiting for an incoming message. - * @rpc: Ready rpc to assign to @interest. 
Caller must have taken a - * reference by calling homa_rpc_hold(). - * @locked: 1 means @rpc is locked, 0 means unlocked. - */ -static inline void homa_interest_set_rpc(struct homa_interest *interest, - struct homa_rpc *rpc, int locked) -{ - interest->rpc = rpc; - interest->locked = locked; - atomic_set_release(&interest->rpc_ready, 1); -} - /** * struct homa - Overall information about the Homa protocol implementation. * @@ -1063,30 +954,12 @@ int homa_bind(struct socket *sk, struct sockaddr *addr, int addr_len); int homa_check_nic_queue(struct homa *homa, struct sk_buff *skb, bool force); -#ifndef __STRIP__ /* See strip.py */ -struct homa_rpc *homa_choose_fifo_grant(struct homa *homa); -#endif /* See strip.py */ -struct homa_interest *homa_choose_interest(struct homa *homa, - struct list_head *head, - int offset); void homa_close(struct sock *sock, long timeout); int homa_copy_to_user(struct homa_rpc *rpc); -#ifndef __STRIP__ /* See strip.py */ -void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk); -#endif /* See strip.py */ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc); void homa_destroy(struct homa *homa); int homa_disconnect(struct sock *sk, int flags); void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa); -#ifndef __STRIP__ /* See strip.py */ -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) -int homa_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -#else -int homa_dointvec(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -#endif -#endif /* See strip.py */ int homa_err_handler_v4(struct sk_buff *skb, u32 info); int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, @@ -1101,19 +974,8 @@ int homa_getsockopt(struct sock *sk, int level, int optname, int homa_hash(struct sock *sk); enum hrtimer_restart homa_hrtimer(struct hrtimer *timer); int homa_init(struct homa *homa); -#ifndef __STRIP__ /* See strip.py */ -void homa_incoming_sysctl_changed(struct homa *homa); -int homa_ioc_abort(struct sock *sk, int *karg); -#endif /* See strip.py */ int homa_ioctl(struct sock *sk, int cmd, int *karg); int homa_load(void); -#ifndef __STRIP__ /* See strip.py */ -void homa_log_throttled(struct homa *homa); -int homa_message_in_init(struct homa_rpc *rpc, int length, - int unsched); -#else /* See strip.py */ -int homa_message_in_init(struct homa_rpc *rpc, int unsched); -#endif /* See strip.py */ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit); void homa_message_out_init(struct homa_rpc *rpc, int length); @@ -1122,28 +984,14 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, struct iov_iter *iter, int offset, int length, int max_seg_data); -#ifndef __STRIP__ /* See strip.py */ -void homa_outgoing_sysctl_changed(struct homa *homa); -#endif /* See strip.py */ int homa_pacer_main(void *transport); void homa_pacer_stop(struct homa *homa); bool homa_pacer_xmit(struct homa *homa); __poll_t homa_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); -#ifndef __STRIP__ /* See strip.py */ -void homa_prios_changed(struct homa *homa); -#endif /* See strip.py */ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); -int homa_register_interests(struct homa_interest *interest, - struct homa_sock *hsk, int flags, u64 id); void homa_remove_from_throttled(struct homa_rpc *rpc); -#ifndef 
__STRIP__ /* See strip.py */ -void homa_resend_data(struct homa_rpc *rpc, int start, int end, - int priority); -#else /* See strip.py */ -void homa_resend_data(struct homa_rpc *rpc, int start, int end); -#endif /* See strip.py */ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, struct homa_sock *hsk); void homa_rpc_abort(struct homa_rpc *crpc, int error); @@ -1157,7 +1005,41 @@ int homa_setsockopt(struct sock *sk, int level, int optname, int homa_shutdown(struct socket *sock, int how); int homa_softirq(struct sk_buff *skb); void homa_spin(int ns); +void homa_timer(struct homa *homa); +int homa_timer_main(void *transport); +void homa_unhash(struct sock *sk); +void homa_rpc_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc); +void homa_unload(void); +int homa_wait_private(struct homa_rpc *rpc, int nonblocking); +struct homa_rpc + *homa_wait_shared(struct homa_sock *hsk, int nonblocking); +int homa_xmit_control(enum homa_packet_type type, void *contents, + size_t length, struct homa_rpc *rpc); +int __homa_xmit_control(void *contents, size_t length, + struct homa_peer *peer, struct homa_sock *hsk); +void homa_xmit_data(struct homa_rpc *rpc, bool force); +void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk); + #ifndef __STRIP__ /* See strip.py */ +struct homa_rpc + *homa_choose_fifo_grant(struct homa *homa); +void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk); +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) +int homa_dointvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +#else +int homa_dointvec(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +#endif +void homa_incoming_sysctl_changed(struct homa *homa); +int homa_ioc_abort(struct sock *sk, int *karg); +void homa_log_throttled(struct homa *homa); +int homa_message_in_init(struct homa_rpc *rpc, int length, + int unsched); +void homa_outgoing_sysctl_changed(struct homa *homa); +void homa_prios_changed(struct homa *homa); +void homa_resend_data(struct homa_rpc *rpc, int start, int end, + int priority); #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) int homa_sysctl_softirq_cores(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -1166,33 +1048,18 @@ int homa_sysctl_softirq_cores(struct ctl_table *table, int write, int homa_sysctl_softirq_cores(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -#endif -#endif /* See strip.py */ -void homa_timer(struct homa *homa); -int homa_timer_main(void *transport); -void homa_unhash(struct sock *sk); -void homa_rpc_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc); -void homa_unload(void); -#ifndef __STRIP__ /* See strip.py */ int homa_unsched_priority(struct homa *homa, struct homa_peer *peer, int length); int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors); -#endif /* See strip.py */ -struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, - u64 id); -int homa_xmit_control(enum homa_packet_type type, void *contents, - size_t length, struct homa_rpc *rpc); -int __homa_xmit_control(void *contents, size_t length, - struct homa_peer *peer, struct homa_sock *hsk); -void homa_xmit_data(struct homa_rpc *rpc, bool force); -#ifndef __STRIP__ /* See strip.py */ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority); +#endif #else /* See strip.py */ +int homa_message_in_init(struct homa_rpc *rpc, int unsched); +void homa_resend_data(struct homa_rpc *rpc, int 
start, int end); void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc); #endif /* See strip.py */ -void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk); /** * homa_check_pacer() - This method is invoked at various places in Homa to diff --git a/homa_incoming.c b/homa_incoming.c index 7e5b4c9d..8962f689 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -9,12 +9,14 @@ #endif /* See strip.py */ #include "homa_impl.h" +#include "homa_interest.h" +#include "homa_peer.h" +#include "homa_pool.h" + #ifndef __STRIP__ /* See strip.py */ #include "homa_grant.h" #include "homa_offload.h" #endif /* See strip.py */ -#include "homa_peer.h" -#include "homa_pool.h" #ifndef __STRIP__ /* See strip.py */ /** @@ -241,7 +243,8 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) * It is possible for the RPC to be freed while this function * executes (it releases and reacquires the RPC lock). If that * happens, -EINVAL will be returned and the state of @rpc - * will be RPC_DEAD. + * will be RPC_DEAD. Clears the RPC_PKTS_READY bit in @rpc->flags + * if all available packets have been copied out. */ int homa_copy_to_user(struct homa_rpc *rpc) __releases(rpc->bucket_lock) @@ -286,8 +289,10 @@ int homa_copy_to_user(struct homa_rpc *rpc) if (n < MAX_SKBS) continue; } - if (n == 0) + if (n == 0) { + atomic_andnot(RPC_PKTS_READY, &rpc->flags); break; + } /* At this point we've collected a batch of packets (or * run out of packets); copy any available packets out to @@ -680,9 +685,7 @@ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) if (skb_queue_len(&rpc->msgin.packets) != 0 && !(atomic_read(&rpc->flags) & RPC_PKTS_READY)) { atomic_or(RPC_PKTS_READY, &rpc->flags); - homa_sock_lock(rpc->hsk); homa_rpc_handoff(rpc); - homa_sock_unlock(rpc->hsk); } #ifndef __STRIP__ /* See strip.py */ @@ -1110,10 +1113,7 @@ void homa_rpc_abort(struct homa_rpc *rpc, int error) tt_record3("aborting client RPC: peer 0x%x, id %d, error %d", tt_addr(rpc->peer->addr), rpc->id, error); rpc->error = error; - homa_sock_lock(rpc->hsk); - if (!rpc->hsk->shutdown) - homa_rpc_handoff(rpc); - homa_sock_unlock(rpc->hsk); + homa_rpc_handoff(rpc); } /** @@ -1198,476 +1198,198 @@ void homa_abort_sock_rpcs(struct homa_sock *hsk, int error) } /** - * homa_register_interests() - Records information in various places so - * that a thread will be woken up if an RPC that it cares about becomes - * available. - * @interest: Used to record information about the messages this thread is - * waiting on. The initial contents of the structure are - * assumed to be undefined. - * @hsk: Socket on which relevant messages will arrive. Must not be - * locked. - * @flags: Flags field from homa_recvmsg_args; see manual entry for - * details. - * @id: If non-zero, then the caller is interested in receiving - * the response for this RPC (@id must be a client request). - * Return: Either zero or a negative errno value. If a matching RPC - * is already available, information about it will be stored in - * interest. + * homa_wait_private() - Waits until the response has been received for + * a specific RPC or the RPC has failed with an error. + * @rpc: RPC to wait for; an error will be returned if the RPC is + * not a client RPC or not private. Must be locked by caller. + * @nonblocking: Nonzero means return immediately if @rpc not ready. + * Return: 0 if the response has been successfully received, otherwise + * a negative errno. 
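The locking contract above is easy to misread: @rpc must be locked on entry, the lock is dropped and reacquired internally while sleeping, and the RPC is locked again on return. A conforming caller, condensed from the homa_recvmsg changes later in this patch, looks roughly like:

	rpc = homa_find_client_rpc(hsk, id);	/* returns with rpc locked */
	if (!rpc)
		return -EINVAL;
	result = homa_wait_private(rpc, nonblocking);
	/* rpc is still locked here, whether or not result is 0. */
	homa_rpc_unlock(rpc);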
*/ -int homa_register_interests(struct homa_interest *interest, - struct homa_sock *hsk, int flags, u64 id) +int homa_wait_private(struct homa_rpc *rpc, int nonblocking) + __must_hold(&rpc->bucket->lock) { - struct homa_rpc *rpc = NULL; - int locked = 1; - - homa_interest_init(interest); - if (id != 0) { - if (!homa_is_client(id)) - return -EINVAL; - rpc = homa_find_client_rpc(hsk, id); /* Locks rpc. */ - if (!rpc) - return -EINVAL; - if (rpc->interest && rpc->interest != interest) { - homa_rpc_unlock(rpc); - return -EINVAL; - } - } + struct homa_interest interest; + int result = 0; + int iteration; - /* Need both the RPC lock (acquired above) and the socket lock to - * avoid races. - */ - homa_sock_lock(hsk); - if (hsk->shutdown) { - homa_sock_unlock(hsk); - if (rpc) - homa_rpc_unlock(rpc); - return -ESHUTDOWN; - } + if (!(atomic_read(&rpc->flags) & RPC_PRIVATE)) + return -EINVAL; - if (id != 0) { - if ((atomic_read(&rpc->flags) & RPC_PKTS_READY) || rpc->error) - goto claim_rpc; - rpc->interest = interest; - interest->reg_rpc = rpc; - homa_rpc_unlock(rpc); - } + homa_rpc_hold(rpc); - locked = 0; - if (flags & HOMA_RECVMSG_RESPONSE) { - if (!list_empty(&hsk->ready_responses)) { - rpc = list_first_entry(&hsk->ready_responses, - struct homa_rpc, - ready_links); - goto claim_rpc; + /* Each iteration through this loop waits until rpc needs attention + * in some way (e.g. packets have arrived), then deals with that need + * (e.g. copy to user space). It may take many iterations until the + * RPC is ready for the application. + */ + for (iteration = 0; ; iteration++) { + if (!rpc->error) + rpc->error = homa_copy_to_user(rpc); + if (rpc->error) { + result = rpc->error; + break; } - /* Insert this thread at the *front* of the list; - * we'll get better cache locality if we reuse - * the same thread over and over, rather than - * round-robining between threads. Same below. - */ - list_add(&interest->response_links, - &hsk->response_interests); - } - if (flags & HOMA_RECVMSG_REQUEST) { - if (!list_empty(&hsk->ready_requests)) { - rpc = list_first_entry(&hsk->ready_requests, - struct homa_rpc, ready_links); - /* Make sure the interest isn't on the response list; - * otherwise it might receive a second RPC. - */ - if (!list_empty(&interest->response_links)) - list_del_init(&interest->response_links); - goto claim_rpc; + if (rpc->msgin.length >= 0 && + rpc->msgin.bytes_remaining == 0 && + skb_queue_len(&rpc->msgin.packets) == 0) { + if (iteration == 0) + INC_METRIC(fast_wakeups, 1); + break; } - list_add(&interest->request_links, &hsk->request_interests); - } - homa_sock_unlock(hsk); - return 0; -claim_rpc: - list_del_init(&rpc->ready_links); - if (!list_empty(&hsk->ready_requests) || - !list_empty(&hsk->ready_responses)) { - // There are still more RPCs available, so let Linux know. - hsk->sock.sk_data_ready(&hsk->sock); - } + result = homa_interest_init_private(&interest, rpc); + if (result != 0) + break; + + homa_rpc_unlock(rpc); + result = homa_interest_wait(&interest, nonblocking); - /* Must take a reference on the RPC before storing in interest - * (match the behavior of homa_rpc_handoff). This also prevents - * the RPC from being reaped during the gap between when we release - * the socket lock and we acquire the RPC lock.
- */ - homa_rpc_hold(rpc); - homa_sock_unlock(hsk); - if (!locked) { atomic_or(APP_NEEDS_LOCK, &rpc->flags); homa_rpc_lock(rpc); atomic_andnot(APP_NEEDS_LOCK, &rpc->flags); - locked = 1; + homa_interest_unlink_private(&interest); + + /* If homa_interest_wait returned an error but the interest + * actually got ready, then ignore the error. + */ + if (result != 0 && atomic_read(&interest.ready) == 0) + break; } - homa_interest_set_rpc(interest, rpc, locked); - return 0; + + homa_rpc_put(rpc); + return result; } /** - * homa_wait_for_message() - Wait for receipt of an incoming message - * that matches the parameters. Various other activities can occur while - * waiting, such as reaping dead RPCs and copying data to user space. - * @hsk: Socket where messages will arrive. - * @flags: Flags field from homa_recvmsg_args; see manual entry for - * details. - * @id: If non-zero, then a response message matching this id may - * be returned (@id must refer to a client request). + * homa_wait_shared() - Wait for the completion of any non-private + * incoming message on a socket. + * @hsk: Socket on which to wait. Must not be locked. + * @nonblocking: Nonzero means return immediately if no RPC is ready. * - * Return: Pointer to an RPC that matches @flags and @id, or a negative - * errno value. The RPC will be locked; the caller must unlock. + * Return: Pointer to an RPC with a complete incoming message or nonzero + * error field, or a negative errno (usually -EINTR). If an RPC + * is returned it will be locked and the caller must unlock. */ -struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, - u64 id) - __acquires(&rpc->bucket_lock) +struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking) { -#ifndef __STRIP__ /* See strip.py */ - u64 poll_start, poll_end, now; -#endif /* See strip.py */ - struct homa_rpc *result = NULL; struct homa_interest interest; - struct homa_rpc *rpc = NULL; -#ifndef __STRIP__ /* See strip.py */ - int blocked = 0, polled = 0; -#endif /* See strip.py */ - int error; + struct homa_rpc *rpc; + int iteration; + int result; - /* Each iteration of this loop finds an RPC, but it might not be - * in a state where we can return it (e.g., there might be packets - * ready to transfer to user space, but the incoming message isn't yet - * complete). Thus it could take many iterations of this loop - * before we have an RPC with a complete message. + /* Each iteration through this loop waits until an RPC needs attention + * in some way (e.g. packets have arrived), then deals with that need + * (e.g. copy to user space). It may take many iterations until an + * RPC is ready for the application. */ - while (1) { - error = homa_register_interests(&interest, hsk, flags, id); - rpc = homa_interest_get_rpc(&interest); - if (rpc) - goto found_rpc; - if (error < 0) { - result = ERR_PTR(error); - goto found_rpc; - } - - /* There is no ready RPC so far. Clean up dead RPCs before - * going to sleep (or returning, if in nonblocking mode). - */ - while (1) { - int reaper_result; - - rpc = homa_interest_get_rpc(&interest); - if (rpc) { - tt_record1("received RPC handoff while reaping, id %d", - rpc->id); - goto found_rpc; - } - reaper_result = homa_rpc_reap(hsk, false); - if (reaper_result == 0) - break; - - /* Give NAPI and SoftIRQ tasks a chance to run. 
*/ - schedule(); - } - if (flags & HOMA_RECVMSG_NONBLOCKING) { - result = ERR_PTR(-EAGAIN); - goto found_rpc; + for (iteration = 0; ; iteration++) { + homa_sock_lock(hsk); + if (hsk->shutdown) { + rpc = ERR_PTR(-ESHUTDOWN); + homa_sock_unlock(hsk); + goto done; } - -#ifndef __STRIP__ /* See strip.py */ - // tt_record4("Preparing to poll, socket %d, flags 0x%x, pid %d, poll_usecs %d", - // hsk->port, flags, current->pid, - // hsk->homa->poll_usecs); - - /* Busy-wait for a while before going to sleep; this avoids - * context-switching overhead to wake up. - */ - now = sched_clock(); - poll_start = now; - poll_end = now + (1000 * hsk->homa->poll_usecs); - while (1) { - u64 blocked; - - rpc = homa_interest_get_rpc(&interest); - if (rpc) { - tt_record3("received RPC handoff while polling, id %d, socket %d, pid %d", - rpc->id, hsk->port, - current->pid); - polled = 1; - INC_METRIC(poll_ns, now - poll_start); - goto found_rpc; - } - if (now >= poll_end) { - INC_METRIC(poll_ns, now - poll_start); - break; + if (!list_empty(&hsk->ready_rpcs)) { + rpc = list_first_entry(&hsk->ready_rpcs, struct homa_rpc, + ready_links); + homa_rpc_hold(rpc); + list_del_init(&rpc->ready_links); + if (!list_empty(&hsk->ready_rpcs)) { + /* There are still more RPCs available, so + * let Linux know. + */ + hsk->sock.sk_data_ready(&hsk->sock); } - blocked = sched_clock(); - schedule(); - now = sched_clock(); - blocked = now - blocked; - INC_METRIC(blocked_ns, blocked); - poll_start += blocked; - } - tt_record2("Poll ended unsuccessfully on socket %d, pid %d", - hsk->port, current->pid); - INC_METRIC(poll_ns, now - poll_start); -#endif /* See strip.py */ - - /* Now it's time to sleep. */ -#ifndef __STRIP__ /* See strip.py */ - per_cpu(homa_offload_core, interest.core).last_app_active = now; -#endif /* See strip.py */ - set_current_state(TASK_INTERRUPTIBLE); - rpc = homa_interest_get_rpc(&interest); - if (!rpc && !hsk->shutdown) { -#ifndef __STRIP__ /* See strip.py */ - u64 end; - u64 start = sched_clock(); -#endif /* See strip.py */ - - tt_record1("homa_wait_for_message sleeping, pid %d", - current->pid); - schedule(); -#ifndef __STRIP__ /* See strip.py */ - blocked = 1; - end = sched_clock(); -#endif /* See strip.py */ - INC_METRIC(blocked_ns, end - start); - } - __set_current_state(TASK_RUNNING); - -found_rpc: - /* If we get here, it means either an RPC is ready for our - * attention or an error occurred. - * - * First, clean up all of the interests. Must do this before - * making any other decisions, because until we do, an incoming - * message could still be passed to us. Note: if we went to - * sleep, then this info was already cleaned up by whoever - * woke us up. Also, values in the interest may change between - * when we test them below and when we acquire the socket lock, - * so they have to be checked again after locking the socket. - */ - UNIT_HOOK("found_rpc"); - if (interest.reg_rpc || - !list_empty(&interest.request_links) || - !list_empty(&interest.response_links)) { - homa_sock_lock(hsk); - if (interest.reg_rpc) - interest.reg_rpc->interest = NULL; - if (!list_empty(&interest.request_links)) - list_del_init(&interest.request_links); - if (!list_empty(&interest.response_links)) - list_del_init(&interest.response_links); homa_sock_unlock(hsk); - } - - /* Now check to see if we received an RPC handoff (note that - * this could have happened anytime up until we reset the - * interests above). 
- */ - rpc = homa_interest_get_rpc(&interest); - if (rpc) { - tt_record2("homa_wait_for_message found rpc id %d, pid %d", - rpc->id, current->pid); - if (!interest.locked) { - atomic_or(APP_NEEDS_LOCK, &rpc->flags); - homa_rpc_lock(rpc); - atomic_andnot(APP_NEEDS_LOCK | RPC_HANDING_OFF, - &rpc->flags); - } else { - atomic_andnot(RPC_HANDING_OFF, &rpc->flags); + if (iteration == 0) + INC_METRIC(fast_wakeups, 1); + } else { + homa_interest_init_shared(&interest, hsk); + homa_sock_unlock(hsk); + result = homa_interest_wait(&interest, nonblocking); + homa_interest_unlink_shared(&interest); + + if (result != 0) { + /* If homa_interest_wait returned an error + * (e.g. -EAGAIN) but in the meantime the + * interest received a handoff, ignore the + * error. + */ + if (atomic_read(&interest.ready) == 0) { + rpc = ERR_PTR(result); + goto done; + } } - /* Once the RPC has been locked it's safe to drop - * the reference that was set before storing the RPC - * in interest. - * */ - homa_rpc_put(rpc); - if (!rpc->error) - rpc->error = homa_copy_to_user(rpc); - if (rpc->state == RPC_DEAD) { - homa_rpc_unlock(rpc); - continue; - } - if (rpc->error) + rpc = interest.rpc; + if (!rpc) { + rpc = ERR_PTR(-ESHUTDOWN); goto done; - atomic_andnot(RPC_PKTS_READY, &rpc->flags); - if (rpc->msgin.bytes_remaining == 0 && - !skb_queue_len(&rpc->msgin.packets)) - goto done; - homa_rpc_unlock(rpc); + } } - /* A complete message isn't available: check for errors. */ - if (IS_ERR(result)) - return result; - if (signal_pending(current)) - return ERR_PTR(-EINTR); - - /* No message and no error; try again. */ + tt_record3("homa_wait_shared received RPC handoff, id %d, socket %d, pid %d", + rpc->id, rpc->hsk->port, current->pid); + atomic_or(APP_NEEDS_LOCK, &rpc->flags); + homa_rpc_lock(rpc); + atomic_andnot(APP_NEEDS_LOCK, &rpc->flags); + homa_rpc_put(rpc); + if (!rpc->error) + rpc->error = homa_copy_to_user(rpc); + if (rpc->error) { + if (rpc->state != RPC_DEAD) + break; + } else if (rpc->msgin.bytes_remaining == 0 && + skb_queue_len(&rpc->msgin.packets) == 0) + break; + homa_rpc_unlock(rpc); } done: -#ifndef __STRIP__ /* See strip.py */ - if (blocked) - INC_METRIC(slow_wakeups, 1); - else if (polled) - INC_METRIC(fast_wakeups, 1); -#endif /* See strip.py */ return rpc; } -#ifndef __STRIP__ /* See strip.py */ -/** - * homa_choose_interest() - Given a list of interests for an incoming - * message, choose the best one to handle it (if any). - * @homa: Overall information about the Homa transport. - * @head: Head pointers for the list of interest: either - * hsk->request_interests or hsk->response_interests. - * @offset: Offset of "next" pointers in the list elements (either - * offsetof(request_links) or offsetof(response_links). - * Return: An interest to use for the incoming message, or NULL if none - * is available. If possible, this function tries to pick an - * interest whose thread is running on a core that isn't - * currently busy doing Homa transport work. - */ -#else /* See strip.py */ -/** - * homa_choose_interest() - Given a list of interests for an incoming - * message, choose the best one to handle it (if any). - * @homa: Overall information about the Homa transport. - * @head: Head pointers for the list of interest: either - * hsk->request_interests or hsk->response_interests. - * @offset: Offset of "next" pointers in the list elements (either - * offsetof(request_links) or offsetof(response_links). - * Return: An interest to use for the incoming message, or NULL if none - * is available.
(future patch sets will fill in additional - * functionality in this function). - */ -#endif /* See strip.py */ -struct homa_interest *homa_choose_interest(struct homa *homa, - struct list_head *head, int offset) -{ -#ifndef __STRIP__ /* See strip.py */ - u64 busy_time = sched_clock() - homa->busy_ns; - struct homa_interest *backup = NULL; - struct homa_interest *interest; - struct list_head *pos; - - list_for_each(pos, head) { - interest = (struct homa_interest *)(((char *)pos) - offset); - if (per_cpu(homa_offload_core, interest->core).last_active < - busy_time) { - if (backup) - INC_METRIC(handoffs_alt_thread, 1); - return interest; - } - if (!backup) - backup = interest; - } - - /* All interested threads are on busy cores; return the first. */ - return backup; -#else /* See strip.py */ - if (list_empty(head)) - return NULL; - else - return (struct homa_interest *)(((char *)head->next) - offset); -#endif /* See strip.py */ -} - /** * homa_rpc_handoff() - This function is called when the input message for - * an RPC is ready for attention from a user thread. It either notifies - * a waiting reader or queues the RPC. - * @rpc: RPC to handoff; must be locked. The caller must - * also have locked the socket for this RPC. + * an RPC is ready for attention from a user thread. It notifies a waiting + * reader and/or queues the RPC, as appropriate. + * @rpc: RPC to handoff; must be locked. */ void homa_rpc_handoff(struct homa_rpc *rpc) { struct homa_sock *hsk = rpc->hsk; struct homa_interest *interest; - if ((atomic_read(&rpc->flags) & RPC_HANDING_OFF) || - !list_empty(&rpc->ready_links)) - return; - - /* First, see if someone is interested in this RPC specifically. - */ - if (rpc->interest) { - interest = rpc->interest; - goto thread_waiting; - } + tt_record1("homa_rpc_handoff called for id %d", rpc->id); - /* Second, check the interest list for this type of RPC. */ - if (homa_is_client(rpc->id)) { - interest = homa_choose_interest(hsk->homa, - &hsk->response_interests, - offsetof(struct homa_interest, - response_links)); - if (interest) - goto thread_waiting; - list_add_tail(&rpc->ready_links, &hsk->ready_responses); - INC_METRIC(responses_queued, 1); - } else { - interest = homa_choose_interest(hsk->homa, - &hsk->request_interests, - offsetof(struct homa_interest, - request_links)); - if (interest) - goto thread_waiting; - list_add_tail(&rpc->ready_links, &hsk->ready_requests); - INC_METRIC(requests_queued, 1); + if (atomic_read(&rpc->flags) & RPC_PRIVATE) { + homa_interest_notify_private(rpc); + return; } - /* If we get here, no-one is waiting for the RPC, so it has been - * queued. + /* Shared RPC; if there is a waiting thread, hand off the RPC; + * otherwise enqueue it. */ - - /* Notify the poll mechanism. */ - hsk->sock.sk_data_ready(&hsk->sock); - tt_record2("homa_rpc_handoff finished queuing id %d for port %d", - rpc->id, hsk->port); - return; - -thread_waiting: - /* We found a waiting thread. Take a reference on it to keep - * it from being freed before homa_wait_for_message picks it up. 
- */ - homa_rpc_hold(rpc); - atomic_or(RPC_HANDING_OFF, &rpc->flags); - interest->locked = 0; - INC_METRIC(handoffs_thread_waiting, 1); - tt_record3("homa_rpc_handoff handing off id %d to pid %d on core %d", - rpc->id, interest->thread->pid, task_cpu(interest->thread)); - homa_interest_set_rpc(interest, rpc, 0); - + homa_sock_lock(hsk); + if (!list_empty(&hsk->interests)) { #ifndef __STRIP__ /* See strip.py */ - /* Update the last_app_active time for the thread's core, so Homa - * will try to avoid doing any work there. - */ - per_cpu(homa_offload_core, interest->core).last_app_active = - sched_clock(); + interest = homa_choose_interest(hsk); +#else /* See strip.py */ + interest = list_first_entry(&hsk->interests, + struct homa_interest, links); #endif /* See strip.py */ - - /* Clear the interest. This serves two purposes. First, it saves - * the waking thread from acquiring the socket lock again, which - * reduces contention on that lock). Second, it ensures that - * no-one else attempts to give this interest to a different RPC. - */ - if (interest->reg_rpc) { - interest->reg_rpc->interest = NULL; - interest->reg_rpc = NULL; + list_del_init(&interest->links); + interest->rpc = rpc; + homa_rpc_hold(rpc); + atomic_set_release(&interest->ready, 1); + wake_up(&interest->wait_queue); + INC_METRIC(handoffs_thread_waiting, 1); + } else if (list_empty(&rpc->ready_links)) { + list_add_tail(&rpc->ready_links, &hsk->ready_rpcs); + hsk->sock.sk_data_ready(&hsk->sock); } - if (!list_empty(&interest->request_links)) - list_del_init(&interest->request_links); - if (!list_empty(&interest->response_links)) - list_del_init(&interest->response_links); - wake_up_process(interest->thread); + homa_sock_unlock(hsk); } #ifndef __STRIP__ /* See strip.py */ diff --git a/homa_interest.c b/homa_interest.c new file mode 100644 index 00000000..fbf9b4cf --- /dev/null +++ b/homa_interest.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: BSD-2-Clause + +/* This file contains functions for managing homa_interest structs. */ + +#include "homa_impl.h" +#include "homa_interest.h" +#include "homa_rpc.h" +#include "homa_sock.h" + +#ifndef __STRIP__ /* See strip.py */ +#include "homa_offload.h" +#endif /* See strip.py */ + +/** + * homa_interest_init_shared() - Initialize an interest and queue it up on a socket. + * @interest: Interest to initialize + * @hsk: Socket on which the interest should be queued. Must be locked + * by caller. + */ +void homa_interest_init_shared(struct homa_interest *interest, + struct homa_sock *hsk) + __must_hold(&hsk->lock) +{ + atomic_set(&interest->ready, 0); + interest->rpc = NULL; + interest->core = raw_smp_processor_id(); + init_waitqueue_head(&interest->wait_queue); + interest->hsk = hsk; + list_add(&interest->links, &hsk->interests); } + +/** + * homa_interest_init_private() - Initialize an interest that will wait + * on a particular (private) RPC, and link it to that RPC. + * @interest: Interest to initialize. + * @rpc: RPC to associate with the interest. Must be private, and + * caller must have locked it. + * + * Return: 0 for success, otherwise a negative errno.
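The handoff protocol in this file relies on release/acquire ordering rather than a lock: a waker fills in @rpc and publishes it with a release store to @ready, and a waiter must read @ready with acquire semantics before touching @rpc. Distilled from homa_rpc_handoff and homa_interest_wait in this patch (use_rpc is just a placeholder, not a real function):

	/* Waker side (see homa_rpc_handoff): */
	interest->rpc = rpc;
	homa_rpc_hold(rpc);			/* reference travels with handoff */
	atomic_set_release(&interest->ready, 1);	/* publishes rpc and the hold */
	wake_up(&interest->wait_queue);

	/* Waiter side (see homa_interest_wait): */
	if (atomic_read_acquire(&interest->ready) != 0)
		use_rpc(interest->rpc);	/* acquire pairs with the release above */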
+ */ +int homa_interest_init_private(struct homa_interest *interest, + struct homa_rpc *rpc) + __must_hold(&rpc->bucket->lock) +{ + if (rpc->private_interest) + return -EINVAL; + + atomic_set(&interest->ready, 0); + interest->rpc = rpc; + interest->core = raw_smp_processor_id(); + init_waitqueue_head(&interest->wait_queue); + interest->hsk = rpc->hsk; + rpc->private_interest = interest; + return 0; +} + +/** + * homa_interest_wait() - Wait for an interest to have an actionable RPC, + * or for an error to occur. + * @interest: Interest to wait for; must previously have been initialized + * and linked to a socket or RPC. On return, the interest + * will have been unlinked if its ready flag is set; otherwise + * it may still be linked. + * @nonblocking: Nonzero means return without blocking if the interest + * doesn't become ready immediately. + * + * Return: 0 for success (there is an actionable RPC in the interest), or + * a negative errno. + */ +int homa_interest_wait(struct homa_interest *interest, int nonblocking) +{ + u64 start, block_start, blocked_time, now; + struct homa_sock *hsk = interest->hsk; + int fast_wakeup = 1; + int result = 0; + int iteration; + int wait_err; + + start = sched_clock(); + blocked_time = 0; + + /* This loop iterates in order to poll and/or reap dead RPCs. */ + for (iteration = 0; ; iteration++) { + if (iteration != 0) + /* Give NAPI/SoftIRQ tasks a chance to run. */ + schedule(); + + if (atomic_read_acquire(&interest->ready) != 0) + goto done; + + /* See if we can clean up dead RPCs while waiting. */ + if (homa_rpc_reap(hsk, false) != 0) + continue; + + if (nonblocking) { + result = -EAGAIN; + goto done; + } + + now = sched_clock(); + per_cpu(homa_offload_core, + raw_smp_processor_id()).last_app_active = now; + if (now - start >= 1000 * hsk->homa->poll_usecs) + break; + } + + fast_wakeup = 0; + block_start = now; + wait_err = wait_event_interruptible_exclusive(interest->wait_queue, + atomic_read_acquire(&interest->ready) != 0); + blocked_time = sched_clock() - block_start; + if (wait_err == -ERESTARTSYS) + result = -EINTR; + +done: + if (fast_wakeup) + INC_METRIC(fast_wakeups, 1); + else + INC_METRIC(slow_wakeups, 1); + INC_METRIC(blocked_ns, blocked_time); + INC_METRIC(poll_ns, sched_clock() - start - blocked_time); + return result; +} + +/** + * homa_interest_notify_private() - If a thread is waiting on the private + * interest for an RPC, wake it up. + * @rpc: RPC that may (potentially) have a private interest. Must be + * locked by the caller. + */ +void homa_interest_notify_private(struct homa_rpc *rpc) + __must_hold(&rpc->bucket->lock) +{ + if (rpc->private_interest) { + atomic_set_release(&rpc->private_interest->ready, 1); + wake_up(&rpc->private_interest->wait_queue); + } +} + +#ifndef __STRIP__ /* See strip.py */ +/** + * homa_choose_interest() - Given all the interests registered for a socket, + * choose the best one to handle an incoming message. + * @hsk: Socket for which message is intended. Must be locked by caller, + * and must have at least one queued interest. + * Return: The interest to use. This function tries to pick an + * interest whose thread is running on a core that isn't + * currently busy doing Homa transport work.
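Taken together, a shared waiter uses these functions in a fixed order; the final check of @ready matters because a handoff can race with the unlink. Condensed from homa_wait_shared in homa_incoming.c:

	homa_sock_lock(hsk);
	homa_interest_init_shared(&interest, hsk);	/* queues on hsk->interests */
	homa_sock_unlock(hsk);
	result = homa_interest_wait(&interest, nonblocking);
	homa_interest_unlink_shared(&interest);
	if (atomic_read(&interest.ready) != 0)
		rpc = interest.rpc;		/* a handoff won the race; use it */
	else if (result != 0)
		return ERR_PTR(result);		/* e.g. -EAGAIN or -EINTR */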
*/ +struct homa_interest *homa_choose_interest(struct homa_sock *hsk) + __must_hold(&hsk->lock) +{ + u64 busy_time = sched_clock() - hsk->homa->busy_ns; + struct homa_interest *interest, *first; + + first = list_first_entry(&hsk->interests, struct homa_interest, + links); + list_for_each_entry(interest, &hsk->interests, links) { + if (per_cpu(homa_offload_core, interest->core).last_active < + busy_time) { + if (interest != first) + INC_METRIC(handoffs_alt_thread, 1); + return interest; + } + } + + /* All interested threads are on busy cores; return the first, + * which is also the most recent one to be registered, hence + * most likely to have warm cache state. + */ + return first; +} +#endif /* See strip.py */ diff --git a/homa_interest.h b/homa_interest.h new file mode 100644 index 00000000..c7eca5c6 --- /dev/null +++ b/homa_interest.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ + +/* This file defines struct homa_interest and related functions. */ + +#ifndef _HOMA_INTEREST_H +#define _HOMA_INTEREST_H + +#include "homa_rpc.h" +#include "homa_sock.h" + +/** + * struct homa_interest - Used by homa_wait_private and homa_wait_shared to + * wait for incoming message data to arrive for an RPC. An interest can + * be either private (if referenced by an rpc->private_interest) or shared + * (if present on hsk->interests). + */ +struct homa_interest { + /** + * @ready: Nonzero means the interest is ready for attention: either + * there is an RPC that needs attention or @hsk has been shut down. + */ + atomic_t ready; + + /** + * @rpc: If ready is set, then this holds an RPC that needs + * attention, or NULL if this is a shared interest and hsk has + * been shut down. If ready is not set, this will be NULL if the + * interest is shared; if it's private, it holds the RPC the + * interest is associated with. + */ + struct homa_rpc *rpc; + + /** + * @core: Core on which homa_wait_* was invoked. This is a hint + * used for load balancing (see balance.txt). + */ + int core; + + /** + * @wait_queue: Used to block the thread while waiting (will never + * have more than one queued thread). + */ + struct wait_queue_head wait_queue; + + /** @hsk: Socket that the interest is associated with. */ + struct homa_sock *hsk; + + /** + * @links: If the interest is shared, used to link this object into + * @hsk->interests. + */ + struct list_head links; +}; + +/** + * homa_interest_unlink_shared() - Remove an interest from the list for a + * socket. Note: this can race with homa_rpc_handoff, so on return it's + * possible that the interest is ready. + * @interest: Interest to remove. Must have been initialized with + * homa_interest_init_shared. + */ +static inline void homa_interest_unlink_shared(struct homa_interest *interest) +{ + if (!list_empty(&interest->links)) { + homa_sock_lock(interest->hsk); + list_del_init(&interest->links); + homa_sock_unlock(interest->hsk); + } +} + +/** + * homa_interest_unlink_private() - Detach a private interest from its + * RPC. Note: this can race with homa_rpc_handoff, so on return it's + * possible that the interest is ready. + * @interest: Interest to remove. Must have been initialized with + * homa_interest_init_private. Its RPC must be locked by + * the caller.
+ */ +static inline void homa_interest_unlink_private(struct homa_interest *interest) + __must_hold(&interest->rpc->bucket->lock) +{ + if (interest == interest->rpc->private_interest) + interest->rpc->private_interest = NULL; +} + +void homa_interest_init_shared(struct homa_interest *interest, + struct homa_sock *hsk); +int homa_interest_init_private(struct homa_interest *interest, + struct homa_rpc *rpc); +void homa_interest_notify_private(struct homa_rpc *rpc); +int homa_interest_wait(struct homa_interest *interest, int nonblocking); + +#ifndef __STRIP__ /* See strip.py */ +struct homa_interest + *homa_choose_interest(struct homa_sock *hsk); +#endif /* See strip.py */ + +#endif /* _HOMA_INTEREST_H */ \ No newline at end of file diff --git a/homa_plumbing.c b/homa_plumbing.c index 2732b65f..8401526c 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1029,6 +1029,12 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) result = -EFAULT; goto error; } + if ((args.flags & ~HOMA_SENDMSG_VALID_FLAGS) || + (args.reserved != 0)) { + result = -EINVAL; + goto error; + } + if (addr->sa.sa_family != sk->sk_family) { result = -EAFNOSUPPORT; goto error; @@ -1049,6 +1055,8 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) rpc = NULL; goto error; } + if (args.flags & HOMA_SENDMSG_PRIVATE) + atomic_or(RPC_PRIVATE, &rpc->flags); INC_METRIC(send_calls, 1); tt_record4("homa_sendmsg request, target 0x%x:%d, id %u, length %d", (addr->in6.sin6_family == AF_INET) @@ -1149,11 +1157,10 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, { struct homa_sock *hsk = homa_sk(sk); struct homa_recvmsg_args control; -#ifndef __STRIP__ /* See strip.py */ - u64 start = sched_clock(); -#endif /* See strip.py */ struct homa_rpc *rpc; + int nonblocking; #ifndef __STRIP__ /* See strip.py */ + u64 start = sched_clock(); u64 finish; #endif /* See strip.py */ int result; @@ -1188,16 +1195,30 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, if (result != 0) goto done; - rpc = homa_wait_for_message(hsk, (flags & MSG_DONTWAIT) - ? (control.flags | HOMA_RECVMSG_NONBLOCKING) - : control.flags, control.id); - if (IS_ERR(rpc)) { - /* If we get here, it means there was an error that prevented - * us from finding an RPC to return. If there's an error in - * the RPC itself we won't get here. - */ - result = PTR_ERR(rpc); - goto done; + nonblocking = ((flags & MSG_DONTWAIT) || + (control.flags & HOMA_RECVMSG_NONBLOCKING)); + if (control.id != 0) { + rpc = homa_find_client_rpc(hsk, control.id); /* Locks RPC. */ + if (!rpc) { + result = -EINVAL; + goto done; + } + result = homa_wait_private(rpc, nonblocking); + if (result != 0) { + homa_rpc_unlock(rpc); + control.id = 0; + goto done; + } + } else { + rpc = homa_wait_shared(hsk, nonblocking); + if (IS_ERR(rpc)) { + /* If we get here, it means there was an error that + * prevented us from finding an RPC to return. Errors + * in the RPC itself are handled below. + */ + result = PTR_ERR(rpc); + goto done; + } } result = rpc->error ? 
rpc->error : rpc->msgin.length; @@ -1591,8 +1612,7 @@ __poll_t homa_poll(struct file *file, struct socket *sock, if (homa_sk(sk)->shutdown) mask |= POLLIN; - if (!list_empty(&homa_sk(sk)->ready_requests) || - !list_empty(&homa_sk(sk)->ready_responses)) + if (!list_empty(&homa_sk(sk)->ready_rpcs)) mask |= POLLIN | POLLRDNORM; return (__poll_t)mask; } diff --git a/homa_receiver.cc b/homa_receiver.cc index 0f7fb0ac..9f6fa932 100644 --- a/homa_receiver.cc +++ b/homa_receiver.cc @@ -69,12 +69,11 @@ void homa::receiver::copy_out(void *dest, size_t offset, size_t count) const /** * homa::receiver::receive() - Release resources for the current message, if * any, and receive a new incoming message. - * @flags: Various OR'ed bits such as HOMA_RECVMSG_REQUEST and - * HOMA_RECVMSG_NONBLOCKING. See the Homa documentation - * for the flags field of recvmsg for details. - * @id: Identifier of a particular RPC whose result is desired, - * or 0. See the Homa documentation for the id field of - * recvmsg for details. + * @flags: Various OR'ed bits such as HOMA_RECVMSG_NONBLOCKING. See the + * Homa documentation for the flags field of recvmsg for details. + * @id: Identifier of a private RPC whose result is desired, or 0 + * to wait for a shared RPC. See the Homa documentation for the id + * field of recvmsg for details. * Return: The length of the new active message. If an error occurs, -1 * is returned and additional information is available in * errno. Note: if id() returns a nonzero result after an diff --git a/homa_rpc.c b/homa_rpc.c index 940b033a..9a43fda0 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -3,6 +3,7 @@ /* This file contains functions for managing homa_rpc structs. */ #include "homa_impl.h" +#include "homa_interest.h" #include "homa_peer.h" #include "homa_pool.h" #ifndef __STRIP__ /* See strip.py */ @@ -182,11 +183,11 @@ struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, } hlist_add_head(&srpc->hash_links, &bucket->rpcs); list_add_tail_rcu(&srpc->active_links, &hsk->active_rpcs); + homa_sock_unlock(hsk); if (ntohl(h->seg.offset) == 0 && srpc->msgin.num_bpages > 0) { atomic_or(RPC_PKTS_READY, &srpc->flags); homa_rpc_handoff(srpc); } - homa_sock_unlock(hsk); INC_METRIC(requests_received, 1); *created = 1; return srpc; @@ -265,6 +266,7 @@ void homa_rpc_end(struct homa_rpc *rpc) UNIT_LOG("; ", "homa_rpc_end invoked"); tt_record1("homa_rpc_end invoked for id %d", rpc->id); rpc->state = RPC_DEAD; + rpc->error = -EINVAL; #ifndef __STRIP__ /* See strip.py */ /* The following line must occur before the socket is locked. This is @@ -281,11 +283,7 @@ void homa_rpc_end(struct homa_rpc *rpc) list_add_tail(&rpc->dead_links, &rpc->hsk->dead_rpcs); __list_del_entry(&rpc->ready_links); __list_del_entry(&rpc->buf_links); - if (rpc->interest) { - rpc->interest->reg_rpc = NULL; - wake_up_process(rpc->interest->thread); - rpc->interest = NULL; - } + homa_interest_notify_private(rpc); // tt_record3("Freeing rpc id %d, socket %d, dead_skbs %d", rpc->id, // rpc->hsk->client_port, // rpc->hsk->dead_skbs); diff --git a/homa_rpc.h b/homa_rpc.h index 7c536290..5c684f4f 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -251,8 +251,6 @@ struct homa_rpc { /* Valid bits for @flags: * RPC_PKTS_READY - The RPC has input packets ready to be * copied to user space. - * RPC_HANDING_OFF - The RPC has been handed off to a waiting - * thread but not yet received by that thread. * APP_NEEDS_LOCK - Means that code in the application thread * needs the RPC lock (e.g. 
so it can start copying data to user space) so others @@ -262,10 +260,13 @@ struct homa_rpc { * preventing data copies to user space from * starting (and they limit throughput at * high network speeds). + * RPC_PRIVATE - This RPC will be waited on in "private" mode, + * where the app explicitly requests the + * response from this particular RPC. */ #define RPC_PKTS_READY 1 -#define RPC_HANDING_OFF 2 #define APP_NEEDS_LOCK 4 +#define RPC_PRIVATE 8 /** * @refs: Number of unmatched calls to homa_rpc_hold; it's not safe @@ -325,8 +326,7 @@ struct homa_rpc { struct hlist_node hash_links; /** - * @ready_links: Used to link this object into - * @hsk->ready_requests or @hsk->ready_responses. + * @ready_links: Used to link this object into @hsk->ready_rpcs. */ struct list_head ready_links; @@ -348,10 +348,10 @@ struct homa_rpc { struct list_head dead_links; /** - * @interest: Describes a thread that wants to be notified when - * msgin is complete, or NULL if none. + * @private_interest: If there is a thread waiting for this RPC in + * homa_wait_private, then this points to that thread's interest. */ - struct homa_interest *interest; + struct homa_interest *private_interest; #ifndef __STRIP__ /* See strip.py */ /** @@ -539,4 +539,16 @@ static inline bool homa_is_client(u64 id) return (id & 1) == 0; } +/** + * homa_rpc_needs_attention() - Returns true if @rpc has failed or if + * its incoming message is ready for attention by an application thread + * (e.g., packets are ready to copy to user space). + * @rpc: RPC to check. + * Return: True if @rpc needs attention, false otherwise. + */ +static inline bool homa_rpc_needs_attention(struct homa_rpc *rpc) +{ + return (rpc->error != 0 || atomic_read(&rpc->flags) & RPC_PKTS_READY); +} + #endif /* _HOMA_RPC_H */ diff --git a/homa_sock.c b/homa_sock.c index 608fd68c..6c9d2689 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -3,6 +3,7 @@ /* This file manages homa_sock and homa_socktab objects. */ #include "homa_impl.h" +#include "homa_interest.h" #include "homa_peer.h" #include "homa_pool.h" @@ -167,10 +168,8 @@ int homa_sock_init(struct homa_sock *hsk, struct homa *homa) INIT_LIST_HEAD(&hsk->dead_rpcs); hsk->dead_skbs = 0; INIT_LIST_HEAD(&hsk->waiting_for_bufs); - INIT_LIST_HEAD(&hsk->ready_requests); - INIT_LIST_HEAD(&hsk->ready_responses); - INIT_LIST_HEAD(&hsk->request_interests); - INIT_LIST_HEAD(&hsk->response_interests); + INIT_LIST_HEAD(&hsk->ready_rpcs); + INIT_LIST_HEAD(&hsk->interests); for (i = 0; i < HOMA_CLIENT_RPC_BUCKETS; i++) { struct homa_rpc_bucket *bucket = &hsk->client_rpc_buckets[i]; @@ -220,8 +219,6 @@ void homa_sock_unlink(struct homa_sock *hsk) * @hsk: Socket to shut down.
*/ void homa_sock_shutdown(struct homa_sock *hsk) - __acquires(&hsk->lock) - __releases(&hsk->lock) { struct homa_interest *interest; struct homa_rpc *rpc; @@ -229,6 +226,7 @@ void homa_sock_shutdown(struct homa_sock *hsk) int i = 0; #endif /* See strip.py */ + tt_record1("Starting shutdown for socket %d", hsk->port); homa_sock_lock(hsk); if (hsk->shutdown) { homa_sock_unlock(hsk); @@ -262,10 +260,13 @@ void homa_sock_shutdown(struct homa_sock *hsk) rcu_read_unlock(); homa_sock_lock(hsk); - list_for_each_entry(interest, &hsk->request_interests, request_links) - wake_up_process(interest->thread); - list_for_each_entry(interest, &hsk->response_interests, response_links) - wake_up_process(interest->thread); + while (!list_empty(&hsk->interests)) { + interest = list_first_entry(&hsk->interests, + struct homa_interest, links); + __list_del_entry(&interest->links); + atomic_set_release(&interest->ready, 1); + wake_up(&interest->wait_queue); + } homa_sock_unlock(hsk); #ifndef __STRIP__ /* See strip.py */ @@ -287,6 +288,7 @@ void homa_sock_shutdown(struct homa_sock *hsk) kfree(hsk->buffer_pool); hsk->buffer_pool = NULL; } + tt_record1("Finished shutdown for socket %d", hsk->port); } /** diff --git a/homa_sock.h b/homa_sock.h index 246770b2..6f3ebc7a 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -199,30 +199,16 @@ struct homa_sock { struct list_head waiting_for_bufs; /** - * @ready_requests: Contains server RPCs whose request message is - * in a state requiring attention from a user process. The head is - * oldest, i.e. next to return. + * @ready_rpcs: List of all RPCs that are ready for attention from + * an application thread. */ - struct list_head ready_requests; + struct list_head ready_rpcs; /** - * @ready_responses: Contains client RPCs whose response message is - * in a state requiring attention from a user process. The head is - * oldest, i.e. next to return. + * @interests: List of threads that are currently waiting for + * incoming messages via homa_wait_shared. */ - struct list_head ready_responses; - - /** - * @request_interests: List of threads that want to receive incoming - * request messages. - */ - struct list_head request_interests; - - /** - * @response_interests: List of threads that want to receive incoming - * response messages. - */ - struct list_head response_interests; + struct list_head interests; /** * @client_rpc_buckets: Hash table for fast lookup of client RPCs. diff --git a/man/homa_send.3 b/man/homa_send.3 index 0cc63409..f85bc3f2 100644 --- a/man/homa_send.3 +++ b/man/homa_send.3 @@ -7,13 +7,13 @@ homa_send, homa_sendv \- send a request message .PP .BI "int homa_send(int " sockfd ", const void *" message_buf ", size_t " length \ ", const struct sockaddr *" dest_addr ", -.BI " size_t " addrlen ", uint64_t *" id ", uint64_t " \ "completion_cookie" ); +.BI " size_t " addrlen ", __u64 *" id ", __u64" \ completion_cookie ", int " flags ); .PP .BI "int homa_sendv(int " sockfd ", const struct iovec *" iov ", size_t " \ iovcnt ", const sockaddr *" dest_addr , -.BI " size_t " addrlen ", uint64_t *" id ", uint64_t " \ "completion_cookie" ); +.BI " size_t " addrlen ", __u64 *" id ", __u64 " \ completion_cookie ", int " flags ); .fi .SH DESCRIPTION .BR homa_send @@ -61,8 +61,20 @@ The argument provides application-specific identifying information about the RPC, such as the address of a data structure used to manage the RPC; it will be returned by -.BR homa_recv +.BR recvmsg when the RPC completes.
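Because the completion_cookie is returned verbatim by recvmsg when the RPC completes, one common pattern is to pass a pointer to per-request state; in the sketch below, req_state and alloc_req_state are hypothetical application code, not part of Homa:

	struct req_state *rs = alloc_req_state();	/* per-RPC context */
	__u64 id;

	homa_send(fd, buf, len, (struct sockaddr *)&dest, sizeof(dest),
		  &id, (__u64)(uintptr_t)rs, 0);
	/* ...later, after recvmsg() has filled in struct homa_recvmsg_args args: */
	rs = (struct req_state *)(uintptr_t)args.completion_cookie;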
+The +.I flags +argument is passed to +.BR sendmsg +as the +.IR flags +field of the +.B +homa_sendmsg_args +struct (see the man page for +.BR sendmsg +for details). .PP This function returns as soon as the message has been queued for transmission. @@ -76,7 +88,7 @@ is stored in is not NULL). The identifier can be used later to match the request with the corresponding response, using -.BR homa_reply (3). +.BR recvmsg (2). On error, \-1 is returned and .I errno is set appropriately. diff --git a/man/recvmsg.2 b/man/recvmsg.2 index b8c63dee..6efe9a1a 100644 --- a/man/recvmsg.2 +++ b/man/recvmsg.2 @@ -15,12 +15,12 @@ on Homa sockets. The .I sockfd argument must refer to a Homa socket. The .I msg -argument describes which incoming messages are of interest, and is +argument describes which incoming messages are of interest and is used to return information about the message that is received. The .I flags -argument is not used for its +argument is not used except for its .B MSG_DONTWAIT -bit, which can be used to request nonblocking behavior +bit, which can be used to request nonblocking behavior. .PP The .B msg @@ -58,7 +58,8 @@ field must refer to a structure of the following type: .EX struct homa_recvmsg_args { uint64_t id; /* If nonzero, specifies id of - * desired RPC. */ + * desired RPC, which must be + * private. */ uint64_t completion_cookie; /* Value from sendmsg for request. */ int flags; /* OR-ed combination of bits. */ uint32_t num_bpages; /* Number of valid entries in @@ -78,25 +79,22 @@ structs are used both for passing parameter information into Homa and for receiving result information: .nr step 1 1 .IP \[bu] 2 -The input values for -.B flags -and +The initial value of .B id -indicate which messages are of interest to the caller. -.B Flags -is a bitmask. If it contains the -.B HOMA_RECVMSG_REQUEST -bit, then the caller is interested in any request message. -If it contains the -.B HOMA_RECVMSG_RESPONSE -bit, then the caller is interested in any response message. -If the -.B HOMA_RECVMSG_RESPONSE -bit is zero but +indicates what message(s) the caller is interested in, in one of two ways. +If the value is nonzero, it gives the RPC identifier for a specific +RPC; it must be an RPC for which the caller is the client, and it must +have been specified as +.I private +when +.B sendmsg +was invoked to initiate the RPC. In this case, +.B recvmsg +will wait for the response message for that RPC. +If the initial value of .B id -is nonzero, then the caller is interested in receiving a response -for the RPC given by -.B id. +is zero, then the system call will wait for any message that is not a private +response (the message could be either a request or response). .IP \[bu] On a successful return Homa will use the structs to return information about the message received. The @@ -237,7 +235,8 @@ First, it must eventually return the message's bpages to Homa as described above. Second, if the message is a request, the application must eventually send a response (Homa retains state for each request until its response has been sent; if -no responses are sent, kernel state will accumulate without bound). +no responses are sent, kernel state will accumulate and the socket may +eventually block). .SH ERRORS .PP When diff --git a/man/sendmsg.2 b/man/sendmsg.2 index f5864a06..397433eb 100644 --- a/man/sendmsg.2 +++ b/man/sendmsg.2 @@ -56,8 +56,9 @@ field must refer to a structure of the following type: .EX struct homa_sendmsg_args { uint64_t id; /* RPC identifier.
*/ - uint64_t completion_cookie; /* For requests only; value to return + __u64 completion_cookie; /* For requests only; value to return * along with response. */ + __u32 flags; /* OR'ed combination of bits. */ + __u32 reserved; /* Not used; must be zero. */ }; .EE .vs +2 @@ -92,6 +93,27 @@ and .IR msg ->\c .BR msg_name . .PP +The +.B flags +field of +.B homa_sendmsg_args +contains an OR'ed collection of bits. At present only a single +flag bit is supported. +.TP +.B HOMA_SENDMSG_PRIVATE +Ignored when sending responses +.RB ( id " is 0)." +For requests, this bit will mark the RPC as +.IR private . +The response for a +private RPC can only be retrieved by specifying the RPC's id explicitly +when invoking +.BR recvmsg . +In addition, system calls such as +.BR select (2) +cannot be used to determine when a private response has arrived. +.PP .B sendmsg returns as soon as the message has been queued for transmission. .SH RETURN VALUE diff --git a/test/Makefile b/test/Makefile index 7b1785ee..c9120c28 100644 --- a/test/Makefile +++ b/test/Makefile @@ -39,6 +39,7 @@ CFLAGS := $(WARNS) -Wstrict-prototypes -MD -g $(CINCLUDES) $(DEFS) CCFLAGS := -std=c++11 $(WARNS) -MD -g $(CCINCLUDES) $(DEFS) -fsanitize=address TEST_SRCS := unit_homa_incoming.c \ + unit_homa_interest.c \ unit_homa_outgoing.c \ unit_homa_peer.c \ unit_homa_pool.c \ @@ -57,6 +58,7 @@ endif TEST_OBJS := $(patsubst %.c,%.o,$(TEST_SRCS)) HOMA_SRCS := homa_devel.c \ + homa_interest.c \ homa_incoming.c \ homa_outgoing.c \ homa_peer.c \ diff --git a/test/mock.c b/test/mock.c index b8b4791b..bc4ec037 100644 --- a/test/mock.c +++ b/test/mock.c @@ -46,11 +46,13 @@ int mock_ip6_xmit_errors; int mock_ip_queue_xmit_errors; int mock_kmalloc_errors; int mock_kthread_create_errors; +int mock_prepare_to_wait_errors; int mock_register_protosw_errors; int mock_route_errors; int mock_spin_lock_held; int mock_trylock_errors; int mock_vmalloc_errors; +int mock_wait_intr_irq_errors; /* The return value from calls to signal_pending(). */ int mock_signal_pending; @@ -68,6 +70,11 @@ int mock_xmit_log_verbose; */ int mock_xmit_log_homa_info; +/* If a test sets this variable to nonzero, calls to wake_up and + * wake_up_all will be logged. + */ +int mock_log_wakeups; + /* If a test sets this variable to nonzero, call_rcu_sched will log * whenever it is invoked. */ @@ -163,6 +170,14 @@ u64 mock_ns; /* Add this value to mock_ns every time sched_clock is invoked. */ u64 mock_ns_tick; +/* If values are present here, use them as the return values from + * sched_clock, without considering mock_ns or mock_ns_tick. + */ +#define MAX_CLOCK_VALS 10 +u64 mock_clock_vals[MAX_CLOCK_VALS]; +int mock_next_clock_val = 0; +int mock_num_clock_vals = 0; + /* Indicates whether we should be simulation IPv6 or IPv4 in the * current test. Can be overridden by a test.
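mock_set_clock_vals (defined later in this diff) scripts the readings returned by successive sched_clock() calls, with a zero value terminating the list. A hypothetical use in a test, forcing homa_interest_wait's polling phase to expire on its second reading (values in ns, chosen arbitrarily):

	/* First reading starts the poll timer; the second lands well past
	 * 1000 * poll_usecs, so the waiter stops polling and blocks.
	 */
	mock_set_clock_vals(1000, 200000000, 0);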
*/ @@ -229,13 +244,14 @@ unsigned long vmemmap_base; kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES]; #endif int __preempt_count; -struct pcpu_hot pcpu_hot = {.cpu_number = 1}; +struct pcpu_hot pcpu_hot = {.cpu_number = 1, .current_task = &mock_task}; char sock_flow_table[RPS_SOCK_FLOW_TABLE_SIZE(1024)]; struct net_hotdata net_hotdata = { .rps_cpu_mask = 0x1f, .rps_sock_flow_table = (struct rps_sock_flow_table *) sock_flow_table }; int debug_locks; +struct static_call_key __SCK__might_resched; extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) @@ -277,6 +293,12 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, return skb; } +int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, + int sync, void *key) +{ + return 0; +} + void __check_object_size(const void *ptr, unsigned long n, bool to_user) {} size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *iter) @@ -370,6 +392,14 @@ int debug_lockdep_rcu_enabled(void) } #endif +int do_wait_intr_irq(wait_queue_head_t *wq_head, wait_queue_entry_t *wq_entry) +{ + UNIT_HOOK("do_wait_intr_irq"); + if (mock_check_error(&mock_wait_intr_irq_errors)) + return -ERESTARTSYS; + return 0; +} + void dst_release(struct dst_entry *dst) { if (!dst) @@ -572,6 +602,13 @@ void __init_swait_queue_head(struct swait_queue_head *q, const char *name, struct lock_class_key *key) {} +void init_wait_entry(struct wait_queue_entry *wq_entry, int flags) +{} + +void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, + struct lock_class_key *key) +{} + void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count) @@ -797,6 +834,11 @@ void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size) return mock_kmalloc(size, gfpflags); } +void __might_sleep(const char *file, int line) +{ + UNIT_HOOK("might_sleep"); +} + void *mock_kmalloc(size_t size, gfp_t flags) { void *block; @@ -923,6 +965,9 @@ int netif_receive_skb(struct sk_buff *skb) long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) { + UNIT_HOOK("prepare_to_wait"); + if (mock_check_error(&mock_prepare_to_wait_errors)) + return -ERESTARTSYS; return 0; } @@ -1017,6 +1062,13 @@ void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) mock_total_spin_locks++; } +void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) +{ + UNIT_HOOK("spin_lock"); + mock_active_spin_locks++; + mock_total_spin_locks++; +} + void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, struct lock_class_key *key, short inner) {} @@ -1037,6 +1089,11 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) mock_active_spin_locks--; } +void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) +{ + mock_active_spin_locks--; +} + int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) { UNIT_HOOK("spin_lock"); @@ -1082,6 +1139,10 @@ void remove_wait_queue(struct wait_queue_head *wq_head, u64 sched_clock(void) { + if (mock_next_clock_val < mock_num_clock_vals) { + mock_next_clock_val++; + return mock_clock_vals[mock_next_clock_val - 1]; + } mock_ns += mock_ns_tick; return mock_ns; } @@ -1091,6 +1152,11 @@ void schedule(void) UNIT_HOOK("schedule"); } +int __SCT__might_resched(void) +{ + return 0; +} + void security_sk_classify_flow(const struct sock *sk, struct flowi_common *flic) {} @@ -1298,9 +1364,23 @@ long wait_woken(struct wait_queue_entry *wq_entry, unsigned int mode, int __wake_up(struct
wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, void *key) { + if (!mock_log_wakeups) + return 0; + if (nr_exclusive == 1) + unit_log_printf("; ", "wake_up"); + else + unit_log_printf("; ", "wake_up_all"); return 0; } +void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, + int nr) +{ + if (!mock_log_wakeups) + return; + unit_log_printf("; ", "wake_up_locked"); +} + int wake_up_process(struct task_struct *tsk) { unit_log_printf("; ", "wake_up_process pid %d", tsk ? tsk->pid : -1); @@ -1533,6 +1613,31 @@ void mock_rpc_put(struct homa_rpc *rpc) atomic_dec(&rpc->refs); } +/** + * mock_set_clock_vals() - Specify one or more clock values to be returned + * by the next calls to sched_clock(). The list of arguments must be + * terminated by a zero value (which will not be used as a clock value). + * @t: The first clock reading to return. + */ +void mock_set_clock_vals(u64 t, ...) +{ + va_list args; + + mock_clock_vals[0] = t; + mock_num_clock_vals = 1; + va_start(args, t); + while (mock_num_clock_vals < MAX_CLOCK_VALS) { + u64 time = va_arg(args, u64); + + if (time == 0) + break; + mock_clock_vals[mock_num_clock_vals] = time; + mock_num_clock_vals++; + } + va_end(args); + mock_next_clock_val = 0; +} + /** * mock_set_core() - Set internal state that indicates the "current core". * @num: Integer identifier for a core. @@ -1747,6 +1852,7 @@ void mock_teardown(void) int count; pcpu_hot.cpu_number = 1; + pcpu_hot.current_task = &mock_task; cpu_khz = 1000000; mock_alloc_page_errors = 0; mock_alloc_skb_errors = 0; @@ -1757,6 +1863,8 @@ void mock_teardown(void) mock_cycles = 0; mock_ns = 0; mock_ns_tick = 0; + mock_next_clock_val = 0; + mock_num_clock_vals = 0; mock_ipv6 = mock_ipv6_default; mock_import_ubuf_errors = 0; mock_import_iovec_errors = 0; @@ -1764,7 +1872,9 @@ void mock_teardown(void) mock_ip_queue_xmit_errors = 0; mock_kmalloc_errors = 0; mock_kthread_create_errors = 0; + mock_prepare_to_wait_errors = 0; mock_register_protosw_errors = 0; + mock_wait_intr_irq_errors = 0; mock_copy_to_user_dont_copy = 0; mock_bpage_size = 0x10000; mock_bpage_shift = 16; @@ -1778,6 +1888,7 @@ void mock_teardown(void) mock_signal_pending = 0; mock_xmit_log_verbose = 0; mock_xmit_log_homa_info = 0; + mock_log_wakeups = 0; mock_mtu = 0; mock_max_skb_frags = MAX_SKB_FRAGS; mock_numa_mask = 5; diff --git a/test/mock.h b/test/mock.h index d7196206..edd22bfe 100644 --- a/test/mock.h +++ b/test/mock.h @@ -103,7 +103,7 @@ struct homa_rpc; struct homa_sock; struct homa_socktab; -/* Functions for mocking that are exported to test code. */ +/* Variables and functions for mocking that are exported to test code. 
*/ extern int mock_alloc_page_errors; extern int mock_alloc_skb_errors; extern int mock_bpage_size; @@ -122,8 +122,11 @@ extern bool mock_ipv6; extern bool mock_ipv6_default; extern int mock_kmalloc_errors; extern int mock_kthread_create_errors; +extern int mock_prepare_to_wait_errors; extern int mock_register_protosw_errors; +extern int mock_wait_intr_irq_errors; extern char mock_xmit_prios[]; +extern int mock_log_wakeups; extern int mock_log_rcu_sched; extern int mock_max_grants; extern int mock_max_skb_frags; @@ -177,6 +180,7 @@ struct ctl_table_header * struct ctl_table *table); void mock_rpc_hold(struct homa_rpc *rpc); void mock_rpc_put(struct homa_rpc *rpc); +void mock_set_clock_vals(u64 t, ...); void mock_set_core(int num); void mock_set_ipv6(struct homa_sock *hsk); void mock_spin_lock(spinlock_t *lock); diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index a2783053..9da70c63 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -1,9 +1,7 @@ // SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" -#ifndef __STRIP__ /* See strip.py */ -#include "homa_offload.h" -#endif /* See strip.py */ +#include "homa_interest.h" #include "homa_peer.h" #include "homa_pool.h" #define KSELFTEST_NOT_MAIN 1 @@ -12,116 +10,46 @@ #include "mock.h" #include "utils.h" -/* The following variable (and hook function) are used to mark an RPC - * ready with an error (but only if thread is sleeping). - */ -struct homa_rpc *hook_rpc; -struct homa_sock *hook_hsk; -int delete_count; -int lock_delete_count; -int hook_granted; -void handoff_hook(char *id) -{ - if (strcmp(id, "schedule") != 0) - return; - if (task_is_running(current)) - return; - hook_rpc->error = -EFAULT; - homa_rpc_handoff(hook_rpc); - unit_log_printf("; ", - "%d in ready_requests, %d in ready_responses, %d in request_interests, %d in response_interests", - unit_list_length(&hook_rpc->hsk->ready_requests), - unit_list_length(&hook_rpc->hsk->ready_responses), - unit_list_length(&hook_rpc->hsk->request_interests), - unit_list_length(&hook_rpc->hsk->response_interests)); -} +#ifndef __STRIP__ /* See strip.py */ +#include "homa_offload.h" +#endif /* See strip.py */ -/* The following hook function marks an RPC ready after several calls. */ -int poll_count; -void poll_hook(char *id) -{ - if (strcmp(id, "schedule") != 0) - return; - if (poll_count <= 0) - return; - poll_count--; - if (poll_count == 0) { - hook_rpc->error = -EFAULT; - homa_rpc_handoff(hook_rpc); - } -} +static struct homa_rpc *hook_rpc; +static int delete_count; +static int lock_delete_count; +static int hook_count; +static struct homa_sock *hook_shutdown_hsk; -/* The following hook function hands off an RPC (with an error). */ -void handoff_hook2(char *id) +static void wait_hook4(char *id) { - if (strcmp(id, "found_rpc") != 0) + if (strcmp(id, "schedule") != 0 && + strcmp(id, "do_wait_intr_irq") != 0 && + strcmp(id, "prepare_to_wait") != 0) return; - - hook_rpc->error = -ETIMEDOUT; - homa_rpc_handoff(hook_rpc); -} - -/* The following hook function first hands off an RPC, then deletes it. */ -int hook3_count; -void handoff_hook3(char *id) -{ - if (hook3_count || (strcmp(id, "found_rpc") != 0)) + if (hook_count <= 0) return; - hook3_count++; - - homa_rpc_handoff(hook_rpc); - homa_rpc_end(hook_rpc); -} - -/* The following hook function frees an RPC. 
*/ -void delete_hook(char *id) -{ - if (strcmp(id, "schedule") != 0) + hook_count--; + if (hook_count != 0) return; - if (delete_count == 0) - homa_rpc_end(hook_rpc); - delete_count--; + if (hook_shutdown_hsk) + homa_sock_shutdown(hook_shutdown_hsk); + else + homa_rpc_handoff(hook_rpc); } -/* The following hook function frees an RPC when it is locked. */ -void lock_delete_hook(char *id) +static void handoff_hook(char *id) { if (strcmp(id, "spin_lock") != 0) return; - if (lock_delete_count == 0) - homa_rpc_end(hook_rpc); - lock_delete_count--; -} - -/* The following function is used via unit_hook to free an RPC after it - * has been matched in homa_wait_for_message. - */ -void match_free_hook(char *id) -{ - if (strcmp(id, "found_rpc") == 0) - homa_rpc_end(hook_rpc); -} - -/* The following hook function shuts down a socket. */ -void shutdown_hook(char *id) -{ - if (strcmp(id, "schedule") != 0) + if (hook_count <= 0) return; - homa_sock_shutdown(hook_hsk); + hook_count--; + if (hook_count == 0) { + hook_rpc->error = -ENOENT; + homa_rpc_handoff(hook_rpc); + } } -#ifndef __STRIP__ /* See strip.py */ -/* The following hook function updates hook_rpc->msgin.granted. */ -int unlock_count; -void unlock_hook(char *id) -{ - if (strcmp(id, "unlock") != 0) - return; - if (unlock_count == 0) - hook_rpc->msgin.granted = hook_granted; - unlock_count--; -} -#endif /* See strip.py */ #ifdef __STRIP__ /* See strip.py */ int mock_message_in_init(struct homa_rpc *rpc, int length, int unsched) @@ -144,7 +72,6 @@ FIXTURE(homa_incoming) { struct homa_sock hsk; struct homa_sock hsk2; struct homa_data_hdr data; - struct homa_interest interest; }; FIXTURE_SETUP(homa_incoming) { @@ -190,6 +117,7 @@ FIXTURE_SETUP(homa_incoming) unit_log_clear(); delete_count = 0; lock_delete_count = 0; + hook_shutdown_hsk = NULL; } FIXTURE_TEARDOWN(homa_incoming) { @@ -710,6 +638,7 @@ TEST_F(homa_incoming, homa_copy_to_user__basics) self->data.seg.offset = htonl(2800); homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, 1200, 201800), crpc); + EXPECT_NE(0, atomic_read(&crpc->flags) & RPC_PKTS_READY); unit_log_clear(); mock_copy_to_user_dont_copy = -1; @@ -720,6 +649,7 @@ TEST_F(homa_incoming, homa_copy_to_user__basics) "skb_copy_datagram_iter: 1200 bytes to 0x1000af0: 201800-202999", unit_log_get()); EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); + EXPECT_EQ(0, atomic_read(&crpc->flags) & RPC_PKTS_READY); } TEST_F(homa_incoming, homa_copy_to_user__rpc_freed) { @@ -919,6 +849,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_ipv4) mock_sock_init(&self->hsk, &self->homa, 0); skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); + unit_log_clear(); homa_dispatch_pkts(skb, &self->homa); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); EXPECT_STREQ("icmp_send type 3, code 3", unit_log_get()); @@ -935,6 +866,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_ipv6) mock_sock_init(&self->hsk, &self->homa, 0); skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); + unit_log_clear(); homa_dispatch_pkts(skb, &self->homa); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); EXPECT_STREQ("icmp6_send type 1, code 4", unit_log_get()); @@ -952,6 +884,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__server_not_enabled) self->hsk.is_server = false; skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); + unit_log_clear(); homa_dispatch_pkts(skb, &self->homa); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); EXPECT_STREQ("icmp_send type 3, code 3", 
unit_log_get()); @@ -972,6 +905,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_free_many_packets) skb3 = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); skb->next = skb2; skb2->next = skb3; + unit_log_clear(); homa_dispatch_pkts(skb, &self->homa); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); EXPECT_STREQ("icmp6_send type 1, code 4", unit_log_get()); @@ -1301,7 +1235,7 @@ TEST_F(homa_incoming, homa_data_pkt__basics) homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, 1400, 0), crpc); EXPECT_EQ(RPC_INCOMING, crpc->state); - EXPECT_EQ(1, unit_list_length(&self->hsk.ready_responses)); + EXPECT_EQ(1, unit_list_length(&self->hsk.ready_rpcs)); EXPECT_EQ(200, crpc->msgin.bytes_remaining); EXPECT_EQ(1, skb_queue_len(&crpc->msgin.packets)); #ifndef __STRIP__ /* See strip.py */ @@ -1425,7 +1359,7 @@ TEST_F(homa_incoming, homa_data_pkt__handoff) self->data.seg.offset = htonl(1400); homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, 1400, 0), crpc); - EXPECT_EQ(1, unit_list_length(&self->hsk.ready_responses)); + EXPECT_EQ(1, unit_list_length(&self->hsk.ready_rpcs)); EXPECT_TRUE(atomic_read(&crpc->flags) & RPC_PKTS_READY); EXPECT_EQ(1600, crpc->msgin.bytes_remaining); EXPECT_EQ(1, skb_queue_len(&crpc->msgin.packets)); @@ -2039,7 +1973,7 @@ TEST_F(homa_incoming, homa_rpc_abort__basics) ASSERT_NE(NULL, crpc); unit_log_clear(); homa_rpc_abort(crpc, -EFAULT); - EXPECT_EQ(1, unit_list_length(&self->hsk.ready_responses)); + EXPECT_EQ(1, unit_list_length(&self->hsk.ready_rpcs)); EXPECT_EQ(0, list_empty(&crpc->ready_links)); EXPECT_EQ(EFAULT, -crpc->error); EXPECT_STREQ("sk->sk_data_ready invoked", unit_log_get()); @@ -2076,7 +2010,7 @@ TEST_F(homa_incoming, homa_abort_rpcs__basics) ASSERT_NE(NULL, crpc3); unit_log_clear(); homa_abort_rpcs(&self->homa, self->server_ip, 0, -EPROTONOSUPPORT); - EXPECT_EQ(2, unit_list_length(&self->hsk.ready_responses)); + EXPECT_EQ(2, unit_list_length(&self->hsk.ready_rpcs)); EXPECT_EQ(0, list_empty(&crpc1->ready_links)); EXPECT_EQ(EPROTONOSUPPORT, -crpc1->error); EXPECT_EQ(0, list_empty(&crpc2->ready_links)); @@ -2101,14 +2035,14 @@ TEST_F(homa_incoming, homa_abort_rpcs__multiple_sockets) ASSERT_NE(NULL, crpc3); unit_log_clear(); homa_abort_rpcs(&self->homa, self->server_ip, 0, -EPROTONOSUPPORT); - EXPECT_EQ(1, unit_list_length(&self->hsk.ready_responses)); + EXPECT_EQ(1, unit_list_length(&self->hsk.ready_rpcs)); EXPECT_EQ(0, list_empty(&crpc1->ready_links)); EXPECT_EQ(EPROTONOSUPPORT, -crpc1->error); EXPECT_EQ(0, list_empty(&crpc2->ready_links)); EXPECT_EQ(EPROTONOSUPPORT, -crpc2->error); EXPECT_EQ(0, list_empty(&crpc3->ready_links)); EXPECT_EQ(2, unit_list_length(&self->hsk2.active_rpcs)); - EXPECT_EQ(2, unit_list_length(&self->hsk2.ready_responses)); + EXPECT_EQ(2, unit_list_length(&self->hsk2.ready_rpcs)); } TEST_F(homa_incoming, homa_abort_rpcs__select_addr) { @@ -2128,7 +2062,7 @@ TEST_F(homa_incoming, homa_abort_rpcs__select_addr) unit_log_clear(); homa_abort_rpcs(&self->homa, self->server_ip, self->server_port, -ENOTCONN); - EXPECT_EQ(1, unit_list_length(&self->hsk.ready_responses)); + EXPECT_EQ(1, unit_list_length(&self->hsk.ready_rpcs)); EXPECT_EQ(0, list_empty(&crpc1->ready_links)); EXPECT_EQ(RPC_OUTGOING, crpc2->state); EXPECT_EQ(RPC_OUTGOING, crpc3->state); @@ -2151,7 +2085,7 @@ TEST_F(homa_incoming, homa_abort_rpcs__select_port) unit_log_clear(); homa_abort_rpcs(&self->homa, self->server_ip, self->server_port, -ENOTCONN); - EXPECT_EQ(2, unit_list_length(&self->hsk.ready_responses)); + EXPECT_EQ(2, 
unit_list_length(&self->hsk.ready_rpcs)); EXPECT_EQ(0, list_empty(&crpc1->ready_links)); EXPECT_EQ(ENOTCONN, -crpc1->error); EXPECT_EQ(RPC_OUTGOING, crpc2->state); @@ -2190,7 +2124,7 @@ TEST_F(homa_incoming, homa_abort_rpcs__ignore_dead_rpcs) EXPECT_EQ(RPC_DEAD, crpc->state); unit_log_clear(); homa_abort_rpcs(&self->homa, self->server_ip, 0, -ENOTCONN); - EXPECT_EQ(0, crpc->error); + EXPECT_EQ(-EINVAL, crpc->error); } TEST_F(homa_incoming, homa_abort_rpcs__free_server_rpc) { @@ -2250,7 +2184,7 @@ TEST_F(homa_incoming, homa_abort_sock_rpcs__rpc_already_dead) EXPECT_EQ(RPC_DEAD, crpc->state); unit_log_clear(); homa_abort_sock_rpcs(&self->hsk, -ENOTCONN); - EXPECT_EQ(0, crpc->error); + EXPECT_EQ(-EINVAL, crpc->error); } TEST_F(homa_incoming, homa_abort_sock_rpcs__free_rpcs) { @@ -2270,320 +2204,140 @@ TEST_F(homa_incoming, homa_abort_sock_rpcs__free_rpcs) EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } -TEST_F(homa_incoming, homa_register_interests__id_not_for_client_rpc) -{ - int result; - - result = homa_register_interests(&self->interest, &self->hsk, - HOMA_RECVMSG_RESPONSE, 45); - EXPECT_EQ(EINVAL, -result); -} -TEST_F(homa_incoming, homa_register_interests__no_rpc_for_id) -{ - int result; - - result = homa_register_interests(&self->interest, &self->hsk, - HOMA_RECVMSG_RESPONSE, 44); - EXPECT_EQ(EINVAL, -result); -} -TEST_F(homa_incoming, homa_register_interests__id_already_has_interest) +TEST_F(homa_incoming, homa_wait_private__rpc_not_private) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - struct homa_interest interest; ASSERT_NE(NULL, crpc); - - crpc->interest = &interest; - int result = homa_register_interests(&self->interest, &self->hsk, - HOMA_RECVMSG_RESPONSE, self->client_id); - EXPECT_EQ(EINVAL, -result); - crpc->interest = NULL; + EXPECT_EQ(EINVAL, -homa_wait_private(crpc, 0)); } -TEST_F(homa_incoming, homa_register_interests__return_response_by_id) +TEST_F(homa_incoming, homa_wait_private__basics) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - int result; ASSERT_NE(NULL, crpc); - - result = homa_register_interests(&self->interest, &self->hsk, - 0, self->client_id); - EXPECT_EQ(0, result); - EXPECT_EQ(crpc, homa_interest_get_rpc(&self->interest)); - EXPECT_EQ(1, atomic_read(&crpc->refs)); - homa_rpc_put(crpc); - homa_rpc_unlock(crpc); -} -TEST_F(homa_incoming, homa_register_interests__socket_shutdown) -{ - int result; - - self->hsk.shutdown = 1; - result = homa_register_interests(&self->interest, &self->hsk, - HOMA_RECVMSG_RESPONSE, 0); - EXPECT_EQ(ESHUTDOWN, -result); - self->hsk.shutdown = 0; + ASSERT_EQ(RPC_PKTS_READY, atomic_read(&crpc->flags)); + atomic_or(RPC_PRIVATE, &crpc->flags); + EXPECT_EQ(0, homa_wait_private(crpc, 0)); + ASSERT_EQ(RPC_PRIVATE, atomic_read(&crpc->flags)); + EXPECT_EQ(1, homa_metrics_per_cpu()->fast_wakeups); } -TEST_F(homa_incoming, homa_register_interests__specified_id_has_packets) +TEST_F(homa_incoming, homa_wait_private__rpc_has_error) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - int result; ASSERT_NE(NULL, crpc); - result = homa_register_interests(&self->interest, &self->hsk, - HOMA_RECVMSG_REQUEST, crpc->id); - EXPECT_EQ(0, result); - EXPECT_EQ(crpc, homa_interest_get_rpc(&self->interest)); - EXPECT_EQ(1, 
atomic_read(&crpc->refs)); - homa_rpc_put(crpc); - homa_rpc_unlock(crpc); + ASSERT_EQ(RPC_PKTS_READY, atomic_read(&crpc->flags)); + atomic_or(RPC_PRIVATE, &crpc->flags); + crpc->error = -ENOENT; + EXPECT_EQ(ENOENT, -homa_wait_private(crpc, 0)); + EXPECT_EQ(RPC_PKTS_READY, atomic_read(&crpc->flags) & RPC_PKTS_READY); } -TEST_F(homa_incoming, homa_register_interests__specified_id_has_error) +TEST_F(homa_incoming, homa_wait_private__copy_to_user_fails) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - int result; ASSERT_NE(NULL, crpc); - crpc->error = -EFAULT; - - result = homa_register_interests(&self->interest, &self->hsk, - HOMA_RECVMSG_REQUEST|HOMA_RECVMSG_NONBLOCKING, crpc->id); - EXPECT_EQ(0, result); - EXPECT_EQ(crpc, homa_interest_get_rpc(&self->interest)); - EXPECT_EQ(1, atomic_read(&crpc->refs)); - homa_rpc_put(crpc); - homa_rpc_unlock(crpc); + ASSERT_EQ(RPC_PKTS_READY, atomic_read(&crpc->flags)); + atomic_or(RPC_PRIVATE, &crpc->flags); + mock_copy_data_errors = 1; + EXPECT_EQ(EFAULT, -homa_wait_private(crpc, 0)); } -TEST_F(homa_incoming, homa_register_interests__specified_id_not_ready) +TEST_F(homa_incoming, homa_wait_private__nonblocking) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - int result; ASSERT_NE(NULL, crpc); - result = homa_register_interests(&self->interest, &self->hsk, - HOMA_RECVMSG_REQUEST, crpc->id); - EXPECT_EQ(0, result); - EXPECT_EQ(NULL, homa_interest_get_rpc(&self->interest)); + atomic_or(RPC_PRIVATE, &crpc->flags); + + EXPECT_EQ(EAGAIN, -homa_wait_private(crpc, 1)); } -TEST_F(homa_incoming, homa_register_interests__return_queued_response) +TEST_F(homa_incoming, homa_wait_private__signal_notify_race) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); - int result; + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 20000, 1000); ASSERT_NE(NULL, crpc); - result = homa_register_interests(&self->interest, &self->hsk, - HOMA_RECVMSG_REQUEST|HOMA_RECVMSG_RESPONSE, 0); - EXPECT_EQ(0, result); - EXPECT_EQ(crpc, homa_interest_get_rpc(&self->interest)); - EXPECT_TRUE(list_empty(&self->interest.request_links)); - EXPECT_TRUE(list_empty(&self->interest.response_links)); - EXPECT_EQ(1, atomic_read(&crpc->refs)); - homa_rpc_put(crpc); - homa_rpc_unlock(crpc); -} -TEST_F(homa_incoming, homa_register_interests__return_queued_request) -{ - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_MSG, - self->client_ip, self->server_ip, self->client_port, - 1, 20000, 100); - int result; - - ASSERT_NE(NULL, srpc); - result = homa_register_interests(&self->interest, &self->hsk, - HOMA_RECVMSG_REQUEST|HOMA_RECVMSG_RESPONSE, 0); - EXPECT_EQ(0, result); - EXPECT_EQ(srpc, homa_interest_get_rpc(&self->interest)); - EXPECT_TRUE(list_empty(&self->interest.request_links)); - EXPECT_TRUE(list_empty(&self->interest.response_links)); - EXPECT_EQ(1, atomic_read(&srpc->refs)); - homa_rpc_put(srpc); - homa_rpc_unlock(srpc); -} -TEST_F(homa_incoming, homa_register_interests__call_sk_data_ready) -{ - struct homa_rpc *srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_MSG, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 20000, 100); - struct homa_rpc *srpc2 = 
unit_server_rpc(&self->hsk, UNIT_RCVD_MSG, - self->client_ip, self->server_ip, self->client_port, - self->server_id+2, 20000, 100); - int result; - - // First time should call sk_data_ready (for 2nd RPC). - unit_log_clear(); - result = homa_register_interests(&self->interest, &self->hsk, - HOMA_RECVMSG_REQUEST|HOMA_RECVMSG_RESPONSE, 0); - EXPECT_EQ(0, result); - EXPECT_EQ(srpc1, homa_interest_get_rpc(&self->interest)); - EXPECT_STREQ("sk->sk_data_ready invoked", unit_log_get()); - EXPECT_EQ(1, atomic_read(&srpc1->refs)); - homa_rpc_put(srpc1); - homa_rpc_unlock(srpc1); + atomic_or(RPC_PRIVATE, &crpc->flags); + self->homa.poll_usecs = 0; + unit_hook_register(handoff_hook); + hook_rpc = crpc; + hook_count = 1; + mock_prepare_to_wait_errors = 1; - // Second time shouldn't call sk_data_ready (no more RPCs). - unit_log_clear(); - result = homa_register_interests(&self->interest, &self->hsk, - HOMA_RECVMSG_REQUEST|HOMA_RECVMSG_RESPONSE - |HOMA_RECVMSG_NONBLOCKING, 0); - EXPECT_EQ(0, result); - EXPECT_EQ(srpc2, homa_interest_get_rpc(&self->interest)); - EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(1, atomic_read(&srpc2->refs)); - homa_rpc_put(srpc2); - homa_rpc_unlock(srpc2); + EXPECT_EQ(ENOENT, -homa_wait_private(crpc, 0)); + EXPECT_EQ(1, homa_metrics_per_cpu()->slow_wakeups); + EXPECT_EQ(0, mock_prepare_to_wait_errors); } -TEST_F(homa_incoming, homa_wait_for_message__rpc_from_register_interests) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); - struct homa_rpc *rpc; - - ASSERT_NE(NULL, crpc); - rpc = homa_wait_for_message(&self->hsk, - HOMA_RECVMSG_RESPONSE|HOMA_RECVMSG_NONBLOCKING, - self->client_id); - EXPECT_EQ(crpc, rpc); - homa_rpc_unlock(crpc); -} -TEST_F(homa_incoming, homa_wait_for_message__error_from_register_interests) +TEST_F(homa_incoming, homa_wait_shared__socket_already_shutdown) { - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); struct homa_rpc *rpc; - ASSERT_NE(NULL, crpc); self->hsk.shutdown = 1; - rpc = homa_wait_for_message(&self->hsk, - HOMA_RECVMSG_RESPONSE|HOMA_RECVMSG_NONBLOCKING, - self->client_id); + + rpc = homa_wait_shared(&self->hsk, 0); + EXPECT_TRUE(IS_ERR(rpc)); EXPECT_EQ(ESHUTDOWN, -PTR_ERR(rpc)); self->hsk.shutdown = 0; } -TEST_F(homa_incoming, homa_wait_for_message__rpc_arrives_while_polling) +TEST_F(homa_incoming, homa_wait_shared__rpc_already_ready) { - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); - struct homa_rpc *rpc; - - ASSERT_NE(NULL, crpc1); - hook_rpc = crpc1; - poll_count = 5; - unit_hook_register(poll_hook); - unit_log_clear(); - rpc = homa_wait_for_message(&self->hsk, 0, self->client_id); - EXPECT_EQ(crpc1, rpc); - EXPECT_EQ(NULL, crpc1->interest); - EXPECT_STREQ("wake_up_process pid 0", unit_log_get()); - EXPECT_EQ(0, self->hsk.dead_skbs); - homa_rpc_unlock(rpc); -} -TEST_F(homa_incoming, homa_wait_for_message__nothing_ready_nonblocking) -{ - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); - struct homa_rpc *rpc; - - unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, 20000, 1600); - ASSERT_NE(NULL, crpc1); - - rpc = 
homa_wait_for_message(&self->hsk, HOMA_RECVMSG_NONBLOCKING, - self->client_id); - EXPECT_EQ(EAGAIN, -PTR_ERR(rpc)); -} -TEST_F(homa_incoming, homa_wait_for_message__rpc_arrives_while_sleeping) -{ - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); struct homa_rpc *rpc; - ASSERT_NE(NULL, crpc1); - - /* Also, check to see that reaping occurs before sleeping. */ - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, 20000, 20000); - self->homa.reap_limit = 5; - homa_rpc_end(crpc2); -#ifndef __STRIP__ /* See strip.py */ - EXPECT_EQ(31, self->hsk.dead_skbs); -#else /* See strip.py */ - EXPECT_EQ(30, self->hsk.dead_skbs); -#endif /* See strip.py */ - unit_log_clear(); + ASSERT_NE(NULL, crpc); + ASSERT_EQ(RPC_PKTS_READY, atomic_read(&crpc->flags)); - hook_rpc = crpc1; - unit_hook_register(handoff_hook); - rpc = homa_wait_for_message(&self->hsk, 0, self->client_id); - EXPECT_EQ(crpc1, rpc); - EXPECT_EQ(NULL, crpc1->interest); - EXPECT_STREQ("reaped 1236; wake_up_process pid 0; 0 in ready_requests, 0 in ready_responses, 0 in request_interests, 0 in response_interests", - unit_log_get()); - EXPECT_EQ(0, self->hsk.dead_skbs); + rpc = homa_wait_shared(&self->hsk, 0); + ASSERT_FALSE(IS_ERR(rpc)); + EXPECT_EQ(crpc, rpc); + EXPECT_EQ(0, crpc->msgin.packets.qlen); + EXPECT_EQ(1, homa_metrics_per_cpu()->fast_wakeups); homa_rpc_unlock(rpc); } -TEST_F(homa_incoming, homa_wait_for_message__rpc_arrives_after_giving_up) +TEST_F(homa_incoming, homa_wait_shared__multiple_rpcs_already_ready) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); + struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, + self->server_port, self->client_id+2, 1000, 1600); struct homa_rpc *rpc; ASSERT_NE(NULL, crpc); + ASSERT_NE(NULL, crpc2); - hook_rpc = crpc; - unit_hook_register(handoff_hook2); unit_log_clear(); - rpc = homa_wait_for_message(&self->hsk, - HOMA_RECVMSG_NONBLOCKING|HOMA_RECVMSG_RESPONSE, 0); - ASSERT_EQ(crpc, rpc); - EXPECT_EQ(NULL, crpc->interest); - EXPECT_EQ(ETIMEDOUT, -rpc->error); + rpc = homa_wait_shared(&self->hsk, 0); + ASSERT_FALSE(IS_ERR(rpc)); + EXPECT_EQ(crpc, rpc); homa_rpc_unlock(rpc); + EXPECT_SUBSTR("sk->sk_data_ready invoked", unit_log_get()); } -TEST_F(homa_incoming, homa_wait_for_message__handoff_rpc_then_delete_after_giving_up) +TEST_F(homa_incoming, homa_wait_shared__nonblocking) { - // A key thing this test does it to ensure that RPC_HANDING_OFF - // gets cleared even though the RPC has been deleted. - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); struct homa_rpc *rpc; - ASSERT_NE(NULL, crpc); - - // Prevent the RPC from being reaped during the test. 
- homa_rpc_hold(crpc); - - hook_rpc = crpc; - hook3_count = 0; - unit_hook_register(handoff_hook3); - unit_log_clear(); - rpc = homa_wait_for_message(&self->hsk, - HOMA_RECVMSG_NONBLOCKING|HOMA_RECVMSG_RESPONSE, 0); + rpc = homa_wait_shared(&self->hsk, 1); + EXPECT_TRUE(IS_ERR(rpc)); EXPECT_EQ(EAGAIN, -PTR_ERR(rpc)); - EXPECT_EQ(RPC_DEAD, crpc->state); - homa_rpc_put(crpc); } -TEST_F(homa_incoming, homa_wait_for_message__explicit_rpc_deleted_while_sleeping) +TEST_F(homa_incoming, homa_wait_shared__signal_race_with_handoff) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, @@ -2591,279 +2345,125 @@ TEST_F(homa_incoming, homa_wait_for_message__explicit_rpc_deleted_while_sleeping struct homa_rpc *rpc; ASSERT_NE(NULL, crpc); - unit_log_clear(); + crpc->error = -ENOENT; + unit_hook_register(handoff_hook); hook_rpc = crpc; - unit_hook_register(delete_hook); - rpc = homa_wait_for_message(&self->hsk, HOMA_RECVMSG_RESPONSE, - self->client_id); - EXPECT_EQ(EINVAL, -PTR_ERR(rpc)); -} -TEST_F(homa_incoming, homa_wait_for_message__socket_shutdown_while_sleeping) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); - struct homa_rpc *rpc; + hook_count = 2; + mock_prepare_to_wait_errors = 1; - ASSERT_NE(NULL, crpc); - unit_log_clear(); - hook_hsk = &self->hsk; - unit_hook_register(shutdown_hook); - rpc = homa_wait_for_message(&self->hsk, - HOMA_RECVMSG_RESPONSE|HOMA_RECVMSG_REQUEST, 0); - EXPECT_EQ(ESHUTDOWN, -PTR_ERR(rpc)); + rpc = homa_wait_shared(&self->hsk, 0); + EXPECT_EQ(crpc, rpc); + EXPECT_EQ(ENOENT, -rpc->error); + EXPECT_EQ(1, homa_metrics_per_cpu()->slow_wakeups); + homa_rpc_unlock(rpc); } -TEST_F(homa_incoming, homa_wait_for_message__copy_to_user) +TEST_F(homa_incoming, homa_wait_shared__socket_shutdown_while_blocked) { - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); struct homa_rpc *rpc; - ASSERT_NE(NULL, crpc); - mock_copy_to_user_dont_copy = -1; - unit_log_clear(); - hook_hsk = &self->hsk; - rpc = homa_wait_for_message(&self->hsk, - HOMA_RECVMSG_RESPONSE|HOMA_RECVMSG_NONBLOCKING, 0); - EXPECT_EQ(EAGAIN, -PTR_ERR(rpc)); - EXPECT_EQ(0, atomic_read(&crpc->flags) & RPC_PKTS_READY); -} -TEST_F(homa_incoming, homa_wait_for_message__rpc_freed_after_matching) -{ - /* Arrange for 2 RPCs to be ready, but delete the first one after - * it has matched; this should cause the second one to be matched. 
- */ - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, 20000, 1600); - struct homa_rpc *rpc; + unit_hook_register(wait_hook4); + hook_shutdown_hsk = &self->hsk; + hook_count = 4; - ASSERT_NE(NULL, crpc1); - ASSERT_NE(NULL, crpc2); - unit_log_clear(); - - hook_rpc = crpc1; - unit_hook_register(match_free_hook); - rpc = homa_wait_for_message(&self->hsk, - HOMA_RECVMSG_RESPONSE|HOMA_RECVMSG_NONBLOCKING, 0); - EXPECT_EQ(RPC_DEAD, crpc1->state); - EXPECT_EQ(crpc2, rpc); - homa_rpc_unlock(rpc); + rpc = homa_wait_shared(&self->hsk, 0); + EXPECT_TRUE(IS_ERR(rpc)); + EXPECT_EQ(ESHUTDOWN, -PTR_ERR(rpc)); + EXPECT_EQ(1, self->hsk.shutdown); + self->hsk.shutdown = 0; } -TEST_F(homa_incoming, homa_wait_for_message__copy_to_user_fails) +TEST_F(homa_incoming, homa_wait_shared__copy_to_user_fails) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); struct homa_rpc *rpc; ASSERT_NE(NULL, crpc); - unit_log_clear(); + ASSERT_EQ(RPC_PKTS_READY, atomic_read(&crpc->flags)); mock_copy_data_errors = 1; - hook_hsk = &self->hsk; - rpc = homa_wait_for_message(&self->hsk, - HOMA_RECVMSG_RESPONSE|HOMA_RECVMSG_NONBLOCKING, 0); - ASSERT_FALSE(IS_ERR(rpc)); + rpc = homa_wait_shared(&self->hsk, 0); EXPECT_EQ(crpc, rpc); - EXPECT_EQ(RPC_PKTS_READY, atomic_read(&crpc->flags) & RPC_PKTS_READY); EXPECT_EQ(EFAULT, -rpc->error); homa_rpc_unlock(rpc); } -TEST_F(homa_incoming, homa_wait_for_message__message_complete) +TEST_F(homa_incoming, homa_wait_shared__rpc_has_error) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 2000); + self->server_port, self->client_id, 20000, 1600); struct homa_rpc *rpc; ASSERT_NE(NULL, crpc); - mock_copy_to_user_dont_copy = -1; - unit_log_clear(); + EXPECT_EQ(2, crpc->msgin.packets.qlen); + crpc->error = -ENOENT; - hook_hsk = &self->hsk; - rpc = homa_wait_for_message(&self->hsk, - HOMA_RECVMSG_RESPONSE|HOMA_RECVMSG_NONBLOCKING, 0); - ASSERT_FALSE(IS_ERR(rpc)); + rpc = homa_wait_shared(&self->hsk, 0); EXPECT_EQ(crpc, rpc); - EXPECT_EQ(0, atomic_read(&crpc->flags) & RPC_PKTS_READY); + EXPECT_EQ(2, crpc->msgin.packets.qlen); homa_rpc_unlock(rpc); } -TEST_F(homa_incoming, homa_wait_for_message__signal) -{ - struct homa_rpc *rpc; - - mock_signal_pending = 1; - rpc = homa_wait_for_message(&self->hsk, HOMA_RECVMSG_REQUEST, 0); - EXPECT_EQ(EINTR, -PTR_ERR(rpc)); -} - -TEST_F(homa_incoming, homa_choose_interest__empty_list) -{ - struct homa_interest *result = homa_choose_interest(&self->homa, - &self->hsk.request_interests, - offsetof(struct homa_interest, request_links)); - - EXPECT_EQ(NULL, result); -} -#ifndef __STRIP__ /* See strip.py */ -TEST_F(homa_incoming, homa_choose_interest__find_idle_core) -{ - struct homa_interest interest1, interest2, interest3; - - homa_interest_init(&interest1); - interest1.core = 1; - list_add_tail(&interest1.request_links, &self->hsk.request_interests); - homa_interest_init(&interest2); - interest2.core = 2; - list_add_tail(&interest2.request_links, &self->hsk.request_interests); - homa_interest_init(&interest3); - interest3.core = 3; - 
list_add_tail(&interest3.request_links, &self->hsk.request_interests); - - mock_ns = 5000; - self->homa.busy_ns = 1000; - per_cpu(homa_offload_core, 1).last_active = 4100; - per_cpu(homa_offload_core, 2).last_active = 3500; - per_cpu(homa_offload_core, 3).last_active = 2000; - - struct homa_interest *result = homa_choose_interest(&self->homa, - &self->hsk.request_interests, - offsetof(struct homa_interest, request_links)); - ASSERT_NE(NULL, result); - EXPECT_EQ(2, result->core); - INIT_LIST_HEAD(&self->hsk.request_interests); -} -TEST_F(homa_incoming, homa_choose_interest__all_cores_busy) -{ - struct homa_interest interest1, interest2, interest3; - - homa_interest_init(&interest1); - interest1.core = 1; - list_add_tail(&interest1.request_links, &self->hsk.request_interests); - homa_interest_init(&interest2); - interest2.core = 2; - list_add_tail(&interest2.request_links, &self->hsk.request_interests); - homa_interest_init(&interest3); - interest3.core = 3; - list_add_tail(&interest3.request_links, &self->hsk.request_interests); - - mock_ns = 5000; - self->homa.busy_ns = 1000; - per_cpu(homa_offload_core, 1).last_active = 4100; - per_cpu(homa_offload_core, 2).last_active = 4001; - per_cpu(homa_offload_core, 3).last_active = 4800; - - struct homa_interest *result = homa_choose_interest(&self->homa, - &self->hsk.request_interests, - offsetof(struct homa_interest, request_links)); - INIT_LIST_HEAD(&self->hsk.request_interests); - ASSERT_NE(NULL, result); - EXPECT_EQ(1, result->core); -} -#endif /* See strip.py */ - -TEST_F(homa_incoming, homa_rpc_handoff__handoff_already_in_progress) +TEST_F(homa_incoming, homa_wait_shared__rpc_dead) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - struct homa_interest interest; - - ASSERT_NE(NULL, crpc); - EXPECT_EQ(NULL, crpc->interest); - unit_log_clear(); - - homa_interest_init(&interest); - interest.thread = &mock_task; - interest.reg_rpc = crpc; - crpc->interest = &interest; - atomic_or(RPC_HANDING_OFF, &crpc->flags); - homa_rpc_handoff(crpc); - crpc->interest = NULL; - EXPECT_EQ(NULL, homa_interest_get_rpc(&interest)); - EXPECT_STREQ("", unit_log_get()); - atomic_andnot(RPC_HANDING_OFF, &crpc->flags); -} -TEST_F(homa_incoming, homa_rpc_handoff__rpc_already_enqueued) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, + struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - struct homa_interest interest; + struct homa_rpc *rpc; ASSERT_NE(NULL, crpc); - EXPECT_EQ(NULL, crpc->interest); - unit_log_clear(); - - /* First handoff enqueues the RPC. */ - homa_rpc_handoff(crpc); - EXPECT_FALSE(list_empty(&crpc->ready_links)); - unit_log_clear(); - - /* Second handoff does nothing, even though an interest is available. 
*/ + ASSERT_NE(NULL, crpc2); + homa_rpc_end(crpc); - homa_interest_init(&interest); - interest.thread = &mock_task; - interest.reg_rpc = crpc; - crpc->interest = &interest; - atomic_or(RPC_HANDING_OFF, &crpc->flags); - homa_rpc_handoff(crpc); - crpc->interest = NULL; - EXPECT_EQ(NULL, homa_interest_get_rpc(&interest)); - EXPECT_STREQ("", unit_log_get()); - atomic_andnot(RPC_HANDING_OFF, &crpc->flags); + rpc = homa_wait_shared(&self->hsk, 0); + EXPECT_EQ(crpc2, rpc); + homa_rpc_unlock(rpc); } -TEST_F(homa_incoming, homa_rpc_handoff__interest_on_rpc) + +TEST_F(homa_incoming, homa_rpc_handoff__private_rpc) { + struct homa_interest interest; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - struct homa_interest interest; ASSERT_NE(NULL, crpc); - EXPECT_EQ(NULL, crpc->interest); + atomic_or(RPC_PRIVATE, &crpc->flags); + homa_interest_init_private(&interest, crpc); + mock_log_wakeups = 1; unit_log_clear(); - homa_interest_init(&interest); - interest.thread = &mock_task; - interest.reg_rpc = crpc; - crpc->interest = &interest; homa_rpc_handoff(crpc); - crpc->interest = NULL; - EXPECT_EQ(crpc, homa_interest_get_rpc(&interest)); - homa_rpc_put(crpc); - EXPECT_EQ(NULL, interest.reg_rpc); - EXPECT_EQ(NULL, crpc->interest); - EXPECT_STREQ("wake_up_process pid 0", unit_log_get()); - atomic_andnot(RPC_HANDING_OFF, &crpc->flags); + EXPECT_STREQ("wake_up", unit_log_get()); + EXPECT_EQ(1, atomic_read(&interest.ready)); + EXPECT_TRUE(list_empty(&self->hsk.ready_rpcs)); + homa_interest_unlink_private(&interest); } -TEST_F(homa_incoming, homa_rpc_handoff__response_interests) +TEST_F(homa_incoming, homa_rpc_handoff__handoff_to_shared_interest) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 1600); - struct homa_interest interest; + struct homa_interest interest1, interest2; ASSERT_NE(NULL, crpc); - EXPECT_EQ(NULL, crpc->interest); + homa_interest_init_shared(&interest1, &self->hsk); + homa_interest_init_shared(&interest2, &self->hsk); + EXPECT_EQ(2, unit_list_length(&self->hsk.interests)); unit_log_clear(); - homa_interest_init(&interest); - interest.thread = &mock_task; - list_add_tail(&interest.response_links, &self->hsk.response_interests); homa_rpc_handoff(crpc); - EXPECT_EQ(crpc, homa_interest_get_rpc(&interest)); + EXPECT_EQ(1, unit_list_length(&self->hsk.interests)); + EXPECT_EQ(0, atomic_read(&interest1.ready)); + EXPECT_EQ(1, atomic_read(&interest2.ready)); + EXPECT_EQ(crpc, interest2.rpc); homa_rpc_put(crpc); - EXPECT_EQ(0, unit_list_length(&self->hsk.response_interests)); - EXPECT_STREQ("wake_up_process pid 0", unit_log_get()); - atomic_andnot(RPC_HANDING_OFF, &crpc->flags); + homa_interest_unlink_shared(&interest1); + EXPECT_EQ(1, homa_metrics_per_cpu()->handoffs_thread_waiting); } -TEST_F(homa_incoming, homa_rpc_handoff__queue_on_ready_responses) +TEST_F(homa_incoming, homa_rpc_handoff__queue_rpc_on_socket) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, @@ -2871,98 +2471,21 @@ TEST_F(homa_incoming, homa_rpc_handoff__queue_on_ready_responses) ASSERT_NE(NULL, crpc); unit_log_clear(); - homa_rpc_handoff(crpc); - EXPECT_STREQ("sk->sk_data_ready invoked", unit_log_get()); - EXPECT_EQ(1, unit_list_length(&self->hsk.ready_responses)); -} -TEST_F(homa_incoming, homa_rpc_handoff__request_interests) -{ - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, 
UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 20000, 100); - struct homa_interest interest; - - ASSERT_NE(NULL, srpc); - unit_log_clear(); - homa_interest_init(&interest); - interest.thread = &mock_task; - list_add_tail(&interest.request_links, &self->hsk.request_interests); - homa_rpc_handoff(srpc); - EXPECT_EQ(srpc, homa_interest_get_rpc(&interest)); - homa_rpc_put(srpc); - EXPECT_EQ(0, unit_list_length(&self->hsk.request_interests)); - EXPECT_STREQ("wake_up_process pid 0", unit_log_get()); - atomic_andnot(RPC_HANDING_OFF, &srpc->flags); -} -TEST_F(homa_incoming, homa_rpc_handoff__queue_on_ready_requests) -{ - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - 1, 20000, 100); - - ASSERT_NE(NULL, srpc); - unit_log_clear(); - - homa_rpc_handoff(srpc); - EXPECT_STREQ("sk->sk_data_ready invoked", unit_log_get()); - EXPECT_EQ(1, unit_list_length(&self->hsk.ready_requests)); -} -TEST_F(homa_incoming, homa_rpc_handoff__detach_interest) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); - struct homa_interest interest; - - ASSERT_NE(NULL, crpc); - EXPECT_EQ(NULL, crpc->interest); - unit_log_clear(); - - homa_interest_init(&interest); - interest.thread = &mock_task; - interest.reg_rpc = crpc; - crpc->interest = &interest; - list_add_tail(&interest.response_links, &self->hsk.response_interests); - list_add_tail(&interest.request_links, &self->hsk.request_interests); - EXPECT_EQ(1, unit_list_length(&self->hsk.response_interests)); - EXPECT_EQ(1, unit_list_length(&self->hsk.request_interests)); + mock_log_wakeups = 1; + /* First call should queue RPC. */ homa_rpc_handoff(crpc); - crpc->interest = NULL; - EXPECT_EQ(crpc, homa_interest_get_rpc(&interest)); - homa_rpc_put(crpc); - EXPECT_EQ(NULL, interest.reg_rpc); - EXPECT_EQ(NULL, crpc->interest); - EXPECT_EQ(0, unit_list_length(&self->hsk.response_interests)); - EXPECT_EQ(0, unit_list_length(&self->hsk.request_interests)); - atomic_andnot(RPC_HANDING_OFF, &crpc->flags); -} -#ifndef __STRIP__ /* See strip.py */ -TEST_F(homa_incoming, homa_rpc_handoff__update_last_app_active) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 20000, 1600); - struct homa_interest interest; + EXPECT_STREQ("sk->sk_data_ready invoked", unit_log_get()); + EXPECT_FALSE(list_empty(&self->hsk.ready_rpcs)); - ASSERT_NE(NULL, crpc); - EXPECT_EQ(NULL, crpc->interest); + /* Calling again should do nothing (already queued). 
*/ unit_log_clear(); - - homa_interest_init(&interest); - interest.thread = &mock_task; - interest.reg_rpc = crpc; - interest.core = 2; - crpc->interest = &interest; - mock_ns = 10000; - per_cpu(homa_offload_core, 2).last_app_active = 444; homa_rpc_handoff(crpc); - homa_rpc_put(crpc); - EXPECT_STREQ("wake_up_process pid 0", unit_log_get()); - EXPECT_EQ(10000, per_cpu(homa_offload_core, 2).last_app_active); - atomic_andnot(RPC_HANDING_OFF, &crpc->flags); + EXPECT_STREQ("", unit_log_get()); + EXPECT_FALSE(list_empty(&self->hsk.ready_rpcs)); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_incoming, homa_incoming_sysctl_changed__grant_nonfifo) { self->homa.fifo_grant_increment = 10000; diff --git a/test/unit_homa_interest.c b/test/unit_homa_interest.c new file mode 100644 index 00000000..3f52f036 --- /dev/null +++ b/test/unit_homa_interest.c @@ -0,0 +1,313 @@ +// SPDX-License-Identifier: BSD-2-Clause + +#include "homa_impl.h" +#include "homa_interest.h" +#include "homa_sock.h" + +#ifndef __STRIP__ /* See strip.py */ +#include "homa_offload.h" +#endif /* See strip.py */ + +#define KSELFTEST_NOT_MAIN 1 +#include "kselftest_harness.h" +#include "ccutils.h" +#include "mock.h" +#include "utils.h" + +static int hook_count; +static struct homa_interest *hook_interest; + +static void log_hook(char *id) +{ + if (strcmp(id, "unlock") == 0 || + strcmp(id, "schedule") == 0) { + unit_log_printf("; ", "%s", id); + } +} + +static void notify_hook(char *id) +{ + if (strcmp(id, "schedule") != 0 && + strcmp(id, "do_wait_intr_irq") != 0 && + strcmp(id, "prepare_to_wait") != 0) + return; + if (hook_count <= 0) + return; + hook_count--; + if (hook_count != 0) + return; + atomic_set(&hook_interest->ready, 1); +} + +FIXTURE(homa_interest) { + struct homa homa; + struct homa_sock hsk; + struct in6_addr client_ip; + int client_port; + struct in6_addr server_ip; + int server_port; + u64 client_id; + u64 server_id; + union sockaddr_in_union server_addr; +}; +FIXTURE_SETUP(homa_interest) +{ + homa_init(&self->homa); + mock_sock_init(&self->hsk, &self->homa, 0); + self->client_ip = unit_get_in_addr("196.168.0.1"); + self->client_port = 40000; + self->server_ip = unit_get_in_addr("1.2.3.4"); + self->server_port = 99; + self->client_id = 1234; + self->server_id = 1235; + self->server_addr.in6.sin6_family = self->hsk.inet.sk.sk_family; + self->server_addr.in6.sin6_addr = self->server_ip; + self->server_addr.in6.sin6_port = htons(self->server_port); + unit_log_clear(); +} +FIXTURE_TEARDOWN(homa_interest) +{ + homa_destroy(&self->homa); + unit_teardown(); +} + +TEST_F(homa_interest, homa_interest_init_shared_and_unlink_shared) +{ + struct homa_interest interests[4]; + int i; + + for (i = 0; i < 4; i++) { + homa_interest_init_shared(&interests[i], &self->hsk); + EXPECT_EQ(i + 1, unit_list_length(&self->hsk.interests)); + } + EXPECT_EQ(3, list_first_entry(&self->hsk.interests, + struct homa_interest, links) + - interests); + homa_interest_unlink_shared(&interests[1]); + EXPECT_EQ(3, unit_list_length(&self->hsk.interests)); + homa_interest_unlink_shared(&interests[0]); + EXPECT_EQ(2, unit_list_length(&self->hsk.interests)); + homa_interest_unlink_shared(&interests[3]); + EXPECT_EQ(1, unit_list_length(&self->hsk.interests)); + homa_interest_unlink_shared(&interests[2]); + EXPECT_EQ(0, unit_list_length(&self->hsk.interests)); +} + +TEST_F(homa_interest, homa_interest_init_private) +{ + struct homa_interest interest; + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + 
&self->server_ip, self->server_port, + self->client_id, 20000, 1600); + + /* First call succeeds. */ + EXPECT_EQ(0, homa_interest_init_private(&interest, crpc)); + EXPECT_EQ(&interest, crpc->private_interest); + EXPECT_EQ(crpc, interest.rpc); + + /* Second call fails (rpc already has interest). */ + EXPECT_EQ(EINVAL, -homa_interest_init_private(&interest, crpc)); + + homa_interest_unlink_private(&interest); +} + +TEST_F(homa_interest, homa_interest_unlink_private) +{ + struct homa_interest interest, interest2; + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->server_port, + self->client_id, 20000, 1600); + + EXPECT_EQ(0, homa_interest_init_private(&interest, crpc)); + homa_interest_unlink_private(&interest); + EXPECT_EQ(NULL, crpc->private_interest); + + /* Second call does nothing (rpc doesn't refer to interest). */ + crpc->private_interest = &interest2; + homa_interest_unlink_private(&interest); + EXPECT_EQ(&interest2, crpc->private_interest); + + crpc->private_interest = NULL; +} + +TEST_F(homa_interest, homa_interest_wait__already_ready) +{ + struct homa_interest interest; + + homa_interest_init_shared(&interest, &self->hsk); + atomic_set(&interest.ready, 1); + EXPECT_EQ(0, homa_interest_wait(&interest, 0)); + + homa_interest_unlink_shared(&interest); +} +TEST_F(homa_interest, homa_interest_wait__call_schedule) +{ + struct homa_interest interest; + + homa_interest_init_shared(&interest, &self->hsk); + + self->homa.poll_usecs = 100; + unit_hook_register(log_hook); + unit_hook_register(notify_hook); + hook_interest = &interest; + hook_count = 2; + unit_log_clear(); + + EXPECT_EQ(0, homa_interest_wait(&interest, 0)); + ASSERT_STREQ("schedule; schedule", unit_log_get()); + homa_interest_unlink_shared(&interest); +} +TEST_F(homa_interest, homa_interest_wait__call_homa_rpc_reap) +{ + struct homa_interest interest; + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->server_port, + self->client_id, 20000, 1600); + ASSERT_NE(NULL, crpc); + homa_rpc_end(crpc); + EXPECT_EQ(15, self->hsk.dead_skbs); + homa_interest_init_shared(&interest, &self->hsk); + self->homa.poll_usecs = 0; + + EXPECT_EQ(EAGAIN, -homa_interest_wait(&interest, 1)); + EXPECT_EQ(0, self->hsk.dead_skbs); + homa_interest_unlink_shared(&interest); +} +TEST_F(homa_interest, homa_interest_wait__nonblocking) +{ + struct homa_interest interest; + + homa_interest_init_shared(&interest, &self->hsk); + self->homa.poll_usecs = 100; + + EXPECT_EQ(EAGAIN, -homa_interest_wait(&interest, 1)); + homa_interest_unlink_shared(&interest); +} +TEST_F(homa_interest, homa_interest_wait__poll_then_block) +{ + struct homa_interest interest; + + homa_interest_init_shared(&interest, &self->hsk); + self->homa.poll_usecs = 3; + mock_set_clock_vals(1000, 2000, 3999, 4000, 0); + mock_ns = 4000; + unit_hook_register(notify_hook); + hook_interest = &interest; + hook_count = 4; + + EXPECT_EQ(0, -homa_interest_wait(&interest, 0)); + EXPECT_EQ(3000, homa_metrics_per_cpu()->poll_ns); + EXPECT_EQ(0, homa_metrics_per_cpu()->blocked_ns); + EXPECT_EQ(0, homa_metrics_per_cpu()->fast_wakeups); + EXPECT_EQ(1, homa_metrics_per_cpu()->slow_wakeups); + homa_interest_unlink_shared(&interest); +} +TEST_F(homa_interest, homa_interest_wait__interrupted_by_signal) +{ + struct homa_interest interest; + + homa_interest_init_shared(&interest, &self->hsk); + mock_prepare_to_wait_errors = 1; + self->homa.poll_usecs = 0; + + EXPECT_EQ(EINTR, 
-homa_interest_wait(&interest, 0)); + homa_interest_unlink_shared(&interest); +} +TEST_F(homa_interest, homa_interest_wait__time_metrics) +{ + struct homa_interest interest; + + homa_interest_init_shared(&interest, &self->hsk); + self->homa.poll_usecs = 0; + mock_set_clock_vals(1000, 1500, 3000, 3200, 0); + mock_ns = 4000; + unit_hook_register(notify_hook); + hook_interest = &interest; + hook_count = 4; + + EXPECT_EQ(0, -homa_interest_wait(&interest, 0)); + EXPECT_EQ(700, homa_metrics_per_cpu()->poll_ns); + EXPECT_EQ(1500, homa_metrics_per_cpu()->blocked_ns); + homa_interest_unlink_shared(&interest); +} + +TEST_F(homa_interest, homa_interest_wait__notify_private) +{ + struct homa_interest interest; + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->server_port, + self->client_id, 20000, 1600); + ASSERT_NE(NULL, crpc); + + homa_interest_init_private(&interest, crpc); + EXPECT_EQ(0, atomic_read(&interest.ready)); + unit_log_clear(); + mock_log_wakeups = 1; + + /* First call: RPC has an interest. */ + homa_interest_notify_private(crpc); + EXPECT_EQ(1, atomic_read(&interest.ready)); + EXPECT_STREQ("wake_up", unit_log_get()); + homa_interest_unlink_private(&interest); + + /* Second call: No interest on RPC. */ + unit_log_clear(); + homa_interest_notify_private(crpc); + EXPECT_STREQ("", unit_log_get()); +} + +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_interest, homa_choose_interest__find_idle_core) +{ + struct homa_interest interest1, interest2, interest3; + + homa_interest_init_shared(&interest1, &self->hsk); + interest1.core = 1; + homa_interest_init_shared(&interest2, &self->hsk); + interest2.core = 2; + homa_interest_init_shared(&interest3, &self->hsk); + interest3.core = 3; + + mock_ns = 5000; + self->homa.busy_ns = 1000; + per_cpu(homa_offload_core, 1).last_active = 2000; + per_cpu(homa_offload_core, 2).last_active = 3500; + per_cpu(homa_offload_core, 3).last_active = 4100; + + struct homa_interest *result = homa_choose_interest(&self->hsk); + EXPECT_EQ(&interest2, result); + EXPECT_EQ(2, result->core); + EXPECT_EQ(1, homa_metrics_per_cpu()->handoffs_alt_thread); + INIT_LIST_HEAD(&self->hsk.interests); +} +TEST_F(homa_interest, homa_choose_interest__all_cores_busy) +{ + struct homa_interest interest1, interest2, interest3; + + homa_interest_init_shared(&interest1, &self->hsk); + interest1.core = 1; + homa_interest_init_shared(&interest2, &self->hsk); + interest2.core = 2; + homa_interest_init_shared(&interest3, &self->hsk); + interest3.core = 3; + + mock_ns = 5000; + self->homa.busy_ns = 1000; + per_cpu(homa_offload_core, 1).last_active = 4100; + per_cpu(homa_offload_core, 2).last_active = 4001; + per_cpu(homa_offload_core, 3).last_active = 4800; + + struct homa_interest *result = homa_choose_interest(&self->hsk); + EXPECT_EQ(3, result->core); + EXPECT_EQ(0, homa_metrics_per_cpu()->handoffs_alt_thread); + INIT_LIST_HEAD(&self->hsk.interests); +} +#endif /* See strip.py */ diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 34f7210c..9f0ff417 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -85,8 +85,7 @@ FIXTURE_SETUP(homa_plumbing) self->recvmsg_hdr.msg_controllen = sizeof(self->recvmsg_args); self->recvmsg_hdr.msg_flags = 0; memset(&self->recvmsg_args, 0, sizeof(self->recvmsg_args)); - self->recvmsg_args.flags = HOMA_RECVMSG_REQUEST - | HOMA_RECVMSG_RESPONSE | HOMA_RECVMSG_NONBLOCKING; + self->recvmsg_args.flags = HOMA_RECVMSG_NONBLOCKING; 
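+	/* With id 0, recvmsg now performs a shared wait for any ready
+	 * RPC; NONBLOCKING keeps these tests from hanging when nothing
+	 * is ready.
+	 */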
self->send_vec[0].iov_base = self->buffer; self->send_vec[0].iov_len = 100; self->send_vec[1].iov_base = self->buffer + 1000; @@ -474,6 +473,20 @@ TEST_F(homa_plumbing, homa_sendmsg__cant_read_args) &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } +TEST_F(homa_plumbing, homa_sendmsg__illegal_flag) +{ + self->sendmsg_args.flags = 2; + EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, + &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); +} +TEST_F(homa_plumbing, homa_sendmsg__nonzero_reserved_field) +{ + self->sendmsg_args.reserved = 0x1000; + EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, + &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); +} TEST_F(homa_plumbing, homa_sendmsg__bad_address_family) { self->client_addr.in4.sin_family = 1; @@ -685,13 +698,35 @@ TEST_F(homa_plumbing, homa_recvmsg__error_in_release_buffers) EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, 0, 0, &self->recvmsg_hdr.msg_namelen)); } -TEST_F(homa_plumbing, homa_recvmsg__error_in_homa_wait_for_message) +TEST_F(homa_plumbing, homa_recvmsg__private_rpc_doesnt_exist) { - self->hsk.shutdown = true; - EXPECT_EQ(ESHUTDOWN, -homa_recvmsg(&self->hsk.inet.sk, - &self->recvmsg_hdr, 0, 0, - &self->recvmsg_hdr.msg_namelen)); - self->hsk.shutdown = false; + self->recvmsg_args.id = 99; + + EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, + 0, 0, &self->recvmsg_hdr.msg_namelen)); +} +TEST_F(homa_plumbing, homa_recvmsg__error_from_homa_wait_private) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, + self->client_ip, self->server_ip, self->server_port, + self->client_id, 100, 2000); + + EXPECT_NE(NULL, crpc); + atomic_or(RPC_PRIVATE, &crpc->flags); + + self->recvmsg_args.id = crpc->id; + self->recvmsg_args.flags = HOMA_RECVMSG_NONBLOCKING; + + EXPECT_EQ(EAGAIN, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, + 0, 0, &self->recvmsg_hdr.msg_namelen)); + EXPECT_EQ(0, self->recvmsg_args.id); +} +TEST_F(homa_plumbing, homa_recvmsg__error_from_homa_wait_shared) +{ + self->recvmsg_args.flags = HOMA_RECVMSG_NONBLOCKING; + + EXPECT_EQ(EAGAIN, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, + 0, 0, &self->recvmsg_hdr.msg_namelen)); } TEST_F(homa_plumbing, homa_recvmsg__MSG_DONT_WAIT) { diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index a4a6f33d..19f679b2 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -23,6 +23,23 @@ void unprotect_hsk_hook(char *id) } } +#if 0 +static struct homa_rpc *hook_rpc; +static int hook_count; +static void unlink_rpc_hook(char *id) +{ + if (strcmp(id, "spin_lock")!= 0) + return; + if (hook_count == 0) + return; + hook_count--; + if (hook_count == 0) { + list_del_init(&hook_rpc->ready_links); + homa_rpc_put(hook_rpc); + } +} +#endif + FIXTURE(homa_rpc) { struct in6_addr client_ip[1]; int client_port; @@ -247,7 +264,7 @@ TEST_F(homa_rpc, homa_rpc_new_server__handoff_rpc) homa_rpc_unlock(srpc); EXPECT_EQ(RPC_INCOMING, srpc->state); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); - EXPECT_EQ(1, unit_list_length(&self->hsk.ready_requests)); + EXPECT_EQ(1, unit_list_length(&self->hsk.ready_rpcs)); homa_rpc_end(srpc); } TEST_F(homa_rpc, homa_rpc_new_server__dont_handoff_no_buffers) @@ -261,7 +278,7 @@ TEST_F(homa_rpc, homa_rpc_new_server__dont_handoff_no_buffers) &created); ASSERT_FALSE(IS_ERR(srpc)); 
homa_rpc_unlock(srpc); - EXPECT_EQ(0, unit_list_length(&self->hsk.ready_requests)); + EXPECT_EQ(0, unit_list_length(&self->hsk.ready_rpcs)); homa_rpc_end(srpc); } TEST_F(homa_rpc, homa_rpc_new_server__dont_handoff_rpc) @@ -277,7 +294,7 @@ TEST_F(homa_rpc, homa_rpc_new_server__dont_handoff_rpc) homa_rpc_unlock(srpc); EXPECT_EQ(RPC_INCOMING, srpc->state); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); - EXPECT_EQ(0, unit_list_length(&self->hsk.ready_requests)); + EXPECT_EQ(0, unit_list_length(&self->hsk.ready_rpcs)); homa_rpc_end(srpc); } @@ -420,33 +437,27 @@ TEST_F(homa_rpc, homa_rpc_end__already_dead) homa_rpc_end(crpc); EXPECT_STREQ("", unit_log_get()); } -TEST_F(homa_rpc, homa_rpc_end__state_ready) +TEST_F(homa_rpc, homa_rpc_end__remove_from_ready_rpcs) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 1000, 100); ASSERT_NE(NULL, crpc); - EXPECT_EQ(1, unit_list_length(&self->hsk.ready_responses)); + EXPECT_EQ(1, unit_list_length(&self->hsk.ready_rpcs)); homa_rpc_end(crpc); - EXPECT_EQ(0, unit_list_length(&self->hsk.ready_responses)); + EXPECT_EQ(0, unit_list_length(&self->hsk.ready_rpcs)); } -TEST_F(homa_rpc, homa_rpc_end__wakeup_interest) +TEST_F(homa_rpc, homa_rpc_end__state_ready) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, + UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 1000, 100); - struct homa_interest interest = {}; ASSERT_NE(NULL, crpc); - atomic_set(&interest.rpc_ready, 0); - interest.reg_rpc = crpc; - crpc->interest = &interest; - unit_log_clear(); + EXPECT_EQ(1, unit_list_length(&self->hsk.ready_rpcs)); homa_rpc_end(crpc); - EXPECT_EQ(NULL, interest.reg_rpc); - EXPECT_STREQ("homa_rpc_end invoked; " - "wake_up_process pid -1", unit_log_get()); + EXPECT_EQ(0, unit_list_length(&self->hsk.ready_rpcs)); } TEST_F(homa_rpc, homa_rpc_end__free_gaps) { diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c index db096385..2e2a267b 100644 --- a/test/unit_homa_sock.c +++ b/test/unit_homa_sock.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" +#include "homa_interest.h" #include "homa_sock.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" @@ -246,24 +247,20 @@ TEST_F(homa_sock, homa_sock_shutdown__delete_rpcs) } TEST_F(homa_sock, homa_sock_shutdown__wakeup_interests) { - struct homa_interest interest1, interest2, interest3; - struct task_struct task1, task2, task3; - - interest1.thread = &task1; - task1.pid = 100; - interest2.thread = &task2; - task2.pid = 200; - interest3.thread = &task3; - task3.pid = 300; - EXPECT_FALSE(self->hsk.shutdown); - list_add_tail(&interest1.request_links, &self->hsk.request_interests); - list_add_tail(&interest2.request_links, &self->hsk.request_interests); - list_add_tail(&interest3.response_links, &self->hsk.response_interests); + struct homa_interest interest1, interest2; + + mock_log_wakeups = 1; + homa_interest_init_shared(&interest1, &self->hsk); + homa_interest_init_shared(&interest2, &self->hsk); + unit_log_clear(); + homa_sock_shutdown(&self->hsk); EXPECT_TRUE(self->hsk.shutdown); - EXPECT_STREQ("wake_up_process pid -1; wake_up_process pid 100; " - "wake_up_process pid 200; wake_up_process pid 300", - unit_log_get()); + EXPECT_EQ(1, atomic_read(&interest1.ready)); + EXPECT_EQ(1, atomic_read(&interest2.ready)); + EXPECT_EQ(NULL, interest1.rpc); + EXPECT_EQ(NULL, interest2.rpc); + EXPECT_STREQ("wake_up; 
wake_up", unit_log_get()); } TEST_F(homa_sock, homa_sock_bind) diff --git a/test/utils.c b/test/utils.c index 7faef956..d1e9e8b5 100644 --- a/test/utils.c +++ b/test/utils.c @@ -390,7 +390,6 @@ struct homa_rpc *unit_server_rpc(struct homa_sock *hsk, } if (state == UNIT_RCVD_MSG) return srpc; - list_del_init(&srpc->ready_links); srpc->state = RPC_IN_SERVICE; if (state == UNIT_IN_SERVICE) return srpc; diff --git a/util/cp_node.cc b/util/cp_node.cc index 404ac2eb..0c4c67c6 100644 --- a/util/cp_node.cc +++ b/util/cp_node.cc @@ -1071,7 +1071,7 @@ void homa_server::server(int thread_id, server_metrics *metrics) while (1) { while (1) { - length = receiver.receive(HOMA_RECVMSG_REQUEST, 0); + length = receiver.receive(0, 0); if (length >= 0) break; if ((errno == EBADF) || (errno == ESHUTDOWN)) { @@ -2052,7 +2052,7 @@ bool homa_client::wait_response(homa::receiver *receiver, uint64_t rpc_id) rpc_id = 0; ssize_t length; do { - length = receiver->receive(HOMA_RECVMSG_RESPONSE, rpc_id); + length = receiver->receive(0, rpc_id); } while ((length < 0) && ((errno == EAGAIN) || (errno == EINTR))); if (length < 0) { if (exit_receivers) @@ -2138,12 +2138,12 @@ void homa_client::sender() status = homa_sendv(fd, vec, 2, &server_addrs[server].sa, sockaddr_size(&server_addrs[server].sa), - &rpc_id, 0); + &rpc_id, 0, 0); } else status = homa_send(fd, sender_buffer, header->length, &server_addrs[server].sa, sockaddr_size(&server_addrs[server].sa), - &rpc_id, 0); + &rpc_id, 0, 0); if (status < 0) { log(NORMAL, "FATAL: error in homa_send: %s (request " "length %d)\n", strerror(errno), @@ -2206,7 +2206,8 @@ uint64_t homa_client::measure_rtt(int server, int length, char *buffer, header->cid.client_port = id; start = rdtsc(); status = homa_send(fd, buffer, header->length, &server_addrs[server].sa, - sockaddr_size(&server_addrs[server].sa), &rpc_id, 0); + sockaddr_size(&server_addrs[server].sa), &rpc_id, 0, + 0); if (status < 0) { log(NORMAL, "FATAL: error in homa_send: %s (request " "length %d)\n", strerror(errno), diff --git a/util/homa_test.cc b/util/homa_test.cc index 6c3c30f5..29bd1c2c 100644 --- a/util/homa_test.cc +++ b/util/homa_test.cc @@ -84,7 +84,7 @@ void send_fd(int fd, const sockaddr_in_union *addr, char *request) sleep(1); status = homa_send(fd, request, length, &addr->sa, - sockaddr_size(&addr->sa), &id, 0); + sockaddr_size(&addr->sa), &id, 0, 0); if (status < 0) { printf("Error in homa_send: %s\n", strerror(errno)); @@ -142,7 +142,7 @@ void test_close() } std::thread thread(close_fd, fd); recv_args.id = 0; - recv_args.flags = HOMA_RECVMSG_RESPONSE; + recv_args.flags = 0; recv_hdr.msg_controllen = sizeof(recv_args); result = recvmsg(fd, &recv_hdr, 0); if (result > 0) { @@ -172,7 +172,7 @@ void test_fill_memory(int fd, const sockaddr_in_union *dest, char *request) for (int i = 1; i <= count; i++) { status = homa_send(fd, request, length, &dest->sa, - sockaddr_size(&dest->sa), &id, 0); + sockaddr_size(&dest->sa), &id, 0, 0); if (status < 0) { printf("Error in homa_send: %s\n", strerror(errno)); @@ -186,7 +186,7 @@ void test_fill_memory(int fd, const sockaddr_in_union *dest, char *request) total = 0; for (int i = 1; i <= count; i++) { recv_args.id = 0; - recv_args.flags = HOMA_RECVMSG_RESPONSE; + recv_args.flags = 0; recv_hdr.msg_controllen = sizeof(recv_args); received = recvmsg(fd, &recv_hdr, 0); if (received < 0) { @@ -221,7 +221,7 @@ void test_invoke(int fd, const sockaddr_in_union *dest, char *request) ssize_t resp_length; status = homa_send(fd, request, length, &dest->sa, - sockaddr_size(&dest->sa), 
&id, 0); + sockaddr_size(&dest->sa), &id, 0, 0); if (status < 0) { printf("Error in homa_send: %s\n", strerror(errno)); return; @@ -229,7 +229,7 @@ void test_invoke(int fd, const sockaddr_in_union *dest, char *request) printf("homa_send succeeded, id %llu\n", id); } recv_args.id = 0; - recv_args.flags = HOMA_RECVMSG_RESPONSE; + recv_args.flags = 0; recv_hdr.msg_controllen = sizeof(recv_args); resp_length = recvmsg(fd, &recv_hdr, 0); if (resp_length < 0) { @@ -322,7 +322,7 @@ void test_poll(int fd, char *request) } recv_args.id = 0; - recv_args.flags = HOMA_RECVMSG_REQUEST; + recv_args.flags = 0; recv_hdr.msg_controllen = sizeof(recv_args); result = recvmsg(fd, &recv_hdr, 0); if (result < 0) @@ -332,6 +332,49 @@ void test_poll(int fd, char *request) result, ntohs(source_addr.in4.sin_port)); } +/** + * test_private() - Send several private requests and wait for responses in + * the reverse order. + * @fd: Homa socket. + * @dest: Where to send the request. + * @request: Request message. + */ +void test_private(int fd, const sockaddr_in_union *dest, char *request) +{ + __u64 ids[3]; + int status, i; + ssize_t resp_length; + + for (i = 0; i < 3; i++) { + status = homa_send(fd, request, length, &dest->sa, + sockaddr_size(&dest->sa), &ids[i], 0, + HOMA_SENDMSG_PRIVATE); + if (status < 0) { + printf("Error in homa_send: %s\n", strerror(errno)); + return; + } else { + printf("homa_send succeeded, id %llu\n", ids[i]); + } + } + + for (i = 2; i >= 0; i--) { + recv_args.id = ids[i]; + recv_args.flags = 0; + recv_hdr.msg_controllen = sizeof(recv_args); + resp_length = recvmsg(fd, &recv_hdr, 0); + if (resp_length < 0) { + printf("Error in recvmsg: %s\n", strerror(errno)); + return; + } + int seed = check_message(&recv_args, buf_region, resp_length, + 2*sizeof32(int)); + printf("Received message from %s with %lu bytes, " + "seed %d, id %llu\n", + print_address(&source_addr), resp_length, seed, + recv_args.id); + } +} + /** * test_read() - Measure round-trip time for a read kernel call that * does nothing but return an error. @@ -376,14 +419,14 @@ void test_rtt(int fd, const sockaddr_in_union *dest, char *request) for (int i = -10; i < count; i++) { start = rdtsc(); status = homa_send(fd, request, length, &dest->sa, - sockaddr_size(&dest->sa), NULL, 0); + sockaddr_size(&dest->sa), NULL, 0, 0); if (status < 0) { printf("Error in homa_send: %s\n", strerror(errno)); return; } recv_args.id = 0; - recv_args.flags = HOMA_RECVMSG_RESPONSE; + recv_args.flags = 0; recv_hdr.msg_controllen = sizeof(recv_args); resp_length = recvmsg(fd, &recv_hdr, 0); if (i >= 0) @@ -414,7 +457,7 @@ void test_send(int fd, const sockaddr_in_union *dest, char *request) int status; status = homa_send(fd, request, length, &dest->sa, - sockaddr_size(&dest->sa), &id, 0); + sockaddr_size(&dest->sa), &id, 0, 0); if (status < 0) { printf("Error in homa_send: %s\n", strerror(errno)); @@ -459,7 +502,7 @@ void test_shutdown(int fd) std::thread thread(shutdown_fd, fd); thread.detach(); recv_args.id = 0; - recv_args.flags = HOMA_RECVMSG_RESPONSE; + recv_args.flags = 0; recv_hdr.msg_controllen = sizeof(recv_args); result = recvmsg(fd, &recv_hdr, 0); if (result > 0) { @@ -471,7 +514,7 @@ void test_shutdown(int fd) /* Make sure that future reads also fail. 
*/ recv_args.id = 0; - recv_args.flags = HOMA_RECVMSG_RESPONSE; + recv_args.flags = 0; recv_hdr.msg_controllen = sizeof(recv_args); result = recvmsg(fd, &recv_hdr, 0); if (result < 0) { @@ -517,7 +560,7 @@ void test_stream(int fd, const sockaddr_in_union *dest) } for (i = 0; i < count; i++) { status = homa_send(fd, buffers[i], length, &dest->sa, - sockaddr_size(&dest->sa), &id, 0); + sockaddr_size(&dest->sa), &id, 0, 0); if (status < 0) { printf("Error in homa_send: %s\n", strerror(errno)); return; @@ -528,11 +571,11 @@ void test_stream(int fd, const sockaddr_in_union *dest) * response to an outstanding request, then initiates a new * request. */ - while (1){ + while (1) { int *response; recv_args.id = 0; - recv_args.flags = HOMA_RECVMSG_RESPONSE; + recv_args.flags = 0; recv_hdr.msg_controllen = sizeof(recv_args); resp_length = recvmsg(fd, &recv_hdr, 0); if (resp_length < 0) { @@ -546,7 +589,7 @@ void test_stream(int fd, const sockaddr_in_union *dest) response = (int *) (buf_region + recv_args.bpage_offsets[0]); status = homa_send(fd, buffers[(response[2]/1000) %count], length, &dest->sa, sockaddr_size(&dest->sa), - &id, 0); + &id, 0, 0); if (status < 0) { printf("Error in homa_send: %s\n", strerror(errno)); return; @@ -776,8 +819,7 @@ void test_tmp(int fd, int count) h.msg_controllen = sizeof(control); memset(&control, 0, sizeof(control)); - control.flags = HOMA_RECVMSG_REQUEST | HOMA_RECVMSG_REQUEST - | HOMA_RECVMSG_NONBLOCKING; + control.flags = HOMA_RECVMSG_NONBLOCKING; int result = recvmsg(fd, &h, 0); printf("recvmsg returned %d, addr %p, namelen %d, control %p, " @@ -959,6 +1001,8 @@ int main(int argc, char** argv) test_ioctl(fd, count); } else if (strcmp(argv[next_arg], "poll") == 0) { test_poll(fd, buffer); + } else if (strcmp(argv[next_arg], "private") == 0) { + test_private(fd, &dest, buffer); } else if (strcmp(argv[next_arg], "send") == 0) { test_send(fd, &dest, buffer); } else if (strcmp(argv[next_arg], "read") == 0) { diff --git a/util/server.cc b/util/server.cc index 29dbfc64..ce418c24 100644 --- a/util/server.cc +++ b/util/server.cc @@ -114,7 +114,6 @@ void homa_server(int port) int result; recv_args.id = 0; - recv_args.flags = HOMA_RECVMSG_REQUEST; hdr.msg_controllen = sizeof(recv_args); length = recvmsg(fd, &hdr, 0); if (length < 0) { From 07ac90124581afa2cfe366644eae945f85b1b258 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 11 Mar 2025 15:07:11 -0700 Subject: [PATCH 205/625] Print error message for missing cpu_khz line in ttsyslog.py --- util/ttsyslog.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/util/ttsyslog.py b/util/ttsyslog.py index 3ec5d600..4dc97f5c 100755 --- a/util/ttsyslog.py +++ b/util/ttsyslog.py @@ -57,4 +57,7 @@ print('%9.3f us (+%8.3f us) %s' % ( (this_time - first_time)/(1000.0 *cpu_ghz), (this_time - prev_time)/(1000.0 * cpu_ghz), this_event)) - prev_time = this_time \ No newline at end of file + prev_time = this_time + +if cpu_ghz == None: + print("Couldn't find initial line with clock speed", file=sys.stderr) \ No newline at end of file From 2ac05c2e665d6c2bfefe025d7cc83489c68115f9 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 11 Mar 2025 15:07:52 -0700 Subject: [PATCH 206/625] Fix minor spelling errors in cp_node.c --- util/cp_node.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/util/cp_node.cc b/util/cp_node.cc index 0c4c67c6..5506a306 100644 --- a/util/cp_node.cc +++ b/util/cp_node.cc @@ -141,21 +141,21 @@ uint64_t last_stats_time = 0; /** * @last_client_rpcs: entries 
correspond to @experiments; total number of
- * client RPCS completed by that experiment as of the last time we printed
+ * client RPCs completed by that experiment as of the last time we printed
  * statistics.
  */
 std::vector last_client_rpcs;
 
 /**
  * @last_client_bytes_out: entries correspond to @experiments; total amount
- * of data sent in request messages by client RPCS in that experiment as
+ * of data sent in request messages by client RPCs in that experiment as
  * of the last time we printed statistics.
  */
 std::vector last_client_bytes_out;
 
 /**
  * @last_client_bytes_in: entries correspond to @experiments; total
- * amount of data received in response messages for client RPCS in that
+ * amount of data received in response messages for client RPCs in that
  * experiment as of the last time we printed statistics.
  */
 std::vector last_client_bytes_in;
@@ -2819,7 +2819,7 @@ void client_stats(uint64_t now)
 			}
 		}
 		if (outstanding_rpcs != 0)
-			log(NORMAL, "Outstanding client RPCS for %s "
+			log(NORMAL, "Outstanding client RPCs for %s "
 					"experiment: %lu\n", exp.c_str(),
 					outstanding_rpcs);
 		last_client_rpcs[i] = client_rpcs;

From 95a2e1b9a6f6f051f0c57cf420d7e2eb91f504f8 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Tue, 11 Mar 2025 15:11:00 -0700
Subject: [PATCH 207/625] Added address validation functions to homa_devel.h

---
 homa_devel.h | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/homa_devel.h b/homa_devel.h
index efa19604..d17afa2a 100644
--- a/homa_devel.h
+++ b/homa_devel.h
@@ -7,6 +7,8 @@
 #ifndef _HOMA_DEVEL_H
 #define _HOMA_DEVEL_H
 
+#include "timetrace.h"
+
 struct homa;
 struct homa_rpc;
 
@@ -36,6 +38,41 @@ static inline u32 tt_addr(const struct in6_addr x)
 		       : ntohl(x.in6_u.u6_addr32[1]));
 }
 
+/**
+ * addr_valid() - Determine whether a given address is a valid address
+ * within kernel memory.
+ * @addr:   Address to check
+ */
+static inline int addr_valid(void *addr)
+{
+#ifdef __UNIT_TEST__
+	return 1;
+#else
+#define HIGH_BITS 0xffff800000000000
+	u64 int_addr = (u64) addr;
+
+	return (int_addr & HIGH_BITS) == HIGH_BITS;
+#endif /* __UNIT_TEST__ */
+}
+
+static inline void check_addr_valid(void *addr, char *info)
+{
+#ifndef __UNIT_TEST__
+#define HIGH_BITS 0xffff800000000000
+	u64 int_addr = (u64) addr;
+
+	if ((int_addr & HIGH_BITS) != HIGH_BITS) {
+		pr_err("Bogus address 0x%px (%s)\n", addr, info);
+		tt_record("Freezing timetrace because of bogus address");
+		tt_record(info);
+		tt_freeze();
+		tt_printk();
+		pr_err("Finished dumping timetrace\n");
+		BUG_ON(1);
+	}
+#endif /* __UNIT_TEST__ */
+}
+
 #ifndef __STRIP__ /* See strip.py */
 void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type,
 		 char *format);

From 45cc9c80deabfcd15e879f1b33825ab9d40b7b92 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Tue, 11 Mar 2025 15:14:19 -0700
Subject: [PATCH 208/625] Use higher log level when dumping timetrace to syslog

---
 timetrace.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/timetrace.c b/timetrace.c
index 4a0a6f65..aa74eca2 100644
--- a/timetrace.c
+++ b/timetrace.c
@@ -684,7 +684,7 @@ void tt_printk(void)
 	static atomic_t active;
 
 	if (atomic_xchg(&active, 1)) {
-		pr_notice("concurrent call to %s aborting\n", __func__);
+		pr_err("concurrent call to %s aborting\n", __func__);
 		return;
 	}
 	if (!init)
@@ -692,7 +692,7 @@ void tt_printk(void)
 	atomic_inc(&tt_freeze_count);
 	tt_find_oldest(pos);
 
-	pr_notice("cpu_khz: %u\n", cpu_khz);
+	pr_err("cpu_khz: %u\n", cpu_khz);
 
 	/* Each iteration of this loop printk's one event. 
*/ while (true) { @@ -723,10 +723,11 @@ void tt_printk(void) snprintf(msg, sizeof(msg), event->format, event->arg0, event->arg1, event->arg2, event->arg3); - pr_notice("%lu [C%02d] %s\n", + pr_err("%lu [C%02d] %s\n", (unsigned long)event->timestamp, current_core, msg); } + pr_err("Finished dumping timetrace to syslog\n"); atomic_dec(&tt_freeze_count); atomic_set(&active, 0); @@ -808,6 +809,10 @@ void tt_get_messages(char *buffer, size_t length) */ void tt_dbg1(char *msg, ...) { + pr_err("tt_dbg1 is dumping timetrace\n"); + tt_freeze(); + tt_printk(); + pr_err("Finished dumping timetrace\n"); } /** From 235e1a07656d2911fb5d6b3b15e093fe9189012b Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 12 Mar 2025 21:29:12 -0700 Subject: [PATCH 209/625] Restore timetracing in new waiting code --- homa_incoming.c | 18 +++++++++++++----- homa_interest.c | 19 +++++++++++-------- homa_interest.h | 18 ++++++++++++------ test/unit_homa_interest.c | 5 +++++ util/tthoma.py | 29 +++++++++++++++-------------- 5 files changed, 56 insertions(+), 33 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index 8962f689..a7ee04f8 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -1233,8 +1233,11 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) if (rpc->msgin.length >= 0 && rpc->msgin.bytes_remaining == 0 && skb_queue_len(&rpc->msgin.packets) == 0) { - if (iteration == 0) + if (iteration == 0) { + tt_record2("homa_wait_private found rpc id %d, pid %d via null, blocked 0", + rpc->id, current->pid); INC_METRIC(fast_wakeups, 1); + } break; } @@ -1249,6 +1252,8 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) homa_rpc_lock(rpc); atomic_andnot(APP_NEEDS_LOCK, &rpc->flags); homa_interest_unlink_private(&interest); + tt_record3("homa_wait_private found rpc id %d, pid %d via handoff, blocked %d", + rpc->id, current->pid, interest.blocked); /* If homa_interest_wait returned an error but the interest * actually got ready, then ignore the error. 
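/* (A minimal sketch of the tracing contract, assuming only what appears
 * elsewhere in this patch: the literal text of the tt_record format
 * strings above must stay in sync with the wait_found_rpc pattern that
 * this same patch adds to util/tthoma.py:
 *
 *     homa_wait_[^ ]+ found rpc id ([0-9]+).* via ([a-z_]+), blocked ([0-9]+)
 *
 * so a line such as "homa_wait_shared found rpc id 1234, pid 77 via
 * handoff, blocked 1" parses as id=1234, type="handoff", blocked=1.
 * Rewording the "via ..." or "blocked ..." fields here without updating
 * the analyzer would silently break its wakeup-delay statistics.)
 */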
@@ -1293,6 +1298,8 @@ struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking) if (!list_empty(&hsk->ready_rpcs)) { rpc = list_first_entry(&hsk->ready_rpcs, struct homa_rpc, ready_links); + tt_record2("homa_wait_shared found rpc id %d, pid %d via ready_rpcs, blocked 0", + rpc->id, current->pid); homa_rpc_hold(rpc); list_del_init(&rpc->ready_links); if (!list_empty(&hsk->ready_rpcs)) { @@ -1327,10 +1334,10 @@ struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking) rpc = ERR_PTR(-ESHUTDOWN); goto done; } + tt_record3("homa_wait_shared found rpc id %d, pid %d via handoff, blocked %d", + rpc->id, current->pid, interest.blocked); } - tt_record3("homa_wait_shared received RPC handoff, id %d, socket %d, pid %d", - rpc->id, rpc->hsk->port, current->pid); atomic_or(APP_NEEDS_LOCK, &rpc->flags); homa_rpc_lock(rpc); atomic_andnot(APP_NEEDS_LOCK, &rpc->flags); @@ -1361,8 +1368,6 @@ void homa_rpc_handoff(struct homa_rpc *rpc) struct homa_sock *hsk = rpc->hsk; struct homa_interest *interest; - tt_record1("homa_rpc_handoff called for id %d", rpc->id); - if (atomic_read(&rpc->flags) & RPC_PRIVATE) { homa_interest_notify_private(rpc); return; @@ -1382,12 +1387,15 @@ void homa_rpc_handoff(struct homa_rpc *rpc) list_del_init(&interest->links); interest->rpc = rpc; homa_rpc_hold(rpc); + tt_record1("homa_rpc_handoff handing off id %d", rpc->id); atomic_set_release(&interest->ready, 1); wake_up(&interest->wait_queue); INC_METRIC(handoffs_thread_waiting, 1); } else if (list_empty(&rpc->ready_links)) { list_add_tail(&rpc->ready_links, &hsk->ready_rpcs); hsk->sock.sk_data_ready(&hsk->sock); + tt_record2("homa_rpc_handoff queued id %d for port %d", + rpc->id, hsk->port); } homa_sock_unlock(hsk); } diff --git a/homa_interest.c b/homa_interest.c index fbf9b4cf..78fea97c 100644 --- a/homa_interest.c +++ b/homa_interest.c @@ -21,9 +21,10 @@ void homa_interest_init_shared(struct homa_interest *interest, struct homa_sock *hsk) __must_hold(&hsk->lock) { - atomic_set(&interest->ready, 0); interest->rpc = NULL; + atomic_set(&interest->ready, 0); interest->core = raw_smp_processor_id(); + interest->blocked = 0; init_waitqueue_head(&interest->wait_queue); interest->hsk = hsk; list_add(&interest->links, &hsk->interests); @@ -45,9 +46,10 @@ int homa_interest_init_private(struct homa_interest *interest, if (rpc->private_interest) return -EINVAL; - atomic_set(&interest->ready, 0); interest->rpc = rpc; + atomic_set(&interest->ready, 0); interest->core = raw_smp_processor_id(); + interest->blocked = 0; init_waitqueue_head(&interest->wait_queue); interest->hsk = rpc->hsk; rpc->private_interest = interest; @@ -71,13 +73,13 @@ int homa_interest_wait(struct homa_interest *interest, int nonblocking) { u64 start, block_start, blocked_time, now; struct homa_sock *hsk = interest->hsk; - int fast_wakeup = 1; int result = 0; int iteration; int wait_err; start = sched_clock(); blocked_time = 0; + interest->blocked = 0; /* This loop iterates in order to poll and/or reap dead RPCS. 
*/ for (iteration = 0; ; iteration++) { @@ -104,7 +106,7 @@ int homa_interest_wait(struct homa_interest *interest, int nonblocking) break; } - fast_wakeup = 0; + interest->blocked = 1; block_start = now; wait_err = wait_event_interruptible_exclusive(interest->wait_queue, atomic_read_acquire(&interest->ready) != 0); @@ -113,11 +115,12 @@ int homa_interest_wait(struct homa_interest *interest, int nonblocking) result = -EINTR; done: - if (fast_wakeup) - INC_METRIC(fast_wakeups, 1); - else + if (interest->blocked) { INC_METRIC(slow_wakeups, 1); - INC_METRIC(blocked_ns, blocked_time); + INC_METRIC(blocked_ns, blocked_time); + } else { + INC_METRIC(fast_wakeups, 1); + } INC_METRIC(poll_ns, sched_clock() - start - blocked_time); return result; } diff --git a/homa_interest.h b/homa_interest.h index c7eca5c6..2c307fb3 100644 --- a/homa_interest.h +++ b/homa_interest.h @@ -15,12 +15,6 @@ * (if present on hsk->interests). */ struct homa_interest { - /** - * @ready: Nonzero means the interest is ready for attention: either - * there is an RPC that needs attention or @hsk has been shutdown. - */ - atomic_t ready; - /** * @rpc: If ready is set, then this holds an RPC that needs * attention, or NULL if this is a shared interest and hsk has @@ -30,12 +24,24 @@ struct homa_interest { */ struct homa_rpc *rpc; + /** + * @ready: Nonzero means the interest is ready for attention: either + * there is an RPC that needs attention or @hsk has been shutdown. + */ + atomic_t ready; + /** * @core: Core on which homa_wait_*was invoked. This is a hint * used for load balancing (see balance.txt). */ int core; + /** + * @blocked: Zero means a handoff was received without the thread + * needing to block; nonzero means the thread blocked. + */ + int blocked; + /** * @wait_queue: Used to block the thread while waiting (will never * have more than one queued thread). diff --git a/test/unit_homa_interest.c b/test/unit_homa_interest.c index 3f52f036..0e3d6550 100644 --- a/test/unit_homa_interest.c +++ b/test/unit_homa_interest.c @@ -141,6 +141,8 @@ TEST_F(homa_interest, homa_interest_wait__already_ready) homa_interest_init_shared(&interest, &self->hsk); atomic_set(&interest.ready, 1); EXPECT_EQ(0, homa_interest_wait(&interest, 0)); + EXPECT_EQ(0, interest.blocked); + EXPECT_EQ(1, homa_metrics_per_cpu()->fast_wakeups); homa_interest_unlink_shared(&interest); } @@ -187,6 +189,7 @@ TEST_F(homa_interest, homa_interest_wait__nonblocking) self->homa.poll_usecs = 100; EXPECT_EQ(EAGAIN, -homa_interest_wait(&interest, 1)); + EXPECT_EQ(0, interest.blocked); homa_interest_unlink_shared(&interest); } TEST_F(homa_interest, homa_interest_wait__poll_then_block) @@ -206,6 +209,7 @@ TEST_F(homa_interest, homa_interest_wait__poll_then_block) EXPECT_EQ(0, homa_metrics_per_cpu()->blocked_ns); EXPECT_EQ(0, homa_metrics_per_cpu()->fast_wakeups); EXPECT_EQ(1, homa_metrics_per_cpu()->slow_wakeups); + EXPECT_EQ(1, interest.blocked); homa_interest_unlink_shared(&interest); } TEST_F(homa_interest, homa_interest_wait__interrupted_by_signal) @@ -217,6 +221,7 @@ TEST_F(homa_interest, homa_interest_wait__interrupted_by_signal) self->homa.poll_usecs = 0; EXPECT_EQ(EINTR, -homa_interest_wait(&interest, 0)); + EXPECT_EQ(1, interest.blocked); homa_interest_unlink_shared(&interest); } TEST_F(homa_interest, homa_interest_wait__time_metrics) diff --git a/util/tthoma.py b/util/tthoma.py index 1dc26a6f..8f78cb9c 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -27,7 +27,7 @@ # file; it is created by AnalyzeRpcs. 
Keys are RPC ids, values are dictionaries # of info about that RPC, with the following elements (some elements may be # missing if the RPC straddled the beginning or end of the timetrace): -# found: Time when homa_wait_for_message found the RPC +# found: Last time when homa_wait_for_message found the RPC # gro_core: Core that handled GRO processing for this RPC # gro_data: List of tuples for all incoming # data packets processed by GRO @@ -37,7 +37,7 @@ # grant packets processed by GRO. Deprecated: use # gro_grant_pkts instead # gro_grant_pkts: List of all incoming grant packets processed by GRO -# handoff: Time when RPC was handed off to waiting thread +# handoff: Last time when RPC was handed off to waiting thread # id: RPC's identifier # in_length: Size of the incoming message, in bytes, or None if unknown # ip_xmits: Dictionary mapping from offset to ip_*xmit time for @@ -47,7 +47,7 @@ # (name of trace file without extension) # out_length: Size of the outgoing message, in bytes # peer: Address of the peer host -# queued: Time when RPC was added to ready queue (no +# queued: Last time when RPC was added to ready queue (no # waiting threads). At most one of 'handoff' and 'queued' # will be present. # resend_rx: List of tuples for all incoming @@ -1308,17 +1308,19 @@ def __rpc_queued(self, trace, time, core, match, interests): patterns.append({ 'name': 'rpc_queued', - 'regexp': 'homa_rpc_handoff finished queuing id ([0-9]+)' + 'regexp': 'homa_rpc_handoff queued id ([0-9]+)' }) def __wait_found_rpc(self, trace, time, core, match, interests): id = int(match.group(1)) + type = match.group(2) + blocked = int(match.group(3)) for interest in interests: - interest.tt_wait_found_rpc(trace, time, core, id) + interest.tt_wait_found_rpc(trace, time, core, id, type, blocked) patterns.append({ 'name': 'wait_found_rpc', - 'regexp': 'homa_wait_for_message found rpc id ([0-9]+)' + 'regexp': 'homa_wait_[^ ]+ found rpc id ([0-9]+).* via ([a-z_]+), blocked ([0-9]+)' }) def __poll_success(self, trace, time, core, match, interests): @@ -2383,16 +2385,15 @@ def tt_poll_success(self, trace, time, core, id): def tt_rpc_queued(self, trace, time, core, id): self.rpc_queued[id] = time - def tt_wait_found_rpc(self, trace, time, core, id): + def tt_wait_found_rpc(self, trace, time, core, id, type, blocked): if id in self.rpc_handoffs: delay = time - self.rpc_handoffs[id] - if id in self.poll_success: - self.app_poll_wakeups.append([delay, time, trace['node']]) - del self.poll_success[id] - else: + if blocked: self.app_sleep_wakeups.append([delay, time, trace['node']]) + else: + self.app_poll_wakeups.append([delay, time, trace['node']]) del self.rpc_handoffs[id] - elif id in self.rpc_queued: + elif id in self.rpc_queued and blocked == 0: self.app_queue_wakeups.append([time - self.rpc_queued[id], time, trace['node']]) del self.rpc_queued[id] @@ -3650,7 +3651,7 @@ def output(self): node_req_handoffs[rpc['node']].append(delay) else: node_resp_handoffs[rpc['node']].append(delay) - elif 'queued' in rpc: + elif 'queued' in rpc:q delay = rpc['found'] - rpc['queued'] if id & 1: node_req_queued[rpc['node']].append(delay) @@ -5733,7 +5734,7 @@ def tt_recvmsg_done(self, trace, t, core, id, length): global rpcs rpcs[id]['recvmsg_done'] = t - def tt_wait_found_rpc(self, trace, t, core, id): + def tt_wait_found_rpc(self, trace, t, core, id, type, blocked): rpcs[id]['found'] = t def tt_copy_out_start(self, trace, t, core, id): From 3f35bae894686ff1904a2c0d4e1c6439b73f6633 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 13 
Mar 2025 09:21:39 -0700
Subject: [PATCH 210/625] Make new wait mechanism work in stripped mode

---
 Makefile.upstream |  1 +
 homa_devel.h      |  6 ++++++
 homa_impl.h       |  3 ++-
 homa_incoming.c   | 12 ++++++++++++
 homa_interest.c   | 14 +++++++++++---
 util/strip.py     |  5 ++++-
 6 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/Makefile.upstream b/Makefile.upstream
index 3eb192a6..c82fbc22 100644
--- a/Makefile.upstream
+++ b/Makefile.upstream
@@ -4,6 +4,7 @@ obj-$(CONFIG_HOMA) := homa.o
 
 homa-y:= homa_incoming.o \
+	homa_interest.o \
 	homa_outgoing.o \
 	homa_peer.o \
 	homa_pool.o \
diff --git a/homa_devel.h b/homa_devel.h
index d17afa2a..4b74a4eb 100644
--- a/homa_devel.h
+++ b/homa_devel.h
@@ -73,6 +73,12 @@ static inline void check_addr_valid(void *addr, char *info)
 #endif /* __UNIT_TEST__ */
 }
 
+#ifndef __STRIP__ /* See strip.py */
+#define IF_NO_STRIP(code) code
+#else /* See strip.py */
+#define IF_NO_STRIP(code)
+#endif /* See strip.py */
+
 #ifndef __STRIP__ /* See strip.py */
 void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type,
 		 char *format);
diff --git a/homa_impl.h b/homa_impl.h
index 5e7e4403..50b36757 100644
--- a/homa_impl.h
+++ b/homa_impl.h
@@ -68,9 +68,10 @@
 #endif /* See strip.py */
 
 /* Forward declarations. */
+struct homa;
 struct homa_peer;
+struct homa_rpc;
 struct homa_sock;
-struct homa;
 
 #ifndef __STRIP__ /* See strip.py */
 #include "timetrace.h"
diff --git a/homa_incoming.c b/homa_incoming.c
index a7ee04f8..f5e5b3a2 100644
--- a/homa_incoming.c
+++ b/homa_incoming.c
@@ -1233,11 +1233,13 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking)
 		if (rpc->msgin.length >= 0 &&
 		    rpc->msgin.bytes_remaining == 0 &&
 		    skb_queue_len(&rpc->msgin.packets) == 0) {
+#ifndef __STRIP__ /* See strip.py */
 			if (iteration == 0) {
 				tt_record2("homa_wait_private found rpc id %d, pid %d via null, blocked 0",
 					   rpc->id, current->pid);
 				INC_METRIC(fast_wakeups, 1);
 			}
+#endif /* See strip.py */
 			break;
 		}
 
@@ -1309,8 +1311,10 @@ struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking)
 				hsk->sock.sk_data_ready(&hsk->sock);
 			}
 			homa_sock_unlock(hsk);
+#ifndef __STRIP__ /* See strip.py */
 			if (iteration == 0)
 				INC_METRIC(fast_wakeups, 1);
+#endif /* See strip.py */
 		} else {
 			homa_interest_init_shared(&interest, hsk);
 			homa_sock_unlock(hsk);
@@ -1391,6 +1395,14 @@ void homa_rpc_handoff(struct homa_rpc *rpc)
 		atomic_set_release(&interest->ready, 1);
 		wake_up(&interest->wait_queue);
 		INC_METRIC(handoffs_thread_waiting, 1);
+
+#ifndef __STRIP__ /* See strip.py */
+		/* Update the last_app_active time for the thread's core, so
+		 * Homa will try to avoid assigning any work there.
+		 */
+		per_cpu(homa_offload_core, interest->core).last_app_active =
+				sched_clock();
+#endif /* See strip.py */
 	} else if (list_empty(&rpc->ready_links)) {
 		list_add_tail(&rpc->ready_links, &hsk->ready_rpcs);
 		hsk->sock.sk_data_ready(&hsk->sock);
diff --git a/homa_interest.c b/homa_interest.c
index 78fea97c..7b88fbfe 100644
--- a/homa_interest.c
+++ b/homa_interest.c
@@ -71,14 +71,16 @@
  */
 int homa_interest_wait(struct homa_interest *interest, int nonblocking)
 {
-	u64 start, block_start, blocked_time, now;
 	struct homa_sock *hsk = interest->hsk;
 	int result = 0;
 	int iteration;
 	int wait_err;
+#ifndef __STRIP__ /* See strip.py */
+	u64 start, block_start, blocked_time, now;
 
 	start = sched_clock();
 	blocked_time = 0;
+#endif /* See strip.py */
 	interest->blocked = 0;
 
 	/* This loop iterates in order to poll and/or reap dead RPCS. 
*/ @@ -99,22 +101,27 @@ int homa_interest_wait(struct homa_interest *interest, int nonblocking) goto done; } +#ifndef __STRIP__ /* See strip.py */ now = sched_clock(); per_cpu(homa_offload_core, raw_smp_processor_id()).last_app_active = now; if (now - start >= 1000 * hsk->homa->poll_usecs) break; +#else /* See strip.py */ + break; +#endif /* See strip.py */ } interest->blocked = 1; - block_start = now; + IF_NO_STRIP(block_start = now); wait_err = wait_event_interruptible_exclusive(interest->wait_queue, atomic_read_acquire(&interest->ready) != 0); - blocked_time = sched_clock() - block_start; + IF_NO_STRIP(blocked_time = sched_clock() - block_start); if (wait_err == -ERESTARTSYS) result = -EINTR; done: +#ifndef __STRIP__ /* See strip.py */ if (interest->blocked) { INC_METRIC(slow_wakeups, 1); INC_METRIC(blocked_ns, blocked_time); @@ -122,6 +129,7 @@ int homa_interest_wait(struct homa_interest *interest, int nonblocking) INC_METRIC(fast_wakeups, 1); } INC_METRIC(poll_ns, sched_clock() - start - blocked_time); +#endif /* See strip.py */ return result; } diff --git a/util/strip.py b/util/strip.py index 2af83593..14f4a1bb 100755 --- a/util/strip.py +++ b/util/strip.py @@ -18,6 +18,7 @@ * Blocks conditionalized on '#ifdef __UNIT_TEST__' * UNIT_LOG and UNIT_HOOK statements * INC_METRIC statements + * IF_NO_STRIP statements Additional stripping is controlled by #ifdefs. The #ifdefs allow the code to be used in three ways: @@ -260,10 +261,12 @@ def scan(file): check_braces = True continue - # Strip tt_record and INC_METRIC statements. + # Strip tt_record, INC_METRIC, and IF_NO_STRIP statements. match = re.match('(//[ \t]*)?tt_record[1-4]?[(]', pline) if not match: match = re.match('(//[ \t]*)?INC_METRIC[(]', pline) + if not match: + match = re.match('(//[ \t]*)?IF_NO_STRIP[(]', pline) if match: # If this is the only statement in its block, delete the # outer block statement (if, while, etc.). Don't delete case From dffaa7b1fdf3b2a2099f1b12907c831b0c7d9a9d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 13 Mar 2025 09:31:21 -0700 Subject: [PATCH 211/625] Rename disabled_rpc_reaps metric to deferred_rpc_reaps --- homa_metrics.c | 4 ++-- homa_metrics.h | 6 +++--- homa_rpc.c | 2 +- test/unit_homa_rpc.c | 10 +++++----- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/homa_metrics.c b/homa_metrics.c index 71416878..b3ddcf90 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -300,8 +300,8 @@ char *homa_metrics_print(struct homa *homa) m->fifo_grants_no_incoming); M("disabled_reaps %15llu Reaper invocations that were disabled\n", m->disabled_reaps); - M("disabled_rpc_reaps %15llu Disabled RPCs skipped by reaper\n", - m->disabled_rpc_reaps); + M("deferred_rpc_reaps %15llu RPCs skipped by reaper because still in use\n", + m->deferred_rpc_reaps); M("reaper_calls %15llu Reaper invocations that were not disabled\n", m->reaper_calls); M("reaper_dead_skbs %15llu Sum of hsk->dead_skbs across all reaper calls\n", diff --git a/homa_metrics.h b/homa_metrics.h index eebb39bc..ee60103f 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -550,10 +550,10 @@ struct homa_metrics { u64 disabled_reaps; /** - * @disabled_rpc_reaps: total number of times that the reaper skipped - * an RPC because reaping was disabled for that particular RPC + * @deferred_rpc_reaps: total number of times that the reaper skipped + * an RPC because it was still in use elsewhere. 
*/ - u64 disabled_rpc_reaps; + u64 deferred_rpc_reaps; /** * @reaper_calls: total number of times that the reaper was invoked diff --git a/homa_rpc.c b/homa_rpc.c index 9a43fda0..b8620f9d 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -395,7 +395,7 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) refs = 1; } if (refs != 0) { - INC_METRIC(disabled_rpc_reaps, 1); + INC_METRIC(deferred_rpc_reaps, 1); continue; } rpc->magic = 0; diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 19f679b2..2e2463be 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -615,10 +615,10 @@ TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_locked) mock_trylock_errors = 2; EXPECT_EQ(1, homa_rpc_reap(&self->hsk, false)); EXPECT_STREQ("reaped 1236", unit_log_get()); - EXPECT_EQ(1, homa_metrics_per_cpu()->disabled_rpc_reaps); + EXPECT_EQ(1, homa_metrics_per_cpu()->deferred_rpc_reaps); unit_log_clear(); EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); - EXPECT_EQ(1, homa_metrics_per_cpu()->disabled_rpc_reaps); + EXPECT_EQ(1, homa_metrics_per_cpu()->deferred_rpc_reaps); EXPECT_STREQ("reaped 1234", unit_log_get()); } TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_refs) @@ -639,15 +639,15 @@ TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_refs) self->homa.reap_limit = 3; EXPECT_EQ(1, homa_rpc_reap(&self->hsk, false)); EXPECT_STREQ("reaped 1236", unit_log_get()); - EXPECT_EQ(1, homa_metrics_per_cpu()->disabled_rpc_reaps); + EXPECT_EQ(1, homa_metrics_per_cpu()->deferred_rpc_reaps); unit_log_clear(); EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); - EXPECT_EQ(2, homa_metrics_per_cpu()->disabled_rpc_reaps); + EXPECT_EQ(2, homa_metrics_per_cpu()->deferred_rpc_reaps); EXPECT_STREQ("", unit_log_get()); homa_rpc_put(crpc1); EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); EXPECT_STREQ("reaped 1234", unit_log_get()); - EXPECT_EQ(2, homa_metrics_per_cpu()->disabled_rpc_reaps); + EXPECT_EQ(2, homa_metrics_per_cpu()->deferred_rpc_reaps); } TEST_F(homa_rpc, homa_rpc_reap__hit_limit_in_msgout_packets) { From f917adb7b6459124c0f90027b5db975cda8b868a Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 13 Mar 2025 09:37:56 -0700 Subject: [PATCH 212/625] Check for missing buffer pool in homa_recvmsg --- homa_plumbing.c | 4 ++++ test/unit_homa_plumbing.c | 9 +++++++++ 2 files changed, 13 insertions(+) diff --git a/homa_plumbing.c b/homa_plumbing.c index 8401526c..17f38695 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1189,6 +1189,10 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, result = -EINVAL; goto done; } + if (!hsk->buffer_pool) { + result = -EINVAL; + goto done; + } result = homa_pool_release_buffers(hsk->buffer_pool, control.num_bpages, control.bpage_offsets); control.num_bpages = 0; diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 9f0ff417..9c3021ec 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -674,6 +674,15 @@ TEST_F(homa_plumbing, homa_recvmsg__bogus_flags) EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, 0, 0, &self->recvmsg_hdr.msg_namelen)); } +TEST_F(homa_plumbing, homa_recvmsg__no_buffer_pool) +{ + struct homa_pool *saved_pool = self->hsk.buffer_pool; + + self->hsk.buffer_pool = NULL; + EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, + 0, 0, &self->recvmsg_hdr.msg_namelen)); + self->hsk.buffer_pool = saved_pool; +} TEST_F(homa_plumbing, homa_recvmsg__release_buffers) { EXPECT_EQ(0, -homa_pool_get_pages(self->hsk.buffer_pool, 2, From 
c2c53ab160b3578e4ee20d240789e05da24ce15b Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 14 Mar 2025 09:43:05 -0700 Subject: [PATCH 213/625] Add 'pairs' analyzer to tthoma.py --- util/tthoma.py | 158 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 133 insertions(+), 25 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index 8f78cb9c..8ddb110c 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -135,7 +135,7 @@ def __missing__(self, id): # elapsed_time: Total time interval covered by the trace traces = {} -# Peer address -> node names. Computed by AnalyzeRpcs. +# Peer address -> node name. Computed by AnalyzeRpcs. peer_nodes = {} # This variable holds information about every data packet in the traces. @@ -410,6 +410,18 @@ def get_first_interval_end(node=None): interval_end += options.interval return interval_end +def get_first_end(): + """ + Return the earliest time at which any of the traces ends (i.e. the last + time that is present in all of the trace files). + """ + earliest = 1e20 + for trace in traces.values(): + last = trace['last_time'] + if last < earliest: + earliest = last + return earliest + def get_first_time(): """ Return the earliest event time across all trace files. @@ -3651,7 +3663,7 @@ def output(self): node_req_handoffs[rpc['node']].append(delay) else: node_resp_handoffs[rpc['node']].append(delay) - elif 'queued' in rpc:q + elif 'queued' in rpc: delay = rpc['found'] - rpc['queued'] if id & 1: node_req_queued[rpc['node']].append(delay) @@ -4880,7 +4892,9 @@ def output(self): print('--------------') print('Network delay (including sending NIC, network, receiving NIC, and GRO') print('backup) for packets with GRO processing on a particular core.') - print('Pkts: Total data packets processed by Core on Node') + print('Node: Receiving node for packets') + print('Core: Core identifier on Node') + print('Pkts: Total incoming data packets processed by Core on Node') print('AvgDelay: Average end-to-end delay from ip_*xmit invocation to ' 'GRO (usec)') print('MaxDelay: Maximum end-to-end delay, and the time when the max packet was') @@ -5504,6 +5518,10 @@ def analyze(self): else: pkt['tso_length'] = tso_length + if not 'rx_node' in pkt: + if 'peer' in tx_rpc: + pkt['rx_node'] = peer_nodes[tx_rpc['peer']] + # Make sure that all of the smaller packets deriving from each # TSO packet are represented and properly populated (if one of # these packets is lost it won't be represented yet). @@ -5542,6 +5560,91 @@ def analyze(self): for pid, pkt in new_pkts: packets[pid] = pkt +#------------------------------------------------ +# Analyzer: pairs +#------------------------------------------------ +class AnalyzePairs: + """ + For each pair of nodes, outputs statistics about packet delays and + backlog as of the end of the traces. + """ + def __init__(self, dispatcher): + dispatcher.interest('AnalyzePackets') + + def output(self): + global traces, options, packets + print('\n-------------------') + print('Analyzer: pairs') + print('-------------------') + + # node -> dictionary mapping from node to the statistics about + # the node pair. 
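+        # (Illustrative example only, not part of the original patch:
+        # for two hypothetical nodes the loops below build a structure
+        # such as
+        #     pairs = {'node1': {'node2': {'delays': [], 'backlog': 0,
+        #                                  'xmit': 0}},
+        #              'node2': {'node1': {'delays': [], 'backlog': 0,
+        #                                  'xmit': 0}}}
+        # where 'delays' is later filled with NIC-to-GRO latencies in
+        # usec for packets sent from the outer key to the inner key.)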
+        pairs = {}
+        backlog_time = get_first_end()
+
+        for src in get_sorted_nodes():
+            dsts = {}
+            for dst in get_sorted_nodes():
+                if dst == src:
+                    continue
+                dsts[dst] = {'delays': [], 'backlog': 0, 'xmit': 0}
+            pairs[src] = dsts
+
+        for pkt in packets.values():
+            if not 'nic' in pkt:
+                continue
+            if not 'tx_node' in pkt:
+                continue
+            if not 'rx_node' in pkt:
+                continue
+            src = pkt['tx_node']
+            dst = pkt['rx_node']
+            if pkt['nic'] >= traces[dst]['last_time']:
+                continue
+            if pkt['nic'] < traces[dst]['first_time']:
+                continue
+            info = pairs[pkt['tx_node']][pkt['rx_node']]
+            info['xmit'] += 1
+            if 'gro' in pkt:
+                info['delays'].append(pkt['gro'] - pkt['nic'])
+            if not ('gro' in pkt) or (pkt['gro'] > backlog_time):
+                info['backlog'] += 1
+
+        print('Statistics about data packets sent between each distinct pair')
+        print('of nodes:')
+        print('Source: Node that transmitted packets')
+        print('Dest: Node to which packets were sent')
+        print('Xmits: Total number of packets sent from Source to Dest')
+        print('Backlog: Number of packets that had been sent but not received')
+        print(' as of the end of the traces (time %.1f)' %
+                (backlog_time))
+        print('DelayP10: 10th percentile delay (usec NIC to GRO) for received packets')
+        print('DelayP50: 50th percentile delay (usec NIC to GRO) for received packets')
+        print('DelayP90: 90th percentile delay (usec NIC to GRO) for received packets')
+        print('DelayP99: 99th percentile delay (usec NIC to GRO) for received packets')
+        print()
+        print('Source Dest Xmits Backlog DelayP10 DelayP50 DelayP90 DelayP99')
+        first = True
+        for src in get_sorted_nodes():
+            if not first:
+                print('')
+            for dst in get_sorted_nodes():
+                if dst == src:
+                    continue
+                info = pairs[src][dst]
+                delays = sorted(info['delays'])
+                if delays:
+                    print('%6s %6s %6d %6d %7.1f %7.1f %7.1f %7.1f' % (
+                            src, dst, info['xmit'], info['backlog'],
+                            delays[10*len(delays)//100],
+                            delays[len(delays)//2],
+                            delays[90*len(delays)//100],
+                            delays[99*len(delays)//100]))
+                else:
+                    print('%6s %6s %6d %6d %6d' % (src, dst, info['xmit'],
+                            info['backlog'], len(delays)))
+            first = False
+
 #------------------------------------------------
 # Analyzer: rpcs
 #------------------------------------------------
 class AnalyzeRpcs:
@@ -6098,7 +6201,7 @@ class AnalyzeRx:
     """
     Generates one data file for each node showing various statistics
     related to incoming message reception as a function of time, including
-    data rate, live messages, info about outstnanding grants, and where
+    data rate, live messages, info about outstanding grants, and where
     incoming data packets are curently located (qdisc, net, gro).
     Requires the --data and --gbps options. 
""" @@ -6830,40 +6933,45 @@ def __init__(self, dispatcher): dispatcher.interest('AnalyzePackets') def output(self): - global traces, options, packets + global traces, options, packets, rpcs print('\n-------------------') print('Analyzer: temp') print('-------------------') + print('Peer nodes: %s\n' % (peer_nodes)) + delays = [] pkts = [] node3pkts = 0 long = 50 node = options.node for pkt in packets.values(): + if pkt['id'] == 500018274: + print(pkt) if not 'nic' in pkt: continue - if not 'free_tx_skb' in pkt: - continue - if not 'gro' in pkt: - continue - max_gro = get_max_gro(pkt) - if (node != None) and ('tx_node' in pkt) and (node != pkt['tx_node']): + if not ('tx_node' in pkt) or (pkt['tx_node'] != 'node4'): continue - node3pkts += 1 - delta = pkt['free_tx_skb'] - max_gro - if delta < long: + if not ('rx_node' in pkt) or (pkt['rx_node'] != 'node1'): continue - pkts.append([delta, pkt]) - if pkts: - print('%d/%d packets (%.1f%%) had free delays > %d usec'% ( - len(pkts), node3pkts, 100*len(pkts)/node3pkts, long)) - else: - print('No packets had long free delays') - for delay, pkt in sorted(pkts, reverse=True, key=lambda t : t[0]): - print('RPC id %10d (%s), offset %6d, nic %9.3f, max_gro %9.3f, free %9.3f, ' - 'free delay %7.3f' % (pkt['id'], pkt['tx_node'], pkt['offset'], - pkt['nic'], get_max_gro(pkt), pkt['free_tx_skb'], - pkt['free_tx_skb'] - pkt['gro'])) + pkts.append(pkt) + if not pkts: + print('No data packets made it from node1 to node4 in the traces') + return + pkts.sort(key=lambda d : d['nic']) + print('RpcId Offset NIC GRO Delay') + for pkt in pkts: + if 'gro' in pkt: + delay = pkt['gro'] - pkt['nic'] + delays.append(delay) + print('%9d %6d %9.3f %9.3f %7.1f' % (pkt['id'], pkt['offset'], + pkt['nic'], pkt['gro'], pkt['gro'] - pkt['nic'])) + else: + print('%9d %6d %9.3f N/A' % (pkt['id'], pkt['offset'], + pkt['nic'])) + delays.sort() + print('\nDelays: average %.1f us, P50 %.1f us, P90 %.1f us, P99 %.1f us' % + (sum(delays)/len(delays), delays[50*len(delays)//100], + delays[90*len(delays)//100], delays[99*len(delays)//100])) def output_long_qdisc(self): global traces, options, packets From ffb3a885e118686dbbd547d258356797c82f0d8e Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 14 Mar 2025 09:56:44 -0700 Subject: [PATCH 214/625] Trivial improvement in comment in homa_skb.c --- homa_skb.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/homa_skb.c b/homa_skb.c index 8b129016..12c8ed5f 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: BSD-2-Clause -/* This file contains functions for allocating and freeing sk_buffs. In - * particular, this file implements efficient management of the memory used - * by sk_buffs. +/* This file contains functions for allocating and freeing sk_buffs for + * outbound packets. In particular, this file implements efficient management + * of the memory used by sk_buffs. */ #include "homa_impl.h" From ff39244788bd77bac2b8f03eabdaeb711ac91953 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 17 Mar 2025 11:28:57 -0700 Subject: [PATCH 215/625] Fix errors in comments --- homa_rpc.c | 2 +- homa_sock.c | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/homa_rpc.c b/homa_rpc.c index b8620f9d..7402f77b 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -270,7 +270,7 @@ void homa_rpc_end(struct homa_rpc *rpc) #ifndef __STRIP__ /* See strip.py */ /* The following line must occur before the socket is locked. 
This is - * necessary because homa__rpc releases the RPC lock and + * necessary because homa_grant_end_rpc releases the RPC lock and * reacquires it. */ homa_grant_end_rpc(rpc); diff --git a/homa_sock.c b/homa_sock.c index 6c9d2689..50af19a2 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -204,9 +204,6 @@ void homa_sock_unlink(struct homa_sock *hsk) { struct homa_socktab *socktab = hsk->homa->port_map; - /* If any scans refer to this socket, advance them to refer to - * the next socket instead. - */ spin_lock_bh(&socktab->write_lock); hlist_del_rcu(&hsk->socktab_links); spin_unlock_bh(&socktab->write_lock); From 8fb7e049c9b71df9c041fcfd38011981e014a924 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 17 Mar 2025 14:37:56 -0700 Subject: [PATCH 216/625] Delete homa_grant_check_needy --- homa_grant.c | 81 +++----------- homa_grant.h | 1 - homa_impl.h | 10 +- test/unit_homa_grant.c | 237 ++++++++--------------------------------- 4 files changed, 61 insertions(+), 268 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 4d5d7f62..42fe9e9c 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -245,8 +245,7 @@ int homa_grant_update_offset(struct homa_rpc *rpc, struct homa *homa) atomic_read(&rpc->msgin.rec_incoming); avl_incoming = homa->max_incoming - atomic_read(&homa->total_incoming); if (avl_incoming < incoming_delta) { - atomic_or(homa_grant_needy_bit(atomic_read(&rpc->msgin.rank)), - &homa->needy_ranks); + atomic_set(&homa->incoming_hit_limit, 1); tt_record3("insufficient headroom: needed %d, available %d, used %d", incoming_delta, avl_incoming, atomic_read(&homa->total_incoming)); @@ -274,8 +273,6 @@ int homa_grant_try_send(struct homa_rpc *rpc, struct homa *homa) { struct homa_grant_hdr grant; - atomic_andnot(homa_grant_needy_bit(atomic_read(&rpc->msgin.rank)), - &homa->needy_ranks); if (!homa_grant_update_offset(rpc, homa)) return 0; homa_grant_update_incoming(rpc, homa); @@ -330,8 +327,10 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) INC_METRIC(grant_check_calls, 1); homa_grant_update_incoming(rpc, homa); if (rpc->msgin.granted >= rpc->msgin.length) { - if (homa_grant_check_needy(homa)) + if (atomic_read(&homa->incoming_hit_limit) != 0 && + atomic_read(&homa->total_incoming) < homa->max_incoming) { goto recalc; + } goto done; } @@ -366,22 +365,17 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) goto recalc; } - if (atomic_read(&homa->needy_ranks) != 0) { - /* There are other RPCs that also need grants; process them - * in priority order (and make sure this RPC ges considered - * as well). - */ - atomic_or(homa_grant_needy_bit(rank), &homa->needy_ranks); - if (!homa_grant_check_needy(homa)) - goto done; - } else { - /* Ideally this should be the common case: no need to consider - * any other RPCs. - */ - if (!homa_grant_try_send(rpc, homa)) - goto done; + if (atomic_read(&homa->incoming_hit_limit) != 0 && + atomic_read(&homa->total_incoming) < homa->max_incoming) { + goto recalc; } + /* Ideally this should be the common case: no need to consider + * any other RPCs. + */ + if (!homa_grant_try_send(rpc, homa)) + goto done; + recalc: homa_grant_recalc(homa); @@ -424,7 +418,7 @@ void homa_grant_recalc(struct homa *homa) try_again = 0; atomic_inc(&homa->grant_recalc_count); - atomic_set(&homa->needy_ranks, 0); + atomic_set(&homa->incoming_hit_limit, 0); /* Clear the existing grant calculation. 
*/ for (i = 0; i < homa->num_active_rpcs; i++) @@ -560,53 +554,6 @@ int homa_grant_pick_rpcs(struct homa *homa, struct homa_rpc **rpcs, return num_rpcs; } -/** - * homa_grant_check_needy() - See if any of the RPCs in @homa->needy_ranks - * can now be granted; if so, issue grants to them. - * @homa: Overall information about the Homa transport. - * Return: Nonzero means that homa_grant_recalc needs to be called (the - * list of grantable RPCs changed). - */ -int homa_grant_check_needy(struct homa *homa) -{ - struct homa_rpc *rpc; - int result = 0; - int rank; - - INC_METRIC(grant_check_needy_calls, 1); - while (atomic_read(&homa->total_incoming) < homa->max_incoming) { - rank = ffs(atomic_read(&homa->needy_ranks)); - if (rank == 0) - break; - rank--; - atomic_andnot(homa_grant_needy_bit(rank), - &homa->needy_ranks); - - homa_grantable_lock(homa, 0); - if (rank >= homa->num_active_rpcs) { - /* active_rpcs changed before lock was acquired; - * no need for us to do anything more (someone else - * has already invoked homa_grant_recalc). - */ - homa_grantable_unlock(homa); - return 0; - } - - /* Must take reference on rpc to keep it alive, which can only - * be done safely while holding grantable lock. But, must - * release grantable lock before actually sending grant, in - * order to reduce contention. - */ - rpc = homa->active_rpcs[rank]; - homa_rpc_hold(rpc); - homa_grantable_unlock(homa); - - result |= homa_grant_try_send(rpc, homa); - homa_rpc_put(rpc); - } - return result; -} - /** * homa_grant_find_oldest() - Recompute the value of homa->oldest_rpc. * @homa: Overall data about the Homa protocol implementation. The diff --git a/homa_grant.h b/homa_grant.h index eadb27fc..189c78d5 100644 --- a/homa_grant.h +++ b/homa_grant.h @@ -9,7 +9,6 @@ int homa_grantable_lock_slow(struct homa *homa, int recalc); void homa_grant_add_rpc(struct homa_rpc *rpc); -int homa_grant_check_needy(struct homa *homa); void homa_grant_check_rpc(struct homa_rpc *rpc); void homa_grant_end_rpc(struct homa_rpc *rpc); void homa_grant_find_oldest(struct homa *homa); diff --git a/homa_impl.h b/homa_impl.h index 50b36757..175574ff 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -190,7 +190,8 @@ struct homa { /** * @active_rpcs: pointers to all of the RPCs that we will grant to - * right now. Slot 0 is highest priority. + * right now. Slot 0 is highest priority. A value may be NULL if the + * RPC has ended. */ struct homa_rpc *active_rpcs[HOMA_MAX_GRANTS]; @@ -241,11 +242,10 @@ struct homa { atomic_t total_incoming ____cacheline_aligned_in_smp; /** - * @needy_ranks: A bitmask selecting all of the indices in @active_rpcs - * whose RPCs could not be fully granted because @total_incoming - * hit the @max_incoming limit. + * @incoming_hit_limit: Nonzero means that one or more RPCs could + * not be fully granted because @total_incoming exceeded @max_incoming. 
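+	 *
+	 * (A condensed sketch of the intended protocol, pieced together
+	 * from the homa_grant.c changes in this patch rather than stated
+	 * anywhere authoritative: homa_grant_update_offset sets the flag
+	 * when incoming headroom runs out, homa_grant_check_rpc rechecks
+	 * it once capacity frees up, and homa_grant_recalc clears it:
+	 *
+	 *	if (avl_incoming < incoming_delta)
+	 *		atomic_set(&homa->incoming_hit_limit, 1);
+	 *	...
+	 *	if (atomic_read(&homa->incoming_hit_limit) != 0 &&
+	 *	    atomic_read(&homa->total_incoming) < homa->max_incoming)
+	 *		homa_grant_recalc(homa);
+	 *
+	 * where homa_grant_recalc begins by doing
+	 * atomic_set(&homa->incoming_hit_limit, 0).)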
*/ - atomic_t needy_ranks; + atomic_t incoming_hit_limit; #endif /* See strip.py */ /** diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index e408624b..2dffa18e 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -35,14 +35,6 @@ static void grantable_spinlock_hook(char *id) mock_ns = 1000; } -static void change_active_hook(char *id) -{ - if (strcmp(id, "spin_lock") != 0) - return; - if (hook_homa != NULL) - hook_homa->num_active_rpcs = 0; -} - FIXTURE(homa_grant) { struct in6_addr client_ip[5]; int client_port; @@ -421,7 +413,7 @@ TEST_F(homa_grant, homa_grant_update_offset__basics) EXPECT_EQ(1, homa_grant_update_offset(rpc, &self->homa)); EXPECT_EQ(10000, rpc->msgin.granted); - EXPECT_EQ(0, atomic_read(&self->homa.needy_ranks)); + EXPECT_EQ(0, atomic_read(&self->homa.incoming_hit_limit)); } TEST_F(homa_grant, homa_grant_update_offset__rpc_idle) { @@ -439,7 +431,7 @@ TEST_F(homa_grant, homa_grant_update_offset__end_of_message) rpc->msgin.bytes_remaining = 5000; EXPECT_EQ(1, homa_grant_update_offset(rpc, &self->homa)); EXPECT_EQ(20000, rpc->msgin.granted); - EXPECT_EQ(0, atomic_read(&self->homa.needy_ranks)); + EXPECT_EQ(0, atomic_read(&self->homa.incoming_hit_limit)); /* Second call cannot grant anymore. */ EXPECT_EQ(0, homa_grant_update_offset(rpc, &self->homa)); @@ -454,7 +446,7 @@ TEST_F(homa_grant, homa_grant_update_offset__insufficient_room_in_incoming) atomic_set(&self->homa.total_incoming, 48000); EXPECT_EQ(1, homa_grant_update_offset(rpc, &self->homa)); EXPECT_EQ(17000, rpc->msgin.granted); - EXPECT_EQ(0x20, atomic_read(&self->homa.needy_ranks)); + EXPECT_EQ(1, atomic_read(&self->homa.incoming_hit_limit)); } TEST_F(homa_grant, homa_grant_update_offset__incoming_overcommitted) { @@ -464,7 +456,7 @@ TEST_F(homa_grant, homa_grant_update_offset__incoming_overcommitted) atomic_set(&self->homa.total_incoming, 51000); EXPECT_EQ(0, homa_grant_update_offset(rpc, &self->homa)); EXPECT_EQ(0, rpc->msgin.granted); - EXPECT_EQ(0x40, atomic_read(&self->homa.needy_ranks)); + EXPECT_EQ(1, atomic_read(&self->homa.incoming_hit_limit)); } TEST_F(homa_grant, homa_grant_try_send__basics) @@ -472,11 +464,9 @@ TEST_F(homa_grant, homa_grant_try_send__basics) struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); atomic_set(&rpc->msgin.rank, 1); - atomic_set(&self->homa.needy_ranks, 7); unit_log_clear(); EXPECT_EQ(0, homa_grant_try_send(rpc, &self->homa)); EXPECT_EQ(10000, rpc->msgin.granted); - EXPECT_EQ(5, atomic_read(&self->homa.needy_ranks)); EXPECT_EQ(10000, atomic_read(&self->homa.total_incoming)); EXPECT_STREQ("xmit GRANT 10000@0", unit_log_get()); } @@ -485,12 +475,11 @@ TEST_F(homa_grant, homa_grant_try_send__cant_grant) struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); atomic_set(&rpc->msgin.rank, 1); - atomic_set(&self->homa.needy_ranks, 7); atomic_set(&self->homa.total_incoming, self->homa.max_incoming); unit_log_clear(); EXPECT_EQ(0, homa_grant_try_send(rpc, &self->homa)); EXPECT_EQ(0, rpc->msgin.granted); - EXPECT_EQ(7, atomic_read(&self->homa.needy_ranks)); + EXPECT_EQ(1, atomic_read(&self->homa.incoming_hit_limit)); EXPECT_EQ(50000, atomic_read(&self->homa.total_incoming)); EXPECT_STREQ("", unit_log_get()); } @@ -557,35 +546,24 @@ TEST_F(homa_grant, homa_grant_check_rpc__rpc_dead) EXPECT_EQ(2000, atomic_read(&rpc->msgin.rec_incoming)); EXPECT_EQ(2000, atomic_read(&self->homa.total_incoming)); } -TEST_F(homa_grant, homa_grant_check_rpc__message_doesnt_need_grants) +TEST_F(homa_grant, 
homa_grant_check_rpc__message_fully_granted_no_recalc) { - struct homa_rpc *rpc1, *rpc2; - - rpc1 = test_rpc(self, 100, self->server_ip, 20000); - atomic_set(&self->homa.total_incoming, self->homa.max_incoming); - homa_grant_recalc(&self->homa); - EXPECT_EQ(0, rpc1->msgin.granted); - EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); - EXPECT_EQ(1, atomic_read(&self->homa.needy_ranks)); - atomic_set(&self->homa.total_incoming, 0); + struct homa_rpc *rpc; - rpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 102, 1000, 2000); - homa_message_in_init(rpc2, 2000, 0); - rpc2->msgin.granted = 2000; - rpc2->msgin.bytes_remaining = 500; + homa_message_in_init(rpc, 2000, 0); + rpc->msgin.granted = 2000; + rpc->msgin.bytes_remaining = 500; unit_log_clear(); - homa_grant_check_rpc(rpc2); - EXPECT_EQ(500, atomic_read(&rpc2->msgin.rec_incoming)); - EXPECT_EQ(10500, atomic_read(&self->homa.total_incoming)); - EXPECT_EQ(10000, rpc1->msgin.granted); - EXPECT_STREQ("xmit GRANT 10000@0", unit_log_get()); + homa_grant_check_rpc(rpc); + EXPECT_STREQ("", unit_log_get()); } -TEST_F(homa_grant, homa_grant_check_rpc__message_doesnt_need_grants_must_recalc) +TEST_F(homa_grant, homa_grant_check_rpc__message_fully_granted_must_recalc) { - struct homa_rpc *rpc1, *rpc2, *rpc3; + struct homa_rpc *rpc1, *rpc2; /* First RPC is complete. */ rpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, @@ -594,30 +572,26 @@ TEST_F(homa_grant, homa_grant_check_rpc__message_doesnt_need_grants_must_recalc) homa_message_in_init(rpc1, 2000, 0); rpc1->msgin.granted = 2000; rpc1->msgin.bytes_remaining = 0; + atomic_set(&rpc1->msgin.rec_incoming, 1500); /* Second RPC will be waiting for incoming. */ rpc2 = test_rpc(self, 100, self->server_ip, 5000); - /* Third RPC will get granted when homa_grant_check_rpc calls - * homa_grant_recalc. 
*/ - rpc3 = test_rpc(self, 100, self->server_ip, 20000); - atomic_set(&self->homa.total_incoming, self->homa.max_incoming); self->homa.max_overcommit = 1; homa_grant_recalc(&self->homa); + EXPECT_EQ(1, atomic_read(&self->homa.incoming_hit_limit)); EXPECT_EQ(0, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(0, rpc2->msgin.granted); - EXPECT_EQ(-1, atomic_read(&rpc3->msgin.rank)); - EXPECT_EQ(0, rpc3->msgin.granted); - atomic_set(&self->homa.total_incoming, 0); + // atomic_set(&self->homa.total_incoming, 0); unit_log_clear(); homa_grant_check_rpc(rpc1); - EXPECT_EQ(5000, rpc2->msgin.granted); - EXPECT_EQ(10000, rpc3->msgin.granted); - EXPECT_EQ(15000, atomic_read(&self->homa.total_incoming)); - EXPECT_STREQ("xmit GRANT 5000@0; homa_grant_recalc; xmit GRANT 10000@0", + EXPECT_EQ(1500, rpc2->msgin.granted); + EXPECT_EQ(self->homa.max_incoming, + atomic_read(&self->homa.total_incoming)); + EXPECT_STREQ("homa_grant_recalc; xmit GRANT 1500@0", unit_log_get()); } TEST_F(homa_grant, homa_grant_check_rpc__add_new_message_to_grantables) @@ -725,79 +699,48 @@ TEST_F(homa_grant, homa_grant_check_rpc__upgrade_priority_from_positive_rank) EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); EXPECT_STREQ("homa_grant_recalc; xmit GRANT 25000@1", unit_log_get()); } -TEST_F(homa_grant, homa_grant_check_rpc__check_all_needy) +TEST_F(homa_grant, homa_grant_check_rpc__check_incoming_no_recalc) { - struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; + struct homa_rpc *rpc1, *rpc2, *rpc3; rpc1 = test_rpc(self, 100, self->server_ip, 20000); rpc2 = test_rpc(self, 100, self->server_ip, 30000); rpc3 = test_rpc(self, 100, self->server_ip, 40000); - rpc4 = test_rpc(self, 100, self->server_ip, 50000); atomic_set(&self->homa.total_incoming, self->homa.max_incoming); homa_grant_recalc(&self->homa); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); - EXPECT_EQ(3, atomic_read(&rpc4->msgin.rank)); + EXPECT_EQ(2, atomic_read(&rpc3->msgin.rank)); - atomic_set(&self->homa.total_incoming, self->homa.max_incoming - 25000); - atomic_set(&self->homa.needy_ranks, 0x9); + atomic_set(&self->homa.total_incoming, self->homa.max_incoming - 15000); + atomic_set(&self->homa.incoming_hit_limit, 0); unit_log_clear(); homa_grant_check_rpc(rpc3); - EXPECT_EQ(10000, rpc1->msgin.granted); + EXPECT_EQ(0, rpc1->msgin.granted); EXPECT_EQ(0, rpc2->msgin.granted); EXPECT_EQ(10000, rpc3->msgin.granted); - EXPECT_EQ(5000, rpc4->msgin.granted); - EXPECT_STREQ("xmit GRANT 10000@3; xmit GRANT 10000@1; xmit GRANT 5000@0", - unit_log_get()); -} -TEST_F(homa_grant, homa_grant_check_rpc__recalc_after_cnecking_needy) -{ - struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; - - rpc1 = test_rpc(self, 100, self->server_ip, 5000); - rpc2 = test_rpc(self, 100, self->server_ip, 10000); - rpc3 = test_rpc(self, 100, self->server_ip, 20000); - rpc4 = test_rpc(self, 100, self->server_ip, 30000); - self->homa.max_overcommit = 3; - atomic_set(&self->homa.total_incoming, self->homa.max_incoming); - homa_grant_recalc(&self->homa); - EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); - EXPECT_EQ(-1, atomic_read(&rpc4->msgin.rank)); - - atomic_set(&self->homa.total_incoming, self->homa.max_incoming - 10000); - atomic_set(&self->homa.needy_ranks, 0x6); - unit_log_clear(); - homa_grant_check_rpc(rpc3); - EXPECT_EQ(0, rpc1->msgin.granted); - EXPECT_EQ(10000, rpc2->msgin.granted); - EXPECT_EQ(0, rpc3->msgin.granted); - EXPECT_EQ(0, rpc4->msgin.granted); - EXPECT_STREQ("xmit GRANT 10000@1; homa_grant_recalc", unit_log_get()); - EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); - EXPECT_EQ(-1, 
atomic_read(&rpc2->msgin.rank)); - EXPECT_EQ(2, atomic_read(&rpc4->msgin.rank)); + EXPECT_STREQ("xmit GRANT 10000@0", unit_log_get()); } -TEST_F(homa_grant, homa_grant_check_rpc__recalc_after_checking_needy) +TEST_F(homa_grant, homa_grant_check_rpc__check_incoming_recalc) { - struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; + struct homa_rpc *rpc1, *rpc2, *rpc3; - rpc1 = test_rpc(self, 100, self->server_ip, 10000); + rpc1 = test_rpc(self, 100, self->server_ip, 20000); rpc2 = test_rpc(self, 100, self->server_ip, 30000); rpc3 = test_rpc(self, 100, self->server_ip, 40000); - rpc4 = test_rpc(self, 100, self->server_ip, 50000); - self->homa.max_overcommit = 3; - self->homa.max_incoming = 0; + atomic_set(&self->homa.total_incoming, self->homa.max_incoming); homa_grant_recalc(&self->homa); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); - EXPECT_EQ(-1, atomic_read(&rpc4->msgin.rank)); + EXPECT_EQ(2, atomic_read(&rpc3->msgin.rank)); - self->homa.max_incoming = 15000; + atomic_set(&self->homa.total_incoming, self->homa.max_incoming - 15000); + atomic_set(&self->homa.incoming_hit_limit, 1); unit_log_clear(); homa_grant_check_rpc(rpc3); EXPECT_EQ(10000, rpc1->msgin.granted); EXPECT_EQ(5000, rpc2->msgin.granted); EXPECT_EQ(0, rpc3->msgin.granted); - EXPECT_EQ(0, rpc4->msgin.granted); - EXPECT_EQ(2, atomic_read(&rpc4->msgin.rank)); + EXPECT_STREQ("homa_grant_recalc; xmit GRANT 10000@2; xmit GRANT 5000@1", + unit_log_get()); } TEST_F(homa_grant, homa_grant_check_rpc__grant_to_self) { @@ -813,7 +756,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__grant_to_self) EXPECT_EQ(3, atomic_read(&rpc4->msgin.rank)); atomic_set(&self->homa.total_incoming, self->homa.max_incoming - 20000); - atomic_set(&self->homa.needy_ranks, 0); + atomic_set(&self->homa.incoming_hit_limit, 0); unit_log_clear(); homa_grant_check_rpc(rpc3); EXPECT_EQ(0, rpc1->msgin.granted); @@ -837,7 +780,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__grant_to_self_and_recalc) EXPECT_EQ(-1, atomic_read(&rpc4->msgin.rank)); atomic_set(&self->homa.total_incoming, self->homa.max_incoming - 10000); - atomic_set(&self->homa.needy_ranks, 0); + atomic_set(&self->homa.incoming_hit_limit, 0); unit_log_clear(); homa_grant_check_rpc(rpc3); EXPECT_EQ(0, rpc1->msgin.granted); @@ -858,7 +801,7 @@ TEST_F(homa_grant, homa_grant_recalc__basics) rpc4 = test_rpc(self, 106, self->server_ip+1, 35000); self->homa.max_incoming = 100000; self->homa.max_overcommit = 3; - atomic_set(&self->homa.needy_ranks, 1); + atomic_set(&self->homa.incoming_hit_limit, 1); mock_ns_tick = 10; unit_log_clear(); @@ -871,7 +814,7 @@ TEST_F(homa_grant, homa_grant_recalc__basics) EXPECT_EQ(10000, rpc1->msgin.granted); EXPECT_EQ(20000, atomic_read(&self->homa.active_remaining[0])); EXPECT_EQ(1, atomic_read(&self->homa.grant_recalc_count)); - EXPECT_EQ(0, atomic_read(&self->homa.needy_ranks)); + EXPECT_EQ(0, atomic_read(&self->homa.incoming_hit_limit)); EXPECT_EQ(1, atomic_read(&rpc3->msgin.rank)); EXPECT_EQ(1, rpc3->msgin.priority); @@ -1133,102 +1076,6 @@ TEST_F(homa_grant, homa_grant_pick_rpcs__first_rpc_of_peer_doesnt_fit) EXPECT_STREQ("200 300 400", rpc_ids(rpcs, count)); } -TEST_F(homa_grant, homa_grant_check_needy__basics) -{ - struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; - - rpc1 = test_rpc(self, 100, self->server_ip, 20000); - rpc2 = test_rpc(self, 102, self->server_ip, 30000); - rpc3 = test_rpc(self, 104, self->server_ip, 40000); - rpc4 = test_rpc(self, 106, self->server_ip, 50000); - self->homa.max_incoming = 0; - - unit_log_clear(); - homa_grant_recalc(&self->homa); - 
EXPECT_STREQ("homa_grant_recalc", unit_log_get()); - EXPECT_EQ(0xf, atomic_read(&self->homa.needy_ranks)); - - atomic_set(&self->homa.needy_ranks, 0x5); - self->homa.max_incoming = 50000; - unit_log_clear(); - EXPECT_EQ(0, homa_grant_check_needy(&self->homa)); - EXPECT_STREQ("xmit GRANT 10000@3; xmit GRANT 10000@1", unit_log_get()); - EXPECT_EQ(10000, rpc1->msgin.granted); - EXPECT_EQ(0, rpc2->msgin.granted); - EXPECT_EQ(10000, rpc3->msgin.granted); - EXPECT_EQ(0, rpc4->msgin.granted); - EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_needy_calls); -} -TEST_F(homa_grant, homa_grant_check_needy__incoming_exhausted) -{ - struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; - - rpc1 = test_rpc(self, 100, self->server_ip, 20000); - rpc2 = test_rpc(self, 102, self->server_ip, 30000); - rpc3 = test_rpc(self, 104, self->server_ip, 40000); - rpc4 = test_rpc(self, 106, self->server_ip, 50000); - self->homa.max_incoming = 0; - - unit_log_clear(); - homa_grant_recalc(&self->homa); - EXPECT_STREQ("homa_grant_recalc", unit_log_get()); - EXPECT_EQ(0xf, atomic_read(&self->homa.needy_ranks)); - - self->homa.max_incoming = 15000; - unit_log_clear(); - EXPECT_EQ(0, homa_grant_check_needy(&self->homa)); - EXPECT_EQ(10000, rpc1->msgin.granted); - EXPECT_EQ(5000, rpc2->msgin.granted); - EXPECT_EQ(0, rpc3->msgin.granted); - EXPECT_EQ(0, rpc4->msgin.granted); -} -TEST_F(homa_grant, homa_grant_check_needy__num_active_rpcs_changed) -{ - struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; - - rpc1 = test_rpc(self, 100, self->server_ip, 20000); - rpc2 = test_rpc(self, 102, self->server_ip, 30000); - rpc3 = test_rpc(self, 104, self->server_ip, 40000); - rpc4 = test_rpc(self, 106, self->server_ip, 50000); - self->homa.max_incoming = 0; - - unit_log_clear(); - homa_grant_recalc(&self->homa); - EXPECT_STREQ("homa_grant_recalc", unit_log_get()); - EXPECT_EQ(0xf, atomic_read(&self->homa.needy_ranks)); - - hook_homa = &self->homa; - unit_hook_register(change_active_hook); - self->homa.max_incoming = 50000; - unit_log_clear(); - EXPECT_EQ(0, homa_grant_check_needy(&self->homa)); - EXPECT_EQ(0, rpc1->msgin.granted); - EXPECT_EQ(0, rpc2->msgin.granted); - EXPECT_EQ(0, rpc3->msgin.granted); - EXPECT_EQ(0, rpc4->msgin.granted); -} -TEST_F(homa_grant, homa_grant_check_needy__recalc_needed) -{ - struct homa_rpc *rpc1, *rpc2, *rpc3; - - rpc1 = test_rpc(self, 100, self->server_ip, 10000); - rpc2 = test_rpc(self, 102, self->server_ip, 20000); - rpc3 = test_rpc(self, 104, self->server_ip, 30000); - self->homa.max_incoming = 0; - - unit_log_clear(); - homa_grant_recalc(&self->homa); - EXPECT_STREQ("homa_grant_recalc", unit_log_get()); - EXPECT_EQ(0x7, atomic_read(&self->homa.needy_ranks)); - - self->homa.max_incoming = 50000; - unit_log_clear(); - EXPECT_EQ(1, homa_grant_check_needy(&self->homa)); - EXPECT_EQ(10000, rpc1->msgin.granted); - EXPECT_EQ(10000, rpc2->msgin.granted); - EXPECT_EQ(10000, rpc3->msgin.granted); -} - TEST_F(homa_grant, homa_grant_find_oldest__basics) { mock_ns_tick = 10; From 4eb33ff5eccea96d5f9beb15bc4369e3cb5d6ab9 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 17 Mar 2025 15:59:30 -0700 Subject: [PATCH 217/625] Fix potential races related to homa->active_rpcs These values are now only hints, never dereferenced. 
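For reference, the validation idiom this makes possible looks roughly as
follows (editorial sketch: homa_rpc_active_rank is a hypothetical name,
not part of this patch; the check mirrors the ones added below in
homa_grant.c and homa_rpc.c):

	/* Trust msgin.rank only if active_rpcs[rank] still points back
	 * at this RPC; the array entry is compared, never dereferenced,
	 * so a stale pointer is harmless.
	 */
	static inline int homa_rpc_active_rank(struct homa *homa,
					       struct homa_rpc *rpc)
	{
		int rank = atomic_read(&rpc->msgin.rank);

		if (rank < 0 || rank >= HOMA_MAX_GRANTS ||
		    homa->active_rpcs[rank] != rpc)
			return -1;	/* Hint is stale; RPC not active. */
		return rank;
	}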
--- homa_grant.c | 29 ++++++++----------- homa_impl.h | 11 ++++---- homa_incoming.c | 2 +- homa_rpc.c | 8 ++++-- homa_rpc.h | 7 ++--- test/unit_homa_grant.c | 64 ++++++++++++++++++++---------------------- 6 files changed, 58 insertions(+), 63 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 42fe9e9c..38508228 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -349,7 +349,8 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) /* Not a new message; see if we can upgrade the message's priority. */ rank = atomic_read(&rpc->msgin.rank); - if (rank < 0) { + if (homa->active_rpcs[rank] != rpc) { + /* RPC not currently active. */ if (rpc->msgin.bytes_remaining < atomic_read(&homa->active_remaining[homa->max_overcommit - 1])) { @@ -371,8 +372,8 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) } /* Ideally this should be the common case: no need to consider - * any other RPCs. - */ + * any other RPCs. + */ if (!homa_grant_try_send(rpc, homa)) goto done; @@ -420,15 +421,16 @@ void homa_grant_recalc(struct homa *homa) atomic_inc(&homa->grant_recalc_count); atomic_set(&homa->incoming_hit_limit, 0); - /* Clear the existing grant calculation. */ - for (i = 0; i < homa->num_active_rpcs; i++) - atomic_set(&homa->active_rpcs[i]->msgin.rank, -1); - /* Recompute which RPCs we'll grant to and initialize info * about them. */ active = homa_grant_pick_rpcs(homa, homa->active_rpcs, homa->max_overcommit); + for (i = active; i < homa->max_overcommit; i++) + /* Effectively invalidates @msgin.rank in RPCs that + * are no longer active. + */ + homa->active_rpcs[i] = NULL; homa->num_active_rpcs = active; for (i = 0; i < active; i++) { struct homa_rpc *rpc = homa->active_rpcs[i]; @@ -613,7 +615,7 @@ void homa_grant_end_rpc(struct homa_rpc *rpc) if (!list_empty(&rpc->grantable_links)) { homa_grant_remove_rpc(rpc); - if (atomic_read(&rpc->msgin.rank) >= 0) { + if (homa->active_rpcs[atomic_read(&rpc->msgin.rank)] == rpc) { homa_rpc_hold(rpc); homa_rpc_unlock(rpc); homa_grant_recalc(homa); @@ -673,16 +675,7 @@ int homa_grantable_lock_slow(struct homa *homa, int recalc) */ void homa_grant_log_tt(struct homa *homa) { - int i; - - homa_grantable_lock(homa, 0); - tt_record1("homa_grant_log_tt found %d active RPCs:", + tt_record1("homa_grant_log_tt found %d active RPCs", homa->num_active_rpcs); - for (i = 0; i < homa->num_active_rpcs; i++) { - tt_record2("active_rpcs[%d]: id %d", i, - homa->active_rpcs[i]->id); - homa_rpc_log_tt(homa->active_rpcs[i]); - } - homa_grantable_unlock(homa); } #endif /* See strip.py */ diff --git a/homa_impl.h b/homa_impl.h index 175574ff..34c8520f 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -189,17 +189,18 @@ struct homa { int num_active_rpcs; /** - * @active_rpcs: pointers to all of the RPCs that we will grant to - * right now. Slot 0 is highest priority. A value may be NULL if the - * RPC has ended. + * @active_rpcs: Hints about RPCs that we are currently granting to + * (lower index in the array means higher priority). Entries may be + * NULL or may refer to RPCs that no longer exist, so can't dereference + * these pointers. */ struct homa_rpc *active_rpcs[HOMA_MAX_GRANTS]; /** - * @bytes_remaining: entry i in this array contains a copy of + * @active_remaining: entry i in this array contains a copy of * active_rpcs[i]->msgin.bytes_remaining. These values can be * updated by the corresponding RPCs without holding the grantable - * lock. Perfect consistency isn't required; this is used only to + * lock. 
Perfect consistency isn't required; these are hints used to
+	 * detect when the priority ordering of messages changes.
 	 */
 	atomic_t active_remaining[HOMA_MAX_GRANTS];
 
diff --git a/homa_incoming.c b/homa_incoming.c
index f5e5b3a2..7d178943 100644
--- a/homa_incoming.c
+++ b/homa_incoming.c
@@ -52,7 +52,7 @@ int homa_message_in_init(struct homa_rpc *rpc, int length)
 	rpc->msgin.bytes_remaining = length;
 #ifndef __STRIP__ /* See strip.py */
 	rpc->msgin.granted = (unsched > length) ? length : unsched;
-	atomic_set(&rpc->msgin.rank, -1);
+	atomic_set(&rpc->msgin.rank, 0);
 #endif /* See strip.py */
 	err = homa_pool_allocate(rpc);
 	if (err != 0)
diff --git a/homa_rpc.c b/homa_rpc.c
index 7402f77b..4f9cbe7c 100644
--- a/homa_rpc.c
+++ b/homa_rpc.c
@@ -608,6 +608,8 @@ void homa_rpc_log_tt(struct homa_rpc *rpc)
 	if (rpc->state == RPC_INCOMING) {
 		int received = rpc->msgin.length -
 				rpc->msgin.bytes_remaining;
+		int rank;
+
 		tt_record4("Incoming RPC id %d, peer 0x%x, %d/%d bytes received",
 			   rpc->id, tt_addr(rpc->peer->addr), received,
 			   rpc->msgin.length);
@@ -615,10 +617,12 @@ void homa_rpc_log_tt(struct homa_rpc *rpc)
 		tt_record4("RPC id %d has incoming %d, granted %d, prio %d",
 			   rpc->id, rpc->msgin.granted - received,
 			   rpc->msgin.granted, rpc->msgin.priority);
+		rank = atomic_read(&rpc->msgin.rank);
+		if (rpc->hsk->homa->active_rpcs[rank] != rpc)
+			rank = -1;
 		tt_record4("RPC id %d: length %d, remaining %d, rank %d",
 			   rpc->id, rpc->msgin.length,
-			   rpc->msgin.bytes_remaining,
-			   atomic_read(&rpc->msgin.rank));
+			   rpc->msgin.bytes_remaining, rank);
 		if (rpc->msgin.num_bpages == 0)
 			tt_record1("RPC id %d is blocked waiting for buffers",
 				   rpc->id);
diff --git a/homa_rpc.h b/homa_rpc.h
index 5c684f4f..68e52761 100644
--- a/homa_rpc.h
+++ b/homa_rpc.h
@@ -160,10 +160,9 @@ struct homa_message_in {
 	atomic_t rec_incoming;
 
 	/**
-	 * @rank: The index of this RPC in homa->active_rpcs and
-	 * homa->active_remaining, or -1 if this RPC is not in those arrays.
-	 * Lower number means higher priority. Must be atomic because it
-	 * is read without synchronization.
+	 * @rank: A hint: if homa->active_rpcs[@rank] refers to this RPC then
+	 * the RPC is active and this value indicates the RPC's priority (lower
+	 * is better). Read without synchronization, so must be atomic. 
*/ atomic_t rank; diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 2dffa18e..d77e60df 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -616,7 +616,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__new_message_bumps_existing) rpc1 = test_rpc(self, 100, self->server_ip, 20000); rpc2 = test_rpc(self, 102, self->server_ip, 30000); - self->homa.max_overcommit = 2; + self->homa.max_overcommit = 3; homa_grant_recalc(&self->homa); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); @@ -628,7 +628,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__new_message_bumps_existing) EXPECT_EQ(10000, rpc3->msgin.granted); EXPECT_EQ(10000, atomic_read(&rpc3->msgin.rec_incoming)); EXPECT_EQ(1, atomic_read(&rpc3->msgin.rank)); - EXPECT_EQ(-1, atomic_read(&rpc2->msgin.rank)); + EXPECT_EQ(2, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); } TEST_F(homa_grant, homa_grant_check_rpc__new_message_cant_be_granted) @@ -649,7 +649,6 @@ TEST_F(homa_grant, homa_grant_check_rpc__new_message_cant_be_granted) homa_grant_check_rpc(rpc3); EXPECT_EQ(0, rpc3->msgin.granted); EXPECT_EQ(0, atomic_read(&rpc3->msgin.rec_incoming)); - EXPECT_EQ(-1, atomic_read(&rpc3->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); } @@ -663,17 +662,17 @@ TEST_F(homa_grant, homa_grant_check_rpc__upgrade_priority_from_negative_rank) self->homa.max_overcommit = 2; homa_grant_recalc(&self->homa); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); + EXPECT_EQ(rpc1, self->homa.active_rpcs[0]); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); - EXPECT_EQ(-1, atomic_read(&rpc3->msgin.rank)); + EXPECT_EQ(rpc2, self->homa.active_rpcs[1]); EXPECT_EQ(0, rpc3->msgin.granted); rpc3->msgin.bytes_remaining = 15000; homa_grant_check_rpc(rpc3); EXPECT_EQ(35000, rpc3->msgin.granted); EXPECT_EQ(10000, atomic_read(&rpc3->msgin.rec_incoming)); - EXPECT_EQ(0, atomic_read(&rpc3->msgin.rank)); - EXPECT_EQ(-1, atomic_read(&rpc2->msgin.rank)); - EXPECT_EQ(1, atomic_read(&rpc1->msgin.rank)); + EXPECT_EQ(rpc1, self->homa.active_rpcs[1]); + EXPECT_EQ(rpc3, self->homa.active_rpcs[0]); } TEST_F(homa_grant, homa_grant_check_rpc__upgrade_priority_from_positive_rank) { @@ -777,7 +776,6 @@ TEST_F(homa_grant, homa_grant_check_rpc__grant_to_self_and_recalc) self->homa.max_overcommit = 3; homa_grant_recalc(&self->homa); EXPECT_EQ(2, atomic_read(&rpc3->msgin.rank)); - EXPECT_EQ(-1, atomic_read(&rpc4->msgin.rank)); atomic_set(&self->homa.total_incoming, self->homa.max_incoming - 10000); atomic_set(&self->homa.incoming_hit_limit, 0); @@ -822,7 +820,7 @@ TEST_F(homa_grant, homa_grant_recalc__basics) EXPECT_EQ(30000, atomic_read(&self->homa.active_remaining[2])); EXPECT_EQ(2, atomic_read(&rpc2->msgin.rank)); - EXPECT_EQ(-1, atomic_read(&rpc4->msgin.rank)); + EXPECT_EQ(0, rpc4->msgin.granted); EXPECT_NE(0, homa_metrics_per_cpu()->grant_recalc_ns); } TEST_F(homa_grant, homa_grant_recalc__skip_recalc) @@ -840,24 +838,27 @@ TEST_F(homa_grant, homa_grant_recalc__skip_recalc) EXPECT_EQ(2, atomic_read(&self->homa.grant_recalc_count)); EXPECT_EQ(1, homa_metrics_per_cpu()->grant_recalc_skips); } -TEST_F(homa_grant, homa_grant_recalc__clear_existing_active_rpcs) +TEST_F(homa_grant, homa_grant_recalc__clear_unused_entries_in_active_rpcs) { - struct homa_rpc *rpc1; - - rpc1 = test_rpc(self, 100, self->server_ip, 40000); - test_rpc(self, 102, self->server_ip, 30000); - test_rpc(self, 104, self->server_ip, 25000); - test_rpc(self, 106, self->server_ip, 35000); 
- self->homa.active_rpcs[0] = rpc1; - atomic_set(&rpc1->msgin.rank, 10); - self->homa.num_active_rpcs = 1; + struct homa_rpc *rpc1, *rpc2, *rpc3; + + rpc1 = test_rpc(self, 100, self->server_ip, 20000); + rpc2 = test_rpc(self, 102, self->server_ip, 30000); + rpc3 = test_rpc(self, 104, self->server_ip, 25000); self->homa.max_incoming = 100000; - self->homa.max_rpcs_per_peer = 10; - self->homa.max_overcommit = 2; + self->homa.max_overcommit = 3; + + homa_grant_recalc(&self->homa); + EXPECT_EQ(rpc1, self->homa.active_rpcs[0]); + EXPECT_EQ(rpc3, self->homa.active_rpcs[1]); + EXPECT_EQ(rpc2, self->homa.active_rpcs[2]); + homa_rpc_end(rpc1); + homa_rpc_end(rpc2); homa_grant_recalc(&self->homa); - EXPECT_EQ(-1, atomic_read(&rpc1->msgin.rank)); - EXPECT_EQ(2, self->homa.num_active_rpcs); + EXPECT_EQ(rpc3, self->homa.active_rpcs[0]); + EXPECT_EQ(NULL, self->homa.active_rpcs[1]); + EXPECT_EQ(NULL, self->homa.active_rpcs[2]); } TEST_F(homa_grant, homa_grant_recalc__use_only_lowest_priorities) { @@ -1117,7 +1118,7 @@ TEST_F(homa_grant, homa_grant_find_oldest__no_good_candidates) EXPECT_EQ(NULL, self->homa.oldest_rpc); } -TEST_F(homa_grant, homa_grant_rpc_free__rpc_not_grantable) +TEST_F(homa_grant, homa_grant_end_rpc__rpc_not_grantable) { struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, @@ -1127,7 +1128,7 @@ TEST_F(homa_grant, homa_grant_rpc_free__rpc_not_grantable) homa_grant_end_rpc(rpc); EXPECT_EQ(7000, atomic_read(&self->homa.total_incoming)); } -TEST_F(homa_grant, homa_grant_free_rpc__in_active_list) +TEST_F(homa_grant, homa_grant_end_rpc__in_active_list) { struct homa_rpc *rpc1, *rpc2, *rpc3; @@ -1136,20 +1137,18 @@ TEST_F(homa_grant, homa_grant_free_rpc__in_active_list) rpc3 = test_rpc(self, 104, self->server_ip, 40000); self->homa.max_overcommit = 2; homa_grant_recalc(&self->homa); - EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); - EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); - EXPECT_EQ(-1, atomic_read(&rpc3->msgin.rank)); + EXPECT_EQ(rpc1, self->homa.active_rpcs[0]); + EXPECT_EQ(rpc2, self->homa.active_rpcs[1]); EXPECT_EQ(20000, atomic_read(&self->homa.total_incoming)); EXPECT_EQ(10000, atomic_read(&rpc1->msgin.rec_incoming)); unit_log_clear(); homa_grant_end_rpc(rpc1); - EXPECT_EQ(-1, atomic_read(&rpc1->msgin.rank)); - EXPECT_EQ(0, atomic_read(&rpc2->msgin.rank)); - EXPECT_EQ(1, atomic_read(&rpc3->msgin.rank)); + EXPECT_EQ(rpc2, self->homa.active_rpcs[0]); + EXPECT_EQ(rpc3, self->homa.active_rpcs[1]); EXPECT_EQ(20000, atomic_read(&self->homa.total_incoming)); } -TEST_F(homa_grant, homa_grant_free_rpc__not_in_active_list) +TEST_F(homa_grant, homa_grant_end_rpc__not_in_active_list) { struct homa_rpc *rpc1, *rpc2, *rpc3; @@ -1160,7 +1159,6 @@ TEST_F(homa_grant, homa_grant_free_rpc__not_in_active_list) homa_grant_recalc(&self->homa); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); - EXPECT_EQ(-1, atomic_read(&rpc3->msgin.rank)); EXPECT_EQ(20000, atomic_read(&self->homa.total_incoming)); EXPECT_EQ(0, atomic_read(&rpc3->msgin.rec_incoming)); EXPECT_FALSE(list_empty(&rpc3->grantable_links)); From 06826a373bae107a4f4a5b3a6057cea53076a9bd Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 17 Mar 2025 15:59:30 -0700 Subject: [PATCH 218/625] Fix potential races related to homa->active_rpcs These values are now only hints, never dereferenced. 
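For reference, the change simply publishes the rank hint after its
companion data has been initialized; the resulting order in
homa_grant_recalc is sketched below (the explanatory comment is
editorial, not part of the diff):

	atomic_set(&homa->active_remaining[i],
		   rpc->msgin.bytes_remaining);
	/* Publish the hint last, so a reader that validates the rank
	 * against active_rpcs should also find active_remaining[i]
	 * already initialized.
	 */
	atomic_set(&rpc->msgin.rank, i);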
--- homa_grant.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/homa_grant.c b/homa_grant.c index 38508228..98b011ab 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -438,9 +438,9 @@ void homa_grant_recalc(struct homa *homa) active_rpcs[i] = rpc; homa_rpc_hold(rpc); - atomic_set(&rpc->msgin.rank, i); atomic_set(&homa->active_remaining[i], rpc->msgin.bytes_remaining); + atomic_set(&rpc->msgin.rank, i); /* Compute the priority to use for this RPC's grants: * if there aren't enough RPCs to consume all of the From c794405b41219c3f82e2d0fe636e49f52e635372 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Sun, 16 Mar 2025 17:22:39 -0700 Subject: [PATCH 219/625] Implement accounting for bytes in tx skbs --- homa_devel.h | 17 -------------- homa_outgoing.c | 17 +++++++------- homa_rpc.c | 2 ++ homa_rpc.h | 6 +++++ homa_sock.c | 16 +++++++++++++ test/mock.c | 2 +- test/unit_homa_outgoing.c | 49 +++++++++++++++++++++++++++++++++++++++ test/unit_homa_plumbing.c | 1 + test/unit_homa_rpc.c | 28 ++++++++++++++++++++++ 9 files changed, 111 insertions(+), 27 deletions(-) diff --git a/homa_devel.h b/homa_devel.h index 4b74a4eb..3256b3dd 100644 --- a/homa_devel.h +++ b/homa_devel.h @@ -38,23 +38,6 @@ static inline u32 tt_addr(const struct in6_addr x) : ntohl(x.in6_u.u6_addr32[1])); } -/** - * addr_valid() - Determine whether a given address is a valid address - * within kernel memory. - * @addr: Address to check - */ -static inline int addr_valid(void *addr) -{ -#ifdef __UNIT_TEST__ - return 1; -#else -#define HIGH_BITS 0xffff800000000000 - u64 int_addr = (u64) addr; - - return (int_addr & HIGH_BITS) == HIGH_BITS; -#endif /* __UNIT_TEST__ */ -} - static inline void check_addr_valid(void *addr, char *info) { #ifndef __UNIT_TEST__ diff --git a/homa_outgoing.c b/homa_outgoing.c index 11a8b5df..f8f285be 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -23,17 +23,13 @@ */ void homa_message_out_init(struct homa_rpc *rpc, int length) { + memset(&rpc->msgout, 0, sizeof(rpc->msgout)); rpc->msgout.length = length; - rpc->msgout.num_skbs = 0; - rpc->msgout.copied_from_user = 0; - rpc->msgout.packets = NULL; rpc->msgout.next_xmit = &rpc->msgout.packets; - rpc->msgout.next_xmit_offset = 0; #ifndef __STRIP__ /* See strip.py */ rpc->msgout.unscheduled = rpc->hsk->homa->unsched_bytes; if (rpc->msgout.unscheduled > length) rpc->msgout.unscheduled = length; - rpc->msgout.sched_priority = 0; #endif /* See strip.py */ rpc->msgout.init_ns = sched_clock(); } @@ -269,14 +265,14 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) int err; homa_rpc_hold(rpc); - homa_message_out_init(rpc, iter->count); - if (unlikely(rpc->msgout.length > HOMA_MAX_MESSAGE_LENGTH || - rpc->msgout.length == 0)) { + if (unlikely(iter->count > HOMA_MAX_MESSAGE_LENGTH || + iter->count == 0)) { tt_record2("homa_message_out_fill found bad length %d for id %d", - rpc->msgout.length, rpc->id); + iter->count, rpc->id); err = -EINVAL; goto error; } + homa_message_out_init(rpc, iter->count); /* Compute the geometry of packets. 
*/ dst = homa_get_dst(rpc->peer, rpc->hsk); @@ -372,6 +368,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) last_link = &(homa_get_skb_info(skb)->next_skb); *last_link = NULL; rpc->msgout.num_skbs++; + rpc->msgout.skb_memory += skb->truesize; rpc->msgout.copied_from_user = rpc->msgout.length - bytes_left; if (overlap_xmit && list_empty(&rpc->throttled_links) && #ifndef __STRIP__ /* See strip.py */ @@ -386,12 +383,14 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) tt_record2("finished copy from user space for id %d, length %d", rpc->id, rpc->msgout.length); INC_METRIC(sent_msg_bytes, rpc->msgout.length); + refcount_add(rpc->msgout.skb_memory, &rpc->hsk->sock.sk_wmem_alloc); homa_rpc_put(rpc); if (!overlap_xmit && xmit) homa_xmit_data(rpc, false); return 0; error: + refcount_add(rpc->msgout.skb_memory, &rpc->hsk->sock.sk_wmem_alloc); homa_rpc_put(rpc); return err; } diff --git a/homa_rpc.c b/homa_rpc.c index 4f9cbe7c..fb895ed9 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -432,6 +432,8 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) rpcs[num_rpcs] = rpc; num_rpcs++; list_del(&rpc->dead_links); + WARN_ON(refcount_sub_and_test(rpc->msgout.skb_memory, + &hsk->sock.sk_wmem_alloc)); if (num_rpcs >= batch_size) goto release; } diff --git a/homa_rpc.h b/homa_rpc.h index 68e52761..ef3f2679 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -30,6 +30,12 @@ struct homa_message_out { /** @num_skbs: Total number of buffers currently in @packets. */ int num_skbs; + /** + * @skb_memory: Total number of bytes of memory occupied by + * the sk_buffs for this message. + */ + int skb_memory; + /** * @copied_from_user: Number of bytes of the message that have * been copied from user space into skbs in @packets. diff --git a/homa_sock.c b/homa_sock.c index 50af19a2..55f0ba71 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -7,6 +7,11 @@ #include "homa_peer.h" #include "homa_pool.h" +#ifdef __UNIT_TEST__ +#define KSELFTEST_NOT_MAIN 1 +#include "test/kselftest_harness.h" +#endif + /** * homa_socktab_init() - Constructor for homa_socktabs. * @socktab: The object to initialize; previous contents are discarded. 
@@ -219,6 +224,7 @@ void homa_sock_shutdown(struct homa_sock *hsk) { struct homa_interest *interest; struct homa_rpc *rpc; + u64 tx_memory; #ifndef __STRIP__ /* See strip.py */ int i = 0; #endif /* See strip.py */ @@ -280,6 +286,16 @@ void homa_sock_shutdown(struct homa_sock *hsk) homa_rpc_reap(hsk, 1000); #endif /* See strip.py */ + tx_memory = refcount_read(&hsk->sock.sk_wmem_alloc); + if (tx_memory != 0) { + pr_err("homa_sock_shutdown found sk_wmem_alloc %llu bytes, port %d\n", + tx_memory, hsk->port); +#ifdef __UNIT_TEST__ + FAIL(" sk_wmem_alloc %llu after shutdown for port %d", tx_memory, + hsk->port); +#endif + } + if (hsk->buffer_pool) { homa_pool_destroy(hsk->buffer_pool); kfree(hsk->buffer_pool); diff --git a/test/mock.c b/test/mock.c index bc4ec037..ff51a125 100644 --- a/test/mock.c +++ b/test/mock.c @@ -288,7 +288,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, skb->users.refs.counter = 1; skb->_skb_refdst = 0; ip_hdr(skb)->saddr = 0; - skb->truesize = size; + skb->truesize = SKB_TRUESIZE(size); skb->dev = &mock_net_device; return skb; } diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index d436c167..8132c98b 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -47,6 +47,15 @@ void mock_resend_data(struct homa_rpc *rpc, int start, int end, mock_resend_data(rpc, start, end, priority); #endif /* See strip.py */ +/* Compute the expected "truesize" value for a Homa packet, given + * the number of bytes of message data in the packet. + */ +static int true_size(int msg_bytes) +{ + return msg_bytes + SKB_TRUESIZE(SKB_DATA_ALIGN(HOMA_SKB_EXTRA + + HOMA_IPV6_HEADER_LENGTH + sizeof(struct homa_data_hdr))); +} + FIXTURE(homa_outgoing) { struct in6_addr client_ip[1]; int client_port; @@ -366,6 +375,8 @@ TEST_F(homa_outgoing, homa_message_out_fill__message_too_long) unit_iov_iter((void *) 1000, HOMA_MAX_MESSAGE_LENGTH+1), 0)); homa_rpc_unlock(crpc); + EXPECT_EQ(0, crpc->msgout.skb_memory); + EXPECT_EQ(0, refcount_read(&self->hsk.sock.sk_wmem_alloc)); } TEST_F(homa_outgoing, homa_message_out_fill__zero_length_message) { @@ -523,6 +534,24 @@ TEST_F(homa_outgoing, homa_message_out_fill__multiple_segs_per_skbuff) unit_log_get()); EXPECT_EQ(4200, homa_get_skb_info(crpc->msgout.packets)->data_bytes); } +TEST_F(homa_outgoing, homa_message_out_fill__error_in_homa_new_data_packet) +{ + struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + &self->server_addr); + + ASSERT_FALSE(crpc == NULL); + mock_set_ipv6(&self->hsk); + mock_copy_data_errors = 2; + + EXPECT_EQ(EFAULT, -homa_message_out_fill(crpc, + unit_iov_iter((void *) 1000, 3000), 0)); + homa_rpc_unlock(crpc); + EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); + EXPECT_EQ(1, crpc->msgout.num_skbs); + EXPECT_EQ(true_size(1400), crpc->msgout.skb_memory); + EXPECT_EQ(true_size(1400), + refcount_read(&self->hsk.sock.sk_wmem_alloc)); +} TEST_F(homa_outgoing, homa_message_out_fill__rpc_freed_during_copy) { struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, @@ -535,6 +564,8 @@ TEST_F(homa_outgoing, homa_message_out_fill__rpc_freed_during_copy) unit_iov_iter((void *) 1000, 3000), 0)); EXPECT_EQ(0, crpc->msgout.num_skbs); EXPECT_EQ(RPC_DEAD, crpc->state); + EXPECT_EQ(0, crpc->msgout.skb_memory); + EXPECT_EQ(0, refcount_read(&self->hsk.sock.sk_wmem_alloc)); homa_rpc_unlock(crpc); } TEST_F(homa_outgoing, homa_message_out_fill__add_to_throttled) @@ -570,6 +601,24 @@ TEST_F(homa_outgoing, homa_message_out_fill__too_short_for_pipelining) unit_log_throttled(&self->homa); 
EXPECT_STREQ("", unit_log_get()); } +TEST_F(homa_outgoing, homa_message_out_fill__packet_memory_accounting) +{ + struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + &self->server_addr); + + mock_set_ipv6(&self->hsk); + + ASSERT_FALSE(crpc == NULL); + ASSERT_EQ(0, -homa_message_out_fill(crpc, + unit_iov_iter((void *) 1000, 3000), 0)); + homa_rpc_unlock(crpc); + unit_log_clear(); + EXPECT_EQ(3, crpc->msgout.num_skbs); + EXPECT_EQ(2 * true_size(1400) + true_size(200), + crpc->msgout.skb_memory); + EXPECT_EQ(2 * true_size(1400) + true_size(200), + refcount_read(&self->hsk.sock.sk_wmem_alloc)); +} TEST_F(homa_outgoing, homa_xmit_control__server_request) { diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 9c3021ec..a1403876 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -261,6 +261,7 @@ TEST_F(homa_plumbing, homa_socket__success) { struct homa_sock sock; + memset(&sock, 0, sizeof(sock)); EXPECT_EQ(0, homa_socket(&sock.sock)); homa_sock_destroy(&sock); } diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 2e2463be..6eda4715 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -664,6 +664,34 @@ TEST_F(homa_rpc, homa_rpc_reap__hit_limit_in_msgout_packets) EXPECT_STREQ("1234", dead_rpcs(&self->hsk)); EXPECT_EQ(4, self->hsk.dead_skbs); } +TEST_F(homa_rpc, homa_rpc_reap__skb_memory_accounting) +{ + struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 5000, 2000); + struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id+2, 5000, 100); + + ASSERT_NE(NULL, crpc1); + ASSERT_NE(NULL, crpc2); + crpc1->msgout.skb_memory = 2000; + crpc2->msgout.skb_memory = 3000; + homa_rpc_end(crpc1); + homa_rpc_end(crpc2); + unit_log_clear(); + EXPECT_STREQ("1234 1236", dead_rpcs(&self->hsk)); + refcount_set(&self->hsk.sock.sk_wmem_alloc, 5000); + EXPECT_EQ(9, self->hsk.dead_skbs); + unit_log_clear(); + self->homa.reap_limit = 7; + EXPECT_EQ(1, homa_rpc_reap(&self->hsk, false)); + EXPECT_STREQ("reaped 1234", unit_log_get()); + unit_log_clear(); + EXPECT_STREQ("1236", dead_rpcs(&self->hsk)); + EXPECT_EQ(1, self->hsk.dead_skbs); + EXPECT_EQ(3000, refcount_read(&self->hsk.sock.sk_wmem_alloc)); +} TEST_F(homa_rpc, homa_rpc_reap__release_buffers) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, From 4cd1bb91b24cc9c712b48eed892d818c50ed6714 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 18 Mar 2025 09:52:47 -0700 Subject: [PATCH 220/625] Fix race in homa_grant_remove_rpc --- homa_grant.c | 8 ++++++++ test/unit_homa_grant.c | 22 ++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/homa_grant.c b/homa_grant.c index 98b011ab..cde1037f 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -168,6 +168,14 @@ void homa_grant_remove_rpc(struct homa_rpc *rpc) homa_grantable_lock(homa, 0); + /* Must check list again: might have been removed by someone + * else before we got the lock. 
+ */ + if (list_empty(&rpc->grantable_links)) { + homa_grantable_unlock(homa); + return; + } + if (homa->oldest_rpc == rpc) homa->oldest_rpc = NULL; diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index d77e60df..6c571557 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -35,6 +35,17 @@ static void grantable_spinlock_hook(char *id) mock_ns = 1000; } +static struct homa_rpc *hook_rpc; +static void remove_rpc_hook(char *id) +{ + if (strcmp(id, "spin_lock") != 0) + return; + if (hook_rpc != NULL) { + homa_grant_remove_rpc(hook_rpc); + hook_rpc = NULL; + } +} + FIXTURE(homa_grant) { struct in6_addr client_ip[5]; int client_port; @@ -307,6 +318,17 @@ TEST_F(homa_grant, homa_grant_remove_rpc__skip_if_not_linked) homa_grant_remove_rpc(rpc); EXPECT_EQ(0, self->homa.num_grantable_rpcs); } +TEST_F(homa_grant, homa_grant_remove_rpc__race_in_checking_not_linked) +{ + struct homa_rpc *rpc = test_rpc(self, 200, self->server_ip, 20000); + + EXPECT_EQ(1, self->homa.num_grantable_rpcs); + + unit_hook_register(remove_rpc_hook); + hook_rpc = rpc; + homa_grant_remove_rpc(rpc); + EXPECT_EQ(0, self->homa.num_grantable_rpcs); +} TEST_F(homa_grant, homa_grant_remove_rpc__clear_oldest_rpc) { struct homa_rpc *rpc1 = test_rpc(self, 200, self->server_ip, 20000); From dc6f301231e2212de6d7981d721ad2a43aa1416c Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 18 Mar 2025 13:54:43 -0700 Subject: [PATCH 221/625] Change tt_printk to print entries backwards (most recent first) --- timetrace.c | 60 ++++++++++++++++++++++++++++++++---------------- timetrace.h | 2 +- util/ttsyslog.py | 5 ++++ 3 files changed, 46 insertions(+), 21 deletions(-) diff --git a/timetrace.c b/timetrace.c index aa74eca2..8df7b501 100644 --- a/timetrace.c +++ b/timetrace.c @@ -286,8 +286,9 @@ void tt_record_buf(struct tt_buffer *buffer, u64 timestamp, * complete, since there may have been events that were discarded). * @pos: Array with NPOS elements; will be filled in with the oldest * index in the trace for each core. + * Return: Time of oldest log entry that should be printed. */ -void tt_find_oldest(int *pos) +u64 tt_find_oldest(int *pos) { struct tt_buffer *buffer; u64 start_time = 0; @@ -318,6 +319,7 @@ void tt_find_oldest(int *pos) pos[i] = (pos[i] + 1) & (tt_buffer_size - 1); } } + return start_time; } /** @@ -669,20 +671,28 @@ void tt_print_file(char *path) /** * tt_printk() - Print the contents of the timetrace to the system log. * Useful in situations where the system is too unstable to extract a - * timetrace by reading /proc/timetrace. + * timetrace by reading /proc/timetrace. Note: the timetrace is printed + * most recent entry first (in the hopes that if buffer overflows + * disrupt the output, at least the most recent entries will be complete). */ void tt_printk(void) { - /* Index of the next entry to return from each tt_buffer. - * This array is too large to allocate on the stack, and we don't - * want to allocate space dynamically (this function could be - * called at a point where the world is going to hell). So, - * allocate the array statically, and only allow one concurrent - * call to this function. + /* Index of the oldest entry to return from each tt_buffer. This + * array is too large to allocate on the stack, and we don't want to + * allocate space dynamically (this function could be called at a + * point where the world is going to hell). So, allocate the array + * statically and only allow one concurrent call to this function. 
*/ - static int pos[NR_CPUS]; + static int oldest[NR_CPUS]; static atomic_t active; + /* Index of the next entry to consider from each tt_buffer, or -1 if + * the last entry has been processed. + */ + static int pos[NR_CPUS]; + u64 start_time; + int i; + if (atomic_xchg(&active, 1)) { pr_err("concurrent call to %s aborting\n", __func__); return; @@ -690,27 +700,34 @@ void tt_printk(void) if (!init) return; atomic_inc(&tt_freeze_count); - tt_find_oldest(pos); + start_time = tt_find_oldest(oldest); + for (i = 0; i < nr_cpu_ids; i++) { + if (oldest[i] == tt_buffers[i]->next_index) + pos[i] = -1; + else + pos[i] = (tt_buffers[i]->next_index - 1) & + (tt_buffer_size - 1); + } - pr_err("cpu_khz: %u\n", cpu_khz); + pr_err("cpu_khz: %u, start: %llu\n", cpu_khz, start_time); /* Each iteration of this loop printk's one event. */ while (true) { - u64 earliest_time = ~0; + u64 latest_time = 0; struct tt_event *event; int current_core = -1; char msg[200]; - int i; - /* Check all the traces to find the earliest available event. */ + /* Check all the traces to find the latest available event. */ for (i = 0; i < nr_cpu_ids; i++) { struct tt_buffer *buffer = tt_buffers[i]; + if (pos[i] == -1) + continue; event = &buffer->events[pos[i]]; - if (pos[i] != buffer->next_index && - event->timestamp < earliest_time) { + if (event->timestamp >= latest_time) { current_core = i; - earliest_time = event->timestamp; + latest_time = event->timestamp; } } if (current_core < 0) { @@ -718,12 +735,15 @@ void tt_printk(void) break; } event = &(tt_buffers[current_core]->events[pos[current_core]]); - pos[current_core] = (pos[current_core] + 1) - & (tt_buffer_size - 1); + if (pos[current_core] == oldest[current_core]) + pos[current_core] = -1; + else + pos[current_core] = (pos[current_core] - 1) + & (tt_buffer_size - 1); snprintf(msg, sizeof(msg), event->format, event->arg0, event->arg1, event->arg2, event->arg3); - pr_err("%lu [C%02d] %s\n", + pr_notice("%lu [C%02d] %s\n", (unsigned long)event->timestamp, current_core, msg); } diff --git a/timetrace.h b/timetrace.h index d6af807c..8bcb0c58 100644 --- a/timetrace.h +++ b/timetrace.h @@ -107,7 +107,7 @@ void tt_record_buf(struct tt_buffer *buffer, u64 timestamp, void tt_dbg1(char *msg, ...); void tt_dbg2(char *msg, ...); void tt_dbg3(char *msg, ...); -void tt_find_oldest(int *pos); +u64 tt_find_oldest(int *pos); void tt_get_messages(char *buffer, size_t length); void tt_print_file(char *path); void tt_printk(void); diff --git a/util/ttsyslog.py b/util/ttsyslog.py index 4dc97f5c..305ea233 100755 --- a/util/ttsyslog.py +++ b/util/ttsyslog.py @@ -36,6 +36,8 @@ if len(sys.argv) > 1: f = open(sys.argv[1]) +lines = [] + for line in f: # Ignore everything up until the initial line containing the clock speed. if cpu_ghz == None: @@ -44,6 +46,9 @@ cpu_ghz = float(match.group(1))*1e-06 continue + lines.append(line) + +for line in reversed(lines): match = re.match('.* ([0-9.]+) (\[C..\] .+)', line) if not match: continue From 65c2decd1a5f8a6bf18dd148c18b18279945f488 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 19 Mar 2025 09:57:04 -0700 Subject: [PATCH 222/625] Revert "Change tt_printk to print entries backwards (most recent first)" This reverts commit dc6f301231e2212de6d7981d721ad2a43aa1416c. 
--- timetrace.c | 60 ++++++++++++++++-------------------------------- timetrace.h | 2 +- util/ttsyslog.py | 5 ---- 3 files changed, 21 insertions(+), 46 deletions(-) diff --git a/timetrace.c b/timetrace.c index 8df7b501..aa74eca2 100644 --- a/timetrace.c +++ b/timetrace.c @@ -286,9 +286,8 @@ void tt_record_buf(struct tt_buffer *buffer, u64 timestamp, * complete, since there may have been events that were discarded). * @pos: Array with NPOS elements; will be filled in with the oldest * index in the trace for each core. - * Return: Time of oldest log entry that should be printed. */ -u64 tt_find_oldest(int *pos) +void tt_find_oldest(int *pos) { struct tt_buffer *buffer; u64 start_time = 0; @@ -319,7 +318,6 @@ u64 tt_find_oldest(int *pos) pos[i] = (pos[i] + 1) & (tt_buffer_size - 1); } } - return start_time; } /** @@ -671,27 +669,19 @@ void tt_print_file(char *path) /** * tt_printk() - Print the contents of the timetrace to the system log. * Useful in situations where the system is too unstable to extract a - * timetrace by reading /proc/timetrace. Note: the timetrace is printed - * most recent entry first (in the hopes that if buffer overflows - * disrupt the output, at least the most recent entries will be complete). + * timetrace by reading /proc/timetrace. */ void tt_printk(void) { - /* Index of the oldest entry to return from each tt_buffer. This - * array is too large to allocate on the stack, and we don't want to - * allocate space dynamically (this function could be called at a - * point where the world is going to hell). So, allocate the array - * statically and only allow one concurrent call to this function. - */ - static int oldest[NR_CPUS]; - static atomic_t active; - - /* Index of the next entry to consider from each tt_buffer, or -1 if - * the last entry has been processed. + /* Index of the next entry to return from each tt_buffer. + * This array is too large to allocate on the stack, and we don't + * want to allocate space dynamically (this function could be + * called at a point where the world is going to hell). So, + * allocate the array statically, and only allow one concurrent + * call to this function. */ static int pos[NR_CPUS]; - u64 start_time; - int i; + static atomic_t active; if (atomic_xchg(&active, 1)) { pr_err("concurrent call to %s aborting\n", __func__); @@ -700,34 +690,27 @@ void tt_printk(void) if (!init) return; atomic_inc(&tt_freeze_count); - start_time = tt_find_oldest(oldest); - for (i = 0; i < nr_cpu_ids; i++) { - if (oldest[i] == tt_buffers[i]->next_index) - pos[i] = -1; - else - pos[i] = (tt_buffers[i]->next_index - 1) & - (tt_buffer_size - 1); - } + tt_find_oldest(pos); - pr_err("cpu_khz: %u, start: %llu\n", cpu_khz, start_time); + pr_err("cpu_khz: %u\n", cpu_khz); /* Each iteration of this loop printk's one event. */ while (true) { - u64 latest_time = 0; + u64 earliest_time = ~0; struct tt_event *event; int current_core = -1; char msg[200]; + int i; - /* Check all the traces to find the latest available event. */ + /* Check all the traces to find the earliest available event. 
*/ for (i = 0; i < nr_cpu_ids; i++) { struct tt_buffer *buffer = tt_buffers[i]; - if (pos[i] == -1) - continue; event = &buffer->events[pos[i]]; - if (event->timestamp >= latest_time) { + if (pos[i] != buffer->next_index && + event->timestamp < earliest_time) { current_core = i; - latest_time = event->timestamp; + earliest_time = event->timestamp; } } if (current_core < 0) { @@ -735,15 +718,12 @@ void tt_printk(void) break; } event = &(tt_buffers[current_core]->events[pos[current_core]]); - if (pos[current_core] == oldest[current_core]) - pos[current_core] = -1; - else - pos[current_core] = (pos[current_core] - 1) - & (tt_buffer_size - 1); + pos[current_core] = (pos[current_core] + 1) + & (tt_buffer_size - 1); snprintf(msg, sizeof(msg), event->format, event->arg0, event->arg1, event->arg2, event->arg3); - pr_notice("%lu [C%02d] %s\n", + pr_err("%lu [C%02d] %s\n", (unsigned long)event->timestamp, current_core, msg); } diff --git a/timetrace.h b/timetrace.h index 8bcb0c58..d6af807c 100644 --- a/timetrace.h +++ b/timetrace.h @@ -107,7 +107,7 @@ void tt_record_buf(struct tt_buffer *buffer, u64 timestamp, void tt_dbg1(char *msg, ...); void tt_dbg2(char *msg, ...); void tt_dbg3(char *msg, ...); -u64 tt_find_oldest(int *pos); +void tt_find_oldest(int *pos); void tt_get_messages(char *buffer, size_t length); void tt_print_file(char *path); void tt_printk(void); diff --git a/util/ttsyslog.py b/util/ttsyslog.py index 305ea233..4dc97f5c 100755 --- a/util/ttsyslog.py +++ b/util/ttsyslog.py @@ -36,8 +36,6 @@ if len(sys.argv) > 1: f = open(sys.argv[1]) -lines = [] - for line in f: # Ignore everything up until the initial line containing the clock speed. if cpu_ghz == None: @@ -46,9 +44,6 @@ cpu_ghz = float(match.group(1))*1e-06 continue - lines.append(line) - -for line in reversed(lines): match = re.match('.* ([0-9.]+) (\[C..\] .+)', line) if not match: continue From 29d0f539fae553cb990d0ade0fbef93e88adb084 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 26 Mar 2025 15:02:19 -0700 Subject: [PATCH 223/625] Block Homa senders if insufficient tx buffer memory * Check wmem availability in homa_sendmsg and homa_poll. * New HOMA_SENDMSG_NONBLOCKING flag for sendmsg. * New wmem_max sysctl value --- homa.h | 5 ++- homa_impl.h | 6 +++ homa_plumbing.c | 52 +++++++++++++++++----- homa_rpc.c | 1 + homa_sock.c | 37 +++++++++++++++- homa_sock.h | 29 ++++++++++++ homa_utils.c | 1 + man/homa.7 | 5 +++ test/mock.c | 30 ++++++++++++- test/unit_homa_outgoing.c | 8 ++-- test/unit_homa_plumbing.c | 24 +++++++--- test/unit_homa_rpc.c | 16 ++++++- test/unit_homa_sock.c | 75 +++++++++++++++++++++++++++++++ util/homa_test.cc | 92 +++++++++++++++++++++++++++++++++++++++ 14 files changed, 352 insertions(+), 29 deletions(-) diff --git a/homa.h b/homa.h index 5b170242..40a084cc 100644 --- a/homa.h +++ b/homa.h @@ -82,8 +82,9 @@ _Static_assert(sizeof(struct homa_sendmsg_args) <= 24, /* Flag bits for homa_sendmsg_args.flags (see man page for documentation): */ -#define HOMA_SENDMSG_PRIVATE 0x01 -#define HOMA_SENDMSG_VALID_FLAGS 0x01 +#define HOMA_SENDMSG_PRIVATE 0x01 +#define HOMA_SENDMSG_NONBLOCKING 0x02 +#define HOMA_SENDMSG_VALID_FLAGS 0x03 /** * struct homa_recvmsg_args - Provides information needed by Homa's diff --git a/homa_impl.h b/homa_impl.h index 34c8520f..dfad041d 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -595,6 +595,12 @@ struct homa { */ int gso_force_software; + /** + * @wmem_max: Limit on the value of sk_sndbuf for any socket. Set + * externally via sysctl. 
+ */ + int wmem_max; + #ifndef __STRIP__ /* See strip.py */ /** * @hijack_tcp: Non-zero means encapsulate outgoing Homa packets diff --git a/homa_plumbing.c b/homa_plumbing.c index 17f38695..caa7fb55 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -494,6 +494,13 @@ static struct ctl_table homa_ctl_table[] = { .mode = 0644, .proc_handler = homa_dointvec }, + { + .procname = "wmem_max", + .data = &homa_data.wmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_dointvec + }, #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) {} #endif @@ -831,22 +838,18 @@ int homa_ioctl(struct sock *sk, int cmd, int *karg) int result; u64 start = sched_clock(); - switch (cmd) { - case HOMAIOCABORT: + if (cmd == HOMAIOCABORT) { result = homa_ioc_abort(sk, karg); INC_METRIC(abort_calls, 1); INC_METRIC(abort_ns, sched_clock() - start); - break; - case HOMAIOCFREEZE: + } else if (cmd == HOMAIOCFREEZE) { tt_record1("Freezing timetrace because of HOMAIOCFREEZE ioctl, pid %d", current->pid); tt_freeze(); result = 0; - break; - default: + } else { pr_notice("Unknown Homa ioctl: %d\n", cmd); result = -EINVAL; - break; } return result; #else /* See strip.py */ @@ -1035,6 +1038,15 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) goto error; } + if (!homa_sock_wmem_avl(hsk)) { + result = homa_sock_wait_wmem(hsk, + msg->msg_flags & MSG_DONTWAIT || + args.flags & + HOMA_SENDMSG_NONBLOCKING); + if (result != 0) + goto error; + } + if (addr->sa.sa_family != sk->sk_family) { result = -EAFNOSUPPORT; goto error; @@ -1290,11 +1302,19 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, } homa_rpc_unlock(rpc); /* Locked by homa_wait_for_message. */ + if (test_bit(SOCK_NOSPACE, &hsk->sock.sk_socket->flags)) { + /* There are tasks waiting for tx memory, so reap + * immediately. + */ + homa_rpc_reap(hsk, true); + } + done: if (unlikely(copy_to_user((__force void __user *)msg->msg_control, &control, sizeof(control)))) { /* Note: in this case the message's buffers will be leaked. 
 */
-		pr_notice("%s couldn't copy back args\n", __func__);
+		pr_notice("%s couldn't copy back args to 0x%px\n",
+			  __func__, msg->msg_control);
 		result = -EFAULT;
 	}
 
@@ -1607,17 +1627,25 @@ int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt,
 __poll_t homa_poll(struct file *file, struct socket *sock,
 		   struct poll_table_struct *wait)
 {
-	struct sock *sk = sock->sk;
+	struct homa_sock *hsk = homa_sk(sock->sk);
 	u32 mask;
 
+	mask = 0;
 	sock_poll_wait(file, sock, wait);
-	mask = POLLOUT | POLLWRNORM;
+	tt_record2("homa_poll found sk_wmem_alloc %d, sk_sndbuf %d",
+		   refcount_read(&hsk->sock.sk_wmem_alloc),
+		   hsk->sock.sk_sndbuf);
+	if (homa_sock_wmem_avl(hsk))
+		mask |= POLLOUT | POLLWRNORM;
+	else
+		set_bit(SOCK_NOSPACE, &hsk->sock.sk_socket->flags);
 
-	if (homa_sk(sk)->shutdown)
+	if (hsk->shutdown)
 		mask |= POLLIN;
 
-	if (!list_empty(&homa_sk(sk)->ready_rpcs))
+	if (!list_empty(&hsk->ready_rpcs))
 		mask |= POLLIN | POLLRDNORM;
+	tt_record1("homa_poll returning mask 0x%x", mask);
 	return (__poll_t)mask;
 }
diff --git a/homa_rpc.c b/homa_rpc.c
index fb895ed9..b6f68dd3 100644
--- a/homa_rpc.c
+++ b/homa_rpc.c
@@ -474,6 +474,7 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all)
 			rpc->state = 0;
 			kfree(rpc);
 		}
+		homa_sock_wakeup_wmem(hsk);
 		tt_record4("reaped %d skbs, %d rpcs; %d skbs remain for port %d",
 			   num_skbs + rx_frees, num_rpcs, hsk->dead_skbs,
 			   hsk->port);
diff --git a/homa_sock.c b/homa_sock.c
index 55f0ba71..aad33200 100644
--- a/homa_sock.c
+++ b/homa_sock.c
@@ -140,6 +140,10 @@ int homa_sock_init(struct homa_sock *hsk, struct homa *homa)
 	int result = 0;
 	int i;
 
+	/* Initialize fields outside the Homa part. */
+	hsk->sock.sk_sndbuf = homa->wmem_max;
+
+	/* Initialize Homa-specific fields. */
 	spin_lock_bh(&socktab->write_lock);
 	atomic_set(&hsk->protect_count, 0);
 	spin_lock_init(&hsk->lock);
@@ -287,7 +291,7 @@ void homa_sock_shutdown(struct homa_sock *hsk)
 #endif /* See strip.py */
 
 	tx_memory = refcount_read(&hsk->sock.sk_wmem_alloc);
-	if (tx_memory != 0) {
+	if (tx_memory != 1) {
 		pr_err("homa_sock_shutdown found sk_wmem_alloc %llu bytes, port %d\n",
 		       tx_memory, hsk->port);
 #ifdef __UNIT_TEST__
@@ -437,3 +441,34 @@ void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, u64 id)
 	}
 }
 #endif /* See strip.py */
+
+/**
+ * homa_sock_wait_wmem() - Block the thread until @hsk's usage of tx
+ * packet memory drops below the socket's limit.
+ * @hsk: Socket of interest.
+ * @nonblocking: If there's not enough memory, return -EWOULDBLOCK instead
+ * of blocking.
+ * Return: 0 for success, otherwise a negative errno. 
+ */
+int homa_sock_wait_wmem(struct homa_sock *hsk, int nonblocking)
+{
+	long timeo = hsk->sock.sk_sndtimeo;
+	int result;
+
+	if (nonblocking)
+		timeo = 0;
+	set_bit(SOCK_NOSPACE, &hsk->sock.sk_socket->flags);
+	tt_record2("homa_sock_wait_wmem waiting on port %d, wmem %d",
+		   hsk->port, refcount_read(&hsk->sock.sk_wmem_alloc));
+	result = wait_event_interruptible_timeout(*sk_sleep(&hsk->sock),
+						  homa_sock_wmem_avl(hsk) || hsk->shutdown,
+						  timeo);
+	tt_record4("homa_sock_wait_wmem woke up on port %d with result %d, wmem %d, signal pending %d",
+		   hsk->port, result, refcount_read(&hsk->sock.sk_wmem_alloc),
+		   signal_pending(current));
+	if (signal_pending(current))
+		return -EINTR;
+	if (result == 0)
+		return -EWOULDBLOCK;
+	return 0;
+}
diff --git a/homa_sock.h b/homa_sock.h
index 6f3ebc7a..0702bfdc 100644
--- a/homa_sock.h
+++ b/homa_sock.h
@@ -254,6 +254,7 @@ struct homa_sock *homa_sock_find(struct homa_socktab *socktab, __u16 port);
 int homa_sock_init(struct homa_sock *hsk, struct homa *homa);
 void homa_sock_shutdown(struct homa_sock *hsk);
 void homa_sock_unlink(struct homa_sock *hsk);
+int homa_sock_wait_wmem(struct homa_sock *hsk, int nonblocking);
 int homa_socket(struct sock *sk);
 void homa_socktab_destroy(struct homa_socktab *socktab);
 void homa_socktab_end_scan(struct homa_socktab_scan *scan);
@@ -390,4 +391,32 @@ static inline struct homa_sock *homa_sk(const struct sock *sk)
 	return (struct homa_sock *)sk;
 }
 
+/**
+ * homa_sock_wmem_avl() - Returns true if the socket is within its limit
+ * for output memory usage. False means that no new messages should be sent
+ * until memory is freed.
+ * @hsk: Socket of interest.
+ */
+static inline bool homa_sock_wmem_avl(struct homa_sock *hsk)
+{
+	return refcount_read(&hsk->sock.sk_wmem_alloc) < hsk->sock.sk_sndbuf;
+}
+
+/**
+ * homa_sock_wakeup_wmem() - Invoked when tx packet memory has been freed;
+ * if memory usage is below the limit and there are tasks waiting for memory,
+ * wake them up.
+ * @hsk: Socket of interest.
+ */
+static inline void homa_sock_wakeup_wmem(struct homa_sock *hsk)
+{
+	if (test_bit(SOCK_NOSPACE, &hsk->sock.sk_socket->flags) &&
+	    homa_sock_wmem_avl(hsk)) {
+		tt_record2("homa_sock_wakeup_wmem waking up port %d, wmem %d",
+			   hsk->port, refcount_read(&hsk->sock.sk_wmem_alloc));
+		clear_bit(SOCK_NOSPACE, &hsk->sock.sk_socket->flags);
+		wake_up_interruptible_poll(sk_sleep(&hsk->sock), EPOLLOUT);
+	}
+}
+
 #endif /* _HOMA_SOCK_H */
diff --git a/homa_utils.c b/homa_utils.c
index 50655b04..14bdee65 100644
--- a/homa_utils.c
+++ b/homa_utils.c
@@ -131,6 +131,7 @@ int homa_init(struct homa *homa)
 	homa->max_gso_size = 10000;
 #ifndef __STRIP__ /* See strip.py */
 	homa->max_gro_skbs = 20;
+	homa->wmem_max = 100000000;
 	homa->gro_policy = HOMA_GRO_NORMAL;
 	homa->busy_usecs = 100;
 	homa->gro_busy_usecs = 5;
diff --git a/man/homa.7 b/man/homa.7
index a673ed2b..acc4fe96 100644
--- a/man/homa.7
+++ b/man/homa.7
@@ -616,6 +616,11 @@ of these messages will be
 This approach was inspired by the paper "Dynamic Queue Length Thresholds for
 Shared-Memory Packet Switches"; the idea is to maintain unused granting
 capacity equal to the window for each of the current messages.
+.TP
+.IR wmem_max
+Maximum amount of memory that may be used for outgoing packet buffers
+by a single socket at a given time. Output message transmissions will
+block when this limit is reached. 
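+For example, assuming Homa's sysctl parameters appear in their usual
+location under
+.IR /proc/sys/net/homa ,
+the limit could be raised with a command such as
+"sysctl -w net.homa.wmem_max=200000000" (the value shown here is purely
+illustrative).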
.SH /PROC FILES .PP In addition to files for the configuration parameters described above, diff --git a/test/mock.c b/test/mock.c index ff51a125..2a0506cf 100644 --- a/test/mock.c +++ b/test/mock.c @@ -219,6 +219,9 @@ char mock_printk_output [5000]; /* Used instead of HOMA_MIN_DEFAULT_PORT by homa_skb.c. */ __u16 mock_min_default_port = 0x8000; +/* Used as sk_socket for all sockets created by mock_sock_init. */ +struct socket mock_socket; + struct dst_ops mock_dst_ops = {.mtu = mock_get_mtu}; struct netdev_queue mock_net_queue = {.state = 0}; struct net_device mock_net_device = { @@ -758,6 +761,17 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, return 0; } +void device_set_wakeup_capable(struct device *dev, bool capable) +{} + +void device_wakeup_disable(struct device *dev) +{} + +int device_wakeup_enable(struct device *dev) +{ + return 0; +} + int filp_close(struct file *, fl_owner_t id) { return 0; @@ -926,7 +940,7 @@ void lock_sock_nested(struct sock *sk, int subclass) sk->sk_lock.owned = 1; } -ssize_t __modver_version_show(struct module_attribute *a, +ssize_t __modver_version_show(const struct module_attribute *a, struct module_kobject *b, char *c) { return 0; @@ -1152,6 +1166,14 @@ void schedule(void) UNIT_HOOK("schedule"); } +signed long schedule_timeout(signed long timeout) +{ + UNIT_HOOK("schedule_timeout"); + + /* Result is time remaining in timeout. */ + return timeout - 1; +} + int __SCT__might_resched(void) { return 0; @@ -1812,6 +1834,12 @@ int mock_sock_init(struct homa_sock *hsk, struct homa *homa, int port) memset(hsk, 0, sizeof(*hsk)); sk->sk_data_ready = mock_data_ready; sk->sk_family = mock_ipv6 ? AF_INET6 : AF_INET; + sk->sk_socket = &mock_socket; + memset(&mock_socket, 0, sizeof(mock_socket)); + refcount_set(&sk->sk_wmem_alloc, 1); + init_waitqueue_head(&mock_socket.wq.wait); + rcu_assign_pointer(sk->sk_wq, &mock_socket.wq); + sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; if (port != 0 && port >= mock_min_default_port) homa->prev_default_port = port - 1; err = homa_sock_init(hsk, homa); diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 8132c98b..ea5a91be 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -376,7 +376,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__message_too_long) 0)); homa_rpc_unlock(crpc); EXPECT_EQ(0, crpc->msgout.skb_memory); - EXPECT_EQ(0, refcount_read(&self->hsk.sock.sk_wmem_alloc)); + EXPECT_EQ(1, refcount_read(&self->hsk.sock.sk_wmem_alloc)); } TEST_F(homa_outgoing, homa_message_out_fill__zero_length_message) { @@ -549,7 +549,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__error_in_homa_new_data_packet) EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); EXPECT_EQ(1, crpc->msgout.num_skbs); EXPECT_EQ(true_size(1400), crpc->msgout.skb_memory); - EXPECT_EQ(true_size(1400), + EXPECT_EQ(true_size(1400) + 1, refcount_read(&self->hsk.sock.sk_wmem_alloc)); } TEST_F(homa_outgoing, homa_message_out_fill__rpc_freed_during_copy) @@ -565,7 +565,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__rpc_freed_during_copy) EXPECT_EQ(0, crpc->msgout.num_skbs); EXPECT_EQ(RPC_DEAD, crpc->state); EXPECT_EQ(0, crpc->msgout.skb_memory); - EXPECT_EQ(0, refcount_read(&self->hsk.sock.sk_wmem_alloc)); + EXPECT_EQ(1, refcount_read(&self->hsk.sock.sk_wmem_alloc)); homa_rpc_unlock(crpc); } TEST_F(homa_outgoing, homa_message_out_fill__add_to_throttled) @@ -616,7 +616,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__packet_memory_accounting) EXPECT_EQ(3, crpc->msgout.num_skbs); EXPECT_EQ(2 * true_size(1400) 
+ true_size(200), crpc->msgout.skb_memory); - EXPECT_EQ(2 * true_size(1400) + true_size(200), + EXPECT_EQ(2 * true_size(1400) + true_size(200) + 1, refcount_read(&self->hsk.sock.sk_wmem_alloc)); } diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index a1403876..4fe70118 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -259,18 +259,20 @@ TEST_F(homa_plumbing, homa_ioc_abort__nonexistent_rpc) TEST_F(homa_plumbing, homa_socket__success) { - struct homa_sock sock; + struct homa_sock hsk; - memset(&sock, 0, sizeof(sock)); - EXPECT_EQ(0, homa_socket(&sock.sock)); - homa_sock_destroy(&sock); + memset(&hsk, 0, sizeof(hsk)); + refcount_set(&hsk.sock.sk_wmem_alloc, 1); + EXPECT_EQ(0, homa_socket(&hsk.sock)); + homa_sock_destroy(&hsk); } TEST_F(homa_plumbing, homa_socket__homa_sock_init_failure) { - struct homa_sock sock; + struct homa_sock hsk; + refcount_set(&hsk.sock.sk_wmem_alloc, 1); mock_kmalloc_errors = 1; - EXPECT_EQ(ENOMEM, -homa_socket(&sock.sock)); + EXPECT_EQ(ENOMEM, -homa_socket(&hsk.sock)); } TEST_F(homa_plumbing, homa_setsockopt__bad_level) @@ -476,7 +478,7 @@ TEST_F(homa_plumbing, homa_sendmsg__cant_read_args) } TEST_F(homa_plumbing, homa_sendmsg__illegal_flag) { - self->sendmsg_args.flags = 2; + self->sendmsg_args.flags = 4; EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); @@ -1190,6 +1192,14 @@ TEST_F(homa_plumbing, homa_err_handler_v6__protocol_not_supported) kfree_skb(failed); } +TEST_F(homa_plumbing, homa_poll__no_tx_buffer_space) +{ + struct socket sock = {.sk = &self->hsk.sock}; + + self->hsk.sock.sk_sndbuf = 0; + EXPECT_EQ(0, homa_poll(NULL, &sock, NULL)); + EXPECT_EQ(1, test_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags)); +} TEST_F(homa_plumbing, homa_poll__not_readable) { struct socket sock = {.sk = &self->hsk.sock}; diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 6eda4715..cc80561f 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -681,7 +681,7 @@ TEST_F(homa_rpc, homa_rpc_reap__skb_memory_accounting) homa_rpc_end(crpc2); unit_log_clear(); EXPECT_STREQ("1234 1236", dead_rpcs(&self->hsk)); - refcount_set(&self->hsk.sock.sk_wmem_alloc, 5000); + refcount_set(&self->hsk.sock.sk_wmem_alloc, 5001); EXPECT_EQ(9, self->hsk.dead_skbs); unit_log_clear(); self->homa.reap_limit = 7; @@ -690,7 +690,7 @@ TEST_F(homa_rpc, homa_rpc_reap__skb_memory_accounting) unit_log_clear(); EXPECT_STREQ("1236", dead_rpcs(&self->hsk)); EXPECT_EQ(1, self->hsk.dead_skbs); - EXPECT_EQ(3000, refcount_read(&self->hsk.sock.sk_wmem_alloc)); + EXPECT_EQ(3001, refcount_read(&self->hsk.sock.sk_wmem_alloc)); } TEST_F(homa_rpc, homa_rpc_reap__release_buffers) { @@ -727,6 +727,18 @@ TEST_F(homa_rpc, homa_rpc_reap__free_gaps) homa_rpc_reap(&self->hsk, false); // Test framework will complain if memory not freed. 
} +TEST_F(homa_rpc, homa_rpc_reap__call_homa_sock_wakeup_wmem) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + 4000, 98, 1000, 150000); + + ASSERT_NE(NULL, crpc); + homa_rpc_end(crpc); + set_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags); + homa_rpc_reap(&self->hsk, false); + EXPECT_EQ(0, test_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags)); +} TEST_F(homa_rpc, homa_rpc_reap__nothing_to_reap) { EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c index 2e2a267b..5f10c4cb 100644 --- a/test/unit_homa_sock.c +++ b/test/unit_homa_sock.c @@ -12,6 +12,21 @@ #define n(x) htons(x) #define N(x) htonl(x) +struct homa_sock *hook_hsk; +static int hook_count; +static void schedule_hook(char *id) +{ + if (strcmp(id, "schedule_timeout") != 0) + return; + if (hook_count <= 0) + return; + hook_count--; + if (hook_count != 0) + return; + hook_hsk->sock.sk_sndbuf = refcount_read(&hook_hsk->sock.sk_wmem_alloc) + + 100; +} + FIXTURE(homa_sock) { struct homa homa; struct homa_sock hsk; @@ -30,6 +45,7 @@ FIXTURE_SETUP(homa_sock) self->server_ip[0] = unit_get_in_addr("1.2.3.4"); self->server_port = 99; self->client_id = 1234; + unit_log_clear(); } FIXTURE_TEARDOWN(homa_sock) { @@ -366,3 +382,62 @@ TEST_F(homa_sock, homa_sock_lock_slow) homa_sock_unlock(&self->hsk); } #endif /* See strip.py */ + +TEST_F(homa_sock, homa_sock_wait_wmem__no_memory_shortage) +{ + EXPECT_EQ(0, -homa_sock_wait_wmem(&self->hsk, 1)); + EXPECT_EQ(1, test_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags)); +} +TEST_F(homa_sock, homa_sock_wait_wmem__nonblocking) +{ + self->hsk.sock.sk_sndbuf = 0; + EXPECT_EQ(EWOULDBLOCK, -homa_sock_wait_wmem(&self->hsk, 1)); + EXPECT_EQ(1, test_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags)); +} +TEST_F(homa_sock, homa_sock_wait_wmem__thread_blocks_then_wakes) +{ + self->hsk.sock.sk_sndbuf = 0; + self->hsk.sock.sk_sndtimeo = 6; + hook_hsk = &self->hsk; + hook_count = 5; + unit_hook_register(schedule_hook); + + EXPECT_EQ(0, -homa_sock_wait_wmem(&self->hsk, 0)); + EXPECT_EQ(1, test_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags)); +} +TEST_F(homa_sock, homa_sock_wait_wmem__thread_blocks_but_times_out) +{ + self->hsk.sock.sk_sndbuf = 0; + self->hsk.sock.sk_sndtimeo = 4; + hook_hsk = &self->hsk; + hook_count = 5; + unit_hook_register(schedule_hook); + + EXPECT_EQ(EWOULDBLOCK, -homa_sock_wait_wmem(&self->hsk, 0)); +} +TEST_F(homa_sock, homa_sock_wait_wmem__interrupted_by_signal) +{ + self->hsk.sock.sk_sndbuf = 0; + mock_prepare_to_wait_errors = 1; + mock_signal_pending = 1; + + EXPECT_EQ(EINTR, -homa_sock_wait_wmem(&self->hsk, 0)); +} + +TEST_F(homa_sock, homa_sock_wakeup_wmem) +{ + self->hsk.sock.sk_sndbuf = 0; + set_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags); + + /* First call: no memory available. */ + homa_sock_wakeup_wmem(&self->hsk); + EXPECT_EQ(1, test_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags)); + + /* Second call: memory now available. */ + self->hsk.sock.sk_sndbuf = 1000000; + mock_log_wakeups = 1; + unit_log_clear(); + homa_sock_wakeup_wmem(&self->hsk); + EXPECT_EQ(0, test_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags)); + EXPECT_STREQ("wake_up", unit_log_get()); +} diff --git a/util/homa_test.cc b/util/homa_test.cc index 29bd1c2c..429e6332 100644 --- a/util/homa_test.cc +++ b/util/homa_test.cc @@ -863,6 +863,94 @@ void test_udpclose() } } +/* Receive one message every second. 
*/
+void recv_slow(int fd)
+{
+	int status;
+
+	while (1) {
+		sleep(1);
+		recv_args.id = 0;
+		recv_args.flags = 0;
+		recv_hdr.msg_controllen = sizeof(recv_args);
+		status = recvmsg(fd, &recv_hdr, 0);
+		if (status < 0) {
+			printf("Receiver exiting: %s\n", strerror(errno));
+			return;
+		}
+		printf("Received response with %d bytes\n", status);
+	}
+}
+
+/**
+ * test_wmem() - Use two threads, a sender and a receiver, and make the
+ * receiver go so slowly that the sender uses up all available tx packet
+ * memory and blocks.
+ * @fd:      Homa socket.
+ * @dest:    Where to send the request
+ * @request: Request message.
+ */
+void test_wmem(int fd, const sockaddr_in_union *dest, char *request)
+{
+	__u64 id;
+	int status;
+
+	std::thread thread(recv_slow, fd);
+
+	for ( ; count > 0; count--) {
+		status = homa_send(fd, request, length, &dest->sa,
+				sockaddr_size(&dest->sa), &id, 0, 0);
+		if (status < 0) {
+			printf("Error in homa_send: %s\n", strerror(errno));
+			break;
+		}
+		printf("Sent request with %d bytes\n", length);
+	}
+	shutdown(fd, 0);
+	thread.join();
+}
+
+/**
+ * test_wmem_poll() - Use two threads, a sender and a receiver, and make the
+ * receiver go so slowly that the sender uses up all available tx packet
+ * memory and blocks. On the sender, use poll to wait for tx packet memory.
+ * @fd:      Homa socket.
+ * @dest:    Where to send the request
+ * @request: Request message.
+ */
+void test_wmem_poll(int fd, const sockaddr_in_union *dest, char *request)
+{
+	__u64 id;
+	int status;
+	struct pollfd poll_info = {
+		.fd =      fd,
+		.events =  POLLOUT,
+		.revents = 0
+	};
+
+	std::thread thread(recv_slow, fd);
+
+	for ( ; count > 0; count--) {
+		status = poll(&poll_info, 1, -1);
+		if (status > 0) {
+			printf("Poll succeeded with mask 0x%x\n", poll_info.revents);
+		} else {
+			printf("Poll failed: %s\n", strerror(errno));
+			break;
+		}
+		status = homa_send(fd, request, length, &dest->sa,
+				sockaddr_size(&dest->sa), &id, 0,
+				HOMA_SENDMSG_NONBLOCKING);
+		if (status < 0) {
+			printf("Error in homa_send: %s\n", strerror(errno));
+			break;
+		}
+		printf("Sent request with %d bytes\n", length);
+	}
+	shutdown(fd, 0);
+	thread.join();
+}
+
 int main(int argc, char** argv)
 {
 	int fd, status, port, next_arg;
@@ -1023,6 +1111,10 @@ int main(int argc, char** argv)
 		test_tmp(fd, count);
 	} else if (strcmp(argv[next_arg], "udpclose") == 0) {
 		test_udpclose();
+	} else if (strcmp(argv[next_arg], "wmem") == 0) {
+		test_wmem(fd, &dest, buffer);
+	} else if (strcmp(argv[next_arg], "wmem_poll") == 0) {
+		test_wmem_poll(fd, &dest, buffer);
 	} else {
 		printf("Unknown operation '%s'\n", argv[next_arg]);
 		exit(1);

From f0733410f9d2544695c8991b7442568def61fe92 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Thu, 27 Mar 2025 21:01:39 -0700
Subject: [PATCH 224/625] Make Homa a pernet subsystem

Also moved metrics-related functionality from homa_plumbing.c to
homa_metrics.c for better modularity.
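
For context, this commit relies on the kernel's standard pernet pattern:
register_pernet_subsys() arranges for a state block of .size bytes to be
allocated (zeroed) for every network namespace, net_generic() retrieves
that block given the id assigned at registration, and the .init/.exit
callbacks run as namespaces are created and destroyed. A minimal sketch
of the pattern follows; the example_* names are illustrative only, not
part of Homa:

	#include <net/net_namespace.h>
	#include <net/netns/generic.h>

	static unsigned int example_net_id;

	/* Per-namespace state; one instance exists for each struct net. */
	struct example_net {
		int counter;
	};

	static int __net_init example_net_init(struct net *net)
	{
		/* The core has already allocated and zeroed .size bytes
		 * for this namespace; net_generic() locates them using
		 * the id assigned by register_pernet_subsys().
		 */
		struct example_net *en = net_generic(net, example_net_id);

		en->counter = 0;
		return 0;
	}

	static void __net_exit example_net_exit(struct net *net)
	{
		/* Release anything .init acquired; the state block
		 * itself is freed by the core.
		 */
	}

	static struct pernet_operations example_net_ops = {
		.init = example_net_init,
		.exit = example_net_exit,
		.id   = &example_net_id,
		.size = sizeof(struct example_net),
	};

	/* Module init/exit:
	 *   register_pernet_subsys(&example_net_ops);
	 *   unregister_pernet_subsys(&example_net_ops);
	 * .init runs once for each existing and future namespace; .exit
	 * runs when a namespace is torn down or the subsystem unregisters.
	 */

In the diff below, struct homa is the per-net state, homa_net_id holds
the assigned id, and homa_net_init()/homa_net_exit() play the roles of
the callbacks.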
--- homa_impl.h | 104 ++++++---- homa_metrics.c | 126 ++++++++----- homa_metrics.h | 46 ++++- homa_offload.c | 4 +- homa_plumbing.c | 387 ++++++++++++++++++++------------------ homa_utils.c | 4 - test/mock.c | 67 ++++++- test/mock.h | 5 + test/unit_homa_grant.c | 1 + test/unit_homa_incoming.c | 3 +- test/unit_homa_interest.c | 1 + test/unit_homa_metrics.c | 58 +++--- test/unit_homa_offload.c | 3 +- test/unit_homa_outgoing.c | 1 + test/unit_homa_peer.c | 1 + test/unit_homa_plumbing.c | 6 +- test/unit_homa_pool.c | 1 + test/unit_homa_rpc.c | 1 + test/unit_homa_skb.c | 1 + test/unit_homa_sock.c | 4 + test/unit_homa_timer.c | 1 + test/unit_homa_utils.c | 1 + test/unit_timetrace.c | 2 +- timetrace.c | 20 +- timetrace.h | 3 +- 25 files changed, 535 insertions(+), 316 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index dfad041d..4fe19ffe 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -108,10 +109,8 @@ union sockaddr_in_union { }; /** - * struct homa - Overall information about the Homa protocol implementation. - * - * There will typically only exist one of these at a time, except during - * unit tests. + * struct homa - Stores overall information about an implementation of + * the Homa transport. One of these objects exists for each network namespace. */ struct homa { /** @@ -685,37 +684,6 @@ struct homa { */ u32 timer_ticks; -#ifndef __STRIP__ /* See strip.py */ - /** - * @metrics_mutex: Used to synchronize accesses to @metrics_active_opens - * and updates to @metrics. - */ - struct mutex metrics_mutex; - - /* - * @metrics: a human-readable string containing recent values - * for all the Homa performance metrics, as generated by - * homa_append_metric. This string is kmalloc-ed; NULL means - * homa_append_metric has never been called. - */ - char *metrics; - - /** @metrics_capacity: number of bytes available at metrics. */ - size_t metrics_capacity; - - /** - * @metrics_length: current length of the string in metrics, - * not including terminating NULL character. - */ - size_t metrics_length; - - /** - * @metrics_active_opens: number of open struct files that - * currently exist for the metrics file in /proc. - */ - int metrics_active_opens; -#endif /* See strip.py */ - /** * @flags: a collection of bits that can be set using sysctl * to trigger various behaviors. @@ -743,7 +711,36 @@ struct homa { */ int next_id; +#ifndef __STRIP__ /* See strip.py */ + /** + * @sysctl_header: Used to remove sysctl values when this structure + * is destroyed. + */ + struct ctl_table_header *sysctl_header; +#endif /* See strip.py */ + #ifndef __UPSTREAM__ /* See strip.py */ + /** + * @sysctl_action: This value is set by sysctl to invoke one of + * several actions for testing. It is normally zero. + */ + int sysctl_action; + + /** + * @timer_kthread: Thread that runs timer code to detect lost + * packets and crashed peers. + */ + struct task_struct *timer_kthread; + + /** @hrtimer: Used to wakeup @timer_kthread at regular intervals. */ + struct hrtimer hrtimer; + + /** + * @destroyed: True means that this structure is being destroyed + * so everyone should clean up. + */ + bool destroyed; + /** * @temp: the values in this array can be read and written with sysctl. * They have no officially defined purpose, and are available for @@ -948,7 +945,7 @@ void unit_hook(char *id); #define UNIT_HOOK(...) 
#endif /* __UNIT_TEST__ */ -extern struct homa *global_homa; +extern unsigned int homa_net_id; void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, int port, int error); @@ -992,6 +989,8 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, struct iov_iter *iter, int offset, int length, int max_seg_data); +int homa_net_init(struct net *net); +void homa_net_exit(struct net *net); int homa_pacer_main(void *transport); void homa_pacer_stop(struct homa *homa); bool homa_pacer_xmit(struct homa *homa); @@ -1096,5 +1095,38 @@ static inline void homa_check_pacer(struct homa *homa, int softirq) INC_METRIC(pacer_needed_help, 1); } +/** + * homa_from_net() - Return the struct homa associated with a particular + * struct net. + * @net: Get the struct homa for this net namespace. + * Return: see above + */ +static inline struct homa *homa_from_net(struct net *net) +{ + return (struct homa *) net_generic(net, homa_net_id); +} + +/** + * homa_from_sock() - Return the struct homa associated with a particular + * struct sock. + * @sock: Get the struct homa for this socket. + * Return: see above + */ +static inline struct homa *homa_from_sock(struct sock *sock) +{ + return (struct homa *) net_generic(sock_net(sock), homa_net_id); +} + +/** + * homa_from_skb() - Return the struct homa associated with a particular + * sk_buff. + * @skb: Get the struct homa for this packet buffer. + * Return: see above + */ +static inline struct homa *homa_from_skb(struct sk_buff *skb) +{ + return (struct homa *) net_generic(dev_net(skb->dev), homa_net_id); +} + extern struct completion homa_pacer_kthread_done; #endif /* _HOMA_IMPL_H */ diff --git a/homa_metrics.c b/homa_metrics.c index b3ddcf90..cd3eceb0 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -8,75 +8,116 @@ DEFINE_PER_CPU(struct homa_metrics, homa_metrics); +/* Describes file operations implemented for /proc/net/homa_metrics. */ +static const struct proc_ops homa_metrics_ops = { + .proc_open = homa_metrics_open, + .proc_read = homa_metrics_read, + .proc_lseek = homa_metrics_lseek, + .proc_release = homa_metrics_release, +}; + +/* Global information used to export metrics information through a file in + * /proc. + */ +struct homa_metrics_output homa_mout; + /** - * homa_metric_append() - Formats a new metric and appends it to homa->metrics. - * @homa: The new data will appended to the @metrics field of - * this structure. + * homa_metrics_init() - Initialize global information related to metrics. + * Return: 0 for success, otherwise a negative errno. + */ +int homa_metrics_init() +{ + mutex_init(&homa_mout.mutex); + homa_mout.output = NULL; + homa_mout.dir_entry = proc_create("homa_metrics", 0444, + init_net.proc_net, + &homa_metrics_ops); + if (!homa_mout.dir_entry) { + pr_err("couldn't create /proc/net/homa_metrics\n"); + return -ENOMEM; + } + return 0; +} + +/** + * homa_metrics_end() - Called to clean up metrics information when the + * Homa module unloads. + */ +void homa_metrics_end() +{ + if (homa_mout.dir_entry) + proc_remove(homa_mout.dir_entry); + homa_mout.dir_entry = NULL; + kfree(homa_mout.output); + homa_mout.output = NULL; +} + +/** + * homa_metric_append() - Formats a new metric and appends it to + * homa_mout.output. * @format: Standard printf-style format string describing the * new metric. Arguments after this provide the usual * values expected for printf-like functions. */ -void homa_metric_append(struct homa *homa, const char *format, ...) 
+void homa_metric_append(const char *format, ...) { char *new_buffer; size_t new_chars; va_list ap; - if (!homa->metrics) { + if (!homa_mout.output) { #ifdef __UNIT_TEST__ - homa->metrics_capacity = 30; + homa_mout.capacity = 30; #else - homa->metrics_capacity = 4096; + homa_mout.capacity = 4096; #endif - homa->metrics = kmalloc(homa->metrics_capacity, GFP_KERNEL); - if (!homa->metrics) { + homa_mout.output = kmalloc(homa_mout.capacity, GFP_KERNEL); + if (!homa_mout.output) { pr_warn("%s couldn't allocate memory\n", __func__); return; } - homa->metrics_length = 0; + homa_mout.length = 0; } /* May have to execute this loop multiple times if we run out - * of space in homa->metrics; each iteration expands the storage, + * of space in homa_mout.output; each iteration expands the storage, * until eventually it is large enough. */ while (true) { va_start(ap, format); - new_chars = vsnprintf(homa->metrics + homa->metrics_length, - homa->metrics_capacity - - homa->metrics_length, format, ap); + new_chars = vsnprintf(homa_mout.output + homa_mout.length, + homa_mout.capacity - + homa_mout.length, format, ap); va_end(ap); - if ((homa->metrics_length + new_chars) < homa->metrics_capacity) + if ((homa_mout.length + new_chars) < homa_mout.capacity) break; /* Not enough room; expand buffer capacity. */ - homa->metrics_capacity *= 2; - new_buffer = kmalloc(homa->metrics_capacity, GFP_KERNEL); + homa_mout.capacity *= 2; + new_buffer = kmalloc(homa_mout.capacity, GFP_KERNEL); if (!new_buffer) { pr_warn("%s couldn't allocate memory\n", __func__); return; } - memcpy(new_buffer, homa->metrics, homa->metrics_length); - kfree(homa->metrics); - homa->metrics = new_buffer; + memcpy(new_buffer, homa_mout.output, homa_mout.length); + kfree(homa_mout.output); + homa_mout.output = new_buffer; } - homa->metrics_length += new_chars; + homa_mout.length += new_chars; } /** * homa_metrics_print() - Sample all of the Homa performance metrics and * generate a human-readable string describing all of them. - * @homa: Overall data about the Homa protocol implementation; - * the formatted string will be stored in homa->metrics. * * Return: The formatted string. */ -char *homa_metrics_print(struct homa *homa) +char *homa_metrics_print(void) { int core, i, lower = 0; - homa->metrics_length = 0; -#define M(...) homa_metric_append(homa, __VA_ARGS__) + homa_mout.length = 0; +#define M(...) homa_metric_append(__VA_ARGS__) M("time_ns %20llu sched_clock() time when metrics were gathered\n", sched_clock()); for (core = 0; core < nr_cpu_ids; core++) { @@ -337,7 +378,7 @@ char *homa_metrics_print(struct homa *homa) i, m->temp[i]); } - return homa->metrics; + return homa_mout.output; } /** @@ -350,8 +391,6 @@ char *homa_metrics_print(struct homa *homa) */ int homa_metrics_open(struct inode *inode, struct file *file) { - struct homa *homa = global_homa; - /* Collect all of the metrics when the file is opened, and save * these for use by subsequent reads (don't want the metrics to * change between reads). If there are concurrent opens on the @@ -359,17 +398,17 @@ int homa_metrics_open(struct inode *inode, struct file *file) * use this copy for subsequent opens, until the file has been * completely closed. 
*/
-	mutex_lock(&homa->metrics_mutex);
-	if (homa->metrics_active_opens == 0)
-		homa_metrics_print(homa);
-	homa->metrics_active_opens++;
-	mutex_unlock(&homa->metrics_mutex);
+	mutex_lock(&homa_mout.mutex);
+	if (homa_mout.active_opens == 0)
+		homa_metrics_print();
+	homa_mout.active_opens++;
+	mutex_unlock(&homa_mout.mutex);
 	return 0;
 }
 
 /**
  * homa_metrics_read() - This function is invoked to handle read kernel calls on
  * /proc/net/homa_metrics.
  * @file: Information about the file being read.
  * @buffer: Address in user space of the buffer in which data from the file
  * should be returned.
@@ -382,15 +421,14 @@ int homa_metrics_open(struct inode *inode, struct file *file)
 ssize_t homa_metrics_read(struct file *file, char __user *buffer,
 			  size_t length, loff_t *offset)
 {
-	struct homa *homa = global_homa;
 	size_t copied;
 
-	if (*offset >= homa->metrics_length)
+	if (*offset >= homa_mout.length)
 		return 0;
-	copied = homa->metrics_length - *offset;
+	copied = homa_mout.length - *offset;
 	if (copied > length)
 		copied = length;
-	if (copy_to_user(buffer, homa->metrics + *offset, copied))
+	if (copy_to_user(buffer, homa_mout.output + *offset, copied))
 		return -EFAULT;
 	*offset += copied;
 	return copied;
@@ -398,7 +436,7 @@ ssize_t homa_metrics_read(struct file *file, char __user *buffer,
 
 /**
  * homa_metrics_lseek() - This function is invoked to handle seeks on
  * /proc/net/homa_metrics. Right now seeks are ignored: the file must be
  * read sequentially.
  * @file: Information about the file being read.
  * @offset: Distance to seek, in bytes
@@ -420,10 +458,8 @@ loff_t homa_metrics_lseek(struct file *file, loff_t offset, int whence)
  */
 int homa_metrics_release(struct inode *inode, struct file *file)
 {
-	struct homa *homa = global_homa;
-
-	mutex_lock(&homa->metrics_mutex);
-	homa->metrics_active_opens--;
-	mutex_unlock(&homa->metrics_mutex);
+	mutex_lock(&homa_mout.mutex);
+	homa_mout.active_opens--;
+	mutex_unlock(&homa_mout.mutex);
 	return 0;
 }
diff --git a/homa_metrics.h b/homa_metrics.h
index ee60103f..95c92ca2 100644
--- a/homa_metrics.h
+++ b/homa_metrics.h
@@ -657,6 +657,44 @@ struct homa_metrics {
 
 DECLARE_PER_CPU(struct homa_metrics, homa_metrics);
 
+/**
+ * struct homa_metrics_output - Holds global information used to export metrics
+ * information through a file in /proc.
+ */
+struct homa_metrics_output {
+	/**
+	 * @mutex: Used to synchronize accesses to @active_opens
+	 * and updates to @output.
+	 */
+	struct mutex mutex;
+
+	/**
+	 * @output: a human-readable string containing recent values
+	 * for all the Homa performance metrics, as generated by
+	 * homa_metric_append. This string is kmalloc-ed; NULL means
+	 * homa_metric_append has never been called.
+	 */
+	char *output;
+
+	/** @capacity: number of bytes available at @output. */
+	size_t capacity;
+
+	/**
+	 * @length: current length of the string in @output, not including
+	 * terminating NULL character.
+	 */
+	size_t length;
+
+	/**
+	 * @active_opens: number of open struct files that currently exist
+	 * for the metrics file in /proc.
+	 */
+	int active_opens;
+
+	/* Used to remove /proc/net/homa_metrics when the module is unloaded. */
+	struct proc_dir_entry *dir_entry;
+};
+
 /**
  * homa_metrics_per_cpu() - Return the metrics structure for the current core.
  * This is unsynchronized and doesn't guarantee non-preemption.
@@ -675,11 +713,15 @@ static inline struct homa_metrics *homa_metrics_per_cpu(void) #define INC_METRIC(metric, count) per_cpu(homa_metrics, \ raw_smp_processor_id()).metric += (count) -void homa_metric_append(struct homa *homa, const char *format, ...); +extern struct homa_metrics_output homa_mout; + +void homa_metric_append(const char *format, ...); +void homa_metrics_end(void); +int homa_metrics_init(void); loff_t homa_metrics_lseek(struct file *file, loff_t offset, int whence); int homa_metrics_open(struct inode *inode, struct file *file); -char *homa_metrics_print(struct homa *homa); +char *homa_metrics_print(void); ssize_t homa_metrics_read(struct file *file, char __user *buffer, size_t length, loff_t *offset); int homa_metrics_release(struct inode *inode, struct file *file); diff --git a/homa_offload.c b/homa_offload.c index 56947685..5355512a 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -290,7 +290,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, */ u64 saved_softirq_metric, softirq_ns; struct homa_offload_core *offload_core; - struct homa *homa = global_homa; + struct homa *homa = homa_from_skb(skb); struct sk_buff *result = NULL; struct homa_data_hdr *h_new; u64 *softirq_ns_metric; @@ -592,7 +592,7 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset) { struct homa_data_hdr *h = (struct homa_data_hdr *)skb_transport_header(skb); - struct homa *homa = global_homa; + struct homa *homa = homa_from_skb(skb); // tt_record4("homa_gro_complete type %d, id %d, offset %d, count %d", // h->common.type, homa_local_id(h->common.sender_id), diff --git a/homa_plumbing.c b/homa_plumbing.c index caa7fb55..1c36d01a 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -16,33 +16,18 @@ static long sysctl_homa_mem[3] __read_mostly; static int sysctl_homa_rmem_min __read_mostly; static int sysctl_homa_wmem_min __read_mostly; -/* Global data for Homa. Never reference homa_data directly. Always use - * the global_homa variable instead; this allows overriding during unit tests. - */ -static struct homa homa_data; - -/* This variable contains the address of the statically-allocated struct homa - * used throughout Homa. This variable should almost never be used directly: - * it should be passed as a parameter to functions that need it. This - * variable is used only by functions called from Linux (so they can't pass - * in a pointer). - */ -struct homa *global_homa = &homa_data; - -/* True means that the Homa module is in the process of unloading itself, - * so everyone should clean up. - */ -static bool exiting; - -/* Thread that runs timer code to detect lost packets and crashed peers. */ -static struct task_struct *timer_kthread; +/* Identifier for retrieving Homa-specific data for a struct net. */ +unsigned int homa_net_id; -#ifndef __STRIP__ /* See strip.py */ -/* Set via sysctl to request that a particular action be taken. The value - * written determines the action. +/* This structure defines functions that allow Homa to be used as a + * pernet subsystem. */ -static int action; -#endif /* See strip.py */ +static struct pernet_operations homa_net_ops = { + .init = homa_net_init, + .exit = homa_net_exit, + .id = &homa_net_id, + .size = sizeof(struct homa) +}; /* This structure defines functions that handle various operations on * Homa sockets. 
These functions are relatively generic: they are called @@ -180,74 +165,67 @@ static struct inet6_protocol homav6_protocol = { }; #ifndef __STRIP__ /* See strip.py */ -/* Describes file operations implemented for /proc/net/homa_metrics. */ -static const struct proc_ops homa_metrics_pops = { - .proc_open = homa_metrics_open, - .proc_read = homa_metrics_read, - .proc_lseek = homa_metrics_lseek, - .proc_release = homa_metrics_release, -}; - -/* Used to remove /proc/net/homa_metrics when the module is unloaded. */ -static struct proc_dir_entry *metrics_dir_entry; - -/* Used to configure sysctl access to Homa configuration parameters.*/ +/* Used to configure sysctl access to Homa configuration parameters. The + * @data fields are actually offsets within a struct homa; these are converted + * to pointers into a net-specific struct homa later. + */ +#define OFFSET(field) (void *) offsetof(struct homa, field) static struct ctl_table homa_ctl_table[] = { { .procname = "action", - .data = &action, + .data = OFFSET(sysctl_action), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "bpage_lease_usecs", - .data = &homa_data.bpage_lease_usecs, + .data = OFFSET(bpage_lease_usecs), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "busy_usecs", - .data = &homa_data.busy_usecs, + .data = OFFSET(busy_usecs), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "cutoff_version", - .data = &homa_data.cutoff_version, + .data = OFFSET(cutoff_version), .maxlen = sizeof(int), .mode = 0444, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { .procname = "dead_buffs_limit", - .data = &homa_data.dead_buffs_limit, + .data = OFFSET(dead_buffs_limit), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { .procname = "fifo_grant_increment", - .data = &homa_data.fifo_grant_increment, + .data = OFFSET(fifo_grant_increment), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "flags", - .data = &homa_data.flags, + .data = OFFSET(flags), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { .procname = "freeze_type", - .data = &homa_data.freeze_type, + .data = OFFSET(freeze_type), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { .procname = "gen3_softirq_cores", @@ -258,245 +236,245 @@ static struct ctl_table homa_ctl_table[] = { }, { .procname = "grant_fifo_fraction", - .data = &homa_data.grant_fifo_fraction, + .data = OFFSET(grant_fifo_fraction), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "gro_busy_usecs", - .data = &homa_data.gro_busy_usecs, + .data = OFFSET(gro_busy_usecs), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "gro_policy", - .data = &homa_data.gro_policy, + .data = OFFSET(gro_policy), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { .procname = "gso_force_software", - .data = &homa_data.gso_force_software, + .data = OFFSET(gso_force_software), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { .procname = "hijack_tcp", - .data = &homa_data.hijack_tcp, + .data = OFFSET(hijack_tcp), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { .procname = "link_mbps", - .data = 
&homa_data.link_mbps, + .data = OFFSET(link_mbps), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "max_dead_buffs", - .data = &homa_data.max_dead_buffs, + .data = OFFSET(max_dead_buffs), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { .procname = "max_grantable_rpcs", - .data = &homa_data.max_grantable_rpcs, + .data = OFFSET(max_grantable_rpcs), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "max_gro_skbs", - .data = &homa_data.max_gro_skbs, + .data = OFFSET(max_gro_skbs), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "max_gso_size", - .data = &homa_data.max_gso_size, + .data = OFFSET(max_gso_size), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "max_nic_queue_ns", - .data = &homa_data.max_nic_queue_ns, + .data = OFFSET(max_nic_queue_ns), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "max_incoming", - .data = &homa_data.max_incoming, + .data = OFFSET(max_incoming), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "max_overcommit", - .data = &homa_data.max_overcommit, + .data = OFFSET(max_overcommit), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "max_rpcs_per_peer", - .data = &homa_data.max_rpcs_per_peer, + .data = OFFSET(max_rpcs_per_peer), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { .procname = "max_sched_prio", - .data = &homa_data.max_sched_prio, + .data = OFFSET(max_sched_prio), .maxlen = sizeof(int), .mode = 0444, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { .procname = "next_id", - .data = &homa_data.next_id, + .data = OFFSET(next_id), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "num_priorities", - .data = &homa_data.num_priorities, + .data = OFFSET(num_priorities), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "pacer_fifo_fraction", - .data = &homa_data.pacer_fifo_fraction, + .data = OFFSET(pacer_fifo_fraction), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "poll_usecs", - .data = &homa_data.poll_usecs, + .data = OFFSET(poll_usecs), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "priority_map", - .data = &homa_data.priority_map, + .data = OFFSET(priority_map), .maxlen = HOMA_MAX_PRIORITIES * sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { .procname = "reap_limit", - .data = &homa_data.reap_limit, + .data = OFFSET(reap_limit), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { .procname = "request_ack_ticks", - .data = &homa_data.request_ack_ticks, + .data = OFFSET(request_ack_ticks), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { .procname = "resend_interval", - .data = &homa_data.resend_interval, + .data = OFFSET(resend_interval), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { .procname = "resend_ticks", - .data = &homa_data.resend_ticks, + .data = OFFSET(resend_ticks), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { .procname = "skb_page_frees_per_sec", - 
.data = &homa_data.skb_page_frees_per_sec, + .data = OFFSET(skb_page_frees_per_sec), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { .procname = "skb_page_pool_min_kb", - .data = &homa_data.skb_page_pool_min_kb, + .data = OFFSET(skb_page_pool_min_kb), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { .procname = "temp", - .data = homa_data.temp, - .maxlen = sizeof(homa_data.temp), + .data = OFFSET(temp[0]), + .maxlen = sizeof(((struct homa *) 0)->temp), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "throttle_min_bytes", - .data = &homa_data.throttle_min_bytes, + .data = OFFSET(throttle_min_bytes), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { .procname = "timeout_resends", - .data = &homa_data.timeout_resends, + .data = OFFSET(timeout_resends), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { .procname = "timeout_ticks", - .data = &homa_data.timeout_ticks, + .data = OFFSET(timeout_ticks), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = homa_dointvec }, { .procname = "unsched_bytes", - .data = &homa_data.unsched_bytes, + .data = OFFSET(unsched_bytes), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "unsched_cutoffs", - .data = &homa_data.unsched_cutoffs, + .data = OFFSET(unsched_cutoffs), .maxlen = HOMA_MAX_PRIORITIES * sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "verbose", - .data = &homa_data.verbose, + .data = OFFSET(verbose), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "window", - .data = &homa_data.window_param, + .data = OFFSET(window_param), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec }, { .procname = "wmem_max", - .data = &homa_data.wmem_max, + .data = OFFSET(wmem_max), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec @@ -534,11 +512,6 @@ static __u16 header_lengths[] = { }; #endif /* See strip.py */ -#ifndef __STRIP__ /* See strip.py */ -/* Used to remove sysctl values when the module is unloaded. 
*/ -static struct ctl_table_header *homa_ctl_header; -#endif /* See strip.py */ - static DECLARE_COMPLETION(timer_thread_done); /** @@ -547,7 +520,6 @@ static DECLARE_COMPLETION(timer_thread_done); */ int __init homa_load(void) { - struct homa *homa = global_homa; int status; pr_notice("Homa module loading\n"); @@ -601,25 +573,10 @@ int __init homa_load(void) goto add_protocol_v6_err; } - status = homa_init(homa); - if (status) - goto homa_init_err; #ifndef __STRIP__ /* See strip.py */ - metrics_dir_entry = proc_create("homa_metrics", 0444, - init_net.proc_net, &homa_metrics_pops); - if (!metrics_dir_entry) { - pr_err("couldn't create /proc/net/homa_metrics\n"); - status = -ENOMEM; + status = homa_metrics_init(); + if (status != 0) goto metrics_err; - } - - homa_ctl_header = register_net_sysctl(&init_net, "net/homa", - homa_ctl_table); - if (!homa_ctl_header) { - pr_err("couldn't register Homa sysctl parameters\n"); - status = -ENOMEM; - goto sysctl_err; - } status = homa_offload_init(); if (status != 0) { @@ -628,35 +585,29 @@ int __init homa_load(void) } #endif /* See strip.py */ - timer_kthread = kthread_run(homa_timer_main, homa, "homa_timer"); - if (IS_ERR(timer_kthread)) { - status = PTR_ERR(timer_kthread); - pr_err("couldn't create homa pacer thread: error %d\n", - status); - timer_kthread = NULL; - goto timer_err; + status = register_pernet_subsys(&homa_net_ops); + if (status != 0) { + pr_err("Homa got error from register_pernet_subsys: %d\n", + status); + goto net_err; } #ifndef __STRIP__ /* See strip.py */ homa_gro_hook_tcp(); #endif /* See strip.py */ #ifndef __UPSTREAM__ /* See strip.py */ - tt_init("timetrace", homa->temp); + tt_init("timetrace"); #endif /* See strip.py */ return 0; -timer_err: #ifndef __STRIP__ /* See strip.py */ +net_err: homa_offload_end(); offload_err: - unregister_net_sysctl_table(homa_ctl_header); -sysctl_err: - proc_remove(metrics_dir_entry); + homa_metrics_end(); metrics_err: #endif /* See strip.py */ - homa_destroy(homa); -homa_init_err: inet6_del_protocol(&homav6_protocol, IPPROTO_HOMA); add_protocol_v6_err: inet_del_protocol(&homa_protocol, IPPROTO_HOMA); @@ -676,27 +627,19 @@ int __init homa_load(void) */ void __exit homa_unload(void) { - struct homa *homa = global_homa; - pr_notice("Homa module unloading\n"); - exiting = true; + + unregister_pernet_subsys(&homa_net_ops); #ifndef __UPSTREAM__ /* See strip.py */ tt_destroy(); #endif /* See strip.py */ #ifndef __STRIP__ /* See strip.py */ homa_gro_unhook_tcp(); -#endif /* See strip.py */ - if (timer_kthread) - wake_up_process(timer_kthread); - wait_for_completion(&timer_thread_done); -#ifndef __STRIP__ /* See strip.py */ if (homa_offload_end() != 0) pr_err("Homa couldn't stop offloads\n"); - unregister_net_sysctl_table(homa_ctl_header); - proc_remove(metrics_dir_entry); + homa_metrics_end(); #endif /* See strip.py */ - homa_destroy(homa); inet_del_protocol(&homa_protocol, IPPROTO_HOMA); inet_unregister_protosw(&homa_protosw); inet6_del_protocol(&homav6_protocol, IPPROTO_HOMA); @@ -708,6 +651,78 @@ void __exit homa_unload(void) module_init(homa_load); module_exit(homa_unload); +/** + * homa_net_init() - Initialize a new struct homa as a per-net subsystem. + * @net: The net that Homa will be associated with. + * Return: 0 on success, otherwise a negative errno. 
+ */ +int homa_net_init(struct net *net) +{ + struct homa *homa = homa_from_net(net); + int status; + + pr_notice("Homa attaching to net namespace\n"); + + status = homa_init(homa); + if (status) + goto homa_init_err; +#ifndef __STRIP__ /* See strip.py */ + + homa->sysctl_header = register_net_sysctl(net, "net/homa", + homa_ctl_table); + if (!homa->sysctl_header) { + pr_err("couldn't register Homa sysctl parameters\n"); + status = -ENOMEM; + goto sysctl_err; + } +#endif /* See strip.py */ + + homa->timer_kthread = kthread_run(homa_timer_main, homa, "homa_timer"); + if (IS_ERR(homa->timer_kthread)) { + status = PTR_ERR(homa->timer_kthread); + pr_err("couldn't create homa timer thread: error %d\n", + status); + homa->timer_kthread = NULL; + goto timer_err; + } + +#ifndef __UPSTREAM__ /* See strip.py */ + tt_set_temp(homa->temp); +#endif /* See strip.py */ + return 0; + +timer_err: +#ifndef __STRIP__ /* See strip.py */ + unregister_net_sysctl_table(homa->sysctl_header); +sysctl_err: +#endif /* See strip.py */ + homa_destroy(homa); +homa_init_err: + return status; +} + +/** + * homa_net_exit() - Remove Homa from a net. + * @net: The net from which Homa should be removed. + */ +void homa_net_exit(struct net *net) +{ + struct homa *homa = homa_from_net(net); + + pr_notice("Homa detaching from net namespace\n"); + + homa->destroyed = true; + if (homa->timer_kthread) + wake_up_process(homa->timer_kthread); + wait_for_completion(&timer_thread_done); + +#ifndef __STRIP__ /* See strip.py */ + if (homa->sysctl_header) + unregister_net_sysctl_table(homa->sysctl_header); +#endif /* See strip.py */ + homa_destroy(homa); +} + /** * homa_bind() - Implements the bind system call for Homa sockets: associates * a well-known service port with a socket. Unlike other AF_INET6 protocols, @@ -867,7 +882,7 @@ int homa_ioctl(struct sock *sk, int cmd, int *karg) int homa_socket(struct sock *sk) { struct homa_sock *hsk = homa_sk(sk); - struct homa *homa = global_homa; + struct homa *homa = homa_from_sock(sk); int result; result = homa_sock_init(hsk, homa); @@ -1371,7 +1386,7 @@ int homa_softirq(struct sk_buff *skb) { struct sk_buff *packets, *other_pkts, *next; struct sk_buff **prev_link, **other_link; - struct homa *homa = global_homa; + struct homa *homa = homa_from_skb(skb); struct homa_common_hdr *h; int header_offset; #ifndef __STRIP__ /* See strip.py */ @@ -1547,7 +1562,7 @@ int homa_backlog_rcv(struct sock *sk, struct sk_buff *skb) int homa_err_handler_v4(struct sk_buff *skb, u32 info) { const struct icmphdr *icmp = icmp_hdr(skb); - struct homa *homa = global_homa; + struct homa *homa = homa_from_skb(skb); struct in6_addr daddr; int type = icmp->type; int code = icmp->code; @@ -1593,7 +1608,7 @@ int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; - struct homa *homa = global_homa; + struct homa *homa = homa_from_skb(skb); int error = 0; int port = 0; @@ -1670,13 +1685,21 @@ int homa_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) #endif { - struct homa *homa = global_homa; + struct homa *homa = homa_from_net(current->nsproxy->net_ns); + struct ctl_table table_copy; int result; - result = proc_dointvec(table, write, buffer, lenp, ppos); + /* Generate a new ctl_table that refers to a field in the + * net-specific struct homa. 
+ */ + table_copy = *table; + table_copy.data = ((char *) homa) + (uintptr_t) table_copy.data; + + result = proc_dointvec(&table_copy, write, buffer, lenp, ppos); if (write) { - /* Don't worry which particular value changed; update - * all info that is dependent on any sysctl value. + /* Update any information that is dependent on sysctl values + * (don't worry about which value changed, just refresh all + * dependent information). */ homa_incoming_sysctl_changed(homa); homa_outgoing_sysctl_changed(homa); @@ -1685,8 +1708,8 @@ int homa_dointvec(const struct ctl_table *table, int write, * particular value was written (don't want to increment * cutoff_version otherwise). */ - if (table->data == &homa_data.unsched_cutoffs || - table->data == &homa_data.num_priorities) { + if (table_copy.data == &homa->unsched_cutoffs || + table_copy.data == &homa->num_priorities) { homa_prios_changed(homa); } @@ -1695,39 +1718,39 @@ int homa_dointvec(const struct ctl_table *table, int write, homa->next_id = 0; } - /* Handle the special value log_topic by invoking a function + /* Handle the special value "action" by invoking a function * to print information to the log. */ - if (table->data == &action) { - if (action == 2) { + if (table_copy.data == &homa->sysctl_action) { + if (homa->sysctl_action == 2) { homa_rpc_log_active(homa, 0); - } else if (action == 3) { + } else if (homa->sysctl_action == 3) { tt_record("Freezing because of sysctl"); tt_freeze(); - } else if (action == 4) { + } else if (homa->sysctl_action == 4) { homa_log_throttled(homa); - } else if (action == 5) { + } else if (homa->sysctl_action == 5) { tt_printk(); - } else if (action == 6) { + } else if (homa->sysctl_action == 6) { tt_record("Calling homa_rpc_log_active because of action 6"); homa_rpc_log_active_tt(homa, 0); tt_record("Freezing because of action 6"); tt_freeze(); - } else if (action == 7) { + } else if (homa->sysctl_action == 7) { homa_rpc_log_active_tt(homa, 0); tt_record("Freezing cluster because of action 7"); homa_freeze_peers(homa); tt_record("Finished freezing cluster"); tt_freeze(); - } else if (action == 8) { + } else if (homa->sysctl_action == 8) { pr_notice("homa_total_incoming is %d\n", atomic_read(&homa->total_incoming)); - } else if (action == 9) { + } else if (homa->sysctl_action == 9) { tt_print_file("/users/ouster/node.tt"); } else { - homa_rpc_log_active(homa, action); + homa_rpc_log_active(homa, homa->sysctl_action); } - action = 0; + homa->sysctl_action = 0; } } return result; @@ -1822,7 +1845,10 @@ int homa_sysctl_softirq_cores(const struct ctl_table *table, int write, */ enum hrtimer_restart homa_hrtimer(struct hrtimer *timer) { - wake_up_process(timer_kthread); + struct homa *homa; + + homa = container_of(timer, struct homa, hrtimer); + wake_up_process(homa->timer_kthread); return HRTIMER_NORESTART; } @@ -1835,31 +1861,26 @@ enum hrtimer_restart homa_hrtimer(struct hrtimer *timer) int homa_timer_main(void *transport) { struct homa *homa = (struct homa *)transport; - - /* The following variable is static because hrtimer_init will - * complain about a stack-allocated hrtimer if in debug mode. 
- */ - static struct hrtimer hrtimer; ktime_t tick_interval; u64 nsec; - hrtimer_init(&hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer.function = &homa_hrtimer; + hrtimer_init(&homa->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + homa->hrtimer.function = &homa_hrtimer; nsec = 1000000; /* 1 ms */ tick_interval = ns_to_ktime(nsec); while (1) { set_current_state(TASK_UNINTERRUPTIBLE); - if (!exiting) { - hrtimer_start(&hrtimer, tick_interval, + if (!homa->destroyed) { + hrtimer_start(&homa->hrtimer, tick_interval, HRTIMER_MODE_REL); schedule(); } __set_current_state(TASK_RUNNING); - if (exiting) + if (homa->destroyed) break; homa_timer(homa); } - hrtimer_cancel(&hrtimer); + hrtimer_cancel(&homa->hrtimer); kthread_complete_and_exit(&timer_thread_done, 0); return 0; } diff --git a/homa_utils.c b/homa_utils.c index 14bdee65..cfd54b2b 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -135,8 +135,6 @@ int homa_init(struct homa *homa) homa->gro_policy = HOMA_GRO_NORMAL; homa->busy_usecs = 100; homa->gro_busy_usecs = 5; - mutex_init(&homa->metrics_mutex); - homa->metrics = NULL; #endif /* See strip.py */ homa->bpage_lease_usecs = 10000; #ifndef __STRIP__ /* See strip.py */ @@ -174,8 +172,6 @@ void homa_destroy(struct homa *homa) } #ifndef __STRIP__ /* See strip.py */ homa_skb_cleanup(homa); - kfree(homa->metrics); - homa->metrics = NULL; #endif /* See strip.py */ } diff --git a/test/mock.c b/test/mock.c index 2a0506cf..6a3d65ce 100644 --- a/test/mock.c +++ b/test/mock.c @@ -220,14 +220,22 @@ char mock_printk_output [5000]; __u16 mock_min_default_port = 0x8000; /* Used as sk_socket for all sockets created by mock_sock_init. */ -struct socket mock_socket; +static struct socket mock_socket; + +/* Will be used as the struct homa for functions such as homa_from_net + * and homa_from_sock. 
+ */ +static struct homa *mock_homa; +struct net mock_net; struct dst_ops mock_dst_ops = {.mtu = mock_get_mtu}; struct netdev_queue mock_net_queue = {.state = 0}; struct net_device mock_net_device = { .gso_max_segs = 1000, .gso_max_size = 0, - ._tx = &mock_net_queue}; + ._tx = &mock_net_queue, + .nd_net = {.net = &mock_net} + }; const struct net_offload *inet_offloads[MAX_INET_PROTOS]; const struct net_offload *inet6_offloads[MAX_INET_PROTOS]; struct net_offload tcp_offload; @@ -256,6 +264,10 @@ struct net_hotdata net_hotdata = { int debug_locks; struct static_call_key __SCK__might_resched; +#ifdef CONFIG_DEBUG_LOCK_ALLOC +struct lockdep_map rcu_lock_map; +#endif /* CONFIG_DEBUG_LOCK_ALLOC */ + extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) {} @@ -924,6 +936,13 @@ bool __list_del_entry_valid_or_report(struct list_head *entry) void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) {} +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void lock_acquire(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, + struct lockdep_map *nest_lock, unsigned long ip) +{} +#endif + #if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0) void lockdep_rcu_suspicious(const char *file, const int line, const char *s) {} @@ -934,6 +953,11 @@ int lock_is_held_type(const struct lockdep_map *lock, int read) return 0; } +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void lock_release(struct lockdep_map *lock, unsigned long ip) +{} +#endif + void lock_sock_nested(struct sock *sk, int subclass) { mock_active_locks++; @@ -1117,6 +1141,11 @@ int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) return 1; } +bool rcu_is_watching(void) +{ + return true; +} + int rcu_read_lock_any_held(void) { return 1; @@ -1134,6 +1163,12 @@ int rcu_read_lock_bh_held(void) } #endif +void __rcu_read_lock(void) +{} + +void __rcu_read_unlock(void) +{} + bool rcuref_get_slowpath(rcuref_t *ref) { return true; @@ -1141,6 +1176,11 @@ bool rcuref_get_slowpath(rcuref_t *ref) void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t) {} +int register_pernet_subsys(struct pernet_operations *) +{ + return 0; +} + void release_sock(struct sock *sk) { mock_active_locks--; @@ -1360,6 +1400,9 @@ void tasklet_kill(struct tasklet_struct *t) void unregister_net_sysctl_table(struct ctl_table_header *header) {} +void unregister_pernet_subsys(struct pernet_operations *) +{} + void vfree(const void *block) { if (!vmallocs_in_use || unit_hash_get(vmallocs_in_use, block) == NULL) { @@ -1533,6 +1576,13 @@ void mock_get_page(struct page *page) unit_hash_set(pages_in_use, page, (void *) (ref_count+1)); } +void *mock_net_generic(const struct net *net, unsigned int id) +{ + if (id == homa_net_id) + return mock_homa; + return NULL; +} + /** * mock_page_refs() - Returns current reference count for page (0 if no * such page exists). @@ -1635,6 +1685,16 @@ void mock_rpc_put(struct homa_rpc *rpc) atomic_dec(&rpc->refs); } +/** + * mock_set_homa() - Arrange for a particular struct homa to be used in + * tests (e.g., it will be discovered by homa_from_net etc.). + */ +void mock_set_homa(struct homa *homa) +{ + mock_homa = homa; + homa_net_id = 167; +} + /** * mock_set_clock_vals() - Specify one or more clock values to be returned * by the next calls to sched_clock(). The list of arguments must be @@ -1835,6 +1895,7 @@ int mock_sock_init(struct homa_sock *hsk, struct homa *homa, int port) sk->sk_data_ready = mock_data_ready; sk->sk_family = mock_ipv6 ? 
AF_INET6 : AF_INET; sk->sk_socket = &mock_socket; + sk->sk_net.net = &mock_net; memset(&mock_socket, 0, sizeof(mock_socket)); refcount_set(&sk->sk_wmem_alloc, 1); init_waitqueue_head(&mock_socket.wq.wait); @@ -1924,6 +1985,8 @@ void mock_teardown(void) mock_page_nid_mask = 0; mock_printk_output[0] = 0; mock_min_default_port = 0x8000; + mock_homa = NULL; + homa_net_id = 0; mock_net_device.gso_max_size = 0; mock_net_device.gso_max_segs = 1000; memset(inet_offloads, 0, sizeof(inet_offloads)); diff --git a/test/mock.h b/test/mock.h index edd22bfe..05999fe0 100644 --- a/test/mock.h +++ b/test/mock.h @@ -50,6 +50,8 @@ #define kthread_complete_and_exit(...) +#define net_generic(net, id) mock_net_generic(net, id) + #ifdef page_address #undef page_address #endif @@ -132,6 +134,7 @@ extern int mock_max_grants; extern int mock_max_skb_frags; extern __u16 mock_min_default_port; extern int mock_mtu; +extern struct net mock_net; extern struct net_device mock_net_device; extern u64 mock_ns; @@ -165,6 +168,7 @@ unsigned int mock_get_mtu(const struct dst_entry *dst); void mock_get_page(struct page *page); void *mock_kmalloc(size_t size, gfp_t flags); +void *mock_net_generic(const struct net *net, unsigned int id); int mock_page_refs(struct page *page); int mock_page_refs(struct page *page); int mock_page_to_nid(struct page *page); @@ -182,6 +186,7 @@ void mock_rpc_hold(struct homa_rpc *rpc); void mock_rpc_put(struct homa_rpc *rpc); void mock_set_clock_vals(u64 t, ...); void mock_set_core(int num); +void mock_set_homa(struct homa *homa); void mock_set_ipv6(struct homa_sock *hsk); void mock_spin_lock(spinlock_t *lock); void mock_spin_unlock(spinlock_t *lock); diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 6c571557..3ed838f2 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -76,6 +76,7 @@ FIXTURE_SETUP(homa_grant) self->client_id = 1234; self->server_id = 1235; homa_init(&self->homa); + mock_set_homa(&self->homa); self->homa.num_priorities = 1; self->homa.poll_usecs = 0; self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 9da70c63..870e2ced 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -87,6 +87,7 @@ FIXTURE_SETUP(homa_incoming) self->client_id = 1234; self->server_id = 1235; homa_init(&self->homa); + mock_set_homa(&self->homa); #ifndef __STRIP__ /* See strip.py */ self->homa.num_priorities = 1; self->homa.poll_usecs = 0; @@ -819,7 +820,7 @@ TEST_F(homa_incoming, homa_copy_to_user__timetrace_info) unit_log_clear(); mock_copy_to_user_dont_copy = -1; - tt_init(NULL, NULL); + tt_init(NULL); EXPECT_EQ(0, -homa_copy_to_user(crpc)); tt_get_messages(traces, sizeof(traces)); EXPECT_STREQ("starting copy to user space for id 1234; " diff --git a/test/unit_homa_interest.c b/test/unit_homa_interest.c index 0e3d6550..fc15c52f 100644 --- a/test/unit_homa_interest.c +++ b/test/unit_homa_interest.c @@ -53,6 +53,7 @@ FIXTURE(homa_interest) { FIXTURE_SETUP(homa_interest) { homa_init(&self->homa); + mock_set_homa(&self->homa); mock_sock_init(&self->hsk, &self->homa, 0); self->client_ip = unit_get_in_addr("196.168.0.1"); self->client_port = 40000; diff --git a/test/unit_homa_metrics.c b/test/unit_homa_metrics.c index 0c02f42a..d09c364b 100644 --- a/test/unit_homa_metrics.c +++ b/test/unit_homa_metrics.c @@ -13,52 +13,52 @@ FIXTURE(homa_metrics) { FIXTURE_SETUP(homa_metrics) { homa_init(&self->homa); - global_homa = &self->homa; + mock_set_homa(&self->homa); } FIXTURE_TEARDOWN(homa_metrics) { - 
global_homa = NULL; homa_destroy(&self->homa); + homa_metrics_end(); unit_teardown(); } TEST_F(homa_metrics, homa_metric_append) { - self->homa.metrics_length = 0; - homa_metric_append(&self->homa, "x: %d, y: %d", 10, 20); - EXPECT_EQ(12, self->homa.metrics_length); - EXPECT_STREQ("x: 10, y: 20", self->homa.metrics); + homa_mout.length = 0; + homa_metric_append("x: %d, y: %d", 10, 20); + EXPECT_EQ(12, homa_mout.length); + EXPECT_STREQ("x: 10, y: 20", homa_mout.output); - homa_metric_append(&self->homa, ", z: %d", 12345); - EXPECT_EQ(22, self->homa.metrics_length); - EXPECT_STREQ("x: 10, y: 20, z: 12345", self->homa.metrics); - EXPECT_EQ(30, self->homa.metrics_capacity); + homa_metric_append(", z: %d", 12345); + EXPECT_EQ(22, homa_mout.length); + EXPECT_STREQ("x: 10, y: 20, z: 12345", homa_mout.output); + EXPECT_EQ(30, homa_mout.capacity); - homa_metric_append(&self->homa, ", q: %050d", 88); - EXPECT_EQ(77, self->homa.metrics_length); + homa_metric_append(", q: %050d", 88); + EXPECT_EQ(77, homa_mout.length); EXPECT_STREQ("x: 10, y: 20, z: 12345, q: 00000000000000000000000000000000000000000000000088", - self->homa.metrics); - EXPECT_EQ(120, self->homa.metrics_capacity); + homa_mout.output); + EXPECT_EQ(120, homa_mout.capacity); } TEST_F(homa_metrics, homa_metrics_open) { EXPECT_EQ(0, homa_metrics_open(NULL, NULL)); - EXPECT_NE(NULL, self->homa.metrics); + EXPECT_NE(NULL, homa_mout.output); - strcpy(self->homa.metrics, "12345"); + strcpy(homa_mout.output, "12345"); EXPECT_EQ(0, homa_metrics_open(NULL, NULL)); - EXPECT_EQ(5, strlen(self->homa.metrics)); - EXPECT_EQ(2, self->homa.metrics_active_opens); + EXPECT_EQ(5, strlen(homa_mout.output)); + EXPECT_EQ(2, homa_mout.active_opens); } TEST_F(homa_metrics, homa_metrics_read__basics) { loff_t offset = 10; char buffer[1000]; - self->homa.metrics = kmalloc(100, GFP_KERNEL); - self->homa.metrics_capacity = 100; - strcpy(self->homa.metrics, "0123456789abcdefghijklmnop"); - self->homa.metrics_length = 26; + homa_mout.output = kmalloc(100, GFP_KERNEL); + homa_mout.capacity = 100; + strcpy(homa_mout.output, "0123456789abcdefghijklmnop"); + homa_mout.length = 26; EXPECT_EQ(5, homa_metrics_read(NULL, buffer, 5, &offset)); EXPECT_SUBSTR("_copy_to_user copied 5 bytes", unit_log_get()); EXPECT_EQ(15, offset); @@ -78,20 +78,20 @@ TEST_F(homa_metrics, homa_metrics_read__error_copying_to_user) loff_t offset = 10; char buffer[1000]; - self->homa.metrics = kmalloc(100, GFP_KERNEL); - self->homa.metrics_capacity = 100; - strcpy(self->homa.metrics, "0123456789abcdefghijklmnop"); - self->homa.metrics_length = 26; + homa_mout.output = kmalloc(100, GFP_KERNEL); + homa_mout.capacity = 100; + strcpy(homa_mout.output, "0123456789abcdefghijklmnop"); + homa_mout.length = 26; mock_copy_to_user_errors = 1; EXPECT_EQ(EFAULT, -homa_metrics_read(NULL, buffer, 5, &offset)); } TEST_F(homa_metrics, homa_metrics_release) { - self->homa.metrics_active_opens = 2; + homa_mout.active_opens = 2; EXPECT_EQ(0, homa_metrics_release(NULL, NULL)); - EXPECT_EQ(1, self->homa.metrics_active_opens); + EXPECT_EQ(1, homa_mout.active_opens); EXPECT_EQ(0, homa_metrics_release(NULL, NULL)); - EXPECT_EQ(0, self->homa.metrics_active_opens); + EXPECT_EQ(0, homa_mout.active_opens); } diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index b780132e..c661ffd5 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -41,10 +41,10 @@ FIXTURE_SETUP(homa_offload) int i; homa_init(&self->homa); + mock_set_homa(&self->homa); self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; 
 	self->homa.unsched_bytes = 10000;
 	self->homa.window_param = 10000;
-	global_homa = &self->homa;
 	mock_sock_init(&self->hsk, &self->homa, 99);
 	self->ip = unit_get_in_addr("196.168.0.1");
 	memset(&self->header, 0, sizeof(self->header));
@@ -101,7 +101,6 @@ FIXTURE_TEARDOWN(homa_offload)
 	list_for_each_entry_safe(skb, tmp, &self->napi.gro_hash[2].list, list)
 		kfree_skb(skb);
 	homa_destroy(&self->homa);
-	global_homa = NULL;
 	unit_teardown();
 }
diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c
index ea5a91be..2f1dcdf8 100644
--- a/test/unit_homa_outgoing.c
+++ b/test/unit_homa_outgoing.c
@@ -77,6 +77,7 @@ FIXTURE_SETUP(homa_outgoing)
 	self->client_id = 1234;
 	self->server_id = 1235;
 	homa_init(&self->homa);
+	mock_set_homa(&self->homa);
 	mock_ns = 10000;
 	atomic64_set(&self->homa.link_idle_time, 10000);
 	self->homa.ns_per_mbyte = 1000000;
diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c
index 3810052b..c8caa257 100644
--- a/test/unit_homa_peer.c
+++ b/test/unit_homa_peer.c
@@ -46,6 +46,7 @@ FIXTURE(homa_peer) {
 FIXTURE_SETUP(homa_peer)
 {
 	homa_init(&self->homa);
+	mock_set_homa(&self->homa);
 	mock_sock_init(&self->hsk, &self->homa, 0);
 	homa_peertab_init(&self->peertab);
 	self->client_ip[0] = unit_get_in_addr("196.168.0.1");
diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c
index 4fe70118..6cfb8f16 100644
--- a/test/unit_homa_plumbing.c
+++ b/test/unit_homa_plumbing.c
@@ -55,8 +55,8 @@ FIXTURE_SETUP(homa_plumbing)
 	self->client_addr.in6.sin6_port = htons(self->client_port);
 	self->server_addr.in6.sin6_addr = self->server_ip[0];
 	self->server_addr.in6.sin6_port = htons(self->server_port);
-	global_homa = &self->homa;
 	homa_init(&self->homa);
+	mock_set_homa(&self->homa);
 	mock_sock_init(&self->hsk, &self->homa, 0);
 	self->client_addr.in6.sin6_family = self->hsk.inet.sk.sk_family;
 	self->server_addr.in6.sin6_family = self->hsk.inet.sk.sk_family;
@@ -107,7 +107,6 @@ FIXTURE_TEARDOWN(homa_plumbing)
 {
 	homa_destroy(&self->homa);
 	unit_teardown();
-	global_homa = NULL;
 }

 TEST_F(homa_plumbing, homa_load__error_in_inet6_register_protosw)
@@ -262,6 +261,7 @@ TEST_F(homa_plumbing, homa_socket__success)
 	struct homa_sock hsk;

 	memset(&hsk, 0, sizeof(hsk));
+	hsk.sock.sk_net.net = &mock_net;
 	refcount_set(&hsk.sock.sk_wmem_alloc, 1);
 	EXPECT_EQ(0, homa_socket(&hsk.sock));
 	homa_sock_destroy(&hsk);
@@ -270,6 +270,8 @@ TEST_F(homa_plumbing, homa_socket__homa_sock_init_failure)
 {
 	struct homa_sock hsk;

+	memset(&hsk, 0, sizeof(hsk));
+	hsk.sock.sk_net.net = &mock_net;
 	refcount_set(&hsk.sock.sk_wmem_alloc, 1);
 	mock_kmalloc_errors = 1;
 	EXPECT_EQ(ENOMEM, -homa_socket(&hsk.sock));
diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c
index e8658572..954642de 100644
--- a/test/unit_homa_pool.c
+++ b/test/unit_homa_pool.c
@@ -19,6 +19,7 @@ FIXTURE(homa_pool) {
 FIXTURE_SETUP(homa_pool)
 {
 	homa_init(&self->homa);
+	mock_set_homa(&self->homa);
 #ifndef __STRIP__ /* See strip.py */
 	self->homa.unsched_bytes = 10000;
 	self->homa.window_param = 10000;
diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c
index cc80561f..e1b9e046 100644
--- a/test/unit_homa_rpc.c
+++ b/test/unit_homa_rpc.c
@@ -67,6 +67,7 @@ FIXTURE_SETUP(homa_rpc)
 	self->server_addr.in6.sin6_addr = *self->server_ip;
 	self->server_addr.in6.sin6_port = htons(self->server_port);
 	homa_init(&self->homa);
+	mock_set_homa(&self->homa);
 #ifndef __STRIP__ /* See strip.py */
 	self->homa.unsched_bytes = 10000;
 	self->homa.window_param = 10000;
diff --git a/test/unit_homa_skb.c b/test/unit_homa_skb.c
index 932c4882..620f53c7 100644
--- a/test/unit_homa_skb.c
+++ b/test/unit_homa_skb.c
@@ -81,6 +81,7 @@ FIXTURE(homa_skb) {
 FIXTURE_SETUP(homa_skb)
 {
 	homa_init(&self->homa);
+	mock_set_homa(&self->homa);
 	self->skb = alloc_skb_fclone(200, GFP_KERNEL);
 	if (!self->skb)
 		FAIL("unit_homa_skb setup couldn't allocate skb");
diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c
index 5f10c4cb..26db62c1 100644
--- a/test/unit_homa_sock.c
+++ b/test/unit_homa_sock.c
@@ -39,6 +39,7 @@ FIXTURE(homa_sock) {
 FIXTURE_SETUP(homa_sock)
 {
 	homa_init(&self->homa);
+	mock_set_homa(&self->homa);
 	mock_sock_init(&self->hsk, &self->homa, 0);
 	self->client_ip[0] = unit_get_in_addr("196.168.0.1");
 	self->client_port = 40000;
@@ -66,6 +67,7 @@ TEST_F(homa_sock, homa_socktab_start_scan)

 	homa_destroy(&self->homa);
 	homa_init(&self->homa);
+	mock_set_homa(&self->homa);
 	mock_sock_init(&self->hsk, &self->homa, HOMA_MIN_DEFAULT_PORT+100);

 	EXPECT_EQ(&self->hsk, homa_socktab_start_scan(self->homa.port_map,
 			&scan));
@@ -82,6 +84,7 @@ TEST_F(homa_sock, homa_socktab_next)

 	homa_destroy(&self->homa);
 	homa_init(&self->homa);
+	mock_set_homa(&self->homa);
 	mock_sock_init(&hsk1, &self->homa, first_port);
 	mock_sock_init(&hsk2, &self->homa, first_port+HOMA_SOCKTAB_BUCKETS);
 	mock_sock_init(&hsk3, &self->homa, first_port+2*HOMA_SOCKTAB_BUCKETS);
@@ -114,6 +117,7 @@ TEST_F(homa_sock, homa_socktab_end_scan)

 	homa_destroy(&self->homa);
 	homa_init(&self->homa);
+	mock_set_homa(&self->homa);
 	mock_sock_init(&self->hsk, &self->homa, HOMA_MIN_DEFAULT_PORT+100);
 	homa_socktab_start_scan(self->homa.port_map, &scan1);
 	homa_socktab_start_scan(self->homa.port_map, &scan2);
diff --git a/test/unit_homa_timer.c b/test/unit_homa_timer.c
index c8c3409c..8a1d1868 100644
--- a/test/unit_homa_timer.c
+++ b/test/unit_homa_timer.c
@@ -32,6 +32,7 @@ FIXTURE_SETUP(homa_timer)
 	self->server_addr.in6.sin6_addr = *self->server_ip;
 	self->server_addr.in6.sin6_port = htons(self->server_port);
 	homa_init(&self->homa);
+	mock_set_homa(&self->homa);
 	self->homa.flags |= HOMA_FLAG_DONT_THROTTLE;
 	self->homa.resend_ticks = 2;
 	self->homa.timer_ticks = 100;
diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c
index 92ad1a2a..aa748f24 100644
--- a/test/unit_homa_utils.c
+++ b/test/unit_homa_utils.c
@@ -16,6 +16,7 @@ FIXTURE(homa_utils) {
 FIXTURE_SETUP(homa_utils)
 {
 	homa_init(&self->homa);
+	mock_set_homa(&self->homa);
 	unit_log_clear();
 }
 FIXTURE_TEARDOWN(homa_utils)
diff --git a/test/unit_timetrace.c b/test/unit_timetrace.c
index 142d749f..ee7e64d2 100644
--- a/test/unit_timetrace.c
+++ b/test/unit_timetrace.c
@@ -15,7 +15,7 @@ FIXTURE_SETUP(timetrace)
 	self->file.private_data = 0;
 	tt_buffer_size = 64;
 	tt_test_no_khz = true;
-	tt_init("tt", NULL);
+	tt_init("tt");
 	mock_cycles = 1000;
 }
 FIXTURE_TEARDOWN(timetrace)
diff --git a/timetrace.c b/timetrace.c
index aa74eca2..11b8c504 100644
--- a/timetrace.c
+++ b/timetrace.c
@@ -97,14 +97,11 @@ bool tt_test_no_khz;
  * @proc_file:   Name of a file in /proc; this file can be read to extract
  *               the current timetrace. NULL means don't create a /proc file
  *               (such as when running unit tests).
- * @temp:        Pointer to homa's "temp" configuration parameters, which
- *               we should make available to the kernel. NULL means no
- *               such variables available.
  *
 * Return: 0 means success, anything else means an error occurred (a
 * log message will be printed to describe the error).
 */
-int tt_init(char *proc_file, int *temp)
+int tt_init(char *proc_file)
 {
 	int i;

@@ -153,8 +150,6 @@ int tt_init(char *proc_file, int *temp)
 	tt_linux_dbg2 = tt_dbg2;
 	tt_linux_dbg3 = tt_dbg3;
 	memset(tt_debug_int64, 0, sizeof(tt_debug_int64));
-	if (temp)
-		tt_linux_homa_temp = temp;
 #endif

 	return 0;
@@ -167,6 +162,19 @@ int tt_init(char *proc_file, int *temp)
 	return -1;
 }

+/**
+ * tt_set_temp() - Make the "temp" variables from a struct homa available
+ * to the rest of the Linux kernel.
+ * @temp:        Pointer to homa's "temp" configuration parameters, which
+ *               we should make available to the kernel.
+ */
+void tt_set_temp(int *temp)
+{
+#ifdef TT_KERNEL
+	tt_linux_homa_temp = temp;
+#endif
+}
+
 /**
  * tt_destroy(): Disable time tracing and disable the /proc file for
  * reading traces.
diff --git a/timetrace.h b/timetrace.h
index d6af807c..9e95dc9a 100644
--- a/timetrace.h
+++ b/timetrace.h
@@ -96,10 +96,11 @@ struct tt_proc_file {

 void tt_destroy(void);
 void tt_freeze(void);
-int tt_init(char *proc_file, int *temp);
+int tt_init(char *proc_file);
 void tt_record_buf(struct tt_buffer *buffer, u64 timestamp,
 		   const char *format, u32 arg0, u32 arg1, u32 arg2,
 		   u32 arg3);
+void tt_set_temp(int *temp);

 /* Private methods and variables: exposed so they can be accessed
  * by unit tests.

From 60e8fa34c860516b2e8ebf8a573bc9d470bd2a34 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 28 Mar 2025 09:48:48 -0700
Subject: [PATCH 225/625] Remove extraneous code in homa_plumbing.c

---
 homa_plumbing.c | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/homa_plumbing.c b/homa_plumbing.c
index 1c36d01a..e6006566 100644
--- a/homa_plumbing.c
+++ b/homa_plumbing.c
@@ -11,11 +11,6 @@
 #include "homa_peer.h"
 #include "homa_pool.h"

-/* Not yet sure what these variables are for */
-static long sysctl_homa_mem[3] __read_mostly;
-static int sysctl_homa_rmem_min __read_mostly;
-static int sysctl_homa_wmem_min __read_mostly;
-
 /* Identifier for retrieving Homa-specific data for a struct net.
  */
 unsigned int homa_net_id;
@@ -100,9 +95,6 @@ static struct proto homa_prot = {
 	.hash = homa_hash,
 	.unhash = homa_unhash,
 	.get_port = homa_get_port,
-	.sysctl_mem = sysctl_homa_mem,
-	.sysctl_wmem = &sysctl_homa_wmem_min,
-	.sysctl_rmem = &sysctl_homa_rmem_min,
 	.obj_size = sizeof(struct homa_sock),
 	.no_autobind = 1,
 };
@@ -124,10 +116,6 @@ static struct proto homav6_prot = {
 	.hash = homa_hash,
 	.unhash = homa_unhash,
 	.get_port = homa_get_port,
-	.sysctl_mem = sysctl_homa_mem,
-	.sysctl_wmem = &sysctl_homa_wmem_min,
-	.sysctl_rmem = &sysctl_homa_rmem_min,
-
 	.obj_size = sizeof(struct homa_v6_sock),
 	.ipv6_pinfo_offset = offsetof(struct homa_v6_sock, inet6),

From 14a4e0a372ccb7ffc427eb712f209c1e6ab1d0b7 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Sat, 29 Mar 2025 21:05:14 -0700
Subject: [PATCH 226/625] Make unit tests pass with __STRIP__=1

---
 homa_outgoing.c           |  2 +-
 homa_plumbing.c           |  2 +-
 homa_utils.c              |  2 +-
 test/mock.c               |  1 -
 test/unit_homa_incoming.c | 12 ++++++------
 test/unit_homa_interest.c | 28 +++++++++++++++++-----------
 test/unit_homa_outgoing.c |  5 +++--
 test/unit_homa_plumbing.c |  4 ++++
 test/unit_homa_rpc.c      | 14 +++++++++-----
 9 files changed, 42 insertions(+), 28 deletions(-)

diff --git a/homa_outgoing.c b/homa_outgoing.c
index f8f285be..a1c9da49 100644
--- a/homa_outgoing.c
+++ b/homa_outgoing.c
@@ -136,7 +136,7 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc,
 	skb = homa_skb_new_tx(sizeof32(struct homa_data_hdr));
 #else /* See strip.py */
 	skb = homa_skb_new_tx(sizeof32(struct homa_data_hdr) + length +
-			segs * sizeof32(struct homa_seg_hdr));
+			(segs - 1) * sizeof32(struct homa_seg_hdr));
 #endif /* See strip.py */
 	if (!skb)
 		return ERR_PTR(-ENOMEM);
diff --git a/homa_plumbing.c b/homa_plumbing.c
index e6006566..b0135076 100644
--- a/homa_plumbing.c
+++ b/homa_plumbing.c
@@ -589,8 +589,8 @@ int __init homa_load(void)

 	return 0;

-#ifndef __STRIP__ /* See strip.py */
net_err:
+#ifndef __STRIP__ /* See strip.py */
 	homa_offload_end();
offload_err:
 	homa_metrics_end();
diff --git a/homa_utils.c b/homa_utils.c
index cfd54b2b..c1ea05e6 100644
--- a/homa_utils.c
+++ b/homa_utils.c
@@ -125,13 +125,13 @@ int homa_init(struct homa *homa)
 	}
 	homa->pacer_exit = false;
 	homa->max_nic_queue_ns = 5000;
+	homa->wmem_max = 100000000;
 #ifndef __STRIP__ /* See strip.py */
 	homa->verbose = 0;
 #endif /* See strip.py */
 	homa->max_gso_size = 10000;
 #ifndef __STRIP__ /* See strip.py */
 	homa->max_gro_skbs = 20;
-	homa->wmem_max = 100000000;
 	homa->gro_policy = HOMA_GRO_NORMAL;
 	homa->busy_usecs = 100;
 	homa->gro_busy_usecs = 5;
diff --git a/test/mock.c b/test/mock.c
index 6a3d65ce..db2b5c95 100644
--- a/test/mock.c
+++ b/test/mock.c
@@ -287,7 +287,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags,
 	if (!skbs_in_use)
 		skbs_in_use = unit_hash_new();
 	unit_hash_set(skbs_in_use, skb, "used");
-	size = SKB_DATA_ALIGN(size);
 	shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 	skb->head = malloc(size + shinfo_size);
 	memset(skb->head, 0, size + shinfo_size);
diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c
index 870e2ced..f60af88b 100644
--- a/test/unit_homa_incoming.c
+++ b/test/unit_homa_incoming.c
@@ -2225,7 +2225,7 @@ TEST_F(homa_incoming, homa_wait_private__basics)
 	atomic_or(RPC_PRIVATE, &crpc->flags);
 	EXPECT_EQ(0, homa_wait_private(crpc, 0));
 	ASSERT_EQ(RPC_PRIVATE, atomic_read(&crpc->flags));
-	EXPECT_EQ(1, homa_metrics_per_cpu()->fast_wakeups);
+	IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->fast_wakeups));
 }
 TEST_F(homa_incoming, homa_wait_private__rpc_has_error)
 {
@@ -2271,14 +2271,14 @@ TEST_F(homa_incoming, homa_wait_private__signal_notify_race)
 	ASSERT_NE(NULL, crpc);
 	atomic_or(RPC_PRIVATE, &crpc->flags);

-	self->homa.poll_usecs = 0;
+	IF_NO_STRIP(self->homa.poll_usecs = 0);
 	unit_hook_register(handoff_hook);
 	hook_rpc = crpc;
 	hook_count = 1;
 	mock_prepare_to_wait_errors = 1;
 	EXPECT_EQ(ENOENT, -homa_wait_private(crpc, 0));
-	EXPECT_EQ(1, homa_metrics_per_cpu()->slow_wakeups);
+	IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->slow_wakeups));
 	EXPECT_EQ(0, mock_prepare_to_wait_errors);
 }

@@ -2307,7 +2307,7 @@ TEST_F(homa_incoming, homa_wait_shared__rpc_already_ready)
 	ASSERT_FALSE(IS_ERR(rpc));
 	EXPECT_EQ(crpc, rpc);
 	EXPECT_EQ(0, crpc->msgin.packets.qlen);
-	EXPECT_EQ(1, homa_metrics_per_cpu()->fast_wakeups);
+	IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->fast_wakeups));
 	homa_rpc_unlock(rpc);
 }
 TEST_F(homa_incoming, homa_wait_shared__multiple_rpcs_already_ready)
@@ -2355,7 +2355,7 @@ TEST_F(homa_incoming, homa_wait_shared__signal_race_with_handoff)
 	rpc = homa_wait_shared(&self->hsk, 0);
 	EXPECT_EQ(crpc, rpc);
 	EXPECT_EQ(ENOENT, -rpc->error);
-	EXPECT_EQ(1, homa_metrics_per_cpu()->slow_wakeups);
+	IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->slow_wakeups));
 	homa_rpc_unlock(rpc);
 }
 TEST_F(homa_incoming, homa_wait_shared__socket_shutdown_while_blocked)
@@ -2462,7 +2462,7 @@ TEST_F(homa_incoming, homa_rpc_handoff__handoff_to_shared_interest)
 	EXPECT_EQ(crpc, interest2.rpc);
 	homa_rpc_put(crpc);
 	homa_interest_unlink_shared(&interest1);
-	EXPECT_EQ(1, homa_metrics_per_cpu()->handoffs_thread_waiting);
+	IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->handoffs_thread_waiting));
 }
 TEST_F(homa_incoming, homa_rpc_handoff__queue_rpc_on_socket)
 {
diff --git a/test/unit_homa_interest.c b/test/unit_homa_interest.c
index fc15c52f..7b74276a 100644
--- a/test/unit_homa_interest.c
+++ b/test/unit_homa_interest.c
@@ -17,6 +17,7 @@ static int hook_count;
 static struct homa_interest *hook_interest;

+#ifndef __STRIP__ /* See strip.py */
 static void log_hook(char *id)
 {
 	if (strcmp(id, "unlock") == 0 ||
@@ -24,6 +25,7 @@ static void log_hook(char *id)
 		unit_log_printf("; ", "%s", id);
 	}
 }
+#endif /* See strip.py */

 static void notify_hook(char *id)
 {
@@ -143,10 +145,11 @@ TEST_F(homa_interest, homa_interest_wait__already_ready)
 	atomic_set(&interest.ready, 1);
 	EXPECT_EQ(0, homa_interest_wait(&interest, 0));
 	EXPECT_EQ(0, interest.blocked);
-	EXPECT_EQ(1, homa_metrics_per_cpu()->fast_wakeups);
+	IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->fast_wakeups));
 	homa_interest_unlink_shared(&interest);
 }

+#ifndef __STRIP__ /* See strip.py */
 TEST_F(homa_interest, homa_interest_wait__call_schedule)
 {
 	struct homa_interest interest;
@@ -161,9 +164,10 @@ TEST_F(homa_interest, homa_interest_wait__call_schedule)

 	unit_log_clear();
 	EXPECT_EQ(0, homa_interest_wait(&interest, 0));
-	ASSERT_STREQ("schedule; schedule", unit_log_get());
+	EXPECT_STREQ("schedule; schedule", unit_log_get());
 	homa_interest_unlink_shared(&interest);
 }
+#endif /* See strip.py */
 TEST_F(homa_interest, homa_interest_wait__call_homa_rpc_reap)
 {
 	struct homa_interest interest;
@@ -176,7 +180,7 @@ TEST_F(homa_interest, homa_interest_wait__call_homa_rpc_reap)
 	homa_rpc_end(crpc);
 	EXPECT_EQ(15, self->hsk.dead_skbs);
 	homa_interest_init_shared(&interest, &self->hsk);
-	self->homa.poll_usecs = 0;
+	IF_NO_STRIP(self->homa.poll_usecs = 0);

 	EXPECT_EQ(EAGAIN, -homa_interest_wait(&interest, 1));
 	EXPECT_EQ(0, self->hsk.dead_skbs);
@@ -187,7 +191,7 @@ TEST_F(homa_interest, homa_interest_wait__nonblocking)
 	struct homa_interest interest;

 	homa_interest_init_shared(&interest, &self->hsk);
-	self->homa.poll_usecs = 100;
+	IF_NO_STRIP(self->homa.poll_usecs = 100);

 	EXPECT_EQ(EAGAIN, -homa_interest_wait(&interest, 1));
 	EXPECT_EQ(0, interest.blocked);
@@ -198,7 +202,7 @@ TEST_F(homa_interest, homa_interest_wait__poll_then_block)
 	struct homa_interest interest;

 	homa_interest_init_shared(&interest, &self->hsk);
-	self->homa.poll_usecs = 3;
+	IF_NO_STRIP(self->homa.poll_usecs = 3);
 	mock_set_clock_vals(1000, 2000, 3999, 4000, 0);
 	mock_ns = 4000;
 	unit_hook_register(notify_hook);
@@ -206,11 +210,13 @@ TEST_F(homa_interest, homa_interest_wait__poll_then_block)
 	hook_count = 4;

 	EXPECT_EQ(0, -homa_interest_wait(&interest, 0));
+#ifndef __STRIP__ /* See strip.py */
 	EXPECT_EQ(3000, homa_metrics_per_cpu()->poll_ns);
 	EXPECT_EQ(0, homa_metrics_per_cpu()->blocked_ns);
 	EXPECT_EQ(0, homa_metrics_per_cpu()->fast_wakeups);
 	EXPECT_EQ(1, homa_metrics_per_cpu()->slow_wakeups);
 	EXPECT_EQ(1, interest.blocked);
+#endif /* See strip.py */
 	homa_interest_unlink_shared(&interest);
 }
 TEST_F(homa_interest, homa_interest_wait__interrupted_by_signal)
@@ -219,7 +225,7 @@ TEST_F(homa_interest, homa_interest_wait__interrupted_by_signal)

 	homa_interest_init_shared(&interest, &self->hsk);
 	mock_prepare_to_wait_errors = 1;
-	self->homa.poll_usecs = 0;
+	IF_NO_STRIP(self->homa.poll_usecs = 0);

 	EXPECT_EQ(EINTR, -homa_interest_wait(&interest, 0));
 	EXPECT_EQ(1, interest.blocked);
@@ -230,7 +236,7 @@ TEST_F(homa_interest, homa_interest_wait__time_metrics)
 	struct homa_interest interest;

 	homa_interest_init_shared(&interest, &self->hsk);
-	self->homa.poll_usecs = 0;
+	IF_NO_STRIP(self->homa.poll_usecs = 0);
 	mock_set_clock_vals(1000, 1500, 3000, 3200, 0);
 	mock_ns = 4000;
 	unit_hook_register(notify_hook);
@@ -238,8 +244,8 @@ TEST_F(homa_interest, homa_interest_wait__time_metrics)
 	hook_count = 4;

 	EXPECT_EQ(0, -homa_interest_wait(&interest, 0));
-	EXPECT_EQ(700, homa_metrics_per_cpu()->poll_ns);
-	EXPECT_EQ(1500, homa_metrics_per_cpu()->blocked_ns);
+	IF_NO_STRIP(EXPECT_EQ(700, homa_metrics_per_cpu()->poll_ns));
+	IF_NO_STRIP(EXPECT_EQ(1500, homa_metrics_per_cpu()->blocked_ns));

 	homa_interest_unlink_shared(&interest);
 }
@@ -291,7 +297,7 @@ TEST_F(homa_interest, homa_choose_interest__find_idle_core)
 	struct homa_interest *result = homa_choose_interest(&self->hsk);

 	EXPECT_EQ(&interest2, result);
 	EXPECT_EQ(2, result->core);
-	EXPECT_EQ(1, homa_metrics_per_cpu()->handoffs_alt_thread);
+	IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->handoffs_alt_thread));
 	INIT_LIST_HEAD(&self->hsk.interests);
 }
 TEST_F(homa_interest, homa_choose_interest__all_cores_busy)
@@ -313,7 +319,7 @@ TEST_F(homa_interest, homa_choose_interest__all_cores_busy)
 	struct homa_interest *result = homa_choose_interest(&self->hsk);

 	EXPECT_EQ(3, result->core);
-	EXPECT_EQ(0, homa_metrics_per_cpu()->handoffs_alt_thread);
+	IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->handoffs_alt_thread));
 	INIT_LIST_HEAD(&self->hsk.interests);
 }
 #endif /* See strip.py */
diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c
index 2f1dcdf8..9834b450 100644
--- a/test/unit_homa_outgoing.c
+++ b/test/unit_homa_outgoing.c
@@ -52,8 +52,9 @@ void mock_resend_data(struct homa_rpc *rpc, int start, int end,
  */
 static int true_size(int msg_bytes)
 {
-	return msg_bytes + SKB_TRUESIZE(SKB_DATA_ALIGN(HOMA_SKB_EXTRA +
-			HOMA_IPV6_HEADER_LENGTH + sizeof(struct homa_data_hdr)));
+	return SKB_TRUESIZE(msg_bytes + HOMA_SKB_EXTRA +
+			HOMA_IPV6_HEADER_LENGTH + sizeof(struct homa_skb_info) +
+			sizeof(struct homa_data_hdr));
 }

 FIXTURE(homa_outgoing) {
diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c
index 6cfb8f16..b4498a89 100644
--- a/test/unit_homa_plumbing.c
+++ b/test/unit_homa_plumbing.c
@@ -56,6 +56,8 @@ FIXTURE_SETUP(homa_plumbing)
 	self->server_addr.in6.sin6_addr = self->server_ip[0];
 	self->server_addr.in6.sin6_port = htons(self->server_port);
 	homa_init(&self->homa);
+	if (self->homa.wmem_max == 0)
+		printf("homa_plumbing fixture found wmem_max 0\n");
 	mock_set_homa(&self->homa);
 	mock_sock_init(&self->hsk, &self->homa, 0);
 	self->client_addr.in6.sin6_family = self->hsk.inet.sk.sk_family;
@@ -102,6 +104,8 @@ FIXTURE_SETUP(homa_plumbing)
 	self->optval.user = (void *) 0x100000;
 	self->optval.is_kernel = 0;
 	unit_log_clear();
+	if (self->homa.wmem_max == 0)
+		printf("homa_plumbing fixture set wmem_max 0\n");
 }
 FIXTURE_TEARDOWN(homa_plumbing)
 {
diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c
index e1b9e046..95ffe91e 100644
--- a/test/unit_homa_rpc.c
+++ b/test/unit_homa_rpc.c
@@ -613,13 +613,17 @@ TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_locked)
 	homa_rpc_end(crpc2);
 	unit_log_clear();
 	self->homa.reap_limit = 3;
+#ifndef __STRIP__ /* See strip.py */
 	mock_trylock_errors = 2;
+#else /* See strip.py */
+	mock_trylock_errors = 1;
+#endif /* See strip.py */
 	EXPECT_EQ(1, homa_rpc_reap(&self->hsk, false));
 	EXPECT_STREQ("reaped 1236", unit_log_get());
-	EXPECT_EQ(1, homa_metrics_per_cpu()->deferred_rpc_reaps);
+	IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->deferred_rpc_reaps));
 	unit_log_clear();
 	EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false));
-	EXPECT_EQ(1, homa_metrics_per_cpu()->deferred_rpc_reaps);
+	IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->deferred_rpc_reaps));
 	EXPECT_STREQ("reaped 1234", unit_log_get());
 }
 TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_refs)
@@ -640,15 +644,15 @@ TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_refs)
 	self->homa.reap_limit = 3;
 	EXPECT_EQ(1, homa_rpc_reap(&self->hsk, false));
 	EXPECT_STREQ("reaped 1236", unit_log_get());
-	EXPECT_EQ(1, homa_metrics_per_cpu()->deferred_rpc_reaps);
+	IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->deferred_rpc_reaps));
 	unit_log_clear();
 	EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false));
-	EXPECT_EQ(2, homa_metrics_per_cpu()->deferred_rpc_reaps);
+	IF_NO_STRIP(EXPECT_EQ(2, homa_metrics_per_cpu()->deferred_rpc_reaps));
 	EXPECT_STREQ("", unit_log_get());
 	homa_rpc_put(crpc1);
 	EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false));
 	EXPECT_STREQ("reaped 1234", unit_log_get());
-	EXPECT_EQ(2, homa_metrics_per_cpu()->deferred_rpc_reaps);
+	IF_NO_STRIP(EXPECT_EQ(2, homa_metrics_per_cpu()->deferred_rpc_reaps));
 }
 TEST_F(homa_rpc, homa_rpc_reap__hit_limit_in_msgout_packets)
 {

From dd7de20085ada80263ff829db228a6528fe0370c Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Sun, 30 Mar 2025 16:16:19 -0700
Subject: [PATCH 227/625] Enable homa_freeze and homa_freeze_peers with __STRIP__=1

---
 homa_devel.c    |  6 ++----
 homa_devel.h    | 12 ++++++++++--
 homa_impl.h     |  1 -
 homa_peer.c     |  2 +-
 homa_peer.h     |  2 +-
 homa_plumbing.c |  2 +-
 homa_rpc.c      | 29 ++++++++++++++++++++++++-----
 homa_rpc.h      |  4 ++--
 homa_wire.h     |  4 ++++
 9 files changed, 45 insertions(+), 17 deletions(-)

diff --git a/homa_devel.c b/homa_devel.c
index bfddc25b..26b16873 100644
--- a/homa_devel.c
+++ b/homa_devel.c
@@ -369,7 +369,6 @@ char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len)
 	return buffer;
 }

-#ifndef __STRIP__ /* See strip.py */
 /**
  * homa_freeze_peers() - Send FREEZE packets to all known peers.
  * @homa:    Provides info about peers.
@@ -399,8 +398,8 @@ void homa_freeze_peers(struct homa *homa)
 	freeze.common.type = FREEZE;
 	freeze.common.sport = htons(hsk->port);
 	freeze.common.dport = 0;
-	freeze.common.flags = HOMA_TCP_FLAGS;
-	freeze.common.urgent = htons(HOMA_TCP_URGENT);
+	IF_NO_STRIP(freeze.common.flags = HOMA_TCP_FLAGS);
+	IF_NO_STRIP(freeze.common.urgent = htons(HOMA_TCP_URGENT));
 	freeze.common.sender_id = 0;
 	for (i = 0; i < num_peers; i++) {
 		tt_record1("Sending freeze to 0x%x", tt_addr(peers[i]->addr));
@@ -415,7 +414,6 @@ void homa_freeze_peers(struct homa *homa)
 	rcu_read_unlock();
 	return;
 }
-#endif /* See strip.py */

 /**
  * homa_snprintf() - This function makes it easy to use a series of calls
diff --git a/homa_devel.h b/homa_devel.h
index 3256b3dd..853e9d99 100644
--- a/homa_devel.h
+++ b/homa_devel.h
@@ -9,6 +9,16 @@

 #include "timetrace.h"

+#ifdef __STRIP__
+#define INC_METRIC(...)
+
+#undef LINUX_VERSION_CODE
+#define LINUX_VERSION_CODE 100
+
+#undef KERNEL_VERSION
+#define KERNEL_VERSION(...) 100
+#endif /* __STRIP__ */
+
 struct homa;
 struct homa_rpc;

@@ -62,11 +72,9 @@ static inline void check_addr_valid(void *addr, char *info)
 #define IF_NO_STRIP(code)
 #endif /* See strip.py */

-#ifndef __STRIP__ /* See strip.py */
 void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type,
 		 char *format);
 void homa_freeze_peers(struct homa *homa);
-#endif /* See strip.py */
 char *homa_print_ipv4_addr(__be32 addr);
 char *homa_print_ipv6_addr(const struct in6_addr *addr);
 char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len);
diff --git a/homa_impl.h b/homa_impl.h
index 4fe19ffe..51150a78 100644
--- a/homa_impl.h
+++ b/homa_impl.h
@@ -48,7 +48,6 @@
 #include "homa.h"
 #include
 #include "homa_devel.h"
-#include "homa_strip.h"
 #else /* See strip.py */
 #include
 #endif /* See strip.py */
diff --git a/homa_peer.c b/homa_peer.c
index d23103af..7bb41683 100644
--- a/homa_peer.c
+++ b/homa_peer.c
@@ -62,7 +62,7 @@ void homa_peertab_destroy(struct homa_peertab *peertab)
 	spin_unlock_bh(&peertab->write_lock);
 }

-#ifndef __STRIP__ /* See strip.py */
+#ifndef __UPSTREAM__ /* See strip.py */
 /**
  * homa_peertab_get_peers() - Return information about all of the peers
  * currently known
diff --git a/homa_peer.h b/homa_peer.h
index f85dff11..eb397de0 100644
--- a/homa_peer.h
+++ b/homa_peer.h
@@ -200,7 +200,7 @@ struct homa_peer {
 void homa_dst_refresh(struct homa_peertab *peertab,
 		      struct homa_peer *peer, struct homa_sock *hsk);
 void homa_peertab_destroy(struct homa_peertab *peertab);
-#ifndef __STRIP__ /* See strip.py */
+#ifndef __UPSTREAM__ /* See strip.py */
 struct homa_peer **
 	homa_peertab_get_peers(struct homa_peertab *peertab,
 			       int *num_peers);
diff --git a/homa_plumbing.c b/homa_plumbing.c
index b0135076..d46fb2f9 100644
--- a/homa_plumbing.c
+++ b/homa_plumbing.c
@@ -1434,7 +1434,7 @@ int homa_softirq(struct sk_buff *skb)
 			goto discard;
 		}

-#ifndef __STRIP__ /* See strip.py */
+#ifndef __UPSTREAM__ /* See strip.py */
 		/* Check for FREEZE here, rather than in homa_incoming.c, so
 		 * it will work even if the RPC and/or socket are unknown.
 		 */
diff --git a/homa_rpc.c b/homa_rpc.c
index b6f68dd3..a960d429 100644
--- a/homa_rpc.c
+++ b/homa_rpc.c
@@ -537,7 +537,7 @@ struct homa_rpc *homa_find_server_rpc(struct homa_sock *hsk,
 	return NULL;
 }

-#ifndef __STRIP__ /* See strip.py */
+#ifndef __UPSTREAM__ /* See strip.py */
 /**
  * homa_rpc_log() - Log info about a particular RPC; this is functionality
  * pulled out of homa_rpc_log_active because its indentation got too deep.
@@ -552,12 +552,20 @@ void homa_rpc_log(struct homa_rpc *rpc)
 		pr_notice("%s RPC INCOMING, id %llu, peer %s:%d, %d/%d bytes received, incoming %d\n",
 			  type, rpc->id, peer, rpc->dport,
 			  rpc->msgin.length - rpc->msgin.bytes_remaining,
+#ifndef __STRIP__
 			  rpc->msgin.length, rpc->msgin.granted);
+#else
+			  rpc->msgin.length, 0);
+#endif /* __STRIP__ */
 	else if (rpc->state == RPC_OUTGOING) {
 		pr_notice("%s RPC OUTGOING, id %llu, peer %s:%d, out length %d, left %d, granted %d, in left %d, resend_ticks %u, silent_ticks %d\n",
 			  type, rpc->id, peer, rpc->dport, rpc->msgout.length,
 			  rpc->msgout.length - rpc->msgout.next_xmit_offset,
+#ifndef __STRIP__
 			  rpc->msgout.granted, rpc->msgin.bytes_remaining,
+#else
+			  0, rpc->msgin.bytes_remaining,
+#endif /* __STRIP__ */
 			  rpc->resend_timer_ticks, rpc->silent_ticks);
 	} else {
 		pr_notice("%s RPC %s, id %llu, peer %s:%d, incoming length %d, outgoing length %d\n",
@@ -616,13 +624,16 @@ void homa_rpc_log_tt(struct homa_rpc *rpc)
 			tt_record4("Incoming RPC id %d, peer 0x%x, %d/%d bytes received",
 				   rpc->id, tt_addr(rpc->peer->addr),
 				   received, rpc->msgin.length);
-		if (1)
-			tt_record4("RPC id %d has incoming %d, granted %d, prio %d", rpc->id,
-				   rpc->msgin.granted - received,
-				   rpc->msgin.granted, rpc->msgin.priority);
+#ifndef __STRIP__
+		tt_record4("RPC id %d has incoming %d, granted %d, prio %d", rpc->id,
+			   rpc->msgin.granted - received,
+			   rpc->msgin.granted, rpc->msgin.priority);
 		rank = atomic_read(&rpc->msgin.rank);
 		if (rpc->hsk->homa->active_rpcs[rank] != rpc)
 			rank = -1;
+#else /* __STRIP__ */
+		rank = -1;
+#endif /* __STRIP__ */
 		tt_record4("RPC id %d: length %d, remaining %d, rank %d",
 			   rpc->id, rpc->msgin.length,
 			   rpc->msgin.bytes_remaining, rank);
@@ -637,11 +648,13 @@ void homa_rpc_log_tt(struct homa_rpc *rpc)
 			   rpc->id, tt_addr(rpc->peer->addr),
 			   rpc->msgout.next_xmit_offset,
 			   rpc->msgout.length);
+#ifndef __STRIP__
 		if (rpc->msgout.granted > rpc->msgout.next_xmit_offset)
 			tt_record3("RPC id %d has %d unsent grants (granted %d)",
 				   rpc->id,
 				   rpc->msgout.granted -
 				   rpc->msgout.next_xmit_offset,
 				   rpc->msgout.granted);
+#endif /* __STRIP__ */
 	} else {
 		tt_record2("RPC id %d is in state %d", rpc->id, rpc->state);
 	}
@@ -661,7 +674,9 @@ void homa_rpc_log_active_tt(struct homa *homa, int freeze_count)
 	struct homa_rpc *rpc;
 	int count = 0;

+#ifndef __STRIP__
 	homa_grant_log_tt(homa);
+#endif /* __STRIP__ */
 	tt_record("Logging active Homa RPCs:");
 	rcu_read_lock();
 	for (hsk = homa_socktab_start_scan(homa->port_map, &scan);
@@ -680,9 +695,11 @@ void homa_rpc_log_active_tt(struct homa *homa, int freeze_count)
 			continue;
 		if (rpc->state != RPC_INCOMING)
 			continue;
+#ifndef __STRIP__
 		if (rpc->msgin.granted <= (rpc->msgin.length -
 				rpc->msgin.bytes_remaining))
 			continue;
+#endif /* __STRIP__ */
 		freeze_count--;
 		pr_notice("Emitting FREEZE in %s\n", __func__);
 		homa_xmit_control(FREEZE, &freeze, sizeof(freeze), rpc);
@@ -693,7 +710,9 @@ void homa_rpc_log_active_tt(struct homa *homa, int freeze_count)
 	rcu_read_unlock();
 	tt_record1("Finished logging (%d active Homa RPCs)", count);
 }
+#endif /* See strip.py */

+#ifndef __STRIP__ /* See strip.py */
 /**
  * homa_validate_incoming() - Scan all of the active RPCs to compute what
  * homa_total_incoming should be, and see if it actually matches.
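[Editorial aside -- not part of the patch above or below. The hunks in this
commit split the old single __STRIP__ guard into two levels, which is easy
to miss when reading the diffs: __STRIP__ removes development-only
instrumentation (metrics, timetraces), while __UPSTREAM__ additionally
removes facilities such as FREEZE support that should not appear in an
upstream submission. Below is a minimal self-contained sketch of how the
pieces compose. The guard names and the IF_NO_STRIP macro come from this
patch series; main(), demo_freeze(), and the counter are hypothetical,
added only for illustration.]

/* Build three ways to see the effect (hypothetical demo, not Homa code):
 *   cc demo.c                               full development build
 *   cc -D__STRIP__ demo.c                   stripped, FREEZE support kept
 *   cc -D__STRIP__ -D__UPSTREAM__ demo.c    upstream build
 */
#include <stdio.h>

#ifndef __STRIP__ /* See strip.py */
#define IF_NO_STRIP(code) code
#else /* See strip.py */
#define IF_NO_STRIP(...)
#endif /* See strip.py */

#ifndef __UPSTREAM__ /* See strip.py */
/* Survives -D__STRIP__ but disappears in the upstream build. */
static void demo_freeze(void)
{
	printf("freeze requested\n");
}
#endif /* See strip.py */

int main(void)
{
	int slow_wakeups = 0;

	/* Counted only when instrumentation is compiled in. */
	IF_NO_STRIP(slow_wakeups++);
#ifndef __UPSTREAM__ /* See strip.py */
	demo_freeze();
#endif /* See strip.py */
	printf("slow_wakeups = %d\n", slow_wakeups);
	return 0;
}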
diff --git a/homa_rpc.h b/homa_rpc.h
index ef3f2679..d8e0efa7 100644
--- a/homa_rpc.h
+++ b/homa_rpc.h
@@ -420,7 +420,7 @@ struct homa_rpc
 void homa_rpc_acked(struct homa_sock *hsk,
 		    const struct in6_addr *saddr, struct homa_ack *ack);
 void homa_rpc_end(struct homa_rpc *rpc);
-#ifndef __STRIP__ /* See strip.py */
+#ifndef __UPSTREAM__ /* See strip.py */
 void homa_rpc_log(struct homa_rpc *rpc);
 void homa_rpc_log_active(struct homa *homa, uint64_t id);
 void homa_rpc_log_active_tt(struct homa *homa, int freeze_count);
@@ -434,7 +434,7 @@ struct homa_rpc
 		    const struct in6_addr *source, struct homa_data_hdr *h,
 		    int *created);
 int homa_rpc_reap(struct homa_sock *hsk, bool reap_all);
-#ifndef __STRIP__ /* See strip.py */
+#ifndef __UPSTREAM__ /* See strip.py */
 int homa_validate_incoming(struct homa *homa, int verbose,
 			   int *link_errors);
 #endif /* See strip.py */
diff --git a/homa_wire.h b/homa_wire.h
index 56c42b2e..37303962 100644
--- a/homa_wire.h
+++ b/homa_wire.h
@@ -21,6 +21,8 @@ enum homa_packet_type {
 	BUSY = 0x14,
 #ifndef __STRIP__ /* See strip.py */
 	CUTOFFS = 0x15,
+#endif /* See strip.py */
+#ifndef __UPSTREAM__ /* See strip.py */
 	FREEZE = 0x16,
 #endif /* See strip.py */
 	NEED_ACK = 0x17,
@@ -497,7 +499,9 @@ struct homa_cutoffs_hdr {
 } __packed;
 _Static_assert(sizeof(struct homa_cutoffs_hdr) <= HOMA_MAX_HEADER,
 	       "homa_cutoffs_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER");
+
 #endif /* See strip.py */
+#ifndef __UPSTREAM__ /* See strip.py */
 /**
  * struct homa_freeze_hdr - Wire format for FREEZE packets.
  *

From 4975c3ef844b1a44525e3681bb03b750b71b6b31 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Sun, 30 Mar 2025 16:35:19 -0700
Subject: [PATCH 228/625] Allow compilation against versions of Linux other
 than current

---
 Makefile      | 3 ++-
 test/Makefile | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index dd8258e2..0d691169 100644
--- a/Makefile
+++ b/Makefile
@@ -38,7 +38,8 @@ ifneq ($(KERNEL_SRC),)
 KDIR ?= $(KERNEL_SRC)
 endif

-KDIR ?= /lib/modules/$(shell uname -r)/build
+LINUX_VERSION ?= $(shell uname -r)
+KDIR ?= /lib/modules/$(LINUX_VERSION)/build

 all:
 	$(MAKE) -C $(KDIR) M=$(shell pwd) modules
diff --git a/test/Makefile b/test/Makefile
index c9120c28..c1ec9fb1 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -1,6 +1,7 @@
 # Makefile to run unit tests for Homa

-KDIR ?= /lib/modules/$(shell uname -r)/build
+LINUX_VERSION ?= $(shell uname -r)
+KDIR ?= /lib/modules/$(LINUX_VERSION)/build
 CC ?= gcc
 CXX ?= g++
 PERL ?= perl

From 69bbf4b5e572577a0d61149ac0e083be9bf31933 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Sun, 30 Mar 2025 16:54:22 -0700
Subject: [PATCH 229/625] Allow cperf-based apps to run on nodes without Homa
 loaded

---
 util/cperf.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/util/cperf.py b/util/cperf.py
index fa2fb8fa..8280a3bc 100644
--- a/util/cperf.py
+++ b/util/cperf.py
@@ -68,6 +68,9 @@
 # Value of the "--stripped" option.
 stripped = False

+# Speed of host uplinks.
+link_mbps = 0
+
 # Defaults for command-line options; assumes that servers and clients
 # share nodes.
 default_defaults = {
@@ -333,7 +336,7 @@ def init(options):
     """
     Initialize various global state, such as the log file.
     """
-    global old_slowdown, log_dir, log_file, verbose, delete_rtts
+    global old_slowdown, log_dir, log_file, verbose, delete_rtts, link_mbps
     log_dir = options.log_dir
     old_slowdown = options.old_slowdown
     if not options.plot_only:
@@ -377,7 +380,7 @@ def init(options):
                 s += ", "
             s += ("--%s: %s" % (name, str(opts[name])))
         vlog("Options: %s" % (s))
-    vlog("Homa configuration:")
+    vlog("Homa configuration (node%d):" % (options.nodes[0]))
     for param in ['dead_buffs_limit', 'grant_fifo_fraction',
             'gro_policy', 'link_mbps', 'max_dead_buffs',
             'max_grantable_rpcs', 'max_gro_skbs', 'max_gso_size',
@@ -385,8 +388,11 @@ def init(options):
             'max_rpcs_per_peer', 'num_priorities', 'pacer_fifo_fraction',
             'poll_usecs', 'reap_limit', 'resend_interval', 'resend_ticks',
             'throttle_min_bytes', 'timeout_resends', 'unsched_bytes',
            'window']:
-        result = do_subprocess(['sysctl', '-n', '.net.homa.' + param])
+        result = do_subprocess(['ssh', 'node%d' % (options.nodes[0]),
+                'sysctl', '-n', '.net.homa.' + param])
         vlog("  %-20s %s" % (param, result))
+        if param == 'link_mbps':
+            link_mbps = float(result)

     if options.mtu != 0:
         log("Setting MTU to %d" % (options.mtu))
@@ -1293,6 +1299,7 @@ def get_digest(experiment):
     experiment:      Name of the desired experiment
     """
     global old_slowdown, digests, log_dir, min_rtt, unloaded_p50, delete_rtts
+    global link_mbps

     if experiment in digests:
         return digests[experiment]
@@ -1311,8 +1318,6 @@ def get_digest(experiment):

     avg_slowdowns = []

-    link_mbps = float(get_sysctl_parameter(".net.homa.link_mbps"))
-
     # Read in the RTT files for this experiment.
     files = sorted(glob.glob(log_dir + ("/%s-*.rtts" % (experiment))))
     if len(files) == 0:

From 37fdfb3c29336a1d9573fac2f86c790ad59ee67f Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Sun, 30 Mar 2025 17:38:40 -0700
Subject: [PATCH 230/625] Fix style issues from checkpatch.pl

---
 homa.h          |   2 +-
 homa_devel.c    |   1 -
 homa_devel.h    |   4 +-
 homa_grant.c    |   2 -
 homa_impl.h     |   7 +-
 homa_incoming.c |   6 +-
 homa_interest.c | 369 ++++++++++++++++++++++++------------------------
 homa_interest.h |   2 +-
 homa_metrics.c  |   4 +-
 homa_metrics.h  |   4 +-
 homa_outgoing.c |   6 +-
 homa_peer.c     |   2 +-
 homa_plumbing.c |  10 +-
 homa_skb.c      |   8 +-
 homa_sock.c     |  10 +-
 15 files changed, 217 insertions(+), 220 deletions(-)

diff --git a/homa.h b/homa.h
index 40a084cc..6c3c729b 100644
--- a/homa.h
+++ b/homa.h
@@ -174,7 +174,7 @@ _Static_assert(sizeof(struct homa_abort_args) <= 32, "homa_abort_args grew");
 /**
  * define SO_HOMA_SERVER: setsockopt option for specifying whether a
  * socket will act as server.
- * */
+ */
 #define SO_HOMA_SERVER 11

 /** struct homa_rcvbuf_args - setsockopt argument for SO_HOMA_RCVBUF. */
diff --git a/homa_devel.c b/homa_devel.c
index 26b16873..1ebf54df 100644
--- a/homa_devel.c
+++ b/homa_devel.c
@@ -412,7 +412,6 @@ void homa_freeze_peers(struct homa *homa)

done:
 	rcu_read_unlock();
-	return;
 }

 /**
diff --git a/homa_devel.h b/homa_devel.h
index 853e9d99..a09bcdcc 100644
--- a/homa_devel.h
+++ b/homa_devel.h
@@ -69,7 +69,7 @@ static inline void check_addr_valid(void *addr, char *info)
 #ifndef __STRIP__ /* See strip.py */
 #define IF_NO_STRIP(code) code
 #else /* See strip.py */
-#define IF_NO_STRIP(code)
+#define IF_NO_STRIP(...)
 #endif /* See strip.py */

 void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type,
@@ -81,7 +81,7 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len);
 char *homa_print_packet_short(struct sk_buff *skb, char *buffer,
 			      int buf_len);
 int homa_snprintf(char *buffer, int size, int used,
-		const char *format, ...) __printf(4, 5);
+		  const char *format, ...) __printf(4, 5);
 char *homa_symbol_for_type(uint8_t type);
 char *homa_symbol_for_state(struct homa_rpc *rpc);

diff --git a/homa_grant.c b/homa_grant.c
index cde1037f..299c1eee 100644
--- a/homa_grant.c
+++ b/homa_grant.c
@@ -146,7 +146,6 @@ void homa_grant_add_rpc(struct homa_rpc *rpc)
 	}
done:
 	homa_grantable_unlock(homa);
-	return;
 }

 /**
@@ -219,7 +218,6 @@ void homa_grant_remove_rpc(struct homa_rpc *rpc)

done:
 	homa_grantable_unlock(homa);
-	return;
 }

 /**
diff --git a/homa_impl.h b/homa_impl.h
index 51150a78..754bbb78 100644
--- a/homa_impl.h
+++ b/homa_impl.h
@@ -876,6 +876,7 @@ static inline struct in6_addr
 canonical_ipv6_addr(const union sockaddr_in_union *addr)
 {
 	struct in6_addr mapped;
+
 	if (addr) {
 		if (addr->sa.sa_family == AF_INET6)
 			return addr->in6.sin6_addr;
@@ -1018,7 +1019,7 @@ void homa_rpc_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc);
 void homa_unload(void);
 int homa_wait_private(struct homa_rpc *rpc, int nonblocking);
 struct homa_rpc
-	*homa_wait_shared(struct homa_sock *hsk, int nonblocking);
+	*homa_wait_shared(struct homa_sock *hsk, int nonblocking);
 int homa_xmit_control(enum homa_packet_type type, void *contents,
 		      size_t length, struct homa_rpc *rpc);
 int __homa_xmit_control(void *contents, size_t length,
@@ -1031,7 +1032,7 @@ struct homa_rpc *homa_choose_fifo_grant(struct homa *homa);
 void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk);
 #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0)
-int homa_dointvec(struct ctl_table *table, int write,
+int homa_dointvec(const struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
 #else
 int homa_dointvec(const struct ctl_table *table, int write,
@@ -1047,7 +1048,7 @@ void homa_prios_changed(struct homa *homa);
 void homa_resend_data(struct homa_rpc *rpc, int start, int end,
 		      int priority);
 #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0)
-int homa_sysctl_softirq_cores(struct ctl_table *table, int write,
+int homa_sysctl_softirq_cores(const struct ctl_table *table, int write,
 			      void __user *buffer, size_t *lenp,
 			      loff_t *ppos);
 #else
diff --git a/homa_incoming.c b/homa_incoming.c
index 7d178943..564864fc 100644
--- a/homa_incoming.c
+++ b/homa_incoming.c
@@ -1248,7 +1248,7 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking)
 			break;

 		homa_rpc_unlock(rpc);
-		result = homa_interest_wait(&interest, nonblocking );
+		result = homa_interest_wait(&interest, nonblocking);
 		atomic_or(APP_NEEDS_LOCK, &rpc->flags);
 		homa_rpc_lock(rpc);
@@ -1318,7 +1318,7 @@ struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking)
 	} else {
 		homa_interest_init_shared(&interest, hsk);
 		homa_sock_unlock(hsk);
-		result = homa_interest_wait(&interest, nonblocking );
+		result = homa_interest_wait(&interest, nonblocking);
 		homa_interest_unlink_shared(&interest);

 		if (result != 0) {
@@ -1397,7 +1397,7 @@ void homa_rpc_handoff(struct homa_rpc *rpc)
 	INC_METRIC(handoffs_thread_waiting, 1);

 #ifndef __STRIP__ /* See strip.py */
-	/* Update the last_app_active time for the thread's core, so Homa
+	/* Update the last_app_active time for the thread's core, so
 	 * Homa will try to avoid assigning any work there.
*/ per_cpu(homa_offload_core, interest->core).last_app_active = diff --git a/homa_interest.c b/homa_interest.c index 7b88fbfe..3b87f3b5 100644 --- a/homa_interest.c +++ b/homa_interest.c @@ -1,184 +1,185 @@ -// SPDX-License-Identifier: BSD-2-Clause - -/* This file contains functions for managing homa_interest structs. */ - -#include "homa_impl.h" -#include "homa_interest.h" -#include "homa_rpc.h" -#include "homa_sock.h" - -#ifndef __STRIP__ /* See strip.py */ -#include "homa_offload.h" -#endif /* See strip.py */ - -/** - * homa_interest_init_shared() - Initialize an interest and queue it up on a socket. - * @interest: Interest to initialize - * @hsk: Socket on which the interests should be queued. Must be locked - * by caller. - */ -void homa_interest_init_shared(struct homa_interest *interest, - struct homa_sock *hsk) - __must_hold(&hsk->lock) -{ - interest->rpc = NULL; - atomic_set(&interest->ready, 0); - interest->core = raw_smp_processor_id(); - interest->blocked = 0; - init_waitqueue_head(&interest->wait_queue); - interest->hsk = hsk; - list_add(&interest->links, &hsk->interests); -} - -/** - * homa_interest_init_private() - Initialize an interest that will wait - * on a particular (private) RPC, and link it to that RPC. - * @interest: Interest to initialize. - * @rpc: RPC to associate with the interest. Must be private, and - * caller must have locked it. - * - * Return: 0 for success, otherwise a negative errno. - */ -int homa_interest_init_private(struct homa_interest *interest, - struct homa_rpc *rpc) - __must_hold(&rpc->bucket->lock) -{ - if (rpc->private_interest) - return -EINVAL; - - interest->rpc = rpc; - atomic_set(&interest->ready, 0); - interest->core = raw_smp_processor_id(); - interest->blocked = 0; - init_waitqueue_head(&interest->wait_queue); - interest->hsk = rpc->hsk; - rpc->private_interest = interest; - return 0; -} - -/** - * homa_interest_wait() - Wait for an interest to have an actionable RPC, - * or for an error to occur. - * @interest: Interest to wait for; must previously have been initialized - * and linked to a socket or RPC. On return, the interest - * will have been unlinked if its ready flag is set; otherwise - * it may still be linked. - * @nonblocking: Nonzero means return without blocking if the interest - * doesn't become ready immediately. - * - * Return: 0 for success (there is an actionable RPC in the interest), or - * a negative errno. - */ -int homa_interest_wait(struct homa_interest *interest, int nonblocking) -{ - struct homa_sock *hsk = interest->hsk; - int result = 0; - int iteration; - int wait_err; - -#ifndef __STRIP__ /* See strip.py */ - u64 start, block_start, blocked_time, now; - start = sched_clock(); - blocked_time = 0; -#endif /* See strip.py */ - interest->blocked = 0; - - /* This loop iterates in order to poll and/or reap dead RPCS. */ - for (iteration = 0; ; iteration++) { - if (iteration != 0) - /* Give NAPI/SoftIRQ tasks a chance to run. */ - schedule(); - - if (atomic_read_acquire(&interest->ready) != 0) - goto done; - - /* See if we can cleanup dead RPCs while waiting. 
*/ - if (homa_rpc_reap(hsk, false) != 0) - continue; - - if (nonblocking) { - result = -EAGAIN; - goto done; - } - -#ifndef __STRIP__ /* See strip.py */ - now = sched_clock(); - per_cpu(homa_offload_core, - raw_smp_processor_id()).last_app_active = now; - if (now - start >= 1000 * hsk->homa->poll_usecs) - break; -#else /* See strip.py */ - break; -#endif /* See strip.py */ - } - - interest->blocked = 1; - IF_NO_STRIP(block_start = now); - wait_err = wait_event_interruptible_exclusive(interest->wait_queue, - atomic_read_acquire(&interest->ready) != 0); - IF_NO_STRIP(blocked_time = sched_clock() - block_start); - if (wait_err == -ERESTARTSYS) - result = -EINTR; - -done: -#ifndef __STRIP__ /* See strip.py */ - if (interest->blocked) { - INC_METRIC(slow_wakeups, 1); - INC_METRIC(blocked_ns, blocked_time); - } else { - INC_METRIC(fast_wakeups, 1); - } - INC_METRIC(poll_ns, sched_clock() - start - blocked_time); -#endif /* See strip.py */ - return result; -} - -/** - * homa_interest_notify_private() - If a thread is waiting on the private - * interest for an RPC, wake it up. - * @rpc: RPC that may (potentially) have a private interest. Must be - * locked by the caller. - */ -void homa_interest_notify_private(struct homa_rpc *rpc) - __must_hold(&rpc->bucket->lock) -{ - if (rpc->private_interest) { - atomic_set_release(&rpc->private_interest->ready, 1); - wake_up(&rpc->private_interest->wait_queue); - } -} - -#ifndef __STRIP__ /* See strip.py */ -/** - * homa_choose_interest() - Given all the interests registered for a socket, - * choose the best one to handle an incoming message. - * @hsk: Socket for which message is intended. Must be locked by caller, - * and must have at least one queued interest. - * Return: The interest to use. This function tries to pick an - * interest whose thread is running on a core that isn't - * currently busy doing Homa transport work. - */ -struct homa_interest *homa_choose_interest(struct homa_sock *hsk) - __must_hold(&hsk->lock) -{ - u64 busy_time = sched_clock() - hsk->homa->busy_ns; - struct homa_interest *interest, *first; - - first = list_first_entry(&hsk->interests, struct homa_interest, - links); - list_for_each_entry(interest, &hsk->interests, links) { - if (per_cpu(homa_offload_core, interest->core).last_active < - busy_time) { - if (interest != first) - INC_METRIC(handoffs_alt_thread, 1); - return interest; - } - } - - /* All interested threads are on busy cores; return the first, - * which is also the most recent one to be registered, hence - * most likely to have warm cache state. - */ - return first; -} -#endif /* See strip.py */ +// SPDX-License-Identifier: BSD-2-Clause + +/* This file contains functions for managing homa_interest structs. */ + +#include "homa_impl.h" +#include "homa_interest.h" +#include "homa_rpc.h" +#include "homa_sock.h" + +#ifndef __STRIP__ /* See strip.py */ +#include "homa_offload.h" +#endif /* See strip.py */ + +/** + * homa_interest_init_shared() - Initialize an interest and queue it up on a socket. + * @interest: Interest to initialize + * @hsk: Socket on which the interests should be queued. Must be locked + * by caller. 
+ */ +void homa_interest_init_shared(struct homa_interest *interest, + struct homa_sock *hsk) + __must_hold(&hsk->lock) +{ + interest->rpc = NULL; + atomic_set(&interest->ready, 0); + interest->core = raw_smp_processor_id(); + interest->blocked = 0; + init_waitqueue_head(&interest->wait_queue); + interest->hsk = hsk; + list_add(&interest->links, &hsk->interests); +} + +/** + * homa_interest_init_private() - Initialize an interest that will wait + * on a particular (private) RPC, and link it to that RPC. + * @interest: Interest to initialize. + * @rpc: RPC to associate with the interest. Must be private, and + * caller must have locked it. + * + * Return: 0 for success, otherwise a negative errno. + */ +int homa_interest_init_private(struct homa_interest *interest, + struct homa_rpc *rpc) + __must_hold(&rpc->bucket->lock) +{ + if (rpc->private_interest) + return -EINVAL; + + interest->rpc = rpc; + atomic_set(&interest->ready, 0); + interest->core = raw_smp_processor_id(); + interest->blocked = 0; + init_waitqueue_head(&interest->wait_queue); + interest->hsk = rpc->hsk; + rpc->private_interest = interest; + return 0; +} + +/** + * homa_interest_wait() - Wait for an interest to have an actionable RPC, + * or for an error to occur. + * @interest: Interest to wait for; must previously have been initialized + * and linked to a socket or RPC. On return, the interest + * will have been unlinked if its ready flag is set; otherwise + * it may still be linked. + * @nonblocking: Nonzero means return without blocking if the interest + * doesn't become ready immediately. + * + * Return: 0 for success (there is an actionable RPC in the interest), or + * a negative errno. + */ +int homa_interest_wait(struct homa_interest *interest, int nonblocking) +{ + struct homa_sock *hsk = interest->hsk; + int result = 0; + int iteration; + int wait_err; + +#ifndef __STRIP__ /* See strip.py */ + u64 start, block_start, blocked_time, now; + + start = sched_clock(); + blocked_time = 0; +#endif /* See strip.py */ + interest->blocked = 0; + + /* This loop iterates in order to poll and/or reap dead RPCS. */ + for (iteration = 0; ; iteration++) { + if (iteration != 0) + /* Give NAPI/SoftIRQ tasks a chance to run. */ + schedule(); + + if (atomic_read_acquire(&interest->ready) != 0) + goto done; + + /* See if we can cleanup dead RPCs while waiting. */ + if (homa_rpc_reap(hsk, false) != 0) + continue; + + if (nonblocking) { + result = -EAGAIN; + goto done; + } + +#ifndef __STRIP__ /* See strip.py */ + now = sched_clock(); + per_cpu(homa_offload_core, + raw_smp_processor_id()).last_app_active = now; + if (now - start >= 1000 * hsk->homa->poll_usecs) + break; +#else /* See strip.py */ + break; +#endif /* See strip.py */ + } + + interest->blocked = 1; + IF_NO_STRIP(block_start = now); + wait_err = wait_event_interruptible_exclusive(interest->wait_queue, + atomic_read_acquire(&interest->ready) != 0); + IF_NO_STRIP(blocked_time = sched_clock() - block_start); + if (wait_err == -ERESTARTSYS) + result = -EINTR; + +done: +#ifndef __STRIP__ /* See strip.py */ + if (interest->blocked) { + INC_METRIC(slow_wakeups, 1); + INC_METRIC(blocked_ns, blocked_time); + } else { + INC_METRIC(fast_wakeups, 1); + } + INC_METRIC(poll_ns, sched_clock() - start - blocked_time); +#endif /* See strip.py */ + return result; +} + +/** + * homa_interest_notify_private() - If a thread is waiting on the private + * interest for an RPC, wake it up. + * @rpc: RPC that may (potentially) have a private interest. Must be + * locked by the caller. 
+ */ +void homa_interest_notify_private(struct homa_rpc *rpc) + __must_hold(&rpc->bucket->lock) +{ + if (rpc->private_interest) { + atomic_set_release(&rpc->private_interest->ready, 1); + wake_up(&rpc->private_interest->wait_queue); + } +} + +#ifndef __STRIP__ /* See strip.py */ +/** + * homa_choose_interest() - Given all the interests registered for a socket, + * choose the best one to handle an incoming message. + * @hsk: Socket for which message is intended. Must be locked by caller, + * and must have at least one queued interest. + * Return: The interest to use. This function tries to pick an + * interest whose thread is running on a core that isn't + * currently busy doing Homa transport work. + */ +struct homa_interest *homa_choose_interest(struct homa_sock *hsk) + __must_hold(&hsk->lock) +{ + u64 busy_time = sched_clock() - hsk->homa->busy_ns; + struct homa_interest *interest, *first; + + first = list_first_entry(&hsk->interests, struct homa_interest, + links); + list_for_each_entry(interest, &hsk->interests, links) { + if (per_cpu(homa_offload_core, interest->core).last_active < + busy_time) { + if (interest != first) + INC_METRIC(handoffs_alt_thread, 1); + return interest; + } + } + + /* All interested threads are on busy cores; return the first, + * which is also the most recent one to be registered, hence + * most likely to have warm cache state. + */ + return first; +} +#endif /* See strip.py */ diff --git a/homa_interest.h b/homa_interest.h index 2c307fb3..161e86a2 100644 --- a/homa_interest.h +++ b/homa_interest.h @@ -101,4 +101,4 @@ struct homa_interest *homa_choose_interest(struct homa_sock *hsk); #endif /* See strip.py */ -#endif /* _HOMA_INTEREST_H */ \ No newline at end of file +#endif /* _HOMA_INTEREST_H */ diff --git a/homa_metrics.c b/homa_metrics.c index cd3eceb0..2e41139c 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -25,7 +25,7 @@ struct homa_metrics_output homa_mout; * homa_metrics_init() - Initialize global information related to metrics. * Return: 0 for success, otherwise a negative errno. */ -int homa_metrics_init() +int homa_metrics_init(void) { mutex_init(&homa_mout.mutex); homa_mout.output = NULL; @@ -43,7 +43,7 @@ int homa_metrics_init() * homa_metrics_end() - Called to clean up metrics information when the * Homa module unloads. */ -void homa_metrics_end() +void homa_metrics_end(void) { if (homa_mout.dir_entry) proc_remove(homa_mout.dir_entry); diff --git a/homa_metrics.h b/homa_metrics.h index 95c92ca2..f10a0555 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -710,8 +710,8 @@ static inline struct homa_metrics *homa_metrics_per_cpu(void) * different core and races with an INC_METRIC there, the worst that * happens is that one of the INC_METRICs is lost, which isn't a big deal. 
*/ -#define INC_METRIC(metric, count) per_cpu(homa_metrics, \ - raw_smp_processor_id()).metric += (count) +#define INC_METRIC(metric, count) (per_cpu(homa_metrics, \ + raw_smp_processor_id()).metric += (count)) extern struct homa_metrics_output homa_mout; diff --git a/homa_outgoing.c b/homa_outgoing.c index a1c9da49..7c527631 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -266,7 +266,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) homa_rpc_hold(rpc); if (unlikely(iter->count > HOMA_MAX_MESSAGE_LENGTH || - iter->count == 0)) { + iter->count == 0)) { tt_record2("homa_message_out_fill found bad length %d for id %d", iter->count, rpc->id); err = -EINVAL; @@ -350,7 +350,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) skb_data_bytes = bytes_left; skb = homa_new_data_packet(rpc, iter, offset, skb_data_bytes, max_seg_data); - if (unlikely(IS_ERR(skb))) { + if (IS_ERR(skb)) { err = PTR_ERR(skb); homa_rpc_lock(rpc); goto error; @@ -979,7 +979,7 @@ int homa_pacer_main(void *transport) else { tt_record("pacer sleeping"); INC_METRIC(throttled_ns, sched_clock() - - homa->throttle_add); + homa->throttle_add); } #endif /* See strip.py */ INC_METRIC(pacer_ns, sched_clock() - homa->pacer_wake_time); diff --git a/homa_peer.c b/homa_peer.c index 7bb41683..e825f010 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -226,7 +226,7 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab, } peer->dst = dst; #ifndef __STRIP__ /* See strip.py */ - peer->unsched_cutoffs[HOMA_MAX_PRIORITIES - 1] = 0; + peer->unsched_cutoffs[HOMA_MAX_PRIORITIES - 1] = 0; peer->unsched_cutoffs[HOMA_MAX_PRIORITIES - 2] = INT_MAX; INIT_LIST_HEAD(&peer->grantable_rpcs); INIT_LIST_HEAD(&peer->grantable_links); diff --git a/homa_plumbing.c b/homa_plumbing.c index d46fb2f9..f89f37b1 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -157,7 +157,7 @@ static struct inet6_protocol homav6_protocol = { * @data fields are actually offsets within a struct homa; these are converted * to pointers into a net-specific struct homa later. */ -#define OFFSET(field) (void *) offsetof(struct homa, field) +#define OFFSET(field) ((void *) offsetof(struct homa, field)) static struct ctl_table homa_ctl_table[] = { { .procname = "action", @@ -911,8 +911,8 @@ int homa_setsockopt(struct sock *sk, int level, int optname, return -EFAULT; /* Do a trivial test to make sure we can at least write the first - * page of the region. - */ + * page of the region. + */ if (copy_to_user(u64_to_user_ptr(args.start), &args, sizeof(args))) return -EFAULT; @@ -1666,7 +1666,7 @@ __poll_t homa_poll(struct file *file, struct socket *sock, * Return: 0 for success, nonzero for error. */ #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) -int homa_dointvec(struct ctl_table *table, int write, +int homa_dointvec(const struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) #else int homa_dointvec(const struct ctl_table *table, int write, @@ -1757,7 +1757,7 @@ int homa_dointvec(const struct ctl_table *table, int write, * Return: 0 for success, nonzero for error. 
*/ #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) -int homa_sysctl_softirq_cores(struct ctl_table *table, int write, +int homa_sysctl_softirq_cores(const struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) #else int homa_sysctl_softirq_cores(const struct ctl_table *table, int write, diff --git a/homa_skb.c b/homa_skb.c index 12c8ed5f..d0c7a5a9 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -99,11 +99,9 @@ void homa_skb_cleanup(struct homa *homa) homa->page_pools[i] = NULL; } - if (homa->skb_pages_to_free) { - kfree(homa->skb_pages_to_free); - homa->skb_pages_to_free = NULL; - homa->pages_to_free_slots = 0; - } + kfree(homa->skb_pages_to_free); + homa->skb_pages_to_free = NULL; + homa->pages_to_free_slots = 0; } /** diff --git a/homa_sock.c b/homa_sock.c index aad33200..03a693cc 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -292,8 +292,8 @@ void homa_sock_shutdown(struct homa_sock *hsk) tx_memory = refcount_read(&hsk->sock.sk_wmem_alloc); if (tx_memory != 1) { - pr_err("homa_sock_shutdown found sk_wmem_alloc %llu bytes, port %d\n", - tx_memory, hsk->port); + pr_err("%s found sk_wmem_alloc %llu bytes, port %d\n", + __func__, tx_memory, hsk->port); #ifdef __UNIT_TEST__ FAIL(" sk_wmem_alloc %llu after shutdown for port %d", tx_memory, hsk->port); @@ -373,7 +373,7 @@ int homa_sock_bind(struct homa_socktab *socktab, struct homa_sock *hsk, * @port: The port of interest. * Return: The socket that owns @port, or NULL if none. If non-NULL * then this method has taken a reference on the socket and - * the caller must call sock_put to release it. + * the caller must call sock_put to release it. */ struct homa_sock *homa_sock_find(struct homa_socktab *socktab, __u16 port) { @@ -461,8 +461,8 @@ int homa_sock_wait_wmem(struct homa_sock *hsk, int nonblocking) tt_record2("homa_sock_wait_wmem waiting on port %d, wmem %d", hsk->port, refcount_read(&hsk->sock.sk_wmem_alloc)); result = wait_event_interruptible_timeout(*sk_sleep(&hsk->sock), - homa_sock_wmem_avl(hsk) || hsk->shutdown, - timeo); + homa_sock_wmem_avl(hsk) || hsk->shutdown, + timeo); tt_record4("homa_sock_wait_wmem woke up on port %d with result %d, wmem %d, signal pending %d", hsk->port, result, refcount_read(&hsk->sock.sk_wmem_alloc), signal_pending(current)); From 5220dc1f36adef6bc07cbbe3743e0f34d407a98f Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 31 Mar 2025 09:56:22 -0700 Subject: [PATCH 231/625] Various improvements to strip.py --- util/strip.py | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/util/strip.py b/util/strip.py index 14f4a1bb..7cde6b54 100755 --- a/util/strip.py +++ b/util/strip.py @@ -127,6 +127,19 @@ def last_non_blank(s): return s2[-1] return None +def blank_next_ok(line): + """ + Given a line, return True if it is OK for this line to be followed by + a blank line. False means that if the next line to be output is blank, + it should be dropped. 
+ """ + s = line.strip() + if s == '': + return False + if s.endswith('{') or s.endswith('*/'): + return False + return True + def scan(file): """ Read a file, remove information that shouldn't appear in the Linux kernel @@ -227,7 +240,6 @@ def scan(file): if in_labeled_skip != None: if line.startswith('#endif /* See strip.py */'): in_labeled_skip = None - check_braces = False continue elif line.startswith('#else /* See strip.py */'): in_labeled_skip = 0 @@ -236,22 +248,22 @@ def scan(file): continue if line.startswith('#ifndef __STRIP__ /* See strip.py */') or ( line.startswith('#ifndef __UPSTREAM__ /* See strip.py */')): - if slines[-1].strip() == '': + if not blank_next_ok(slines[-1]): delete_empty_line = True in_labeled_skip = 1 - check_braces = False + check_braces = True continue if line.startswith('#ifdef __STRIP__ /* See strip.py */') : - if slines[-1].strip() == '': + if not blank_next_ok(slines[-1]): slines.pop() in_labeled_skip = 0 - check_braces = False + check_braces = True continue # Strip tt_freeze() statements. if pline == 'tt_freeze();': check_braces = True - if slines[-1].strip() == '': + if not blank_next_ok(slines[-1]): delete_empty_line = True continue @@ -288,7 +300,7 @@ def scan(file): if pline[-1] != ';': skip_statement = True - if slines[-1].strip() == '': + if not blank_next_ok(slines[-1]): delete_empty_line = True check_braces = True continue @@ -297,14 +309,14 @@ def scan(file): if (pline.startswith('UNIT_LOG(') or pline.startswith('UNIT_HOOK(')): if pline[-1] != ';': skip_statement = True - if slines[-1].strip() == '': + if not blank_next_ok(slines[-1]): delete_empty_line = True check_braces = True continue # Strip #include "homa_strip.h" statements. if pline.startswith('#include "homa_strip.h"'): - if slines[-1].strip() == '': + if not blank_next_ok(slines[-1]): delete_empty_line = True continue @@ -320,12 +332,12 @@ def scan(file): continue elif line.startswith('#ifdef __UNIT_TEST__'): in_unit = 'if' - if slines[-1].strip() == '': + if not blank_next_ok(slines[-1]): delete_empty_line = True continue elif line.startswith('#ifndef __UNIT_TEST__'): in_unit = 'else' - if slines[-1].strip() == '': + if not blank_next_ok(slines[-1]): delete_empty_line = True continue @@ -341,7 +353,7 @@ def scan(file): continue elif line.startswith('#if LINUX_VERSION_CODE'): in_version = 'if' - if slines[-1].strip() == '': + if not blank_next_ok(slines[-1]): delete_empty_line = True continue @@ -353,9 +365,9 @@ def scan(file): delete_empty_line = False # Remove braces for blocks that now have only a single statement - if pline[0] == '}': + if pline == '}' or pline.startswith('} else'): if check_braces: - check_braces = False; + check_braces = False if open_index != None: if statements_in_block == 0: print('%s:%d: stripping creates empty block' % @@ -371,6 +383,7 @@ def scan(file): if pline[-1] == '{' and line[0] != '{': statements_in_block = 0 open_index = len(slines) + check_braces = False # Count statements if non_comment and non_comment[-1] == ';': From 22f0c4b30954614a77f8f7f87ccf7d15c7115ee1 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 31 Mar 2025 10:41:45 -0700 Subject: [PATCH 232/625] Cleanups related to stripping --- homa_impl.h | 14 +++++++------- homa_plumbing.c | 2 ++ homa_sock.c | 4 ++-- homa_timer.c | 38 +++++++++----------------------------- util/cperf.py | 28 ++++++++++++++++------------ 5 files changed, 36 insertions(+), 50 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 754bbb78..0a083258 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -718,13 
+718,6 @@ struct homa { struct ctl_table_header *sysctl_header; #endif /* See strip.py */ -#ifndef __UPSTREAM__ /* See strip.py */ - /** - * @sysctl_action: This value is set by sysctl to invoke one of - * several actions for testing. It is normally zero. - */ - int sysctl_action; - /** * @timer_kthread: Thread that runs timer code to detect lost * packets and crashed peers. @@ -740,6 +733,13 @@ struct homa { */ bool destroyed; +#ifndef __UPSTREAM__ /* See strip.py */ + /** + * @sysctl_action: This value is set by sysctl to invoke one of + * several actions for testing. It is normally zero. + */ + int sysctl_action; + /** * @temp: the values in this array can be read and written with sysctl. * They have no officially defined purpose, and are available for diff --git a/homa_plumbing.c b/homa_plumbing.c index f89f37b1..0f103db6 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1315,9 +1315,11 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, done: if (unlikely(copy_to_user((__force void __user *)msg->msg_control, &control, sizeof(control)))) { +#ifndef __UPSTREAM__ /* See strip.py */ /* Note: in this case the message's buffers will be leaked. */ pr_notice("%s couldn't copy back args to 0x%px\n", __func__, msg->msg_control); +#endif /* See strip.py */ result = -EFAULT; } diff --git a/homa_sock.c b/homa_sock.c index 03a693cc..8bb43037 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -10,7 +10,7 @@ #ifdef __UNIT_TEST__ #define KSELFTEST_NOT_MAIN 1 #include "test/kselftest_harness.h" -#endif +#endif /* __UNIT_TEST__ */ /** * homa_socktab_init() - Constructor for homa_socktabs. @@ -297,7 +297,7 @@ void homa_sock_shutdown(struct homa_sock *hsk) #ifdef __UNIT_TEST__ FAIL(" sk_wmem_alloc %llu after shutdown for port %d", tx_memory, hsk->port); -#endif +#endif /* __UNIT_TEST__ */ } if (hsk->buffer_pool) { diff --git a/homa_timer.c b/homa_timer.c index 6c2b3a8f..938ea18b 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -27,9 +27,6 @@ void homa_check_rpc(struct homa_rpc *rpc) { struct homa *homa = rpc->hsk->homa; struct homa_resend_hdr resend; -#ifndef __STRIP__ /* See strip.py */ - const char *us, *them; -#endif /* See strip.py */ /* See if we need to request an ack for this RPC. 
*/ if (!homa_is_client(rpc->id) && rpc->state == RPC_OUTGOING && @@ -140,41 +137,24 @@ void homa_check_rpc(struct homa_rpc *rpc) resend.priority = homa->num_priorities - 1; #endif /* See strip.py */ homa_xmit_control(RESEND, &resend, sizeof(resend), rpc); +#ifndef __UPSTREAM__ /* See strip.py */ if (homa_is_client(rpc->id)) { -#ifndef __STRIP__ /* See strip.py */ - us = "client"; - them = "server"; -#endif /* See strip.py */ tt_record4("Sent RESEND for client RPC id %llu, server 0x%x:%d, offset %d", rpc->id, tt_addr(rpc->peer->addr), rpc->dport, rpc->msgin.recv_end); -#ifndef __STRIP__ /* See strip.py */ - tt_record4("length %d, granted %d, rem %d, rec_incoming %d", - rpc->msgin.length, rpc->msgin.granted, - rpc->msgin.bytes_remaining, - atomic_read(&rpc->msgin.rec_incoming)); -#endif /* See strip.py */ + /* Should be if (homa->verbose) */ + pr_notice("Homa client RESEND to %s:%d for id %llu, offset %d\n", + homa_print_ipv6_addr(&rpc->peer->addr), + rpc->dport, rpc->id, rpc->msgin.recv_end); } else { -#ifndef __STRIP__ /* See strip.py */ - us = "server"; - them = "client"; -#endif /* See strip.py */ tt_record4("Sent RESEND for server RPC id %llu, client 0x%x:%d offset %d", rpc->id, tt_addr(rpc->peer->addr), rpc->dport, rpc->msgin.recv_end); -#ifndef __STRIP__ /* See strip.py */ - tt_record4("length %d, granted %d, rem %d, rec_incoming %d", - rpc->msgin.length, rpc->msgin.granted, - rpc->msgin.bytes_remaining, - atomic_read(&rpc->msgin.rec_incoming)); -#endif /* See strip.py */ + /* Should be if (homa->verbose) */ + pr_notice("Homa server RESEND to %s:%d for id %llu, offset %d\n", + homa_print_ipv6_addr(&rpc->peer->addr), + rpc->dport, rpc->id, rpc->msgin.recv_end); } -#ifndef __STRIP__ /* See strip.py */ - if (homa->verbose) - pr_notice("Homa %s RESEND to %s %s:%d for id %llu, offset %d, length %d\n", us, them, - homa_print_ipv6_addr(&rpc->peer->addr), - rpc->dport, rpc->id, rpc->msgin.recv_end, - rpc->msgin.granted - rpc->msgin.recv_end); #endif /* See strip.py */ } diff --git a/util/cperf.py b/util/cperf.py index 8280a3bc..c27ed7db 100644 --- a/util/cperf.py +++ b/util/cperf.py @@ -337,6 +337,7 @@ def init(options): Initialize various global state, such as the log file. """ global old_slowdown, log_dir, log_file, verbose, delete_rtts, link_mbps + global stripped log_dir = options.log_dir old_slowdown = options.old_slowdown if not options.plot_only: @@ -381,18 +382,21 @@ def init(options): s += ("--%s: %s" % (name, str(opts[name]))) vlog("Options: %s" % (s)) vlog("Homa configuration (node%d):" % (options.nodes[0])) - for param in ['dead_buffs_limit', 'grant_fifo_fraction', - 'gro_policy', 'link_mbps', 'max_dead_buffs', - 'max_grantable_rpcs', 'max_gro_skbs', 'max_gso_size', - 'max_nic_queue_ns', 'max_incoming', 'max_overcommit', - 'max_rpcs_per_peer', 'num_priorities', 'pacer_fifo_fraction', - 'poll_usecs', 'reap_limit', 'resend_interval', 'resend_ticks', - 'throttle_min_bytes', 'timeout_resends', 'unsched_bytes', 'window']: - result = do_subprocess(['ssh', 'node%d' % (options.nodes[0]), - 'sysctl', '-n', '.net.homa.' 
+ param]) - vlog(" %-20s %s" % (param, result)) - if param == 'link_mbps': - link_mbps = float(result) + if not options.stripped: + for param in ['dead_buffs_limit', 'grant_fifo_fraction', + 'gro_policy', 'link_mbps', 'max_dead_buffs', + 'max_grantable_rpcs', 'max_gro_skbs', 'max_gso_size', + 'max_nic_queue_ns', 'max_incoming', 'max_overcommit', + 'max_rpcs_per_peer', 'num_priorities', 'pacer_fifo_fraction', + 'poll_usecs', 'reap_limit', 'resend_interval', 'resend_ticks', + 'throttle_min_bytes', 'timeout_resends', 'unsched_bytes', 'window']: + result = do_subprocess(['ssh', 'node%d' % (options.nodes[0]), + 'sysctl', '-n', '.net.homa.' + param]) + vlog(" %-20s %s" % (param, result)) + if param == 'link_mbps': + link_mbps = float(result) + else: + link_mbps = 25000 if options.mtu != 0: log("Setting MTU to %d" % (options.mtu)) From 7cedd9292a9145ebfaae39702078fe00c1a0f712 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 31 Mar 2025 14:34:02 -0700 Subject: [PATCH 233/625] Trivial order change in Makefile.upstream --- Makefile.upstream | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.upstream b/Makefile.upstream index c82fbc22..2b3c3aff 100644 --- a/Makefile.upstream +++ b/Makefile.upstream @@ -7,8 +7,8 @@ homa-y:= homa_incoming.o \ homa_interest.o \ homa_outgoing.o \ homa_peer.o \ - homa_pool.o \ homa_plumbing.o \ + homa_pool.o \ homa_rpc.o \ homa_sock.o \ homa_timer.o \ From e197a9bfc7908b1a8e93314d560adf0fa3b0c160 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 31 Mar 2025 16:54:53 -0700 Subject: [PATCH 234/625] Fix issues from kernel-doc --- homa_pool.c | 2 +- homa_rpc.h | 2 ++ homa_sock.h | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/homa_pool.c b/homa_pool.c index e8b37853..a260d4dc 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -134,7 +134,7 @@ void homa_pool_get_rcvbuf(struct homa_sock *hsk, } /** - * homa_pool_bpage_available() - Check whether a bpage is available for use. + * homa_bpage_available() - Check whether a bpage is available for use. * @bpage: Bpage to check * @now: Current time (sched_clock() units) * Return: True if the bpage is free or if it can be stolen, otherwise diff --git a/homa_rpc.h b/homa_rpc.h index d8e0efa7..58ced478 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -548,6 +548,8 @@ static inline bool homa_is_client(u64 id) * homa_rpc_needs_attention() - Returns true if @rpc has failed or if * its incoming message is ready for attention by an application thread * (e.g., packets are ready to copy to user space). + * @rpc: RPC to check. + * Return: See above */ static inline bool homa_rpc_needs_attention(struct homa_rpc *rpc) { diff --git a/homa_sock.h b/homa_sock.h index 0702bfdc..aba3a565 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -392,10 +392,11 @@ static inline struct homa_sock *homa_sk(const struct sock *sk) } /** - * homa_sock_wmem_avl()) - Returns true if the socket is within its limit + * homa_sock_wmem_avl() - Returns true if the socket is within its limit * for output memory usage. False means that no new messages should be sent * until memory is freed. * @hsk: Socket of interest. + * Return: See above. 
*/ static inline bool homa_sock_wmem_avl(struct homa_sock *hsk) { From 9f55c4385aacc44c62079a735e98444f40de9150 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 31 Mar 2025 16:55:15 -0700 Subject: [PATCH 235/625] Minor bug in Makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0d691169..e25d9010 100644 --- a/Makefile +++ b/Makefile @@ -61,7 +61,7 @@ CP_HDRS := homa_impl.h \ homa_sock.h \ homa_stub.h \ homa_wire.h -CP_SRCS := $(patsubst %.o,%.c,$(filter-out timetrace.o, $(HOMA_OBJS))) +CP_SRCS := $(patsubst %.o,%.c,$(filter-out homa_devel.o timetrace.o, $(HOMA_OBJS))) CP_EXTRAS := reap.txt \ sync.txt \ Makefile From 43ecff2ec327330dff92d65a06ed1af606ed159f Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 1 Apr 2025 10:58:59 -0700 Subject: [PATCH 236/625] Fix syntax errors in regex patterns in config script --- cloudlab/bin/config | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cloudlab/bin/config b/cloudlab/bin/config index 1995eff6..4f50270e 100755 --- a/cloudlab/bin/config +++ b/cloudlab/bin/config @@ -134,7 +134,7 @@ def get_interfaces(): or (current == 'eno1')): interface = current continue - if re.match('^[ ]+ inet 10\.0\.1\.', line): + if re.match(r'^[ ]+ inet 10\.0\.1\.', line): vlan = current if not vlan or not interface: print("Found the following interfaces: %s" % (available)) @@ -198,7 +198,7 @@ def get_node_num(): return node_num hostname = subprocess.run(["hostname"], stdout=subprocess.PIPE, encoding="utf-8", check=True).stdout - match = re.match('node([0-9]+)\.', hostname) + match = re.match(r'node([0-9]+)\.', hostname) if not match: raise Exception("Couldn't figure out node number for this node") node_num = int(match.group(1)) @@ -313,7 +313,7 @@ def read_cpu_info(): sockets = {} f = open("/proc/cpuinfo", "r") for line in f: - match = re.match('([^\t]*)[\t ]+: (.*)', line) + match = re.match(r'([^\t]*)[\t ]+: (.*)', line) if match: name = match.group(1) value = match.group(2) From 083008ae3e6b0ecea4762132ea19fd2c1d921f17 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 1 Apr 2025 17:06:51 -0700 Subject: [PATCH 237/625] Fix #ifdefs in mock.c for running without lock checking etc. 
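
A note on the pattern (illustrative sketch, using one of the mocks changed
below): each mock is now guarded by the same CONFIG_* symbol that gates the
real kernel definition, rather than by a kernel-version check, so the unit
tests build correctly whether or not lock checking is configured into the
kernel:

	/* Define the mock only in kernels where the real function
	 * exists; with version-based #ifs the mock could be defined
	 * (or missing) in configurations where the kernel disagrees.
	 */
	#ifdef CONFIG_DEBUG_LOCK_ALLOC
	int debug_lockdep_rcu_enabled(void)
	{
		return 0;
	}
	#endif /* CONFIG_DEBUG_LOCK_ALLOC */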
--- test/mock.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/mock.c b/test/mock.c index db2b5c95..f1c0ce35 100644 --- a/test/mock.c +++ b/test/mock.c @@ -399,7 +399,7 @@ void __copy_overflow(int size, unsigned long count) abort(); } -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0) +#ifdef CONFIG_DEBUG_LOCK_ALLOC int debug_lockdep_rcu_enabled(void) { return 0; @@ -859,10 +859,12 @@ void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size) return mock_kmalloc(size, gfpflags); } +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP void __might_sleep(const char *file, int line) { UNIT_HOOK("might_sleep"); } +#endif void *mock_kmalloc(size_t size, gfp_t flags) { @@ -940,9 +942,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, struct lockdep_map *nest_lock, unsigned long ip) {} -#endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0) void lockdep_rcu_suspicious(const char *file, const int line, const char *s) {} #endif @@ -963,7 +963,7 @@ void lock_sock_nested(struct sock *sk, int subclass) sk->sk_lock.owned = 1; } -ssize_t __modver_version_show(const struct module_attribute *a, +ssize_t __modver_version_show(struct module_attribute *a, struct module_kobject *b, char *c) { return 0; @@ -1145,12 +1145,12 @@ bool rcu_is_watching(void) return true; } +#ifdef CONFIG_DEBUG_LOCK_ALLOC int rcu_read_lock_any_held(void) { return 1; } -#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0) int rcu_read_lock_held(void) { return 0; From 9a4002075371ffb02a4a47b30a61e6d64adda2e5 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 1 Apr 2025 17:07:45 -0700 Subject: [PATCH 238/625] Fix race in pacer (wakeups could get lost) --- homa_impl.h | 22 ++++--- homa_outgoing.c | 128 ++++++++++++++------------------------ homa_utils.c | 5 +- test/unit_homa_outgoing.c | 126 ++++++++++++++++++++++++------------- 4 files changed, 145 insertions(+), 136 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 0a083258..449cbf34 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -543,19 +543,18 @@ struct homa { */ int max_dead_buffs; - /** - * @pacer_kthread: Kernel thread that transmits packets from - * throttled_rpcs in a way that limits queue buildup in the - * NIC. - */ - struct task_struct *pacer_kthread; - /** * @pacer_exit: true means that the pacer thread should exit as * soon as possible. */ bool pacer_exit; + /** + * @pacer_wait_queue: Used to block the pacer thread when there + * are no throttled RPCs. + */ + struct wait_queue_head pacer_wait_queue; + /** * @max_nic_queue_ns: Limits the NIC queue length: we won't queue * up a packet for transmission if link_idle_time is this many @@ -563,6 +562,13 @@ struct homa { */ int max_nic_queue_ns; + /** + * @pacer_kthread: Kernel thread that transmits packets from + * throttled_rpcs in a way that limits queue buildup in the + * NIC. + */ + struct task_struct *pacer_kthread; + /** * @ns_per_mbyte: the number of ns that it takes to transmit * 10**6 bytes on our uplink. 
This is actually a slight overestimate @@ -993,7 +999,7 @@ int homa_net_init(struct net *net); void homa_net_exit(struct net *net); int homa_pacer_main(void *transport); void homa_pacer_stop(struct homa *homa); -bool homa_pacer_xmit(struct homa *homa); +void homa_pacer_xmit(struct homa *homa); __poll_t homa_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, diff --git a/homa_outgoing.c b/homa_outgoing.c index 7c527631..23602a9b 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -956,95 +956,61 @@ int homa_check_nic_queue(struct homa *homa, struct sk_buff *skb, bool force) int homa_pacer_main(void *transport) { struct homa *homa = (struct homa *)transport; - bool work_left; - homa->pacer_wake_time = sched_clock(); while (1) { - if (homa->pacer_exit) { - homa->pacer_wake_time = 0; + if (homa->pacer_exit) break; + homa_pacer_xmit(homa); + if (!list_empty(&homa->throttled_rpcs)) { + /* NIC queue is full; before calling pacer again, + * give other threads a chance to run (otherwise + * low-level packet processing such as softirq could + * get locked out). + */ + schedule(); + continue; } - work_left = homa_pacer_xmit(homa); - /* Sleep this thread if the throttled list is empty. Even - * if the throttled list isn't empty, call the scheduler - * to give other processes a chance to run (if we don't, - * softirq handlers can get locked out, which prevents - * incoming packets from being handled). - */ - set_current_state(TASK_INTERRUPTIBLE); - if (work_left) - __set_current_state(TASK_RUNNING); -#ifndef __STRIP__ /* See strip.py */ - else { - tt_record("pacer sleeping"); - INC_METRIC(throttled_ns, sched_clock() - - homa->throttle_add); - } -#endif /* See strip.py */ - INC_METRIC(pacer_ns, sched_clock() - homa->pacer_wake_time); - homa->pacer_wake_time = 0; - schedule(); - homa->pacer_wake_time = sched_clock(); - __set_current_state(TASK_RUNNING); + tt_record("pacer sleeping"); + wait_event(homa->pacer_wait_queue, homa->pacer_exit || + !list_empty(&homa->throttled_rpcs)); + tt_record("pacer woke up"); } kthread_complete_and_exit(&homa_pacer_kthread_done, 0); return 0; } /** - * homa_pacer_xmit() - Transmit packets from the throttled list. Note: - * this function may be invoked from either process context or softirq (BH) - * level. This function is invoked from multiple places, not just in the - * pacer thread. The reason for this is that (as of 10/2019) Linux's scheduling - * of the pacer thread is unpredictable: the thread may block for long periods - * of time (e.g., because it is assigned to the same CPU as a busy interrupt - * handler). This can result in poor utilization of the network link. So, - * this method gets invoked from other places as well, to increase the - * likelihood that we keep the link busy. Those other invocations are not - * guaranteed to happen, so the pacer thread provides a backstop. + * homa_pacer_xmit() - Transmit packets from the throttled list until + * either (a) the throttled list is empty or (b) the NIC queue has + * reached maximum allowable length. Note: this function may be invoked + * from either process context or softirq (BH) level. This function is + * invoked from multiple places, not just in the pacer thread. The reason + * for this is that (as of 10/2019) Linux's scheduling of the pacer thread + * is unpredictable: the thread may block for long periods of time (e.g., + * because it is assigned to the same CPU as a busy interrupt handler). 
+ * This can result in poor utilization of the network link. So, this method + * gets invoked from other places as well, to increase the likelihood that we + * keep the link busy. Those other invocations are not guaranteed to happen, + * so the pacer thread provides a backstop. * @homa: Overall data about the Homa protocol implementation. - * Return: False if there are no throttled RPCs at the time this - * function returns, true if there are throttled RPCs or - * if the answer is unknown at the time of return. */ -bool homa_pacer_xmit(struct homa *homa) +void homa_pacer_xmit(struct homa *homa) { struct homa_rpc *rpc; - bool result = true; - int i; + s64 queue_ns; - /* Make sure only one instance of this function executes at a - * time. - */ + /* Make sure only one instance of this function executes at a time. */ if (!spin_trylock_bh(&homa->pacer_mutex)) - return true; + return; - /* Each iteration through the following loop sends one packet. We - * limit the number of passes through this loop in order to cap the - * time spent in one call to this function (see note in - * homa_pacer_main about interfering with softirq handlers). - */ - for (i = 0; i < 5; i++) { - u64 idle_time, now; - - /* If the NIC queue is too long, wait until it gets shorter. */ - now = sched_clock(); - idle_time = atomic64_read(&homa->link_idle_time); - while ((now + homa->max_nic_queue_ns) < idle_time) { - /* If we've xmitted at least one packet then - * return (this helps with testing and also - * allows homa_pacer_main to yield the core). - */ - if (i != 0) - goto done; - now = sched_clock(); - } - /* Note: when we get here, it's possible that the NIC queue is - * still too long because other threads have queued packets, - * but we transmit anyway so we don't starve (see perf.text - * for more info). - */ + homa->pacer_wake_time = sched_clock(); + while (1) { + queue_ns = atomic64_read(&homa->link_idle_time) - sched_clock(); + if (queue_ns >= homa->max_nic_queue_ns) + break; + if (list_empty(&homa->throttled_rpcs)) + break; /* Lock the first throttled RPC. This may not be possible * because we have to hold throttle_lock while locking @@ -1071,11 +1037,10 @@ bool homa_pacer_xmit(struct homa *homa) } } else { rpc = list_first_entry_or_null(&homa->throttled_rpcs, - struct homa_rpc, - throttled_links); + struct homa_rpc, + throttled_links); } if (!rpc) { - result = false; homa_throttle_unlock(homa); break; } @@ -1097,12 +1062,12 @@ bool homa_pacer_xmit(struct homa *homa) */ #ifndef __STRIP__ /* See strip.py */ if (!*rpc->msgout.next_xmit || rpc->msgout.next_xmit_offset >= - rpc->msgout.granted) { + rpc->msgout.granted) { #else /* See strip.py */ if (!*rpc->msgout.next_xmit) { #endif /* See strip.py */ - /* Nothing more to transmit from this message (right - * now), so remove it from the throttled list. + /* No more data can be transmitted from this message + * (right now), so remove it from the throttled list. 
*/ homa_throttle_lock(homa); if (!list_empty(&rpc->throttled_links)) { @@ -1110,14 +1075,13 @@ bool homa_pacer_xmit(struct homa *homa) rpc->id, rpc->msgout.next_xmit_offset); list_del_init(&rpc->throttled_links); } - result = !list_empty(&homa->throttled_rpcs); homa_throttle_unlock(homa); } homa_rpc_unlock(rpc); } -done: + INC_METRIC(pacer_ns, sched_clock() - homa->pacer_wake_time); + homa->pacer_wake_time = 0; spin_unlock_bh(&homa->pacer_mutex); - return result; } /** @@ -1128,7 +1092,7 @@ bool homa_pacer_xmit(struct homa *homa) void homa_pacer_stop(struct homa *homa) { homa->pacer_exit = true; - wake_up_process(homa->pacer_kthread); + wake_up(&homa->pacer_wait_queue); kthread_stop(homa->pacer_kthread); homa->pacer_kthread = NULL; } @@ -1178,7 +1142,7 @@ void homa_add_to_throttled(struct homa_rpc *rpc) list_add_tail(&rpc->throttled_links, &homa->throttled_rpcs); done: homa_throttle_unlock(homa); - wake_up_process(homa->pacer_kthread); + wake_up(&homa->pacer_wait_queue); INC_METRIC(throttle_list_adds, 1); INC_METRIC(throttle_list_checks, checks); // tt_record("woke up pacer thread"); diff --git a/homa_utils.c b/homa_utils.c index c1ea05e6..a82d0d0f 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -115,6 +115,9 @@ int homa_init(struct homa *homa) homa->request_ack_ticks = 2; homa->reap_limit = 10; homa->dead_buffs_limit = 5000; + homa->pacer_exit = false; + init_waitqueue_head(&homa->pacer_wait_queue); + homa->max_nic_queue_ns = 5000; homa->pacer_kthread = kthread_run(homa_pacer_main, homa, "homa_pacer"); if (IS_ERR(homa->pacer_kthread)) { @@ -123,8 +126,6 @@ int homa_init(struct homa *homa) pr_err("couldn't create homa pacer thread: error %d\n", err); return err; } - homa->pacer_exit = false; - homa->max_nic_queue_ns = 5000; homa->wmem_max = 100000000; #ifndef __STRIP__ /* See strip.py */ homa->verbose = 0; diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 9834b450..dfd95bee 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -47,6 +47,17 @@ void mock_resend_data(struct homa_rpc *rpc, int start, int end, mock_resend_data(rpc, start, end, priority); #endif /* See strip.py */ +static int hook_count; +static void remove_throttled_hook(char *id) { + if (strcmp(id, "spin_lock") != 0) + return; + if (hook_count <= 0) + return; + hook_count--; + if (hook_count == 0) + homa_remove_from_throttled(hook_rpc); +} + /* Compute the expected "truesize" value for a Homa packet, given * the number of bytes of message data in the packet. */ @@ -86,6 +97,7 @@ FIXTURE_SETUP(homa_outgoing) #ifndef __STRIP__ /* See strip.py */ self->homa.unsched_bytes = 10000; self->homa.window_param = 10000; + self->homa.pacer_fifo_fraction = 0; #endif /* See strip.py */ mock_sock_init(&self->hsk, &self->homa, self->client_port); self->server_addr.in6.sin6_family = AF_INET; @@ -851,8 +863,7 @@ TEST_F(homa_outgoing, homa_xmit_data__force) /* Now force transmission. 
*/ unit_log_clear(); homa_xmit_data(crpc2, true); - EXPECT_STREQ("xmit DATA 1400@0; wake_up_process pid -1", - unit_log_get()); + EXPECT_STREQ("xmit DATA 1400@0", unit_log_get()); unit_log_clear(); unit_log_throttled(&self->homa); EXPECT_STREQ("request id 1234, next_offset 2800; " @@ -871,8 +882,7 @@ TEST_F(homa_outgoing, homa_xmit_data__throttle) homa_xmit_data(crpc, false); EXPECT_STREQ("xmit DATA 1400@0; " - "xmit DATA 1400@1400; " - "wake_up_process pid -1", unit_log_get()); + "xmit DATA 1400@1400", unit_log_get()); unit_log_clear(); unit_log_throttled(&self->homa); EXPECT_STREQ("request id 1234, next_offset 2800", unit_log_get()); @@ -1242,7 +1252,7 @@ TEST_F(homa_outgoing, homa_pacer_xmit__basics) self->homa.max_nic_queue_ns = 2000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; unit_log_clear(); - EXPECT_EQ(1, homa_pacer_xmit(&self->homa)); + homa_pacer_xmit(&self->homa); EXPECT_STREQ("xmit DATA 1400@0; xmit DATA 1400@1400", unit_log_get()); unit_log_clear(); @@ -1251,6 +1261,54 @@ TEST_F(homa_outgoing, homa_pacer_xmit__basics) "request id 1236, next_offset 0; " "request id 1238, next_offset 0", unit_log_get()); } +TEST_F(homa_outgoing, homa_pacer_xmit__pacer_already_active) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, + 10000, 1000); + + homa_add_to_throttled(crpc); + self->homa.max_nic_queue_ns = 2000; + self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; + mock_trylock_errors = 1; + unit_log_clear(); + homa_pacer_xmit(&self->homa); + EXPECT_STREQ("", unit_log_get()); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("request id 1234, next_offset 0", unit_log_get()); +} +TEST_F(homa_outgoing, homa_pacer_xmit__nic_queue_fills) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, + 10000, 1000); + + homa_add_to_throttled(crpc); + self->homa.max_nic_queue_ns = 2001; + mock_ns = 10000; + atomic64_set(&self->homa.link_idle_time, 12000); + self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; + unit_log_clear(); + homa_pacer_xmit(&self->homa); + + /* Just room for one packet before NIC queue fills. */ + EXPECT_STREQ("xmit DATA 1400@0", unit_log_get()); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("request id 1234, next_offset 1400", unit_log_get()); +} +TEST_F(homa_outgoing, homa_pacer_xmit__queue_empty) +{ + self->homa.max_nic_queue_ns = 2000; + self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; + unit_log_clear(); + homa_pacer_xmit(&self->homa); + unit_log_throttled(&self->homa); + EXPECT_STREQ("", unit_log_get()); +} TEST_F(homa_outgoing, homa_pacer_xmit__xmit_fifo) { struct homa_rpc *crpc1, *crpc2, *crpc3; @@ -1277,7 +1335,7 @@ TEST_F(homa_outgoing, homa_pacer_xmit__xmit_fifo) self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; unit_log_clear(); mock_xmit_log_verbose = 1; - EXPECT_EQ(1, homa_pacer_xmit(&self->homa)); + homa_pacer_xmit(&self->homa); EXPECT_SUBSTR("id 4, message_length 10000, offset 0, data_length 1400", unit_log_get()); unit_log_clear(); @@ -1290,7 +1348,7 @@ TEST_F(homa_outgoing, homa_pacer_xmit__xmit_fifo) /* Second attempt: pacer_fifo_count reaches zero. 
*/ atomic64_set(&self->homa.link_idle_time, 10000); unit_log_clear(); - EXPECT_EQ(1, homa_pacer_xmit(&self->homa)); + homa_pacer_xmit(&self->homa); EXPECT_SUBSTR("id 2, message_length 20000, offset 0, data_length 1400", unit_log_get()); unit_log_clear(); @@ -1300,7 +1358,7 @@ TEST_F(homa_outgoing, homa_pacer_xmit__xmit_fifo) "request id 6, next_offset 0", unit_log_get()); EXPECT_EQ(900, self->homa.pacer_fifo_count); } -TEST_F(homa_outgoing, homa_pacer_xmit__pacer_busy) +TEST_F(homa_outgoing, homa_pacer_xmit__rpc_removed_from_queue_before_locked) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, @@ -1308,43 +1366,18 @@ TEST_F(homa_outgoing, homa_pacer_xmit__pacer_busy) 10000, 1000); homa_add_to_throttled(crpc); - self->homa.max_nic_queue_ns = 2000; - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - mock_trylock_errors = 1; - unit_log_clear(); - EXPECT_EQ(1, homa_pacer_xmit(&self->homa)); - EXPECT_STREQ("", unit_log_get()); - unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("request id 1234, next_offset 0", unit_log_get()); -} -TEST_F(homa_outgoing, homa_pacer_xmit__queue_empty) -{ - self->homa.max_nic_queue_ns = 2000; + self->homa.max_nic_queue_ns = 10000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; unit_log_clear(); - EXPECT_EQ(0, homa_pacer_xmit(&self->homa)); - unit_log_throttled(&self->homa); - EXPECT_STREQ("", unit_log_get()); -} -TEST_F(homa_outgoing, homa_pacer_xmit__nic_queue_fills) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, - 10000, 1000); + unit_hook_register(remove_throttled_hook); + hook_rpc = crpc; + hook_count = 2; + homa_pacer_xmit(&self->homa); - homa_add_to_throttled(crpc); - self->homa.max_nic_queue_ns = 2001; - mock_ns = 10000; - atomic64_set(&self->homa.link_idle_time, 12000); - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - unit_log_clear(); - EXPECT_EQ(1, homa_pacer_xmit(&self->homa)); - EXPECT_STREQ("xmit DATA 1400@0", unit_log_get()); + EXPECT_STREQ("removing id 1234 from throttled list", unit_log_get()); unit_log_clear(); unit_log_throttled(&self->homa); - EXPECT_STREQ("request id 1234, next_offset 1400", unit_log_get()); + EXPECT_STREQ("", unit_log_get()); } TEST_F(homa_outgoing, homa_pacer_xmit__rpc_locked) { @@ -1358,14 +1391,14 @@ TEST_F(homa_outgoing, homa_pacer_xmit__rpc_locked) self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; unit_log_clear(); mock_trylock_errors = ~1; - EXPECT_EQ(1, homa_pacer_xmit(&self->homa)); + homa_pacer_xmit(&self->homa); EXPECT_STREQ("", unit_log_get()); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->pacer_skipped_rpcs); #endif /* See strip.py */ unit_log_clear(); mock_trylock_errors = 0; - EXPECT_EQ(1, homa_pacer_xmit(&self->homa)); + homa_pacer_xmit(&self->homa); EXPECT_STREQ("xmit DATA 1400@0; xmit DATA 1400@1400", unit_log_get()); } @@ -1387,7 +1420,7 @@ TEST_F(homa_outgoing, homa_pacer_xmit__remove_from_queue) unit_log_clear(); /* First call completes id 2, but id 4 is still in the queue. */ - EXPECT_EQ(1, homa_pacer_xmit(&self->homa)); + homa_pacer_xmit(&self->homa); EXPECT_STREQ("xmit DATA 1000@0; xmit DATA 1400@0", unit_log_get()); unit_log_clear(); @@ -1398,7 +1431,7 @@ TEST_F(homa_outgoing, homa_pacer_xmit__remove_from_queue) /* Second call completes id 4, queue now empty. 
*/ unit_log_clear(); self->homa.max_nic_queue_ns = 10000; - EXPECT_EQ(0, homa_pacer_xmit(&self->homa)); + homa_pacer_xmit(&self->homa); EXPECT_STREQ("xmit DATA 600@1400", unit_log_get()); unit_log_clear(); unit_log_throttled(&self->homa); @@ -1427,7 +1460,10 @@ TEST_F(homa_outgoing, homa_add_to_throttled__basics) self->server_port, 10, 10000, 1000); /* Basics: add one RPC. */ + mock_log_wakeups = 1; + unit_log_clear(); homa_add_to_throttled(crpc1); + EXPECT_STREQ("wake_up", unit_log_get()); unit_log_clear(); unit_log_throttled(&self->homa); EXPECT_STREQ("request id 2, next_offset 0", unit_log_get()); @@ -1446,7 +1482,9 @@ TEST_F(homa_outgoing, homa_add_to_throttled__basics) "request id 6, next_offset 0", unit_log_get()); /* Don't reinsert if already present. */ + unit_log_clear(); homa_add_to_throttled(crpc1); + EXPECT_STREQ("", unit_log_get()); unit_log_clear(); unit_log_throttled(&self->homa); EXPECT_STREQ("request id 4, next_offset 0; " From 38b52d047f98875e5bfda2badcd1fc6656847f90 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 1 Apr 2025 17:08:36 -0700 Subject: [PATCH 239/625] Fix regex strings in cperf.py that had backslashes --- util/cperf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util/cperf.py b/util/cperf.py index c27ed7db..7181893a 100644 --- a/util/cperf.py +++ b/util/cperf.py @@ -1062,7 +1062,7 @@ def scan_logs(): experiments = defaultdict(lambda : defaultdict(dict)) for file in sorted(glob.glob(log_dir + "/node*.log")): - node = re.match('.*/(node[0-9]+)\.log', file).group(1) + node = re.match(r'.*/(node[0-9]+)\.log', file).group(1) scan_log(file, node, experiments) for name, exp in experiments.items(): @@ -1188,7 +1188,7 @@ def scan_metrics(experiment): if match: metrics['cores'][file] = float(match.group(1)) continue - match = re.match('([^ ]+) +([0-9]+) +\( *([0-9.]+ *[MKG]?)/s', line) + match = re.match(r'([^ ]+) +([0-9]+) +\( *([0-9.]+ *[MKG]?)/s', line) if not match: continue name = match.group(1) From ac2b7c5c6479a58d4550ac60fc5cfdb998072230 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 1 Apr 2025 17:26:59 -0700 Subject: [PATCH 240/625] Change the way cperf.py logs Homa's sysctl values New approach (a) ensures that all values are printed, and (b) does it with a single ssh command. --- util/cperf.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/util/cperf.py b/util/cperf.py index 7181893a..325e3729 100644 --- a/util/cperf.py +++ b/util/cperf.py @@ -69,7 +69,7 @@ stripped = False # Speed of host uplinks. -link_mbps = 0 +link_mbps = None # Defaults for command-line options; assumes that servers and clients # share nodes. @@ -382,20 +382,21 @@ def init(options): s += ("--%s: %s" % (name, str(opts[name]))) vlog("Options: %s" % (s)) vlog("Homa configuration (node%d):" % (options.nodes[0])) - if not options.stripped: - for param in ['dead_buffs_limit', 'grant_fifo_fraction', - 'gro_policy', 'link_mbps', 'max_dead_buffs', - 'max_grantable_rpcs', 'max_gro_skbs', 'max_gso_size', - 'max_nic_queue_ns', 'max_incoming', 'max_overcommit', - 'max_rpcs_per_peer', 'num_priorities', 'pacer_fifo_fraction', - 'poll_usecs', 'reap_limit', 'resend_interval', 'resend_ticks', - 'throttle_min_bytes', 'timeout_resends', 'unsched_bytes', 'window']: - result = do_subprocess(['ssh', 'node%d' % (options.nodes[0]), - 'sysctl', '-n', '.net.homa.' 
+ param]) - vlog(" %-20s %s" % (param, result)) - if param == 'link_mbps': - link_mbps = float(result) - else: + result = subprocess.run(['ssh', 'node%d' % (options.nodes[0]), + 'sysctl', '-a'], capture_output=True, encoding="utf-8") + if (result.returncode != 0): + log("sysctl -a on node%d exited with status %d:" % + (options.nodes[0], result.returncode)) + log(result.stderr.rstrip()) + for line in result.stdout.splitlines(): + match = re.match('.*net.homa.([^ ]+) = (.*)', line) + if match: + name = match.group(1) + value = match.group(2) + vlog(" %-20s %s" % (name, value)) + if name == 'link_mbps': + link_mbps = float(value) + if link_mbps == None: link_mbps = 25000 if options.mtu != 0: From 47e1f27b3dc042014a587cc2578ea2e23f01baf5 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 2 Apr 2025 09:44:05 -0700 Subject: [PATCH 241/625] Fix issues from kdoc and checkpatch.pl --- homa.h | 4 ++-- homa_impl.h | 6 +++--- homa_incoming.c | 19 ++++++++++++------- homa_outgoing.c | 4 ++-- homa_plumbing.c | 11 ++++++----- homa_pool.c | 6 ++++++ homa_rpc.c | 3 ++- homa_sock.h | 8 ++++---- homa_wire.h | 2 ++ 9 files changed, 39 insertions(+), 24 deletions(-) diff --git a/homa.h b/homa.h index 6c3c729b..2db52801 100644 --- a/homa.h +++ b/homa.h @@ -70,6 +70,7 @@ struct homa_sendmsg_args { */ __u32 flags; + /** @reserved: Not currently used. */ __u32 reserved; }; @@ -196,8 +197,7 @@ struct homa_rcvbuf_args { */ #define HOMA_FLAG_DONT_THROTTLE 2 -/** - * I/O control calls on Homa sockets. These are mapped into the +/* I/O control calls on Homa sockets. These are mapped into the * SIOCPROTOPRIVATE range of 0x89e0 through 0x89ef. */ diff --git a/homa_impl.h b/homa_impl.h index 449cbf34..37691c09 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -1109,7 +1109,7 @@ static inline void homa_check_pacer(struct homa *homa, int softirq) */ static inline struct homa *homa_from_net(struct net *net) { - return (struct homa *) net_generic(net, homa_net_id); + return (struct homa *)net_generic(net, homa_net_id); } /** @@ -1120,7 +1120,7 @@ static inline struct homa *homa_from_net(struct net *net) */ static inline struct homa *homa_from_sock(struct sock *sock) { - return (struct homa *) net_generic(sock_net(sock), homa_net_id); + return (struct homa *)net_generic(sock_net(sock), homa_net_id); } /** @@ -1131,7 +1131,7 @@ static inline struct homa *homa_from_sock(struct sock *sock) */ static inline struct homa *homa_from_skb(struct sk_buff *skb) { - return (struct homa *) net_generic(dev_net(skb->dev), homa_net_id); + return (struct homa *)net_generic(dev_net(skb->dev), homa_net_id); } extern struct completion homa_pacer_kthread_done; diff --git a/homa_incoming.c b/homa_incoming.c index 564864fc..149e3e9d 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -856,7 +856,6 @@ void homa_rpc_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc) #endif /* See strip.py */ goto done; } - #ifndef __STRIP__ /* See strip.py */ pr_err("Received unknown for RPC id %llu, peer %s:%d in bogus state %d; discarding unknown\n", rpc->id, homa_print_ipv6_addr(&rpc->peer->addr), @@ -865,16 +864,19 @@ void homa_rpc_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc) tt_record4("Discarding unknown for RPC id %d, peer 0x%x:%d: bad state %d", rpc->id, tt_addr(rpc->peer->addr), rpc->dport, rpc->state); - } else { #ifndef __STRIP__ /* See strip.py */ + } else { if (rpc->hsk->homa->verbose) pr_notice("Ending rpc id %llu from client %s:%d: unknown to client", rpc->id, homa_print_ipv6_addr(&rpc->peer->addr), rpc->dport); -#endif /* See 
strip.py */ homa_rpc_end(rpc); INC_METRIC(server_rpcs_unknown, 1); +#else /* See strip.py */ + } else { + homa_rpc_end(rpc); +#endif /* See strip.py */ } done: kfree_skb(skb); @@ -929,12 +931,14 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, * the entire response), or if we can't find peer info. */ if (rpc && (rpc->state != RPC_INCOMING || - rpc->msgin.bytes_remaining)) { #ifndef __STRIP__ /* See strip.py */ + rpc->msgin.bytes_remaining)) { tt_record3("NEED_ACK arrived for id %d before message received, state %d, remaining %d", rpc->id, rpc->state, rpc->msgin.bytes_remaining); homa_freeze(rpc, NEED_ACK_MISSING_DATA, "Freezing because NEED_ACK received before message complete, id %d, peer 0x%x"); +#else /* See strip.py */ + rpc->msgin.bytes_remaining)) { #endif /* See strip.py */ goto done; } else { @@ -1298,10 +1302,11 @@ struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking) goto done; } if (!list_empty(&hsk->ready_rpcs)) { - rpc = list_first_entry(&hsk->ready_rpcs, struct homa_rpc, - ready_links); + rpc = list_first_entry(&hsk->ready_rpcs, + struct homa_rpc, + ready_links); tt_record2("homa_wait_shared found rpc id %d, pid %d via ready_rpcs, blocked 0", - rpc->id, current->pid); + rpc->id, current->pid); homa_rpc_hold(rpc); list_del_init(&rpc->ready_links); if (!list_empty(&hsk->ready_rpcs)) { diff --git a/homa_outgoing.c b/homa_outgoing.c index 23602a9b..8348675d 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -1029,7 +1029,7 @@ void homa_pacer_xmit(struct homa *homa) homa->pacer_fifo_count += 1000; rpc = NULL; list_for_each_entry(cur, &homa->throttled_rpcs, - throttled_links) { + throttled_links) { if (cur->msgout.init_ns < oldest) { rpc = cur; oldest = cur->msgout.init_ns; @@ -1135,7 +1135,7 @@ void homa_add_to_throttled(struct homa_rpc *rpc) candidate->msgout.next_xmit_offset; if (bytes_left_cand > bytes_left) { list_add_tail(&rpc->throttled_links, - &candidate->throttled_links); + &candidate->throttled_links); goto done; } } diff --git a/homa_plumbing.c b/homa_plumbing.c index 0f103db6..318c68e8 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -576,7 +576,7 @@ int __init homa_load(void) status = register_pernet_subsys(&homa_net_ops); if (status != 0) { pr_err("Homa got error from register_pernet_subsys: %d\n", - status); + status); goto net_err; } @@ -910,15 +910,16 @@ int homa_setsockopt(struct sock *sk, int level, int optname, if (copy_from_sockptr(&args, optval, optlen)) return -EFAULT; - /* Do a trivial test to make sure we can at least write the first - * page of the region. + /* Do a trivial test to make sure we can at least write the + * first page of the region. 
*/ if (copy_to_user(u64_to_user_ptr(args.start), &args, sizeof(args))) return -EFAULT; homa_sock_lock(hsk); - ret = homa_pool_init(hsk, u64_to_user_ptr(args.start), args.length); + ret = homa_pool_init(hsk, u64_to_user_ptr(args.start), + args.length); homa_sock_unlock(hsk); INC_METRIC(so_set_buf_calls, 1); INC_METRIC(so_set_buf_ns, sched_clock() - start); @@ -1035,7 +1036,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) result = -EFAULT; goto error; } - if ((args.flags & ~HOMA_SENDMSG_VALID_FLAGS) || + if (args.flags & ~HOMA_SENDMSG_VALID_FLAGS || (args.reserved != 0)) { result = -EINVAL; goto error; diff --git a/homa_pool.c b/homa_pool.c index a260d4dc..d69eb69c 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -301,10 +301,16 @@ int homa_pool_allocate(struct homa_rpc *rpc) goto new_page; } if ((core->allocated + partial) > HOMA_BPAGE_SIZE) { +#ifndef __STRIP__ /* See strip.py */ if (atomic_read(&bpage->refs) == 1) { /* Bpage is totally free, so we can reuse it. */ core->allocated = 0; INC_METRIC(bpage_reuses, 1); +#else /* See strip.py */ + if (atomic_read(&bpage->refs) == 1) { + /* Bpage is totally free, so we can reuse it. */ + core->allocated = 0; +#endif /* See strip.py */ } else { bpage->owner = -1; diff --git a/homa_rpc.c b/homa_rpc.c index a960d429..d7bc5c11 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -379,7 +379,8 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) } /* Collect buffers and freeable RPCs. */ - list_for_each_entry_safe(rpc, tmp, &hsk->dead_rpcs, dead_links) { + list_for_each_entry_safe(rpc, tmp, &hsk->dead_rpcs, + dead_links) { int refs; /* Make sure that all outstanding uses of the RPC have diff --git a/homa_sock.h b/homa_sock.h index aba3a565..ff7d1f75 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -321,8 +321,8 @@ static inline int homa_port_hash(__u16 port) * * Return: The bucket in which this RPC will appear, if the RPC exists. */ -static inline struct homa_rpc_bucket *homa_client_rpc_bucket(struct homa_sock *hsk, - u64 id) +static inline struct homa_rpc_bucket + *homa_client_rpc_bucket(struct homa_sock *hsk, u64 id) { /* We can use a really simple hash function here because RPC ids * are allocated sequentially. @@ -339,8 +339,8 @@ static inline struct homa_rpc_bucket *homa_client_rpc_bucket(struct homa_sock *h * * Return: The bucket in which this RPC will appear, if the RPC exists. */ -static inline struct homa_rpc_bucket *homa_server_rpc_bucket(struct homa_sock *hsk, - u64 id) +static inline struct homa_rpc_bucket + *homa_server_rpc_bucket(struct homa_sock *hsk, u64 id) { /* Each client allocates RPC ids sequentially, so they will * naturally distribute themselves across the hash space. diff --git a/homa_wire.h b/homa_wire.h index 37303962..dc324aa5 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -135,6 +135,7 @@ struct homa_common_hdr { __u8 flags; #define HOMA_TCP_FLAGS 6 #else /* See strip.py */ + /** @reserved1: Not used (corresponds to TCP flags). */ __u8 reserved1; #endif /* See strip.py */ @@ -160,6 +161,7 @@ struct homa_common_hdr { __be16 urgent; #define HOMA_TCP_URGENT 0xb97d #else /* See strip.py */ + /** @reserved2: Not used (corresponds to TCP urgent field). 
*/ __be16 reserved2; #endif /* See strip.py */ From 0320003d8723081f70c124d7dbe221efaeb49289 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 2 Apr 2025 09:50:52 -0700 Subject: [PATCH 242/625] Fix unused variable found by kernel test robot --- homa_incoming.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index 149e3e9d..35630f90 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -1215,7 +1215,7 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) { struct homa_interest interest; int result = 0; - int iteration; + IF_NO_STRIP(int iteration); if (!(atomic_read(&rpc->flags) & RPC_PRIVATE)) return -EINVAL; @@ -1227,7 +1227,11 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) * (e.g. copy to user space). It may take many iterations until the * RPC is ready for the application. */ +#ifndef __STRIP__ /* See strip.py */ for (iteration = 0; ; iteration++) { +#else /* See strip.py */ + while (1) { +#endif /* See strip.py */ if (!rpc->error) rpc->error = homa_copy_to_user(rpc); if (rpc->error) { @@ -1286,7 +1290,7 @@ struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking) { struct homa_interest interest; struct homa_rpc *rpc; - int iteration; + IF_NO_STRIP(int iteration); int result; /* Each iteration through this loop waits until an RPC needs attention @@ -1294,7 +1298,11 @@ struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking) * (e.g. copy to user space). It may take many iterations until an * RPC is ready for the application. */ +#ifndef __STRIP__ /* See strip.py */ for (iteration = 0; ; iteration++) { +#else /* See strip.py */ + while (1) { +#endif /* See strip.py */ homa_sock_lock(hsk); if (hsk->shutdown) { rpc = ERR_PTR(-ESHUTDOWN); From fcc8595a1a27b351451c4fb5c3228975c2287b84 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 2 Apr 2025 12:46:41 -0700 Subject: [PATCH 243/625] Clean up metrics related to waiting for messages --- homa_incoming.c | 55 ++++++++++++++++++++++++--------------- homa_interest.c | 6 +---- homa_metrics.c | 14 +++++----- homa_metrics.h | 25 +++++++----------- test/unit_homa_incoming.c | 13 +++++---- test/unit_homa_interest.c | 3 --- util/metrics.py | 16 +++--------- 7 files changed, 63 insertions(+), 69 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index 35630f90..eda30052 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -1214,8 +1214,11 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) __must_hold(&rpc->bucket->lock) { struct homa_interest interest; +#ifndef __STRIP__ /* See strip.py */ + int avail_immediately = 1; + int blocked = 0; +#endif /* See strip.py */ int result = 0; - IF_NO_STRIP(int iteration); if (!(atomic_read(&rpc->flags) & RPC_PRIVATE)) return -EINVAL; @@ -1227,11 +1230,7 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) * (e.g. copy to user space). It may take many iterations until the * RPC is ready for the application. 
*/ -#ifndef __STRIP__ /* See strip.py */ - for (iteration = 0; ; iteration++) { -#else /* See strip.py */ while (1) { -#endif /* See strip.py */ if (!rpc->error) rpc->error = homa_copy_to_user(rpc); if (rpc->error) { @@ -1241,13 +1240,8 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) if (rpc->msgin.length >= 0 && rpc->msgin.bytes_remaining == 0 && skb_queue_len(&rpc->msgin.packets) == 0) { -#ifndef __STRIP__ /* See strip.py */ - if (iteration == 0) { - tt_record2("homa_wait_private found rpc id %d, pid %d via null, blocked 0", - rpc->id, current->pid); - INC_METRIC(fast_wakeups, 1); - } -#endif /* See strip.py */ + tt_record2("homa_wait_private found rpc id %d, pid %d via null, blocked 0", + rpc->id, current->pid); break; } @@ -1257,6 +1251,10 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) homa_rpc_unlock(rpc); result = homa_interest_wait(&interest, nonblocking); +#ifndef __STRIP__ /* See strip.py */ + avail_immediately = 0; + blocked |= interest.blocked; +#endif /* See strip.py */ atomic_or(APP_NEEDS_LOCK, &rpc->flags); homa_rpc_lock(rpc); @@ -1272,6 +1270,14 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) break; } +#ifndef __STRIP__ /* See strip.py */ + if (avail_immediately) + INC_METRIC(wait_none, 1); + else if (blocked) + INC_METRIC(wait_block, 1); + else + INC_METRIC(wait_fast, 1); +#endif /* See strip.py */ homa_rpc_put(rpc); return result; } @@ -1290,7 +1296,10 @@ struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking) { struct homa_interest interest; struct homa_rpc *rpc; - IF_NO_STRIP(int iteration); +#ifndef __STRIP__ /* See strip.py */ + int avail_immediately = 1; + int blocked = 0; +#endif /* See strip.py */ int result; /* Each iteration through this loop waits until an RPC needs attention @@ -1298,11 +1307,7 @@ struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking) * (e.g. copy to user space). It may take many iterations until an * RPC is ready for the application. 
*/ -#ifndef __STRIP__ /* See strip.py */ - for (iteration = 0; ; iteration++) { -#else /* See strip.py */ while (1) { -#endif /* See strip.py */ homa_sock_lock(hsk); if (hsk->shutdown) { rpc = ERR_PTR(-ESHUTDOWN); @@ -1324,14 +1329,14 @@ struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking) hsk->sock.sk_data_ready(&hsk->sock); } homa_sock_unlock(hsk); -#ifndef __STRIP__ /* See strip.py */ - if (iteration == 0) - INC_METRIC(fast_wakeups, 1); -#endif /* See strip.py */ } else { homa_interest_init_shared(&interest, hsk); homa_sock_unlock(hsk); result = homa_interest_wait(&interest, nonblocking); +#ifndef __STRIP__ /* See strip.py */ + avail_immediately = 0; + blocked |= interest.blocked; +#endif /* See strip.py */ homa_interest_unlink_shared(&interest); if (result != 0) { @@ -1371,6 +1376,14 @@ struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking) } done: +#ifndef __STRIP__ /* See strip.py */ + if (avail_immediately) + INC_METRIC(wait_none, 1); + else if (blocked) + INC_METRIC(wait_block, 1); + else + INC_METRIC(wait_fast, 1); +#endif /* See strip.py */ return rpc; } diff --git a/homa_interest.c b/homa_interest.c index 3b87f3b5..17d26fb9 100644 --- a/homa_interest.c +++ b/homa_interest.c @@ -123,12 +123,8 @@ int homa_interest_wait(struct homa_interest *interest, int nonblocking) done: #ifndef __STRIP__ /* See strip.py */ - if (interest->blocked) { - INC_METRIC(slow_wakeups, 1); + if (interest->blocked) INC_METRIC(blocked_ns, blocked_time); - } else { - INC_METRIC(fast_wakeups, 1); - } INC_METRIC(poll_ns, sched_clock() - start - blocked_time); #endif /* See strip.py */ return result; diff --git a/homa_metrics.c b/homa_metrics.c index 2e41139c..89121f56 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -179,16 +179,14 @@ char *homa_metrics_print(void) m->skb_page_alloc_ns); M("requests_received %15llu Incoming request messages\n", m->requests_received); - M("requests_queued %15llu Requests for which no thread was waiting\n", - m->requests_queued); M("responses_received %15llu Incoming response messages\n", m->responses_received); - M("responses_queued %15llu Responses for which no thread was waiting\n", - m->responses_queued); - M("fast_wakeups %15llu Messages received while polling\n", - m->fast_wakeups); - M("slow_wakeups %15llu Messages received after thread went to sleep\n", - m->slow_wakeups); + M("wait_none %15llu Messages received without blocking or polling\n", + m->wait_none); + M("wait_fast %15llu Messages received while polling\n", + m->wait_fast); + M("wait_block %15llu Messages received after thread went to sleep\n", + m->wait_block); M("handoffs_thread_waiting %15llu RPC handoffs to waiting threads (vs. queue)\n", m->handoffs_thread_waiting); M("handoffs_alt_thread %15llu RPC handoffs not to first on list (avoid busy core)\n", diff --git a/homa_metrics.h b/homa_metrics.h index f10a0555..be4fc62f 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -103,34 +103,29 @@ struct homa_metrics { */ u64 requests_received; - /** - * @requests_queued: total number of requests that were added to - * @homa->ready_requests (no thread was waiting). - */ - u64 requests_queued; - /** * @responses_received: total number of response messages received. */ u64 responses_received; /** - * @responses_queued: total number of responses that were added to - * @homa->ready_responses (no thread was waiting). + * @wait_none: total number of times that an incoming message was + * already waiting when recvmsg was invoked. 
 	 */
-	u64 responses_queued;
+	u64 wait_none;
 
 	/**
-	 * @fast_wakeups: total number of times that a message arrived for
-	 * a receiving thread that was polling in homa_wait_for_message.
+	 * @wait_fast: total number of times that a message arrived for
+	 * a receiving thread while it was polling (i.e. the message
+	 * wasn't immediately available, but the thread never blocked).
 	 */
-	u64 fast_wakeups;
+	u64 wait_fast;
 
 	/**
-	 * @slow_wakeups: total number of times that a receiving thread
-	 * had to be put to sleep (no message arrived while it was polling).
+	 * @wait_block: total number of times that a thread blocked at
+	 * least once while waiting for an incoming message.
 	 */
-	u64 slow_wakeups;
+	u64 wait_block;
 
 	/**
 	 * @handoffs_thread_waiting: total number of times that an RPC
diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c
index f60af88b..31995db5 100644
--- a/test/unit_homa_incoming.c
+++ b/test/unit_homa_incoming.c
@@ -2214,7 +2214,7 @@ TEST_F(homa_incoming, homa_wait_private__rpc_not_private)
 	ASSERT_NE(NULL, crpc);
 	EXPECT_EQ(EINVAL, -homa_wait_private(crpc, 0));
 }
-TEST_F(homa_incoming, homa_wait_private__basics)
+TEST_F(homa_incoming, homa_wait_private__available_immediately)
 {
 	struct homa_rpc *crpc = unit_client_rpc(&self->hsk,
 			UNIT_RCVD_MSG, self->client_ip, self->server_ip,
@@ -2225,7 +2225,7 @@
 	atomic_or(RPC_PRIVATE, &crpc->flags);
 	EXPECT_EQ(0, homa_wait_private(crpc, 0));
 	ASSERT_EQ(RPC_PRIVATE, atomic_read(&crpc->flags));
-	IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->fast_wakeups));
+	IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_none));
 }
 TEST_F(homa_incoming, homa_wait_private__rpc_has_error)
 {
@@ -2262,6 +2262,7 @@
 	atomic_or(RPC_PRIVATE, &crpc->flags);
 	EXPECT_EQ(EAGAIN, -homa_wait_private(crpc, 1));
+	IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_fast));
 }
 TEST_F(homa_incoming, homa_wait_private__signal_notify_race)
 {
@@ -2278,7 +2279,7 @@
 	mock_prepare_to_wait_errors = 1;
 	EXPECT_EQ(ENOENT, -homa_wait_private(crpc, 0));
-	IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->slow_wakeups));
+	IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_block));
 	EXPECT_EQ(0, mock_prepare_to_wait_errors);
 }
 
@@ -2307,7 +2308,7 @@
 	ASSERT_FALSE(IS_ERR(rpc));
 	EXPECT_EQ(crpc, rpc);
 	EXPECT_EQ(0, crpc->msgin.packets.qlen);
-	IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->fast_wakeups));
+	IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_none));
 	homa_rpc_unlock(rpc);
 }
 TEST_F(homa_incoming, homa_wait_shared__multiple_rpcs_already_ready)
 {
@@ -2337,6 +2338,7 @@
 	rpc = homa_wait_shared(&self->hsk, 1);
 	EXPECT_TRUE(IS_ERR(rpc));
 	EXPECT_EQ(EAGAIN, -PTR_ERR(rpc));
+	IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_fast));
 }
 TEST_F(homa_incoming, homa_wait_shared__signal_race_with_handoff)
 {
@@ -2355,7 +2357,7 @@
 	rpc = homa_wait_shared(&self->hsk, 0);
 	EXPECT_EQ(crpc, rpc);
 	EXPECT_EQ(ENOENT, -rpc->error);
-	IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->slow_wakeups));
+	IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_block));
 	homa_rpc_unlock(rpc);
 }
 TEST_F(homa_incoming, homa_wait_shared__socket_shutdown_while_blocked)
 {
@@ -2371,6 +2373,7 @@
EXPECT_EQ(ESHUTDOWN, -PTR_ERR(rpc)); EXPECT_EQ(1, self->hsk.shutdown); self->hsk.shutdown = 0; + IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_block)); } TEST_F(homa_incoming, homa_wait_shared__copy_to_user_fails) { diff --git a/test/unit_homa_interest.c b/test/unit_homa_interest.c index 7b74276a..9c28e86a 100644 --- a/test/unit_homa_interest.c +++ b/test/unit_homa_interest.c @@ -145,7 +145,6 @@ TEST_F(homa_interest, homa_interest_wait__already_ready) atomic_set(&interest.ready, 1); EXPECT_EQ(0, homa_interest_wait(&interest, 0)); EXPECT_EQ(0, interest.blocked); - IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->fast_wakeups)); homa_interest_unlink_shared(&interest); } @@ -213,8 +212,6 @@ TEST_F(homa_interest, homa_interest_wait__poll_then_block) #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(3000, homa_metrics_per_cpu()->poll_ns); EXPECT_EQ(0, homa_metrics_per_cpu()->blocked_ns); - EXPECT_EQ(0, homa_metrics_per_cpu()->fast_wakeups); - EXPECT_EQ(1, homa_metrics_per_cpu()->slow_wakeups); EXPECT_EQ(1, interest.blocked); #endif /* See strip.py */ homa_interest_unlink_shared(&interest); diff --git a/util/metrics.py b/util/metrics.py index 61121a74..ee4e0925 100755 --- a/util/metrics.py +++ b/util/metrics.py @@ -370,8 +370,9 @@ def scale_number(number): if total_messages > 0.0: print("\nReceiving Messages:") print("-------------------") - poll_percent = 100.0*float(deltas["fast_wakeups"])/total_messages - sleep_percent = 100.0*float(deltas["slow_wakeups"])/total_messages + avail_percent = 100.0*float(deltas["wait_none"])/total_messages + poll_percent = 100.0*float(deltas["wait_fast"])/total_messages + sleep_percent = 100.0*float(deltas["wait_block"])/total_messages if deltas["gen3_alt_handoffs"]: gen3_alt_percent = (100.0*deltas["gen3_alt_handoffs"] /deltas["gen3_handoffs"]) @@ -392,8 +393,7 @@ def scale_number(number): /deltas["packets_rcvd_GRANT"]) else: grant_bypass_percent = 0.0 - print("Available immediately: %5.1f%%" % (100.0 - poll_percent - - sleep_percent)) + print("Available immediately: %5.1f%%" % (avail_percent)) print("Arrived while polling: %5.1f%%" % (poll_percent)) print("Blocked at least once: %5.1f%%" % (sleep_percent)) print("Alternate GRO handoffs: %5.1f%%" % (gen3_alt_percent)) @@ -440,14 +440,6 @@ def scale_number(number): print("\nCanaries (possible problem indicators):") print("---------------------------------------") - for symbol in ["requests_queued", "responses_queued"]: - delta = deltas[symbol] - if delta != 0: - received = deltas[symbol[:-7] + "_received"] - if (received != 0): - percent = "(%.1f%%)" % (100.0*float(delta)/float(received)) - percent = percent.ljust(12) - print("%-28s %15d %s %s" % (symbol, delta, percent, docs[symbol])) for symbol in ["resent_packets", "resent_packets_used", "packet_discards", "resent_discards", "unknown_rpcs", "peer_kmalloc_errors", "peer_route_errors", "control_xmit_errors", From 03b767e1f1f273713410405408b889f623a8767d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 2 Apr 2025 16:13:57 -0700 Subject: [PATCH 244/625] Use _exit instead of exit in cp_node.cc No need for exit handlers to run and they can cause seg faults due to ordering issues. 
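For reference, the difference is easy to demonstrate in isolation. The
sketch below is not taken from cp_node.cc; it just shows that exit()
runs handlers registered with atexit() (and, in C++, static
destructors), while _exit() terminates the process immediately, so none
of that shutdown code can race with threads that are still running:

    #include <cstdio>
    #include <cstdlib>
    #include <unistd.h>

    static void handler()
    {
            /* With exit(), this runs during shutdown and can touch
             * state that other threads are still using; with _exit()
             * it never runs at all.
             */
            std::printf("atexit handler ran\n");
    }

    int main(int argc, char **argv)
    {
            std::atexit(handler);
            if (argc > 1)
                    std::exit(1);   /* prints "atexit handler ran" */
            _exit(1);               /* exits silently; handler skipped */
    }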
--- util/cp_node.cc | 82 ++++++++++++++++++++++++------------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/util/cp_node.cc b/util/cp_node.cc index 5506a306..4e4fbb55 100644 --- a/util/cp_node.cc +++ b/util/cp_node.cc @@ -794,7 +794,7 @@ void tcp_connection::set_epoll_events(int epoll_fd, uint32_t events) : EPOLL_CTL_MOD, fd, &ev) < 0) { log(NORMAL, "FATAL: couldn't add/modify epoll event: %s\n", strerror(errno)); - exit(1); + _exit(1); } epoll_events = events; } @@ -862,7 +862,7 @@ bool tcp_connection::xmit() "to %s: %s (port %d)\n", print_address(&peer), strerror(errno), port); - exit(1); + _exit(1); } } if (bytes_sent < header->length) { @@ -987,7 +987,7 @@ homa_server::homa_server(int port, int id, int inet_family, int num_threads, log(NORMAL, "FATAL: homa_server couldn't open Homa " "socket: %s\n", strerror(errno)); - exit(1); + _exit(1); } memset(&addr, 0, sizeof(addr)); @@ -1002,7 +1002,7 @@ homa_server::homa_server(int port, int id, int inet_family, int num_threads, log(NORMAL, "FATAL: homa_server couldn't bind socket " "to Homa port %d: %s\n", port, strerror(errno)); - exit(1); + _exit(1); } log(NORMAL, "Successfully bound to Homa port %d\n", port); @@ -1012,7 +1012,7 @@ homa_server::homa_server(int port, int id, int inet_family, int num_threads, if (buf_region == MAP_FAILED) { printf("Couldn't mmap buffer region for server on port %d: %s\n", port, strerror(errno)); - exit(1); + _exit(1); } arg.start = (uintptr_t)buf_region; arg.length = buf_size; @@ -1021,7 +1021,7 @@ homa_server::homa_server(int port, int id, int inet_family, int num_threads, if (status < 0) { printf("FATAL: error in setsockopt(SO_HOMA_RCVBUF): %s\n", strerror(errno)); - exit(1); + _exit(1); } for (int i = 0; i < num_threads; i++) { @@ -1117,7 +1117,7 @@ void homa_server::server(int thread_id, server_metrics *metrics) log(NORMAL, "FATAL: homa_reply failed for server " "port %d: %s\n", port, strerror(errno)); - exit(1); + _exit(1); } metrics->requests++; metrics->bytes_in += length; @@ -1219,7 +1219,7 @@ tcp_server::tcp_server(int port, int id, int num_threads, if (listen_fd == -1) { log(NORMAL, "FATAL: couldn't open server socket: %s\n", strerror(errno)); - exit(1); + _exit(1); } int option_value = 1; if (setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &option_value, @@ -1227,13 +1227,13 @@ tcp_server::tcp_server(int port, int id, int num_threads, log(NORMAL, "FATAL: couldn't set SO_REUSEADDR on listen " "socket: %s", strerror(errno)); - exit(1); + _exit(1); } if (fcntl(listen_fd, F_SETFL, O_NONBLOCK) != 0) { log(NORMAL, "FATAL: couldn't set O_NONBLOCK on listen " "socket: %s", strerror(errno)); - exit(1); + _exit(1); } sockaddr_in_union addr; if (inet_family == AF_INET) { @@ -1248,12 +1248,12 @@ tcp_server::tcp_server(int port, int id, int num_threads, if (bind(listen_fd, &addr.sa, sizeof(addr)) == -1) { log(NORMAL, "FATAL: couldn't bind to port %d: %s\n", port, strerror(errno)); - exit(1); + _exit(1); } if (listen(listen_fd, 1000) == -1) { log(NORMAL, "FATAL: couldn't listen on socket: %s", strerror(errno)); - exit(1); + _exit(1); } epoll_fd = epoll_create(10); @@ -1261,7 +1261,7 @@ tcp_server::tcp_server(int port, int id, int num_threads, log(NORMAL, "FATAL: couldn't create epoll instance for " "TCP server: %s\n", strerror(errno)); - exit(1); + _exit(1); } struct epoll_event ev; ev.events = EPOLLIN; @@ -1269,7 +1269,7 @@ tcp_server::tcp_server(int port, int id, int num_threads, if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, listen_fd, &ev) < 0) { log(NORMAL, "FATAL: couldn't add listen 
socket to epoll: %s\n", strerror(errno)); - exit(1); + _exit(1); } metrics = new server_metrics(experiment); @@ -1296,7 +1296,7 @@ tcp_server::~tcp_server() if (pipe2(fds, 0) < 0) { log(NORMAL, "FATAL: couldn't create pipe to shutdown TCP " "server: %s\n", strerror(errno)); - exit(1); + _exit(1); } struct epoll_event ev; ev.events = EPOLLIN; @@ -1305,7 +1305,7 @@ tcp_server::~tcp_server() if (write(fds[1], "xxxx", 4) < 0) { log(NORMAL, "FATAL: couldn't write to TCP shutdown pipe: %s\n", strerror(errno)); - exit(1); + _exit(1); } for (size_t i = 0; i < threads.size(); i++) @@ -1367,7 +1367,7 @@ void tcp_server::server(int thread_id) continue; log(NORMAL, "FATAL: epoll_wait failed: %s\n", strerror(errno)); - exit(1); + _exit(1); } tt("epoll_wait returned %d events in server pid %d", num_events, pid); @@ -1411,7 +1411,7 @@ void tcp_server::accept(int epoll_fd) return; log(NORMAL, "FATAL: couldn't accept incoming TCP connection: " "%s\n", strerror(errno)); - exit(1); + _exit(1); } /* Make sure the connection appears to be coming from someone @@ -1447,7 +1447,7 @@ void tcp_server::accept(int epoll_fd) if (fd >= MAX_FDS) { log(NORMAL, "FATAL: TCP socket fd %d is greater than MAX_FDS\n", fd); - exit(1); + _exit(1); } spin_lock lock_guard(&fd_locks[fd]); tcp_connection *connection = new tcp_connection(fd, fd, port, @@ -1722,7 +1722,7 @@ client::client(int id, std::string& experiment) log(NORMAL, "FATAL: couldn't look up address " "for %s: %s\n", host, gai_strerror(status)); - exit(1); + _exit(1); } dest = reinterpret_cast (matching_addresses->ai_addr); @@ -1814,7 +1814,7 @@ int client::get_rinfo() "total_responses %ld, last_rinfo %d)\n", rinfos.size(), total_requests, total_responses.load(), last_rinfo); - exit(1); + _exit(1); } } } @@ -1955,7 +1955,7 @@ homa_client::homa_client(int id, std::string& experiment) fd = socket(inet_family, SOCK_DGRAM, IPPROTO_HOMA); if (fd < 0) { log(NORMAL, "Couldn't open Homa socket: %s\n", strerror(errno)); - exit(1); + _exit(1); } buf_region = (char *) mmap(NULL, buf_size, PROT_READ|PROT_WRITE, @@ -1963,7 +1963,7 @@ homa_client::homa_client(int id, std::string& experiment) if (buf_region == MAP_FAILED) { printf("Couldn't mmap buffer region for homa_client id %d: %s\n", id, strerror(errno)); - exit(1); + _exit(1); } arg.start = (uintptr_t)buf_region; arg.length = buf_size; @@ -1972,7 +1972,7 @@ homa_client::homa_client(int id, std::string& experiment) if (status < 0) { printf("FATAL: error in setsockopt(SO_HOMA_RCVBUF): %s\n", strerror(errno)); - exit(1); + _exit(1); } if (unloaded) { @@ -2062,13 +2062,13 @@ bool homa_client::wait_response(homa::receiver *receiver, uint64_t rpc_id) strerror(errno), rpc_id, print_address((union sockaddr_in_union *) receiver->src_addr())); - exit(1); + _exit(1); } header = receiver->get(0); if (header == nullptr) { log(NORMAL, "FATAL: Homa response message contained %lu bytes; " "need at least %lu", length, sizeof(*header)); - exit(1); + _exit(1); } uint64_t end_time = rdtsc(); tt("Received response, cid 0x%08x, id %x, %d bytes", @@ -2148,7 +2148,7 @@ void homa_client::sender() log(NORMAL, "FATAL: error in homa_send: %s (request " "length %d)\n", strerror(errno), header->length); - exit(1); + _exit(1); } requests[server]++; total_requests++; @@ -2212,7 +2212,7 @@ uint64_t homa_client::measure_rtt(int server, int length, char *buffer, log(NORMAL, "FATAL: error in homa_send: %s (request " "length %d)\n", strerror(errno), header->length); - exit(1); + _exit(1); } do { status = receiver->receive(0, rpc_id); @@ -2223,7 +2223,7 @@ uint64_t 
homa_client::measure_rtt(int server, int length, char *buffer, strerror(errno), rpc_id, print_address((union sockaddr_in_union *) receiver->src_addr())); - exit(1); + _exit(1); } return rdtsc() - start; } @@ -2371,7 +2371,7 @@ tcp_client::tcp_client(int id, std::string& experiment) if (epoll_fd < 0) { log(NORMAL, "FATAL: tcp_client couldn't create epoll " "instance: %s\n", strerror(errno)); - exit(1); + _exit(1); } for (uint32_t i = 0; i < server_addrs.size(); i++) { @@ -2380,7 +2380,7 @@ tcp_client::tcp_client(int id, std::string& experiment) log(NORMAL, "FATAL: couldn't open TCP client " "socket: %s\n", strerror(errno)); - exit(1); + _exit(1); } if (connect(fd, reinterpret_cast( &server_addrs[i]), @@ -2389,7 +2389,7 @@ tcp_client::tcp_client(int id, std::string& experiment) "to %s: %s\n", print_address(&server_addrs[i]), strerror(errno)); - exit(1); + _exit(1); } int flag = 1; setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &flag, sizeof(flag)); @@ -2398,7 +2398,7 @@ tcp_client::tcp_client(int id, std::string& experiment) "to server %s: %s", print_address(&server_addrs[i]), strerror(errno)); - exit(1); + _exit(1); } sockaddr_in_union addr; socklen_t length = sizeof(addr); @@ -2406,7 +2406,7 @@ tcp_client::tcp_client(int id, std::string& experiment) &length)) { log(NORMAL, "FATAL: getsockname failed for TCP client: " "%s\n", strerror(errno)); - exit(1); + _exit(1); } connections.emplace_back(new tcp_connection(fd, i, ntohs(addr.in4.sin_port), server_addrs[i])); @@ -2442,7 +2442,7 @@ tcp_client::~tcp_client() if (pipe2(fds, 0) < 0) { log(NORMAL, "FATAL: couldn't create pipe to shutdown TCP " "server: %s\n", strerror(errno)); - exit(1); + _exit(1); } struct epoll_event ev; ev.events = EPOLLIN; @@ -2451,7 +2451,7 @@ tcp_client::~tcp_client() if (write(fds[1], "xxxx", 4) < 0) { log(NORMAL, "FATAL: couldn't write to TCP shutdown " "pipe: %s\n", strerror(errno)); - exit(1); + _exit(1); } if (sending_thread) @@ -2595,7 +2595,7 @@ void tcp_client::receiver(int receiver_id) log(NORMAL, "FATAL: epoll_wait failed in tcp_client: " "%s\n", strerror(errno)); - exit(1); + _exit(1); } tt("epoll_wait returned %d events in client pid %d", num_events, pid); @@ -2629,7 +2629,7 @@ void tcp_client::read(tcp_connection *connection, int pid) if (error) { log(NORMAL, "FATAL: %s (client)\n", connection->error_message); - exit(1); + _exit(1); } } @@ -3508,13 +3508,13 @@ int main(int argc, char** argv) if (getrlimit(RLIMIT_NOFILE, &limits) != 0) { log(NORMAL, "FATAL: couldn't read file descriptor limits: " "%s\n", strerror(errno)); - exit(1); + _exit(1); } limits.rlim_cur = limits.rlim_max; if (setrlimit(RLIMIT_NOFILE, &limits) != 0) { log(NORMAL, "FATAL: couldn't increase file descriptor limit: " "%s\n", strerror(errno)); - exit(1); + _exit(1); } struct sigaction action; action.sa_sigaction = error_handler; @@ -3533,7 +3533,7 @@ int main(int argc, char** argv) for (int i = 1; i < argc; i++) words.emplace_back(argv[i]); if (!exec_words(words)) - exit(1); + _exit(1); /* Instead of going interactive, just print stats. * every second. 
From 632efffbc21da7a462e3566c6454c728ce94748a Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Wed, 2 Apr 2025 16:17:59 -0700
Subject: [PATCH 245/625] Restore --debug option for cperf.py

---
 util/cperf.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/util/cperf.py b/util/cperf.py
index 325e3729..d92e970b 100644
--- a/util/cperf.py
+++ b/util/cperf.py
@@ -654,6 +654,9 @@ def start_servers(exp, ids, options):
             % (options.tcp_server_ports, options.tcp_port_threads,
             options.protocol, exp, options.ipv6), ids)
     server_nodes = ids
+    if (options.debug):
+        print("Pausing for debug setup; type <Enter> to continue: ", end="")
+        input()
 
 def run_experiment(name, clients, options):
     """
From b66df5a82883931554492b736cd6b999d5f336fe Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 4 Apr 2025 16:34:19 -0700
Subject: [PATCH 246/625] Comment out pr_notice statements in homa_timer

---
 homa_timer.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/homa_timer.c b/homa_timer.c
index 938ea18b..39f667d2 100644
--- a/homa_timer.c
+++ b/homa_timer.c
@@ -143,17 +143,17 @@ void homa_check_rpc(struct homa_rpc *rpc)
 			   rpc->id, tt_addr(rpc->peer->addr),
 			   rpc->dport, rpc->msgin.recv_end);
 		/* Should be if (homa->verbose) */
-		pr_notice("Homa client RESEND to %s:%d for id %llu, offset %d\n",
-			  homa_print_ipv6_addr(&rpc->peer->addr),
-			  rpc->dport, rpc->id, rpc->msgin.recv_end);
+		// pr_notice("Homa client RESEND to %s:%d for id %llu, offset %d\n",
+		//	     homa_print_ipv6_addr(&rpc->peer->addr),
+		//	     rpc->dport, rpc->id, rpc->msgin.recv_end);
 	} else {
 		tt_record4("Sent RESEND for server RPC id %llu, client 0x%x:%d offset %d",
 			   rpc->id, tt_addr(rpc->peer->addr),
 			   rpc->dport, rpc->msgin.recv_end);
 		/* Should be if (homa->verbose) */
-		pr_notice("Homa server RESEND to %s:%d for id %llu, offset %d\n",
-			  homa_print_ipv6_addr(&rpc->peer->addr),
-			  rpc->dport, rpc->id, rpc->msgin.recv_end);
+		// pr_notice("Homa server RESEND to %s:%d for id %llu, offset %d\n",
+		//	     homa_print_ipv6_addr(&rpc->peer->addr),
+		//	     rpc->dport, rpc->id, rpc->msgin.recv_end);
 	}
 #endif /* See strip.py */
 }
From bef25f1bf9bbe30996c3fd37cc2ee81c3bb896c4 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 4 Apr 2025 16:36:00 -0700
Subject: [PATCH 247/625] Refactor pacer into separate file homa_pacer.c

---
 Makefile                  |   2 +
 homa_impl.h               | 181 +---------
 homa_offload.c            |   3 +-
 homa_outgoing.c           | 347 +------------------
 homa_pacer.c              | 493 ++++++++++++++++++++++++++++
 homa_pacer.h              | 220 +++++++++++++
 homa_plumbing.c           |  34 +-
 homa_rpc.c                |   3 +-
 homa_rpc.h                |   5 +-
 homa_utils.c              |  66 +---
 test/Makefile             |   4 +-
 test/mock.c               |   9 +-
 test/mock.h               |   1 +
 test/unit_homa_grant.c    |   5 +-
 test/unit_homa_incoming.c |   5 +-
 test/unit_homa_interest.c |   2 +-
 test/unit_homa_metrics.c  |   2 +-
 test/unit_homa_offload.c  |   2 +-
 test/unit_homa_outgoing.c | 454 +------------------------
 test/unit_homa_pacer.c    | 660 ++++++++++++++++++++++++++++++++++++++
 test/unit_homa_peer.c     |   2 +-
 test/unit_homa_plumbing.c |   2 +-
 test/unit_homa_pool.c     |   2 +-
 test/unit_homa_rpc.c      |   9 +-
 test/unit_homa_skb.c      |   2 +-
 test/unit_homa_sock.c     |   8 +-
 test/unit_homa_timer.c    |   2 +-
 test/unit_homa_utils.c    |  22 +-
 test/utils.c              |   3 +-
 29 files changed, 1464 insertions(+), 1086 deletions(-)
 create mode 100644 homa_pacer.c
 create mode 100644 homa_pacer.h
 create mode 100644 test/unit_homa_pacer.c

diff --git a/Makefile b/Makefile
index e25d9010..9e8099e7 100644
--- a/Makefile
+++ b/Makefile
@@ -4,6 +4,7 @@ HOMA_OBJS := homa_devel.o \
 	    homa_incoming.o \
 	    homa_interest.o \
 	    homa_outgoing.o \
+	    homa_pacer.o \
homa_peer.o \ homa_pool.o \ homa_plumbing.o \ @@ -55,6 +56,7 @@ LINUX_SRC_DIR ?= ../net-next HOMA_TARGET ?= $(LINUX_SRC_DIR)/net/homa CP_HDRS := homa_impl.h \ homa_interest.h \ + homa_pacer.h \ homa_peer.h \ homa_pool.h \ homa_rpc.h \ diff --git a/homa_impl.h b/homa_impl.h index 37691c09..ef8a7ace 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -120,16 +120,6 @@ struct homa { */ atomic64_t next_outgoing_id; - /** - * @link_idle_time: The time, measured by sched_clock, at which we - * estimate that all of the packets we have passed to Linux for - * transmission will have been transmitted. May be in the past. - * This estimate assumes that only Homa is transmitting data, so - * it could be a severe underestimate if there is competing traffic - * from, say, TCP. Access only with atomic ops. - */ - atomic64_t link_idle_time ____cacheline_aligned_in_smp; - #ifndef __STRIP__ /* See strip.py */ /** * @grantable_lock: Used to synchronize access to grant-related @@ -248,61 +238,9 @@ struct homa { #endif /* See strip.py */ /** - * @pacer_mutex: Ensures that only one instance of homa_pacer_xmit - * runs at a time. Only used in "try" mode: never block on this. - */ - spinlock_t pacer_mutex ____cacheline_aligned_in_smp; - - /** - * @pacer_fifo_fraction: The fraction of time (in thousandths) when - * the pacer should transmit next from the oldest message, rather - * than the highest-priority message. Set externally via sysctl. - */ - int pacer_fifo_fraction; - - /** - * @pacer_fifo_count: When this becomes <= zero, it's time for the - * pacer to allow the oldest RPC to transmit. + * @pacer: Information related to the pacer; managed by homa_pacer.c. */ - int pacer_fifo_count; - - /** - * @pacer_wake_time: time (in sched_clock units) when the pacer last - * woke up (if the pacer is running) or 0 if the pacer is sleeping. - */ - u64 pacer_wake_time; - - /** - * @throttle_lock: Used to synchronize access to @throttled_rpcs. To - * insert or remove an RPC from throttled_rpcs, must first acquire - * the RPC's socket lock, then this lock. - */ - spinlock_t throttle_lock; - - /** - * @throttled_rpcs: Contains all homa_rpcs that have bytes ready - * for transmission, but which couldn't be sent without exceeding - * the queue limits for transmission. - */ - struct list_head throttled_rpcs; - - /** - * @throttle_add: The time (in sched_clock() units) when the most - * recent RPC was added to @throttled_rpcs. - */ - u64 throttle_add; - - /** - * @throttle_min_bytes: If a packet has fewer bytes than this, then it - * bypasses the throttle mechanism and is transmitted immediately. - * We have this limit because for very small packets we can't keep - * up with the NIC (we're limited by CPU overheads); there's no - * need for throttling and going through the throttle mechanism - * adds overhead, which slows things down. At least, that's the - * hypothesis (needs to be verified experimentally!). Set externally - * via sysctl. - */ - int throttle_min_bytes; + struct homa_pacer *pacer; /** * @prev_default_port: The most recent port number assigned from @@ -394,15 +332,7 @@ struct homa { * same as @window_param. */ int window_param; -#endif /* See strip.py */ - /** - * @link_mbps: The raw bandwidth of the network uplink, in - * units of 1e06 bits per second. Set externally via sysctl. 
- */ - int link_mbps; - -#ifndef __STRIP__ /* See strip.py */ /** * @poll_usecs: Amount of time (in microseconds) that a thread * will spend busy-waiting for an incoming messages before @@ -543,40 +473,6 @@ struct homa { */ int max_dead_buffs; - /** - * @pacer_exit: true means that the pacer thread should exit as - * soon as possible. - */ - bool pacer_exit; - - /** - * @pacer_wait_queue: Used to block the pacer thread when there - * are no throttled RPCs. - */ - struct wait_queue_head pacer_wait_queue; - - /** - * @max_nic_queue_ns: Limits the NIC queue length: we won't queue - * up a packet for transmission if link_idle_time is this many - * nanoseconds in the future (or more). Set externally via sysctl. - */ - int max_nic_queue_ns; - - /** - * @pacer_kthread: Kernel thread that transmits packets from - * throttled_rpcs in a way that limits queue buildup in the - * NIC. - */ - struct task_struct *pacer_kthread; - - /** - * @ns_per_mbyte: the number of ns that it takes to transmit - * 10**6 bytes on our uplink. This is actually a slight overestimate - * of the value, to ensure that we don't underestimate NIC queue - * length and queue too many packets. - */ - u32 ns_per_mbyte; - #ifndef __STRIP__ /* See strip.py */ /** * @verbose: Nonzero enables additional logging. Set externally via @@ -818,40 +714,6 @@ static inline void homa_set_doff(struct homa_data_hdr *h, int size) h->common.doff = size << 2; } -#ifndef __STRIP__ /* See strip.py */ -/** - * homa_throttle_lock() - Acquire the throttle lock. If the lock - * isn't immediately available, record stats on the waiting time. - * @homa: Overall data about the Homa protocol implementation. - */ -static inline void homa_throttle_lock(struct homa *homa) - __acquires(&homa->throttle_lock) -{ - if (!spin_trylock_bh(&homa->throttle_lock)) - homa_throttle_lock_slow(homa); -} -#else /* See strip.py */ -/** - * homa_throttle_lock() - Acquire the throttle lock. - * @homa: Overall data about the Homa protocol implementation. - */ -static inline void homa_throttle_lock(struct homa *homa) - __acquires(&homa->throttle_lock) -{ - spin_lock_bh(&homa->throttle_lock); -} -#endif /* See strip.py */ - -/** - * homa_throttle_unlock() - Release the throttle lock. - * @homa: Overall data about the Homa protocol implementation. - */ -static inline void homa_throttle_unlock(struct homa *homa) - __releases(&homa->throttle_lock) -{ - spin_unlock_bh(&homa->throttle_lock); -} - /** skb_is_ipv6() - Return true if the packet is encapsulated with IPv6, * false otherwise (presumably it's IPv4). 
*/ @@ -959,12 +821,9 @@ void homa_abort_sock_rpcs(struct homa_sock *hsk, int error); void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, struct homa_rpc *rpc); void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb); -void homa_add_to_throttled(struct homa_rpc *rpc); int homa_backlog_rcv(struct sock *sk, struct sk_buff *skb); int homa_bind(struct socket *sk, struct sockaddr *addr, int addr_len); -int homa_check_nic_queue(struct homa *homa, struct sk_buff *skb, - bool force); void homa_close(struct sock *sock, long timeout); int homa_copy_to_user(struct homa_rpc *rpc); void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc); @@ -984,7 +843,7 @@ int homa_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int homa_hash(struct sock *sk); enum hrtimer_restart homa_hrtimer(struct hrtimer *timer); -int homa_init(struct homa *homa); +int homa_init(struct homa *homa, struct net *net); int homa_ioctl(struct sock *sk, int cmd, int *karg); int homa_load(void); int homa_message_out_fill(struct homa_rpc *rpc, @@ -997,14 +856,10 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, int length, int max_seg_data); int homa_net_init(struct net *net); void homa_net_exit(struct net *net); -int homa_pacer_main(void *transport); -void homa_pacer_stop(struct homa *homa); -void homa_pacer_xmit(struct homa *homa); __poll_t homa_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); -void homa_remove_from_throttled(struct homa_rpc *rpc); void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, struct homa_sock *hsk); void homa_rpc_abort(struct homa_rpc *crpc, int error); @@ -1046,10 +901,8 @@ int homa_dointvec(const struct ctl_table *table, int write, #endif void homa_incoming_sysctl_changed(struct homa *homa); int homa_ioc_abort(struct sock *sk, int *karg); -void homa_log_throttled(struct homa *homa); int homa_message_in_init(struct homa_rpc *rpc, int length, int unsched); -void homa_outgoing_sysctl_changed(struct homa *homa); void homa_prios_changed(struct homa *homa); void homa_resend_data(struct homa_rpc *rpc, int start, int end, int priority); @@ -1074,33 +927,6 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end); void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc); #endif /* See strip.py */ -/** - * homa_check_pacer() - This method is invoked at various places in Homa to - * see if the pacer needs to transmit more packets and, if so, transmit - * them. It's needed because the pacer thread may get descheduled by - * Linux, result in output stalls. - * @homa: Overall data about the Homa protocol implementation. No locks - * should be held when this function is invoked. - * @softirq: Nonzero means this code is running at softirq (bh) level; - * zero means it's running in process context. - */ -static inline void homa_check_pacer(struct homa *homa, int softirq) -{ - if (list_empty(&homa->throttled_rpcs)) - return; - - /* The ">> 1" in the line below gives homa_pacer_main the first chance - * to queue new packets; if the NIC queue becomes more than half - * empty, then we will help out here. 
- */ - if ((sched_clock() + (homa->max_nic_queue_ns >> 1)) < - atomic64_read(&homa->link_idle_time)) - return; - tt_record("homa_check_pacer calling homa_pacer_xmit"); - homa_pacer_xmit(homa); - INC_METRIC(pacer_needed_help, 1); -} - /** * homa_from_net() - Return the struct homa associated with a particular * struct net. @@ -1134,5 +960,4 @@ static inline struct homa *homa_from_skb(struct sk_buff *skb) return (struct homa *)net_generic(dev_net(skb->dev), homa_net_id); } -extern struct completion homa_pacer_kthread_done; #endif /* _HOMA_IMPL_H */ diff --git a/homa_offload.c b/homa_offload.c index 5355512a..1ee41a9b 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -6,6 +6,7 @@ #include "homa_impl.h" #include "homa_offload.h" +#include "homa_pacer.h" DEFINE_PER_CPU(struct homa_offload_core, homa_offload_core); @@ -449,7 +450,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, homa_set_softirq_cpu(skb, smp_processor_id()); done: - homa_check_pacer(homa, 1); + homa_pacer_check(homa->pacer); offload_core->last_gro = sched_clock(); return result; diff --git a/homa_outgoing.c b/homa_outgoing.c index 8348675d..c73facb2 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -5,6 +5,7 @@ */ #include "homa_impl.h" +#include "homa_pacer.h" #include "homa_peer.h" #include "homa_rpc.h" #ifndef __STRIP__ /* See strip.py */ @@ -377,7 +378,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) xmit) { #endif /* See strip.py */ tt_record1("waking up pacer for id %d", rpc->id); - homa_add_to_throttled(rpc); + homa_pacer_manage_rpc(rpc); } } tt_record2("finished copy from user space for id %d, length %d", @@ -612,11 +613,11 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) #endif /* See strip.py */ if ((rpc->msgout.length - rpc->msgout.next_xmit_offset) - >= homa->throttle_min_bytes) { - if (!homa_check_nic_queue(homa, skb, force)) { + >= homa->pacer->throttle_min_bytes) { + if (!homa_pacer_check_nic_q(homa->pacer, skb, force)) { tt_record1("homa_xmit_data adding id %u to throttle queue", rpc->id); - homa_add_to_throttled(rpc); + homa_pacer_manage_rpc(rpc); break; } } @@ -853,7 +854,8 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end) new_homa_info->offset = offset; tt_record3("retransmitting offset %d, length %d, id %d", offset, seg_length, rpc->id); - homa_check_nic_queue(rpc->hsk->homa, new_skb, true); + homa_pacer_check_nic_q(rpc->hsk->homa->pacer, + new_skb, true); #ifndef __STRIP__ /* See strip.py */ __homa_xmit_data(new_skb, rpc, priority); #else /* See strip.py */ @@ -866,338 +868,3 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end) resend_done: return; } - -#ifndef __STRIP__ /* See strip.py */ -/** - * homa_outgoing_sysctl_changed() - Invoked whenever a sysctl value is changed; - * any output-related parameters that depend on sysctl-settable values. - * @homa: Overall data about the Homa protocol implementation. - */ -void homa_outgoing_sysctl_changed(struct homa *homa) -{ - u64 tmp; - - tmp = 8 * 1000ULL * 1000ULL * 1000ULL; - - /* Underestimate link bandwidth (overestimate time) by 1%. */ - tmp = tmp * 101 / 100; - do_div(tmp, homa->link_mbps); - homa->ns_per_mbyte = tmp; -} -#endif /* See strip.py */ - -/** - * homa_check_nic_queue() - This function is invoked before passing a packet - * to the NIC for transmission. It serves two purposes. First, it maintains - * an estimate of the NIC queue length. 
Second, it indicates to the caller - * whether the NIC queue is so full that no new packets should be queued - * (Homa's SRPT depends on keeping the NIC queue short). - * @homa: Overall data about the Homa protocol implementation. - * @skb: Packet that is about to be transmitted. - * @force: True means this packet is going to be transmitted - * regardless of the queue length. - * Return: Nonzero is returned if either the NIC queue length is - * acceptably short or @force was specified. 0 means that the - * NIC queue is at capacity or beyond, so the caller should delay - * the transmission of @skb. If nonzero is returned, then the - * queue estimate is updated to reflect the transmission of @skb. - */ -int homa_check_nic_queue(struct homa *homa, struct sk_buff *skb, bool force) -{ - u64 idle, new_idle, clock, ns_for_packet; - int bytes; - - bytes = homa_get_skb_info(skb)->wire_bytes; - ns_for_packet = homa->ns_per_mbyte; - ns_for_packet *= bytes; - do_div(ns_for_packet, 1000000); - while (1) { - clock = sched_clock(); - idle = atomic64_read(&homa->link_idle_time); - if ((clock + homa->max_nic_queue_ns) < idle && !force && - !(homa->flags & HOMA_FLAG_DONT_THROTTLE)) - return 0; -#ifndef __STRIP__ /* See strip.py */ - if (!list_empty(&homa->throttled_rpcs)) - INC_METRIC(pacer_bytes, bytes); - if (idle < clock) { - if (homa->pacer_wake_time) { - u64 lost = (homa->pacer_wake_time > idle) - ? clock - homa->pacer_wake_time - : clock - idle; - INC_METRIC(pacer_lost_ns, lost); - tt_record1("pacer lost %d cycles", lost); - } - new_idle = clock + ns_for_packet; - } else { - new_idle = idle + ns_for_packet; - } -#else /* See strip.py */ - if (idle < clock) - new_idle = clock + ns_for_packet; - else - new_idle = idle + ns_for_packet; -#endif /* See strip.py */ - - /* This method must be thread-safe. */ - if (atomic64_cmpxchg_relaxed(&homa->link_idle_time, idle, - new_idle) == idle) - break; - } - return 1; -} - -/** - * homa_pacer_main() - Top-level function for the pacer thread. - * @transport: Pointer to struct homa. - * - * Return: Always 0. - */ -int homa_pacer_main(void *transport) -{ - struct homa *homa = (struct homa *)transport; - - while (1) { - if (homa->pacer_exit) - break; - homa_pacer_xmit(homa); - if (!list_empty(&homa->throttled_rpcs)) { - /* NIC queue is full; before calling pacer again, - * give other threads a chance to run (otherwise - * low-level packet processing such as softirq could - * get locked out). - */ - schedule(); - continue; - } - - tt_record("pacer sleeping"); - wait_event(homa->pacer_wait_queue, homa->pacer_exit || - !list_empty(&homa->throttled_rpcs)); - tt_record("pacer woke up"); - } - kthread_complete_and_exit(&homa_pacer_kthread_done, 0); - return 0; -} - -/** - * homa_pacer_xmit() - Transmit packets from the throttled list until - * either (a) the throttled list is empty or (b) the NIC queue has - * reached maximum allowable length. Note: this function may be invoked - * from either process context or softirq (BH) level. This function is - * invoked from multiple places, not just in the pacer thread. The reason - * for this is that (as of 10/2019) Linux's scheduling of the pacer thread - * is unpredictable: the thread may block for long periods of time (e.g., - * because it is assigned to the same CPU as a busy interrupt handler). - * This can result in poor utilization of the network link. So, this method - * gets invoked from other places as well, to increase the likelihood that we - * keep the link busy. 
Those other invocations are not guaranteed to happen, - * so the pacer thread provides a backstop. - * @homa: Overall data about the Homa protocol implementation. - */ -void homa_pacer_xmit(struct homa *homa) -{ - struct homa_rpc *rpc; - s64 queue_ns; - - /* Make sure only one instance of this function executes at a time. */ - if (!spin_trylock_bh(&homa->pacer_mutex)) - return; - - homa->pacer_wake_time = sched_clock(); - while (1) { - queue_ns = atomic64_read(&homa->link_idle_time) - sched_clock(); - if (queue_ns >= homa->max_nic_queue_ns) - break; - if (list_empty(&homa->throttled_rpcs)) - break; - - /* Lock the first throttled RPC. This may not be possible - * because we have to hold throttle_lock while locking - * the RPC; that means we can't wait for the RPC lock because - * of lock ordering constraints (see sync.txt). Thus, if - * the RPC lock isn't available, do nothing. Holding the - * throttle lock while locking the RPC is important because - * it keeps the RPC from being deleted before it can be locked. - */ - homa_throttle_lock(homa); - homa->pacer_fifo_count -= homa->pacer_fifo_fraction; - if (homa->pacer_fifo_count <= 0) { - struct homa_rpc *cur; - u64 oldest = ~0; - - homa->pacer_fifo_count += 1000; - rpc = NULL; - list_for_each_entry(cur, &homa->throttled_rpcs, - throttled_links) { - if (cur->msgout.init_ns < oldest) { - rpc = cur; - oldest = cur->msgout.init_ns; - } - } - } else { - rpc = list_first_entry_or_null(&homa->throttled_rpcs, - struct homa_rpc, - throttled_links); - } - if (!rpc) { - homa_throttle_unlock(homa); - break; - } - if (!homa_rpc_try_lock(rpc)) { - homa_throttle_unlock(homa); - INC_METRIC(pacer_skipped_rpcs, 1); - break; - } - homa_throttle_unlock(homa); - - tt_record4("pacer calling homa_xmit_data for rpc id %llu, port %d, offset %d, bytes_left %d", - rpc->id, rpc->hsk->port, - rpc->msgout.next_xmit_offset, - rpc->msgout.length - rpc->msgout.next_xmit_offset); - homa_xmit_data(rpc, true); - - /* Note: rpc->state could be RPC_DEAD here, but the code - * below should work anyway. - */ -#ifndef __STRIP__ /* See strip.py */ - if (!*rpc->msgout.next_xmit || rpc->msgout.next_xmit_offset >= - rpc->msgout.granted) { -#else /* See strip.py */ - if (!*rpc->msgout.next_xmit) { -#endif /* See strip.py */ - /* No more data can be transmitted from this message - * (right now), so remove it from the throttled list. - */ - homa_throttle_lock(homa); - if (!list_empty(&rpc->throttled_links)) { - tt_record2("pacer removing id %d from throttled list, offset %d", - rpc->id, rpc->msgout.next_xmit_offset); - list_del_init(&rpc->throttled_links); - } - homa_throttle_unlock(homa); - } - homa_rpc_unlock(rpc); - } - INC_METRIC(pacer_ns, sched_clock() - homa->pacer_wake_time); - homa->pacer_wake_time = 0; - spin_unlock_bh(&homa->pacer_mutex); -} - -/** - * homa_pacer_stop() - Will cause the pacer thread to exit (waking it up - * if necessary); doesn't return until after the pacer thread has exited. - * @homa: Overall data about the Homa protocol implementation. - */ -void homa_pacer_stop(struct homa *homa) -{ - homa->pacer_exit = true; - wake_up(&homa->pacer_wait_queue); - kthread_stop(homa->pacer_kthread); - homa->pacer_kthread = NULL; -} - -/** - * homa_add_to_throttled() - Make sure that an RPC is on the throttled list - * and wake up the pacer thread if necessary. - * @rpc: RPC with outbound packets that have been granted but can't be - * sent because of NIC queue restrictions. Must be locked by caller. 
- */ -void homa_add_to_throttled(struct homa_rpc *rpc) - __must_hold(&rpc->bucket->lock) -{ - struct homa *homa = rpc->hsk->homa; - struct homa_rpc *candidate; - int bytes_left; - int checks = 0; - u64 now; - - if (!list_empty(&rpc->throttled_links)) - return; - now = sched_clock(); -#ifndef __STRIP__ /* See strip.py */ - if (!list_empty(&homa->throttled_rpcs)) - INC_METRIC(throttled_ns, now - homa->throttle_add); -#endif /* See strip.py */ - homa->throttle_add = now; - bytes_left = rpc->msgout.length - rpc->msgout.next_xmit_offset; - homa_throttle_lock(homa); - list_for_each_entry(candidate, &homa->throttled_rpcs, - throttled_links) { - int bytes_left_cand; - - checks++; - - /* Watch out: the pacer might have just transmitted the last - * packet from candidate. - */ - bytes_left_cand = candidate->msgout.length - - candidate->msgout.next_xmit_offset; - if (bytes_left_cand > bytes_left) { - list_add_tail(&rpc->throttled_links, - &candidate->throttled_links); - goto done; - } - } - list_add_tail(&rpc->throttled_links, &homa->throttled_rpcs); -done: - homa_throttle_unlock(homa); - wake_up(&homa->pacer_wait_queue); - INC_METRIC(throttle_list_adds, 1); - INC_METRIC(throttle_list_checks, checks); -// tt_record("woke up pacer thread"); -} - -/** - * homa_remove_from_throttled() - Make sure that an RPC is not on the - * throttled list. - * @rpc: RPC of interest. - */ -void homa_remove_from_throttled(struct homa_rpc *rpc) -{ - if (unlikely(!list_empty(&rpc->throttled_links))) { - UNIT_LOG("; ", "removing id %llu from throttled list", rpc->id); - homa_throttle_lock(rpc->hsk->homa); - list_del(&rpc->throttled_links); -#ifndef __STRIP__ /* See strip.py */ - if (list_empty(&rpc->hsk->homa->throttled_rpcs)) - INC_METRIC(throttled_ns, sched_clock() - - rpc->hsk->homa->throttle_add); -#endif /* See strip.py */ - homa_throttle_unlock(rpc->hsk->homa); - INIT_LIST_HEAD(&rpc->throttled_links); - } -} - -#ifndef __STRIP__ /* See strip.py */ -/** - * homa_log_throttled() - Print information to the system log about the - * RPCs on the throttled list. - * @homa: Overall information about the Homa transport. - */ -void homa_log_throttled(struct homa *homa) -{ - struct homa_rpc *rpc; - s64 bytes = 0; - int rpcs = 0; - - pr_notice("Printing throttled list\n"); - homa_throttle_lock(homa); - list_for_each_entry_rcu(rpc, &homa->throttled_rpcs, throttled_links) { - rpcs++; - if (!homa_rpc_try_lock(rpc)) { - pr_notice("Skipping throttled RPC: locked\n"); - continue; - } - if (*rpc->msgout.next_xmit) - bytes += rpc->msgout.length - - rpc->msgout.next_xmit_offset; - if (rpcs <= 20) - homa_rpc_log(rpc); - homa_rpc_unlock(rpc); - } - homa_throttle_unlock(homa); - pr_notice("Finished printing throttle list: %d rpcs, %lld bytes\n", - rpcs, bytes); -} -#endif /* See strip.py */ diff --git a/homa_pacer.c b/homa_pacer.c new file mode 100644 index 00000000..1946804f --- /dev/null +++ b/homa_pacer.c @@ -0,0 +1,493 @@ +// SPDX-License-Identifier: BSD-2-Clause + +/* This file implements the Homa pacer, which implements SRPT for packet + * output. In order to do that, it throttles packet transmission to prevent + * the buildup of large queues in the NIC. + */ + +#include "homa_pacer.h" +#include "homa_rpc.h" + +#ifndef __STRIP__ /* See strip.py */ +/* Used to enable sysctl access to pacer-specific configuration parameters. The + * @data fields are actually offsets within a struct homa_pacer; these are + * converted to pointers into a net-specific struct homa later. 
+ */
+#define OFFSET(field) ((void *) offsetof(struct homa_pacer, field))
+static struct ctl_table pacer_ctl_table[] = {
+	{
+		.procname = "link_mbps",
+		.data = OFFSET(link_mbps),
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = homa_pacer_dointvec
+	},
+	{
+		.procname = "max_nic_queue_ns",
+		.data = OFFSET(max_nic_queue_ns),
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = homa_pacer_dointvec
+	},
+	{
+		.procname = "pacer_fifo_fraction",
+		.data = OFFSET(fifo_fraction),
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = homa_pacer_dointvec
+	},
+	{
+		.procname = "throttle_min_bytes",
+		.data = OFFSET(throttle_min_bytes),
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = homa_pacer_dointvec
+	},
+};
+#endif /* See strip.py */
+
+/**
+ * homa_pacer_new() - Allocate and initialize a new pacer object, which
+ * will hold pacer-related information for @homa.
+ * @homa: Homa transport that the pacer will be associated with.
+ * @net:  Network namespace that @homa is associated with.
+ * Return: A pointer to the new struct homa_pacer, or an ERR_PTR if an
+ * error occurred.
+ */
+struct homa_pacer *homa_pacer_new(struct homa *homa, struct net *net)
+{
+	struct homa_pacer *pacer;
+	int err;
+
+	pacer = kmalloc(sizeof(*pacer), GFP_KERNEL | __GFP_ZERO);
+	if (!pacer)
+		return ERR_PTR(-ENOMEM);
+	pacer->homa = homa;
+	spin_lock_init(&pacer->mutex);
+	pacer->fifo_count = 1000;
+	spin_lock_init(&pacer->throttle_lock);
+	INIT_LIST_HEAD_RCU(&pacer->throttled_rpcs);
+	pacer->fifo_fraction = 50;
+	pacer->max_nic_queue_ns = 5000;
+	pacer->link_mbps = 25000;
+	pacer->throttle_min_bytes = 1000;
+	pacer->exit = false;
+	init_waitqueue_head(&pacer->wait_queue);
+	init_completion(&pacer->kthread_done);
+	pacer->kthread = kthread_run(homa_pacer_main, pacer, "homa_pacer");
+	if (IS_ERR(pacer->kthread)) {
+		err = PTR_ERR(pacer->kthread);
+		/* Don't leave an ERR_PTR for homa_pacer_destroy to stop. */
+		pacer->kthread = NULL;
+		pr_err("Homa couldn't create pacer thread: error %d\n", err);
+		goto error;
+	}
+	atomic64_set(&pacer->link_idle_time, sched_clock());
+
+#ifndef __STRIP__ /* See strip.py */
+	pacer->sysctl_header = register_net_sysctl(net, "net/homa",
+						   pacer_ctl_table);
+	if (!pacer->sysctl_header) {
+		err = -ENOMEM;
+		pr_err("couldn't register sysctl parameters for Homa pacer\n");
+		goto error;
+	}
+#endif /* See strip.py */
+	homa_pacer_update_sysctl_deps(pacer);
+	return pacer;
+
+error:
+	homa_pacer_destroy(pacer);
+	return ERR_PTR(err);
+}
+
+/**
+ * homa_pacer_destroy() - Cleanup and destroy the pacer object for a Homa
+ * transport.
+ * @pacer: Object to destroy; caller must not reference the object
+ *         again once this function returns.
+ */
+void homa_pacer_destroy(struct homa_pacer *pacer)
+{
+	pacer->exit = true;
+#ifndef __STRIP__ /* See strip.py */
+	if (pacer->sysctl_header) {
+		unregister_net_sysctl_table(pacer->sysctl_header);
+		pacer->sysctl_header = NULL;
+	}
+#endif /* See strip.py */
+	if (pacer->kthread) {
+		wake_up(&pacer->wait_queue);
+		kthread_stop(pacer->kthread);
+		wait_for_completion(&pacer->kthread_done);
+		pacer->kthread = NULL;
+	}
+	kfree(pacer);
+}
+
+/**
+ * homa_pacer_check_nic_q() - This function is invoked before passing a
+ * packet to the NIC for transmission. It serves two purposes. First, it
+ * maintains an estimate of the NIC queue length. Second, it indicates to
+ * the caller whether the NIC queue is so full that no new packets should be
+ * queued (Homa's SRPT depends on keeping the NIC queue short).
+ * @pacer: Pacer information for a Homa transport.
+ * @skb:   Packet that is about to be transmitted.
+ * @force: True means this packet is going to be transmitted + * regardless of the queue length. + * Return: Nonzero is returned if either the NIC queue length is + * acceptably short or @force was specified. 0 means that the + * NIC queue is at capacity or beyond, so the caller should delay + * the transmission of @skb. If nonzero is returned, then the + * queue estimate is updated to reflect the transmission of @skb. + */ +int homa_pacer_check_nic_q(struct homa_pacer *pacer, struct sk_buff *skb, + bool force) +{ + u64 idle, new_idle, clock, ns_for_packet; + int bytes; + + bytes = homa_get_skb_info(skb)->wire_bytes; + ns_for_packet = pacer->ns_per_mbyte; + ns_for_packet *= bytes; + do_div(ns_for_packet, 1000000); + while (1) { + clock = sched_clock(); + idle = atomic64_read(&pacer->link_idle_time); + if ((clock + pacer->max_nic_queue_ns) < idle && !force && + !(pacer->homa->flags & HOMA_FLAG_DONT_THROTTLE)) + return 0; +#ifndef __STRIP__ /* See strip.py */ + if (!list_empty(&pacer->throttled_rpcs)) + INC_METRIC(pacer_bytes, bytes); + if (idle < clock) { + if (pacer->wake_time) { + u64 lost = (pacer->wake_time > idle) + ? clock - pacer->wake_time + : clock - idle; + INC_METRIC(pacer_lost_ns, lost); + tt_record1("pacer lost %d cycles", lost); + } + new_idle = clock + ns_for_packet; + } else { + new_idle = idle + ns_for_packet; + } +#else /* See strip.py */ + if (idle < clock) + new_idle = clock + ns_for_packet; + else + new_idle = idle + ns_for_packet; +#endif /* See strip.py */ + + /* This method must be thread-safe. */ + if (atomic64_cmpxchg_relaxed(&pacer->link_idle_time, idle, + new_idle) == idle) + break; + } + return 1; +} + +/** + * homa_pacer_main() - Top-level function for the pacer thread. + * @arg: Pointer to pacer struct. + * + * Return: Always 0. + */ +int homa_pacer_main(void *arg) +{ + struct homa_pacer *pacer = arg; + + while (1) { + if (pacer->exit) + break; + pacer->wake_time = sched_clock(); + homa_pacer_xmit(pacer); + INC_METRIC(pacer_ns, sched_clock() - pacer->wake_time); + pacer->wake_time = 0; + if (!list_empty(&pacer->throttled_rpcs)) { + /* NIC queue is full; before calling pacer again, + * give other threads a chance to run (otherwise + * low-level packet processing such as softirq could + * get locked out). + */ + schedule(); + continue; + } + + tt_record("pacer sleeping"); + wait_event(pacer->wait_queue, pacer->exit || + !list_empty(&pacer->throttled_rpcs)); + tt_record("pacer woke up"); + } + kthread_complete_and_exit(&pacer->kthread_done, 0); + return 0; +} + +/** + * homa_pacer_xmit() - Transmit packets from the throttled list until + * either (a) the throttled list is empty or (b) the NIC queue has + * reached maximum allowable length. Note: this function may be invoked + * from either process context or softirq (BH) level. This function is + * invoked from multiple places, not just in the pacer thread. The reason + * for this is that (as of 10/2019) Linux's scheduling of the pacer thread + * is unpredictable: the thread may block for long periods of time (e.g., + * because it is assigned to the same CPU as a busy interrupt handler). + * This can result in poor utilization of the network link. So, this method + * gets invoked from other places as well, to increase the likelihood that we + * keep the link busy. Those other invocations are not guaranteed to happen, + * so the pacer thread provides a backstop. + * @pacer: Pacer information for a Homa transport. 
+ */ +void homa_pacer_xmit(struct homa_pacer *pacer) +{ + struct homa_rpc *rpc; + s64 queue_ns; + + /* Make sure only one instance of this function executes at a time. */ + if (!spin_trylock_bh(&pacer->mutex)) + return; + + while (1) { + queue_ns = atomic64_read(&pacer->link_idle_time) - sched_clock(); + if (queue_ns >= pacer->max_nic_queue_ns) + break; + if (list_empty(&pacer->throttled_rpcs)) + break; + + /* Lock the first throttled RPC. This may not be possible + * because we have to hold throttle_lock while locking + * the RPC; that means we can't wait for the RPC lock because + * of lock ordering constraints (see sync.txt). Thus, if + * the RPC lock isn't available, do nothing. Holding the + * throttle lock while locking the RPC is important because + * it keeps the RPC from being deleted before it can be locked. + */ + homa_pacer_throttle_lock(pacer); + pacer->fifo_count -= pacer->fifo_fraction; + if (pacer->fifo_count <= 0) { + struct homa_rpc *cur; + u64 oldest = ~0; + + pacer->fifo_count += 1000; + rpc = NULL; + list_for_each_entry(cur, &pacer->throttled_rpcs, + throttled_links) { + if (cur->msgout.init_ns < oldest) { + rpc = cur; + oldest = cur->msgout.init_ns; + } + } + } else { + rpc = list_first_entry_or_null(&pacer->throttled_rpcs, + struct homa_rpc, + throttled_links); + } + if (!rpc) { + homa_pacer_throttle_unlock(pacer); + break; + } + if (!homa_rpc_try_lock(rpc)) { + homa_pacer_throttle_unlock(pacer); + INC_METRIC(pacer_skipped_rpcs, 1); + break; + } + homa_pacer_throttle_unlock(pacer); + + tt_record4("pacer calling homa_xmit_data for rpc id %llu, port %d, offset %d, bytes_left %d", + rpc->id, rpc->hsk->port, + rpc->msgout.next_xmit_offset, + rpc->msgout.length - rpc->msgout.next_xmit_offset); + homa_xmit_data(rpc, true); + + /* Note: rpc->state could be RPC_DEAD here, but the code + * below should work anyway. + */ +#ifndef __STRIP__ /* See strip.py */ + if (!*rpc->msgout.next_xmit || rpc->msgout.next_xmit_offset >= + rpc->msgout.granted) { +#else /* See strip.py */ + if (!*rpc->msgout.next_xmit) { +#endif /* See strip.py */ + /* No more data can be transmitted from this message + * (right now), so remove it from the throttled list. + */ + tt_record2("pacer removing id %d from throttled list, offset %d", + rpc->id, rpc->msgout.next_xmit_offset); + homa_pacer_unmanage_rpc(rpc); + } + homa_rpc_unlock(rpc); + } + spin_unlock_bh(&pacer->mutex); +} + +/** + * homa_pacer_manage_rpc() - Arrange for the pacer to transmit packets + * from this RPC (make sure that an RPC is on the throttled list and wake up + * the pacer thread if necessary). + * @rpc: RPC with outbound packets that have been granted but can't be + * sent because of NIC queue restrictions. Must be locked by caller. 
+ */
+void homa_pacer_manage_rpc(struct homa_rpc *rpc)
+	__must_hold(&rpc->bucket->lock)
+{
+	struct homa_pacer *pacer = rpc->hsk->homa->pacer;
+	struct homa_rpc *candidate;
+	int bytes_left;
+	int checks = 0;
+	IF_NO_STRIP(u64 now);
+
+	if (!list_empty(&rpc->throttled_links))
+		return;
+	IF_NO_STRIP(now = sched_clock());
+#ifndef __STRIP__ /* See strip.py */
+	if (!list_empty(&pacer->throttled_rpcs))
+		INC_METRIC(throttled_ns, now - pacer->throttle_add);
+	pacer->throttle_add = now;
+#endif /* See strip.py */
+	bytes_left = rpc->msgout.length - rpc->msgout.next_xmit_offset;
+	homa_pacer_throttle_lock(pacer);
+	list_for_each_entry(candidate, &pacer->throttled_rpcs,
+			    throttled_links) {
+		int bytes_left_cand;
+
+		checks++;
+
+		/* Watch out: the pacer might have just transmitted the last
+		 * packet from candidate.
+		 */
+		bytes_left_cand = candidate->msgout.length -
+				candidate->msgout.next_xmit_offset;
+		if (bytes_left_cand > bytes_left) {
+			list_add_tail(&rpc->throttled_links,
+				      &candidate->throttled_links);
+			goto done;
+		}
+	}
+	list_add_tail(&rpc->throttled_links, &pacer->throttled_rpcs);
+done:
+	homa_pacer_throttle_unlock(pacer);
+	wake_up(&pacer->wait_queue);
+	INC_METRIC(throttle_list_adds, 1);
+	INC_METRIC(throttle_list_checks, checks);
+//	tt_record("woke up pacer thread");
+}
+
+/**
+ * homa_pacer_unmanage_rpc() - Make sure that an RPC is no longer managed
+ * by the pacer.
+ * @rpc: RPC of interest.
+ */
+void homa_pacer_unmanage_rpc(struct homa_rpc *rpc)
+	__must_hold(&rpc->bucket->lock)
+{
+	struct homa_pacer *pacer = rpc->hsk->homa->pacer;
+
+	if (unlikely(!list_empty(&rpc->throttled_links))) {
+		UNIT_LOG("; ", "removing id %llu from throttled list", rpc->id);
+		homa_pacer_throttle_lock(pacer);
+		list_del_init(&rpc->throttled_links);
+#ifndef __STRIP__ /* See strip.py */
+		if (list_empty(&pacer->throttled_rpcs))
+			INC_METRIC(throttled_ns, sched_clock()
+					- pacer->throttle_add);
+#endif /* See strip.py */
+		homa_pacer_throttle_unlock(pacer);
+	}
+}
+
+/**
+ * homa_pacer_update_sysctl_deps() - Update any pacer fields that depend
+ * on values set by sysctl. This function is invoked anytime a pacer sysctl
+ * value is updated.
+ * @pacer: Pacer to update.
+ */
+void homa_pacer_update_sysctl_deps(struct homa_pacer *pacer)
+{
+	u64 tmp;
+
+	tmp = 8 * 1000ULL * 1000ULL * 1000ULL;
+
+	/* Underestimate link bandwidth (overestimate time) by 1%. */
+	tmp = tmp * 101 / 100;
+	do_div(tmp, pacer->link_mbps);
+	pacer->ns_per_mbyte = tmp;
+}
+
+#ifndef __STRIP__ /* See strip.py */
+/**
+ * homa_pacer_dointvec() - This function is a wrapper around proc_dointvec. It
+ * is invoked to read and write pacer-related sysctl values.
+ * @table:  sysctl table describing value to be read or written.
+ * @write:  Nonzero means value is being written, 0 means read.
+ * @buffer: Address in user space of the input/output data.
+ * @lenp:   Pointer to the length of @buffer; modified to hold the number
+ *          of bytes actually transferred.
+ * @ppos:   Pointer to the current position in the sysctl file; updated to
+ *          reflect the bytes transferred.
+ *
+ * Return: 0 for success, nonzero for error.
+ */
+int homa_pacer_dointvec(const struct ctl_table *table, int write,
+			void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct homa_pacer *pacer = homa_from_net(current->nsproxy->net_ns)->pacer;
+	struct ctl_table table_copy;
+	int result;
+
+	/* Generate a new ctl_table that refers to a field in the
+	 * net-specific struct homa_pacer.
+ */
+	table_copy = *table;
+	table_copy.data = ((char *) pacer) + (uintptr_t) table_copy.data;
+
+	result = proc_dointvec(&table_copy, write, buffer, lenp, ppos);
+	if (write)
+		homa_pacer_update_sysctl_deps(pacer);
+	return result;
+}
+
+/**
+ * homa_pacer_log_throttled() - Print information to the system log about the
+ * RPCs on the throttled list.
+ * @pacer: Pacer information for a Homa transport.
+ */
+void homa_pacer_log_throttled(struct homa_pacer *pacer)
+{
+	struct homa_rpc *rpc;
+	s64 bytes = 0;
+	int rpcs = 0;
+
+	pr_notice("Printing throttled list\n");
+	homa_pacer_throttle_lock(pacer);
+	list_for_each_entry_rcu(rpc, &pacer->throttled_rpcs, throttled_links) {
+		rpcs++;
+		if (!homa_rpc_try_lock(rpc)) {
+			pr_notice("Skipping throttled RPC: locked\n");
+			continue;
+		}
+		if (*rpc->msgout.next_xmit)
+			bytes += rpc->msgout.length
+					- rpc->msgout.next_xmit_offset;
+		if (rpcs <= 20)
+			homa_rpc_log(rpc);
+		homa_rpc_unlock(rpc);
+	}
+	homa_pacer_throttle_unlock(pacer);
+	pr_notice("Finished printing throttle list: %d rpcs, %lld bytes\n",
+		  rpcs, bytes);
+}
+
+/**
+ * homa_pacer_throttle_lock_slow() - This function implements the slow path for
+ * acquiring the throttle lock. It is invoked when the lock isn't immediately
+ * available. It waits for the lock, but also records statistics about
+ * the waiting time.
+ * @pacer: Pacer information for a Homa transport.
+ */
+void homa_pacer_throttle_lock_slow(struct homa_pacer *pacer)
+	__acquires(&pacer->throttle_lock)
+{
+	u64 start = sched_clock();
+
+	tt_record("beginning wait for throttle lock");
+	spin_lock_bh(&pacer->throttle_lock);
+	tt_record("ending wait for throttle lock");
+	INC_METRIC(throttle_lock_misses, 1);
+	INC_METRIC(throttle_lock_miss_ns, sched_clock() - start);
+}
+#endif /* See strip.py */
diff --git a/homa_pacer.h b/homa_pacer.h
new file mode 100644
index 00000000..d8eb8e7f
--- /dev/null
+++ b/homa_pacer.h
@@ -0,0 +1,220 @@
+/* SPDX-License-Identifier: BSD-2-Clause */
+
+/* This file defines structs and functions related to the Homa pacer,
+ * which implements SRPT for packet output. In order to do that, it
+ * throttles packet transmission to prevent the buildup of
+ * large queues in the NIC.
+ */
+
+#ifndef _HOMA_PACER_H
+#define _HOMA_PACER_H
+
+#include "homa_impl.h"
+#ifndef __STRIP__ /* See strip.py */
+#include "homa_metrics.h"
+#endif /* See strip.py */
+
+/**
+ * struct homa_pacer - Contains information that the pacer uses to
+ * manage packet output. There is one instance of this object stored
+ * in each struct homa.
+ */
+struct homa_pacer {
+	/** @homa: Transport that this pacer is associated with. */
+	struct homa *homa;
+
+	/**
+	 * @mutex: Ensures that only one instance of homa_pacer_xmit
+	 * runs at a time. Only used in "try" mode: never block on this.
+	 */
+	spinlock_t mutex;
+
+	/**
+	 * @fifo_count: When this becomes <= zero, it's time for the
+	 * pacer to allow the oldest RPC to transmit.
+	 */
+	int fifo_count;
+
+	/**
+	 * @wake_time: time (in sched_clock units) when the pacer last
+	 * woke up (if the pacer is running) or 0 if the pacer is sleeping.
+	 */
+	u64 wake_time;
+
+	/**
+	 * @throttle_lock: Used to synchronize access to @throttled_rpcs. Must
+	 * be held when inserting or removing an RPC from throttled_rpcs.
+	 */
+	spinlock_t throttle_lock;
+
+	/**
+	 * @throttled_rpcs: Contains all homa_rpcs that have bytes ready
+	 * for transmission, but which couldn't be sent without exceeding
+	 * the NIC queue limit.
+ */ + struct list_head throttled_rpcs; + +#ifndef __STRIP__ /* See strip.py */ + /** + * @throttle_add: The time (in sched_clock() units) when the most + * recent RPC was added to @throttled_rpcs. + */ + u64 throttle_add; +#endif /* See strip.py */ + + /** + * @fifo_fraction: Out of every 1000 packets transmitted by the + * pacer, this number will be transmitted from the oldest message + * rather than the highest-priority message. Set externally via + * sysctl. + */ + int fifo_fraction; + + /** + * @max_nic_queue_ns: Limits the NIC queue length: we won't queue + * up a packet for transmission if link_idle_time is this many + * nanoseconds in the future (or more). Set externally via sysctl. + */ + int max_nic_queue_ns; + + /** + * @link_mbps: The raw bandwidth of the network uplink, in + * units of 1e06 bits per second. Set externally via sysctl. + */ + int link_mbps; + + /** + * @ns_per_mbyte: the number of ns that it takes to transmit + * 10**6 bytes on our uplink. This is actually a slight overestimate + * of the value, to ensure that we don't underestimate NIC queue + * length and queue too many packets. + */ + u32 ns_per_mbyte; + + /** + * @throttle_min_bytes: If a packet has fewer bytes than this, then it + * bypasses the throttle mechanism and is transmitted immediately. + * We have this limit because for very small packets CPU overheads + * make it impossible to keep up with the NIC so (a) the NIC queue + * can't grow and (b) using the pacer would serialize all of these + * packets through a single core, which makes things even worse. + * Set externally via sysctl. + */ + int throttle_min_bytes; + +#ifndef __STRIP__ /* See strip.py */ + /** + * @sysctl_header: Used to remove sysctl values when this structure + * is destroyed. + */ + struct ctl_table_header *sysctl_header; +#endif /* See strip.py */ + + /** + * @exit: true means that the pacer thread should exit as + * soon as possible. + */ + bool exit; + + /** + * @wait_queue: Used to block the pacer thread when there + * are no throttled RPCs. + */ + struct wait_queue_head wait_queue; + + /** + * @kthread: Kernel thread that transmits packets from + * throttled_rpcs in a way that limits queue buildup in the + * NIC. + */ + struct task_struct *kthread; + + /** + * @kthread_done: Used to wait for @kthread to exit. + */ + struct completion kthread_done; + + /** + * @link_idle_time: The time, measured by sched_clock, at which we + * estimate that all of the packets we have passed to the NIC for + * transmission will have been transmitted. May be in the past. + * This estimate assumes that only Homa is transmitting data, so + * it could be a severe underestimate if there is competing traffic + * from, say, TCP. Access only with atomic ops. 
+ */
+	atomic64_t link_idle_time ____cacheline_aligned_in_smp;
+};
+
+int homa_pacer_check_nic_q(struct homa_pacer *pacer,
+			   struct sk_buff *skb, bool force);
+int homa_pacer_dointvec(const struct ctl_table *table, int write,
+			void *buffer, size_t *lenp, loff_t *ppos);
+void homa_pacer_destroy(struct homa_pacer *pacer);
+void homa_pacer_unmanage_rpc(struct homa_rpc *rpc);
+void homa_pacer_log_throttled(struct homa_pacer *pacer);
+int homa_pacer_main(void *transport);
+void homa_pacer_manage_rpc(struct homa_rpc *rpc);
+struct homa_pacer *homa_pacer_new(struct homa *homa, struct net *net);
+void homa_pacer_throttle_lock_slow(struct homa_pacer *pacer);
+void homa_pacer_update_sysctl_deps(struct homa_pacer *pacer);
+void homa_pacer_xmit(struct homa_pacer *pacer);
+
+/**
+ * homa_pacer_check() - This method is invoked at various places in Homa to
+ * see if the pacer needs to transmit more packets and, if so, transmit
+ * them. It's needed because the pacer thread may get descheduled by
+ * Linux, resulting in output stalls.
+ * @pacer: Pacer information for a Homa transport.
+ */
+static inline void homa_pacer_check(struct homa_pacer *pacer)
+{
+	if (list_empty(&pacer->throttled_rpcs))
+		return;
+
+	/* The ">> 1" in the line below gives homa_pacer_main the first chance
+	 * to queue new packets; if the NIC queue becomes more than half
+	 * empty, then we will help out here.
+	 */
+	if ((sched_clock() + (pacer->max_nic_queue_ns >> 1)) <
+			atomic64_read(&pacer->link_idle_time))
+		return;
+	tt_record("homa_check_pacer calling homa_pacer_xmit");
+	homa_pacer_xmit(pacer);
+	INC_METRIC(pacer_needed_help, 1);
+}
+
+#ifndef __STRIP__ /* See strip.py */
+/**
+ * homa_pacer_throttle_lock() - Acquire the throttle lock. If the lock
+ * isn't immediately available, record stats on the waiting time.
+ * @pacer: Pacer information for a Homa transport.
+ */
+static inline void homa_pacer_throttle_lock(struct homa_pacer *pacer)
+	__acquires(&pacer->throttle_lock)
+{
+	if (!spin_trylock_bh(&pacer->throttle_lock))
+		homa_pacer_throttle_lock_slow(pacer);
+}
+#else /* See strip.py */
+/**
+ * homa_pacer_throttle_lock() - Acquire the throttle lock.
+ * @pacer: Pacer information for a Homa transport.
+ */
+static inline void homa_pacer_throttle_lock(struct homa_pacer *pacer)
+	__acquires(&pacer->throttle_lock)
+{
+	spin_lock_bh(&pacer->throttle_lock);
+}
+#endif /* See strip.py */
+
+/**
+ * homa_pacer_throttle_unlock() - Release the throttle lock.
+ * @pacer: Pacer information for a Homa transport.
+ */ +static inline void homa_pacer_throttle_unlock(struct homa_pacer *pacer) + __releases(&pacer->throttle_lock) +{ + spin_unlock_bh(&pacer->throttle_lock); +} + +#endif /* _HOMA_PACER_H */ diff --git a/homa_plumbing.c b/homa_plumbing.c index 318c68e8..66a6c43b 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -8,6 +8,7 @@ #ifndef __STRIP__ /* See strip.py */ #include "homa_offload.h" #endif /* See strip.py */ +#include "homa_pacer.h" #include "homa_peer.h" #include "homa_pool.h" @@ -257,13 +258,6 @@ static struct ctl_table homa_ctl_table[] = { .mode = 0644, .proc_handler = homa_dointvec }, - { - .procname = "link_mbps", - .data = OFFSET(link_mbps), - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = homa_dointvec - }, { .procname = "max_dead_buffs", .data = OFFSET(max_dead_buffs), @@ -292,13 +286,6 @@ static struct ctl_table homa_ctl_table[] = { .mode = 0644, .proc_handler = homa_dointvec }, - { - .procname = "max_nic_queue_ns", - .data = OFFSET(max_nic_queue_ns), - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = homa_dointvec - }, { .procname = "max_incoming", .data = OFFSET(max_incoming), @@ -341,13 +328,6 @@ static struct ctl_table homa_ctl_table[] = { .mode = 0644, .proc_handler = homa_dointvec }, - { - .procname = "pacer_fifo_fraction", - .data = OFFSET(pacer_fifo_fraction), - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = homa_dointvec - }, { .procname = "poll_usecs", .data = OFFSET(poll_usecs), @@ -411,13 +391,6 @@ static struct ctl_table homa_ctl_table[] = { .mode = 0644, .proc_handler = homa_dointvec }, - { - .procname = "throttle_min_bytes", - .data = OFFSET(throttle_min_bytes), - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = homa_dointvec - }, { .procname = "timeout_resends", .data = OFFSET(timeout_resends), @@ -651,7 +624,7 @@ int homa_net_init(struct net *net) pr_notice("Homa attaching to net namespace\n"); - status = homa_init(homa); + status = homa_init(homa, net); if (status) goto homa_init_err; #ifndef __STRIP__ /* See strip.py */ @@ -1693,7 +1666,6 @@ int homa_dointvec(const struct ctl_table *table, int write, * dependent information). */ homa_incoming_sysctl_changed(homa); - homa_outgoing_sysctl_changed(homa); /* For this value, only call the method when this * particular value was written (don't want to increment @@ -1719,7 +1691,7 @@ int homa_dointvec(const struct ctl_table *table, int write, tt_record("Freezing because of sysctl"); tt_freeze(); } else if (homa->sysctl_action == 4) { - homa_log_throttled(homa); + homa_pacer_log_throttled(homa->pacer); } else if (homa->sysctl_action == 5) { tt_printk(); } else if (homa->sysctl_action == 6) { diff --git a/homa_rpc.c b/homa_rpc.c index d7bc5c11..f56e8423 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -4,6 +4,7 @@ #include "homa_impl.h" #include "homa_interest.h" +#include "homa_pacer.h" #include "homa_peer.h" #include "homa_pool.h" #ifndef __STRIP__ /* See strip.py */ @@ -310,7 +311,7 @@ void homa_rpc_end(struct homa_rpc *rpc) rpc->hsk->homa->max_dead_buffs = rpc->hsk->dead_skbs; homa_sock_unlock(rpc->hsk); - homa_remove_from_throttled(rpc); + homa_pacer_unmanage_rpc(rpc); } /** diff --git a/homa_rpc.h b/homa_rpc.h index 58ced478..e9b588d0 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -368,8 +368,9 @@ struct homa_rpc { #endif /* See strip.py */ /** - * @throttled_links: Used to link this RPC into homa->throttled_rpcs. - * If this RPC isn't in homa->throttled_rpcs, this is an empty + * @throttled_links: Used to link this RPC into + * homa->pacer.throttled_rpcs. 
If this RPC isn't in + * homa->pacer.throttled_rpcs, this is an empty * list pointing to itself. */ struct list_head throttled_links; diff --git a/homa_utils.c b/homa_utils.c index a82d0d0f..1e9b3397 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -5,6 +5,7 @@ */ #include "homa_impl.h" +#include "homa_pacer.h" #include "homa_peer.h" #include "homa_rpc.h" #ifndef __STRIP__ /* See strip.py */ @@ -15,17 +16,16 @@ #include "homa_stub.h" #endif /* See strip.py */ -struct completion homa_pacer_kthread_done; - /** * homa_init() - Constructor for homa objects. * @homa: Object to initialize. + * @net: Network namespace that @homa is associated with. * * Return: 0 on success, or a negative errno if there was an error. Even * if an error occurs, it is safe (and necessary) to call * homa_destroy at some point. */ -int homa_init(struct homa *homa) +int homa_init(struct homa *homa, struct net *net) { int err; #ifndef __STRIP__ /* See strip.py */ @@ -36,20 +36,18 @@ int homa_init(struct homa *homa) #endif /* See strip.py */ memset(homa, 0, sizeof(*homa)); - init_completion(&homa_pacer_kthread_done); atomic64_set(&homa->next_outgoing_id, 2); - atomic64_set(&homa->link_idle_time, sched_clock()); #ifndef __STRIP__ /* See strip.py */ spin_lock_init(&homa->grantable_lock); INIT_LIST_HEAD(&homa->grantable_peers); homa->last_grantable_change = sched_clock(); #endif /* See strip.py */ - spin_lock_init(&homa->pacer_mutex); - homa->pacer_fifo_fraction = 50; - homa->pacer_fifo_count = 1; - spin_lock_init(&homa->throttle_lock); - INIT_LIST_HEAD_RCU(&homa->throttled_rpcs); - homa->throttle_min_bytes = 200; + homa->pacer = homa_pacer_new(homa, net); + if (IS_ERR(homa->pacer)) { + err = PTR_ERR(homa->pacer); + homa->pacer = NULL; + return err; + } homa->prev_default_port = HOMA_MIN_DEFAULT_PORT - 1; homa->port_map = kmalloc(sizeof(*homa->port_map), GFP_KERNEL); if (!homa->port_map) { @@ -82,9 +80,6 @@ int homa_init(struct homa *homa) #ifndef __STRIP__ /* See strip.py */ homa->unsched_bytes = 40000; homa->window_param = 100000; -#endif /* See strip.py */ - homa->link_mbps = 25000; -#ifndef __STRIP__ /* See strip.py */ homa->poll_usecs = 50; homa->num_priorities = HOMA_MAX_PRIORITIES; for (i = 0; i < HOMA_MAX_PRIORITIES; i++) @@ -115,22 +110,11 @@ int homa_init(struct homa *homa) homa->request_ack_ticks = 2; homa->reap_limit = 10; homa->dead_buffs_limit = 5000; - homa->pacer_exit = false; - init_waitqueue_head(&homa->pacer_wait_queue); - homa->max_nic_queue_ns = 5000; - homa->pacer_kthread = kthread_run(homa_pacer_main, homa, - "homa_pacer"); - if (IS_ERR(homa->pacer_kthread)) { - err = PTR_ERR(homa->pacer_kthread); - homa->pacer_kthread = NULL; - pr_err("couldn't create homa pacer thread: error %d\n", err); - return err; - } - homa->wmem_max = 100000000; #ifndef __STRIP__ /* See strip.py */ homa->verbose = 0; #endif /* See strip.py */ homa->max_gso_size = 10000; + homa->wmem_max = 100000000; #ifndef __STRIP__ /* See strip.py */ homa->max_gro_skbs = 20; homa->gro_policy = HOMA_GRO_NORMAL; @@ -139,7 +123,6 @@ int homa_init(struct homa *homa) #endif /* See strip.py */ homa->bpage_lease_usecs = 10000; #ifndef __STRIP__ /* See strip.py */ - homa_outgoing_sysctl_changed(homa); homa_incoming_sysctl_changed(homa); #endif /* See strip.py */ return 0; @@ -155,10 +138,6 @@ void homa_destroy(struct homa *homa) #include "utils.h" unit_homa_destroy(homa); #endif /* __UNIT_TEST__ */ - if (homa->pacer_kthread) { - homa_pacer_stop(homa); - wait_for_completion(&homa_pacer_kthread_done); - } /* The order of the following statements 
matters! */ if (homa->port_map) { @@ -166,6 +145,10 @@ void homa_destroy(struct homa *homa) kfree(homa->port_map); homa->port_map = NULL; } + if (homa->pacer) { + homa_pacer_destroy(homa->pacer); + homa->pacer = NULL; + } if (homa->peers) { homa_peertab_destroy(homa->peers); kfree(homa->peers); @@ -229,24 +212,3 @@ void homa_spin(int ns) /* Empty loop body.*/ ; } - -#ifndef __STRIP__ /* See strip.py */ -/** - * homa_throttle_lock_slow() - This function implements the slow path for - * acquiring the throttle lock. It is invoked when the lock isn't immediately - * available. It waits for the lock, but also records statistics about - * the waiting time. - * @homa: Overall data about the Homa protocol implementation. - */ -void homa_throttle_lock_slow(struct homa *homa) - __acquires(&homa->throttle_lock) -{ - u64 start = sched_clock(); - - tt_record("beginning wait for throttle lock"); - spin_lock_bh(&homa->throttle_lock); - tt_record("ending wait for throttle lock"); - INC_METRIC(throttle_lock_misses, 1); - INC_METRIC(throttle_lock_miss_ns, sched_clock() - start); -} -#endif /* See strip.py */ diff --git a/test/Makefile b/test/Makefile index c1ec9fb1..48150271 100644 --- a/test/Makefile +++ b/test/Makefile @@ -35,13 +35,14 @@ DEFS += -D__STRIP__ endif WARNS := -Wall -Wundef -Wno-trigraphs -Wno-sign-compare \ - -Wno-strict-aliasing -Werror + -Wno-strict-aliasing -Wunused-but-set-variable -Werror CFLAGS := $(WARNS) -Wstrict-prototypes -MD -g $(CINCLUDES) $(DEFS) CCFLAGS := -std=c++11 $(WARNS) -MD -g $(CCINCLUDES) $(DEFS) -fsanitize=address TEST_SRCS := unit_homa_incoming.c \ unit_homa_interest.c \ unit_homa_outgoing.c \ + unit_homa_pacer.c \ unit_homa_peer.c \ unit_homa_pool.c \ unit_homa_plumbing.c \ @@ -62,6 +63,7 @@ HOMA_SRCS := homa_devel.c \ homa_interest.c \ homa_incoming.c \ homa_outgoing.c \ + homa_pacer.c \ homa_peer.c \ homa_pool.c \ homa_plumbing.c \ diff --git a/test/mock.c b/test/mock.c index f1c0ce35..4d6e0eef 100644 --- a/test/mock.c +++ b/test/mock.c @@ -48,6 +48,7 @@ int mock_kmalloc_errors; int mock_kthread_create_errors; int mock_prepare_to_wait_errors; int mock_register_protosw_errors; +int mock_register_sysctl_errors; int mock_route_errors; int mock_spin_lock_held; int mock_trylock_errors; @@ -57,7 +58,7 @@ int mock_wait_intr_irq_errors; /* The return value from calls to signal_pending(). */ int mock_signal_pending; -/* Used as current task during tests. */ +/* Used as current task during tests. Also returned by kthread_run. 
*/ struct task_struct mock_task; /* If a test sets this variable to nonzero, ip_queue_xmit will log @@ -901,11 +902,12 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), { if (mock_check_error(&mock_kthread_create_errors)) return ERR_PTR(-EACCES); - return NULL; + return &mock_task; } int kthread_stop(struct task_struct *k) { + unit_log_printf("; ", "kthread_stop"); return 0; } @@ -1667,6 +1669,8 @@ void mock_rcu_read_unlock(void) struct ctl_table_header *mock_register_net_sysctl(struct net *net, const char *path, struct ctl_table *table) { + if (mock_check_error(&mock_register_sysctl_errors)) + return NULL; return (struct ctl_table_header *)11111; } @@ -1962,6 +1966,7 @@ void mock_teardown(void) mock_kthread_create_errors = 0; mock_prepare_to_wait_errors = 0; mock_register_protosw_errors = 0; + mock_register_sysctl_errors = 0; mock_wait_intr_irq_errors = 0; mock_copy_to_user_dont_copy = 0; mock_bpage_size = 0x10000; diff --git a/test/mock.h b/test/mock.h index 05999fe0..efa5ab18 100644 --- a/test/mock.h +++ b/test/mock.h @@ -126,6 +126,7 @@ extern int mock_kmalloc_errors; extern int mock_kthread_create_errors; extern int mock_prepare_to_wait_errors; extern int mock_register_protosw_errors; +extern int mock_register_sysctl_errors; extern int mock_wait_intr_irq_errors; extern char mock_xmit_prios[]; extern int mock_log_wakeups; diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 3ed838f2..edd5cfbe 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -2,6 +2,7 @@ #include "homa_impl.h" #include "homa_grant.h" +#include "homa_pacer.h" #include "homa_rpc.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" @@ -75,12 +76,12 @@ FIXTURE_SETUP(homa_grant) self->server_port = 99; self->client_id = 1234; self->server_id = 1235; - homa_init(&self->homa); + homa_init(&self->homa, &mock_net); mock_set_homa(&self->homa); self->homa.num_priorities = 1; self->homa.poll_usecs = 0; self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; - self->homa.pacer_fifo_fraction = 0; + self->homa.pacer->fifo_fraction = 0; self->homa.grant_fifo_fraction = 0; self->homa.window_param = 10000; self->homa.grant_window = 10000; diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 31995db5..662ac6d4 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -2,6 +2,7 @@ #include "homa_impl.h" #include "homa_interest.h" +#include "homa_pacer.h" #include "homa_peer.h" #include "homa_pool.h" #define KSELFTEST_NOT_MAIN 1 @@ -86,14 +87,14 @@ FIXTURE_SETUP(homa_incoming) self->server_port = 99; self->client_id = 1234; self->server_id = 1235; - homa_init(&self->homa); + homa_init(&self->homa, &mock_net); mock_set_homa(&self->homa); #ifndef __STRIP__ /* See strip.py */ self->homa.num_priorities = 1; self->homa.poll_usecs = 0; #endif /* See strip.py */ self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; - self->homa.pacer_fifo_fraction = 0; + self->homa.pacer->fifo_fraction = 0; #ifndef __STRIP__ /* See strip.py */ self->homa.grant_fifo_fraction = 0; self->homa.unsched_bytes = 10000; diff --git a/test/unit_homa_interest.c b/test/unit_homa_interest.c index 9c28e86a..bf8a0541 100644 --- a/test/unit_homa_interest.c +++ b/test/unit_homa_interest.c @@ -54,7 +54,7 @@ FIXTURE(homa_interest) { }; FIXTURE_SETUP(homa_interest) { - homa_init(&self->homa); + homa_init(&self->homa, &mock_net); mock_set_homa(&self->homa); mock_sock_init(&self->hsk, &self->homa, 0); self->client_ip = unit_get_in_addr("196.168.0.1"); diff --git a/test/unit_homa_metrics.c 
b/test/unit_homa_metrics.c index d09c364b..5ee79d55 100644 --- a/test/unit_homa_metrics.c +++ b/test/unit_homa_metrics.c @@ -12,7 +12,7 @@ FIXTURE(homa_metrics) { }; FIXTURE_SETUP(homa_metrics) { - homa_init(&self->homa); + homa_init(&self->homa, &mock_net); mock_set_homa(&self->homa); } FIXTURE_TEARDOWN(homa_metrics) diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index c661ffd5..6ae9d633 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -40,7 +40,7 @@ FIXTURE_SETUP(homa_offload) { int i; - homa_init(&self->homa); + homa_init(&self->homa, &mock_net); mock_set_homa(&self->homa); self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; self->homa.unsched_bytes = 10000; diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index dfd95bee..0945199c 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" +#include "homa_pacer.h" #include "homa_peer.h" #include "homa_rpc.h" #ifndef __STRIP__ /* See strip.py */ @@ -27,7 +28,7 @@ static void unlock_hook(char *id) } /* The following hook function frees an RPC when it is locked. */ -void lock_free_hook(char *id) +static void lock_free_hook(char *id) { if (strcmp(id, "spin_lock") != 0) return; @@ -38,7 +39,7 @@ void lock_free_hook(char *id) } #ifdef __STRIP__ /* See strip.py */ -void mock_resend_data(struct homa_rpc *rpc, int start, int end, +static void mock_resend_data(struct homa_rpc *rpc, int start, int end, int priority) { homa_resend_data(rpc, start, end); @@ -47,17 +48,6 @@ void mock_resend_data(struct homa_rpc *rpc, int start, int end, mock_resend_data(rpc, start, end, priority); #endif /* See strip.py */ -static int hook_count; -static void remove_throttled_hook(char *id) { - if (strcmp(id, "spin_lock") != 0) - return; - if (hook_count <= 0) - return; - hook_count--; - if (hook_count == 0) - homa_remove_from_throttled(hook_rpc); -} - /* Compute the expected "truesize" value for a Homa packet, given * the number of bytes of message data in the packet. */ @@ -88,16 +78,15 @@ FIXTURE_SETUP(homa_outgoing) self->server_port = 99; self->client_id = 1234; self->server_id = 1235; - homa_init(&self->homa); + homa_init(&self->homa, &mock_net); mock_set_homa(&self->homa); mock_ns = 10000; - atomic64_set(&self->homa.link_idle_time, 10000); - self->homa.ns_per_mbyte = 1000000; + self->homa.pacer->ns_per_mbyte = 1000000; self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; #ifndef __STRIP__ /* See strip.py */ self->homa.unsched_bytes = 10000; self->homa.window_param = 10000; - self->homa.pacer_fifo_fraction = 0; + self->homa.pacer->fifo_fraction = 0; #endif /* See strip.py */ mock_sock_init(&self->hsk, &self->homa, self->client_port); self->server_addr.in6.sin6_family = AF_INET; @@ -832,9 +821,9 @@ TEST_F(homa_outgoing, homa_xmit_data__below_throttle_min) self->server_port, self->client_id, 200, 1000); unit_log_clear(); - atomic64_set(&self->homa.link_idle_time, 11000); - self->homa.max_nic_queue_ns = 500; - self->homa.throttle_min_bytes = 250; + atomic64_set(&self->homa.pacer->link_idle_time, 11000); + self->homa.pacer->max_nic_queue_ns = 500; + self->homa.pacer->throttle_min_bytes = 250; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; homa_xmit_data(crpc, false); EXPECT_STREQ("xmit DATA 200@0", unit_log_get()); @@ -852,8 +841,8 @@ TEST_F(homa_outgoing, homa_xmit_data__force) self->server_port, self->client_id+2, 5000, 1000); /* First, get an RPC on the throttled list. 
*/ - atomic64_set(&self->homa.link_idle_time, 11000); - self->homa.max_nic_queue_ns = 3000; + atomic64_set(&self->homa.pacer->link_idle_time, 11000); + self->homa.pacer->max_nic_queue_ns = 3000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; homa_xmit_data(crpc1, false); unit_log_clear(); @@ -876,8 +865,8 @@ TEST_F(homa_outgoing, homa_xmit_data__throttle) self->server_port, self->client_id, 6000, 1000); unit_log_clear(); - atomic64_set(&self->homa.link_idle_time, 11000); - self->homa.max_nic_queue_ns = 3000; + atomic64_set(&self->homa.pacer->link_idle_time, 11000); + self->homa.pacer->max_nic_queue_ns = 3000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; homa_xmit_data(crpc, false); @@ -1124,420 +1113,3 @@ TEST_F(homa_outgoing, homa_resend_data__set_homa_info) "homa_info: wire_bytes 1538, data_bytes 1400, seg_length 1400, offset 8400", unit_log_get()); } - -#ifndef __STRIP__ /* See strip.py */ -TEST_F(homa_outgoing, homa_outgoing_sysctl_changed) -{ - self->homa.link_mbps = 10000; - homa_outgoing_sysctl_changed(&self->homa); - EXPECT_EQ(808000, self->homa.ns_per_mbyte); - - self->homa.link_mbps = 1000; - homa_outgoing_sysctl_changed(&self->homa); - EXPECT_EQ(8080000, self->homa.ns_per_mbyte); - - self->homa.link_mbps = 40000; - homa_outgoing_sysctl_changed(&self->homa); - EXPECT_EQ(202000, self->homa.ns_per_mbyte); -} -#endif /* See strip.py */ - -TEST_F(homa_outgoing, homa_check_nic_queue__basics) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 500, 1000); - - homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; - unit_log_clear(); - atomic64_set(&self->homa.link_idle_time, 9000); - mock_ns = 8000; - self->homa.max_nic_queue_ns = 1000; - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - EXPECT_EQ(1, homa_check_nic_queue(&self->homa, crpc->msgout.packets, - false)); - EXPECT_EQ(9500, atomic64_read(&self->homa.link_idle_time)); -} -TEST_F(homa_outgoing, homa_check_nic_queue__queue_full) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 500, 1000); - - homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; - unit_log_clear(); - atomic64_set(&self->homa.link_idle_time, 9000); - mock_ns = 7999; - self->homa.max_nic_queue_ns = 1000; - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - EXPECT_EQ(0, homa_check_nic_queue(&self->homa, crpc->msgout.packets, - false)); - EXPECT_EQ(9000, atomic64_read(&self->homa.link_idle_time)); -} -TEST_F(homa_outgoing, homa_check_nic_queue__queue_full_but_force) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 500, 1000); - - homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; - unit_log_clear(); - atomic64_set(&self->homa.link_idle_time, 9000); - mock_ns = 7999; - self->homa.max_nic_queue_ns = 1000; - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - EXPECT_EQ(1, homa_check_nic_queue(&self->homa, crpc->msgout.packets, - true)); - EXPECT_EQ(9500, atomic64_read(&self->homa.link_idle_time)); -} -TEST_F(homa_outgoing, homa_check_nic_queue__pacer_metrics) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 500, 1000); - - homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; - homa_add_to_throttled(crpc); - unit_log_clear(); - atomic64_set(&self->homa.link_idle_time, 9000); - 
self->homa.pacer_wake_time = 9800; - mock_ns = 10000; - self->homa.max_nic_queue_ns = 1000; - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - EXPECT_EQ(1, homa_check_nic_queue(&self->homa, crpc->msgout.packets, - true)); - EXPECT_EQ(10500, atomic64_read(&self->homa.link_idle_time)); -#ifndef __STRIP__ /* See strip.py */ - EXPECT_EQ(500, homa_metrics_per_cpu()->pacer_bytes); - EXPECT_EQ(200, homa_metrics_per_cpu()->pacer_lost_ns); -#endif /* See strip.py */ -} -TEST_F(homa_outgoing, homa_check_nic_queue__queue_empty) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 500, 1000); - - homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; - unit_log_clear(); - atomic64_set(&self->homa.link_idle_time, 9000); - mock_ns = 10000; - self->homa.max_nic_queue_ns = 1000; - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - EXPECT_EQ(1, homa_check_nic_queue(&self->homa, crpc->msgout.packets, - true)); - EXPECT_EQ(10500, atomic64_read(&self->homa.link_idle_time)); -} - -/* Don't know how to unit test homa_pacer_main... */ - -TEST_F(homa_outgoing, homa_pacer_xmit__basics) -{ - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, - 5000, 1000); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, - 10000, 1000); - struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id+4, - 150000, 1000); - - homa_add_to_throttled(crpc1); - homa_add_to_throttled(crpc2); - homa_add_to_throttled(crpc3); - self->homa.max_nic_queue_ns = 2000; - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - unit_log_clear(); - homa_pacer_xmit(&self->homa); - EXPECT_STREQ("xmit DATA 1400@0; xmit DATA 1400@1400", - unit_log_get()); - unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("request id 1234, next_offset 2800; " - "request id 1236, next_offset 0; " - "request id 1238, next_offset 0", unit_log_get()); -} -TEST_F(homa_outgoing, homa_pacer_xmit__pacer_already_active) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, - 10000, 1000); - - homa_add_to_throttled(crpc); - self->homa.max_nic_queue_ns = 2000; - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - mock_trylock_errors = 1; - unit_log_clear(); - homa_pacer_xmit(&self->homa); - EXPECT_STREQ("", unit_log_get()); - unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("request id 1234, next_offset 0", unit_log_get()); -} -TEST_F(homa_outgoing, homa_pacer_xmit__nic_queue_fills) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, - 10000, 1000); - - homa_add_to_throttled(crpc); - self->homa.max_nic_queue_ns = 2001; - mock_ns = 10000; - atomic64_set(&self->homa.link_idle_time, 12000); - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - unit_log_clear(); - homa_pacer_xmit(&self->homa); - - /* Just room for one packet before NIC queue fills. 
*/ - EXPECT_STREQ("xmit DATA 1400@0", unit_log_get()); - unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("request id 1234, next_offset 1400", unit_log_get()); -} -TEST_F(homa_outgoing, homa_pacer_xmit__queue_empty) -{ - self->homa.max_nic_queue_ns = 2000; - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - unit_log_clear(); - homa_pacer_xmit(&self->homa); - unit_log_throttled(&self->homa); - EXPECT_STREQ("", unit_log_get()); -} -TEST_F(homa_outgoing, homa_pacer_xmit__xmit_fifo) -{ - struct homa_rpc *crpc1, *crpc2, *crpc3; - - mock_ns = 10000; - crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, - self->server_ip, self->server_port, 2, 20000, 1000); - mock_ns = 11000; - crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, - self->server_ip, self->server_port, 4, 10000, 1000); - mock_ns = 12000; - crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, - self->server_ip, self->server_port, 6, 30000, 1000); - homa_add_to_throttled(crpc1); - homa_add_to_throttled(crpc2); - homa_add_to_throttled(crpc3); - - /* First attempt: pacer_fifo_count doesn't reach zero. */ - self->homa.max_nic_queue_ns = 1300; - self->homa.pacer_fifo_count = 200; - self->homa.pacer_fifo_fraction = 150; - mock_ns= 13000; - atomic64_set(&self->homa.link_idle_time, 10000); - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - unit_log_clear(); - mock_xmit_log_verbose = 1; - homa_pacer_xmit(&self->homa); - EXPECT_SUBSTR("id 4, message_length 10000, offset 0, data_length 1400", - unit_log_get()); - unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("request id 4, next_offset 1400; " - "request id 2, next_offset 0; " - "request id 6, next_offset 0", unit_log_get()); - EXPECT_EQ(50, self->homa.pacer_fifo_count); - - /* Second attempt: pacer_fifo_count reaches zero. 
*/ - atomic64_set(&self->homa.link_idle_time, 10000); - unit_log_clear(); - homa_pacer_xmit(&self->homa); - EXPECT_SUBSTR("id 2, message_length 20000, offset 0, data_length 1400", - unit_log_get()); - unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("request id 4, next_offset 1400; " - "request id 2, next_offset 1400; " - "request id 6, next_offset 0", unit_log_get()); - EXPECT_EQ(900, self->homa.pacer_fifo_count); -} -TEST_F(homa_outgoing, homa_pacer_xmit__rpc_removed_from_queue_before_locked) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, - 10000, 1000); - - homa_add_to_throttled(crpc); - self->homa.max_nic_queue_ns = 10000; - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - unit_log_clear(); - unit_hook_register(remove_throttled_hook); - hook_rpc = crpc; - hook_count = 2; - homa_pacer_xmit(&self->homa); - - EXPECT_STREQ("removing id 1234 from throttled list", unit_log_get()); - unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("", unit_log_get()); -} -TEST_F(homa_outgoing, homa_pacer_xmit__rpc_locked) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, - 5000, 1000); - - homa_add_to_throttled(crpc); - self->homa.max_nic_queue_ns = 2000; - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - unit_log_clear(); - mock_trylock_errors = ~1; - homa_pacer_xmit(&self->homa); - EXPECT_STREQ("", unit_log_get()); -#ifndef __STRIP__ /* See strip.py */ - EXPECT_EQ(1, homa_metrics_per_cpu()->pacer_skipped_rpcs); -#endif /* See strip.py */ - unit_log_clear(); - mock_trylock_errors = 0; - homa_pacer_xmit(&self->homa); - EXPECT_STREQ("xmit DATA 1400@0; xmit DATA 1400@1400", - unit_log_get()); -} -TEST_F(homa_outgoing, homa_pacer_xmit__remove_from_queue) -{ - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, 2, - 1000, 1000); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, 4, - 2000, 1000); - - homa_add_to_throttled(crpc1); - homa_add_to_throttled(crpc2); - self->homa.max_nic_queue_ns = 2000; - self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; - unit_log_clear(); - - /* First call completes id 2, but id 4 is still in the queue. */ - homa_pacer_xmit(&self->homa); - EXPECT_STREQ("xmit DATA 1000@0; xmit DATA 1400@0", - unit_log_get()); - unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("request id 4, next_offset 1400", unit_log_get()); - EXPECT_TRUE(list_empty(&crpc1->throttled_links)); - - /* Second call completes id 4, queue now empty. */ - unit_log_clear(); - self->homa.max_nic_queue_ns = 10000; - homa_pacer_xmit(&self->homa); - EXPECT_STREQ("xmit DATA 600@1400", unit_log_get()); - unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("", unit_log_get()); - EXPECT_TRUE(list_empty(&crpc2->throttled_links)); -} - -/* Don't know how to unit test homa_pacer_stop... 
*/ - -TEST_F(homa_outgoing, homa_add_to_throttled__basics) -{ - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, 2, 10000, 1000); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, 4, 5000, 1000); - struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, 6, 15000, 1000); - struct homa_rpc *crpc4 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, 8, 12000, 1000); - struct homa_rpc *crpc5 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, 10, 10000, 1000); - - /* Basics: add one RPC. */ - mock_log_wakeups = 1; - unit_log_clear(); - homa_add_to_throttled(crpc1); - EXPECT_STREQ("wake_up", unit_log_get()); - unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("request id 2, next_offset 0", unit_log_get()); - - /* Check priority ordering. */ - homa_add_to_throttled(crpc2); - homa_add_to_throttled(crpc3); - homa_add_to_throttled(crpc4); - homa_add_to_throttled(crpc5); - unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("request id 4, next_offset 0; " - "request id 2, next_offset 0; " - "request id 10, next_offset 0; " - "request id 8, next_offset 0; " - "request id 6, next_offset 0", unit_log_get()); - - /* Don't reinsert if already present. */ - unit_log_clear(); - homa_add_to_throttled(crpc1); - EXPECT_STREQ("", unit_log_get()); - unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("request id 4, next_offset 0; " - "request id 2, next_offset 0; " - "request id 10, next_offset 0; " - "request id 8, next_offset 0; " - "request id 6, next_offset 0", unit_log_get()); -} -#ifndef __STRIP__ /* See strip.py */ -TEST_F(homa_outgoing, homa_add_to_throttled__inc_metrics) -{ - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 5000, 1000); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id+2, 10000, 1000); - struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id+4, 15000, 1000); - - homa_add_to_throttled(crpc1); - EXPECT_EQ(1, homa_metrics_per_cpu()->throttle_list_adds); - EXPECT_EQ(0, homa_metrics_per_cpu()->throttle_list_checks); - - homa_add_to_throttled(crpc2); - EXPECT_EQ(2, homa_metrics_per_cpu()->throttle_list_adds); - EXPECT_EQ(1, homa_metrics_per_cpu()->throttle_list_checks); - - homa_add_to_throttled(crpc3); - EXPECT_EQ(3, homa_metrics_per_cpu()->throttle_list_adds); - EXPECT_EQ(3, homa_metrics_per_cpu()->throttle_list_checks); -} -#endif /* See strip.py */ - -TEST_F(homa_outgoing, homa_remove_from_throttled) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 5000, 1000); - - homa_add_to_throttled(crpc); - EXPECT_FALSE(list_empty(&self->homa.throttled_rpcs)); - - // First attempt will remove. - unit_log_clear(); - homa_remove_from_throttled(crpc); - EXPECT_TRUE(list_empty(&self->homa.throttled_rpcs)); - EXPECT_STREQ("removing id 1234 from throttled list", unit_log_get()); - - // Second attempt: nothing to do. 
- unit_log_clear(); - homa_remove_from_throttled(crpc); - EXPECT_TRUE(list_empty(&self->homa.throttled_rpcs)); - EXPECT_STREQ("", unit_log_get()); -} diff --git a/test/unit_homa_pacer.c b/test/unit_homa_pacer.c new file mode 100644 index 00000000..74e824cd --- /dev/null +++ b/test/unit_homa_pacer.c @@ -0,0 +1,660 @@ +// SPDX-License-Identifier: BSD-2-Clause + +#include "homa_impl.h" +#include "homa_pacer.h" +#include "homa_rpc.h" +#define KSELFTEST_NOT_MAIN 1 +#include "kselftest_harness.h" +#include "ccutils.h" +#include "mock.h" +#include "utils.h" + +static struct homa_rpc *hook_rpc; +static int hook_count; +static void unmanage_hook(char *id) { + if (strcmp(id, "spin_lock") != 0) + return; + if (hook_count <= 0) + return; + hook_count--; + if (hook_count == 0) + homa_pacer_unmanage_rpc(hook_rpc); +} + +static u64 hook_exit_ns; +static struct homa_pacer *hook_pacer; +static void exit_hook(char *id) { + mock_ns += mock_ns_tick; + if (mock_ns >= hook_exit_ns) + hook_pacer->exit = true; +} + +static void exit_idle_hook(char *id) { + if (strcmp(id, "schedule") == 0) + unit_log_printf("; ", "time %llu", mock_ns); + if (list_empty(&hook_pacer->throttled_rpcs)) + hook_pacer->exit = true; +} + +static void manage_hook(char *id) +{ + if (strcmp(id, "prepare_to_wait") == 0 && hook_rpc) { + homa_pacer_manage_rpc(hook_rpc); + hook_rpc = NULL; + } +} + +FIXTURE(homa_pacer) { + struct in6_addr client_ip[1]; + int client_port; + struct in6_addr server_ip[1]; + int server_port; + u64 client_id; + u64 server_id; + struct homa homa; + struct homa_sock hsk; +}; +FIXTURE_SETUP(homa_pacer) +{ + self->client_ip[0] = unit_get_in_addr("196.168.0.1"); + self->client_port = 40000; + self->server_ip[0] = unit_get_in_addr("1.2.3.4"); + self->server_port = 99; + self->client_id = 1234; + self->server_id = 1235; + homa_init(&self->homa, &mock_net); + mock_set_homa(&self->homa); + self->homa.pacer->ns_per_mbyte = 1000000; + self->homa.pacer->throttle_min_bytes = 0; +#ifndef __STRIP__ /* See strip.py */ + self->homa.pacer->fifo_fraction = 0; +#endif /* See strip.py */ + mock_sock_init(&self->hsk, &self->homa, self->client_port); +} +FIXTURE_TEARDOWN(homa_pacer) +{ + homa_destroy(&self->homa); + unit_teardown(); +} + +TEST_F(homa_pacer, homa_pacer_new__success) +{ + struct homa_pacer *pacer; + + pacer = homa_pacer_new(&self->homa, &mock_net); + EXPECT_FALSE(IS_ERR(pacer)); + EXPECT_EQ(&self->homa, pacer->homa); + homa_pacer_destroy(pacer); +} +TEST_F(homa_pacer, homa_pacer_new__cant_allocate_memory) +{ + struct homa_pacer *pacer; + + mock_kmalloc_errors = 1; + pacer = homa_pacer_new(&self->homa, &mock_net); + EXPECT_TRUE(IS_ERR(pacer)); + EXPECT_EQ(ENOMEM, -PTR_ERR(pacer)); +} +TEST_F(homa_pacer, homa_pacer_new__cant_create_pacer_thread) +{ + struct homa_pacer *pacer; + + mock_kthread_create_errors = 1; + pacer = homa_pacer_new(&self->homa, &mock_net); + EXPECT_TRUE(IS_ERR(pacer)); + EXPECT_EQ(EACCES, -PTR_ERR(pacer)); +} +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_pacer, homa_pacer_new__cant_register_sysctls) +{ + struct homa_pacer *pacer; + + mock_register_sysctl_errors = 1; + pacer = homa_pacer_new(&self->homa, &mock_net); + EXPECT_TRUE(IS_ERR(pacer)); + EXPECT_EQ(ENOMEM, -PTR_ERR(pacer)); +} +#endif /* See strip.py */ + +TEST_F(homa_pacer, homa_pacer_destroy__basics) +{ + struct homa_pacer *pacer; + + pacer = homa_pacer_new(&self->homa, &mock_net); + EXPECT_FALSE(IS_ERR(pacer)); + unit_log_clear(); + homa_pacer_destroy(pacer); + EXPECT_STREQ("kthread_stop", unit_log_get()); +} +TEST_F(homa_pacer, 
homa_pacer_destroy__no_thread) +{ + struct homa_pacer *pacer; + + pacer = homa_pacer_new(&self->homa, &mock_net); + EXPECT_FALSE(IS_ERR(pacer)); + pacer->kthread = NULL; + unit_log_clear(); + homa_pacer_destroy(pacer); + EXPECT_STREQ("", unit_log_get()); +} + +TEST_F(homa_pacer, homa_pacer_check_nic_q__success) +{ + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 500, 1000); + + homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; + unit_log_clear(); + atomic64_set(&self->homa.pacer->link_idle_time, 9000); + mock_ns = 8000; + self->homa.pacer->max_nic_queue_ns = 1000; + EXPECT_EQ(1, homa_pacer_check_nic_q(self->homa.pacer, + crpc->msgout.packets, false)); + EXPECT_EQ(9500, atomic64_read(&self->homa.pacer->link_idle_time)); +} +TEST_F(homa_pacer, homa_pacer_check_nic_q__queue_full) +{ + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 500, 1000); + + homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; + unit_log_clear(); + atomic64_set(&self->homa.pacer->link_idle_time, 9000); + mock_ns = 7999; + self->homa.pacer->max_nic_queue_ns = 1000; + EXPECT_EQ(0, homa_pacer_check_nic_q(self->homa.pacer, + crpc->msgout.packets, false)); + EXPECT_EQ(9000, atomic64_read(&self->homa.pacer->link_idle_time)); +} +TEST_F(homa_pacer, homa_pacer_check_nic_q__queue_full_but_force) +{ + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 500, 1000); + + homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; + unit_log_clear(); + atomic64_set(&self->homa.pacer->link_idle_time, 9000); + mock_ns = 7999; + self->homa.pacer->max_nic_queue_ns = 1000; + EXPECT_EQ(1, homa_pacer_check_nic_q(self->homa.pacer, + crpc->msgout.packets, true)); + EXPECT_EQ(9500, atomic64_read(&self->homa.pacer->link_idle_time)); +} +TEST_F(homa_pacer, homa_pacer_check_nic_q__pacer_metrics) +{ + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 500, 1000); + + homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; + homa_pacer_manage_rpc(crpc); + unit_log_clear(); + atomic64_set(&self->homa.pacer->link_idle_time, 9000); + self->homa.pacer->wake_time = 9800; + mock_ns = 10000; + self->homa.pacer->max_nic_queue_ns = 1000; + EXPECT_EQ(1, homa_pacer_check_nic_q(self->homa.pacer, + crpc->msgout.packets, true)); + EXPECT_EQ(10500, atomic64_read(&self->homa.pacer->link_idle_time)); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(500, homa_metrics_per_cpu()->pacer_bytes); + EXPECT_EQ(200, homa_metrics_per_cpu()->pacer_lost_ns); +#endif /* See strip.py */ +} +TEST_F(homa_pacer, homa_pacer_check_nic_q__queue_empty) +{ + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 500, 1000); + + homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; + unit_log_clear(); + atomic64_set(&self->homa.pacer->link_idle_time, 9000); + mock_ns = 10000; + self->homa.pacer->max_nic_queue_ns = 1000; + EXPECT_EQ(1, homa_pacer_check_nic_q(self->homa.pacer, + crpc->msgout.packets, true)); + EXPECT_EQ(10500, atomic64_read(&self->homa.pacer->link_idle_time)); +} + +TEST_F(homa_pacer, homa_pacer_main__exit) +{ + unit_hook_register(exit_hook); + hook_pacer = 
self->homa.pacer; + hook_exit_ns = 5000; + mock_ns_tick = 200; + homa_pacer_main(self->homa.pacer); + EXPECT_TRUE(mock_ns >= 5000); +} +TEST_F(homa_pacer, homa_pacer_main__xmit_data) +{ + struct homa_rpc *crpc1, *crpc2; + + crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 5000, 1000); + crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id+2, 10000, 1000); + + homa_pacer_manage_rpc(crpc1); + homa_pacer_manage_rpc(crpc2); + self->homa.pacer->max_nic_queue_ns = 3000; + mock_ns_tick = 200; + unit_hook_register(exit_idle_hook); + hook_pacer = self->homa.pacer; + unit_log_clear(); + homa_pacer_main(self->homa.pacer); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_STREQ("xmit DATA 1400@0; " + "xmit DATA 1400@1400; " + "xmit DATA 1400@2800; time 1600; time 2200; " + "xmit DATA 800@4200; " + "removing id 1234 from throttled list; time 3200; " + "xmit DATA 1400@0; time 4400; " + "xmit DATA 1400@1400; time 5600; time 6200; " + "xmit DATA 1400@2800; time 7400; " + "xmit DATA 1400@4200; time 8600; time 9200; " + "xmit DATA 1400@5600; time 10400; time 11000; " + "xmit DATA 1400@7000; time 12200; " + "xmit DATA 1400@8400; time 13400; time 14000; " + "xmit DATA 200@9800; " + "removing id 1236 from throttled list", + unit_log_get()); +#endif /* See strip.py */ +} +TEST_F(homa_pacer, homa_pacer_main__rpc_arrives_while_sleeping) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, + 5000, 1000); + + unit_hook_register(exit_hook); + hook_pacer = self->homa.pacer; + hook_exit_ns = 5000; + mock_ns_tick = 200; + unit_hook_register(manage_hook); + hook_rpc = crpc; + self->homa.pacer->max_nic_queue_ns = 2000; + + unit_log_clear(); + homa_pacer_main(self->homa.pacer); + EXPECT_STREQ("xmit DATA 1400@0; xmit DATA 1400@1400; xmit DATA 1400@2800", + unit_log_get()); +} + +TEST_F(homa_pacer, homa_pacer_xmit__basics) +{ + struct homa_rpc *crpc1, *crpc2, *crpc3; + + crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 5000, 1000); + crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id+2, 10000, 1000); + crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id+4, 150000, 1000); + + homa_pacer_manage_rpc(crpc1); + homa_pacer_manage_rpc(crpc2); + homa_pacer_manage_rpc(crpc3); + self->homa.pacer->max_nic_queue_ns = 2000; + unit_log_clear(); + homa_pacer_xmit(self->homa.pacer); + EXPECT_STREQ("xmit DATA 1400@0; xmit DATA 1400@1400", + unit_log_get()); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("request id 1234, next_offset 2800; " + "request id 1236, next_offset 0; " + "request id 1238, next_offset 0", unit_log_get()); +} +TEST_F(homa_pacer, homa_pacer_xmit__pacer_already_active) +{ + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 10000, 1000); + + homa_pacer_manage_rpc(crpc); + self->homa.pacer->max_nic_queue_ns = 2000; + mock_trylock_errors = 1; + unit_log_clear(); + homa_pacer_xmit(self->homa.pacer); + EXPECT_STREQ("", unit_log_get()); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("request id 1234, next_offset 0", unit_log_get()); 
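+
+	/* Note: the pacer mutex is only ever acquired with trylock
+	 * ("try" mode), so a busy mutex (simulated here via
+	 * mock_trylock_errors) just makes homa_pacer_xmit return
+	 * without transmitting anything.
+	 */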
+} +TEST_F(homa_pacer, homa_pacer_xmit__nic_queue_fills) +{ + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 10000, 1000); + + homa_pacer_manage_rpc(crpc); + self->homa.pacer->max_nic_queue_ns = 2001; + mock_ns = 10000; + atomic64_set(&self->homa.pacer->link_idle_time, 12000); + unit_log_clear(); + homa_pacer_xmit(self->homa.pacer); + + /* Just room for one packet before NIC queue fills. */ + EXPECT_STREQ("xmit DATA 1400@0", unit_log_get()); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("request id 1234, next_offset 1400", unit_log_get()); +} +TEST_F(homa_pacer, homa_pacer_xmit__queue_empty) +{ + self->homa.pacer->max_nic_queue_ns = 2000; + unit_log_clear(); + homa_pacer_xmit(self->homa.pacer); + unit_log_throttled(&self->homa); + EXPECT_STREQ("", unit_log_get()); +} +TEST_F(homa_pacer, homa_pacer_xmit__xmit_fifo) +{ + struct homa_rpc *crpc1, *crpc2, *crpc3; + + mock_ns = 10000; + crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, 2, + 20000, 1000); + mock_ns = 11000; + crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, 4, + 10000, 1000); + mock_ns = 12000; + crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, 6, + 30000, 1000); + homa_pacer_manage_rpc(crpc1); + homa_pacer_manage_rpc(crpc2); + homa_pacer_manage_rpc(crpc3); + + /* First attempt: pacer->fifo_count doesn't reach zero. */ + self->homa.pacer->max_nic_queue_ns = 1300; + self->homa.pacer->fifo_count = 200; + self->homa.pacer->fifo_fraction = 150; + mock_ns= 13000; + atomic64_set(&self->homa.pacer->link_idle_time, 10000); + unit_log_clear(); + mock_xmit_log_verbose = 1; + homa_pacer_xmit(self->homa.pacer); + EXPECT_SUBSTR("id 4, message_length 10000, offset 0, data_length 1400", + unit_log_get()); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("request id 4, next_offset 1400; " + "request id 2, next_offset 0; " + "request id 6, next_offset 0", unit_log_get()); + EXPECT_EQ(50, self->homa.pacer->fifo_count); + + /* Second attempt: pacer->fifo_count reaches zero. 
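+	 * (fifo_count drops by fifo_fraction per transmission:
+	 * 200 - 150 = 50, then 50 - 150 goes negative, 1000 is added
+	 * back to leave 900, and the oldest message, id 2, is sent.)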
*/ + atomic64_set(&self->homa.pacer->link_idle_time, 10000); + unit_log_clear(); + homa_pacer_xmit(self->homa.pacer); + EXPECT_SUBSTR("id 2, message_length 20000, offset 0, data_length 1400", + unit_log_get()); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("request id 4, next_offset 1400; " + "request id 2, next_offset 1400; " + "request id 6, next_offset 0", unit_log_get()); + EXPECT_EQ(900, self->homa.pacer->fifo_count); +} +TEST_F(homa_pacer, homa_pacer_xmit__rpc_removed_from_queue_before_locked) +{ + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 10000, 1000); + + homa_pacer_manage_rpc(crpc); + self->homa.pacer->max_nic_queue_ns = 10000; + unit_log_clear(); + unit_hook_register(unmanage_hook); + hook_rpc = crpc; + hook_count = 2; + homa_pacer_xmit(self->homa.pacer); + + EXPECT_STREQ("removing id 1234 from throttled list", unit_log_get()); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("", unit_log_get()); +} +TEST_F(homa_pacer, homa_pacer_xmit__rpc_locked) +{ + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 5000, 1000); + + homa_pacer_manage_rpc(crpc); + self->homa.pacer->max_nic_queue_ns = 2000; + unit_log_clear(); + mock_trylock_errors = ~1; + homa_pacer_xmit(self->homa.pacer); + EXPECT_STREQ("", unit_log_get()); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(1, homa_metrics_per_cpu()->pacer_skipped_rpcs); +#endif /* See strip.py */ + unit_log_clear(); + mock_trylock_errors = 0; + homa_pacer_xmit(self->homa.pacer); + EXPECT_STREQ("xmit DATA 1400@0; xmit DATA 1400@1400", + unit_log_get()); +} +TEST_F(homa_pacer, homa_pacer_xmit__remove_from_queue) +{ + struct homa_rpc *crpc1, *crpc2; + + crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, 2, + 1000, 1000); + crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, 4, + 2000, 1000); + + homa_pacer_manage_rpc(crpc1); + homa_pacer_manage_rpc(crpc2); + self->homa.pacer->max_nic_queue_ns = 2000; + unit_log_clear(); + + /* First call completes id 2, but id 4 is still in the queue. */ + homa_pacer_xmit(self->homa.pacer); + EXPECT_STREQ("xmit DATA 1000@0; removing id 2 from throttled list; " + "xmit DATA 1400@0", unit_log_get()); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("request id 4, next_offset 1400", unit_log_get()); + EXPECT_TRUE(list_empty(&crpc1->throttled_links)); + + /* Second call completes id 4, queue now empty. 
*/ + unit_log_clear(); + self->homa.pacer->max_nic_queue_ns = 10000; + homa_pacer_xmit(self->homa.pacer); + EXPECT_STREQ("xmit DATA 600@1400; removing id 4 from throttled list", + unit_log_get()); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("", unit_log_get()); + EXPECT_TRUE(list_empty(&crpc2->throttled_links)); +} + +TEST_F(homa_pacer, homa_pacer_manage_rpc__basics) +{ + struct homa_rpc *crpc1, *crpc2, *crpc3, *crpc4, *crpc5; + + crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, 2, 10000, + 1000); + crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, 4, 5000, + 1000); + crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, 6, 15000, + 1000); + crpc4 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, 8, 12000, + 1000); + crpc5 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, 10, 10000, + 1000); + + /* Basics: add one RPC. */ + mock_log_wakeups = 1; + unit_log_clear(); + homa_pacer_manage_rpc(crpc1); + EXPECT_STREQ("wake_up", unit_log_get()); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("request id 2, next_offset 0", unit_log_get()); + + /* Check priority ordering. */ + homa_pacer_manage_rpc(crpc2); + homa_pacer_manage_rpc(crpc3); + homa_pacer_manage_rpc(crpc4); + homa_pacer_manage_rpc(crpc5); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("request id 4, next_offset 0; " + "request id 2, next_offset 0; " + "request id 10, next_offset 0; " + "request id 8, next_offset 0; " + "request id 6, next_offset 0", unit_log_get()); + + /* Don't reinsert if already present. 
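+	 * (homa_pacer_manage_rpc detects this because the RPC's
+	 * throttled_links list is already non-empty.)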
*/ + unit_log_clear(); + homa_pacer_manage_rpc(crpc1); + EXPECT_STREQ("", unit_log_get()); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("request id 4, next_offset 0; " + "request id 2, next_offset 0; " + "request id 10, next_offset 0; " + "request id 8, next_offset 0; " + "request id 6, next_offset 0", unit_log_get()); +} +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_pacer, homa_pacer_manage_rpc__inc_metrics) +{ + struct homa_rpc *crpc1, *crpc2, *crpc3; + + crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 5000, 1000); + crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id+2, 10000, 1000); + crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id+4, 15000, 1000); + + homa_pacer_manage_rpc(crpc1); + EXPECT_EQ(1, homa_metrics_per_cpu()->throttle_list_adds); + EXPECT_EQ(0, homa_metrics_per_cpu()->throttle_list_checks); + + homa_pacer_manage_rpc(crpc2); + EXPECT_EQ(2, homa_metrics_per_cpu()->throttle_list_adds); + EXPECT_EQ(1, homa_metrics_per_cpu()->throttle_list_checks); + + homa_pacer_manage_rpc(crpc3); + EXPECT_EQ(3, homa_metrics_per_cpu()->throttle_list_adds); + EXPECT_EQ(3, homa_metrics_per_cpu()->throttle_list_checks); +} +#endif /* See strip.py */ + +TEST_F(homa_pacer, homa_pacer_unmanage_rpc__basics) +{ + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 5000, 1000); + + homa_pacer_manage_rpc(crpc); + EXPECT_FALSE(list_empty(&self->homa.pacer->throttled_rpcs)); + + // First attempt will remove. + unit_log_clear(); + homa_pacer_unmanage_rpc(crpc); + EXPECT_TRUE(list_empty(&self->homa.pacer->throttled_rpcs)); + EXPECT_STREQ("removing id 1234 from throttled list", unit_log_get()); + + // Second attempt: nothing to do. 
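+	// (homa_pacer_unmanage_rpc must be idempotent: homa_rpc_end
+	// calls it for every RPC, throttled or not.)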
+ unit_log_clear(); + homa_pacer_unmanage_rpc(crpc); + EXPECT_TRUE(list_empty(&self->homa.pacer->throttled_rpcs)); + EXPECT_STREQ("", unit_log_get()); +} +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_pacer, homa_pacer_unmanage_rpc__metrics) +{ + struct homa_rpc *crpc1, *crpc2; + + crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 5000, 1000); + crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id+2, 5000, 1000); + + mock_ns = 1000; + homa_pacer_manage_rpc(crpc1); + EXPECT_EQ(1000, self->homa.pacer->throttle_add); + EXPECT_EQ(0, homa_metrics_per_cpu()->throttled_ns); + + mock_ns = 3000; + homa_pacer_manage_rpc(crpc2); + EXPECT_EQ(3000, self->homa.pacer->throttle_add); + EXPECT_EQ(2000, homa_metrics_per_cpu()->throttled_ns); + + mock_ns = 7000; + homa_pacer_unmanage_rpc(crpc1); + EXPECT_EQ(3000, self->homa.pacer->throttle_add); + EXPECT_EQ(2000, homa_metrics_per_cpu()->throttled_ns); + + mock_ns = 8000; + homa_pacer_unmanage_rpc(crpc2); + EXPECT_EQ(3000, self->homa.pacer->throttle_add); + EXPECT_EQ(7000, homa_metrics_per_cpu()->throttled_ns); +} +#endif /* See strip.py */ + +TEST_F(homa_pacer, homa_pacer_update_sysctl_deps) +{ + self->homa.pacer->link_mbps = 10000; + homa_pacer_update_sysctl_deps(self->homa.pacer); + EXPECT_EQ(808000, self->homa.pacer->ns_per_mbyte); + + self->homa.pacer->link_mbps = 1000; + homa_pacer_update_sysctl_deps(self->homa.pacer); + EXPECT_EQ(8080000, self->homa.pacer->ns_per_mbyte); + + self->homa.pacer->link_mbps = 40000; + homa_pacer_update_sysctl_deps(self->homa.pacer); + EXPECT_EQ(202000, self->homa.pacer->ns_per_mbyte); +} \ No newline at end of file diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index c8caa257..0e29b56c 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -45,7 +45,7 @@ FIXTURE(homa_peer) { }; FIXTURE_SETUP(homa_peer) { - homa_init(&self->homa); + homa_init(&self->homa, &mock_net); mock_set_homa(&self->homa); mock_sock_init(&self->hsk, &self->homa, 0); homa_peertab_init(&self->peertab); diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index b4498a89..91d23d75 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -55,7 +55,7 @@ FIXTURE_SETUP(homa_plumbing) self->client_addr.in6.sin6_port = htons(self->client_port); self->server_addr.in6.sin6_addr = self->server_ip[0]; self->server_addr.in6.sin6_port = htons(self->server_port); - homa_init(&self->homa); + homa_init(&self->homa, &mock_net); if (self->homa.wmem_max == 0) printf("homa_plumbing fixture found wmem_max 0\n"); mock_set_homa(&self->homa); diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index 954642de..f4649db8 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -18,7 +18,7 @@ FIXTURE(homa_pool) { }; FIXTURE_SETUP(homa_pool) { - homa_init(&self->homa); + homa_init(&self->homa, &mock_net); mock_set_homa(&self->homa); #ifndef __STRIP__ /* See strip.py */ self->homa.unsched_bytes = 10000; diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 95ffe91e..b1e2390d 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" +#include "homa_pacer.h" #include "homa_peer.h" #include "homa_pool.h" #define KSELFTEST_NOT_MAIN 1 @@ -66,7 +67,7 @@ FIXTURE_SETUP(homa_rpc) self->server_addr.in6.sin6_family = AF_INET; self->server_addr.in6.sin6_addr = *self->server_ip; 
self->server_addr.in6.sin6_port = htons(self->server_port); - homa_init(&self->homa); + homa_init(&self->homa, &mock_net); mock_set_homa(&self->homa); #ifndef __STRIP__ /* See strip.py */ self->homa.unsched_bytes = 10000; @@ -509,11 +510,11 @@ TEST_F(homa_rpc, homa_rpc_end__remove_from_throttled_list) UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 10000, 1000); - homa_add_to_throttled(crpc); - EXPECT_EQ(1, unit_list_length(&self->homa.throttled_rpcs)); + homa_pacer_manage_rpc(crpc); + EXPECT_EQ(1, unit_list_length(&self->homa.pacer->throttled_rpcs)); unit_log_clear(); homa_rpc_end(crpc); - EXPECT_EQ(0, unit_list_length(&self->homa.throttled_rpcs)); + EXPECT_EQ(0, unit_list_length(&self->homa.pacer->throttled_rpcs)); } TEST_F(homa_rpc, homa_rpc_reap__basics) diff --git a/test/unit_homa_skb.c b/test/unit_homa_skb.c index 620f53c7..d2a9f6a5 100644 --- a/test/unit_homa_skb.c +++ b/test/unit_homa_skb.c @@ -80,7 +80,7 @@ FIXTURE(homa_skb) { }; FIXTURE_SETUP(homa_skb) { - homa_init(&self->homa); + homa_init(&self->homa, &mock_net); mock_set_homa(&self->homa); self->skb = alloc_skb_fclone(200, GFP_KERNEL); if (!self->skb) diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c index 26db62c1..a13d7c58 100644 --- a/test/unit_homa_sock.c +++ b/test/unit_homa_sock.c @@ -38,7 +38,7 @@ FIXTURE(homa_sock) { }; FIXTURE_SETUP(homa_sock) { - homa_init(&self->homa); + homa_init(&self->homa, &mock_net); mock_set_homa(&self->homa); mock_sock_init(&self->hsk, &self->homa, 0); self->client_ip[0] = unit_get_in_addr("196.168.0.1"); @@ -66,7 +66,7 @@ TEST_F(homa_sock, homa_socktab_start_scan) struct homa_socktab_scan scan; homa_destroy(&self->homa); - homa_init(&self->homa); + homa_init(&self->homa, &mock_net); mock_set_homa(&self->homa); mock_sock_init(&self->hsk, &self->homa, HOMA_MIN_DEFAULT_PORT+100); EXPECT_EQ(&self->hsk, homa_socktab_start_scan(self->homa.port_map, @@ -83,7 +83,7 @@ TEST_F(homa_sock, homa_socktab_next) int first_port = 34000; homa_destroy(&self->homa); - homa_init(&self->homa); + homa_init(&self->homa, &mock_net); mock_set_homa(&self->homa); mock_sock_init(&hsk1, &self->homa, first_port); mock_sock_init(&hsk2, &self->homa, first_port+HOMA_SOCKTAB_BUCKETS); @@ -116,7 +116,7 @@ TEST_F(homa_sock, homa_socktab_end_scan) struct homa_socktab_scan scan1, scan2, scan3; homa_destroy(&self->homa); - homa_init(&self->homa); + homa_init(&self->homa, &mock_net); mock_set_homa(&self->homa); mock_sock_init(&self->hsk, &self->homa, HOMA_MIN_DEFAULT_PORT+100); homa_socktab_start_scan(self->homa.port_map, &scan1); diff --git a/test/unit_homa_timer.c b/test/unit_homa_timer.c index 8a1d1868..16d6181e 100644 --- a/test/unit_homa_timer.c +++ b/test/unit_homa_timer.c @@ -31,7 +31,7 @@ FIXTURE_SETUP(homa_timer) self->server_addr.in6.sin6_family = AF_INET; self->server_addr.in6.sin6_addr = *self->server_ip; self->server_addr.in6.sin6_port = htons(self->server_port); - homa_init(&self->homa); + homa_init(&self->homa, &mock_net); mock_set_homa(&self->homa); self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; self->homa.resend_ticks = 2; diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index aa748f24..6503a06a 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -15,7 +15,7 @@ FIXTURE(homa_utils) { }; FIXTURE_SETUP(homa_utils) { - homa_init(&self->homa); + homa_init(&self->homa, &mock_net); mock_set_homa(&self->homa); unit_log_clear(); } @@ -59,7 +59,7 @@ TEST_F(homa_utils, homa_init__kmalloc_failure_for_port_map) memset(&homa2, 0, sizeof(homa2)); 
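	/* mock_kmalloc_errors is evidently a bit mask: bit N forces the
	 * (N+1)th kmalloc call to fail, selecting which allocation inside
	 * homa_init breaks. The doubled masks in the tests below track an
	 * extra allocation that homa_init now performs earlier.
	 */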
mock_kmalloc_errors = 1; - EXPECT_EQ(ENOMEM, -homa_init(&homa2)); + EXPECT_EQ(ENOMEM, -homa_init(&homa2, &mock_net)); EXPECT_EQ(NULL, homa2.port_map); homa_destroy(&homa2); } @@ -68,8 +68,8 @@ TEST_F(homa_utils, homa_init__kmalloc_failure_for_peers) struct homa homa2; memset(&homa2, 0, sizeof(homa2)); - mock_kmalloc_errors = 2; - EXPECT_EQ(ENOMEM, -homa_init(&homa2)); + mock_kmalloc_errors = 4; + EXPECT_EQ(ENOMEM, -homa_init(&homa2, &mock_net)); EXPECT_NE(NULL, homa2.port_map); EXPECT_EQ(NULL, homa2.peers); homa_destroy(&homa2); @@ -80,23 +80,13 @@ TEST_F(homa_utils, homa_init__homa_skb_init_failure) struct homa homa2; memset(&homa2, 0, sizeof(homa2)); - mock_kmalloc_errors = 4; - EXPECT_EQ(ENOMEM, -homa_init(&homa2)); + mock_kmalloc_errors = 8; + EXPECT_EQ(ENOMEM, -homa_init(&homa2, &mock_net)); EXPECT_SUBSTR("Couldn't initialize skb management (errno 12)", mock_printk_output); homa_destroy(&homa2); } #endif /* See strip.py */ -TEST_F(homa_utils, homa_init__cant_create_pacer_thread) -{ - struct homa homa2; - - memset(&homa2, 0, sizeof(homa2)); - mock_kthread_create_errors = 1; - EXPECT_EQ(EACCES, -homa_init(&homa2)); - EXPECT_EQ(NULL, homa2.pacer_kthread); - homa_destroy(&homa2); -} #ifndef __STRIP__ /* See strip.py */ TEST_F(homa_utils, homa_print_ipv4_addr) diff --git a/test/utils.c b/test/utils.c index d1e9e8b5..4fdfeec9 100644 --- a/test/utils.c +++ b/test/utils.c @@ -5,6 +5,7 @@ */ #include "homa_impl.h" +#include "homa_pacer.h" #include "homa_peer.h" #include "homa_rpc.h" #include "ccutils.h" @@ -299,7 +300,7 @@ void unit_log_throttled(struct homa *homa) { struct homa_rpc *rpc; - list_for_each_entry_rcu(rpc, &homa->throttled_rpcs, throttled_links) { + list_for_each_entry_rcu(rpc, &homa->pacer->throttled_rpcs, throttled_links) { unit_log_printf("; ", "%s id %llu, next_offset %d", homa_is_client(rpc->id) ? "request" : "response", rpc->id, From 5b6b3a57d7ffa49df4efa2328f26a9c65688b49e Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 7 Apr 2025 11:28:18 -0700 Subject: [PATCH 248/625] Minor addition to cloudlab/gdbinit --- cloudlab/gdbinit | 1 + 1 file changed, 1 insertion(+) diff --git a/cloudlab/gdbinit b/cloudlab/gdbinit index dc4bcea5..22b8b08a 100644 --- a/cloudlab/gdbinit +++ b/cloudlab/gdbinit @@ -1 +1,2 @@ set style address foreground green +set debuginfod enabled off From c088e6d22da5b31fe6965a9678644f2a2399b741 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 11 Apr 2025 15:13:08 -0700 Subject: [PATCH 249/625] Remove '^M' sequences in ttsyslog.py input --- util/ttsyslog.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/util/ttsyslog.py b/util/ttsyslog.py index 4dc97f5c..0b212dcb 100755 --- a/util/ttsyslog.py +++ b/util/ttsyslog.py @@ -37,6 +37,10 @@ f = open(sys.argv[1]) for line in f: + line = line.rstrip() + if line.endswith('^M'): + line = line[:-2] + # Ignore everything up until the initial line containing the clock speed. 
if cpu_ghz == None: match = re.match('.*cpu_khz: ([0-9.]+)', line) From 6ddd9065540e107ba14345e24d2c2c75df92652b Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 11 Apr 2025 16:14:07 -0700 Subject: [PATCH 250/625] Add consistency-checking functions to homa_devel.c --- homa_devel.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++ homa_devel.h | 2 ++ 2 files changed, 55 insertions(+) diff --git a/homa_devel.c b/homa_devel.c index 1ebf54df..04980482 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -546,3 +546,56 @@ void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, char *format) } } #endif /* See strip.py */ + +/** + * homa_check_addr() - Verify that an address falls within the allowable + * range for kernel data. If not, crash the kernel. + * @p: Address to check. + */ +void homa_check_addr(void *p) +{ + uintptr_t addr = (uintptr_t)p; + + if ((addr & 0xffff800000000000) != 0xffff800000000000) { + pr_err("homa_check_addr received bogus address 0x%lx\n", addr); + tt_dbg1("foo"); + BUG_ON(1); + } +} + +/** + * homa_check_list() - Scan a list to make sure its pointer structure is + * not corrupted and that its length is bounded. Crashes the kernel if + * a problem is found. + * @list: Head of list to scan. + * @max_length: If the list has more than this many elements, it is + * assumed to have an internal loop. + */ +void homa_check_list(struct list_head *list, int max_length) +{ + struct list_head *p, *prev; + int num_elems; + + homa_check_addr(list->next); + homa_check_addr(list->prev); + prev = list; + for (p = list->next, num_elems = 0; ; p = p->next, num_elems++) { + if (p->prev != prev) { + pr_err("homa_check_list found bogus list structure: p->prev 0x%px, prev 0x%px\n", + p->prev, prev); + tt_dbg1("foo"); + BUG_ON(1); + } + if (p == list) + break; + if (num_elems > max_length) { + pr_err("homa_check_list found list with > %d elements\n", + max_length); + tt_dbg1("foo"); + BUG_ON(1); + } + homa_check_addr(p->next); + homa_check_addr(p->prev); + prev = p; + } +} diff --git a/homa_devel.h b/homa_devel.h index a09bcdcc..ee4bf716 100644 --- a/homa_devel.h +++ b/homa_devel.h @@ -72,6 +72,8 @@ static inline void check_addr_valid(void *addr, char *info) #define IF_NO_STRIP(...) #endif /* See strip.py */ +void homa_check_addr(void *p); +void homa_check_list(struct list_head *list, int max_length); void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, char *format); void homa_freeze_peers(struct homa *homa); From 76cd70346e26931ca16eefb72720fb35e85de18e Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 11 Apr 2025 16:22:17 -0700 Subject: [PATCH 251/625] Fix race in homa_grant.c homa_grant_add_rpc was being invoked without holding the rpc lock. Changed locking structure to fix this (homa_grant_check_rpc must now be invoked with the lock held). --- homa_grant.c | 16 +++++++++++++--- homa_incoming.c | 8 +------- homa_plumbing.c | 2 +- homa_pool.c | 6 +----- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 299c1eee..b8440190 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -60,6 +60,7 @@ void homa_grant_update_incoming(struct homa_rpc *rpc, struct homa *homa) * @rpc: The RPC to add/reposition. Must be locked by caller. */ void homa_grant_add_rpc(struct homa_rpc *rpc) + __must_hold(&rpc->bucket->lock) { struct homa *homa = rpc->hsk->homa; struct homa_peer *peer = rpc->peer; @@ -155,6 +156,7 @@ void homa_grant_add_rpc(struct homa_rpc *rpc) * a grantable list. Must be locked by caller. 
*/ void homa_grant_remove_rpc(struct homa_rpc *rpc) + __must_hold(&rpc->bucket->lock) { struct homa *homa = rpc->hsk->homa; struct homa_peer *peer = rpc->peer; @@ -303,10 +305,10 @@ int homa_grant_try_send(struct homa_rpc *rpc, struct homa *homa) * RPC relative to outgoing grants and takes any appropriate actions that * are needed (such as adding the RPC to the grantable list or sending * grants for this or other RPCs). - * @rpc: RPC to check. Must not be locked by the caller, but caller - * must own a reference. + * @rpc: RPC to check. Must be locked by the caller. */ void homa_grant_check_rpc(struct homa_rpc *rpc) + __must_hold(&rpc->bucket->lock) { /* Overall design note: * The grantable lock has proven to be a performance bottleneck, @@ -353,7 +355,11 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) goto done; } - /* Not a new message; see if we can upgrade the message's priority. */ + /* Not a new message; see if we can upgrade the message's priority. + * This accesses data that might be changing concurrently (e.g. + * active_rpcs), but it should be safe: the worst that can happen + * is extra calls to homa_grant_recalc. + */ rank = atomic_read(&rpc->msgin.rank); if (homa->active_rpcs[rank] != rpc) { /* RPC not currently active. */ @@ -384,7 +390,11 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) goto done; recalc: + homa_rpc_hold(rpc); + homa_rpc_unlock(rpc); homa_grant_recalc(homa); + homa_rpc_lock(rpc); + homa_rpc_put(rpc); done: tt_record1("homa_grant_check_rpc finished with id %d", rpc->id); diff --git a/homa_incoming.c b/homa_incoming.c index eda30052..9f958313 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -588,14 +588,8 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) kfree_skb(skb); } if (rpc) { -#ifndef __STRIP__ /* See strip.py */ - homa_rpc_hold(rpc); - homa_rpc_unlock(rpc); - homa_grant_check_rpc(rpc); - homa_rpc_put(rpc); -#else /* See strip.py */ + IF_NO_STRIP(homa_grant_check_rpc(rpc)); homa_rpc_unlock(rpc); -#endif /* See strip.py */ } while (num_acks > 0) { diff --git a/homa_plumbing.c b/homa_plumbing.c index 66a6c43b..8f5085de 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -483,7 +483,7 @@ int __init homa_load(void) { int status; - pr_notice("Homa module loading\n"); + pr_err("Homa module loading\n"); #ifndef __STRIP__ /* See strip.py */ pr_notice("Homa structure sizes: homa_data_hdr %u, homa_seg_hdr %u, ack %u, peer %u, ip_hdr %u flowi %u ipv6_hdr %u, flowi6 %u tcp_sock %u homa_rpc %u sk_buff %u rcvmsg_control %u union sockaddr_in_union %u HOMA_MAX_BPAGES %u NR_CPUS %u nr_cpu_ids %u, MAX_NUMNODES %d\n", sizeof32(struct homa_data_hdr), diff --git a/homa_pool.c b/homa_pool.c index d69eb69c..50b18b2f 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -495,13 +495,9 @@ void homa_pool_check_waiting(struct homa_pool *pool) if (rpc->msgin.num_bpages > 0) { /* Allocation succeeded; "wake up" the RPC. */ rpc->msgin.resend_all = 1; - homa_rpc_hold(rpc); - homa_rpc_unlock(rpc); homa_grant_check_rpc(rpc); - homa_rpc_put(rpc); - } else { - homa_rpc_unlock(rpc); } + homa_rpc_unlock(rpc); #else /* See strip.py */ homa_rpc_unlock(rpc); #endif /* See strip.py */ From 3df96a42fc28bf1bd4e9e51cce5b64c648018c1d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 16 Apr 2025 09:29:24 -0700 Subject: [PATCH 252/625] Another refactor of homa_grant.c to fix races However, there are still kernel crashes even after this fix. It's time to give up on this approach and try a new approach that is more obviously correct. 
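Both of the race fixes above revolve around one idiom: an RPC's bucket lock cannot be held while acquiring the global grant state, so the RPC is pinned with a reference, unlocked, and relocked afterward. A minimal sketch of the idiom in isolation (an editor's illustration built from the helpers visible in these diffs, not code from either commit):

/* Sketch: call homa_grant_recalc (which may take global grant state
 * locks) from a context that currently holds the RPC's lock. The
 * reference keeps the RPC from being reaped while its lock is dropped.
 */
static void recalc_with_rpc_locked(struct homa_rpc *rpc, struct homa *homa)
{
	homa_rpc_hold(rpc);	/* Pin the RPC. */
	homa_rpc_unlock(rpc);	/* RPC lock can't be held during recalc. */
	homa_grant_recalc(homa);
	homa_rpc_lock(rpc);	/* Caller expects to still hold the lock. */
	homa_rpc_put(rpc);	/* Drop the pinning reference. */
}

Callers must assume that any RPC state may have changed while the lock was dropped; the remaining crashes described in the commit message above suggest how easy that is to get wrong.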
--- homa_grant.c | 183 ++++++++++++++++++++++++-----------------
 homa_grant.h | 4 +-
 homa_incoming.c | 2 +
 homa_interest.h | 1 +
 homa_plumbing.c | 5 +-
 homa_rpc.c | 14 ++--
 homa_rpc.h | 2 +-
 homa_sock.c | 1 +
 homa_timer.c | 5 +-
 notes.txt | 72 ++++++++++++++++
 test/unit_homa_grant.c | 135 ++++++++++++------------------
 timetrace.c | 46 ++++++++++-
 12 files changed, 295 insertions(+), 175 deletions(-)

diff --git a/homa_grant.c b/homa_grant.c
index b8440190..d9ab5d74 100644
--- a/homa_grant.c
+++ b/homa_grant.c
@@ -10,6 +10,27 @@
 #include "homa_rpc.h"
 #include "homa_wire.h"
 
+/* Design Notes:
+ * This file is pretty complicated because of locking issues. Recalculating
+ * the priorities for granting requires @homa->grantable_lock, which is
+ * global. Priorities can potentially change every time a data packet
+ * arrives, but acquiring the global lock for each data packet would result
+ * in unacceptable contention (this was tried in earlier versions). The
+ * approach used here is to separate per-data-packet operations
+ * (homa_grant_check_rpc) from the full priority recalculation
+ * (homa_grant_recalc). Hopefully most calls to homa_grant_check_rpc can be
+ * handled without calling homa_grant_recalc. It has been challenging to
+ * implement this safely and there are a few races; as one example, information
+ * may change while homa_grant_get_offset is using it. However, I believe
+ * that this race is "safe" (the worst that will happen is sending out a grant
+ * with an incorrect offset, which has only minor performance consequences).
+ *
+ * Another overall requirement for the file is not to hold locks (either
+ * RPC locks or @homa->grantable_lock) when actually sending grants. This
+ * is because packet transmission takes a long time, so holding a lock
+ * could result in unacceptable contention.
+ */
+
 /**
  * homa_grant_outranks() - Returns nonzero if rpc1 should be considered
  * higher priority for grants than rpc2, and zero if the two RPCs are
@@ -34,10 +55,11 @@
  * for an RPC (i.e., data that has been granted but not yet received) and make
  * sure this is properly reflected in rpc->msgin.incoming
  * and homa->total_incoming.
- * @rpc: RPC to check; need not be locked.
+ * @rpc: RPC to check; must be locked.
  * @homa: Overall information about the Homa transport.
  */
 void homa_grant_update_incoming(struct homa_rpc *rpc, struct homa *homa)
+	__must_hold(&rpc->bucket->lock)
 {
 	int incoming, delta;
 
@@ -45,11 +67,10 @@ void homa_grant_update_incoming(struct homa_rpc *rpc, struct homa *homa)
 			rpc->msgin.bytes_remaining);
 	if (incoming < 0)
 		incoming = 0;
-	delta = incoming - atomic_read(&rpc->msgin.rec_incoming);
-	if (delta != 0) {
-		atomic_add(delta, &rpc->msgin.rec_incoming);
+	delta = incoming - rpc->msgin.rec_incoming;
+	if (delta != 0)
 		atomic_add(delta, &homa->total_incoming);
-	}
+	rpc->msgin.rec_incoming = incoming;
 }
 
 /**
@@ -223,18 +244,16 @@ void homa_grant_remove_rpc(struct homa_rpc *rpc)
 }
 
 /**
- * homa_grant_update_offset() - Select a new grant offset for a message,
- * assuming that the message is high enough priority to deserve grants.
- * @rpc: The RPC to check for possible grant. Need not be locked by
- * the caller (if it isn't locked, the worst that will happen
- * is the sending of an extraneous grant).
- * @homa: Overall information about the Homa transport.
- * Return: Nonzero means that @rpc->msgin.granted was increased (presumably
- * the caller will now send a GRANT packet). 
Zero means that @rpc->msgin.granted - * can't be increased at this time. This function will set a bit in - * homa->needy_ranks if available incoming was exhausted. + * homa_grant_get_offset() - Compute a new grant offset for an RPC. This + * function may race with other functions modifying RPC state; see Design Notes + * at the start of this file. + * @rpc: RPC whose grant offset is desired. Need not be locked. + * @homa: Overall information about the Homa transport. This function + * may set @homa->incoming_hit_limit. + * Return: New grant offset for this RPC (rpc->msgin.granted is not updated) + * May be zero or negative if no additional grants should be sent. */ -int homa_grant_update_offset(struct homa_rpc *rpc, struct homa *homa) +int homa_grant_get_offset(struct homa_rpc *rpc, struct homa *homa) { int received, new_grant_offset, incoming_delta, avl_incoming; @@ -249,54 +268,36 @@ int homa_grant_update_offset(struct homa_rpc *rpc, struct homa *homa) new_grant_offset = received + homa->grant_window; if (new_grant_offset > rpc->msgin.length) new_grant_offset = rpc->msgin.length; - incoming_delta = (new_grant_offset - received) - - atomic_read(&rpc->msgin.rec_incoming); + incoming_delta = new_grant_offset - received - rpc->msgin.rec_incoming; avl_incoming = homa->max_incoming - atomic_read(&homa->total_incoming); if (avl_incoming < incoming_delta) { atomic_set(&homa->incoming_hit_limit, 1); - tt_record3("insufficient headroom: needed %d, available %d, used %d", + tt_record3("insufficient headroom for grant: needed %d, available %d, used %d", incoming_delta, avl_incoming, atomic_read(&homa->total_incoming)); new_grant_offset -= incoming_delta - avl_incoming; } - if (new_grant_offset <= rpc->msgin.granted) - return 0; - tt_record4("sending grant for id %llu, offset %d, priority %d, increment %d", - rpc->id, new_grant_offset, rpc->msgin.priority, - new_grant_offset - rpc->msgin.granted); - rpc->msgin.granted = new_grant_offset; - return 1; + return new_grant_offset; } /** - * homa_grant_try_send() - If an RPC needs granting and there is headroom - * under @homa->max_incoming, send a grant. - * @rpc: RPC to check. Should not be locked, but caller must own a - * reference. - * @homa: Overall info about the Homa transport. - * Return: 1 means that homa_grant_recalc now needs to be called (@rpc - * became completely granted and was removed from the grantable list). + * homa_grant_send() - Issue a GRANT packet for the current grant offset + * of an incoming RPC. + * @rpc: RPC for which to issue GRANT. Should not be locked (to + * minimize lock contention, since sending a packet is slow), + * but caller must hold a reference to keep it from being reaped. + * The msgin.resend_all field will be cleared. 
*/ -int homa_grant_try_send(struct homa_rpc *rpc, struct homa *homa) +void homa_grant_send(struct homa_rpc *rpc) { struct homa_grant_hdr grant; - if (!homa_grant_update_offset(rpc, homa)) - return 0; - homa_grant_update_incoming(rpc, homa); - grant.offset = htonl(rpc->msgin.granted); grant.priority = rpc->msgin.priority; grant.resend_all = rpc->msgin.resend_all; if (grant.resend_all) rpc->msgin.resend_all = 0; homa_xmit_control(GRANT, &grant, sizeof(grant), rpc); - - if (rpc->msgin.granted >= rpc->msgin.length) { - homa_grant_remove_rpc(rpc); - return 1; - } - return 0; } /** @@ -310,20 +311,9 @@ int homa_grant_try_send(struct homa_rpc *rpc, struct homa *homa) void homa_grant_check_rpc(struct homa_rpc *rpc) __must_hold(&rpc->bucket->lock) { - /* Overall design note: - * The grantable lock has proven to be a performance bottleneck, - * particularly as network speeds increase. homa_grant_recalc must - * acquire that lock in order to recompute the set of messages - * we will grant to. The current design of this module tries to - * avoid calls to homa_grant_recalc by saving the current grant - * configuration in homa->active_rpcs etc. Then this function can - * issue new grants to an RPC in many cases without calling - * homa_grant_recalc or acquiring grantable_lock. Unfortunately - * there are quite a few situations where homa_grant_recalc must - * be called, which create a lot of special cases in this function. - */ + /* See Design Notes at the start of the file. */ struct homa *homa = rpc->hsk->homa; - int rank; + int new_offset, rank; if (rpc->msgin.length < 0 || rpc->state == RPC_DEAD || rpc->msgin.num_bpages <= 0) @@ -384,10 +374,31 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) } /* Ideally this should be the common case: no need to consider - * any other RPCs. + * any other RPCs or recompute priorities. */ - if (!homa_grant_try_send(rpc, homa)) - goto done; + new_offset = homa_grant_get_offset(rpc, homa); + if (new_offset > rpc->msgin.granted) { + int recalc = 0; + + tt_record4("sending grant for id %llu, offset %d, priority %d, increment %d", + rpc->id, new_offset, + rpc->msgin.priority, + new_offset - rpc->msgin.granted); + rpc->msgin.granted = new_offset; + homa_grant_update_incoming(rpc, homa); + if (rpc->msgin.granted >= rpc->msgin.length) { + homa_grant_remove_rpc(rpc); + recalc = 1; + } + homa_rpc_hold(rpc); + homa_rpc_unlock(rpc); + homa_grant_send(rpc); + if (recalc) + homa_grant_recalc(homa); + homa_rpc_lock(rpc); + homa_rpc_put(rpc); + } + goto done; recalc: homa_rpc_hold(rpc); @@ -412,7 +423,8 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) void homa_grant_recalc(struct homa *homa) { /* A copy of homa->active_rpcs; needed so we can send grants - * without holding grantable_lock. + * without holding grantable_lock. See Design Notes at the top + * of this file. */ struct homa_rpc *active_rpcs[HOMA_MAX_GRANTS]; int i, active, try_again; @@ -428,6 +440,13 @@ void homa_grant_recalc(struct homa *homa) * opportunities to grant to additional messages. */ while (1) { + /* The first part of this computation holds the grantable + * lock but not individual RPC locks (we know that any RPC + * in @homa->active_rpcs cannot be reaped until it is removed + * from the list, and that requires the grantable lock). It + * takes references on all active RPCs before releasing the + * grantable lock. 
+ */ if (!homa_grantable_lock(homa, 1)) { INC_METRIC(grant_recalc_skips, 1); break; @@ -486,18 +505,34 @@ void homa_grant_recalc(struct homa *homa) homa->grant_window = homa->max_incoming / (homa->num_active_rpcs + 1); - /* Release homa->grantable_lock before actually sending grants, - * because sending grants takes a while and holding - * grantable_lock would significantly increase contention for - * it. We don't hold RPC locks while sending grants either, - * for the same reason (but we do hold a reference, to keep - * the RPC from being reaped). + /* The second part of the computation is done without + * holding the grantable lock, but it will acquire RPC locks. + * The grantable lock is released because (a) we want to + * reduce contention for it and (b) we can't acquire RPC locks + * while holding it. References on the active RPCs keep them + * from being reaped. */ homa_grantable_unlock(homa); for (i = 0; i < active; i++) { struct homa_rpc *rpc = active_rpcs[i]; - - try_again += homa_grant_try_send(rpc, homa); + int new_offset; + + new_offset = homa_grant_get_offset(rpc, homa); + if (new_offset > rpc->msgin.granted) { + tt_record4("sending grant for id %llu, offset %d, priority %d, increment %d", + rpc->id, new_offset, + rpc->msgin.priority, + new_offset - rpc->msgin.granted); + homa_rpc_lock(rpc); + rpc->msgin.granted = new_offset; + homa_grant_update_incoming(rpc, homa); + if (rpc->msgin.granted >= rpc->msgin.length) { + homa_grant_remove_rpc(rpc); + try_again = 1; + } + homa_rpc_unlock(rpc); + homa_grant_send(rpc); + } homa_rpc_put(rpc); } @@ -621,13 +656,12 @@ void homa_grant_find_oldest(struct homa *homa) * homa_grant_end_rpc() - This function is invoked when homa_rpc_end is * invoked; it cleans up any state related to grants for that RPC's * incoming message. - * @rpc: The RPC to clean up. Must be locked by the caller. + * @rpc: The RPC to clean up. Must be locked by the caller. This function + * may release and then reacquire the lock. */ void homa_grant_end_rpc(struct homa_rpc *rpc) - __releases(rpc->bucket_lock) { struct homa *homa = rpc->hsk->homa; - int incoming; if (!list_empty(&rpc->grantable_links)) { homa_grant_remove_rpc(rpc); @@ -640,9 +674,8 @@ void homa_grant_end_rpc(struct homa_rpc *rpc) } } - incoming = atomic_read(&rpc->msgin.rec_incoming); - if (incoming != 0) - atomic_sub(incoming, &homa->total_incoming); + if (rpc->msgin.rec_incoming != 0) + atomic_sub(rpc->msgin.rec_incoming, &homa->total_incoming); } /** diff --git a/homa_grant.h b/homa_grant.h index 189c78d5..ed5dd89a 100644 --- a/homa_grant.h +++ b/homa_grant.h @@ -12,6 +12,7 @@ void homa_grant_add_rpc(struct homa_rpc *rpc); void homa_grant_check_rpc(struct homa_rpc *rpc); void homa_grant_end_rpc(struct homa_rpc *rpc); void homa_grant_find_oldest(struct homa *homa); +int homa_grant_get_offset(struct homa_rpc *rpc, struct homa *homa); void homa_grant_log_tt(struct homa *homa); int homa_grant_outranks(struct homa_rpc *rpc1, struct homa_rpc *rpc2); @@ -20,10 +21,9 @@ int homa_grant_pick_rpcs(struct homa *homa, struct homa_rpc **rpcs, void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc); void homa_grant_recalc(struct homa *homa); void homa_grant_remove_rpc(struct homa_rpc *rpc); +void homa_grant_send(struct homa_rpc *rpc); void homa_grant_update_incoming(struct homa_rpc *rpc, struct homa *homa); -int homa_grant_try_send(struct homa_rpc *rpc, struct homa *homa); -int homa_grant_update_offset(struct homa_rpc *rpc, struct homa *homa); /** * homa_grantable_lock() - Acquire the grantable lock. 
If the lock diff --git a/homa_incoming.c b/homa_incoming.c index 9f958313..18615fea 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -1296,6 +1296,8 @@ struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking) #endif /* See strip.py */ int result; + INIT_LIST_HEAD(&interest.links); + init_waitqueue_head(&interest.wait_queue); /* Each iteration through this loop waits until an RPC needs attention * in some way (e.g. packets have arrived), then deals with that need * (e.g. copy to user space). It may take many iterations until an diff --git a/homa_interest.h b/homa_interest.h index 161e86a2..ce755e5a 100644 --- a/homa_interest.h +++ b/homa_interest.h @@ -67,6 +67,7 @@ struct homa_interest { */ static inline void homa_interest_unlink_shared(struct homa_interest *interest) { + tt_record("homa_interest_unlink_shared invoked"); if (!list_empty(&interest->links)) { homa_sock_lock(interest->hsk); list_del_init(&interest->links); diff --git a/homa_plumbing.c b/homa_plumbing.c index 8f5085de..a7b76010 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1300,10 +1300,9 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, #ifndef __STRIP__ /* See strip.py */ finish = sched_clock(); #endif /* See strip.py */ - tt_record3("homa_recvmsg returning id %d, length %d, bpage0 %d", - control.id, result, - control.bpage_offsets[0] >> HOMA_BPAGE_SHIFT); INC_METRIC(recv_ns, finish - start); + tt_record2("homa_recvmsg returning status %d, id %d", result, + control.id); return result; } diff --git a/homa_rpc.c b/homa_rpc.c index f56e8423..909b9a2c 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -471,8 +471,8 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) kfree(gap); } } - tt_record1("homa_rpc_reap finished reaping id %d", - rpc->id); + tt_record2("homa_rpc_reap finished reaping id %d, socket %d", + rpc->id, rpc->hsk->port); rpc->state = 0; kfree(rpc); } @@ -746,7 +746,7 @@ int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors) if (!homa_protect_rpcs(hsk)) continue; list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { - int incoming, rec_incoming; + int incoming; if (rpc->state != RPC_INCOMING) continue; @@ -755,13 +755,13 @@ int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors) - rpc->msgin.bytes_remaining); if (incoming < 0) incoming = 0; - rec_incoming = atomic_read(&rpc->msgin.rec_incoming); - if (rec_incoming == 0) + if (rpc->msgin.rec_incoming == 0) continue; - total_incoming += rec_incoming; + total_incoming += rpc->msgin.rec_incoming; if (verbose) tt_record3("homa_validate_incoming: RPC id %d, incoming %d, rec_incoming %d", - rpc->id, incoming, rec_incoming); + rpc->id, incoming, + rpc->msgin.rec_incoming); if (rpc->msgin.granted >= rpc->msgin.length) continue; if (list_empty(&rpc->grantable_links)) { diff --git a/homa_rpc.h b/homa_rpc.h index e9b588d0..0fac0adc 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -163,7 +163,7 @@ struct homa_message_in { * @rec_incoming: Number of bytes in homa->total_incoming currently * contributed ("recorded") from this RPC. */ - atomic_t rec_incoming; + int rec_incoming; /** * @rank: A hint: if homa->active_rpcs[@rank] refers to this RPC then diff --git a/homa_sock.c b/homa_sock.c index 8bb43037..0f6be755 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -3,6 +3,7 @@ /* This file manages homa_sock and homa_socktab objects. 
*/ #include "homa_impl.h" +#include "homa_grant.h" #include "homa_interest.h" #include "homa_peer.h" #include "homa_pool.h" diff --git a/homa_timer.c b/homa_timer.c index 39f667d2..6f128beb 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -94,10 +94,12 @@ void homa_check_rpc(struct homa_rpc *rpc) tt_record3("RPC id %d, peer 0x%x, aborted because of timeout, state %d", rpc->id, tt_addr(rpc->peer->addr), rpc->state); #ifndef __STRIP__ /* See strip.py */ +#if 0 homa_rpc_log_active_tt(homa, 0); tt_record1("Freezing because of RPC abort (id %d)", rpc->id); homa_freeze_peers(homa); tt_freeze(); +#endif if (homa->verbose) pr_notice("RPC id %llu, peer %s, aborted because of timeout, state %d\n", rpc->id, @@ -254,8 +256,7 @@ void homa_timer(struct homa *homa) #ifndef __STRIP__ /* See strip.py */ } else if (rpc->state == RPC_INCOMING) { total_incoming_rpcs += 1; - sum_incoming_rec += - atomic_read(&rpc->msgin.rec_incoming); + sum_incoming_rec += rpc->msgin.rec_incoming; sum_incoming += rpc->msgin.granted - (rpc->msgin.length - rpc->msgin.bytes_remaining); diff --git a/notes.txt b/notes.txt index 81a90eeb..46760930 100755 --- a/notes.txt +++ b/notes.txt @@ -1,6 +1,22 @@ Notes for Homa implementation in Linux: --------------------------------------- +* Grant changes: + * Make sure active_rpcs[0] is NULL if no active RPCs. + * Potential race: active_rpcs vs. active_remaining + * Duplicated hit_limit code in check_rpc + * Why call update_incoming again in try_send? + * Hold RPC lock when reading rec_incoming (no longer atomic) + * Eliminate homa_grant_try_send + * No increment in new "sending grant" tt_record. + +* Failure modes: + * homa_grant_add_rpc: list has a loop, or encounter a null list link + * stack corruption under homa_recvmsg after socket shutdown. + +* Move interest cleanup code from homa_sock to a new function in + homa_interest. Also move wakeup code from homa_rpc_handoff. + * Thoughts on making TCP and Homa play better together: * Goals: * Keep the NIC tx queue from growing long. @@ -414,3 +430,59 @@ Notes for Homa implementation in Linux: #ifndef __STRIP__ /* See strip.py */ #endif /* See strip.py */ +Call Trace:^M +[281667.695467] ^M +[281667.695601] ? show_regs+0x64/0x70^M +[281667.696414] ? __die+0x24/0x70^M +[281667.696992] ? page_fault_oops+0x21c/0x730 fault.c:716 +[281667.697224] ? __pfx_page_fault_oops+0x10/0x10^M +[281667.697857] ? bpf_ksym_find+0xcb/0xe0 kernel/bpf/core.c:740? +[281667.698072] ? __pfx_is_prefetch.isra.0+0x10/0x10^M +[281667.698801] ? search_bpf_extables+0xb5/0xd0 kernel/bpf/core.c:794 +[281667.699448] ? search_exception_tables+0x60/0x70^M kernel/extable.c:62 +[281667.700086] ? fixup_exception+0x3b/0x400 mm/extable.c:320 +[281667.700309] ? sched_clock+0x10/0x30^M +[281667.700526] ? kernelmode_fixup_or_oops.isra.0+0x6b/0x80 fault.c:733? +[281667.700829] ? __bad_area_nosemaphore+0x1e6/0x340 fault.c:790 +[281667.701445] ? spurious_kernel_fault_check+0x46/0xb0^M +[281667.701724] ? bad_area_nosemaphore+0x16/0x20^M fault.c:839 +[281667.702368] ? do_kern_addr_fault+0x95/0xb0 fault.c:1203 +[281667.703008] ? exc_page_fault+0xdd/0xe0^M +[281667.703231] ? asm_exc_page_fault+0x27/0x30^M +[281667.703874] ? __mod_memcg_lruvec_state+0x188/0x300^M +[281667.704197] ? __mod_memcg_lruvec_state+0x19e/0x300^M +[281667.704469] folio_batch_move_lru+0xc9/0x240 swap.c:168 +[281667.705820] lru_add_drain_cpu+0xf3/0x190 swap.c:616 +[281667.706021] lru_add_drain+0x24/0x60 swap.c:698 +[281667.706214] zap_page_range_single+0xa8/0x340 memory.c:1938 +[281667.706846] ? 
__pfx_zap_page_range_single+0x10/0x10^M +[281667.807198] ? userfaultfd_remove+0x8e/0x210 fs/userfaultd.c:777? +[281667.807853] ? __pfx_userfaultfd_remove+0x10/0x10^M +[281667.808526] ? __pfx_find_vma_prev+0x10/0x10 mmap.c:98x? +[281668.309000] do_madvise.part.0+0x1db8/0x1f90^M +[281668.309630] ? ____sys_recvmsg+0x15f/0x380 socket.c:2803 +[281668.309905] ? __pfx_do_madvise.part.0+0x10/0x10^M +[281668.310514] ? update_min_vruntime+0x11b/0x130^M +[281668.311116] ? __update_load_avg_cfs_rq+0x78/0x5f0^M +[281668.311722] ? __pfx___resched_curr+0x10/0x10^M +[281668.312349] ? ___sys_recvmsg+0xe0/0x150 socket.c:2845 +[281668.312557] ? __pfx____sys_recvmsg+0x10/0x10^M +[281668.313169] ? __kasan_check_read+0x11/0x20^M +[281668.313791] ? psi_group_change+0x2e9/0x4a0^M +[281668.314416] ? __set_next_task_fair.part.0+0x28/0x310^M +[281668.314711] ? __kasan_check_write+0x14/0x20^M +[281668.315326] ? recalc_sigpending+0xa7/0xf0^M +[281668.315529] ? preempt_count_sub+0x18/0xc0^M +[281668.315774] ? _raw_spin_unlock_irq+0x1f/0x40^M +[281668.316381] ? sigprocmask+0x129/0x1c0^M +[281668.316584] ? __pfx_sigprocmask+0x10/0x10^M +[281668.316812] ? __kasan_check_write+0x14/0x20^M +[281668.317411] ? __x64_sys_rt_sigprocmask+0x105/0x190^M +[281668.317680] ? __pfx___x64_sys_rt_sigprocmask+0x10/0x10^M +[281668.317978] __x64_sys_madvise+0x9a/0xb0^M +[281668.318189] ? __x64_sys_madvise+0x9a/0xb0^M +[281668.318393] x64_sys_call+0x1f34/0x20b0^M +[281668.318603] do_syscall_64+0x4b/0x110^M + +xmit DATA 1400@0; xmit DATA 1400@1400; xmit DATA 1400@2800; time 1600; time 2200; xmit DATA 800@4200; removing id 1234 from throttled list; time 3200; xmit DATA 1400@0; time 4400; xmit DATA 1400@1400; time 5600; time 6200; xmit DATA 1400@2800; time 7400; xmit DATA 1400@4200; time 8600; time 9200; xmit DATA 1400@5600; time 10400; time 11000; xmit DATA 1400@7000; time 12200; xmit DATA 1400@8400; time 13400; time 14000; xmit DATA 200@9800; removing id 1236 from throttled list +xmit DATA 1400@0; xmit DATA 1400@1400; xmit DATA 1400@2800; xmit DATA 800@4200; removing id 1234 from throttled list; time 1400; time 2000; xmit DATA 1400@0; time 3200; time 3800; xmit DATA 1400@1400; time 5000; xmit DATA 1400@2800; time 6200; time 6800; xmit DATA 1400@4200; time 8000; xmit DATA 1400@5600; time 9200; time 9800; xmit DATA 1400@7000; time 11000; time 11600; xmit DATA 1400@8400; xmit DATA 200@9800; removing id 1236 from throttled list \ No newline at end of file diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index edd5cfbe..a04c84f4 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -163,29 +163,29 @@ TEST_F(homa_grant, homa_grant_update_incoming) atomic_set(&self->homa.total_incoming, 1000); rpc->msgin.bytes_remaining = 19000; rpc->msgin.granted = 3000; - atomic_set(&rpc->msgin.rec_incoming, 500); + rpc->msgin.rec_incoming = 500; homa_grant_update_incoming(rpc, &self->homa); EXPECT_EQ(2500, atomic_read(&self->homa.total_incoming)); - EXPECT_EQ(2000, atomic_read(&rpc->msgin.rec_incoming)); + EXPECT_EQ(2000, rpc->msgin.rec_incoming); /* Case 2: incoming negative. */ atomic_set(&self->homa.total_incoming, 1000); rpc->msgin.bytes_remaining = 16000; rpc->msgin.granted = 3000; - atomic_set(&rpc->msgin.rec_incoming, 500); + rpc->msgin.rec_incoming = 500; homa_grant_update_incoming(rpc, &self->homa); EXPECT_EQ(500, atomic_read(&self->homa.total_incoming)); - EXPECT_EQ(0, atomic_read(&rpc->msgin.rec_incoming)); + EXPECT_EQ(0, rpc->msgin.rec_incoming); /* Case 3: no change to rec_incoming. 
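	 * Granted (4500) minus bytes received (20000 - 16000 = 4000) already
	 * equals rec_incoming (500), so neither counter should move.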
*/ atomic_set(&self->homa.total_incoming, 1000); self->homa.max_incoming = 1000; rpc->msgin.bytes_remaining = 16000; rpc->msgin.granted = 4500; - atomic_set(&rpc->msgin.rec_incoming, 500); + rpc->msgin.rec_incoming = 500; homa_grant_update_incoming(rpc, &self->homa); EXPECT_EQ(1000, atomic_read(&self->homa.total_incoming)); - EXPECT_EQ(500, atomic_read(&rpc->msgin.rec_incoming)); + EXPECT_EQ(500, rpc->msgin.rec_incoming); } TEST_F(homa_grant, homa_grant_add_rpc__update_metrics) @@ -431,110 +431,74 @@ TEST_F(homa_grant, homa_grant_remove_rpc__reposition_peer_in_homa_list) EXPECT_EQ(3, self->homa.num_grantable_rpcs); } -TEST_F(homa_grant, homa_grant_update_offset__basics) +TEST_F(homa_grant, homa_grant_get_offset__basics) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); - EXPECT_EQ(1, homa_grant_update_offset(rpc, &self->homa)); - EXPECT_EQ(10000, rpc->msgin.granted); + EXPECT_EQ(10000, homa_grant_get_offset(rpc, &self->homa)); EXPECT_EQ(0, atomic_read(&self->homa.incoming_hit_limit)); } -TEST_F(homa_grant, homa_grant_update_offset__rpc_idle) +TEST_F(homa_grant, homa_grant_get_offset__rpc_idle) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); rpc->silent_ticks = 2; - EXPECT_EQ(0, homa_grant_update_offset(rpc, &self->homa)); - EXPECT_EQ(0, rpc->msgin.granted); + EXPECT_EQ(0, homa_grant_get_offset(rpc, &self->homa)); } -TEST_F(homa_grant, homa_grant_update_offset__end_of_message) +TEST_F(homa_grant, homa_grant_get_offset__end_of_message) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); /* First call grants remaining bytes in message. */ rpc->msgin.bytes_remaining = 5000; - EXPECT_EQ(1, homa_grant_update_offset(rpc, &self->homa)); - EXPECT_EQ(20000, rpc->msgin.granted); - EXPECT_EQ(0, atomic_read(&self->homa.incoming_hit_limit)); + EXPECT_EQ(20000, homa_grant_get_offset(rpc, &self->homa)); - /* Second call cannot grant anymore. */ - EXPECT_EQ(0, homa_grant_update_offset(rpc, &self->homa)); - EXPECT_EQ(20000, rpc->msgin.granted); + /* Second call cannot grant anything additional. 
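+	 * The returned offset is capped at msgin.length, so repeated calls
+	 * keep producing 20000.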
*/ + EXPECT_EQ(20000, homa_grant_get_offset(rpc, &self->homa)); } -TEST_F(homa_grant, homa_grant_update_offset__insufficient_room_in_incoming) +TEST_F(homa_grant, homa_grant_get_offset__insufficient_room_in_incoming) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); rpc->msgin.bytes_remaining = 5000; atomic_set(&rpc->msgin.rank, 5); atomic_set(&self->homa.total_incoming, 48000); - EXPECT_EQ(1, homa_grant_update_offset(rpc, &self->homa)); - EXPECT_EQ(17000, rpc->msgin.granted); - EXPECT_EQ(1, atomic_read(&self->homa.incoming_hit_limit)); + EXPECT_EQ(17000, homa_grant_get_offset(rpc, &self->homa)); } -TEST_F(homa_grant, homa_grant_update_offset__incoming_overcommitted) +TEST_F(homa_grant, homa_grant_get_offset__incoming_overcommitted) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); - atomic_set(&rpc->msgin.rank, 6); atomic_set(&self->homa.total_incoming, 51000); - EXPECT_EQ(0, homa_grant_update_offset(rpc, &self->homa)); - EXPECT_EQ(0, rpc->msgin.granted); + EXPECT_EQ(-1000, homa_grant_get_offset(rpc, &self->homa)); EXPECT_EQ(1, atomic_read(&self->homa.incoming_hit_limit)); } -TEST_F(homa_grant, homa_grant_try_send__basics) -{ - struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); - - atomic_set(&rpc->msgin.rank, 1); - unit_log_clear(); - EXPECT_EQ(0, homa_grant_try_send(rpc, &self->homa)); - EXPECT_EQ(10000, rpc->msgin.granted); - EXPECT_EQ(10000, atomic_read(&self->homa.total_incoming)); - EXPECT_STREQ("xmit GRANT 10000@0", unit_log_get()); -} -TEST_F(homa_grant, homa_grant_try_send__cant_grant) +TEST_F(homa_grant, homa_grant_send__basics) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); - atomic_set(&rpc->msgin.rank, 1); - atomic_set(&self->homa.total_incoming, self->homa.max_incoming); + mock_xmit_log_verbose = 1; + rpc->msgin.granted = 2600; + rpc->msgin.priority = 6; unit_log_clear(); - EXPECT_EQ(0, homa_grant_try_send(rpc, &self->homa)); - EXPECT_EQ(0, rpc->msgin.granted); - EXPECT_EQ(1, atomic_read(&self->homa.incoming_hit_limit)); - EXPECT_EQ(50000, atomic_read(&self->homa.total_incoming)); - EXPECT_STREQ("", unit_log_get()); + homa_grant_send(rpc); + EXPECT_SUBSTR("id 100, offset 2600, grant_prio 6", unit_log_get()); } -TEST_F(homa_grant, homa_grant_try_send__resend_all) +TEST_F(homa_grant, homa_grant_send__resend_all) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); + mock_xmit_log_verbose = 1; + rpc->msgin.granted = 9999; + rpc->msgin.priority = 4; rpc->msgin.resend_all = 1; unit_log_clear(); - EXPECT_EQ(0, homa_grant_try_send(rpc, &self->homa)); - EXPECT_STREQ("xmit GRANT 10000@0 resend_all", unit_log_get()); + homa_grant_send(rpc); + EXPECT_SUBSTR("id 100, offset 9999, grant_prio 4, resend_all", + unit_log_get()); EXPECT_EQ(0, rpc->msgin.resend_all); } -TEST_F(homa_grant, homa_grant_try_send__end_of_message) -{ - struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 5000); - - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("response from 1.2.3.4, id 100, remaining 5000", - unit_log_get()); - - unit_log_clear(); - EXPECT_EQ(1, homa_grant_try_send(rpc, &self->homa)); - EXPECT_EQ(5000, rpc->msgin.granted); - EXPECT_EQ(5000, atomic_read(&self->homa.total_incoming)); - EXPECT_STREQ("xmit GRANT 5000@0", unit_log_get()); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("", unit_log_get()); -} TEST_F(homa_grant, homa_grant_check_rpc__msgin_not_initialized) { @@ -544,9 +508,9 @@ TEST_F(homa_grant, 
homa_grant_check_rpc__msgin_not_initialized) rpc->msgin.bytes_remaining = 500; rpc->msgin.granted = 2000; - atomic_set(&rpc->msgin.rec_incoming, 0); + rpc->msgin.rec_incoming = 0; homa_grant_check_rpc(rpc); - EXPECT_EQ(0, atomic_read(&rpc->msgin.rec_incoming)); + EXPECT_EQ(0, rpc->msgin.rec_incoming); EXPECT_EQ(0, atomic_read(&self->homa.total_incoming)); EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_calls); } @@ -559,7 +523,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__rpc_dead) homa_message_in_init(rpc, 2000, 0); homa_grant_check_rpc(rpc); - EXPECT_EQ(2000, atomic_read(&rpc->msgin.rec_incoming)); + EXPECT_EQ(2000, rpc->msgin.rec_incoming); EXPECT_EQ(2000, atomic_read(&self->homa.total_incoming)); old_state = rpc->state; @@ -567,7 +531,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__rpc_dead) rpc->msgin.bytes_remaining = 0; homa_grant_check_rpc(rpc); rpc->state = old_state; - EXPECT_EQ(2000, atomic_read(&rpc->msgin.rec_incoming)); + EXPECT_EQ(2000, rpc->msgin.rec_incoming); EXPECT_EQ(2000, atomic_read(&self->homa.total_incoming)); } TEST_F(homa_grant, homa_grant_check_rpc__message_fully_granted_no_recalc) @@ -596,7 +560,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__message_fully_granted_must_recalc) homa_message_in_init(rpc1, 2000, 0); rpc1->msgin.granted = 2000; rpc1->msgin.bytes_remaining = 0; - atomic_set(&rpc1->msgin.rec_incoming, 1500); + rpc1->msgin.rec_incoming = 1500; /* Second RPC will be waiting for incoming. */ rpc2 = test_rpc(self, 100, self->server_ip, 5000); @@ -629,7 +593,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__add_new_message_to_grantables) homa_grant_check_rpc(rpc); EXPECT_EQ(18000, rpc->msgin.granted); - EXPECT_EQ(10000, atomic_read(&rpc->msgin.rec_incoming)); + EXPECT_EQ(10000, rpc->msgin.rec_incoming); EXPECT_EQ(0, atomic_read(&rpc->msgin.rank)); EXPECT_EQ(10000, atomic_read(&self->homa.total_incoming)); EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_calls); @@ -650,7 +614,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__new_message_bumps_existing) homa_message_in_init(rpc3, 20000, 0); homa_grant_check_rpc(rpc3); EXPECT_EQ(10000, rpc3->msgin.granted); - EXPECT_EQ(10000, atomic_read(&rpc3->msgin.rec_incoming)); + EXPECT_EQ(10000, rpc3->msgin.rec_incoming); EXPECT_EQ(1, atomic_read(&rpc3->msgin.rank)); EXPECT_EQ(2, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); @@ -672,7 +636,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__new_message_cant_be_granted) homa_message_in_init(rpc3, 30000, 0); homa_grant_check_rpc(rpc3); EXPECT_EQ(0, rpc3->msgin.granted); - EXPECT_EQ(0, atomic_read(&rpc3->msgin.rec_incoming)); + EXPECT_EQ(0, rpc3->msgin.rec_incoming); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); } @@ -694,7 +658,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__upgrade_priority_from_negative_rank) rpc3->msgin.bytes_remaining = 15000; homa_grant_check_rpc(rpc3); EXPECT_EQ(35000, rpc3->msgin.granted); - EXPECT_EQ(10000, atomic_read(&rpc3->msgin.rec_incoming)); + EXPECT_EQ(10000, rpc3->msgin.rec_incoming); EXPECT_EQ(rpc1, self->homa.active_rpcs[1]); EXPECT_EQ(rpc3, self->homa.active_rpcs[0]); } @@ -716,7 +680,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__upgrade_priority_from_positive_rank) unit_log_clear(); homa_grant_check_rpc(rpc3); EXPECT_EQ(25000, rpc3->msgin.granted); - EXPECT_EQ(10000, atomic_read(&rpc3->msgin.rec_incoming)); + EXPECT_EQ(10000, rpc3->msgin.rec_incoming); EXPECT_EQ(1, atomic_read(&rpc3->msgin.rank)); EXPECT_EQ(2, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(0, 
atomic_read(&rpc1->msgin.rank)); @@ -810,6 +774,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__grant_to_self_and_recalc) EXPECT_EQ(10000, rpc3->msgin.granted); EXPECT_EQ(0, rpc4->msgin.granted); EXPECT_EQ(2, atomic_read(&rpc4->msgin.rank)); + EXPECT_TRUE(list_empty(&rpc3->grantable_links)); EXPECT_STREQ("xmit GRANT 10000@0; homa_grant_recalc", unit_log_get()); } @@ -847,7 +812,7 @@ TEST_F(homa_grant, homa_grant_recalc__basics) EXPECT_EQ(0, rpc4->msgin.granted); EXPECT_NE(0, homa_metrics_per_cpu()->grant_recalc_ns); } -TEST_F(homa_grant, homa_grant_recalc__skip_recalc) +TEST_F(homa_grant, homa_grant_recalc__cant_acquire_grantable_lock) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); @@ -983,9 +948,11 @@ TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted_so_recalc) EXPECT_EQ(10000, rpc2->msgin.granted); EXPECT_EQ(10000, rpc3->msgin.granted); EXPECT_EQ(2000, rpc4->msgin.granted); + EXPECT_TRUE(list_empty(&rpc2->grantable_links)); + EXPECT_TRUE(list_empty(&rpc3->grantable_links)); EXPECT_EQ(2, homa_metrics_per_cpu()->grant_recalc_loops); } -TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted_but_skip_recalc) +TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted_but_cant_get_lock) { struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; @@ -997,7 +964,7 @@ TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted_but_skip_recalc) self->homa.max_overcommit = 2; unit_hook_register(grantable_spinlock_hook); hook_homa = &self->homa; - mock_trylock_errors = 0xf8; + mock_trylock_errors = 0xe0; EXPECT_EQ(0, homa_metrics_per_cpu()->grant_recalc_skips); homa_grant_recalc(&self->homa); @@ -1148,7 +1115,7 @@ TEST_F(homa_grant, homa_grant_end_rpc__rpc_not_grantable) self->client_ip, self->server_ip, self->server_port, 100, 1000, 2000); atomic_set(&self->homa.total_incoming, 10000); - atomic_set(&rpc->msgin.rec_incoming, 3000); + rpc->msgin.rec_incoming = 3000; homa_grant_end_rpc(rpc); EXPECT_EQ(7000, atomic_read(&self->homa.total_incoming)); } @@ -1164,7 +1131,7 @@ TEST_F(homa_grant, homa_grant_end_rpc__in_active_list) EXPECT_EQ(rpc1, self->homa.active_rpcs[0]); EXPECT_EQ(rpc2, self->homa.active_rpcs[1]); EXPECT_EQ(20000, atomic_read(&self->homa.total_incoming)); - EXPECT_EQ(10000, atomic_read(&rpc1->msgin.rec_incoming)); + EXPECT_EQ(10000, rpc1->msgin.rec_incoming); unit_log_clear(); homa_grant_end_rpc(rpc1); @@ -1184,10 +1151,10 @@ TEST_F(homa_grant, homa_grant_end_rpc__not_in_active_list) EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); EXPECT_EQ(20000, atomic_read(&self->homa.total_incoming)); - EXPECT_EQ(0, atomic_read(&rpc3->msgin.rec_incoming)); + EXPECT_EQ(0, rpc3->msgin.rec_incoming); EXPECT_FALSE(list_empty(&rpc3->grantable_links)); - atomic_set(&rpc3->msgin.rec_incoming, 5000); + rpc3->msgin.rec_incoming = 5000; homa_grant_end_rpc(rpc3); EXPECT_TRUE(list_empty(&rpc3->grantable_links)); EXPECT_EQ(15000, atomic_read(&self->homa.total_incoming)); diff --git a/timetrace.c b/timetrace.c index 11b8c504..d39f0500 100644 --- a/timetrace.c +++ b/timetrace.c @@ -92,6 +92,10 @@ int tt_pf_storage = TT_PF_BUF_SIZE; /* Set during tests to disable "cpu_khz" line in trace output. */ bool tt_test_no_khz; +#define MAX_IDS 10 +#define MAX_CORES 50 +static atomic_t id_counts[MAX_CORES][MAX_IDS]; + /** * tt_init(): Enable time tracing, create /proc file for reading traces. * @proc_file: Name of a file in /proc; this file can be read to extract @@ -817,8 +821,26 @@ void tt_get_messages(char *buffer, size_t length) */ void tt_dbg1(char *msg, ...) 
{ - pr_err("tt_dbg1 is dumping timetrace\n"); + int id, core; + int problems = 0; + + if (atomic_read(&tt_frozen)) + return; tt_freeze(); + + for (core = 0; core < MAX_CORES; core++) { + for (id = 0; id < MAX_IDS; id++) { + int value = atomic_read(&id_counts[core][id]); + if (value != 0) { + pr_err("Core %d has count %d for id %d\n", + core, value, id); + problems++; + } + } + } + pr_err("tt_dbg1 found %d nonzero counters (running on core %d)\n", + problems, raw_smp_processor_id()); + pr_err("Dumping timetrace\n"); tt_printk(); pr_err("Finished dumping timetrace\n"); } @@ -830,6 +852,17 @@ void tt_dbg1(char *msg, ...) */ void tt_dbg2(char *msg, ...) { + va_list ap; + int core; + int id; + + va_start(ap, msg); + id = va_arg(ap, int); + core = va_arg(ap, int); + atomic_add(1, &id_counts[core][id]); + tt_record4("tt_dbg2 incremented counter %d for core %d to %d in pid %d", + id, core, atomic_read(&id_counts[core][id]), current->pid); + va_end(ap); } /** @@ -839,6 +872,17 @@ void tt_dbg2(char *msg, ...) */ void tt_dbg3(char *msg, ...) { + va_list ap; + int core; + int id; + + va_start(ap, msg); + id = va_arg(ap, int); + core = va_arg(ap, int); + atomic_sub(1, &id_counts[core][id]); + tt_record4("tt_dbg3 decremented counter %d for core %d to %d in pid %d", + id, core, atomic_read(&id_counts[core][id]), current->pid); + va_end(ap); } /** From f1642f273cd56f3093e6f43ad0d1e127b619a0f9 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 16 Apr 2025 12:17:09 -0700 Subject: [PATCH 253/625] Rename "grantable lock" -> "grant lock" --- homa_grant.c | 54 +++++++++++++++++++++--------------------- homa_grant.h | 27 ++++++++++----------- homa_impl.h | 12 +++++----- homa_incoming.c | 2 +- homa_metrics.c | 20 ++++++++-------- homa_metrics.h | 19 +++++++-------- homa_peer.h | 2 +- homa_rpc.h | 2 +- homa_utils.c | 2 +- sync.txt | 2 +- test/unit_homa_grant.c | 34 +++++++++++++------------- util/metrics.py | 6 ++--- util/tthoma.py | 20 ++++++++-------- 13 files changed, 99 insertions(+), 103 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index d9ab5d74..0c450a86 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -12,7 +12,7 @@ /* Design Notes: * This file is pretty complicated because of locking issues. Recalculating - * the priorities for granting requires @homa->grantable_lock, which is + * the priorities for granting requires @homa->grant_lock, which is * global. Priorities can potentially change every time a data packet * arrives, but acquiring the global lock for each data packet would result * in unacceptable contention (this was tried in earlier versions). The @@ -26,7 +26,7 @@ * with an incorrect offset, which has only minor performance consequences). * * Another overall requirement for the file is not to hold locks (either - * RPC locks or @homa->grantable_lock) when actually sending grants. This + * RPC locks or @homa->grant_lock) when actually sending grants. This * is because packet transmission takes a long time, so holding a lock * could result in unacceptable contention. */ @@ -88,7 +88,7 @@ void homa_grant_add_rpc(struct homa_rpc *rpc) struct homa_peer *peer_cand; struct homa_rpc *candidate; - homa_grantable_lock(homa, 0); + homa_grant_lock(homa, 0); /* Make sure this message is in the right place in the grantable_rpcs * list for its peer. 
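The tt_dbg2/tt_dbg3 bodies added by patch 252 above turn two formerly empty debug hooks into a matched increment/decrement counter array; tt_dbg1 then reports any counter that failed to return to zero before dumping the timetrace. A hypothetical call site might look like the sketch below (the function name and ids are illustrative, not from the patch; id must stay below MAX_IDS and core below MAX_CORES):

/* Hypothetical instrumentation: pair each reference with tt_dbg2 and
 * tt_dbg3 so that tt_dbg1 can flag any counter that never returned to
 * zero when a crash is being diagnosed.
 */
static void track_rpc_reference(struct homa_rpc *rpc, int id)
{
	int core = raw_smp_processor_id();

	tt_dbg2("hold", id, core);	/* id_counts[core][id]++ */
	homa_rpc_hold(rpc);
	/* ... use the RPC without its lock ... */
	homa_rpc_put(rpc);
	tt_dbg3("put", id, core);	/* id_counts[core][id]-- */
}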
@@ -167,7 +167,7 @@ void homa_grant_add_rpc(struct homa_rpc *rpc) list_add(&prev_peer->grantable_links, &peer->grantable_links); } done: - homa_grantable_unlock(homa); + homa_grant_unlock(homa); } /** @@ -188,13 +188,13 @@ void homa_grant_remove_rpc(struct homa_rpc *rpc) if (list_empty(&rpc->grantable_links)) return; - homa_grantable_lock(homa, 0); + homa_grant_lock(homa, 0); /* Must check list again: might have been removed by someone * else before we got the lock. */ if (list_empty(&rpc->grantable_links)) { - homa_grantable_unlock(homa); + homa_grant_unlock(homa); return; } @@ -240,7 +240,7 @@ void homa_grant_remove_rpc(struct homa_rpc *rpc) } done: - homa_grantable_unlock(homa); + homa_grant_unlock(homa); } /** @@ -423,7 +423,7 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) void homa_grant_recalc(struct homa *homa) { /* A copy of homa->active_rpcs; needed so we can send grants - * without holding grantable_lock. See Design Notes at the top + * without holding grant_lock. See Design Notes at the top * of this file. */ struct homa_rpc *active_rpcs[HOMA_MAX_GRANTS]; @@ -440,14 +440,14 @@ void homa_grant_recalc(struct homa *homa) * opportunities to grant to additional messages. */ while (1) { - /* The first part of this computation holds the grantable + /* The first part of this computation holds the grant * lock but not individual RPC locks (we know that any RPC * in @homa->active_rpcs cannot be reaped until it is removed - * from the list, and that requires the grantable lock). It + * from the list, and that requires the grant lock). It * takes references on all active RPCs before releasing the - * grantable lock. + * grant lock. */ - if (!homa_grantable_lock(homa, 1)) { + if (!homa_grant_lock(homa, 1)) { INC_METRIC(grant_recalc_skips, 1); break; } @@ -506,13 +506,13 @@ void homa_grant_recalc(struct homa *homa) (homa->num_active_rpcs + 1); /* The second part of the computation is done without - * holding the grantable lock, but it will acquire RPC locks. - * The grantable lock is released because (a) we want to + * holding the grant lock, but it will acquire RPC locks. + * The grant lock is released because (a) we want to * reduce contention for it and (b) we can't acquire RPC locks * while holding it. References on the active RPCs keep them * from being reaped. */ - homa_grantable_unlock(homa); + homa_grant_unlock(homa); for (i = 0; i < active; i++) { struct homa_rpc *rpc = active_rpcs[i]; int new_offset; @@ -610,7 +610,7 @@ int homa_grant_pick_rpcs(struct homa *homa, struct homa_rpc **rpcs, /** * homa_grant_find_oldest() - Recompute the value of homa->oldest_rpc. * @homa: Overall data about the Homa protocol implementation. The - * grantable_lock must be held by the caller. + * grant_lock must be held by the caller. */ void homa_grant_find_oldest(struct homa *homa) { @@ -679,40 +679,40 @@ void homa_grant_end_rpc(struct homa_rpc *rpc) } /** - * homa_grantable_lock_slow() - This function implements the slow path for - * acquiring the grantable lock. It is invoked when the lock isn't immediately + * homa_grant_lock_slow() - This function implements the slow path for + * acquiring the grant lock. It is invoked when the lock isn't immediately * available. It waits for the lock, but also records statistics about * the waiting time. * @homa: Overall data about the Homa protocol implementation. * @recalc: Nonzero means the caller is homa_grant_recalc; if another thread * is already recalculating, can return without waiting for the lock. 
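			/* Spinning on trylock here (rather than blocking in
			 * spin_lock_bh) is what enables the bailout below: a
			 * recalc caller gives up once another thread has
			 * completed a fresh recalculation.
			 */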
- * Return: Nonzero means this thread now owns the grantable lock. Zero + * Return: Nonzero means this thread now owns the grant lock. Zero * means the lock was not acquired and there is no need for this * thread to do the work of homa_grant_recalc because some other * thread started a fresh calculation after this method was invoked. */ -int homa_grantable_lock_slow(struct homa *homa, int recalc) - __acquires(&homa->grantable_lock) +int homa_grant_lock_slow(struct homa *homa, int recalc) + __acquires(&homa->grant_lock) { int starting_count = atomic_read(&homa->grant_recalc_count); u64 start = sched_clock(); int result = 0; - tt_record("beginning wait for grantable lock"); + tt_record("beginning wait for grant lock"); while (1) { - if (spin_trylock_bh(&homa->grantable_lock)) { - tt_record("ending wait for grantable lock"); + if (spin_trylock_bh(&homa->grant_lock)) { + tt_record("ending wait for grant lock"); result = 1; break; } if (recalc && atomic_read(&homa->grant_recalc_count) != starting_count) { - tt_record("skipping wait for grantable lock: recalc elsewhere"); + tt_record("skipping wait for grant lock: recalc elsewhere"); break; } } - INC_METRIC(grantable_lock_misses, 1); - INC_METRIC(grantable_lock_miss_ns, sched_clock() - start); + INC_METRIC(grant_lock_misses, 1); + INC_METRIC(grant_lock_miss_ns, sched_clock() - start); return result; } diff --git a/homa_grant.h b/homa_grant.h index ed5dd89a..18611757 100644 --- a/homa_grant.h +++ b/homa_grant.h @@ -7,12 +7,12 @@ #include "homa_rpc.h" -int homa_grantable_lock_slow(struct homa *homa, int recalc); void homa_grant_add_rpc(struct homa_rpc *rpc); void homa_grant_check_rpc(struct homa_rpc *rpc); void homa_grant_end_rpc(struct homa_rpc *rpc); void homa_grant_find_oldest(struct homa *homa); int homa_grant_get_offset(struct homa_rpc *rpc, struct homa *homa); +int homa_grant_lock_slow(struct homa *homa, int recalc); void homa_grant_log_tt(struct homa *homa); int homa_grant_outranks(struct homa_rpc *rpc1, struct homa_rpc *rpc2); @@ -26,39 +26,38 @@ void homa_grant_update_incoming(struct homa_rpc *rpc, struct homa *homa); /** - * homa_grantable_lock() - Acquire the grantable lock. If the lock + * homa_grant_lock() - Acquire the grant lock. If the lock * isn't immediately available, record stats on the waiting time. * @homa: Overall data about the Homa protocol implementation. * @recalc: Nonzero means the caller is homa_grant_recalc; if another thread * is already recalculating, can return without waiting for the lock. - * Return: Nonzero means this thread now owns the grantable lock. Zero + * Return: Nonzero means this thread now owns the grant lock. Zero * means the lock was not acquired and there is no need for this * thread to do the work of homa_grant_recalc because some other * thread started a fresh calculation after this method was invoked. */ -static inline int homa_grantable_lock(struct homa *homa, int recalc) - __acquires(&homa->grantable_lock) +static inline int homa_grant_lock(struct homa *homa, int recalc) + __acquires(&homa->grant_lock) { int result; - if (spin_trylock_bh(&homa->grantable_lock)) + if (spin_trylock_bh(&homa->grant_lock)) result = 1; else - result = homa_grantable_lock_slow(homa, recalc); - homa->grantable_lock_time = sched_clock(); + result = homa_grant_lock_slow(homa, recalc); + homa->grant_lock_time = sched_clock(); return result; } /** - * homa_grantable_unlock() - Release the grantable lock. + * homa_grant_unlock() - Release the grant lock. * @homa: Overall data about the Homa protocol implementation. 
 */
-static inline void homa_grantable_unlock(struct homa *homa)
-	__releases(&homa->grantable_lock)
+static inline void homa_grant_unlock(struct homa *homa)
+	__releases(&homa->grant_lock)
 {
-	INC_METRIC(grantable_lock_ns, sched_clock() -
-			homa->grantable_lock_time);
-	spin_unlock_bh(&homa->grantable_lock);
+	INC_METRIC(grant_lock_ns, sched_clock() - homa->grant_lock_time);
+	spin_unlock_bh(&homa->grant_lock);
 }
 
 /**
diff --git a/homa_impl.h b/homa_impl.h
index ef8a7ace..eed204c5 100644
--- a/homa_impl.h
+++ b/homa_impl.h
@@ -122,7 +122,7 @@ struct homa {
 #ifndef __STRIP__ /* See strip.py */
 
 	/**
-	 * @grantable_lock: Used to synchronize access to grant-related
+	 * @grant_lock: Used to synchronize access to grant-related
	 * fields below. In order to reduce contention, this lock is held
	 * only when making structural changes (e.g. modifying grantable_peers
	 * or active_rpcs). It is not held when computing new grant offsets
@@ -130,13 +130,13 @@ struct homa {
	 * possible for RPCs to receive grants out of priority order, or to
	 * receive duplicate grants.
	 */
-	spinlock_t grantable_lock ____cacheline_aligned_in_smp;
+	spinlock_t grant_lock ____cacheline_aligned_in_smp;
 
	/**
-	 * @grantable_lock_time: sched_clock() time when grantable_lock
+	 * @grant_lock_time: sched_clock() time when grant_lock
	 * was last locked.
	 */
-	u64 grantable_lock_time;
+	u64 grant_lock_time;
 
	/**
	 * @grant_recalc_count: Incremented every time homa_grant_recalc
@@ -187,7 +187,7 @@ struct homa {
	/**
	 * @active_remaining: entry i in this array contains a copy of
	 * active_rpcs[i]->msgin.bytes_remaining. These values can be
-	 * updated by the corresponding RPCs without holding the grantable
+	 * updated by the corresponding RPCs without holding the grant
	 * lock. Perfect consistency isn't required; these are hints used to
	 * detect when the priority ordering of messages changes.
	 */
@@ -196,7 +196,7 @@ struct homa {
	/**
	 * @oldest_rpc: The RPC with incoming data whose start_ns is
	 * farthest in the past. NULL means either there are no incoming
-	 * RPCs or the oldest needs to be recomputed. Must hold grantable_lock
+	 * RPCs or the oldest needs to be recomputed. Must hold grant_lock
	 * to update.
	 */
	struct homa_rpc *oldest_rpc;
diff --git a/homa_incoming.c b/homa_incoming.c
index 18615fea..5e80b965 100644
--- a/homa_incoming.c
+++ b/homa_incoming.c
@@ -1016,7 +1016,7 @@ void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk,
  * being retained until fifo grants are reimplemented using the new grant
  * mechanism.
  * @homa:     Overall data about the Homa protocol implementation. The
- *            grantable_lock must be held by the caller.
+ *            grant lock must be held by the caller.
  * Return: An RPC to which to send a FIFO grant, or NULL if there is
  *            no appropriate RPC. This method doesn't actually send a grant,
  *            but it updates @msgin.granted to reflect the desired grant.
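To make the locking discipline above concrete, here is a minimal sketch of
a hypothetical caller (not code from this patch) that follows the same
pattern homa_grant_recalc uses with the renamed lock functions: hold the
grant lock only for structural changes, and transmit grants after the
lock has been dropped:

	if (!homa_grant_lock(homa, 1)) {
		/* Another thread started a fresh recalculation after we
		 * were invoked, so there is nothing left for us to do.
		 */
		return;
	}
	/* Structural changes only, e.g. reordering grantable_peers or
	 * active_rpcs.
	 */
	homa_grant_unlock(homa);
	/* Compute and send grants here, without holding the lock. */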
diff --git a/homa_metrics.c b/homa_metrics.c index 89121f56..064145d6 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -234,8 +234,8 @@ char *homa_metrics_print(void) m->so_set_buf_ns); M("so_set_buf_calls %15llu Total invocations of setsockopt SO_HOMA_RCVBUF\n", m->so_set_buf_calls); - M("grantable_lock_ns %15llu Time spent with homa->grantable_lock locked\n", - m->grantable_lock_ns); + M("grant_lock_ns %15llu Time spent with grant lock locked\n", + m->grant_lock_ns); M("timer_ns %15llu Time spent in homa_timer\n", m->timer_ns); M("timer_reap_ns %15llu Time in homa_timer spent reaping RPCs\n", @@ -313,16 +313,16 @@ char *homa_metrics_print(void) m->peer_ack_lock_misses); M("peer_ack_lock_miss_ns %15llu Time lost waiting for peer ack locks\n", m->peer_ack_lock_miss_ns); - M("grantable_lock_misses %15llu Grantable lock misses\n", - m->grantable_lock_misses); - M("grantable_lock_miss_ns %15llu Time lost waiting for grantable lock\n", - m->grantable_lock_miss_ns); + M("grant_lock_misses %15llu Grant lock misses\n", + m->grant_lock_misses); + M("grant_lock_miss_ns %15llu Time lost waiting for grant lock\n", + m->grant_lock_miss_ns); M("grantable_rpcs_integral %15llu Integral of homa->num_grantable_rpcs*dt\n", m->grantable_rpcs_integral); - M("grant_check_calls %15llu Number of calls to homa_grant_check_rpc\n", - m->grant_check_calls); - M("grant_recalc_calls %15llu Number of calls to homa_grant_recalc\n", - m->grant_recalc_calls); + M("grant_check_calls %15llu Number of calls to homa_grant_check_rpc\n", + m->grant_check_calls); + M("grant_recalc_calls %15llu Number of calls to homa_grant_recalc\n", + m->grant_recalc_calls); M("grant_recalc_ns %15llu Time spent in homa_grant_recalc\n", m->grant_recalc_ns); M("grant_recalc_loops %15llu Number of times homa_grant_recalc looped back\n", diff --git a/homa_metrics.h b/homa_metrics.h index be4fc62f..afc32b31 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -242,11 +242,8 @@ struct homa_metrics { */ u64 so_set_buf_calls; - /** - * @grantable_lock_ns: total time spent with homa->grantable_lock - * locked. - */ - u64 grantable_lock_ns; + /** @grant_lock_ns: total time spent with the grant lock locked. */ + u64 grant_lock_ns; /** @timer_ns: total time spent in homa_timer. */ u64 timer_ns; @@ -467,16 +464,16 @@ struct homa_metrics { u64 peer_ack_lock_misses; /** - * @grantable_lock_miss_ns: total time spent waiting for grantable - * lock misses. + * @grant_lock_miss_ns: total time spent waiting for grant lock + * misses. */ - u64 grantable_lock_miss_ns; + u64 grant_lock_miss_ns; /** - * @grantable_lock_misses: total number of times that Homa had to wait - * to acquire the grantable lock. + * @grant_lock_misses: total number of times that Homa had to wait + * to acquire the grant lock. */ - u64 grantable_lock_misses; + u64 grant_lock_misses; /** * @grantable_rpcs_integral: cumulative sum of time_delta*grantable, diff --git a/homa_peer.h b/homa_peer.h index eb397de0..09bd4b61 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -121,7 +121,7 @@ struct homa_peer { * responses) involving this peer whose msgins require (or required * them in the past) and have not been fully received. The list is * sorted in priority order (head has fewest bytes_remaining). - * Locked with homa->grantable_lock. + * Locked with homa->grant_lock. 
*/ struct list_head grantable_rpcs; diff --git a/homa_rpc.h b/homa_rpc.h index 0fac0adc..52fd1444 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -155,7 +155,7 @@ struct homa_message_in { * @granted: Total # of bytes (starting from offset 0) that the sender * may transmit without additional grants, includes unscheduled bytes. * Never larger than @length. Note: once initialized, this - * may not be modified without holding @homa->grantable_lock. + * may not be modified without holding @homa->grant_lock. */ int granted; diff --git a/homa_utils.c b/homa_utils.c index 1e9b3397..5cde1262 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -38,7 +38,7 @@ int homa_init(struct homa *homa, struct net *net) memset(homa, 0, sizeof(*homa)); atomic64_set(&homa->next_outgoing_id, 2); #ifndef __STRIP__ /* See strip.py */ - spin_lock_init(&homa->grantable_lock); + spin_lock_init(&homa->grant_lock); INIT_LIST_HEAD(&homa->grantable_peers); homa->last_grantable_change = sched_clock(); #endif /* See strip.py */ diff --git a/sync.txt b/sync.txt index dcb11d54..eb3c6ffb 100644 --- a/sync.txt +++ b/sync.txt @@ -30,7 +30,7 @@ This file describes the synchronization strategy used for Homa. locks are held, they must always be acquired in a consistent order, in order to prevent deadlock. For each lock, here are the other locks that may be acquired while holding the given lock. - * RPC: socket, grantable, throttle, peer->ack_lock + * RPC: socket, grant, throttle, peer->ack_lock * Socket: port_map.write_lock Any lock not listed above must be a "leaf" lock: no other lock will be acquired while holding the lock. diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index a04c84f4..9af4d0ea 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -27,7 +27,7 @@ char *rpc_ids(struct homa_rpc **rpcs, int count) } static struct homa *hook_homa; -static void grantable_spinlock_hook(char *id) +static void grant_spinlock_hook(char *id) { if (strcmp(id, "spin_lock") != 0) return; @@ -816,7 +816,7 @@ TEST_F(homa_grant, homa_grant_recalc__cant_acquire_grantable_lock) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); - unit_hook_register(grantable_spinlock_hook); + unit_hook_register(grant_spinlock_hook); hook_homa = &self->homa; mock_trylock_errors = 0xff; @@ -962,7 +962,7 @@ TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted_but_cant_get_lock) rpc4 = test_rpc(self, 106, self->server_ip, 10000); self->homa.max_incoming = 32000; self->homa.max_overcommit = 2; - unit_hook_register(grantable_spinlock_hook); + unit_hook_register(grant_spinlock_hook); hook_homa = &self->homa; mock_trylock_errors = 0xe0; EXPECT_EQ(0, homa_metrics_per_cpu()->grant_recalc_skips); @@ -1160,35 +1160,35 @@ TEST_F(homa_grant, homa_grant_end_rpc__not_in_active_list) EXPECT_EQ(15000, atomic_read(&self->homa.total_incoming)); } -TEST_F(homa_grant, homa_grantable_lock_slow__basics) +TEST_F(homa_grant, homa_grant_lock_slow__basics) { mock_ns = 500; - unit_hook_register(grantable_spinlock_hook); + unit_hook_register(grant_spinlock_hook); - EXPECT_EQ(1, homa_grantable_lock_slow(&self->homa, 0)); - homa_grantable_unlock(&self->homa); + EXPECT_EQ(1, homa_grant_lock_slow(&self->homa, 0)); + homa_grant_unlock(&self->homa); - EXPECT_EQ(1, homa_metrics_per_cpu()->grantable_lock_misses); - EXPECT_EQ(500, homa_metrics_per_cpu()->grantable_lock_miss_ns); + EXPECT_EQ(1, homa_metrics_per_cpu()->grant_lock_misses); + EXPECT_EQ(500, homa_metrics_per_cpu()->grant_lock_miss_ns); } -TEST_F(homa_grant, homa_grantable_lock_slow__recalc_count) 
+TEST_F(homa_grant, homa_grant_lock_slow__recalc_count)
 {
 	mock_ns = 500;
-	unit_hook_register(grantable_spinlock_hook);
+	unit_hook_register(grant_spinlock_hook);
 	hook_homa = &self->homa;
 	mock_trylock_errors = 0xff;
 
-	EXPECT_EQ(0, homa_grantable_lock_slow(&self->homa, 1));
+	EXPECT_EQ(0, homa_grant_lock_slow(&self->homa, 1));
 	hook_homa = NULL;
-	EXPECT_EQ(1, homa_metrics_per_cpu()->grantable_lock_misses);
-	EXPECT_EQ(500, homa_metrics_per_cpu()->grantable_lock_miss_ns);
+	EXPECT_EQ(1, homa_metrics_per_cpu()->grant_lock_misses);
+	EXPECT_EQ(500, homa_metrics_per_cpu()->grant_lock_miss_ns);
 
 	/* Make sure the check only occurs if the recalc argument is set. */
 	mock_trylock_errors = 0xff;
-	EXPECT_EQ(1, homa_grantable_lock_slow(&self->homa, 0));
-	EXPECT_EQ(2, homa_metrics_per_cpu()->grantable_lock_misses);
-	homa_grantable_unlock(&self->homa);
+	EXPECT_EQ(1, homa_grant_lock_slow(&self->homa, 0));
+	EXPECT_EQ(2, homa_metrics_per_cpu()->grant_lock_misses);
+	homa_grant_unlock(&self->homa);
 }
 
 /* Functions in homa_grant.h:
diff --git a/util/metrics.py b/util/metrics.py
index ee4e0925..1220aaec 100755
--- a/util/metrics.py
+++ b/util/metrics.py
@@ -354,7 +354,7 @@ def scale_number(number):
     print("\nLock Misses:")
     print("------------")
     print("                  Misses/sec.     ns/Miss    %CPU")
-    for lock in ["client", "server", "socket", "grantable", "throttle", "peer_ack"]:
+    for lock in ["client", "server", "socket", "grant", "throttle", "peer_ack"]:
         misses = float(deltas[lock + "_lock_misses"])
         ns = float(deltas[lock + "_lock_miss_ns"])
         if misses == 0:
@@ -454,8 +454,8 @@ def scale_number(number):
             rate_info = ("(%s/s) " % (scale_number(rate))).ljust(13)
         print("%-28s %15d %s%s" % (symbol, deltas[symbol], rate_info,
                 docs[symbol]))
-    for symbol in ["pacer_lost_ns", "timer_reap_ns",
-            "data_pkt_reap_ns", "grantable_lock_ns"]:
+    for symbol in ["pacer_lost_ns", "timer_reap_ns", "data_pkt_reap_ns",
+            "grant_lock_ns"]:
         delta = deltas[symbol]
         if delta == 0 or time_delta == 0:
             continue
diff --git a/util/tthoma.py b/util/tthoma.py
index 8ddb110c..df43bb95 100755
--- a/util/tthoma.py
+++ b/util/tthoma.py
@@ -2900,11 +2900,11 @@ def output(self):
                     tx_id, pkt['offset']))
 
 #------------------------------------------------
-# Analyzer: grantablelock
+# Analyzer: grantlock
 #------------------------------------------------
-class AnalyzeGrantablelock:
+class AnalyzeGrantlock:
     """
-    Analyzes contention for the grantable lock, which controls centrally
+    Analyzes contention for the grant lock, which controls centrally
     managed data about grantable RPCs.
     """
 
@@ -2920,7 +2920,7 @@ def __init__(self, dispatcher):
         #
         self.nodes = {}
 
-        # One record for each interval where a core blocked for the grantable
+        # One record for each interval where a core blocked for the grant
        # lock: <time, duration, node, core> where time is when the lock was
        # finally acquired, duration is how long the core had to wait, and
        # node and core indicate where the block occurred.
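As a concrete illustration (the values are hypothetical, not from a real
trace): a record <3190.5, 12.7, node3, 11> would mean that core 11 on node3
finally acquired the grant lock at time 3190.5 after blocking for 12.7
microseconds; the Blocked statistic printed below is the sum of such
durations for a node divided by the elapsed time of the trace.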
@@ -2958,7 +2958,7 @@ def init_trace(self, trace): self.last_unblock = None def tt_lock_wait(self, trace, time, core, event, lock_name): - if lock_name != 'grantable': + if lock_name != 'grant': return if event == 'beginning': # Core blocked on lock @@ -2991,17 +2991,17 @@ def output(self): global traces print('\n-----------------------') - print('Analyzer: grantablelock') + print('Analyzer: grantlock') print('-----------------------\n') - print('Per-node statistics on usage of the grantable lock:') + print('Per-node statistics on usage of the grant lock:') print('Node: Name of node') print('Blocked: Fraction of core(s) wasted while blocked on the lock ' '(1.0 means') print(' that on average, one core was blocked on the lock)') print('MaxCore: The core that spent the largest fraction of its time ' 'blocked on') - print(' the grantable lock') + print(' the grant lock') print('MaxBlk: Fraction of time that MaxCore was blocked on the lock') print('HoldFrac: Fraction of time this node held the lock (note: ' 'hold times ') @@ -3043,7 +3043,7 @@ def output(self): print('%-10s %5.2f C%02d %6.3f %s' % (name, total_block/elapsed, max_block_core, max_block/elapsed, hold_info)) - print('\nLongest times a core had to wait for the grantable lock:') + print('\nLongest times a core had to wait for the grant lock:') print(' EndTime BlockTime Node Core') self.block_intervals.sort(key=lambda t : t[1], reverse=True) for i in range(len(self.block_intervals)): @@ -3052,7 +3052,7 @@ def output(self): time, duration, node, core = self.block_intervals[i] print('%9.3f %7.1f %10s %4d' % (time, duration, node, core)) - print('\nLongest periods that one core held the grantable lock:') + print('\nLongest periods that one core held the grant lock:') print('StartTime HoldTime Node Core') self.hold_times.sort(key=lambda t : t[1], reverse=True) for i in range(len(self.hold_times)): From 605810c1879fb271268e5079962dce5c71cffa70 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 24 Apr 2025 08:51:16 -0700 Subject: [PATCH 254/625] Add new entry in .gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 241ffa67..3d4c0094 100644 --- a/.gitignore +++ b/.gitignore @@ -21,5 +21,5 @@ reports/ traces/ -bytedance/ +users/ saved_traces/ \ No newline at end of file From e5e36a4c6d6a8f55cc9fc60e32b0288f427915a4 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 24 Apr 2025 08:42:45 -0700 Subject: [PATCH 255/625] Improvements to cp_node.cc * Introduce fatal method to exit without destructing static objects * Handle RPC timeouts without exiting --- util/cp_node.cc | 131 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 88 insertions(+), 43 deletions(-) diff --git a/util/cp_node.cc b/util/cp_node.cc index 4e4fbb55..7d164cf9 100644 --- a/util/cp_node.cc +++ b/util/cp_node.cc @@ -245,6 +245,16 @@ int kfreeze_count = 0; */ int64_t debug[5]; +/** + * fatal() - Invoked when fatal errors occur: exits the application. + */ +void fatal() +{ + fflush(stdout); + fflush(stderr); + _exit(1); +} + /** * print_help() - Print out usage information for this program. 
* @name: Name of the program (argv[0]) @@ -794,7 +804,7 @@ void tcp_connection::set_epoll_events(int epoll_fd, uint32_t events) : EPOLL_CTL_MOD, fd, &ev) < 0) { log(NORMAL, "FATAL: couldn't add/modify epoll event: %s\n", strerror(errno)); - _exit(1); + fatal(); } epoll_events = events; } @@ -862,7 +872,7 @@ bool tcp_connection::xmit() "to %s: %s (port %d)\n", print_address(&peer), strerror(errno), port); - _exit(1); + fatal(); } } if (bytes_sent < header->length) { @@ -910,7 +920,8 @@ class server_metrics { /** * @metrics: keeps track of metrics for all servers (whether Homa or TCP). - * These are malloc-ed and must be freed eventually. + * These are malloc-ed and must be freed eventually. This is a pointer so + * that it doesn't get destructed */ std::vector metrics; @@ -987,7 +998,7 @@ homa_server::homa_server(int port, int id, int inet_family, int num_threads, log(NORMAL, "FATAL: homa_server couldn't open Homa " "socket: %s\n", strerror(errno)); - _exit(1); + fatal(); } memset(&addr, 0, sizeof(addr)); @@ -1002,7 +1013,7 @@ homa_server::homa_server(int port, int id, int inet_family, int num_threads, log(NORMAL, "FATAL: homa_server couldn't bind socket " "to Homa port %d: %s\n", port, strerror(errno)); - _exit(1); + fatal(); } log(NORMAL, "Successfully bound to Homa port %d\n", port); @@ -1012,7 +1023,7 @@ homa_server::homa_server(int port, int id, int inet_family, int num_threads, if (buf_region == MAP_FAILED) { printf("Couldn't mmap buffer region for server on port %d: %s\n", port, strerror(errno)); - _exit(1); + fatal(); } arg.start = (uintptr_t)buf_region; arg.length = buf_size; @@ -1021,7 +1032,7 @@ homa_server::homa_server(int port, int id, int inet_family, int num_threads, if (status < 0) { printf("FATAL: error in setsockopt(SO_HOMA_RCVBUF): %s\n", strerror(errno)); - _exit(1); + fatal(); } for (int i = 0; i < num_threads; i++) { @@ -1117,7 +1128,7 @@ void homa_server::server(int thread_id, server_metrics *metrics) log(NORMAL, "FATAL: homa_reply failed for server " "port %d: %s\n", port, strerror(errno)); - _exit(1); + fatal(); } metrics->requests++; metrics->bytes_in += length; @@ -1219,7 +1230,7 @@ tcp_server::tcp_server(int port, int id, int num_threads, if (listen_fd == -1) { log(NORMAL, "FATAL: couldn't open server socket: %s\n", strerror(errno)); - _exit(1); + fatal(); } int option_value = 1; if (setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &option_value, @@ -1227,13 +1238,13 @@ tcp_server::tcp_server(int port, int id, int num_threads, log(NORMAL, "FATAL: couldn't set SO_REUSEADDR on listen " "socket: %s", strerror(errno)); - _exit(1); + fatal(); } if (fcntl(listen_fd, F_SETFL, O_NONBLOCK) != 0) { log(NORMAL, "FATAL: couldn't set O_NONBLOCK on listen " "socket: %s", strerror(errno)); - _exit(1); + fatal(); } sockaddr_in_union addr; if (inet_family == AF_INET) { @@ -1248,12 +1259,12 @@ tcp_server::tcp_server(int port, int id, int num_threads, if (bind(listen_fd, &addr.sa, sizeof(addr)) == -1) { log(NORMAL, "FATAL: couldn't bind to port %d: %s\n", port, strerror(errno)); - _exit(1); + fatal(); } if (listen(listen_fd, 1000) == -1) { log(NORMAL, "FATAL: couldn't listen on socket: %s", strerror(errno)); - _exit(1); + fatal(); } epoll_fd = epoll_create(10); @@ -1261,7 +1272,7 @@ tcp_server::tcp_server(int port, int id, int num_threads, log(NORMAL, "FATAL: couldn't create epoll instance for " "TCP server: %s\n", strerror(errno)); - _exit(1); + fatal(); } struct epoll_event ev; ev.events = EPOLLIN; @@ -1269,7 +1280,7 @@ tcp_server::tcp_server(int port, int id, int num_threads, if 
(epoll_ctl(epoll_fd, EPOLL_CTL_ADD, listen_fd, &ev) < 0) { log(NORMAL, "FATAL: couldn't add listen socket to epoll: %s\n", strerror(errno)); - _exit(1); + fatal(); } metrics = new server_metrics(experiment); @@ -1296,7 +1307,7 @@ tcp_server::~tcp_server() if (pipe2(fds, 0) < 0) { log(NORMAL, "FATAL: couldn't create pipe to shutdown TCP " "server: %s\n", strerror(errno)); - _exit(1); + fatal(); } struct epoll_event ev; ev.events = EPOLLIN; @@ -1305,7 +1316,7 @@ tcp_server::~tcp_server() if (write(fds[1], "xxxx", 4) < 0) { log(NORMAL, "FATAL: couldn't write to TCP shutdown pipe: %s\n", strerror(errno)); - _exit(1); + fatal(); } for (size_t i = 0; i < threads.size(); i++) @@ -1367,7 +1378,7 @@ void tcp_server::server(int thread_id) continue; log(NORMAL, "FATAL: epoll_wait failed: %s\n", strerror(errno)); - _exit(1); + fatal(); } tt("epoll_wait returned %d events in server pid %d", num_events, pid); @@ -1411,7 +1422,7 @@ void tcp_server::accept(int epoll_fd) return; log(NORMAL, "FATAL: couldn't accept incoming TCP connection: " "%s\n", strerror(errno)); - _exit(1); + fatal(); } /* Make sure the connection appears to be coming from someone @@ -1447,7 +1458,7 @@ void tcp_server::accept(int epoll_fd) if (fd >= MAX_FDS) { log(NORMAL, "FATAL: TCP socket fd %d is greater than MAX_FDS\n", fd); - _exit(1); + fatal(); } spin_lock lock_guard(&fd_locks[fd]); tcp_connection *connection = new tcp_connection(fd, fd, port, @@ -1524,6 +1535,11 @@ class client { */ bool active; + /** + * @id: RPC identifier for the request (only for Homa requests). + */ + uint64_t id; + rinfo() : start_time(0), request_length(0), active(false) {} }; @@ -1722,7 +1738,7 @@ client::client(int id, std::string& experiment) log(NORMAL, "FATAL: couldn't look up address " "for %s: %s\n", host, gai_strerror(status)); - _exit(1); + fatal(); } dest = reinterpret_cast (matching_addresses->ai_addr); @@ -1814,7 +1830,7 @@ int client::get_rinfo() "total_responses %ld, last_rinfo %d)\n", rinfos.size(), total_requests, total_responses.load(), last_rinfo); - _exit(1); + fatal(); } } } @@ -1893,6 +1909,7 @@ class homa_client : public client { void receiver(int id); void sender(void); virtual void stop_sender(void); + void timeout(homa::receiver *receiver); bool wait_response(homa::receiver *receiver, uint64_t rpc_id); /** @fd: file descriptor for Homa socket. 
*/ @@ -1955,7 +1972,7 @@ homa_client::homa_client(int id, std::string& experiment) fd = socket(inet_family, SOCK_DGRAM, IPPROTO_HOMA); if (fd < 0) { log(NORMAL, "Couldn't open Homa socket: %s\n", strerror(errno)); - _exit(1); + fatal(); } buf_region = (char *) mmap(NULL, buf_size, PROT_READ|PROT_WRITE, @@ -1963,7 +1980,7 @@ homa_client::homa_client(int id, std::string& experiment) if (buf_region == MAP_FAILED) { printf("Couldn't mmap buffer region for homa_client id %d: %s\n", id, strerror(errno)); - _exit(1); + fatal(); } arg.start = (uintptr_t)buf_region; arg.length = buf_size; @@ -1972,7 +1989,7 @@ homa_client::homa_client(int id, std::string& experiment) if (status < 0) { printf("FATAL: error in setsockopt(SO_HOMA_RCVBUF): %s\n", strerror(errno)); - _exit(1); + fatal(); } if (unloaded) { @@ -2057,18 +2074,22 @@ bool homa_client::wait_response(homa::receiver *receiver, uint64_t rpc_id) if (length < 0) { if (exit_receivers) return false; + if (errno == ETIMEDOUT) { + timeout(receiver); + return true; + } log(NORMAL, "FATAL: error in Homa recvmsg: %s (id %lu, " "server %s)\n", - strerror(errno), rpc_id, + strerror(errno), receiver->id(), print_address((union sockaddr_in_union *) receiver->src_addr())); - _exit(1); + fatal(); } header = receiver->get(0); if (header == nullptr) { log(NORMAL, "FATAL: Homa response message contained %lu bytes; " "need at least %lu", length, sizeof(*header)); - _exit(1); + fatal(); } uint64_t end_time = rdtsc(); tt("Received response, cid 0x%08x, id %x, %d bytes", @@ -2077,6 +2098,29 @@ bool homa_client::wait_response(homa::receiver *receiver, uint64_t rpc_id) return true; } + +/** + * timeout() - Invoked to process Homa timeouts (free up the rinfo struct). + * @receiver: Holds information about the failed RPC. + */ +void homa_client::timeout(homa::receiver *receiver) +{ + uint64_t id = receiver->id(); + for (struct rinfo &r: rinfos) { + if (r.id == id) { + log(NORMAL, "ERROR: Homa RPC timed out, id %lu, " + "length %d, server %s\n", + id, r.request_length, + print_address((union sockaddr_in_union *) + receiver->src_addr())); + r.active = false; + return; + } + } + log(NORMAL, "FATAL: couldn't find rinfo for timed out RPC id %lu\n", id); + fatal(); +} + /** * homa_client::sender() - Invoked as the top-level method in a thread; * invokes a pseudo-random stream of RPCs continuously. 
@@ -2148,8 +2192,9 @@ void homa_client::sender() log(NORMAL, "FATAL: error in homa_send: %s (request " "length %d)\n", strerror(errno), header->length); - _exit(1); + fatal(); } + rinfos[slot].id = rpc_id; requests[server]++; total_requests++; lag = now - next_start; @@ -2212,7 +2257,7 @@ uint64_t homa_client::measure_rtt(int server, int length, char *buffer, log(NORMAL, "FATAL: error in homa_send: %s (request " "length %d)\n", strerror(errno), header->length); - _exit(1); + fatal(); } do { status = receiver->receive(0, rpc_id); @@ -2223,7 +2268,7 @@ uint64_t homa_client::measure_rtt(int server, int length, char *buffer, strerror(errno), rpc_id, print_address((union sockaddr_in_union *) receiver->src_addr())); - _exit(1); + fatal(); } return rdtsc() - start; } @@ -2371,7 +2416,7 @@ tcp_client::tcp_client(int id, std::string& experiment) if (epoll_fd < 0) { log(NORMAL, "FATAL: tcp_client couldn't create epoll " "instance: %s\n", strerror(errno)); - _exit(1); + fatal(); } for (uint32_t i = 0; i < server_addrs.size(); i++) { @@ -2380,7 +2425,7 @@ tcp_client::tcp_client(int id, std::string& experiment) log(NORMAL, "FATAL: couldn't open TCP client " "socket: %s\n", strerror(errno)); - _exit(1); + fatal(); } if (connect(fd, reinterpret_cast( &server_addrs[i]), @@ -2389,7 +2434,7 @@ tcp_client::tcp_client(int id, std::string& experiment) "to %s: %s\n", print_address(&server_addrs[i]), strerror(errno)); - _exit(1); + fatal(); } int flag = 1; setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &flag, sizeof(flag)); @@ -2398,7 +2443,7 @@ tcp_client::tcp_client(int id, std::string& experiment) "to server %s: %s", print_address(&server_addrs[i]), strerror(errno)); - _exit(1); + fatal(); } sockaddr_in_union addr; socklen_t length = sizeof(addr); @@ -2406,7 +2451,7 @@ tcp_client::tcp_client(int id, std::string& experiment) &length)) { log(NORMAL, "FATAL: getsockname failed for TCP client: " "%s\n", strerror(errno)); - _exit(1); + fatal(); } connections.emplace_back(new tcp_connection(fd, i, ntohs(addr.in4.sin_port), server_addrs[i])); @@ -2442,7 +2487,7 @@ tcp_client::~tcp_client() if (pipe2(fds, 0) < 0) { log(NORMAL, "FATAL: couldn't create pipe to shutdown TCP " "server: %s\n", strerror(errno)); - _exit(1); + fatal(); } struct epoll_event ev; ev.events = EPOLLIN; @@ -2451,7 +2496,7 @@ tcp_client::~tcp_client() if (write(fds[1], "xxxx", 4) < 0) { log(NORMAL, "FATAL: couldn't write to TCP shutdown " "pipe: %s\n", strerror(errno)); - _exit(1); + fatal(); } if (sending_thread) @@ -2595,7 +2640,7 @@ void tcp_client::receiver(int receiver_id) log(NORMAL, "FATAL: epoll_wait failed in tcp_client: " "%s\n", strerror(errno)); - _exit(1); + fatal(); } tt("epoll_wait returned %d events in client pid %d", num_events, pid); @@ -2629,7 +2674,7 @@ void tcp_client::read(tcp_connection *connection, int pid) if (error) { log(NORMAL, "FATAL: %s (client)\n", connection->error_message); - _exit(1); + fatal(); } } @@ -3508,13 +3553,13 @@ int main(int argc, char** argv) if (getrlimit(RLIMIT_NOFILE, &limits) != 0) { log(NORMAL, "FATAL: couldn't read file descriptor limits: " "%s\n", strerror(errno)); - _exit(1); + fatal(); } limits.rlim_cur = limits.rlim_max; if (setrlimit(RLIMIT_NOFILE, &limits) != 0) { log(NORMAL, "FATAL: couldn't increase file descriptor limit: " "%s\n", strerror(errno)); - _exit(1); + fatal(); } struct sigaction action; action.sa_sigaction = error_handler; @@ -3533,7 +3578,7 @@ int main(int argc, char** argv) for (int i = 1; i < argc; i++) words.emplace_back(argv[i]); if (!exec_words(words)) - _exit(1); + fatal(); 
/* Instead of going interactive, just print stats. * every second. From 7cf453a69a2202b4202083c0485db993cad49b65 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 24 Apr 2025 08:43:49 -0700 Subject: [PATCH 256/625] Minor improvements to cperf.py * Process "timed out" messages in log files * Fix syntax errors in regex patterns --- util/cperf.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/util/cperf.py b/util/cperf.py index d92e970b..0bb90a3c 100644 --- a/util/cperf.py +++ b/util/cperf.py @@ -654,9 +654,8 @@ def start_servers(exp, ids, options): % (options.tcp_server_ports, options.tcp_port_threads, options.protocol, exp, options.ipv6), ids) server_nodes = ids - if (options.debug): - print("Pausing for debug setup; type to continue: ", end="") - input() + if options.debug: + input("Pausing for debug setup, type to continue: ") def run_experiment(name, clients, options): """ @@ -980,13 +979,19 @@ def scan_log(file, node, experiments): experiment = "" node_data = None active = False + timeouts = 0 for line in open(file): if "FATAL:" in line: log("%s: %s" % (file, line[:-1])) exited = True if "ERROR:" in line: + if "Homa RPC timed out" in line: + timeouts += 1 + if timeouts > 1: + continue log("%s: %s" % (file, line[:-1])) + continue if "cp_node exiting" in line: exited = True @@ -1053,6 +1058,8 @@ def scan_log(file, node, experiments): continue if not exited: log("%s appears to have crashed (didn't exit)" % (node)) + if timeouts > 1: + log("%s: %d additional Homa RPC timeouts" % (file, timeouts-1)) def scan_logs(): """ From c39b7606effbbf6315dcebb76450c545d62e496d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 24 Apr 2025 10:16:24 -0700 Subject: [PATCH 257/625] Refactor homa_pool APIs * Replace homa_pool_init with homa_pool_new and homa_pool_set_region * Move allocation and deallocation of homa_pools into homa_pool.c * Move locking responsibility out of homa_pool.c * Sockets now always have a valid buffer_pool pointer --- homa_plumbing.c | 9 +++-- homa_pool.c | 53 +++++++++++++++++---------- homa_pool.h | 7 ++-- homa_sock.c | 9 ++--- test/mock.c | 3 +- test/unit_homa_incoming.c | 4 ++- test/unit_homa_plumbing.c | 12 +++---- test/unit_homa_pool.c | 76 +++++++++++++++++++++++++++------------ test/unit_homa_rpc.c | 1 + 9 files changed, 115 insertions(+), 59 deletions(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index a7b76010..4366c26e 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -891,8 +891,9 @@ int homa_setsockopt(struct sock *sk, int level, int optname, return -EFAULT; homa_sock_lock(hsk); - ret = homa_pool_init(hsk, u64_to_user_ptr(args.start), - args.length); + ret = homa_pool_set_region(hsk->buffer_pool, + u64_to_user_ptr(args.start), + args.length); homa_sock_unlock(hsk); INC_METRIC(so_set_buf_calls, 1); INC_METRIC(so_set_buf_ns, sched_clock() - start); @@ -945,7 +946,9 @@ int homa_getsockopt(struct sock *sk, int level, int optname, if (len < sizeof(rcvbuf_args)) return -EINVAL; - homa_pool_get_rcvbuf(hsk, &rcvbuf_args); + homa_sock_lock(hsk); + homa_pool_get_rcvbuf(hsk->buffer_pool, &rcvbuf_args); + homa_sock_unlock(hsk); len = sizeof(rcvbuf_args); result = &rcvbuf_args; } else if (optname == SO_HOMA_SERVER) { diff --git a/homa_pool.c b/homa_pool.c index 50b18b2f..73e3398e 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -42,25 +42,41 @@ static void set_bpages_needed(struct homa_pool *pool) } /** - * homa_pool_init() - Initialize a homa_pool; any previous contents are - * destroyed. 
- * @hsk: Socket containing the pool to initialize. + * homa_pool_new() - Allocate and initialize a new homa_pool (it will have + * no region associated with it until homa_pool_set_region is invoked). + * @hsk: Socket the pool will be associated with. + * Return: A pointer to the new pool or a negative errno. + */ +struct homa_pool *homa_pool_new(struct homa_sock *hsk) +{ + struct homa_pool *pool; + + pool = kzalloc(sizeof(*pool), GFP_ATOMIC); + if (!pool) + return ERR_PTR(-ENOMEM); + pool->hsk = hsk; + return pool; +} + +/** + * homa_pool_set_region() - Associate a region of memory with a pool. + * @pool: Pool the region will be associated with. Must not currently + * have a region associated with it. * @region: First byte of the memory region for the pool, allocated * by the application; must be page-aligned. * @region_size: Total number of bytes available at @buf_region. * Return: Either zero (for success) or a negative errno for failure. */ -int homa_pool_init(struct homa_sock *hsk, void __user *region, +int homa_pool_set_region(struct homa_pool *pool, void __user *region, u64 region_size) { - struct homa_pool *pool = hsk->buffer_pool; int i, result; - homa_pool_destroy(hsk->buffer_pool); + if (pool->region) + return -EINVAL; if (((uintptr_t)region) & ~PAGE_MASK) return -EINVAL; - pool->hsk = hsk; pool->region = (char __user *)region; pool->num_bpages = region_size >> HOMA_BPAGE_SHIFT; pool->descriptors = NULL; @@ -106,31 +122,30 @@ int homa_pool_init(struct homa_sock *hsk, void __user *region, /** * homa_pool_destroy() - Destructor for homa_pool. After this method - * returns, the object should not be used unless it has been reinitialized. + * returns, the object should not be used (it will be freed here). * @pool: Pool to destroy. */ void homa_pool_destroy(struct homa_pool *pool) { - if (!pool->region) - return; - kfree(pool->descriptors); - free_percpu(pool->cores); - pool->region = NULL; + if (pool->region) { + kfree(pool->descriptors); + free_percpu(pool->cores); + pool->region = NULL; + } + kfree(pool); } /** * homa_pool_get_rcvbuf() - Return information needed to handle getsockopt * for HOMA_SO_RCVBUF. - * @hsk: Socket on which getsockopt request was made. + * @pool: Pool for which information is needed. * @args: Store info here. 
*/ -void homa_pool_get_rcvbuf(struct homa_sock *hsk, +void homa_pool_get_rcvbuf(struct homa_pool *pool, struct homa_rcvbuf_args *args) { - homa_sock_lock(hsk); - args->start = (uintptr_t)hsk->buffer_pool->region; - args->length = hsk->buffer_pool->num_bpages << HOMA_BPAGE_SHIFT; - homa_sock_unlock(hsk); + args->start = (uintptr_t)pool->region; + args->length = pool->num_bpages << HOMA_BPAGE_SHIFT; } /** diff --git a/homa_pool.h b/homa_pool.h index b919eeee..4463fa58 100644 --- a/homa_pool.h +++ b/homa_pool.h @@ -145,11 +145,12 @@ void __user *homa_pool_get_buffer(struct homa_rpc *rpc, int offset, int *available); int homa_pool_get_pages(struct homa_pool *pool, int num_pages, u32 *pages, int leave_locked); -void homa_pool_get_rcvbuf(struct homa_sock *hsk, +void homa_pool_get_rcvbuf(struct homa_pool *pool, struct homa_rcvbuf_args *args); -int homa_pool_init(struct homa_sock *hsk, void *buf_region, - u64 region_size); +struct homa_pool *homa_pool_new(struct homa_sock *hsk); int homa_pool_release_buffers(struct homa_pool *pool, int num_buffers, u32 *buffers); +int homa_pool_set_region(struct homa_pool *pool, void __user *region, + u64 region_size); #endif /* _HOMA_POOL_H */ diff --git a/homa_sock.c b/homa_sock.c index 0f6be755..5689122b 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -194,9 +194,11 @@ int homa_sock_init(struct homa_sock *hsk, struct homa *homa) bucket->id = i + 1000000; INIT_HLIST_HEAD(&bucket->rpcs); } - hsk->buffer_pool = kzalloc(sizeof(*hsk->buffer_pool), GFP_ATOMIC); - if (!hsk->buffer_pool) - result = -ENOMEM; + hsk->buffer_pool = homa_pool_new(hsk); + if (IS_ERR(hsk->buffer_pool)) { + result = PTR_ERR(hsk->buffer_pool); + hsk->buffer_pool = NULL; + } #ifndef __STRIP__ /* See strip.py */ if (homa->hijack_tcp) hsk->sock.sk_protocol = IPPROTO_TCP; @@ -303,7 +305,6 @@ void homa_sock_shutdown(struct homa_sock *hsk) if (hsk->buffer_pool) { homa_pool_destroy(hsk->buffer_pool); - kfree(hsk->buffer_pool); hsk->buffer_pool = NULL; } tt_record1("Finished shutdown for socket %d", hsk->port); diff --git a/test/mock.c b/test/mock.c index 4d6e0eef..97dc4d2d 100644 --- a/test/mock.c +++ b/test/mock.c @@ -1918,7 +1918,8 @@ int mock_sock_init(struct homa_sock *hsk, struct homa *homa, int port) mock_mtu = UNIT_TEST_DATA_PER_PACKET + hsk->ip_header_length + sizeof(struct homa_data_hdr); mock_net_device.gso_max_size = mock_mtu; - err = homa_pool_init(hsk, (void *) 0x1000000, 100*HOMA_BPAGE_SIZE); + err = homa_pool_set_region(hsk->buffer_pool, (void *) 0x1000000, + 100*HOMA_BPAGE_SIZE); return err; } diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 662ac6d4..43419c65 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -152,13 +152,14 @@ TEST_F(homa_incoming, homa_message_in_init__message_too_long) ASSERT_TRUE(IS_ERR(srpc)); EXPECT_EQ(EINVAL, -PTR_ERR(srpc)); } -TEST_F(homa_incoming, homa_message_in_init__pool_doesnt_exist) +TEST_F(homa_incoming, homa_message_in_init__no_buffer_region) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); homa_pool_destroy(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_new(&self->hsk); EXPECT_EQ(ENOMEM, -homa_message_in_init(crpc, HOMA_BPAGE_SIZE*2, 0)); EXPECT_EQ(0, crpc->msgin.num_bpages); } @@ -1284,6 +1285,7 @@ TEST_F(homa_incoming, homa_data_pkt__no_buffer_pool) ASSERT_NE(NULL, crpc); homa_pool_destroy(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_new(&self->hsk); unit_log_clear(); 
homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, 1400, 0), crpc); diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 91d23d75..39b695d0 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -324,6 +324,7 @@ TEST_F(homa_plumbing, homa_setsockopt__recvbuf_success) args.length = 64*HOMA_BPAGE_SIZE; self->optval.user = &args; homa_pool_destroy(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_new(&self->hsk); EXPECT_EQ(0, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, SO_HOMA_RCVBUF, self->optval, sizeof(struct homa_rcvbuf_args))); @@ -365,8 +366,11 @@ TEST_F(homa_plumbing, homa_getsockopt__recvbuf_success) struct homa_rcvbuf_args val; int size = sizeof32(val) + 10; - EXPECT_EQ(0, -homa_pool_init(&self->hsk, (void *)0x40000, - 10*HOMA_BPAGE_SIZE + 1000)); + homa_pool_destroy(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_new(&self->hsk); + EXPECT_EQ(0, -homa_pool_set_region(self->hsk.buffer_pool, + (void *)0x40000, + 10*HOMA_BPAGE_SIZE + 1000)); EXPECT_EQ(0, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, SO_HOMA_RCVBUF, (char *)&val, &size)); EXPECT_EQ(0x40000, val.start); @@ -437,8 +441,6 @@ TEST_F(homa_plumbing, homa_getsockopt__cant_copy_out_size) struct homa_rcvbuf_args val = {.start = 0, .length = 0}; int size = sizeof32(val) + 10; - EXPECT_EQ(0, -homa_pool_init(&self->hsk, (void *)0x40000, - 10*HOMA_BPAGE_SIZE + 1000)); mock_copy_to_user_errors = 1; EXPECT_EQ(EFAULT, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, @@ -451,8 +453,6 @@ TEST_F(homa_plumbing, homa_getsockopt__cant_copy_out_value) struct homa_rcvbuf_args val = {.start = 0, .length = 0}; int size = sizeof32(val) + 10; - EXPECT_EQ(0, -homa_pool_init(&self->hsk, (void *)0x40000, - 10*HOMA_BPAGE_SIZE + 1000)); mock_copy_to_user_errors = 2; EXPECT_EQ(EFAULT, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index f4649db8..d98219c4 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -68,7 +68,7 @@ static void change_owner_hook(char *id) .page_hint].owner = -1; } -TEST_F(homa_pool, homa_pool_set_bpages_needed) +TEST_F(homa_pool, set_bpages_needed) { struct homa_pool *pool = self->hsk.buffer_pool; @@ -82,50 +82,80 @@ TEST_F(homa_pool, homa_pool_set_bpages_needed) EXPECT_EQ(2, pool->bpages_needed); } -TEST_F(homa_pool, homa_pool_init__basics) +TEST_F(homa_pool, homa_pool_init) { - struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_pool *pool; + + /* Success */ + pool = homa_pool_new(&self->hsk); + EXPECT_FALSE(IS_ERR(pool)); + EXPECT_EQ(pool->hsk, &self->hsk); + homa_pool_destroy(pool); - EXPECT_EQ(100, pool->num_bpages); - EXPECT_EQ(-1, pool->descriptors[98].owner); + /* Can't allocate memory. 
*/ + mock_kmalloc_errors = 1; + pool = homa_pool_new(&self->hsk); + EXPECT_TRUE(IS_ERR(pool)); + EXPECT_EQ(ENOMEM, -PTR_ERR(pool)); } -TEST_F(homa_pool, homa_pool_init__region_not_page_aligned) + +TEST_F(homa_pool, homa_pool_set_region__basics) { - homa_pool_destroy(self->hsk.buffer_pool); - EXPECT_EQ(EINVAL, -homa_pool_init(&self->hsk, + struct homa_pool *pool = homa_pool_new(&self->hsk); + + EXPECT_EQ(0, -homa_pool_set_region(pool, (void *) 0x100000, + 78*HOMA_BPAGE_SIZE)); + EXPECT_EQ(78, pool->num_bpages); + EXPECT_EQ(-1, pool->descriptors[69].owner); + homa_pool_destroy(pool); +} +TEST_F(homa_pool, homa_pool_set_region__region_not_page_aligned) +{ + struct homa_pool *pool = homa_pool_new(&self->hsk); + + EXPECT_EQ(EINVAL, -homa_pool_set_region(pool, ((char *) 0x1000000) + 10, 100*HOMA_BPAGE_SIZE)); + homa_pool_destroy(pool); } -TEST_F(homa_pool, homa_pool_init__region_too_small) +TEST_F(homa_pool, homa_pool_set_region__region_too_small) { - homa_pool_destroy(self->hsk.buffer_pool); - EXPECT_EQ(EINVAL, -homa_pool_init(&self->hsk, (void *) 0x1000000, + struct homa_pool *pool = homa_pool_new(&self->hsk); + + EXPECT_EQ(EINVAL, -homa_pool_set_region(pool, (void *) 0x1000000, HOMA_BPAGE_SIZE)); + homa_pool_destroy(pool); } -TEST_F(homa_pool, homa_pool_init__cant_allocate_descriptors) +TEST_F(homa_pool, homa_pool_set_region__cant_allocate_descriptors) { + struct homa_pool *pool = homa_pool_new(&self->hsk); + mock_kmalloc_errors = 1; - homa_pool_destroy(self->hsk.buffer_pool); - EXPECT_EQ(ENOMEM, -homa_pool_init(&self->hsk, (void *) 0x100000, + EXPECT_EQ(ENOMEM, -homa_pool_set_region(pool, (void *) 0x100000, 100*HOMA_BPAGE_SIZE)); + homa_pool_destroy(pool); } -TEST_F(homa_pool, homa_pool_init__cant_allocate_core_info) +TEST_F(homa_pool, homa_pool_set_region__cant_allocate_core_info) { - homa_pool_destroy(self->hsk.buffer_pool); + struct homa_pool *pool = homa_pool_new(&self->hsk); + mock_kmalloc_errors = 2; - EXPECT_EQ(ENOMEM, -homa_pool_init(&self->hsk, (void *) 0x100000, + EXPECT_EQ(ENOMEM, -homa_pool_set_region(pool, (void *) 0x100000, 100*HOMA_BPAGE_SIZE)); + homa_pool_destroy(pool); } TEST_F(homa_pool, homa_pool_get_rcvbuf) { + struct homa_pool *pool = homa_pool_new(&self->hsk); struct homa_rcvbuf_args args; - EXPECT_EQ(0, -homa_pool_init(&self->hsk, (void *)0x40000, + EXPECT_EQ(0, -homa_pool_set_region(pool, (void *)0x40000, 10*HOMA_BPAGE_SIZE + 1000)); - homa_pool_get_rcvbuf(&self->hsk, &args); + homa_pool_get_rcvbuf(pool, &args); EXPECT_EQ(0x40000, args.start); EXPECT_EQ(10*HOMA_BPAGE_SIZE, args.length); + homa_pool_destroy(pool); } TEST_F(homa_pool, homa_pool_get_pages__basics) @@ -258,15 +288,17 @@ TEST_F(homa_pool, homa_pool_allocate__basics) EXPECT_EQ(150000 - 2*HOMA_BPAGE_SIZE, pool->cores[smp_processor_id()].allocated); } -TEST_F(homa_pool, homa_pool_no_buffer_pool) +TEST_F(homa_pool, homa_pool_allocate__no_buffer_pool) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, 4000, 98, 1000, 150000); - struct homa_pool *pool = self->hsk.buffer_pool; ASSERT_NE(NULL, crpc); - homa_pool_destroy(pool); + + homa_pool_destroy(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_new(&self->hsk); + EXPECT_EQ(ENOMEM, -homa_pool_allocate(crpc)); } TEST_F(homa_pool, homa_pool_allocate__cant_allocate_full_bpages) diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index b1e2390d..c3001525 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -249,6 +249,7 @@ TEST_F(homa_rpc, homa_rpc_new_server__no_buffer_pool) 
 	self->data.message_length = N(1400);
 	homa_pool_destroy(self->hsk.buffer_pool);
+	self->hsk.buffer_pool = homa_pool_new(&self->hsk);
 	srpc = homa_rpc_new_server(&self->hsk, self->client_ip,
 			&self->data, &created);
 	ASSERT_TRUE(IS_ERR(srpc));

From dbf6858465d197b43000653ad517c2cdcf7373cc Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Thu, 24 Apr 2025 11:43:08 -0700
Subject: [PATCH 258/625] Limit the number of tt entries logged by tt_printk

(Reduces the likelihood that log buffers will overflow)
---
 timetrace.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/timetrace.c b/timetrace.c
index d39f0500..a47a6737 100644
--- a/timetrace.c
+++ b/timetrace.c
@@ -694,6 +694,7 @@ void tt_printk(void)
 	 */
 	static int pos[NR_CPUS];
 	static atomic_t active;
+	int i;
 
 	if (atomic_xchg(&active, 1)) {
 		pr_err("concurrent call to %s aborting\n", __func__);
@@ -704,6 +705,16 @@ void tt_printk(void)
 	atomic_inc(&tt_freeze_count);
 	tt_find_oldest(pos);
 
+	/* Limit the number of entries logged per core (logging too many
+	 * seems to cause entries to be lost).
+	 */
+	for (i = 0; i < nr_cpu_ids; i++) {
+		struct tt_buffer *buffer = tt_buffers[i];
+
+		if (((buffer->next_index - pos[i]) & (TT_BUF_SIZE - 1)) > 200)
+			pos[i] = (buffer->next_index - 200) & (TT_BUF_SIZE - 1);
+	}
+
 	pr_err("cpu_khz: %u\n", cpu_khz);
 
 	/* Each iteration of this loop printk's one event. */

From a6d46dde653cf7774e00a205e0a188f527ce5234 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 28 Apr 2025 15:57:59 -0700
Subject: [PATCH 259/625] Analyze freeze/resend/busy packets in ttsync.py

Allows timetraces to be sync-ed even if there are no data or grant
packets.
---
 util/ttsync.py | 206 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 196 insertions(+), 10 deletions(-)

diff --git a/util/ttsync.py b/util/ttsync.py
index 5dedeb87..bb747049 100755
--- a/util/ttsync.py
+++ b/util/ttsync.py
@@ -10,7 +10,7 @@
 across the traces
 """

-from __future__ import division, print_function
+from collections import defaultdict
 from glob import glob
 from optparse import OptionParser
 import math
@@ -50,13 +50,32 @@
 # receiver, as passed to parse_tt.
 recv_pkts = {}
 
-
 # (rpc_id:offset) -> 1 for each retransmitted packet. rpc_id is the
 # id on the sender; used to ignore retransmits when syncing clocks
 # (the retransmit can accidentally be paired with the receipt of the
 # original packet).
 retransmits = {}
 
+# node_num -> rpc_id -> <times>. For each node number, contains
+# a dictionary mapping from RPC identifiers to a list of unadjusted times
+# when busy or resend packets were transmitted for rpc_id. Rpc_id is the id on
+# the sender.
+send_ctl = defaultdict(lambda: defaultdict(list))
+
+# rpc_id -> times. Times is a list of unadjusted times when resend or
+# busy packets were received for rpc_id (rpc_id is the id on the receiver).
+recv_ctl = defaultdict(list)
+
+# List of <time, sender, receiver> with one entry for each FREEZE packet
+# sent. Time is the unadjusted time on the sender when the packet was sent,
+# sender is the sender node index, and receiver is the receiver *address*.
+send_freeze = []
+
+# node_num -> <time, sender> for each FREEZE packet received. Time
+# is the unadjusted time on the receiver when the last freeze packet
+# was received by node_num. Sender is the sender *address*.
+recv_freeze = {}
+
 # This is an NxN array, where N is the number of nodes. min_delays[A][B]
 # gives the smallest delay seen from node A to node B, as measured with
 # their unadjusted clocks (one of these delays could be negative).
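As a worked example of how these minimum delays determine clock offsets
(the numbers are hypothetical): if the minimum unadjusted delay from node A
to node B is 30 us and the minimum from B to A is -20 us, their sum gives an
RTT of 10 us; assuming roughly symmetric paths, B's clock must read about
(30 - (-20))/2 = 25 us ahead of A's, so the offset to add to B's times is
-25 us, after which both minimum delays become about 5 us.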
@@ -65,7 +84,7 @@ # This is an NxN array, where N is the number of nodes. Each entry corresponds # to an entry in min_delays, and gives the time when the message producing # the minimum delay was received. -recv_times = [] +min_recv_times = [] # For each node, the offset to add to its clock value in order to synchronize # its clock with node 0. @@ -79,6 +98,15 @@ # skip receipts of retransmissions, which can mess up delay calculations. max_recv_offsets = {} +# rpc_id -> IP address of the node that sent or received packets with that ID. +id_addr = {} + +# rpc_id -> node index for the node that sent or received packets with that ID. +id_node_num = {} + +# IP address of node -> node's index in sync tables. +addr_node_num = {} + def parse_tt(tt, node_num): """ Reads a timetrace file and adds entries to send_pkts and recv_pkts. @@ -104,6 +132,65 @@ def parse_tt(tt, node_num): id = int(match.group(4)) pktid = '%d:%d' % (id, offset) retransmits[pktid] = 1 + continue + + match = re.match(' *([-0-9.]+) us .* us\) \[C([0-9]+)\] ' + 'sending BUSY from resend, id ([0-9]+),', line) + if match: + time = float(match.group(1)) + id = match.group(3) + send_ctl[node_num][id].append(time) + continue + + match = re.match(' *([-0-9.]+) us .* us\) \[C([0-9]+)\] ' + 'mlx sent homa packet to ([^ ]+) id ([0-9]+), ' + 'type (0x[0-9a-f]+)', line) + if match: + time = float(match.group(1)) + addr = match.group(3) + id = match.group(4) + type = match.group(5) + id_addr[peer_id(id)] = addr + id_node_num[id] = node_num + if type != '0x12' and type != '0x14': + continue + send_ctl[node_num][id].append(time) + continue + + match = re.match(' *([-0-9.]+) us .* us\) \[C([0-9]+)\] ' + 'homa_gro_receive got packet from ([^ ]+) id ([0-9]+), ' + 'type (0x[0-9a-f]+)', line) + if match: + time = float(match.group(1)) + addr = match.group(3) + id = match.group(4) + type = match.group(5) + id_addr[peer_id(id)] = addr + id_node_num[id] = node_num + if type == '0x16': + recv_freeze[node_num] = [time, addr] + continue + if type != '0x12' and type != '0x14': + continue + recv_ctl[id].append(time) + continue + + match = re.match(' *([-0-9.]+) us .* us\) \[C([0-9]+)\] ' + 'Sending freeze to (0x[0-9a-f]+)', line) + if match: + time = float(match.group(1)) + addr = match.group(3) + send_freeze.append([time, node_num, addr]) + continue + + # match = re.match(' *([-0-9.]+) us .* us\) \[C([0-9]+)\] ' + # 'Freezing because of request on port [0-9]+ ' + # 'from (0x[0-9a-f]+):', line) + # if match: + # time = float(match.group(1)) + # addr = match.group(3) + # recv_freeze[node_num] = [time, addr] + # continue continue time = float(match.group(1)) @@ -130,6 +217,7 @@ def parse_tt(tt, node_num): last_offset = offset + int(match2.group(1)) - 1 if (id in max_send_offsets) and (max_send_offsets[id] < last_offset): max_send_offsets[id] = last_offset + continue if "homa_gro_receive got packet" in line: if (id in max_recv_offsets) and (max_recv_offsets[id] >= offset): @@ -138,17 +226,30 @@ def parse_tt(tt, node_num): recv_pkts[pktid] = [time, node_num] max_recv_offsets[id] = offset recvd += 1 + continue if "sending grant for" in line: pktid = '%d:%dg' % (id, offset) if not pktid in send_pkts: send_pkts[pktid] = [time, node_num] sent += 1 + continue if "homa_gro_receive got grant from" in line: pktid = '%d:%dg' % (id^1, offset) recv_pkts[pktid] = [time, node_num] recvd += 1 + continue + + match = re.match(r' *([-0-9.]+) us .* us\) \[C([0-9]+)\] Sent RESEND ' + r'for client RPC id ([0-9]+), server ([^:]+):', line) + if False and match: + id = 
match.group(3)
+        addr = match.group(4)
+        id_addr[peer_id(id)] = addr
+        id_node_num[id] = node_num
+        send_ctl[node_num][id].append(time)
+        continue
 
     print("%s has %d packet sends, %d receives" % (tt, sent, recvd))
 
@@ -161,10 +262,10 @@ def find_min_delays(num_nodes):
     send_pkts and recv_pkts must be < num_nodes.
     """
 
-    global min_delays, recv_times, send_pkts, recv_pkts
+    global min_delays, min_recv_times, send_pkts, recv_pkts
 
     min_delays = [[1e20 for i in range(num_nodes)] for j in range(num_nodes)]
-    recv_times = [[0 for i in range(num_nodes)] for j in range(num_nodes)]
+    min_recv_times = [[0 for i in range(num_nodes)] for j in range(num_nodes)]
 
     # Iterate over all the client-side events and match them to server-side
     # events if possible.
@@ -176,7 +277,81 @@ def find_min_delays(num_nodes):
             delay = recv_time - send_time
             if delay < min_delays[send_node][recv_node]:
                 min_delays[send_node][recv_node] = delay
-                recv_times[send_node][recv_node] = recv_time
+                min_recv_times[send_node][recv_node] = recv_time
+
+def find_min_delays_alt(num_nodes):
+    """
+    This function provides an alternate way to compute minimum delays,
+    using resend and busy packets instead of data and grant packets. It's
+    useful in situations where the cluster has stalled so there aren't any
+    data/grant packets.
+    """
+    global send_ctl, recv_ctl, send_freeze, recv_freeze
+    global min_delays, min_recv_times, addr_node_num
+
+    # Resend and busy packets are problematic because they are not unique:
+    # there can be several identical packets between the same pair of nodes.
+    # Here's how this function matches up sends and receives:
+    # * Start from freeze packets, which are unique; use them to compute
+    #   an upper bound on delays in one direction.
+    # * Then scan packets flowing in the other direction: match sends and
+    #   receives to pick the pair that produces the smallest positive RTT
+    #   (when combined with freeze info in the other direction).
+    # * Then use this minimum in the other direction to match sends and
+    #   receives in the same direction as the freeze, to get a tighter bound
+    #   than the freeze could produce by itself.
+
+    for send_time, fsend_node, recv_addr in send_freeze:
+        # Compute freeze delay.
+        if not recv_addr in addr_node_num:
+            continue
+        frecv_node = addr_node_num[recv_addr]
+        recv_time = recv_freeze[frecv_node][0]
+        freeze_delay = recv_time - send_time
+        if freeze_delay < min_delays[fsend_node][frecv_node]:
+            # print("New min delay %.1f us from %d to %d (freeze) send %.1f recv %.1f" %
+            #         (freeze_delay, fsend_node, frecv_node, send_time, recv_time))
+            min_delays[fsend_node][frecv_node] = freeze_delay
+            min_recv_times[fsend_node][frecv_node] = recv_time
+
+        # Scan control packets in reverse direction from freeze.
+        min_delay = min_delays[frecv_node][fsend_node]
+        for id, send_times in send_ctl[frecv_node].items():
+            id2 = peer_id(id)
+            if not id2 in id_node_num or id_node_num[id2] != fsend_node:
+                continue
+            for send in send_times:
+                for recv in recv_ctl[id2]:
+                    delay = recv - send
+                    if freeze_delay + delay > 0 and delay < min_delay:
+                        # print("New min delay %.1f us (rtt %.1f) from %d "
+                        #         "to %d (reverse ctl) id %s send %.1f recv %.1f" % (delay,
+                        #         delay + freeze_delay, frecv_node, fsend_node,
+                        #         id, send, recv))
+                        min_delay = delay
+                        min_delays[frecv_node][fsend_node] = delay
+                        min_recv_times[frecv_node][fsend_node] = recv
+
+        # Scan control packets in same direction as freeze.
+        reverse_delay = min_delay
+        if reverse_delay == 1e20:
+            continue
+        min_delay = min_delays[fsend_node][frecv_node]
+        for id, send_times in send_ctl[fsend_node].items():
+            id2 = peer_id(id)
+            if not id2 in id_node_num or id_node_num[id2] != frecv_node:
+                continue
+            for send in send_times:
+                for recv in recv_ctl[id2]:
+                    delay = recv - send
+                    if reverse_delay + delay > 0 and delay < min_delay:
+                        # print("New min delay %.1f us (rtt %.1f) from %d "
+                        #         "to %d (forward ctl) id %s send %.1f recv %.1f" % (delay,
+                        #         delay + reverse_delay, fsend_node, frecv_node,
+                        #         id, send, recv))
+                        min_delay = delay
+                        min_delays[fsend_node][frecv_node] = delay
+                        min_recv_times[fsend_node][frecv_node] = recv
 
 def get_node_num(tt_file):
     """
@@ -188,13 +363,24 @@ def get_node_num(tt_file):
             return int(match.group(1))
     return tt_file
 
+def peer_id(id):
+    """
+    Given a (string) RPC identifier, return the identifier used for that RPC
+    on the peer node.
+    """
+
+    return str(int(id)^1)
+
 tt_files.sort(key = lambda name : get_node_num(name))
 node_names = [Path(tt_file).stem for tt_file in tt_files]
 num_nodes = len(tt_files)
 for i in range(num_nodes):
     parse_tt(tt_files[i],i)
+for id, addr in id_addr.items():
+    if id in id_node_num:
+        addr_node_num[addr] = id_node_num[id]
 find_min_delays(num_nodes)
-
+find_min_delays_alt(num_nodes)
 
 # List of offset info for all nodes; index = node id; elements are
 # dictionaries with the following entries:
@@ -236,8 +422,8 @@ def get_node_num(tt_file):
             if rtt < 0:
                 print('Negative RTT %.1f between %s (recv %.3f) and '
                         '%s (recv %.3f),' % (rtt, node_names[ref],
-                        recv_times[node][ref], node_names[node],
-                        recv_times[ref][node]))
+                        min_recv_times[node][ref], node_names[node],
+                        min_recv_times[ref][node]))
             if (rtt < best_rtt) and (rtt > 0):
                 best_node = node
                 best_ref = ref
@@ -297,7 +483,7 @@ def get_node_num(tt_file):
             %(node_names[src], src_offset, node_names[dst], dst_offset))
     print('    minimum delay %.1f becomes %.1f, received at %9.3f'
             % (min_delays[src][dst], new_min,
-            recv_times[src][dst] + dst_offset))
+            min_recv_times[src][dst] + dst_offset))
 
 # Rewrite traces with synchronized times
 if not options.no_rewrite:

From d5c0b981a025e34eb409a45556a604cf3a10a874 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 18 Apr 2025 14:03:55 -0700
Subject: [PATCH 260/625] Major refactoring of grant management

* Introduce new struct homa_grant to pull grant-related fields out of
  struct homa.

* Pull other grant-related stuff into homa_grant.c, such as sysctl
  values that are related to grants.

* Redo locking structure to simplify, make things more obvious.

* Eliminate homa_grant_recalc and grant_pick_rpcs, replace with a more
  incremental approach to managing priority order (e.g.
  homa_grant_manage_rpc and homa_grant_unmanage_rpc).

* Several metrics changed.
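As a rough sketch of the new object's lifecycle (a hypothetical caller;
the homa->grant field and the ERR_PTR convention are taken from the code
below, but this is not verbatim from the patch):

	homa->grant = homa_grant_new(net);
	if (IS_ERR(homa->grant)) {
		err = PTR_ERR(homa->grant);
		homa->grant = NULL;
		return err;
	}
	/* ... grant management runs while the transport is active ... */
	homa_grant_destroy(homa->grant);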
--- homa_grant.c | 1196 +++++++++++++++---------- homa_grant.h | 310 ++++++- homa_impl.h | 168 +--- homa_incoming.c | 28 +- homa_metrics.c | 18 +- homa_metrics.h | 28 +- homa_outgoing.c | 5 +- homa_peer.h | 20 +- homa_plumbing.c | 52 +- homa_pool.c | 6 +- homa_rpc.c | 21 +- homa_rpc.h | 63 +- homa_sock.c | 4 + homa_timer.c | 12 +- homa_utils.c | 20 +- man/homa.7 | 11 + notes.txt | 72 +- test/mock.c | 63 +- test/mock.h | 2 + test/unit_homa_grant.c | 1791 ++++++++++++++++++++----------------- test/unit_homa_incoming.c | 94 +- test/unit_homa_offload.c | 3 +- test/unit_homa_outgoing.c | 25 +- test/unit_homa_pacer.c | 5 +- test/unit_homa_pool.c | 6 +- test/unit_homa_rpc.c | 7 +- test/unit_homa_timer.c | 23 +- test/unit_homa_utils.c | 4 +- test/utils.c | 34 +- util/metrics.py | 4 - util/tthoma.py | 10 +- 31 files changed, 2238 insertions(+), 1867 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 0c450a86..86c00198 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -10,26 +10,207 @@ #include "homa_rpc.h" #include "homa_wire.h" -/* Design Notes: - * This file is pretty complicated because of locking issues. Recalculating - * the priorities for granting requires @homa->grant_lock, which is - * global. Priorities can potentially change every time a data packet - * arrives, but acquiring the global lock for each data packet would result - * in unacceptable contention (this was tried in earlier versions). The - * approach used here is to separate per-data-packet opereations - * (homa_grant_check_rpc) from the full priority recalculation - * (homa_grant_recalc). Hopefully most calls to homa_grant_check_rpc can be - * handled without calling homa_grant_recalc. It has been challenging to - * implement this safely and there are a few races; as one example, information - * may change while homa_grant_get_offset is using it. However, I believe - * that this race is "safe" (the worst that will happen is sending out a grant - * with an incorrect offset, which has only minor performance consequences). - * - * Another overall requirement for the file is not to hold locks (either - * RPC locks or @homa->grant_lock) when actually sending grants. This - * is because packet transmission takes a long time, so holding a lock - * could result in unacceptable contention. +#ifndef __STRIP__ /* See strip.py */ +/* Used to enable sysctl access to grant-specific configuration parameters. The + * @data fields are actually offsets within a struct homa_grant; these are + * converted to pointers into a net-specific struct grant later. 
+ */ +#define OFFSET(field) ((void *) offsetof(struct homa_grant, field)) +static struct ctl_table grant_ctl_table[] = { + { + .procname = "fifo_grant_increment", + .data = OFFSET(fifo_grant_increment), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_grant_dointvec + }, + { + .procname = "grant_fifo_fraction", + .data = OFFSET(fifo_fraction), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_grant_dointvec + }, + { + .procname = "grant_recalc_usecs", + .data = OFFSET(recalc_usecs), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_grant_dointvec + }, + { + .procname = "max_grantable_rpcs", + .data = OFFSET(max_grantable_rpcs), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_grant_dointvec + }, + { + .procname = "max_incoming", + .data = OFFSET(max_incoming), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_grant_dointvec + }, + { + .procname = "max_overcommit", + .data = OFFSET(max_overcommit), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_grant_dointvec + }, + { + .procname = "max_rpcs_per_peer", + .data = OFFSET(max_rpcs_per_peer), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_grant_dointvec + }, + { + .procname = "window", + .data = OFFSET(window_param), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_grant_dointvec + }, +}; +#endif /* See strip.py */ + +/** + * homa_grant_new() - Allocate and initialize a new grant object, which + * will hold grant management information for @homa. + * @net: Network namespace that @homa is associated with. + * Return: A pointer to the new struct grant, or a negative errno. + */ +struct homa_grant *homa_grant_new(struct net *net) +{ + struct homa_grant *grant; + int err; + + grant = kmalloc(sizeof(*grant), GFP_KERNEL | __GFP_ZERO); + if (!grant) + return ERR_PTR(-ENOMEM); + grant->max_incoming = 400000; + spin_lock_init(&grant->lock); + INIT_LIST_HEAD(&grant->grantable_peers); + grant->window_param = 10000; + grant->max_rpcs_per_peer = 1; + grant->max_overcommit = 8; + grant->recalc_usecs = 20; + grant->fifo_grant_increment = 10000; + grant->fifo_fraction = 50; + +#ifndef __STRIP__ /* See strip.py */ + grant->sysctl_header = register_net_sysctl(net, "net/homa", + grant_ctl_table); + if (!grant->sysctl_header) { + err = -ENOMEM; + pr_err("couldn't register sysctl parameters for Homa grants\n"); + goto error; + } +#endif /* See strip.py */ + homa_grant_update_sysctl_deps(grant); + grant->next_recalc = sched_clock() + grant->recalc_ns; + return grant; + +error: + homa_grant_destroy(grant); + return ERR_PTR(err); +} + +/** + * homa_grant_destroy() - Cleanup and destroy the grant object for a Homa + * transport. + * @grant: Object to destroy; caller must not reference the object + * again once this function returns. + */ +void homa_grant_destroy(struct homa_grant *grant) +{ +#ifndef __STRIP__ /* See strip.py */ + if (grant->sysctl_header) { + unregister_net_sysctl_table(grant->sysctl_header); + grant->sysctl_header = NULL; + } +#endif /* See strip.py */ + kfree(grant); +} + +/** + * homa_grant_init_rpc() - Initialize grant-related information for an + * RPC's incoming message (may add the RPC to grant priority queues). + * @rpc: RPC being initialized. Grant-related fields in msgin + * are assumed to be zero. + * @unsched: Number of unscheduled bytes in the incoming message for @rpc. 
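+ *           If @unsched covers the entire message then no grants will
+ *           ever be needed, so the RPC is not added to the queues.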
*/ +void homa_grant_init_rpc(struct homa_rpc *rpc, int unsched) +{ + rpc->msgin.rank = -1; + if (rpc->msgin.num_bpages == 0) + /* Can't issue grants until buffer space becomes available. */ + return; + if (unsched >= rpc->msgin.length) { + rpc->msgin.granted = rpc->msgin.length; + rpc->msgin.prev_grant = rpc->msgin.granted; + return; + } + rpc->msgin.granted = rpc->msgin.prev_grant = unsched; + homa_grant_manage_rpc(rpc); +} + +/** + * homa_grant_end_rpc() - This function is invoked when homa_rpc_end is + * invoked; it cleans up any state related to grants for that RPC's + * incoming message. + * @rpc: The RPC to clean up. Must be locked by the caller. This function + * may release and then reacquire the lock. + */ +void homa_grant_end_rpc(struct homa_rpc *rpc) +{ + struct homa_grant *grant = rpc->hsk->homa->grant; + struct homa_grant_candidates cand; + + if (rpc->msgin.rank >= 0 || !list_empty(&rpc->grantable_links)) { + homa_grant_cand_init(&cand); + homa_grant_unmanage_rpc(rpc, &cand); + if (!homa_grant_cand_empty(&cand)) { + homa_rpc_hold(rpc); + homa_rpc_unlock(rpc); + homa_grant_cand_check(&cand, grant); + homa_rpc_lock(rpc); + homa_rpc_put(rpc); + } + } + + if (rpc->msgin.rec_incoming != 0) { + atomic_sub(rpc->msgin.rec_incoming, &grant->total_incoming); + rpc->msgin.rec_incoming = 0; + } +} + +/** + * homa_grant_window() - Return the window size (maximum number of granted + * but not received bytes for a message) given current conditions. + * @grant: Overall information for grant management. + * Return: See above. + */ +int homa_grant_window(struct homa_grant *grant) +{ + u64 window; + + window = grant->window_param; + if (window == 0) { + /* Dynamic window sizing uses the approach described in the + * paper "Dynamic Queue Length Thresholds for Shared-Memory + * Packet Switches" with an alpha value of 1. The idea is to + * maintain unused incoming capacity (for new RPC arrivals) + * equal to the amount of incoming allocated to each of the + * current RPCs. + */ + window = grant->max_incoming; + do_div(window, grant->num_active_rpcs + 1); + } + return window; +} /** * homa_grant_outranks() - Returns nonzero if rpc1 should be considered @@ -41,219 +222,374 @@ */ int homa_grant_outranks(struct homa_rpc *rpc1, struct homa_rpc *rpc2) { - /* Fewest bytes remaining is the primary criterion; if those are + /* Fewest ungranted bytes is the primary criterion; if those are * equal, then favor the older RPC. */ - return (rpc1->msgin.bytes_remaining < rpc2->msgin.bytes_remaining) || - ((rpc1->msgin.bytes_remaining == - rpc2->msgin.bytes_remaining) && - (rpc1->msgin.birth < rpc2->msgin.birth)); + int grant_diff; + + grant_diff = (rpc1->msgin.length - rpc1->msgin.granted) - + (rpc2->msgin.length - rpc2->msgin.granted); + return grant_diff < 0 || ((grant_diff == 0) && + (rpc1->msgin.birth < rpc2->msgin.birth)); } /** - * homa_grant_update_incoming() - Figure out how much incoming data there is - * for an RPC (i.e., data that has been granted but not yet received) and make - * sure this is properly reflected in rpc->msgin.incoming - * and homa->total_incoming. - * @rpc: RPC to check; must be locked. - * @homa: Overall information about the Homa transport. + * homa_grant_priority() - Return the appropriate priority to use in a + * grant for an incoming message. + * @homa: Overall information about the Homa transport. + * @rank: Position of the message's RPC in active_rpcs (lower means + * higher priority). + * Return: See above. 
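+ *
+ * For example, with max_sched_prio 7 and two active RPCs, ranks 0 and
+ * 1 map to priorities 1 and 0: the lowest levels are used first,
+ * leaving the higher levels free for new high-priority messages.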
*/ -void homa_grant_update_incoming(struct homa_rpc *rpc, struct homa *homa) - __must_hold(&rpc->bucket->lock) +int homa_grant_priority(struct homa *homa, int rank) { - int incoming, delta; + int max_sched_prio, extra_levels, priority; - incoming = rpc->msgin.granted - (rpc->msgin.length - - rpc->msgin.bytes_remaining); - if (incoming < 0) - incoming = 0; - delta = incoming - rpc->msgin.rec_incoming; - if (delta != 0) - atomic_add(delta, &homa->total_incoming); - rpc->msgin.rec_incoming = incoming; + /* If there aren't enough active RPCs to consume all of the priority + * levels, use only the lower levels; this allows faster preemption + * if a new high-priority message appears. + */ + max_sched_prio = homa->max_sched_prio; + priority = max_sched_prio - rank; + extra_levels = max_sched_prio + 1 - homa->grant->num_active_rpcs; + if (extra_levels >= 0) + priority -= extra_levels; + return (priority < 0) ? 0 : priority; } /** - * homa_grant_add_rpc() - Make sure that an RPC is present in the grantable - * list for its peer and in the appropriate position, and that the peer is - * present in the overall grantable list for Homa and in the correct - * position. - * @rpc: The RPC to add/reposition. Must be locked by caller. + * homa_grant_insert_active() - Try to insert an RPC in homa->active_rpcs. + * @rpc: RPC to insert (if possible). + * Return: NULL if there was room to insert @rpc without ejecting any other + * RPC. Otherwise, returns an RPC that must be added to + * homa->grantable_peers (could be either @rpc or some other RPC + * that @rpc displaced). */ -void homa_grant_add_rpc(struct homa_rpc *rpc) - __must_hold(&rpc->bucket->lock) +struct homa_rpc *homa_grant_insert_active(struct homa_rpc *rpc) + __must_hold(&rpc->hsk->homa->grant->lock) { - struct homa *homa = rpc->hsk->homa; - struct homa_peer *peer = rpc->peer; - struct homa_peer *peer_cand; - struct homa_rpc *candidate; - - homa_grant_lock(homa, 0); - - /* Make sure this message is in the right place in the grantable_rpcs - * list for its peer. + struct homa_grant *grant = rpc->hsk->homa->grant; + struct homa_rpc *other, *result; + int insert_after; + int last_to_copy; + int peer_index; + int i; + + /* Scan active_rpcs backwards to find the lowest-priority message + * with higher priority than @rpc. Also find the lowest-priority + * message with the same peer as @rpc, if one appears. */ - if (list_empty(&rpc->grantable_links)) { - /* Message not yet tracked; add it in priority order to - * the peer's list. 
- */ - u64 time = sched_clock(); - - INC_METRIC(grantable_rpcs_integral, homa->num_grantable_rpcs - * (time - homa->last_grantable_change)); - homa->last_grantable_change = time; - homa->num_grantable_rpcs++; - tt_record2("Incremented num_grantable_rpcs to %d, id %d", - homa->num_grantable_rpcs, rpc->id); - if (homa->num_grantable_rpcs > homa->max_grantable_rpcs) - homa->max_grantable_rpcs = homa->num_grantable_rpcs; - rpc->msgin.birth = time; - list_for_each_entry(candidate, &peer->grantable_rpcs, - grantable_links) { - if (homa_grant_outranks(rpc, candidate)) { - list_add_tail(&rpc->grantable_links, - &candidate->grantable_links); - goto position_peer; - } + insert_after = -1; + peer_index = -1; + for (i = grant->num_active_rpcs - 1; i >= 0; i--) { + other = grant->active_rpcs[i]; + if (!homa_grant_outranks(rpc, other)) { + insert_after = i; + break; } - list_add_tail(&rpc->grantable_links, &peer->grantable_rpcs); + if (peer_index < 0 && other->peer == rpc->peer) + peer_index = i; + } + + if (rpc->peer->active_rpcs >= grant->max_rpcs_per_peer) { + if (peer_index <= i) + /* All the other RPCs with the same peer are higher + * priority than @rpc and we can't have any more RPCs + * with the same peer, so bump @rpc. + */ + return rpc; + + /* Bump the lowest priority RPC from the same peer to make room + * for the new RPC. @rpc will be in a slot with lower index + * (higher priority) than the bumped one. + */ + result = grant->active_rpcs[peer_index]; + result->msgin.rank = -1; + result->peer->active_rpcs--; + last_to_copy = peer_index - 1; } else { - while (rpc != list_first_entry(&peer->grantable_rpcs, - struct homa_rpc, - grantable_links)) { - /* Message is on the list, but its priority may have - * increased because of the recent packet arrival. If - * so, adjust its position in the list. + if (insert_after >= grant->max_overcommit - 1) + /* active_rpcs is full and @rpc is too low priority; + * bump it. */ - candidate = list_prev_entry(rpc, grantable_links); - if (!homa_grant_outranks(rpc, candidate)) - goto position_peer; - __list_del_entry(&candidate->grantable_links); - list_add(&candidate->grantable_links, &rpc->grantable_links); + return rpc; + + if (grant->num_active_rpcs >= grant->max_overcommit) { + result = grant->active_rpcs[grant->num_active_rpcs - 1]; + result->msgin.rank = -1; + result->peer->active_rpcs--; + last_to_copy = grant->num_active_rpcs - 2; + } else { + result = NULL; + last_to_copy = grant->num_active_rpcs - 1; + grant->num_active_rpcs++; + } + } + + /* Move existing RPCs in active_rpcs down to make room for @rpc. */ + for (i = last_to_copy; i > insert_after; i--) { + other = grant->active_rpcs[i]; + other->msgin.rank = i + 1; + grant->active_rpcs[i + 1] = other; + } + grant->active_rpcs[insert_after + 1] = rpc; + rpc->msgin.rank = insert_after + 1; + rpc->peer->active_rpcs++; + + return result; +} + +/** + * homa_grant_insert_grantable() - Insert an RPC into the grantable list + * for its peer. + * @rpc: The RPC to add. Must not currently be in either active_rpcs + * or grantable_peers. + */ +void homa_grant_insert_grantable(struct homa_rpc *rpc) + __must_hold(&rpc->hsk->homa->grant->lock) +{ + struct homa_grant *grant = rpc->hsk->homa->grant; + struct homa_peer *peer = rpc->peer; + struct homa_peer *other_peer; + struct homa_rpc *other; + + /* Insert @rpc in the right place in the grantable_rpcs list for + * its peer. 
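+	 * The list is kept sorted by homa_grant_outranks, so a linear scan
+	 * from the head finds the insertion point.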
+ */ + list_for_each_entry(other, &peer->grantable_rpcs, grantable_links) { + if (homa_grant_outranks(rpc, other)) { + list_add_tail(&rpc->grantable_links, + &other->grantable_links); + goto position_peer; } } + list_add_tail(&rpc->grantable_links, &peer->grantable_rpcs); position_peer: /* At this point rpc is positioned correctly on the list for its peer. - * However, the peer may need to be added to, or moved upward on, - * homa->grantable_peers. + * However, the peer may need to be added to, or moved upward in, + * grantable_peers. */ if (list_empty(&peer->grantable_links)) { - /* Must add peer to the overall Homa list. */ - list_for_each_entry(peer_cand, &homa->grantable_peers, + /* Must add peer to grantable_peers. */ + list_for_each_entry(other_peer, &grant->grantable_peers, grantable_links) { - candidate = list_first_entry(&peer_cand->grantable_rpcs, - struct homa_rpc, - grantable_links); - if (homa_grant_outranks(rpc, candidate)) { + other = list_first_entry(&other_peer->grantable_rpcs, + struct homa_rpc, + grantable_links); + if (homa_grant_outranks(rpc, other)) { list_add_tail(&peer->grantable_links, - &peer_cand->grantable_links); - goto done; + &other_peer->grantable_links); + return; } } - list_add_tail(&peer->grantable_links, &homa->grantable_peers); - goto done; + list_add_tail(&peer->grantable_links, &grant->grantable_peers); + return; } - /* The peer is on Homa's list, but it may need to move upward. */ - while (peer != list_first_entry(&homa->grantable_peers, + /* The peer is on grantable_peers, but it may need to move upward. */ + while (peer != list_first_entry(&grant->grantable_peers, struct homa_peer, grantable_links)) { struct homa_peer *prev_peer = list_prev_entry(peer, grantable_links); - candidate = list_first_entry(&prev_peer->grantable_rpcs, - struct homa_rpc, grantable_links); - if (!homa_grant_outranks(rpc, candidate)) - goto done; + other = list_first_entry(&prev_peer->grantable_rpcs, + struct homa_rpc, grantable_links); + if (!homa_grant_outranks(rpc, other)) + break; __list_del_entry(&prev_peer->grantable_links); list_add(&prev_peer->grantable_links, &peer->grantable_links); } -done: - homa_grant_unlock(homa); } /** - * homa_grant_remove_rpc() - Unlink an RPC from the grantable lists, so it will - * no longer be considered for grants. - * @rpc: RPC to remove from grantable lists. Must currently be in - * a grantable list. Must be locked by caller. + * homa_grant_manage_rpc() - Insert an RPC into the priority-based data + * structures for managing grantable RPCs (active_rpcs or grantable_peers). + * Ensures that the RPC will be sent grants as needed. + * @rpc: The RPC to add. Must be locked by caller. */ -void homa_grant_remove_rpc(struct homa_rpc *rpc) +void homa_grant_manage_rpc(struct homa_rpc *rpc) __must_hold(&rpc->bucket->lock) { - struct homa *homa = rpc->hsk->homa; - struct homa_peer *peer = rpc->peer; - struct homa_rpc *candidate; + struct homa_grant *grant = rpc->hsk->homa->grant; + struct homa_rpc *bumped; u64 time = sched_clock(); - struct homa_rpc *head; - if (list_empty(&rpc->grantable_links)) - return; + BUG_ON(rpc->msgin.rank >= 0 || !list_empty(&rpc->grantable_links)); - homa_grant_lock(homa, 0); + homa_grant_lock(grant); - /* Must check list again: might have been removed by someone - * else before we got the lock. 
- */
-	if (list_empty(&rpc->grantable_links)) {
-		homa_grant_unlock(homa);
-		return;
-	}
+	INC_METRIC(grantable_rpcs_integral, grant->num_grantable_rpcs
+			* (time - grant->last_grantable_change));
+	grant->last_grantable_change = time;
+	grant->num_grantable_rpcs++;
+	tt_record2("Incremented num_grantable_rpcs to %d, id %d",
+		   grant->num_grantable_rpcs, rpc->id);
+	if (grant->num_grantable_rpcs > grant->max_grantable_rpcs)
+		grant->max_grantable_rpcs = grant->num_grantable_rpcs;
+	rpc->msgin.birth = time;
 
-	if (homa->oldest_rpc == rpc)
-		homa->oldest_rpc = NULL;
+	bumped = homa_grant_insert_active(rpc);
+	if (bumped)
+		homa_grant_insert_grantable(bumped);
+	grant->window = homa_grant_window(grant);
+
+	homa_grant_unlock(grant);
+}
+
+/**
+ * homa_grant_remove_grantable() - Unlink an RPC from the grantable lists,
+ * so it will no longer be considered for grants.
+ * @rpc:    RPC to remove from grantable lists. Must currently be in
+ *          a grantable list.
+ */
+void homa_grant_remove_grantable(struct homa_rpc *rpc)
+	__must_hold(&rpc->hsk->homa->grant->lock)
+{
+	struct homa_grant *grant = rpc->hsk->homa->grant;
+	struct homa_peer *peer = rpc->peer;
+	struct homa_rpc *other;
+	struct homa_rpc *head;
 
 	head = list_first_entry(&peer->grantable_rpcs,
 				struct homa_rpc, grantable_links);
 	list_del_init(&rpc->grantable_links);
-	INC_METRIC(grantable_rpcs_integral, homa->num_grantable_rpcs
-			* (time - homa->last_grantable_change));
-	homa->last_grantable_change = time;
-	homa->num_grantable_rpcs--;
-	tt_record2("Decremented num_grantable_rpcs to %d, id %d",
-		   homa->num_grantable_rpcs, rpc->id);
 	if (rpc != head)
-		goto done;
+		return;
 
 	/* The removed RPC was at the front of the peer's list. This means
-	 * we may have to adjust the position of the peer in Homa's list,
+	 * we may have to adjust the position of the peer in the peer list,
 	 * or perhaps remove it.
 	 */
 	if (list_empty(&peer->grantable_rpcs)) {
 		list_del_init(&peer->grantable_links);
-		goto done;
+		return;
 	}
 
-	/* The peer may have to move down in Homa's list (removal of
-	 * an RPC can't cause the peer to move up).
+	/* The peer may have to move down in Homa's list (its highest priority
+	 * may now be lower).
 	 */
-	head = list_first_entry(&peer->grantable_rpcs,
-				struct homa_rpc, grantable_links);
-	while (peer != list_last_entry(&homa->grantable_peers, struct homa_peer,
-				       grantable_links)) {
+	head = list_first_entry(&peer->grantable_rpcs,
+				struct homa_rpc, grantable_links);
+	while (peer != list_last_entry(&grant->grantable_peers,
+				       struct homa_peer, grantable_links)) {
 		struct homa_peer *next_peer = list_next_entry(peer,
 							      grantable_links);
-		candidate = list_first_entry(&next_peer->grantable_rpcs,
-					     struct homa_rpc, grantable_links);
-		if (!homa_grant_outranks(rpc, candidate))
+		other = list_first_entry(&next_peer->grantable_rpcs,
+					 struct homa_rpc, grantable_links);
+		if (!homa_grant_outranks(other, head))
 			break;
 		__list_del_entry(&peer->grantable_links);
 		list_add(&peer->grantable_links, &next_peer->grantable_links);
 	}
+}
+
+/**
+ * homa_grant_remove_active() - Remove an RPC from active_rpcs and promote
+ * an RPC from grantable_peers if possible.
+ * @rpc:    RPC that no longer needs grants. Must have rank >= 0.
+ * @cand:   If an RPC is promoted into active_rpcs it is added here. 
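+ *          The caller is expected to pass @cand to homa_grant_cand_check
+ *          after releasing its locks.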
+ */ +void homa_grant_remove_active(struct homa_rpc *rpc, + struct homa_grant_candidates *cand) + __must_hold(&rpc->hsk->homa->grant->lock) +{ + struct homa_grant *grant = rpc->hsk->homa->grant; + struct homa_peer *peer; + struct homa_rpc *other; + int i; + + for (i = rpc->msgin.rank + 1; i < grant->num_active_rpcs; i++) { + other = grant->active_rpcs[i]; + other->msgin.rank = i - 1; + grant->active_rpcs[i - 1] = other; + } + rpc->msgin.rank = -1; + rpc->peer->active_rpcs--; + grant->num_active_rpcs--; + + /* Pull the highest-priority entry (if there is one) from + * grantable_peers into active_rpcs. + */ + list_for_each_entry(peer, &grant->grantable_peers, grantable_links) { + if (peer->active_rpcs >= grant->max_rpcs_per_peer) + continue; + other = list_first_entry(&peer->grantable_rpcs, + struct homa_rpc, + grantable_links); + homa_grant_remove_grantable(other); + peer->active_rpcs++; + grant->active_rpcs[grant->num_active_rpcs] = other; + other->msgin.rank = grant->num_active_rpcs; + grant->num_active_rpcs++; + homa_grant_cand_add(cand, other); + break; + } +} + +/** + * homa_grant_unmanage_rpc() - Make sure that an RPC is no longer present + * in the priority structures used to manage grants (active_rpcs and + * grantable_rpcs). The RPC will no longer receive grants. + * @rpc: RPC to unlink. + * @cand: If an RPC is promoted into active_rpcs, it is added here. + */ +void homa_grant_unmanage_rpc(struct homa_rpc *rpc, + struct homa_grant_candidates *cand) + __must_hold(&rpc->bucket->lock) +{ + struct homa_grant *grant = rpc->hsk->homa->grant; + u64 time = sched_clock(); + + homa_grant_lock(grant); + + INC_METRIC(grantable_rpcs_integral, grant->num_grantable_rpcs + * (time - grant->last_grantable_change)); + grant->last_grantable_change = time; + grant->num_grantable_rpcs--; + tt_record2("Decremented num_grantable_rpcs to %d, id %d", + grant->num_grantable_rpcs, rpc->id); + + if (rpc->msgin.rank >= 0) + homa_grant_remove_active(rpc, cand); + if (!list_empty(&rpc->grantable_links)) + homa_grant_remove_grantable(rpc); + grant->window = homa_grant_window(grant); + + homa_grant_unlock(grant); +} + +/** + * homa_grant_update_incoming() - Figure out how much incoming data there is + * for an RPC (i.e., data that has been granted but not yet received) and make + * sure this is properly reflected in rpc->msgin.incoming + * and homa->total_incoming. + * @rpc: RPC to check; must be locked. + * @grant: Grant information for a Homa transport. + */ +void homa_grant_update_incoming(struct homa_rpc *rpc, struct homa_grant *grant) + __must_hold(&rpc->bucket->lock) +{ + int incoming, delta; -done: - homa_grant_unlock(homa); + incoming = rpc->msgin.granted - (rpc->msgin.length - + rpc->msgin.bytes_remaining); + if (incoming < 0) + incoming = 0; + delta = incoming - rpc->msgin.rec_incoming; + if (delta != 0) + atomic_add(delta, &grant->total_incoming); + rpc->msgin.rec_incoming = incoming; } /** - * homa_grant_get_offset() - Compute a new grant offset for an RPC. This - * function may race with other functions modifying RPC state; see Design Notes - * at the start of this file. - * @rpc: RPC whose grant offset is desired. Need not be locked. - * @homa: Overall information about the Homa transport. This function - * may set @homa->incoming_hit_limit. - * Return: New grant offset for this RPC (rpc->msgin.granted is not updated) - * May be zero or negative if no additional grants should be sent. 
+ * homa_grant_update_granted() - Compute a new grant offset for an RPC based
+ * on the state of that RPC as well as the overall grant state.
+ * @rpc:     RPC whose msgin.granted should be updated. Need not be locked.
+ * @grant:   Information for managing grants. This function may set
+ *           incoming_hit_limit.
+ * Return:   True means the offset was increased and a grant should be sent
+ *           for the RPC. False means no grant should be sent.
  */
-int homa_grant_get_offset(struct homa_rpc *rpc, struct homa *homa)
+bool homa_grant_update_granted(struct homa_rpc *rpc, struct homa_grant *grant)
 {
 	int received, new_grant_offset, incoming_delta, avl_incoming;
 
@@ -262,22 +598,25 @@ int homa_grant_get_offset(struct homa_rpc *rpc, struct homa *homa)
 	 * node.
 	 */
 	if (rpc->silent_ticks > 1)
-		return 0;
+		return false;
 
 	received = rpc->msgin.length - rpc->msgin.bytes_remaining;
-	new_grant_offset = received + homa->grant_window;
+	new_grant_offset = received + grant->window;
 	if (new_grant_offset > rpc->msgin.length)
 		new_grant_offset = rpc->msgin.length;
 	incoming_delta = new_grant_offset - received - rpc->msgin.rec_incoming;
-	avl_incoming = homa->max_incoming - atomic_read(&homa->total_incoming);
+	avl_incoming = grant->max_incoming - atomic_read(&grant->total_incoming);
 	if (avl_incoming < incoming_delta) {
-		atomic_set(&homa->incoming_hit_limit, 1);
+		atomic_set(&grant->incoming_hit_limit, 1);
 		tt_record3("insufficient headroom for grant: needed %d, available %d, used %d",
 			   incoming_delta, avl_incoming,
-			   atomic_read(&homa->total_incoming));
+			   atomic_read(&grant->total_incoming));
 		new_grant_offset -= incoming_delta - avl_incoming;
 	}
-	return new_grant_offset;
+	if (new_grant_offset <= rpc->msgin.granted)
+		return false;
+	rpc->msgin.granted = new_grant_offset;
+	return true;
 }
 
 /**
@@ -293,318 +632,145 @@ void homa_grant_send(struct homa_rpc *rpc)
 	struct homa_grant_hdr grant;
 
 	grant.offset = htonl(rpc->msgin.granted);
-	grant.priority = rpc->msgin.priority;
+	grant.priority = homa_grant_priority(rpc->hsk->homa, rpc->msgin.rank);
 	grant.resend_all = rpc->msgin.resend_all;
 	if (grant.resend_all)
 		rpc->msgin.resend_all = 0;
+	tt_record4("sending grant for id %d, offset %d, priority %d, increment %d",
+		   rpc->id, rpc->msgin.granted, grant.priority,
+		   rpc->msgin.granted - rpc->msgin.prev_grant);
+	rpc->msgin.prev_grant = rpc->msgin.granted;
 	homa_xmit_control(GRANT, &grant, sizeof(grant), rpc);
 }
 
 /**
- * homa_grant_check_rpc() - This function is invoked when the state of an
- * RPC has changed (such as packets arriving). It checks the state of the
- * RPC relative to outgoing grants and takes any appropriate actions that
- * are needed (such as adding the RPC to the grantable list or sending
- * grants for this or other RPCs).
+ * homa_grant_check_rpc() - This function is responsible for generating
+ * grant packets. It is invoked whenever a data packet arrives for an RPC;
+ * it checks the state of that RPC (as well as other RPCs) and generates
+ * grant packets as appropriate.
  * @rpc:    RPC to check. Must be locked by the caller.
  */
 void homa_grant_check_rpc(struct homa_rpc *rpc)
 	__must_hold(&rpc->bucket->lock)
 {
-	/* See Design Notes at the start of the file. 
*/ - struct homa *homa = rpc->hsk->homa; - int new_offset, rank; - - if (rpc->msgin.length < 0 || rpc->state == RPC_DEAD || - rpc->msgin.num_bpages <= 0) + struct homa_grant *grant = rpc->hsk->homa->grant; + struct homa_grant_candidates cand; + bool send_grant, limit, recalc; + u64 now; + int i; + + if (rpc->msgin.length < 0 || rpc->msgin.num_bpages <= 0 || + rpc->msgin.rank < 0) return; tt_record4("homa_grant_check_rpc starting for id %d, granted %d, recv_end %d, length %d", rpc->id, rpc->msgin.granted, rpc->msgin.recv_end, rpc->msgin.length); INC_METRIC(grant_check_calls, 1); - homa_grant_update_incoming(rpc, homa); - if (rpc->msgin.granted >= rpc->msgin.length) { - if (atomic_read(&homa->incoming_hit_limit) != 0 && - atomic_read(&homa->total_incoming) < homa->max_incoming) { - goto recalc; - } - goto done; - } - - /* This message requires grants; if it is a new message, set up - * granting. - */ - if (list_empty(&rpc->grantable_links)) { - homa_grant_add_rpc(rpc); - if (homa->num_active_rpcs < homa->max_overcommit || - rpc->msgin.bytes_remaining < - atomic_read(&homa->active_remaining - [homa->max_overcommit - 1])) - goto recalc; - goto done; - } - /* Not a new message; see if we can upgrade the message's priority. - * This accesses data that might be changing concurrently (e.g. - * active_rpcs), but it should be safe: the worst that can happen - * is extra calls to homa_grant_recalc. - */ - rank = atomic_read(&rpc->msgin.rank); - if (homa->active_rpcs[rank] != rpc) { - /* RPC not currently active. */ - if (rpc->msgin.bytes_remaining < - atomic_read(&homa->active_remaining[homa->max_overcommit - - 1])) { - INC_METRIC(grant_priority_bumps, 1); - goto recalc; - } - goto done; - } - atomic_set(&homa->active_remaining[rank], rpc->msgin.bytes_remaining); - if (rank > 0 && rpc->msgin.bytes_remaining < - atomic_read(&homa->active_remaining[rank - 1])) { - INC_METRIC(grant_priority_bumps, 1); - goto recalc; - } - - if (atomic_read(&homa->incoming_hit_limit) != 0 && - atomic_read(&homa->total_incoming) < homa->max_incoming) { - goto recalc; - } - - /* Ideally this should be the common case: no need to consider - * any other RPCs or recompute priorities. - */ - new_offset = homa_grant_get_offset(rpc, homa); - if (new_offset > rpc->msgin.granted) { - int recalc = 0; - - tt_record4("sending grant for id %llu, offset %d, priority %d, increment %d", - rpc->id, new_offset, - rpc->msgin.priority, - new_offset - rpc->msgin.granted); - rpc->msgin.granted = new_offset; - homa_grant_update_incoming(rpc, homa); - if (rpc->msgin.granted >= rpc->msgin.length) { - homa_grant_remove_rpc(rpc); - recalc = 1; - } + /* This function handles 4 different things: + * 1. It generates new grant packets for @rpc if appropriate. This + * is the common case. + * 2. If total_incoming had been exhausted, but headroom is now + * available, it sends grants to the highest priority RPC that + * needs them, which may not be @rpc. + * 3. It occasionally sends grants to the oldest RPC as determined + * by the fifo_grant_fraction parameter. + * 4. It occasionally scans active_rpcs to restore proper priority + * order. More on this below. + * + * Cases 2-4 require the global grant lock, but that lock is in + * danger of overload, particularly as network speeds increase. So, + * this function handles case 1 without acquiring the grant lock. + * Issuing a grant to @rpc may change its priority relative to other + * RPCs in active_rpcs, but we don't check for that in the common + * case, since it would require the grant lock. 
Instead, this function + * occasionally scans all the RPCs in active_rpcs to fix any priority + * inversions that may have developed. The interval for these scans + * is chosen so as not to create too much contention for the grant lock. + */ + now = sched_clock(); + limit = atomic_xchg(&grant->incoming_hit_limit, false); + recalc = now >= READ_ONCE(grant->next_recalc); + if (!recalc && !limit) { + /* Fast path (Case 1). */ + send_grant = homa_grant_update_granted(rpc, grant); + homa_grant_update_incoming(rpc, grant); + if (!send_grant) + return; + + homa_grant_cand_init(&cand); + if (rpc->msgin.granted >= rpc->msgin.length) + homa_grant_unmanage_rpc(rpc, &cand); + + /* Sending a grant is slow, so release the RPC lock while + * sending the grant to reduce contention. + */ homa_rpc_hold(rpc); homa_rpc_unlock(rpc); homa_grant_send(rpc); - if (recalc) - homa_grant_recalc(homa); + if (!homa_grant_cand_empty(&cand)) + homa_grant_cand_check(&cand, grant); homa_rpc_lock(rpc); homa_rpc_put(rpc); + return; } - goto done; - -recalc: - homa_rpc_hold(rpc); - homa_rpc_unlock(rpc); - homa_grant_recalc(homa); - homa_rpc_lock(rpc); - homa_rpc_put(rpc); -done: - tt_record1("homa_grant_check_rpc finished with id %d", rpc->id); -} + INC_METRIC(grant_check_slow_path, 1); + homa_grant_update_incoming(rpc, grant); + homa_grant_lock(grant); + if (recalc) { + /* Case 4. */ + grant->next_recalc = now + grant->recalc_ns; + homa_grant_fix_order(grant); + } -/** - * homa_grant_recalc() - Recompute which RPCs should currently receive grants, - * and what priorities to use for each. If needed, send out grant packets to - * ensure that all appropriate grants have been issued. This function is - * invoked whenever something happens that could change the contents or order - * of homa->active_rpcs. Caller must not hold any RPC locks (this function - * may need to lock RPCs). - * @homa: Overall information about the Homa transport. - */ -void homa_grant_recalc(struct homa *homa) -{ - /* A copy of homa->active_rpcs; needed so we can send grants - * without holding grant_lock. See Design Notes at the top - * of this file. - */ - struct homa_rpc *active_rpcs[HOMA_MAX_GRANTS]; - int i, active, try_again; - u64 start; - - UNIT_LOG("; ", "homa_grant_recalc"); - tt_record("homa_grant_recalc starting"); - INC_METRIC(grant_recalc_calls, 1); - start = sched_clock(); - - /* We may have to recalculate multiple times if grants sent in one - * round cause messages to be completely granted, opening up - * opportunities to grant to additional messages. + /* Cases 3 and 4: search all active RPCs to find any that do + * not have a full window of grants. Then release the grant lock + * and send grants. */ - while (1) { - /* The first part of this computation holds the grant - * lock but not individual RPC locks (we know that any RPC - * in @homa->active_rpcs cannot be reaped until it is removed - * from the list, and that requires the grant lock). It - * takes references on all active RPCs before releasing the - * grant lock. - */ - if (!homa_grant_lock(homa, 1)) { - INC_METRIC(grant_recalc_skips, 1); - break; - } - - try_again = 0; - atomic_inc(&homa->grant_recalc_count); - atomic_set(&homa->incoming_hit_limit, 0); - - /* Recompute which RPCs we'll grant to and initialize info - * about them. - */ - active = homa_grant_pick_rpcs(homa, homa->active_rpcs, - homa->max_overcommit); - for (i = active; i < homa->max_overcommit; i++) - /* Effectively invalidates @msgin.rank in RPCs that - * are no longer active. 
- */ - homa->active_rpcs[i] = NULL; - homa->num_active_rpcs = active; - for (i = 0; i < active; i++) { - struct homa_rpc *rpc = homa->active_rpcs[i]; - int extra_levels; - - active_rpcs[i] = rpc; - homa_rpc_hold(rpc); - atomic_set(&homa->active_remaining[i], - rpc->msgin.bytes_remaining); - atomic_set(&rpc->msgin.rank, i); - - /* Compute the priority to use for this RPC's grants: - * if there aren't enough RPCs to consume all of the - * priority levels, use only the lower levels; this - * allows faster preemption if a new high-priority - * message appears. - */ - rpc->msgin.priority = homa->max_sched_prio - i; - extra_levels = homa->max_sched_prio + 1 - - homa->num_active_rpcs; - if (extra_levels >= 0) - rpc->msgin.priority -= extra_levels; - if (rpc->msgin.priority < 0) - rpc->msgin.priority = 0; - } - - /* Compute the maximum window size for any RPC. Dynamic window - * sizing uses the approach inspired by the paper "Dynamic Queue - * Length Thresholds for Shared-Memory Packet Switches" with an - * alpha value of 1. The idea is to maintain unused incoming - * capacity (for new RPC arrivals) equal to the amount of - * incoming allocated to each of the current RPCs. - */ - if (homa->window_param != 0) - homa->grant_window = homa->window_param; - else - homa->grant_window = homa->max_incoming / - (homa->num_active_rpcs + 1); - - /* The second part of the computation is done without - * holding the grant lock, but it will acquire RPC locks. - * The grant lock is released because (a) we want to - * reduce contention for it and (b) we can't acquire RPC locks - * while holding it. References on the active RPCs keep them - * from being reaped. - */ - homa_grant_unlock(homa); - for (i = 0; i < active; i++) { - struct homa_rpc *rpc = active_rpcs[i]; - int new_offset; - - new_offset = homa_grant_get_offset(rpc, homa); - if (new_offset > rpc->msgin.granted) { - tt_record4("sending grant for id %llu, offset %d, priority %d, increment %d", - rpc->id, new_offset, - rpc->msgin.priority, - new_offset - rpc->msgin.granted); - homa_rpc_lock(rpc); - rpc->msgin.granted = new_offset; - homa_grant_update_incoming(rpc, homa); - if (rpc->msgin.granted >= rpc->msgin.length) { - homa_grant_remove_rpc(rpc); - try_again = 1; - } - homa_rpc_unlock(rpc); - homa_grant_send(rpc); - } - homa_rpc_put(rpc); - } + homa_grant_cand_init(&cand); + for (i = 0; i < grant->num_active_rpcs; i++) { + struct homa_rpc *rpc2 = grant->active_rpcs[i]; - if (try_again == 0) - break; - INC_METRIC(grant_recalc_loops, 1); + if (rpc2->msgin.rec_incoming < grant->window && + rpc2->state != RPC_DEAD) + homa_grant_cand_add(&cand, rpc2); + } + homa_grant_unlock(grant); + if (!homa_grant_cand_empty(&cand)) { + homa_rpc_hold(rpc); + homa_rpc_unlock(rpc); + homa_grant_cand_check(&cand, grant); + homa_rpc_lock(rpc); + homa_rpc_put(rpc); } - INC_METRIC(grant_recalc_ns, sched_clock() - start); } /** - * homa_grant_pick_rpcs() - Scan the grantable lists to identify the highest - * priority RPCs for granting, subject to homa->max_rpcs_per_peer. - * @homa: Overall data about the Homa protocol implementation. - * @rpcs: The selected RPCs will be stored in this array, in - * decreasing priority order. - * @max_rpcs: Maximum number of RPCs to return in @rpcs. - * Return: The number of RPCs actually stored in @rpcs. + * homa_grant_fix_order() - This function scans all of the RPCS in + * @active_rpcs and repairs any priority inversions that may exist. + * @grant: Overall grant management information. 
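+ *
+ * The scan is a bounded insertion sort: num_active_rpcs can never
+ * exceed HOMA_MAX_GRANTS, so its cost is modest.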
*/ -int homa_grant_pick_rpcs(struct homa *homa, struct homa_rpc **rpcs, - int max_rpcs) +void homa_grant_fix_order(struct homa_grant *grant) + __must_hold(&grant->lock) { - struct homa_peer *peer; - struct homa_rpc *rpc; - int num_rpcs = 0; - - /* Iterate over peers, in decreasing order of "highest priority - * RPC from this peer". - */ - list_for_each_entry(peer, &homa->grantable_peers, grantable_links) { - int rpcs_from_peer = 0; - - /* Consider up to homa->max_rpcs_per_peer from this peer, - * in decreasing order of priority. - */ - list_for_each_entry(rpc, &peer->grantable_rpcs, - grantable_links) { - int i, pos; - - /* Figure out where this RPC should be positioned - * in the result. - */ - for (i = num_rpcs - 1; i >= 0; i--) { - if (!homa_grant_outranks(rpc, rpcs[i])) - break; - } - - /* Rpc must go at position i+1. */ - pos = i + 1; - if (pos >= max_rpcs) + struct homa_rpc *rpc, *other; + int i, j; + + for (i = 1; i < grant->num_active_rpcs; i++) { + rpc = grant->active_rpcs[i]; + for (j = i - 1; j >= 0; j--) { + other = grant->active_rpcs[j]; + if (!homa_grant_outranks(rpc, other)) break; - if (num_rpcs < max_rpcs) { - for (i = num_rpcs - 1; i >= pos; i--) - rpcs[i + 1] = rpcs[i]; - num_rpcs++; - } else { - for (i = max_rpcs - 2; i >= pos; i--) - rpcs[i + 1] = rpcs[i]; - } - rpcs[pos] = rpc; - rpcs_from_peer++; - if (rpcs_from_peer >= homa->max_rpcs_per_peer) - break; - } - if (rpcs_from_peer == 0) { - /* If even the best RPC from this peer didn't fit, - * then no RPCS from any other peer will fit. - */ - break; + grant->active_rpcs[j + 1] = other; + other->msgin.rank = j + 1; + grant->active_rpcs[j] = rpc; + rpc->msgin.rank = j; + INC_METRIC(grant_priority_bumps, 1); } } - return num_rpcs; } /** @@ -614,7 +780,7 @@ int homa_grant_pick_rpcs(struct homa *homa, struct homa_rpc **rpcs, */ void homa_grant_find_oldest(struct homa *homa) { - int max_incoming = homa->grant_window + 2 * homa->fifo_grant_increment; + int max_incoming = homa->grant->window + 2 * homa->grant->fifo_grant_increment; struct homa_rpc *rpc, *oldest; struct homa_peer *peer; u64 oldest_birth; @@ -625,7 +791,7 @@ void homa_grant_find_oldest(struct homa *homa) /* Find the oldest message that doesn't currently have an * outstanding "pity grant". */ - list_for_each_entry(peer, &homa->grantable_peers, grantable_links) { + list_for_each_entry(peer, &homa->grant->grantable_peers, grantable_links) { list_for_each_entry(rpc, &peer->grantable_rpcs, grantable_links) { int received, incoming; @@ -649,33 +815,59 @@ void homa_grant_find_oldest(struct homa *homa) oldest_birth = rpc->msgin.birth; } } - homa->oldest_rpc = oldest; + homa->grant->oldest_rpc = oldest; } /** - * homa_grant_end_rpc() - This function is invoked when homa_rpc_end is - * invoked; it cleans up any state related to grants for that RPC's - * incoming message. - * @rpc: The RPC to clean up. Must be locked by the caller. This function - * may release and then reacquire the lock. + * homa_grant_cand_add() - Add an RPC into the struct, if there is + * space. After this function is called, homa_grant_cand_check must + * eventually be called to process the entries and release reference + * counts. + * @cand: Structure in which to add @rpc. + * @rpc: RPC to add. 
If added successfully its reference count will + * be incremented */ -void homa_grant_end_rpc(struct homa_rpc *rpc) +void homa_grant_cand_add(struct homa_grant_candidates *cand, + struct homa_rpc *rpc) { - struct homa *homa = rpc->hsk->homa; + if (cand->inserts < cand->removes + HOMA_MAX_CAND_RPCS) { + homa_rpc_hold(rpc); + cand->rpcs[cand->inserts & HOMA_CAND_MASK] = rpc; + cand->inserts++; + } +} - if (!list_empty(&rpc->grantable_links)) { - homa_grant_remove_rpc(rpc); - if (homa->active_rpcs[atomic_read(&rpc->msgin.rank)] == rpc) { - homa_rpc_hold(rpc); +/** + * homa_grant_cand_check() - Scan all of the entries in @cand, issuing + * grants if possible and releasing reference counts. This function + * will acquire each RPCs lock, so the caller must not hold RPC locks + * or locks that conflict with RPC locks, such as the + * grant lock. + * @cand: Check all of the RPCs in this struct. + * @grant: Grant management information. + */ +void homa_grant_cand_check(struct homa_grant_candidates *cand, + struct homa_grant *grant) +{ + struct homa_rpc *rpc; + + while (cand->removes < cand->inserts) { + rpc = cand->rpcs[cand->removes & HOMA_CAND_MASK]; + cand->removes++; + homa_rpc_lock(rpc); + + if (rpc->state != RPC_DEAD && + homa_grant_update_granted(rpc, grant)) { + homa_grant_update_incoming(rpc, grant); + if (rpc->msgin.granted >= rpc->msgin.length) + homa_grant_unmanage_rpc(rpc, cand); + homa_rpc_unlock(rpc); + homa_grant_send(rpc); + } else { homa_rpc_unlock(rpc); - homa_grant_recalc(homa); - homa_rpc_lock(rpc); - homa_rpc_put(rpc); } + homa_rpc_put(rpc); } - - if (rpc->msgin.rec_incoming != 0) - atomic_sub(rpc->msgin.rec_incoming, &homa->total_incoming); } /** @@ -684,47 +876,73 @@ void homa_grant_end_rpc(struct homa_rpc *rpc) * available. It waits for the lock, but also records statistics about * the waiting time. * @homa: Overall data about the Homa protocol implementation. - * @recalc: Nonzero means the caller is homa_grant_recalc; if another thread - * is already recalculating, can return without waiting for the lock. - * Return: Nonzero means this thread now owns the grant lock. Zero - * means the lock was not acquired and there is no need for this - * thread to do the work of homa_grant_recalc because some other - * thread started a fresh calculation after this method was invoked. */ -int homa_grant_lock_slow(struct homa *homa, int recalc) +void homa_grant_lock_slow(struct homa_grant *grant) __acquires(&homa->grant_lock) { - int starting_count = atomic_read(&homa->grant_recalc_count); u64 start = sched_clock(); - int result = 0; tt_record("beginning wait for grant lock"); - while (1) { - if (spin_trylock_bh(&homa->grant_lock)) { - tt_record("ending wait for grant lock"); - result = 1; - break; - } - if (recalc && atomic_read(&homa->grant_recalc_count) - != starting_count) { - tt_record("skipping wait for grant lock: recalc elsewhere"); - break; - } - } + spin_lock_bh(&grant->lock); + tt_record("ending wait for grant lock"); INC_METRIC(grant_lock_misses, 1); INC_METRIC(grant_lock_miss_ns, sched_clock() - start); - return result; +} + +/** + * homa_grant_update_sysctl_deps() - Invoked whenever a sysctl value is changed; + * updates variables that depend on sysctl-settable values. + * @grant: Structure in which to update information. 
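+ *
+ * Invoked from homa_grant_new and from homa_grant_dointvec after a
+ * sysctl value is written.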
+ */
+void homa_grant_update_sysctl_deps(struct homa_grant *grant)
+{
+	u64 tmp;
+
+	if (grant->max_overcommit > HOMA_MAX_GRANTS)
+		grant->max_overcommit = HOMA_MAX_GRANTS;
+
+	if (grant->fifo_fraction > 500)
+		grant->fifo_fraction = 500;
+	tmp = grant->fifo_fraction;
+	if (tmp != 0)
+		tmp = (1000 * grant->fifo_grant_increment) / tmp -
+				grant->fifo_grant_increment;
+	grant->grant_nonfifo = tmp;
+
+	grant->recalc_ns = grant->recalc_usecs * 1000;
+
+	grant->window = homa_grant_window(grant);
 }
 
 #ifndef __STRIP__ /* See strip.py */
 /**
- * homa_grant_log_tt() - Generate timetrace records describing all of
- * the active RPCs (those we are currently granting to).
- * @homa:    Overall information about the Homa transport.
+ * homa_grant_dointvec() - This function is a wrapper around proc_dointvec. It
+ * is invoked to read and write grant-related sysctl values.
+ * @table:    sysctl table describing value to be read or written.
+ * @write:    Nonzero means value is being written, 0 means read.
+ * @buffer:   Address in user space of the input/output data.
+ * @lenp:     Pointer to the number of bytes to transfer to/from @buffer;
+ *            updated on return to the number of bytes actually processed.
+ * @ppos:     Pointer to the current file offset; advanced by the number
+ *            of bytes processed.
+ *
+ * Return: 0 for success, nonzero for error.
  */
-void homa_grant_log_tt(struct homa *homa)
+int homa_grant_dointvec(const struct ctl_table *table, int write,
+			void *buffer, size_t *lenp, loff_t *ppos)
 {
-	tt_record1("homa_grant_log_tt found %d active RPCs",
-		   homa->num_active_rpcs);
+	struct homa_grant *grant =
+			homa_from_net(current->nsproxy->net_ns)->grant;
+	struct ctl_table table_copy;
+	int result;
+
+	/* Generate a new ctl_table that refers to a field in the
+	 * net-specific struct homa_grant.
+	 */
+	table_copy = *table;
+	table_copy.data = ((char *) grant) + (uintptr_t) table_copy.data;
+
+	result = proc_dointvec(&table_copy, write, buffer, lenp, ppos);
+	if (write)
+		homa_grant_update_sysctl_deps(grant);
+	return result;
 }
 #endif /* See strip.py */
diff --git a/homa_grant.h b/homa_grant.h
index 18611757..aa6beee7 100644
--- a/homa_grant.h
+++ b/homa_grant.h
@@ -7,70 +7,296 @@
 
 #include "homa_rpc.h"
 
-void homa_grant_add_rpc(struct homa_rpc *rpc);
+/**
+ * define HOMA_MAX_GRANTS - Used to size various data structures for grant
+ * management; the max_overcommit sysctl parameter must never be greater than
+ * this.
+ */
+#define HOMA_MAX_GRANTS 10
+
+/**
+ * struct homa_grant - Holds information used to manage the sending of
+ * grants for incoming messages. There is one instance of this object
+ * stored in each struct homa.
+ */
+struct homa_grant {
+	/**
+	 * @total_incoming: the total number of bytes that we expect to receive
+	 * (across all messages) even if we don't send out any more grants
+	 * (includes granted but unreceived bytes, plus unreceived unscheduled
+	 * bytes that we know about). This can potentially be negative, if
+	 * a peer sends more bytes than granted (see synchronization note in
+	 * homa_send_grants for why we have to allow this possibility).
+	 */
+	atomic_t total_incoming;
+
+	/**
+	 * @incoming_hit_limit: True means that one or more RPCs could
+	 * not be fully granted because @total_incoming exceeded @max_incoming.
+	 */
+	atomic_t incoming_hit_limit;
+
+	/**
+	 * @max_incoming: Homa will try to ensure that the total number of
+	 * bytes senders have permission to send to this host (either
+	 * unscheduled bytes or granted bytes) does not exceed this value.
+	 * Set externally via sysctl.
+	 */
+	int max_incoming;
+
+	/**
+	 * @lock: The grant lock: used to synchronize access to grant-related
+	 * fields below as well as some fields in homa_rpc structs. 
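+	 * Acquire via homa_grant_lock, which falls back to
+	 * homa_grant_lock_slow to record contention statistics.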
+ */ + spinlock_t lock ____cacheline_aligned_in_smp; + + /** + * @lock_time: sched_clock() time when lock was last locked. Used + * for computing statistics. + */ + u64 lock_time; + + /** + * @num_active_rpcs: Number of entries in @active_rpcs that are + * currently used. + */ + int num_active_rpcs; + + /** + * @active_rpcs: The highest-priority RPCs that still need grants. + * Lower index in the list means higher priority. If an RPC is in + * this array then it is not in @grantable_peers. + */ + struct homa_rpc *active_rpcs[HOMA_MAX_GRANTS]; + + /** + * @grantable_peers: Contains all peers with entries in their + * grantable_rpcs lists. The list is sorted in priority order of + * the highest priority RPC for each peer (fewer ungranted bytes -> + * higher priority). + */ + struct list_head grantable_peers; + + /** + * @num_grantable_rpcs: Total number of RPCs with incoming + * messages that still need grants. Includes entries in both + * @active_rpcs and @grantable_peers. */ + int num_grantable_rpcs; + + /** + * @last_grantable_change: The sched_clock() time of the most recent + * increment or decrement of num_grantable_rpcs; used for computing + * statistics. + */ + u64 last_grantable_change; + + /** + * @max_grantable_rpcs: The largest value that has been seen for + * num_grantable_rpcs since this value was reset to 0 (it can be + * reset externally using sysctl). + */ + int max_grantable_rpcs; + + /** + * @window: Maximum number of granted but not yet received bytes for + * an incoming message. Computed from @window_param. + */ + int window; + + /** + * @window_param: Set externally via sysctl to select a policy for + * computing grant windows (how much granted but not yet received + * data an incoming message may have). If nonzero, then it specifies + * a (static) size for windows. 0 means compute windows dynamically + * based on the number of RPCs we're currently granting to. + */ + int window_param; + + /** + * @max_rpcs_per_peer: If there are multiple incoming messages from + * the same peer, Homa will only issue grants to this many of them + * at a time. Set externally via sysctl. + */ + int max_rpcs_per_peer; + + /** + * @max_overcommit: The maximum number of messages to which Homa will + * send grants at any given point in time. Set externally via sysctl. + */ + int max_overcommit; + + /** + * @recalc_usecs: Length of the priority recalculation interval, in + * microseconds. Each interval, priorities of the active messages + * get resorted if they have drifted out of order. Set externally + * via sysctl. + */ + int recalc_usecs; + + /** @recalc_ns: Same as @recalc_usec except in units of nanoseconds. */ + int recalc_ns; + + /** + * @next_recalc: Time (in sched_clock() nanoseconds) at which + * priorities should be recalculated. + */ + u64 next_recalc; + + /** + * @fifo_grant_increment: how many additional bytes to grant in + * a "pity" grant sent to the oldest outstanding message. Set + * externally via sysctl. + */ + int fifo_grant_increment; + + /** + * @fifo_fraction: The fraction (in thousandths) of granted + * bytes that should go to the *oldest* incoming message, rather + * than the highest priority ones. Set externally via sysctl. + */ + int fifo_fraction; + + /** + * @grant_nonfifo: How many bytes should be granted using the + * normal priority system between grants to the oldest message. + */ + int grant_nonfifo; + + /** + * @grant_nonfifo_left: Counts down bytes granted using the normal + * priority mechanism. 
When this reaches zero, it's time to grant + * to the oldest message. + */ + int grant_nonfifo_left; + + /** + * @oldest_rpc: The RPC with incoming data whose start_ns is + * farthest in the past). NULL means either there are no incoming + * RPCs or the oldest needs to be recomputed. Must hold grant_lock + * to update. + */ + struct homa_rpc *oldest_rpc; + +#ifndef __STRIP__ /* See strip.py */ + /** + * @sysctl_header: Used to remove sysctl values when this structure + * is destroyed. + */ + struct ctl_table_header *sysctl_header; +#endif /* See strip.py */ +} ____cacheline_aligned_in_smp; + +/** + * struct homa_grant_candidates() - Accumulates information about RPCs that + * can potentially be issued grants. Used in order to defer the actual + * granting until it is safe to acquire locks for the RPCs. + */ +struct homa_grant_candidates { + /** + * @inserts: Total number of RPCs that have been inserted in this + * structure over its lifetime. Low-order bits indicate where the + * next RPC should be inserted. + */ + u32 inserts; + + /** + * @removes: Total number of RPCs that have been removed from this + * structure over its lifetime. Low-order bits give index of next + * RPC to be checked for possible grant. + * */ + u32 removes; + + /* Maximum number of RPCs that can be stored in @rpcs. If space + * runs out some potentially grant-worthy RPCs may be ignored, + * but they will get another chance in a future call to + * homa_grant_check_all. Must be a power of 2. + */ +#define HOMA_MAX_CAND_RPCS 8 +#define HOMA_CAND_MASK (HOMA_MAX_CAND_RPCS - 1) + struct homa_rpc *rpcs[HOMA_MAX_CAND_RPCS]; + +}; + +void homa_grant_cand_add(struct homa_grant_candidates *cand, + struct homa_rpc *rpc); +void homa_grant_cand_check(struct homa_grant_candidates *cand, + struct homa_grant *grant); void homa_grant_check_rpc(struct homa_rpc *rpc); +void homa_grant_destroy(struct homa_grant *grant); +int homa_grant_dointvec(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); void homa_grant_end_rpc(struct homa_rpc *rpc); void homa_grant_find_oldest(struct homa *homa); -int homa_grant_get_offset(struct homa_rpc *rpc, struct homa *homa); -int homa_grant_lock_slow(struct homa *homa, int recalc); +void homa_grant_fix_order(struct homa_grant *grant); +void homa_grant_init_rpc(struct homa_rpc *rpc, int unsched); +struct homa_rpc + *homa_grant_insert_active(struct homa_rpc *rpc); +void homa_grant_insert_grantable(struct homa_rpc *rpc); +void homa_grant_manage_rpc(struct homa_rpc *rpc); +void homa_grant_lock_slow(struct homa_grant *grant); void homa_grant_log_tt(struct homa *homa); +struct homa_grant + *homa_grant_new(struct net *net); int homa_grant_outranks(struct homa_rpc *rpc1, struct homa_rpc *rpc2); -int homa_grant_pick_rpcs(struct homa *homa, struct homa_rpc **rpcs, - int max_rpcs); void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc); -void homa_grant_recalc(struct homa *homa); -void homa_grant_remove_rpc(struct homa_rpc *rpc); +int homa_grant_priority(struct homa *homa, int rank); +void homa_grant_remove_active(struct homa_rpc *rpc, + struct homa_grant_candidates *cand); +void homa_grant_remove_grantable(struct homa_rpc *rpc); void homa_grant_send(struct homa_rpc *rpc); +void homa_grant_unmanage_rpc(struct homa_rpc *rpc, + struct homa_grant_candidates *cand); +bool homa_grant_update_granted(struct homa_rpc *rpc, + struct homa_grant *grant); void homa_grant_update_incoming(struct homa_rpc *rpc, - struct homa *homa); + struct homa_grant *grant); +void 
homa_grant_update_sysctl_deps(struct homa_grant *grant);
+int      homa_grant_window(struct homa_grant *grant);
 
 /**
- * homa_grant_lock() - Acquire the grant lock. If the lock
- * isn't immediately available, record stats on the waiting time.
- * @homa:    Overall data about the Homa protocol implementation.
- * @recalc:  Nonzero means the caller is homa_grant_recalc; if another thread
- *           is already recalculating, can return without waiting for the lock.
- * Return:   Nonzero means this thread now owns the grant lock. Zero
- *           means the lock was not acquired and there is no need for this
- *           thread to do the work of homa_grant_recalc because some other
- *           thread started a fresh calculation after this method was invoked.
+ * homa_grant_cand_init() - Reset @cand to an empty state.
+ * @cand:  Structure to initialize.
  */
-static inline int homa_grant_lock(struct homa *homa, int recalc)
-	__acquires(&homa->grant_lock)
+static inline void homa_grant_cand_init(struct homa_grant_candidates *cand)
 {
-	int result;
-
-	if (spin_trylock_bh(&homa->grant_lock))
-		result = 1;
-	else
-		result = homa_grant_lock_slow(homa, recalc);
-	homa->grant_lock_time = sched_clock();
-	return result;
+	cand->inserts = 0;
+	cand->removes = 0;
 }
 
 /**
- * homa_grant_unlock() - Release the grant lock.
- * @homa:    Overall data about the Homa protocol implementation.
+ * homa_grant_cand_empty() - Returns true if there are no RPCs in @cand,
+ * false otherwise.
+ * @cand:  Structure to check.
+ * Return: See above.
+ */
+static inline bool homa_grant_cand_empty(struct homa_grant_candidates *cand)
+{
+	return cand->inserts == cand->removes;
+}
+
+/**
+ * homa_grant_lock() - Acquire the grant lock. If the lock
+ * isn't immediately available, record stats on the waiting time.
+ * @grant:   Grant management info.
  */
-static inline void homa_grant_unlock(struct homa *homa)
-	__releases(&homa->grant_lock)
+static inline void homa_grant_lock(struct homa_grant *grant)
+	__acquires(&grant->lock)
 {
-	INC_METRIC(grant_lock_ns, sched_clock() - homa->grant_lock_time);
-	spin_unlock_bh(&homa->grant_lock);
+	if (!spin_trylock_bh(&grant->lock))
+		homa_grant_lock_slow(grant);
+	grant->lock_time = sched_clock();
 }
 
 /**
- * homa_grant_needy_bit() - Return a bit mask with the bit set in the
- * position in @homa->grant_needy_ranks for @rank.
- * @rank:   Rank of an RPC (corresponds to position in @homa->active_rpcs).
- * Return:  A value with a 1-bit in the position corresponding to @rank,
- *          or 0 if rank is -1 or >= HOMA_MAX_PRIORITIES.
+ * homa_grant_unlock() - Release the grant lock.
+ * @grant:   Grant management info.
  */
-static inline int homa_grant_needy_bit(int rank)
+static inline void homa_grant_unlock(struct homa_grant *grant)
+	__releases(&grant->lock)
 {
-	/* Eliminate any bits that conflict with HOMA_MAX_PRIORITIES. */
-	return (1 << rank) & ((1 << HOMA_MAX_PRIORITIES) - 1);
+	INC_METRIC(grant_lock_ns, sched_clock() - grant->lock_time);
+	spin_unlock_bh(&grant->lock);
 }
 
-#endif /* _HOMA_GRANT_H */
+#endif /* _HOMA_GRANT_H */
\ No newline at end of file
diff --git a/homa_impl.h b/homa_impl.h
index eed204c5..e6bc6fde 100644
--- a/homa_impl.h
+++ b/homa_impl.h
@@ -83,15 +83,6 @@ void     homa_throttle_lock_slow(struct homa *homa);
 
 #define sizeof32(type) ((int)(sizeof(type)))
 
-#ifndef __STRIP__ /* See strip.py */
-/**
- * define HOMA_MAX_GRANTS - Used to size various data structures for grant
- * management; the max_overcommit sysctl parameter must never be greater than
- * this. 
- */ -#define HOMA_MAX_GRANTS 10 -#endif /* See strip.py */ - /** * union sockaddr_in_union - Holds either an IPv4 or IPv6 address (smaller * and easier to use than sockaddr_storage). @@ -122,119 +113,10 @@ struct homa { #ifndef __STRIP__ /* See strip.py */ /** - * @grant_lock: Used to synchronize access to grant-related - * fields below. In order to reduce contention, this lock is held - * only when making structural changes (e.g. modifying grantable_peers - * or active_rpcs). It is not held when computing new grant offsets - * and/or sending grant packets. Under some race conditions, it is - * possible for RPCs to receive grants out of priority order, or to - * receive duplicate grants. - */ - spinlock_t grant_lock ____cacheline_aligned_in_smp; - - /** - * @grant_lock_time: sched_clock() time when grant_lock - * was last locked. - */ - u64 grant_lock_time; - - /** - * @grant_recalc_count: Incremented every time homa_grant_recalc - * starts a new recalculation; used to avoid unnecessary - * recalculations in other threads. If a thread sees this value - * change, it knows that someone else is recalculating grants. - */ - atomic_t grant_recalc_count; - - /** - * @grantable_peers: Contains all peers with entries in their - * grantable_rpcs lists. The list is sorted in priority order of - * the highest priority RPC for each peer (fewer ungranted bytes -> - * higher priority). - */ - struct list_head grantable_peers; - - /** @num_grantable_rpcs: The number of RPCs in grantable_peers. */ - int num_grantable_rpcs; - - /** @last_grantable_change: The sched_clock() time of the most recent - * increment or decrement of num_grantable_rpcs; used for computing - * statistics. - */ - u64 last_grantable_change; - - /** - * @max_grantable_rpcs: The largest value that has been seen for - * num_grantable_rpcs since this value was reset to 0 (it can be - * reset externally using sysctl). - */ - int max_grantable_rpcs; - - /** - * @num_active_rpcs: number of entries in @active_rpcs and - * @active_remaining that are currently used. - */ - int num_active_rpcs; - - /** - * @active_rpcs: Hints about RPCs that we are currently granting to - * (lower index in the array means higher priority). Entries may be - * NULL or may refer to RPCs that no longer exist, so can't dereference - * these pointers. + * @grant: Contains information used by homa_grant.c to manage + * grants for incoming messages. */ - struct homa_rpc *active_rpcs[HOMA_MAX_GRANTS]; - - /** - * @active_remaining: entry i in this array contains a copy of - * active_rpcs[i]->msgin.bytes_remaining. These values can be - * updated by the corresponding RPCs without holding the grant - * lock. Perfect consistency isn't required; this are hints used to - * detect when the priority ordering of messages changes. - */ - atomic_t active_remaining[HOMA_MAX_GRANTS]; - - /** - * @oldest_rpc: The RPC with incoming data whose start_ns is - * farthest in the past). NULL means either there are no incoming - * RPCs or the oldest needs to be recomputed. Must hold grant_lock - * to update. - */ - struct homa_rpc *oldest_rpc; - - /** - * @grant_window: How many bytes of granted but not yet received data - * may exist for an RPC at any given time. - */ - int grant_window; - - /** - * @grant_nonfifo: How many bytes should be granted using the - * normal priority system between grants to the oldest message. - */ - int grant_nonfifo; - - /** - * @grant_nonfifo_left: Counts down bytes using the normal - * priority mechanism. 
When this reaches zero, it's time to grant - * to the old message. - */ - int grant_nonfifo_left; - - /** - * @total_incoming: the total number of bytes that we expect to receive - * (across all messages) even if we don't send out any more grants - * (includes granted but unreceived bytes, plus unreceived unscheduled - * bytes that we know about). This can potentially be negative, if - * a peer sends more bytes than granted (see synchronization note in - * homa_send_grants for why we have to allow this possibility). - */ - atomic_t total_incoming ____cacheline_aligned_in_smp; - - /** - * @incoming_hit_limit: Nonzero means that one or more RPCs could - * not be fully granted because @total_incoming exceeded @max_incoming. - */ - atomic_t incoming_hit_limit; + struct homa_grant *grant; #endif /* See strip.py */ /** @@ -324,15 +206,6 @@ struct homa { */ int unsched_bytes; - /** - * @window_param: Set externally via sysctl to select a policy for - * computing homa-grant_window. If 0 then homa->grant_window is - * computed dynamically based on the number of RPCs we're currently - * granting to. If nonzero then homa->grant_window will always be the - * same as @window_param. - */ - int window_param; - /** * @poll_usecs: Amount of time (in microseconds) that a thread * will spend busy-waiting for an incoming messages before @@ -382,41 +255,6 @@ struct homa { * next version change. Can be set externally via sysctl. */ int cutoff_version; - - /** - * @fifo_grant_increment: how many additional bytes to grant in - * a "pity" grant sent to the oldest outstanding message. Set - * externally via sysctl. - */ - int fifo_grant_increment; - - /** - * @grant_fifo_fraction: The fraction (in thousandths) of granted - * bytes that should go to the *oldest* incoming message, rather - * than the highest priority ones. Set externally via sysctl. - */ - int grant_fifo_fraction; - - /** - * @max_overcommit: The maximum number of messages to which Homa will - * send grants at any given point in time. Set externally via sysctl. - */ - int max_overcommit; - - /** - * @max_incoming: Homa will try to ensure that the total number of - * bytes senders have permission to send to this host (either - * unscheduled bytes or granted bytes) does not exceeds this value. - * Set externally via sysctl. - */ - int max_incoming; - - /** - * @max_rpcs_per_peer: If there are multiple incoming messages from - * the same peer, Homa will only issue grants to this many of them - * at a time. Set externally via sysctl. - */ - int max_rpcs_per_peer; #endif /* See strip.py */ /** diff --git a/homa_incoming.c b/homa_incoming.c index 5e80b965..8a2e7a8e 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -50,20 +50,13 @@ int homa_message_in_init(struct homa_rpc *rpc, int length) skb_queue_head_init(&rpc->msgin.packets); INIT_LIST_HEAD(&rpc->msgin.gaps); rpc->msgin.bytes_remaining = length; -#ifndef __STRIP__ /* See strip.py */ - rpc->msgin.granted = (unsched > length) ? length : unsched; - atomic_set(&rpc->msgin.rank, 0); -#endif /* See strip.py */ err = homa_pool_allocate(rpc); - if (err != 0) + if (err != 0) { + rpc->msgin.length = -1; return err; -#ifndef __STRIP__ /* See strip.py */ - if (rpc->msgin.num_bpages == 0) { - /* The RPC is now queued waiting for buffer space, so we're - * going to discard all of its packets. 
-		 */
-		rpc->msgin.granted = 0;
 	}
+#ifndef __STRIP__ /* See strip.py */
+	homa_grant_init_rpc(rpc, unsched);
 	if (length < HOMA_NUM_SMALL_COUNTS * 64) {
 		INC_METRIC(small_msg_bytes[(length - 1) >> 6], length);
 	} else if (length < HOMA_NUM_MEDIUM_COUNTS * 1024) {
@@ -1442,19 +1435,6 @@ void homa_rpc_handoff(struct homa_rpc *rpc)
  */
 void homa_incoming_sysctl_changed(struct homa *homa)
 {
-	u64 tmp;
-
-	if (homa->grant_fifo_fraction > 500)
-		homa->grant_fifo_fraction = 500;
-	tmp = homa->grant_fifo_fraction;
-	if (tmp != 0)
-		tmp = (1000 * homa->fifo_grant_increment) / tmp -
-				homa->fifo_grant_increment;
-	homa->grant_nonfifo = tmp;
-
-	if (homa->max_overcommit > HOMA_MAX_GRANTS)
-		homa->max_overcommit = HOMA_MAX_GRANTS;
-
 	homa->busy_ns = homa->busy_usecs * 1000;
 	homa->gro_busy_ns = homa->gro_busy_usecs * 1000;
 }
diff --git a/homa_metrics.c b/homa_metrics.c
index 064145d6..ed1f1c47 100644
--- a/homa_metrics.c
+++ b/homa_metrics.c
@@ -313,24 +313,16 @@ char *homa_metrics_print(void)
 	  m->peer_ack_lock_misses);
 	M("peer_ack_lock_miss_ns      %15llu  Time lost waiting for peer ack locks\n",
 	  m->peer_ack_lock_miss_ns);
-	M("grant_lock_misses           %15llu  Grant lock misses\n",
+	M("grant_lock_misses          %15llu  Grant lock misses\n",
 	  m->grant_lock_misses);
-	M("grant_lock_miss_ns          %15llu  Time lost waiting for grant lock\n",
+	M("grant_lock_miss_ns         %15llu  Time lost waiting for grant lock\n",
 	  m->grant_lock_miss_ns);
 	M("grantable_rpcs_integral    %15llu  Integral of homa->num_grantable_rpcs*dt\n",
 	  m->grantable_rpcs_integral);
-	M("grant_check_calls           %15llu  Number of calls to homa_grant_check_rpc\n",
-	  m->grant_check_calls);
-	M("grant_recalc_calls          %15llu  Number of calls to homa_grant_recalc\n",
-	  m->grant_recalc_calls);
-	M("grant_recalc_ns             %15llu  Time spent in homa_grant_recalc\n",
-	  m->grant_recalc_ns);
-	M("grant_recalc_loops          %15llu  Number of times homa_grant_recalc looped back\n",
-	  m->grant_recalc_loops);
-	M("grant_recalc_skips          %15llu  Number of times homa_grant_recalc skipped redundant work\n",
-	  m->grant_recalc_skips);
-	M("grant_check_needy_calls     %15llu  Number of calls to homa_grant_check_needy\n",
-	  m->grant_recalc_skips);
+	M("grant_check_calls          %15llu  Number of calls to homa_grant_check_rpc\n",
+	  m->grant_check_calls);
+	M("grant_check_slow_path      %15llu  Number of times homa_grant_check_rpc acquired grant lock\n",
+	  m->grant_check_slow_path);
 	M("grant_priority_bumps       %15llu  Number of times an RPC moved up in the grant priority order\n",
 	  m->grant_priority_bumps);
 	M("fifo_grants                %15llu  Grants issued using FIFO priority\n",
diff --git a/homa_metrics.h b/homa_metrics.h
index afc32b31..f4560ad2 100644
--- a/homa_metrics.h
+++ b/homa_metrics.h
@@ -489,32 +489,10 @@ struct homa_metrics {
 	u64 grant_check_calls;
 
 	/**
-	 * @grant_recalc_calls: cumulative number of times homa_grant_recalc
-	 * has been invoked.
-	 */
-	u64 grant_recalc_calls;
-
-	/** @grant_recalc_ns: total time spent in homa_grant_recalc. */
-	u64 grant_recalc_ns;
-
-	/**
-	 * @grant_recalc_loops: cumulative number of times homa_grant_recalc
-	 * has looped back to recalculate again.
-	 */
-	u64 grant_recalc_loops;
-
-	/**
-	 * @grant_recalc_skips: cumulative number of times that
-	 * homa_grant_recalc skipped its work because in other thread
-	 * already did it.
-	 */
-	u64 grant_recalc_skips;
-
-	/**
-	 * @grant_check_needy_calls: cumulative number of times that
-	 * homa_grant_check_needy has been invoked.
+	 * @grant_check_slow_path: cumulative number of times
+	 * homa_grant_check_rpc acquired the grant lock.
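+	 * (Editorial note, not from the patch: comparing this counter with
+	 * @grant_check_calls shows how often homa_grant_check_rpc falls off
+	 * its fast path and must take the grant lock; the notes.txt entry
+	 * "grant_check_slow_path too high" later in this patch suggests the
+	 * ratio is worth monitoring.)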
 	 */
-	u64 grant_check_needy_calls;
+	u64 grant_check_slow_path;
 
 	/**
 	 * @grant_priority_bumps: cumulative number of times the grant priority
diff --git a/homa_outgoing.c b/homa_outgoing.c
index c73facb2..2c60b938 100644
--- a/homa_outgoing.c
+++ b/homa_outgoing.c
@@ -588,8 +588,7 @@ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk)
  * the NIC queue is sufficiently long.
  */
 void homa_xmit_data(struct homa_rpc *rpc, bool force)
-	__releases(rpc->bucket_lock)
-	__acquires(rpc->bucket_lock)
+	__must_hold(&rpc->bucket->lock)
 {
 	struct homa *homa = rpc->hsk->homa;
 #ifndef __STRIP__ /* See strip.py */
@@ -634,6 +633,7 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force)
 
 		rpc->msgout.next_xmit_offset +=
 				homa_get_skb_info(skb)->data_bytes;
+		homa_rpc_hold(rpc);
 		homa_rpc_unlock(rpc);
 		skb_get(skb);
 #ifndef __STRIP__ /* See strip.py */
@@ -648,6 +648,7 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force)
 #endif /* See strip.py */
 		force = false;
 		homa_rpc_lock(rpc);
+		homa_rpc_put(rpc);
 		if (rpc->state == RPC_DEAD)
 			break;
 	}
diff --git a/homa_peer.h b/homa_peer.h
index 09bd4b61..947832fc 100644
--- a/homa_peer.h
+++ b/homa_peer.h
@@ -117,18 +117,26 @@ struct homa_peer {
 	unsigned long last_update_jiffies;
 
 	/**
-	 * @grantable_rpcs: Contains all homa_rpcs (both requests and
-	 * responses) involving this peer whose msgins require (or required
-	 * them in the past) and have not been fully received. The list is
-	 * sorted in priority order (head has fewest bytes_remaining).
-	 * Locked with homa->grant_lock.
+	 * @active_rpcs: Number of RPCs involving this peer whose incoming
+	 * messages are currently in homa->grant->active_rpcs. Managed by
+	 * homa_grant.c under the grant lock.
+	 */
+	int active_rpcs;
+
+	/**
+	 * @grantable_rpcs: Contains homa_rpcs (both requests and responses)
+	 * involving this peer that are not in homa->grant->active_rpcs but
+	 * whose msgins eventually need more grants. The list is sorted in
+	 * priority order (head has fewest ungranted bytes). Managed by
+	 * homa_grant.c under the grant lock.
 	 */
 	struct list_head grantable_rpcs;
 
 	/**
 	 * @grantable_links: Used to link this peer into homa->grantable_peers.
 	 * If this RPC is not linked into homa->grantable_peers, this is an
-	 * empty list pointing to itself.
+	 * empty list pointing to itself. 
Managed by homa_grant.c under the
+	 * grant lock.
 	 */
 	struct list_head grantable_links;
 #endif /* See strip.py */
diff --git a/homa_plumbing.c b/homa_plumbing.c
index 4366c26e..29a1a5ec 100644
--- a/homa_plumbing.c
+++ b/homa_plumbing.c
@@ -6,6 +6,7 @@
 #include "homa_impl.h"
 #ifndef __STRIP__ /* See strip.py */
+#include "homa_grant.h"
 #include "homa_offload.h"
 #endif /* See strip.py */
 #include "homa_pacer.h"
@@ -195,13 +196,6 @@ static struct ctl_table homa_ctl_table[] = {
 		.mode		= 0644,
 		.proc_handler	= homa_dointvec
 	},
-	{
-		.procname	= "fifo_grant_increment",
-		.data		= OFFSET(fifo_grant_increment),
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= homa_dointvec
-	},
 	{
 		.procname	= "flags",
 		.data		= OFFSET(flags),
@@ -223,13 +217,6 @@ static struct ctl_table homa_ctl_table[] = {
 		.mode		= 0644,
 		.proc_handler	= homa_sysctl_softirq_cores
 	},
-	{
-		.procname	= "grant_fifo_fraction",
-		.data		= OFFSET(grant_fifo_fraction),
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= homa_dointvec
-	},
 	{
 		.procname	= "gro_busy_usecs",
 		.data		= OFFSET(gro_busy_usecs),
@@ -265,13 +252,6 @@ static struct ctl_table homa_ctl_table[] = {
 		.mode		= 0644,
 		.proc_handler	= homa_dointvec
 	},
-	{
-		.procname	= "max_grantable_rpcs",
-		.data		= OFFSET(max_grantable_rpcs),
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= homa_dointvec
-	},
 	{
 		.procname	= "max_gro_skbs",
 		.data		= OFFSET(max_gro_skbs),
@@ -286,27 +266,6 @@ static struct ctl_table homa_ctl_table[] = {
 		.mode		= 0644,
 		.proc_handler	= homa_dointvec
 	},
-	{
-		.procname	= "max_incoming",
-		.data		= OFFSET(max_incoming),
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= homa_dointvec
-	},
-	{
-		.procname	= "max_overcommit",
-		.data		= OFFSET(max_overcommit),
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= homa_dointvec
-	},
-	{
-		.procname	= "max_rpcs_per_peer",
-		.data		= OFFSET(max_rpcs_per_peer),
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= homa_dointvec
-	},
 	{
 		.procname	= "max_sched_prio",
 		.data		= OFFSET(max_sched_prio),
@@ -426,13 +385,6 @@ static struct ctl_table homa_ctl_table[] = {
 		.mode		= 0644,
 		.proc_handler	= homa_dointvec
 	},
-	{
-		.procname	= "window",
-		.data		= OFFSET(window_param),
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= homa_dointvec
-	},
 	{
 		.procname	= "wmem_max",
 		.data		= OFFSET(wmem_max),
@@ -1709,7 +1661,7 @@ int homa_dointvec(const struct ctl_table *table, int write,
 			tt_freeze();
 		} else if (homa->sysctl_action == 8) {
 			pr_notice("homa_total_incoming is %d\n",
-				  atomic_read(&homa->total_incoming));
+				  atomic_read(&homa->grant->total_incoming));
 		} else if (homa->sysctl_action == 9) {
 			tt_print_file("/users/ouster/node.tt");
 		} else {
diff --git a/homa_pool.c b/homa_pool.c
index 73e3398e..d6e5927b 100644
--- a/homa_pool.c
+++ b/homa_pool.c
@@ -275,6 +275,7 @@ int homa_pool_get_pages(struct homa_pool *pool, int num_pages, u32 *pages,
  * returned.
  */
 int homa_pool_allocate(struct homa_rpc *rpc)
+	__must_hold(&rpc->bucket->lock)
 {
 	struct homa_pool *pool = rpc->hsk->buffer_pool;
 	int full_pages, partial, i, core_id;
@@ -510,11 +511,10 @@ void homa_pool_check_waiting(struct homa_pool *pool)
 		if (rpc->msgin.num_bpages > 0) {
 			/* Allocation succeeded; "wake up" the RPC.
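 			 * (Editorial note, an inference rather than patch text:
 			 * homa_grant_init_rpc is re-invoked below with
 			 * unsched == 0, presumably because the RPC's packets were
 			 * discarded while it waited for buffer space, so nothing
 			 * can be treated as already granted and resend_all asks
 			 * the sender to retransmit from offset 0.)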
*/ rpc->msgin.resend_all = 1; + homa_grant_init_rpc(rpc, 0); homa_grant_check_rpc(rpc); } - homa_rpc_unlock(rpc); -#else /* See strip.py */ - homa_rpc_unlock(rpc); #endif /* See strip.py */ + homa_rpc_unlock(rpc); } } diff --git a/homa_rpc.c b/homa_rpc.c index 909b9a2c..6324210d 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -274,7 +274,8 @@ void homa_rpc_end(struct homa_rpc *rpc) * necessary because homa_grant_end_rpc releases the RPC lock and * reacquires it. */ - homa_grant_end_rpc(rpc); + if (rpc->msgin.length >= 0) + homa_grant_end_rpc(rpc); #endif /* See strip.py */ /* Unlink from all lists, so no-one will ever find this RPC again. */ @@ -627,12 +628,9 @@ void homa_rpc_log_tt(struct homa_rpc *rpc) rpc->id, tt_addr(rpc->peer->addr), received, rpc->msgin.length); #ifndef __STRIP__ - tt_record4("RPC id %d has incoming %d, granted %d, prio %d", rpc->id, - rpc->msgin.granted - received, - rpc->msgin.granted, rpc->msgin.priority); - rank = atomic_read(&rpc->msgin.rank); - if (rpc->hsk->homa->active_rpcs[rank] != rpc) - rank = -1; + tt_record3("RPC id %d has incoming %d, granted %d", rpc->id, + rpc->msgin.granted - received, rpc->msgin.granted); + rank = rpc->msgin.rank; #else /* __STRIP__ */ rank = -1; #endif /* __STRIP__ */ @@ -676,10 +674,7 @@ void homa_rpc_log_active_tt(struct homa *homa, int freeze_count) struct homa_rpc *rpc; int count = 0; -#ifndef __STRIP__ - homa_grant_log_tt(homa); -#endif /* __STRIP__ */ - tt_record("Logging active Homa RPCs:"); + tt_record("Logging Homa RPCs:"); rcu_read_lock(); for (hsk = homa_socktab_start_scan(homa->port_map, &scan); hsk; hsk = homa_socktab_next(&scan)) { @@ -735,7 +730,7 @@ int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors) int actual; tt_record1("homa_validate_incoming starting, total_incoming %d", - atomic_read(&homa->total_incoming)); + atomic_read(&homa->grant->total_incoming)); *link_errors = 0; rcu_read_lock(); for (hsk = homa_socktab_start_scan(homa->port_map, &scan); @@ -779,7 +774,7 @@ int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors) } homa_socktab_end_scan(&scan); rcu_read_unlock(); - actual = atomic_read(&homa->total_incoming); + actual = atomic_read(&homa->grant->total_incoming); tt_record3("homa_validate_incoming diff %d (expected %d, got %d)", actual - total_incoming, total_incoming, actual); return actual - total_incoming; diff --git a/homa_rpc.h b/homa_rpc.h index 52fd1444..dcf08138 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -150,55 +150,56 @@ struct homa_message_in { */ int bytes_remaining; -#ifndef __STRIP__ /* See strip.py */ /** - * @granted: Total # of bytes (starting from offset 0) that the sender - * may transmit without additional grants, includes unscheduled bytes. - * Never larger than @length. Note: once initialized, this - * may not be modified without holding @homa->grant_lock. + * @num_bpages: The number of entries in @bpage_offsets used for this + * message (0 means buffers not allocated yet). */ - int granted; + u32 num_bpages; /** - * @rec_incoming: Number of bytes in homa->total_incoming currently - * contributed ("recorded") from this RPC. + * @bpage_offsets: Describes buffer space allocated for this message. + * Each entry is an offset from the start of the buffer region. + * All but the last pointer refer to areas of size HOMA_BPAGE_SIZE. 
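+	 * (Editorial example, not from the patch: with 64-KB bpages, as in
+	 * the unit tests' HOMA_BPAGE_SIZE, a 150000-byte message would use
+	 * three entries: two full bpages plus a final area holding the
+	 * remaining 18928 bytes.)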
 	 */
-	int rec_incoming;
+	u32 bpage_offsets[HOMA_MAX_BPAGES];
 
+#ifndef __STRIP__ /* See strip.py */
 	/**
-	 * @rank: A hint: if homa->active_rpcs[@rank] refers to this RPC then
-	 * the RPC is active and this value indicates the RPC's priority (lower
-	 * is better). Read without synchronization, so must be atomic.
+	 * @rank: Position of this RPC in homa->grant->active_rpcs, or -1
+	 * if not in homa->grant->active_rpcs. Managed by homa_grant.c.
 	 */
-	atomic_t rank;
-
-	/** @priority: Priority level to include in future GRANTS. */
-	int priority;
-#endif /* See strip.py */
+	int rank;
 
-	/** @resend_all: if nonzero, set resend_all in the next grant packet. */
-	__u8 resend_all;
+	/**
+	 * @granted: Total # of bytes (starting from offset 0) that the sender
+	 * will transmit without additional grants, including unscheduled bytes.
+	 * Never larger than @length. Managed by homa_grant.c.
+	 */
+	int granted;
 
-#ifndef __STRIP__ /* See strip.py */
 	/**
-	 * @birth: sched_clock() time when this RPC was added to the grantable
-	 * list. Invalid if RPC isn't in the grantable list.
+	 * @prev_grant: Offset in the last GRANT packet sent for this RPC
+	 * (initially set to unscheduled bytes).
 	 */
-	u64 birth;
-#endif /* See strip.py */
+	int prev_grant;
 
 	/**
-	 * @num_bpages: The number of entries in @bpage_offsets used for this
-	 * message (0 means buffers not allocated yet).
+	 * @rec_incoming: Number of bytes in homa->grant->total_incoming
+	 * currently contributed ("recorded") from this RPC. Managed by
+	 * homa_grant.c.
 	 */
-	u32 num_bpages;
+	int rec_incoming;
 
 	/**
-	 * @bpage_offsets: Describes buffer space allocated for this message.
-	 * Each entry is an offset from the start of the buffer region.
-	 * All but the last pointer refer to areas of size HOMA_BPAGE_SIZE.
+	 * @birth: sched_clock() time when homa_grant_manage_rpc was invoked
+	 * for this RPC. Managed by homa_grant.c. Only set if the RPC needs
+	 * grants.
 	 */
-	u32 bpage_offsets[HOMA_MAX_BPAGES];
+	u64 birth;
+
+	/** @resend_all: if nonzero, set resend_all in the next grant packet. */
+	__u8 resend_all;
+#endif /* See strip.py */
 };
 
 /**
diff --git a/homa_sock.c b/homa_sock.c
index 5689122b..24a49001 100644
--- a/homa_sock.c
+++ b/homa_sock.c
@@ -8,6 +8,10 @@
 #include "homa_peer.h"
 #include "homa_pool.h"
 
+#ifndef __STRIP__ /* See strip.py */
+#include "homa_grant.h"
+#endif /* See strip.py */
+
 #ifdef __UNIT_TEST__
 #define KSELFTEST_NOT_MAIN 1
 #include "test/kselftest_harness.h"
diff --git a/homa_timer.c b/homa_timer.c
index 6f128beb..16521201 100644
--- a/homa_timer.c
+++ b/homa_timer.c
@@ -8,6 +8,7 @@
 #include "homa_peer.h"
 #include "homa_rpc.h"
 #ifndef __STRIP__ /* See strip.py */
+#include "homa_grant.h"
 #include "homa_skb.h"
 #endif /* See strip.py */
 
@@ -24,6 +25,7 @@
  * @rpc: RPC to check; must be locked by the caller.
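  * (Editorial note: the __must_hold(&rpc->bucket->lock) annotation added
  * below states this precondition in a form that sparse can check
  * statically, flagging callers that do not hold the bucket lock.)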
*/ void homa_check_rpc(struct homa_rpc *rpc) + __must_hold(&rpc->bucket->lock) { struct homa *homa = rpc->hsk->homa; struct homa_resend_hdr resend; @@ -202,12 +204,12 @@ void homa_timer(struct homa *homa) } tt_record4("homa_timer found total_incoming %d, num_grantable_rpcs %d, num_active_rpcs %d, new grants %d", - atomic_read(&homa->total_incoming), - homa->num_grantable_rpcs, - homa->num_active_rpcs, + atomic_read(&homa->grant->total_incoming), + homa->grant->num_grantable_rpcs, + homa->grant->num_active_rpcs, total_grants - prev_grant_count); if (total_grants == prev_grant_count && - homa->num_grantable_rpcs > 20) { + homa->grant->num_grantable_rpcs > 20) { zero_count++; if (zero_count > 3 && !atomic_read(&tt_frozen) && 0) { pr_err("%s found no grants going out\n", __func__); @@ -283,7 +285,7 @@ void homa_timer(struct homa *homa) #ifndef __STRIP__ /* See strip.py */ tt_record4("homa_timer found %d incoming RPCs, incoming sum %d, rec_sum %d, homa->total_incoming %d", total_incoming_rpcs, sum_incoming, sum_incoming_rec, - atomic_read(&homa->total_incoming)); + atomic_read(&homa->grant->total_incoming)); #endif /* See strip.py */ homa_skb_release_pages(homa); #ifndef __STRIP__ /* See strip.py */ diff --git a/homa_utils.c b/homa_utils.c index 5cde1262..e5ce2814 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -5,6 +5,7 @@ */ #include "homa_impl.h" +#include "homa_grant.h" #include "homa_pacer.h" #include "homa_peer.h" #include "homa_rpc.h" @@ -38,9 +39,12 @@ int homa_init(struct homa *homa, struct net *net) memset(homa, 0, sizeof(*homa)); atomic64_set(&homa->next_outgoing_id, 2); #ifndef __STRIP__ /* See strip.py */ - spin_lock_init(&homa->grant_lock); - INIT_LIST_HEAD(&homa->grantable_peers); - homa->last_grantable_change = sched_clock(); + homa->grant = homa_grant_new(net); + if (IS_ERR(homa->grant)) { + err = PTR_ERR(homa->grant); + homa->grant = NULL; + return err; + } #endif /* See strip.py */ homa->pacer = homa_pacer_new(homa, net); if (IS_ERR(homa->pacer)) { @@ -79,7 +83,6 @@ int homa_init(struct homa *homa, struct net *net) /* Wild guesses to initialize configuration values... */ #ifndef __STRIP__ /* See strip.py */ homa->unsched_bytes = 40000; - homa->window_param = 100000; homa->poll_usecs = 50; homa->num_priorities = HOMA_MAX_PRIORITIES; for (i = 0; i < HOMA_MAX_PRIORITIES; i++) @@ -97,11 +100,6 @@ int homa_init(struct homa *homa, struct net *net) #else homa->cutoff_version = 1; #endif - homa->fifo_grant_increment = 10000; - homa->grant_fifo_fraction = 50; - homa->max_overcommit = 8; - homa->max_incoming = 400000; - homa->max_rpcs_per_peer = 1; #endif /* See strip.py */ homa->resend_ticks = 5; homa->resend_interval = 5; @@ -145,6 +143,10 @@ void homa_destroy(struct homa *homa) kfree(homa->port_map); homa->port_map = NULL; } + if (homa->grant) { + homa_grant_destroy(homa->grant); + homa->grant = NULL; + } if (homa->pacer) { homa_pacer_destroy(homa->pacer); homa->pacer = NULL; diff --git a/man/homa.7 b/man/homa.7 index acc4fe96..b090dd73 100644 --- a/man/homa.7 +++ b/man/homa.7 @@ -358,6 +358,17 @@ of the bandwidth is for FIFO and 90% for SRPT). As of October 2020, a small value can provide significant benefits for the largest messages under very high loads, but for most loads its effect is negligible. .TP +.IR grant_recalc_usecs +How frequently (in microseconds) to scan the RPCs currently receiving grants +to see if the priority order is still correct. 
The order can become incorrect
+if enough data arrives for a low-priority RPC so that it now has fewer bytes
+left to grant than other RPCs that currently have higher priority.
+Validating the order requires the global grant lock, so checking every time data
+arrives would risk severe lock contention. Instead, the order is only checked
+every
+.I grant_recalc_usecs
+microseconds.
+.TP
 .I gro_busy_usecs
 An integer value used to determine whether or not to perform some
 optimizations specified by
diff --git a/notes.txt b/notes.txt
index 46760930..9080cfe0 100755
--- a/notes.txt
+++ b/notes.txt
@@ -1,14 +1,11 @@
 Notes for Homa implementation in Linux:
 ---------------------------------------
-* Grant changes:
-  * Make sure active_rpcs[0] is NULL if no active RPCs.
-  * Potential race: active_rpcs vs. active_remaining
-  * Duplicated hit_limit code in check_rpc
-  * Why call update_incoming again in try_send?
-  * Hold RPC lock when reading rec_incoming (no longer atomic)
-  * Eliminate homa_grant_try_send
-  * No increment in new "sending grant" tt_record.
+* Notes for the next design of grants:
+  * Update tthoma.py: e.g., no grant_recalc records
+  * grant_check_slow_path too high
+  * Too much time in NAPI?
+  * Too many BUSY packets (more than DATA)
 
 * Failure modes:
   * homa_grant_add_rpc: list has a loop, or encounter a null list link
@@ -427,62 +424,3 @@ Notes for Homa implementation in Linux:
 	ip_input.c:     ip_rcv_finish
 	ip_input.c:     dst_input
 	homa_plumbing.c: homa_softirq
-
-#ifndef __STRIP__ /* See strip.py */
-#endif /* See strip.py */
-Call Trace:^M
-[281667.695467] ^M
-[281667.695601] ? show_regs+0x64/0x70^M
-[281667.696414] ? __die+0x24/0x70^M
-[281667.696992] ? page_fault_oops+0x21c/0x730 fault.c:716
-[281667.697224] ? __pfx_page_fault_oops+0x10/0x10^M
-[281667.697857] ? bpf_ksym_find+0xcb/0xe0 kernel/bpf/core.c:740?
-[281667.698072] ? __pfx_is_prefetch.isra.0+0x10/0x10^M
-[281667.698801] ? search_bpf_extables+0xb5/0xd0 kernel/bpf/core.c:794
-[281667.699448] ? search_exception_tables+0x60/0x70^M kernel/extable.c:62
-[281667.700086] ? fixup_exception+0x3b/0x400 mm/extable.c:320
-[281667.700309] ? sched_clock+0x10/0x30^M
-[281667.700526] ? kernelmode_fixup_or_oops.isra.0+0x6b/0x80 fault.c:733?
-[281667.700829] ? __bad_area_nosemaphore+0x1e6/0x340 fault.c:790
-[281667.701445] ? spurious_kernel_fault_check+0x46/0xb0^M
-[281667.701724] ? bad_area_nosemaphore+0x16/0x20^M fault.c:839
-[281667.702368] ? do_kern_addr_fault+0x95/0xb0 fault.c:1203
-[281667.703008] ? exc_page_fault+0xdd/0xe0^M
-[281667.703231] ? asm_exc_page_fault+0x27/0x30^M
-[281667.703874] ? __mod_memcg_lruvec_state+0x188/0x300^M
-[281667.704197] ? __mod_memcg_lruvec_state+0x19e/0x300^M
-[281667.704469] folio_batch_move_lru+0xc9/0x240 swap.c:168
-[281667.705820] lru_add_drain_cpu+0xf3/0x190 swap.c:616
-[281667.706021] lru_add_drain+0x24/0x60 swap.c:698
-[281667.706214] zap_page_range_single+0xa8/0x340 memory.c:1938
-[281667.706846] ? __pfx_zap_page_range_single+0x10/0x10^M
-[281667.807198] ? userfaultfd_remove+0x8e/0x210 fs/userfaultd.c:777?
-[281667.807853] ? __pfx_userfaultfd_remove+0x10/0x10^M
-[281667.808526] ? __pfx_find_vma_prev+0x10/0x10 mmap.c:98x?
-[281668.309000] do_madvise.part.0+0x1db8/0x1f90^M
-[281668.309630] ? ____sys_recvmsg+0x15f/0x380 socket.c:2803
-[281668.309905] ? __pfx_do_madvise.part.0+0x10/0x10^M
-[281668.310514] ? update_min_vruntime+0x11b/0x130^M
-[281668.311116] ? __update_load_avg_cfs_rq+0x78/0x5f0^M
-[281668.311722] ? __pfx___resched_curr+0x10/0x10^M
-[281668.312349] ? 
___sys_recvmsg+0xe0/0x150 socket.c:2845 -[281668.312557] ? __pfx____sys_recvmsg+0x10/0x10^M -[281668.313169] ? __kasan_check_read+0x11/0x20^M -[281668.313791] ? psi_group_change+0x2e9/0x4a0^M -[281668.314416] ? __set_next_task_fair.part.0+0x28/0x310^M -[281668.314711] ? __kasan_check_write+0x14/0x20^M -[281668.315326] ? recalc_sigpending+0xa7/0xf0^M -[281668.315529] ? preempt_count_sub+0x18/0xc0^M -[281668.315774] ? _raw_spin_unlock_irq+0x1f/0x40^M -[281668.316381] ? sigprocmask+0x129/0x1c0^M -[281668.316584] ? __pfx_sigprocmask+0x10/0x10^M -[281668.316812] ? __kasan_check_write+0x14/0x20^M -[281668.317411] ? __x64_sys_rt_sigprocmask+0x105/0x190^M -[281668.317680] ? __pfx___x64_sys_rt_sigprocmask+0x10/0x10^M -[281668.317978] __x64_sys_madvise+0x9a/0xb0^M -[281668.318189] ? __x64_sys_madvise+0x9a/0xb0^M -[281668.318393] x64_sys_call+0x1f34/0x20b0^M -[281668.318603] do_syscall_64+0x4b/0x110^M - -xmit DATA 1400@0; xmit DATA 1400@1400; xmit DATA 1400@2800; time 1600; time 2200; xmit DATA 800@4200; removing id 1234 from throttled list; time 3200; xmit DATA 1400@0; time 4400; xmit DATA 1400@1400; time 5600; time 6200; xmit DATA 1400@2800; time 7400; xmit DATA 1400@4200; time 8600; time 9200; xmit DATA 1400@5600; time 10400; time 11000; xmit DATA 1400@7000; time 12200; xmit DATA 1400@8400; time 13400; time 14000; xmit DATA 200@9800; removing id 1236 from throttled list -xmit DATA 1400@0; xmit DATA 1400@1400; xmit DATA 1400@2800; xmit DATA 800@4200; removing id 1234 from throttled list; time 1400; time 2000; xmit DATA 1400@0; time 3200; time 3800; xmit DATA 1400@1400; time 5000; xmit DATA 1400@2800; time 6200; time 6800; xmit DATA 1400@4200; time 8000; xmit DATA 1400@5600; time 9200; time 9800; xmit DATA 1400@7000; time 11000; time 11600; xmit DATA 1400@8400; xmit DATA 200@9800; removing id 1236 from throttled list \ No newline at end of file diff --git a/test/mock.c b/test/mock.c index 97dc4d2d..89847026 100644 --- a/test/mock.c +++ b/test/mock.c @@ -94,6 +94,11 @@ int mock_bpage_size = 0x10000; /* HOMA_BPAGE_SHIFT will evaluate to this. */ int mock_bpage_shift = 16; +/* Keeps track of all the spinlocks that have been locked but not unlocked. + * Reset for each test. + */ +static struct unit_hash *spinlocks_held; + /* Keeps track of all the blocks of memory that have been allocated by * kmalloc but not yet freed by kfree. Reset for each test. */ @@ -130,11 +135,6 @@ static struct unit_hash *vmallocs_in_use; */ static int mock_active_locks; -/* The number of spin locksthat have been acquired but not yet released. - * Should be 0 at the end of each test. - */ -static int mock_active_spin_locks; - /* Total number of successful spinlock acquisitions during current test. 
*/ int mock_total_spin_locks; @@ -874,7 +874,8 @@ void *mock_kmalloc(size_t size, gfp_t flags) UNIT_HOOK("kmalloc"); if (mock_check_error(&mock_kmalloc_errors)) return NULL; - if (mock_active_spin_locks > 0 && (flags & ~__GFP_ZERO) != GFP_ATOMIC) + if (unit_hash_size(spinlocks_held) > 0 && + (flags & ~__GFP_ZERO) != GFP_ATOMIC) FAIL(" Incorrect flags 0x%x passed to mock_kmalloc; expected GFP_ATOMIC (0x%x)", flags, GFP_ATOMIC); block = malloc(size); @@ -1090,21 +1091,21 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) void _raw_spin_lock(raw_spinlock_t *lock) { - mock_active_spin_locks++; + mock_record_locked(lock); mock_total_spin_locks++; } void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) { UNIT_HOOK("spin_lock"); - mock_active_spin_locks++; + mock_record_locked(lock); mock_total_spin_locks++; } void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) { UNIT_HOOK("spin_lock"); - mock_active_spin_locks++; + mock_record_locked(lock); mock_total_spin_locks++; } @@ -1117,7 +1118,7 @@ int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) UNIT_HOOK("spin_lock"); if (mock_check_error(&mock_trylock_errors)) return 0; - mock_active_spin_locks++; + mock_record_locked(lock); mock_total_spin_locks++; return 1; } @@ -1125,12 +1126,12 @@ int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) { UNIT_HOOK("unlock"); - mock_active_spin_locks--; + mock_record_unlocked(lock); } void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) { - mock_active_spin_locks--; + mock_record_unlocked(lock); } int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) @@ -1138,7 +1139,7 @@ int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) UNIT_HOOK("spin_lock"); if (mock_check_error(&mock_spin_lock_held)) return 0; - mock_active_spin_locks++; + mock_record_locked(lock); return 1; } @@ -1399,7 +1400,9 @@ void tasklet_kill(struct tasklet_struct *t) {} void unregister_net_sysctl_table(struct ctl_table_header *header) -{} +{ + UNIT_LOG("; ", "unregister_net_sysctl_table"); +} void unregister_pernet_subsys(struct pernet_operations *) {} @@ -1662,6 +1665,25 @@ void mock_rcu_read_unlock(void) mock_active_rcu_locks--; } +void mock_record_locked(void *lock) +{ + if (!spinlocks_held) + spinlocks_held = unit_hash_new(); + if (unit_hash_get(spinlocks_held, lock) != NULL) + FAIL(" locking lock 0x%p when already locked", lock); + else + unit_hash_set(spinlocks_held, lock, "locked"); +} + +void mock_record_unlocked(void *lock) +{ + if (!spinlocks_held || unit_hash_get(spinlocks_held, lock) == NULL) { + FAIL(" unlocking lock 0x%p that isn't locked", lock); + return; + } + unit_hash_erase(spinlocks_held, lock); +} + /** * mock_register_net_sysctl() - Called instead of register_net_sysctl * when Homa is compiled for unit testing. 
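Editorial aside, not part of the patch: with the per-lock unit_hash tracking
above, the mocked spinlock primitives can now report errors that the old
global counter missed. A minimal sketch of what a buggy test would now
trigger, using only the failure messages defined in this diff:

	spinlock_t a, b;

	spin_lock_bh(&a);
	spin_lock_bh(&a);	/* FAIL: "locking lock ... when already locked" */
	spin_unlock_bh(&b);	/* FAIL: "unlocking lock ... that isn't locked" */

A lock still held when the test finishes is caught by mock_teardown
("... spinlocks still held after test").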
@@ -1931,7 +1953,7 @@ int mock_sock_init(struct homa_sock *hsk, struct homa *homa, int port) void mock_spin_unlock(spinlock_t *lock) { UNIT_HOOK("unlock"); - mock_active_spin_locks--; + mock_record_unlocked(lock); } /** @@ -2006,6 +2028,12 @@ void mock_teardown(void) unit_hash_free(skbs_in_use); skbs_in_use = NULL; + count = unit_hash_size(spinlocks_held); + if (count > 0) + FAIL(" %u spinlocks still held after test", count); + unit_hash_free(spinlocks_held); + spinlocks_held = NULL; + count = unit_hash_size(kmallocs_in_use); if (count > 0) FAIL(" %u kmalloced block(s) still allocated after test", count); @@ -2040,11 +2068,6 @@ void mock_teardown(void) FAIL(" %d (non-spin) locks still locked after test", mock_active_locks); mock_active_locks = 0; - - if (mock_active_spin_locks != 0) - FAIL(" %d spin locks still locked after test", - mock_active_spin_locks); - mock_active_spin_locks = 0; mock_total_spin_locks = 0; if (mock_active_rcu_locks != 0) diff --git a/test/mock.h b/test/mock.h index efa5ab18..776d36c0 100644 --- a/test/mock.h +++ b/test/mock.h @@ -179,6 +179,8 @@ int mock_processor_id(void); void mock_put_page(struct page *page); void mock_rcu_read_lock(void); void mock_rcu_read_unlock(void); +void mock_record_locked(void *lock); +void mock_record_unlocked(void *lock); struct ctl_table_header * mock_register_net_sysctl(struct net *net, const char *path, diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 9af4d0ea..68a4a2c8 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -3,6 +3,8 @@ #include "homa_impl.h" #include "homa_grant.h" #include "homa_pacer.h" +#include "homa_peer.h" +#include "homa_pool.h" #include "homa_rpc.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" @@ -26,25 +28,13 @@ char *rpc_ids(struct homa_rpc **rpcs, int count) return buffer; } -static struct homa *hook_homa; +static int hook_spinlock_count; static void grant_spinlock_hook(char *id) { if (strcmp(id, "spin_lock") != 0) return; - if (hook_homa != NULL) - atomic_inc(&hook_homa->grant_recalc_count); mock_ns = 1000; -} - -static struct homa_rpc *hook_rpc; -static void remove_rpc_hook(char *id) -{ - if (strcmp(id, "spin_lock") != 0) - return; - if (hook_rpc != NULL) { - homa_grant_remove_rpc(hook_rpc); - hook_rpc = NULL; - } + hook_spinlock_count++; } FIXTURE(homa_grant) { @@ -59,6 +49,7 @@ FIXTURE(homa_grant) { struct homa_sock hsk; struct homa_data_hdr data; int incoming_delta; + struct homa_grant_candidates cand; }; FIXTURE_SETUP(homa_grant) { @@ -82,11 +73,10 @@ FIXTURE_SETUP(homa_grant) self->homa.poll_usecs = 0; self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; self->homa.pacer->fifo_fraction = 0; - self->homa.grant_fifo_fraction = 0; - self->homa.window_param = 10000; - self->homa.grant_window = 10000; - self->homa.max_incoming = 50000; - self->homa.max_rpcs_per_peer = 10; + self->homa.grant->fifo_fraction = 0; + self->homa.grant->window = 10000; + self->homa.grant->max_incoming = 50000; + self->homa.grant->max_rpcs_per_peer = 10; mock_sock_init(&self->hsk, &self->homa, 0); self->server_addr.in6.sin6_family = self->hsk.inet.sk.sk_family; self->server_addr.in6.sin6_addr = self->server_ip[0]; @@ -102,13 +92,18 @@ FIXTURE_SETUP(homa_grant) self->data.incoming = htonl(10000); unit_log_clear(); self->incoming_delta = 0; + homa_grant_cand_init(&self->cand); } FIXTURE_TEARDOWN(homa_grant) { + homa_grant_cand_check(&self->cand, self->homa.grant); homa_destroy(&self->homa); unit_teardown(); } +/* Create a client RPC whose msgin is mostly initialized, except + * 
homa_grant_init_rpc isn't invoked. + */ static struct homa_rpc *test_rpc(FIXTURE_DATA(homa_grant) *self, u64 id, struct in6_addr *server_ip, int size) { @@ -116,362 +111,788 @@ static struct homa_rpc *test_rpc(FIXTURE_DATA(homa_grant) *self, self->client_ip, server_ip, self->server_port, id, 1000, size); + rpc->msgin.length = size; + skb_queue_head_init(&rpc->msgin.packets); + INIT_LIST_HEAD(&rpc->msgin.gaps); + rpc->msgin.bytes_remaining = size; + rpc->msgin.rank = -1; + rpc->msgin.granted = 1000; + return rpc; +} + +/* Create a client RPC whose msgin is properly initialized with no + * unscheduled bytes and no packets received. + */ +static struct homa_rpc *test_rpc_init(FIXTURE_DATA(homa_grant) *self, + u64 id, struct in6_addr *server_ip, int size) +{ + struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, + self->client_ip, server_ip, self->server_port, + id, 1000, size); homa_message_in_init(rpc, size, 0); - homa_grant_add_rpc(rpc); return rpc; } -TEST_F(homa_grant, homa_grant_outranks) +TEST_F(homa_grant, homa_grant_new__success) { - struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->server_port, - 100, 1000, 20000); - struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->server_port, - 102, 1000, 30000); - struct homa_rpc *crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->server_port, - 104, 1000, 30000); - struct homa_rpc *crpc4 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->server_port, - 106, 1000, 30000); + struct homa_grant *grant; - homa_message_in_init(crpc1, 20000, 0); - crpc1->msgin.birth = 3000; - homa_message_in_init(crpc2, 30000, 0); - crpc2->msgin.birth = 2000; - homa_message_in_init(crpc3, 30000, 0); - crpc3->msgin.birth = 1999; - homa_message_in_init(crpc4, 30000, 0); - crpc4->msgin.birth = 2000; + grant = homa_grant_new(&mock_net); + EXPECT_EQ(50, grant->fifo_fraction); + homa_grant_destroy(grant); +} +TEST_F(homa_grant, homa_grant_new__cant_allocate_memory) +{ + struct homa_grant *grant; - EXPECT_EQ(1, homa_grant_outranks(crpc1, crpc2)); - EXPECT_EQ(0, homa_grant_outranks(crpc2, crpc1)); - EXPECT_EQ(0, homa_grant_outranks(crpc2, crpc3)); - EXPECT_EQ(1, homa_grant_outranks(crpc3, crpc2)); - EXPECT_EQ(0, homa_grant_outranks(crpc2, crpc4)); - EXPECT_EQ(0, homa_grant_outranks(crpc4, crpc2)); + mock_kmalloc_errors = 1; + grant = homa_grant_new(&mock_net); + EXPECT_TRUE(IS_ERR(grant)); + EXPECT_EQ(ENOMEM, -PTR_ERR(grant)); } +TEST_F(homa_grant, homa_grant_new__cant_register_sysctls) +{ + struct homa_grant *grant; -TEST_F(homa_grant, homa_grant_update_incoming) + mock_register_sysctl_errors = 1; + grant = homa_grant_new(&mock_net); + EXPECT_TRUE(IS_ERR(grant)); + EXPECT_EQ(ENOMEM, -PTR_ERR(grant)); +} + +TEST_F(homa_grant, homa_grant_destroy__basics) +{ + struct homa_grant *grant; + + grant = homa_grant_new(&mock_net); + homa_grant_destroy(grant); + EXPECT_STREQ("unregister_net_sysctl_table", unit_log_get()); +} +TEST_F(homa_grant, homa_grant_destroy__sysctls_not_registered) +{ + struct homa_grant *grant; + + grant = homa_grant_new(&mock_net); + grant->sysctl_header = NULL; + homa_grant_destroy(grant); + EXPECT_STREQ("", unit_log_get()); +} + +TEST_F(homa_grant, homa_grant_init_rpc__no_bpages_available) { struct homa_rpc *rpc; - rpc = test_rpc(self, 200, self->server_ip, 20000); + rpc= unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, 
self->server_port, 100, 1000, + 20000); - /* Case 1: total_incoming increases. */ - atomic_set(&self->homa.total_incoming, 1000); - rpc->msgin.bytes_remaining = 19000; - rpc->msgin.granted = 3000; - rpc->msgin.rec_incoming = 500; - homa_grant_update_incoming(rpc, &self->homa); - EXPECT_EQ(2500, atomic_read(&self->homa.total_incoming)); - EXPECT_EQ(2000, rpc->msgin.rec_incoming); + atomic_set(&self->hsk.buffer_pool->free_bpages, 0); + homa_message_in_init(rpc, 20000, 10000); + EXPECT_EQ(0, rpc->msgin.num_bpages); + EXPECT_EQ(-1, rpc->msgin.rank); + EXPECT_EQ(0, rpc->msgin.granted); +} +TEST_F(homa_grant, homa_grant_init_rpc__grants_not_needed) +{ + struct homa_rpc *rpc; - /* Case 2: incoming negative. */ - atomic_set(&self->homa.total_incoming, 1000); - rpc->msgin.bytes_remaining = 16000; - rpc->msgin.granted = 3000; - rpc->msgin.rec_incoming = 500; - homa_grant_update_incoming(rpc, &self->homa); - EXPECT_EQ(500, atomic_read(&self->homa.total_incoming)); + rpc= unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, 100, 1000, + 20000); + homa_message_in_init(rpc, 2000, 2000); + EXPECT_EQ(-1, rpc->msgin.rank); + EXPECT_EQ(2000, rpc->msgin.granted); +} +TEST_F(homa_grant, homa_grant_init_rpc__grants_needed) +{ + struct homa_rpc *rpc; + + rpc= unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, 100, 1000, + 20000); + + homa_message_in_init(rpc, 5000, 2000); + EXPECT_EQ(0, rpc->msgin.rank); + EXPECT_EQ(2000, rpc->msgin.granted); +} + +TEST_F(homa_grant, homa_grant_end_rpc__basics) +{ + struct homa_rpc *rpc; + + rpc = test_rpc_init(self, 100, self->server_ip, 20000); + rpc->msgin.rec_incoming = 100; + EXPECT_EQ(0, rpc->msgin.rank); + + unit_hook_register(grant_spinlock_hook); + hook_spinlock_count = 0; + + /* First call: RPC is managed. */ + homa_grant_end_rpc(rpc); + EXPECT_EQ(-1, rpc->msgin.rank); + EXPECT_EQ(1, hook_spinlock_count); + EXPECT_EQ(-100, atomic_read(&self->homa.grant->total_incoming)); EXPECT_EQ(0, rpc->msgin.rec_incoming); - /* Case 3: no change to rec_incoming. */ - atomic_set(&self->homa.total_incoming, 1000); - self->homa.max_incoming = 1000; - rpc->msgin.bytes_remaining = 16000; - rpc->msgin.granted = 4500; - rpc->msgin.rec_incoming = 500; - homa_grant_update_incoming(rpc, &self->homa); - EXPECT_EQ(1000, atomic_read(&self->homa.total_incoming)); - EXPECT_EQ(500, rpc->msgin.rec_incoming); + /* Second call: RPC not managed, nothing to do. 
*/ + hook_spinlock_count = 0; + homa_grant_end_rpc(rpc); + EXPECT_EQ(0, hook_spinlock_count); } +TEST_F(homa_grant, homa_grant_end_rpc__call_cand_check) +{ + struct homa_rpc *rpc1, *rpc2; + + self->homa.grant->max_rpcs_per_peer = 1; + rpc1 = test_rpc_init(self, 100, self->server_ip, 20000); + rpc2 = test_rpc_init(self, 102, self->server_ip, 30000); + EXPECT_EQ(0, rpc1->msgin.rank); + EXPECT_EQ(-1, rpc2->msgin.rank); -TEST_F(homa_grant, homa_grant_add_rpc__update_metrics) + unit_hook_register(grant_spinlock_hook); + hook_spinlock_count = 0; + + unit_log_clear(); + homa_rpc_lock(rpc1); + homa_grant_end_rpc(rpc1); + homa_rpc_unlock(rpc1); + EXPECT_EQ(-1, rpc1->msgin.rank); + EXPECT_EQ(0, rpc2->msgin.rank); + EXPECT_EQ(4, hook_spinlock_count); + EXPECT_STREQ("xmit GRANT 10000@0", unit_log_get()); +} + +TEST_F(homa_grant, homa_grant_window) { - self->homa.last_grantable_change = 100; - self->homa.num_grantable_rpcs = 3; - mock_ns = 200; - test_rpc(self, 100, self->server_ip, 100000); - EXPECT_EQ(4, self->homa.num_grantable_rpcs); - EXPECT_EQ(300, homa_metrics_per_cpu()->grantable_rpcs_integral); - EXPECT_EQ(200, self->homa.last_grantable_change); + /* Static grant window. */ + self->homa.grant->window_param = 5000; + EXPECT_EQ(5000, homa_grant_window(self->homa.grant)); + + /* Dynamic grant window. */ + self->homa.grant->window_param = 0; + self->homa.grant->max_incoming = 100000; + self->homa.grant->num_active_rpcs = 4; + EXPECT_EQ(20000, homa_grant_window(self->homa.grant)); +} + +TEST_F(homa_grant, homa_grant_outranks) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; + + rpc1 = test_rpc(self, 100, self->server_ip, 20000); + rpc1->msgin.birth = 3000; + rpc2 = test_rpc(self, 102, self->server_ip, 30000); + rpc2->msgin.birth = 2000; + rpc3 = test_rpc(self, 104, self->server_ip, 30000); + rpc3->msgin.birth = 1999; + rpc4 = test_rpc(self, 106, self->server_ip, 30000); + rpc4->msgin.birth = 2000; + + EXPECT_EQ(1, homa_grant_outranks(rpc1, rpc2)); + EXPECT_EQ(0, homa_grant_outranks(rpc2, rpc1)); + EXPECT_EQ(0, homa_grant_outranks(rpc2, rpc3)); + EXPECT_EQ(1, homa_grant_outranks(rpc3, rpc2)); + EXPECT_EQ(0, homa_grant_outranks(rpc2, rpc4)); + EXPECT_EQ(0, homa_grant_outranks(rpc4, rpc2)); +} + +TEST_F(homa_grant, homa_grant_priority__no_extra_levels) +{ + self->homa.max_sched_prio = 6; + self->homa.grant->num_active_rpcs = 7; + EXPECT_EQ(6, homa_grant_priority(&self->homa, 0)); + EXPECT_EQ(0, homa_grant_priority(&self->homa, 7)); +} +TEST_F(homa_grant, homa_grant_priority__extra_levels) +{ + self->homa.max_sched_prio = 6; + self->homa.grant->num_active_rpcs = 4; + EXPECT_EQ(3, homa_grant_priority(&self->homa, 0)); + EXPECT_EQ(0, homa_grant_priority(&self->homa, 7)); +} + +TEST_F(homa_grant, homa_grant_insert_active__basics) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3; + + rpc1 = test_rpc(self, 100, self->server_ip, 30000); + rpc2 = test_rpc(self, 102, self->server_ip, 20000); + rpc3 = test_rpc(self, 104, self->server_ip, 30000); + + EXPECT_EQ(NULL, homa_grant_insert_active(rpc1)); + EXPECT_EQ(0, rpc1->msgin.rank); + + EXPECT_EQ(NULL, homa_grant_insert_active(rpc2)); + EXPECT_EQ(1, rpc1->msgin.rank); + EXPECT_EQ(0, rpc2->msgin.rank); + + EXPECT_EQ(NULL, homa_grant_insert_active(rpc3)); + EXPECT_EQ(1, rpc1->msgin.rank); + EXPECT_EQ(0, rpc2->msgin.rank); + EXPECT_EQ(2, rpc3->msgin.rank); + EXPECT_EQ(3, rpc1->peer->active_rpcs); +} +TEST_F(homa_grant, homa_grant_insert_active__too_many_from_same_peer) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; + + rpc1 = test_rpc(self, 100, self->server_ip, 10000); + 
rpc2 = test_rpc(self, 102, self->server_ip, 20000); + rpc3 = test_rpc(self, 104, &self->server_ip[1], 30000); + rpc4 = test_rpc(self, 106, self->server_ip, 25000); + + self->homa.grant->max_rpcs_per_peer = 2; + EXPECT_EQ(NULL, homa_grant_insert_active(rpc1)); + EXPECT_EQ(NULL, homa_grant_insert_active(rpc2)); + EXPECT_EQ(NULL, homa_grant_insert_active(rpc3)); + EXPECT_EQ(rpc4, homa_grant_insert_active(rpc4)); + EXPECT_EQ(0, rpc1->msgin.rank); + EXPECT_EQ(1, rpc2->msgin.rank); + EXPECT_EQ(2, rpc3->msgin.rank); + EXPECT_EQ(-1, rpc4->msgin.rank); + EXPECT_EQ(2, rpc1->peer->active_rpcs); +} +TEST_F(homa_grant, homa_grant_insert_active__bump_rpc_from_same_peer) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; + + rpc1 = test_rpc(self, 100, self->server_ip, 10000); + rpc2 = test_rpc(self, 102, &self->server_ip[1], 20000); + rpc3 = test_rpc(self, 104, self->server_ip, 30000); + rpc4 = test_rpc(self, 106, self->server_ip, 5000); + + self->homa.grant->max_rpcs_per_peer = 2; + EXPECT_EQ(NULL, homa_grant_insert_active(rpc1)); + EXPECT_EQ(NULL, homa_grant_insert_active(rpc2)); + EXPECT_EQ(NULL, homa_grant_insert_active(rpc3)); + EXPECT_EQ(rpc3, homa_grant_insert_active(rpc4)); + EXPECT_EQ(1, rpc1->msgin.rank); + EXPECT_EQ(2, rpc2->msgin.rank); + EXPECT_EQ(-1, rpc3->msgin.rank); + EXPECT_EQ(0, rpc4->msgin.rank); + EXPECT_EQ(2, rpc1->peer->active_rpcs); + EXPECT_EQ(3, self->homa.grant->num_active_rpcs); +} +TEST_F(homa_grant, homa_grant_insert_active__no_room_for_new_rpc) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; + + rpc1 = test_rpc(self, 100, self->server_ip, 10000); + rpc2 = test_rpc(self, 102, self->server_ip, 20000); + rpc3 = test_rpc(self, 104, self->server_ip, 30000); + rpc4 = test_rpc(self, 106, self->server_ip, 30000); + + self->homa.grant->max_overcommit = 3; + EXPECT_EQ(NULL, homa_grant_insert_active(rpc1)); + EXPECT_EQ(NULL, homa_grant_insert_active(rpc2)); + EXPECT_EQ(NULL, homa_grant_insert_active(rpc3)); + EXPECT_EQ(rpc4, homa_grant_insert_active(rpc4)); + EXPECT_EQ(0, rpc1->msgin.rank); + EXPECT_EQ(1, rpc2->msgin.rank); + EXPECT_EQ(2, rpc3->msgin.rank); + EXPECT_EQ(-1, rpc4->msgin.rank); + EXPECT_EQ(3, self->homa.grant->num_active_rpcs); +} +TEST_F(homa_grant, homa_grant_insert_active__insert_in_middle_and_bump) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; + + rpc1 = test_rpc(self, 100, self->server_ip, 10000); + rpc2 = test_rpc(self, 102, self->server_ip, 20000); + rpc3 = test_rpc(self, 104, self->server_ip, 30000); + rpc4 = test_rpc(self, 106, self->server_ip, 15000); + + self->homa.grant->max_overcommit = 3; + EXPECT_EQ(NULL, homa_grant_insert_active(rpc1)); + EXPECT_EQ(NULL, homa_grant_insert_active(rpc2)); + EXPECT_EQ(NULL, homa_grant_insert_active(rpc3)); + EXPECT_EQ(rpc3, homa_grant_insert_active(rpc4)); + EXPECT_EQ(0, rpc1->msgin.rank); + EXPECT_EQ(2, rpc2->msgin.rank); + EXPECT_EQ(-1, rpc3->msgin.rank); + EXPECT_EQ(1, rpc4->msgin.rank); + EXPECT_EQ(3, self->homa.grant->num_active_rpcs); + EXPECT_EQ(rpc4, self->homa.grant->active_rpcs[1]); + EXPECT_EQ(rpc2, self->homa.grant->active_rpcs[2]); + EXPECT_EQ(3, rpc1->peer->active_rpcs); } -TEST_F(homa_grant, homa_grant_add_rpc__insert_in_peer_list) +TEST_F(homa_grant, homa_grant_insert_active__insert_in_middle_no_bump) { - test_rpc(self, 100, self->server_ip, 100000); - test_rpc(self, 200, self->server_ip, 50000); - test_rpc(self, 300, self->server_ip, 120000); - test_rpc(self, 400, self->server_ip, 70000); + struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; + rpc1 = test_rpc(self, 100, self->server_ip, 10000); + rpc2 = 
test_rpc(self, 102, self->server_ip, 20000); + rpc3 = test_rpc(self, 104, self->server_ip, 30000); + rpc4 = test_rpc(self, 106, self->server_ip, 15000); + + EXPECT_EQ(NULL, homa_grant_insert_active(rpc1)); + EXPECT_EQ(NULL, homa_grant_insert_active(rpc2)); + EXPECT_EQ(NULL, homa_grant_insert_active(rpc3)); + EXPECT_EQ(NULL, homa_grant_insert_active(rpc4)); + EXPECT_EQ(0, rpc1->msgin.rank); + EXPECT_EQ(2, rpc2->msgin.rank); + EXPECT_EQ(3, rpc3->msgin.rank); + EXPECT_EQ(1, rpc4->msgin.rank); + EXPECT_EQ(4, self->homa.grant->num_active_rpcs); + EXPECT_EQ(rpc4, self->homa.grant->active_rpcs[1]); + EXPECT_EQ(rpc2, self->homa.grant->active_rpcs[2]); + EXPECT_EQ(rpc3, self->homa.grant->active_rpcs[3]); + EXPECT_EQ(4, rpc1->peer->active_rpcs); +} + +TEST_F(homa_grant, homa_grant_insert_grantable__insert_in_peer_list) +{ + homa_grant_insert_grantable(test_rpc(self, 100, self->server_ip, + 100000)); + homa_grant_insert_grantable(test_rpc(self, 200, self->server_ip, + 50000)); + homa_grant_insert_grantable(test_rpc(self, 300, self->server_ip, + 1200000)); + homa_grant_insert_grantable(test_rpc(self, 400, self->server_ip, + 70000)); unit_log_clear(); unit_log_grantables(&self->homa); - EXPECT_STREQ("response from 1.2.3.4, id 200, remaining 50000; " - "response from 1.2.3.4, id 400, remaining 70000; " - "response from 1.2.3.4, id 100, remaining 100000; " - "response from 1.2.3.4, id 300, remaining 120000", - unit_log_get()); - EXPECT_EQ(4, self->homa.num_grantable_rpcs); + EXPECT_STREQ("peer 1.2.3.4: id 200 ungranted 49000 " + "id 400 ungranted 69000 " + "id 100 ungranted 99000 " + "id 300 ungranted 1199000", + unit_log_get()); } -TEST_F(homa_grant, homa_grant_add_rpc__adjust_order_in_peer_list) +TEST_F(homa_grant, homa_grant_insert_grantable__insert_peer_in_grantable_peers) { - struct homa_rpc *rpc3; - - test_rpc(self, 200, self->server_ip, 20000); - test_rpc(self, 300, self->server_ip, 30000); - rpc3 = test_rpc(self, 400, self->server_ip, 40000); - test_rpc(self, 500, self->server_ip, 50000); + homa_grant_insert_grantable(test_rpc(self, 200, self->server_ip, + 100000)); + homa_grant_insert_grantable(test_rpc(self, 300, self->server_ip+1, + 50000)); + homa_grant_insert_grantable(test_rpc(self, 400, self->server_ip+2, + 120000)); + homa_grant_insert_grantable(test_rpc(self, 500, self->server_ip+3, + 70000)); unit_log_clear(); unit_log_grantables(&self->homa); - EXPECT_STREQ("response from 1.2.3.4, id 200, remaining 20000; " - "response from 1.2.3.4, id 300, remaining 30000; " - "response from 1.2.3.4, id 400, remaining 40000; " - "response from 1.2.3.4, id 500, remaining 50000", - unit_log_get()); - - rpc3->msgin.bytes_remaining = 30000; - homa_grant_add_rpc(rpc3); + EXPECT_STREQ("peer 2.2.3.4: id 300 ungranted 49000; " + "peer 4.2.3.4: id 500 ungranted 69000; " + "peer 1.2.3.4: id 200 ungranted 99000; " + "peer 3.2.3.4: id 400 ungranted 119000", + unit_log_get()); +} +TEST_F(homa_grant, homa_grant_insert_grantable__move_peer_in_grantable_peers) +{ + homa_grant_insert_grantable(test_rpc(self, 200, self->server_ip, + 20000)); + homa_grant_insert_grantable(test_rpc(self, 300, self->server_ip+1, + 30000)); + homa_grant_insert_grantable(test_rpc(self, 400, self->server_ip+2, + 40000)); + homa_grant_insert_grantable(test_rpc(self, 500, self->server_ip+3, + 50000)); + + /* This insertion moves the peer upwards in the list. 
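+	 * (With only 24000 ungranted bytes, id 600 becomes peer 4.2.3.4's
+	 * best entry, lifting that peer above 2.2.3.4's 29000.)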
*/ + homa_grant_insert_grantable(test_rpc(self, 600, self->server_ip+3, + 25000)); unit_log_clear(); unit_log_grantables(&self->homa); - EXPECT_STREQ("response from 1.2.3.4, id 200, remaining 20000; " - "response from 1.2.3.4, id 300, remaining 30000; " - "response from 1.2.3.4, id 400, remaining 30000; " - "response from 1.2.3.4, id 500, remaining 50000", - unit_log_get()); - - rpc3->msgin.bytes_remaining = 19999; - homa_grant_add_rpc(rpc3); + EXPECT_STREQ("peer 1.2.3.4: id 200 ungranted 19000; " + "peer 4.2.3.4: id 600 ungranted 24000 " + "id 500 ungranted 49000; " + "peer 2.2.3.4: id 300 ungranted 29000; " + "peer 3.2.3.4: id 400 ungranted 39000", + unit_log_get()); + + /* This insertion moves the peer to the front of the list. */ + homa_grant_insert_grantable(test_rpc(self, 700, self->server_ip+3, + 10000)); unit_log_clear(); unit_log_grantables(&self->homa); - EXPECT_STREQ("response from 1.2.3.4, id 400, remaining 19999; " - "response from 1.2.3.4, id 200, remaining 20000; " - "response from 1.2.3.4, id 300, remaining 30000; " - "response from 1.2.3.4, id 500, remaining 50000", - unit_log_get()); - EXPECT_EQ(4, self->homa.num_grantable_rpcs); + EXPECT_STREQ("peer 4.2.3.4: id 700 ungranted 9000 " + "id 600 ungranted 24000 " + "id 500 ungranted 49000; " + "peer 1.2.3.4: id 200 ungranted 19000; " + "peer 2.2.3.4: id 300 ungranted 29000; " + "peer 3.2.3.4: id 400 ungranted 39000", + unit_log_get()); } -TEST_F(homa_grant, homa_grant_add_rpc__insert_peer_in_homa_list) + +TEST_F(homa_grant, homa_grant_manage_rpc__update_metrics) { - test_rpc(self, 200, self->server_ip, 100000); - test_rpc(self, 300, self->server_ip+1, 50000); - test_rpc(self, 400, self->server_ip+2, 120000); - test_rpc(self, 500, self->server_ip+3, 70000); + self->homa.grant->last_grantable_change = 50; + self->homa.grant->num_grantable_rpcs = 3; + mock_ns = 200; + homa_grant_manage_rpc(test_rpc(self, 100, self->server_ip, 100000)); + EXPECT_EQ(4, self->homa.grant->num_grantable_rpcs); + EXPECT_EQ(450, homa_metrics_per_cpu()->grantable_rpcs_integral); + EXPECT_EQ(200, self->homa.grant->last_grantable_change); +} +TEST_F(homa_grant, homa_grant_manage_rpc__dont_change_max_grantable_rpcs) +{ + self->homa.grant->num_grantable_rpcs = 3; + self->homa.grant->max_grantable_rpcs = 5; + homa_grant_manage_rpc(test_rpc(self, 100, self->server_ip, 100000)); + EXPECT_EQ(4, self->homa.grant->num_grantable_rpcs); + EXPECT_EQ(5, self->homa.grant->max_grantable_rpcs); +} +TEST_F(homa_grant, homa_grant_manage_rpc__insert_and_bump_to_grantables) +{ + struct homa_rpc *rpc1, *rpc2; + rpc1 = test_rpc(self, 100, self->server_ip, 50000); + rpc2 = test_rpc(self, 102, self->server_ip, 20000); + + self->homa.grant->max_overcommit = 1; + self->homa.grant->last_grantable_change = 50; + self->homa.grant->num_grantable_rpcs = 3; + mock_ns = 200; + homa_grant_manage_rpc(rpc1); + mock_ns = 300; + homa_grant_manage_rpc(rpc2); + EXPECT_EQ(5, self->homa.grant->max_grantable_rpcs); + EXPECT_EQ(850, homa_metrics_per_cpu()->grantable_rpcs_integral); + EXPECT_EQ(300, self->homa.grant->last_grantable_change); + EXPECT_EQ(-1, rpc1->msgin.rank); + EXPECT_EQ(200, rpc1->msgin.birth); + EXPECT_EQ(0, rpc2->msgin.rank); + EXPECT_EQ(300, rpc2->msgin.birth); unit_log_clear(); unit_log_grantables(&self->homa); - EXPECT_STREQ("response from 2.2.3.4, id 300, remaining 50000; " - "response from 4.2.3.4, id 500, remaining 70000; " - "response from 1.2.3.4, id 200, remaining 100000; " - "response from 3.2.3.4, id 400, remaining 120000", - unit_log_get()); - EXPECT_EQ(4, 
self->homa.num_grantable_rpcs); + EXPECT_STREQ("active[0]: id 102 ungranted 19000; " + "peer 1.2.3.4: id 100 ungranted 49000", unit_log_get()); } -TEST_F(homa_grant, homa_grant_add_rpc__move_peer_in_homa_list) +TEST_F(homa_grant, homa_grant_manage_rpc__set_window) { - struct homa_rpc *rpc3; - struct homa_rpc *rpc4; + struct homa_rpc *rpc1; - test_rpc(self, 200, self->server_ip, 20000); - test_rpc(self, 300, self->server_ip+1, 30000); - rpc3 = test_rpc(self, 400, self->server_ip+2, 40000); - rpc4 = test_rpc(self, 500, self->server_ip+3, 50000); + rpc1 = test_rpc(self, 100, self->server_ip, 50000); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("response from 1.2.3.4, id 200, remaining 20000; " - "response from 2.2.3.4, id 300, remaining 30000; " - "response from 3.2.3.4, id 400, remaining 40000; " - "response from 4.2.3.4, id 500, remaining 50000", - unit_log_get()); - - rpc3->msgin.bytes_remaining = 30000; - homa_grant_add_rpc(rpc3); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("response from 1.2.3.4, id 200, remaining 20000; " - "response from 2.2.3.4, id 300, remaining 30000; " - "response from 3.2.3.4, id 400, remaining 30000; " - "response from 4.2.3.4, id 500, remaining 50000", - unit_log_get()); - - rpc4->msgin.bytes_remaining = 19999; - homa_grant_add_rpc(rpc4); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("response from 4.2.3.4, id 500, remaining 19999; " - "response from 1.2.3.4, id 200, remaining 20000; " - "response from 2.2.3.4, id 300, remaining 30000; " - "response from 3.2.3.4, id 400, remaining 30000", - unit_log_get()); - EXPECT_EQ(4, self->homa.num_grantable_rpcs); + self->homa.grant->max_incoming = 100000; + self->homa.grant->window_param = 0; + homa_grant_manage_rpc(rpc1); + EXPECT_EQ(50000, self->homa.grant->window); } -TEST_F(homa_grant, homa_grant_remove_rpc__skip_if_not_linked) +TEST_F(homa_grant, homa_grant_remove_grantable__not_first_in_peer_list) { - struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->server_port, - 100, 1000, 2000); + struct homa_rpc *rpc = test_rpc(self, 300, self->server_ip, 30000); + homa_grant_insert_grantable(test_rpc(self, 200, self->server_ip, + 20000)); + homa_grant_insert_grantable(rpc); + homa_grant_insert_grantable(test_rpc(self, 400, self->server_ip+1, + 25000)); + + unit_log_clear(); unit_log_grantables(&self->homa); - EXPECT_EQ(0, self->homa.num_grantable_rpcs); + EXPECT_STREQ("peer 1.2.3.4: id 200 ungranted 19000 " + "id 300 ungranted 29000; " + "peer 2.2.3.4: id 400 ungranted 24000", + unit_log_get()); - homa_grant_remove_rpc(rpc); - EXPECT_EQ(0, self->homa.num_grantable_rpcs); + homa_grant_remove_grantable(rpc); + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("peer 1.2.3.4: id 200 ungranted 19000; " + "peer 2.2.3.4: id 400 ungranted 24000", + unit_log_get()); } -TEST_F(homa_grant, homa_grant_remove_rpc__race_in_checking_not_linked) +TEST_F(homa_grant, homa_grant_remove_grantable__only_entry_in_peer_list) { - struct homa_rpc *rpc = test_rpc(self, 200, self->server_ip, 20000); + struct homa_rpc *rpc = test_rpc(self, 200, self->server_ip, 30000); - EXPECT_EQ(1, self->homa.num_grantable_rpcs); + homa_grant_insert_grantable(rpc); + homa_grant_insert_grantable(test_rpc(self, 300, self->server_ip+1, + 40000)); + homa_grant_insert_grantable(test_rpc(self, 400, self->server_ip+2, + 20000)); - unit_hook_register(remove_rpc_hook); - hook_rpc = rpc; - homa_grant_remove_rpc(rpc); - 
EXPECT_EQ(0, self->homa.num_grantable_rpcs); + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("peer 3.2.3.4: id 400 ungranted 19000; " + "peer 1.2.3.4: id 200 ungranted 29000; " + "peer 2.2.3.4: id 300 ungranted 39000", + unit_log_get()); + + homa_grant_remove_grantable(rpc); + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("peer 3.2.3.4: id 400 ungranted 19000; " + "peer 2.2.3.4: id 300 ungranted 39000", + unit_log_get()); } -TEST_F(homa_grant, homa_grant_remove_rpc__clear_oldest_rpc) +TEST_F(homa_grant, homa_grant_remove_grantable__reposition_peer_in_grantable_peers) { struct homa_rpc *rpc1 = test_rpc(self, 200, self->server_ip, 20000); - struct homa_rpc *rpc2 = test_rpc(self, 300, self->server_ip, 10000); + struct homa_rpc *rpc2 = test_rpc(self, 202, self->server_ip, 35000); + + homa_grant_insert_grantable(rpc1); + homa_grant_insert_grantable(rpc2); + homa_grant_insert_grantable(test_rpc(self, 204, self->server_ip, + 60000)); + homa_grant_insert_grantable(test_rpc(self, 300, self->server_ip+1, + 30000)); + homa_grant_insert_grantable(test_rpc(self, 400, self->server_ip+2, + 40000)); + homa_grant_insert_grantable(test_rpc(self, 500, self->server_ip+3, + 50000)); - EXPECT_EQ(2, self->homa.num_grantable_rpcs); - self->homa.oldest_rpc = rpc2; + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("peer 1.2.3.4: id 200 ungranted 19000 " + "id 202 ungranted 34000 " + "id 204 ungranted 59000; " + "peer 2.2.3.4: id 300 ungranted 29000; " + "peer 3.2.3.4: id 400 ungranted 39000; " + "peer 4.2.3.4: id 500 ungranted 49000", + unit_log_get()); - homa_grant_remove_rpc(rpc1); - EXPECT_NE(NULL, self->homa.oldest_rpc); - EXPECT_EQ(300, self->homa.oldest_rpc->id); + /* First removal moves peer down, but not to end of list. */ + homa_grant_remove_grantable(rpc1); + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("peer 2.2.3.4: id 300 ungranted 29000; " + "peer 1.2.3.4: id 202 ungranted 34000 " + "id 204 ungranted 59000; " + "peer 3.2.3.4: id 400 ungranted 39000; " + "peer 4.2.3.4: id 500 ungranted 49000", + unit_log_get()); - homa_grant_remove_rpc(rpc2); - EXPECT_EQ(NULL, self->homa.oldest_rpc); + /* Second removal moves peer to end of list. 
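+	 * Removing id 202 leaves id 204 (59000 ungranted) as peer
+	 * 1.2.3.4's head; that is more than any other peer's head,
+	 * so the peer drops to the tail.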
*/ + homa_grant_remove_grantable(rpc2); + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("peer 2.2.3.4: id 300 ungranted 29000; " + "peer 3.2.3.4: id 400 ungranted 39000; " + "peer 4.2.3.4: id 500 ungranted 49000; " + "peer 1.2.3.4: id 204 ungranted 59000", + unit_log_get()); } -TEST_F(homa_grant, homa_grant_remove_rpc__update_metrics) + +TEST_F(homa_grant, homa_grant_remove_active__copy_existing_rpcs) { - struct homa_rpc *rpc = test_rpc(self, 200, self->server_ip, 20000); + struct homa_rpc *rpc; - EXPECT_EQ(1, self->homa.num_grantable_rpcs); - self->homa.last_grantable_change = 100; - self->homa.num_grantable_rpcs = 3; - mock_ns = 200; + homa_grant_manage_rpc(test_rpc(self, 200, self->server_ip, + 50000)); + homa_grant_manage_rpc(test_rpc(self, 300, self->server_ip, + 40000)); + homa_grant_manage_rpc(test_rpc(self, 400, self->server_ip, + 30000)); + homa_grant_manage_rpc(test_rpc(self, 500, self->server_ip, + 20000)); - homa_grant_remove_rpc(rpc); - EXPECT_EQ(2, self->homa.num_grantable_rpcs); - EXPECT_EQ(300, homa_metrics_per_cpu()->grantable_rpcs_integral); - EXPECT_EQ(200, self->homa.last_grantable_change); + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("active[0]: id 500 ungranted 19000; " + "active[1]: id 400 ungranted 29000; " + "active[2]: id 300 ungranted 39000; " + "active[3]: id 200 ungranted 49000", + unit_log_get()); + + rpc = self->homa.grant->active_rpcs[0]; + EXPECT_EQ(4, rpc->peer->active_rpcs); + + homa_grant_remove_active(rpc, &self->cand); + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("active[0]: id 400 ungranted 29000; " + "active[1]: id 300 ungranted 39000; " + "active[2]: id 200 ungranted 49000", + unit_log_get()); + EXPECT_EQ(-1, rpc->msgin.rank); + EXPECT_EQ(3, rpc->peer->active_rpcs); + EXPECT_TRUE(homa_grant_cand_empty(&self->cand)); } -TEST_F(homa_grant, homa_grant_remove_rpc__not_first_in_peer_list) +TEST_F(homa_grant, homa_grant_remove_active__promote_from_grantable) { - struct homa_rpc *rpc2; + struct homa_rpc *rpc; - test_rpc(self, 200, self->server_ip, 20000); - rpc2 = test_rpc(self, 300, self->server_ip, 30000); - test_rpc(self, 400, self->server_ip+1, 25000); + self->homa.grant->max_overcommit = 2; + homa_grant_manage_rpc(test_rpc(self, 200, self->server_ip, + 50000)); + homa_grant_manage_rpc(test_rpc(self, 300, self->server_ip, + 40000)); + homa_grant_manage_rpc(test_rpc(self, 400, self->server_ip, + 30000)); + homa_grant_manage_rpc(test_rpc(self, 500, self->server_ip, + 20000)); unit_log_clear(); unit_log_grantables(&self->homa); - EXPECT_STREQ("response from 1.2.3.4, id 200, remaining 20000; " - "response from 1.2.3.4, id 300, remaining 30000; " - "response from 2.2.3.4, id 400, remaining 25000", - unit_log_get()); + EXPECT_STREQ("active[0]: id 500 ungranted 19000; " + "active[1]: id 400 ungranted 29000; " + "peer 1.2.3.4: id 300 ungranted 39000 " + "id 200 ungranted 49000", + unit_log_get()); - homa_grant_remove_rpc(rpc2); + rpc = self->homa.grant->active_rpcs[1]; + EXPECT_EQ(2, rpc->peer->active_rpcs); + + homa_grant_remove_active(rpc, &self->cand); unit_log_clear(); unit_log_grantables(&self->homa); - EXPECT_STREQ("response from 1.2.3.4, id 200, remaining 20000; " - "response from 2.2.3.4, id 400, remaining 25000", - unit_log_get()); - EXPECT_EQ(2, self->homa.num_grantable_rpcs); + EXPECT_STREQ("active[0]: id 500 ungranted 19000; " + "active[1]: id 300 ungranted 39000; " + "peer 1.2.3.4: id 200 ungranted 49000", + unit_log_get()); + EXPECT_EQ(-1, rpc->msgin.rank); + EXPECT_EQ(2, 
rpc->peer->active_rpcs); + EXPECT_FALSE(homa_grant_cand_empty(&self->cand)); } -TEST_F(homa_grant, homa_grant_remove_rpc__only_entry_in_peer_list) +TEST_F(homa_grant, homa_grant_remove_active__skip_overactive_peer) { - struct homa_rpc *rpc1 = test_rpc(self, 200, self->server_ip, 30000); + struct homa_rpc *rpc; - test_rpc(self, 300, self->server_ip+1, 40000); - test_rpc(self, 400, self->server_ip+2, 20000); + self->homa.grant->max_overcommit = 2; + self->homa.grant->max_rpcs_per_peer = 1; + homa_grant_manage_rpc(test_rpc(self, 200, self->server_ip+1, + 50000)); + homa_grant_manage_rpc(test_rpc(self, 300, self->server_ip+1, + 40000)); + homa_grant_manage_rpc(test_rpc(self, 400, self->server_ip, + 30000)); + homa_grant_manage_rpc(test_rpc(self, 500, self->server_ip, + 20000)); unit_log_clear(); unit_log_grantables(&self->homa); - EXPECT_STREQ("response from 3.2.3.4, id 400, remaining 20000; " - "response from 1.2.3.4, id 200, remaining 30000; " - "response from 2.2.3.4, id 300, remaining 40000", - unit_log_get()); + EXPECT_STREQ("active[0]: id 500 ungranted 19000; " + "active[1]: id 300 ungranted 39000; " + "peer 1.2.3.4: id 400 ungranted 29000; " + "peer 2.2.3.4: id 200 ungranted 49000", + unit_log_get()); + + rpc = self->homa.grant->active_rpcs[1]; - homa_grant_remove_rpc(rpc1); + homa_grant_remove_active(rpc, &self->cand); unit_log_clear(); unit_log_grantables(&self->homa); - EXPECT_STREQ("response from 3.2.3.4, id 400, remaining 20000; " - "response from 2.2.3.4, id 300, remaining 40000", - unit_log_get()); - EXPECT_EQ(2, self->homa.num_grantable_rpcs); + EXPECT_STREQ("active[0]: id 500 ungranted 19000; " + "active[1]: id 200 ungranted 49000; " + "peer 1.2.3.4: id 400 ungranted 29000", + unit_log_get()); + EXPECT_FALSE(homa_grant_cand_empty(&self->cand)); } -TEST_F(homa_grant, homa_grant_remove_rpc__reposition_peer_in_homa_list) + +TEST_F(homa_grant, homa_grant_unmanage_rpc) { - struct homa_rpc *rpc1 = test_rpc(self, 200, self->server_ip, 20000); + struct homa_rpc *rpc; + + self->homa.grant->max_rpcs_per_peer = 1; + self->homa.grant->window_param = 0; + self->homa.grant->max_incoming = 60000; + homa_grant_manage_rpc(test_rpc(self, 100, self->server_ip, + 20000)); + rpc = test_rpc(self, 200, self->server_ip, 30000); + homa_grant_manage_rpc(rpc); + + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("active[0]: id 100 ungranted 19000; " + "peer 1.2.3.4: id 200 ungranted 29000", + unit_log_get()); + EXPECT_EQ(2, self->homa.grant->num_grantable_rpcs); + EXPECT_EQ(30000, self->homa.grant->window); - test_rpc(self, 300, self->server_ip, 50000); - test_rpc(self, 400, self->server_ip+1, 30000); - test_rpc(self, 500, self->server_ip+2, 40000); + self->homa.grant->last_grantable_change = 100; + mock_ns = 250; + homa_grant_unmanage_rpc(rpc, &self->cand); unit_log_clear(); unit_log_grantables(&self->homa); - EXPECT_STREQ("response from 1.2.3.4, id 200, remaining 20000; " - "response from 1.2.3.4, id 300, remaining 50000; " - "response from 2.2.3.4, id 400, remaining 30000; " - "response from 3.2.3.4, id 500, remaining 40000", - unit_log_get()); + EXPECT_STREQ("active[0]: id 100 ungranted 19000", unit_log_get()); + EXPECT_EQ(1, self->homa.grant->num_grantable_rpcs); + EXPECT_EQ(300, homa_metrics_per_cpu()->grantable_rpcs_integral); + EXPECT_EQ(250, self->homa.grant->last_grantable_change); + EXPECT_EQ(30000, self->homa.grant->window); - homa_grant_remove_rpc(rpc1); + homa_grant_unmanage_rpc(self->homa.grant->active_rpcs[0], &self->cand); unit_log_clear(); 
unit_log_grantables(&self->homa); - EXPECT_STREQ("response from 2.2.3.4, id 400, remaining 30000; " - "response from 3.2.3.4, id 500, remaining 40000; " - "response from 1.2.3.4, id 300, remaining 50000", - unit_log_get()); - EXPECT_EQ(3, self->homa.num_grantable_rpcs); + EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(0, self->homa.grant->num_grantable_rpcs); + EXPECT_EQ(60000, self->homa.grant->window); +} + +TEST_F(homa_grant, homa_grant_update_incoming) +{ + struct homa_rpc *rpc; + + rpc = test_rpc(self, 200, self->server_ip, 20000); + + /* Case 1: total_incoming increases. */ + atomic_set(&self->homa.grant->total_incoming, 1000); + rpc->msgin.bytes_remaining = 19000; + rpc->msgin.granted = 3000; + rpc->msgin.rec_incoming = 500; + homa_grant_update_incoming(rpc, self->homa.grant); + EXPECT_EQ(2500, atomic_read(&self->homa.grant->total_incoming)); + EXPECT_EQ(2000, rpc->msgin.rec_incoming); + + /* Case 2: incoming negative. */ + atomic_set(&self->homa.grant->total_incoming, 1000); + rpc->msgin.bytes_remaining = 16000; + rpc->msgin.granted = 3000; + rpc->msgin.rec_incoming = 500; + homa_grant_update_incoming(rpc, self->homa.grant); + EXPECT_EQ(500, atomic_read(&self->homa.grant->total_incoming)); + EXPECT_EQ(0, rpc->msgin.rec_incoming); + + /* Case 3: no change to rec_incoming. */ + atomic_set(&self->homa.grant->total_incoming, 1000); + self->homa.grant->max_incoming = 1000; + rpc->msgin.bytes_remaining = 16000; + rpc->msgin.granted = 4500; + rpc->msgin.rec_incoming = 500; + homa_grant_update_incoming(rpc, self->homa.grant); + EXPECT_EQ(1000, atomic_read(&self->homa.grant->total_incoming)); + EXPECT_EQ(500, rpc->msgin.rec_incoming); } -TEST_F(homa_grant, homa_grant_get_offset__basics) +TEST_F(homa_grant, homa_grant_update_granted__basics) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); - EXPECT_EQ(10000, homa_grant_get_offset(rpc, &self->homa)); - EXPECT_EQ(0, atomic_read(&self->homa.incoming_hit_limit)); + EXPECT_TRUE(homa_grant_update_granted(rpc, self->homa.grant)); + EXPECT_EQ(10000, rpc->msgin.granted); + EXPECT_EQ(0, atomic_read(&self->homa.grant->incoming_hit_limit)); } -TEST_F(homa_grant, homa_grant_get_offset__rpc_idle) +TEST_F(homa_grant, homa_grant_update_granted__rpc_idle) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); rpc->silent_ticks = 2; - EXPECT_EQ(0, homa_grant_get_offset(rpc, &self->homa)); + EXPECT_FALSE(homa_grant_update_granted(rpc, self->homa.grant)); + EXPECT_EQ(1000, rpc->msgin.granted); } -TEST_F(homa_grant, homa_grant_get_offset__end_of_message) +TEST_F(homa_grant, homa_grant_update_granted__end_of_message) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); /* First call grants remaining bytes in message. */ rpc->msgin.bytes_remaining = 5000; - EXPECT_EQ(20000, homa_grant_get_offset(rpc, &self->homa)); + EXPECT_TRUE(homa_grant_update_granted(rpc, self->homa.grant)); + EXPECT_EQ(20000, rpc->msgin.granted); /* Second call cannot grant anything additional. 
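	 * (the entire 20000-byte message has already been granted).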
*/ - EXPECT_EQ(20000, homa_grant_get_offset(rpc, &self->homa)); + EXPECT_FALSE(homa_grant_update_granted(rpc, self->homa.grant)); } -TEST_F(homa_grant, homa_grant_get_offset__insufficient_room_in_incoming) +TEST_F(homa_grant, homa_grant_update_granted__insufficient_room_in_incoming) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); rpc->msgin.bytes_remaining = 5000; - atomic_set(&rpc->msgin.rank, 5); - atomic_set(&self->homa.total_incoming, 48000); - EXPECT_EQ(17000, homa_grant_get_offset(rpc, &self->homa)); + rpc->msgin.rank = 5; + atomic_set(&self->homa.grant->total_incoming, 48000); + EXPECT_TRUE(homa_grant_update_granted(rpc, self->homa.grant)); + EXPECT_EQ(17000, rpc->msgin.granted); } -TEST_F(homa_grant, homa_grant_get_offset__incoming_overcommitted) +TEST_F(homa_grant, homa_grant_update_granted__incoming_overcommitted) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); - atomic_set(&self->homa.total_incoming, 51000); - EXPECT_EQ(-1000, homa_grant_get_offset(rpc, &self->homa)); - EXPECT_EQ(1, atomic_read(&self->homa.incoming_hit_limit)); + atomic_set(&self->homa.grant->total_incoming, 51000); + EXPECT_FALSE(homa_grant_update_granted(rpc, self->homa.grant)); + EXPECT_EQ(1000, rpc->msgin.granted); + EXPECT_EQ(1, atomic_read(&self->homa.grant->incoming_hit_limit)); } TEST_F(homa_grant, homa_grant_send__basics) @@ -480,10 +901,10 @@ TEST_F(homa_grant, homa_grant_send__basics) mock_xmit_log_verbose = 1; rpc->msgin.granted = 2600; - rpc->msgin.priority = 6; + rpc->msgin.rank = 2; unit_log_clear(); homa_grant_send(rpc); - EXPECT_SUBSTR("id 100, offset 2600, grant_prio 6", unit_log_get()); + EXPECT_SUBSTR("id 100, offset 2600, grant_prio 0", unit_log_get()); } TEST_F(homa_grant, homa_grant_send__resend_all) { @@ -491,11 +912,11 @@ TEST_F(homa_grant, homa_grant_send__resend_all) mock_xmit_log_verbose = 1; rpc->msgin.granted = 9999; - rpc->msgin.priority = 4; + rpc->msgin.rank = 0; rpc->msgin.resend_all = 1; unit_log_clear(); homa_grant_send(rpc); - EXPECT_SUBSTR("id 100, offset 9999, grant_prio 4, resend_all", + EXPECT_SUBSTR("id 100, offset 9999, grant_prio 0, resend_all", unit_log_get()); EXPECT_EQ(0, rpc->msgin.resend_all); } @@ -507,567 +928,205 @@ TEST_F(homa_grant, homa_grant_check_rpc__msgin_not_initialized) 100, 1000, 2000); rpc->msgin.bytes_remaining = 500; - rpc->msgin.granted = 2000; + rpc->msgin.granted = 1000; rpc->msgin.rec_incoming = 0; + unit_log_clear(); homa_grant_check_rpc(rpc); + EXPECT_STREQ("", unit_log_get()); EXPECT_EQ(0, rpc->msgin.rec_incoming); - EXPECT_EQ(0, atomic_read(&self->homa.total_incoming)); + EXPECT_EQ(0, atomic_read(&self->homa.grant->total_incoming)); EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_calls); } -TEST_F(homa_grant, homa_grant_check_rpc__rpc_dead) +TEST_F(homa_grant, homa_grant_check_rpc__rpc_not_active) { struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 100, 1000, 2000); - int old_state; homa_message_in_init(rpc, 2000, 0); - homa_grant_check_rpc(rpc); - EXPECT_EQ(2000, rpc->msgin.rec_incoming); - EXPECT_EQ(2000, atomic_read(&self->homa.total_incoming)); - - old_state = rpc->state; - rpc->state = RPC_DEAD; - rpc->msgin.bytes_remaining = 0; - homa_grant_check_rpc(rpc); - rpc->state = old_state; - EXPECT_EQ(2000, rpc->msgin.rec_incoming); - EXPECT_EQ(2000, atomic_read(&self->homa.total_incoming)); -} -TEST_F(homa_grant, homa_grant_check_rpc__message_fully_granted_no_recalc) -{ - struct homa_rpc *rpc; - - rpc = 
unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, - self->server_ip, self->server_port, 102, 1000, - 2000); - homa_message_in_init(rpc, 2000, 0); - rpc->msgin.granted = 2000; - rpc->msgin.bytes_remaining = 500; - + EXPECT_EQ(0, rpc->msgin.rank); + rpc->msgin.rank = -1; unit_log_clear(); homa_grant_check_rpc(rpc); EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_calls); } -TEST_F(homa_grant, homa_grant_check_rpc__message_fully_granted_must_recalc) -{ - struct homa_rpc *rpc1, *rpc2; - - /* First RPC is complete. */ - rpc1 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, - self->server_ip, self->server_port, 100, 1000, - 2000); - homa_message_in_init(rpc1, 2000, 0); - rpc1->msgin.granted = 2000; - rpc1->msgin.bytes_remaining = 0; - rpc1->msgin.rec_incoming = 1500; - - /* Second RPC will be waiting for incoming. */ - rpc2 = test_rpc(self, 100, self->server_ip, 5000); - - atomic_set(&self->homa.total_incoming, self->homa.max_incoming); - self->homa.max_overcommit = 1; - homa_grant_recalc(&self->homa); - EXPECT_EQ(1, atomic_read(&self->homa.incoming_hit_limit)); - - EXPECT_EQ(0, atomic_read(&rpc2->msgin.rank)); - EXPECT_EQ(0, rpc2->msgin.granted); - - // atomic_set(&self->homa.total_incoming, 0); - unit_log_clear(); - homa_grant_check_rpc(rpc1); - EXPECT_EQ(1500, rpc2->msgin.granted); - EXPECT_EQ(self->homa.max_incoming, - atomic_read(&self->homa.total_incoming)); - EXPECT_STREQ("homa_grant_recalc; xmit GRANT 1500@0", - unit_log_get()); -} -TEST_F(homa_grant, homa_grant_check_rpc__add_new_message_to_grantables) +TEST_F(homa_grant, homa_grant_check_rpc__fast_path) { struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 100, 1000, 20000); homa_message_in_init(rpc, 20000, 0); - rpc->msgin.bytes_remaining = 12000; + EXPECT_EQ(0, rpc->msgin.granted); + /* First call issues a grant. 
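+	 * The new 20000-byte message gets a 10000-byte grant at rank 0
+	 * without taking the slow path.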
*/ + unit_log_clear(); + homa_rpc_lock(rpc); homa_grant_check_rpc(rpc); - EXPECT_EQ(18000, rpc->msgin.granted); - EXPECT_EQ(10000, rpc->msgin.rec_incoming); - EXPECT_EQ(0, atomic_read(&rpc->msgin.rank)); - EXPECT_EQ(10000, atomic_read(&self->homa.total_incoming)); + EXPECT_STREQ("xmit GRANT 10000@0", unit_log_get()); EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_calls); -} -TEST_F(homa_grant, homa_grant_check_rpc__new_message_bumps_existing) -{ - struct homa_rpc *rpc1, *rpc2, *rpc3; + EXPECT_EQ(10000, rpc->msgin.granted); - rpc1 = test_rpc(self, 100, self->server_ip, 20000); - rpc2 = test_rpc(self, 102, self->server_ip, 30000); - self->homa.max_overcommit = 3; - homa_grant_recalc(&self->homa); - EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); - EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); - - rpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, - self->server_ip, self->server_port, 104, 1000, 25000); - homa_message_in_init(rpc3, 20000, 0); - homa_grant_check_rpc(rpc3); - EXPECT_EQ(10000, rpc3->msgin.granted); - EXPECT_EQ(10000, rpc3->msgin.rec_incoming); - EXPECT_EQ(1, atomic_read(&rpc3->msgin.rank)); - EXPECT_EQ(2, atomic_read(&rpc2->msgin.rank)); - EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); -} -TEST_F(homa_grant, homa_grant_check_rpc__new_message_cant_be_granted) -{ - struct homa_rpc *rpc1, *rpc2, *rpc3; - - rpc1 = test_rpc(self, 100, self->server_ip, 20000); - rpc2 = test_rpc(self, 102, self->server_ip, 30000); - self->homa.max_overcommit = 2; - homa_grant_recalc(&self->homa); - EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); - EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); - rpc2->msgin.bytes_remaining = 1000; - - rpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, - self->server_ip, self->server_port, 104, 1000, 30000); - homa_message_in_init(rpc3, 30000, 0); - homa_grant_check_rpc(rpc3); - EXPECT_EQ(0, rpc3->msgin.granted); - EXPECT_EQ(0, rpc3->msgin.rec_incoming); - EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); - EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); + /* Second call doesn't issue a grant (nothing has changed). 
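+	 * msgin.granted is still 10000, so no GRANT packet is sent and
+	 * grant_check_slow_path stays at zero.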
*/ + unit_log_clear(); + homa_grant_check_rpc(rpc); + homa_rpc_unlock(rpc); + EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(2, homa_metrics_per_cpu()->grant_check_calls); + EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_slow_path); + EXPECT_EQ(10000, rpc->msgin.granted); } -TEST_F(homa_grant, homa_grant_check_rpc__upgrade_priority_from_negative_rank) +TEST_F(homa_grant, homa_grant_check_rpc__fast_path_grants_to_end_of_message) { - struct homa_rpc *rpc1, *rpc2, *rpc3; - - rpc1 = test_rpc(self, 100, self->server_ip, 20000); - rpc2 = test_rpc(self, 102, self->server_ip, 30000); - rpc3 = test_rpc(self, 104, self->server_ip, 40000); - self->homa.max_overcommit = 2; - homa_grant_recalc(&self->homa); - EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); - EXPECT_EQ(rpc1, self->homa.active_rpcs[0]); - EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); - EXPECT_EQ(rpc2, self->homa.active_rpcs[1]); - EXPECT_EQ(0, rpc3->msgin.granted); + struct homa_rpc *rpc = test_rpc_init(self, 100, self->server_ip, 6000); - rpc3->msgin.bytes_remaining = 15000; - homa_grant_check_rpc(rpc3); - EXPECT_EQ(35000, rpc3->msgin.granted); - EXPECT_EQ(10000, rpc3->msgin.rec_incoming); - EXPECT_EQ(rpc1, self->homa.active_rpcs[1]); - EXPECT_EQ(rpc3, self->homa.active_rpcs[0]); -} -TEST_F(homa_grant, homa_grant_check_rpc__upgrade_priority_from_positive_rank) -{ - struct homa_rpc *rpc1, *rpc2, *rpc3; + EXPECT_EQ(1, self->homa.grant->num_grantable_rpcs); - rpc1 = test_rpc(self, 100, self->server_ip, 20000); - rpc2 = test_rpc(self, 102, self->server_ip, 30000); - rpc3 = test_rpc(self, 104, self->server_ip, 40000); - self->homa.max_overcommit = 4; - homa_grant_recalc(&self->homa); - EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); - EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); - EXPECT_EQ(2, atomic_read(&rpc3->msgin.rank)); - EXPECT_EQ(10000, rpc3->msgin.granted); - - rpc3->msgin.bytes_remaining = 25000; unit_log_clear(); - homa_grant_check_rpc(rpc3); - EXPECT_EQ(25000, rpc3->msgin.granted); - EXPECT_EQ(10000, rpc3->msgin.rec_incoming); - EXPECT_EQ(1, atomic_read(&rpc3->msgin.rank)); - EXPECT_EQ(2, atomic_read(&rpc2->msgin.rank)); - EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); - EXPECT_STREQ("homa_grant_recalc; xmit GRANT 25000@1", unit_log_get()); + homa_rpc_lock(rpc); + homa_grant_check_rpc(rpc); + homa_rpc_unlock(rpc); + EXPECT_STREQ("xmit GRANT 6000@0", unit_log_get()); + EXPECT_EQ(6000, rpc->msgin.granted); + EXPECT_EQ(-1, rpc->msgin.rank); + EXPECT_EQ(0, self->homa.grant->num_grantable_rpcs); + EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_slow_path); } -TEST_F(homa_grant, homa_grant_check_rpc__check_incoming_no_recalc) +TEST_F(homa_grant, homa_grant_check_rpc__fix_order) { struct homa_rpc *rpc1, *rpc2, *rpc3; - rpc1 = test_rpc(self, 100, self->server_ip, 20000); - rpc2 = test_rpc(self, 100, self->server_ip, 30000); - rpc3 = test_rpc(self, 100, self->server_ip, 40000); - atomic_set(&self->homa.total_incoming, self->homa.max_incoming); - homa_grant_recalc(&self->homa); - EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); - EXPECT_EQ(2, atomic_read(&rpc3->msgin.rank)); - - atomic_set(&self->homa.total_incoming, self->homa.max_incoming - 15000); - atomic_set(&self->homa.incoming_hit_limit, 0); + rpc1 = test_rpc_init(self, 100, self->server_ip, 20000); + rpc2 = test_rpc_init(self, 102, self->server_ip, 30000); + rpc3 = test_rpc_init(self, 104, self->server_ip, 40000); + EXPECT_EQ(2, rpc3->msgin.rank); + rpc3->msgin.granted = 25000; + rpc3->msgin.bytes_remaining = 15000; + atomic_set(&self->homa.grant->total_incoming, + 
self->homa.grant->max_incoming - 15000); + mock_ns = self->homa.grant->next_recalc; + unit_log_clear(); - homa_grant_check_rpc(rpc3); - EXPECT_EQ(0, rpc1->msgin.granted); + homa_rpc_lock(rpc2); + homa_grant_check_rpc(rpc2); + homa_rpc_unlock(rpc2); + EXPECT_STREQ("xmit GRANT 35000@2; xmit GRANT 5000@1", unit_log_get()); + EXPECT_EQ(5000, rpc1->msgin.granted); EXPECT_EQ(0, rpc2->msgin.granted); - EXPECT_EQ(10000, rpc3->msgin.granted); - EXPECT_STREQ("xmit GRANT 10000@0", unit_log_get()); + EXPECT_EQ(35000, rpc3->msgin.granted); + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("active[0]: id 104 ungranted 5000; " + "active[1]: id 100 ungranted 15000; " + "active[2]: id 102 ungranted 30000", unit_log_get()); + EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_slow_path); + EXPECT_EQ(40000, self->homa.grant->next_recalc); } -TEST_F(homa_grant, homa_grant_check_rpc__check_incoming_recalc) +TEST_F(homa_grant, homa_grant_check_rpc__incoming_hit_limit) { struct homa_rpc *rpc1, *rpc2, *rpc3; - rpc1 = test_rpc(self, 100, self->server_ip, 20000); - rpc2 = test_rpc(self, 100, self->server_ip, 30000); - rpc3 = test_rpc(self, 100, self->server_ip, 40000); - atomic_set(&self->homa.total_incoming, self->homa.max_incoming); - homa_grant_recalc(&self->homa); - EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); - EXPECT_EQ(2, atomic_read(&rpc3->msgin.rank)); - - atomic_set(&self->homa.total_incoming, self->homa.max_incoming - 15000); - atomic_set(&self->homa.incoming_hit_limit, 1); + rpc1 = test_rpc_init(self, 100, self->server_ip, 20000); + rpc2 = test_rpc_init(self, 102, self->server_ip, 30000); + rpc3 = test_rpc_init(self, 104, self->server_ip, 40000); + atomic_set(&self->homa.grant->total_incoming, + self->homa.grant->max_incoming - 15000); + atomic_set(&self->homa.grant->incoming_hit_limit, 1); + unit_log_clear(); - homa_grant_check_rpc(rpc3); + homa_rpc_lock(rpc1); + homa_grant_check_rpc(rpc1); + homa_rpc_unlock(rpc1); + EXPECT_STREQ("xmit GRANT 10000@2; xmit GRANT 5000@1", unit_log_get()); EXPECT_EQ(10000, rpc1->msgin.granted); EXPECT_EQ(5000, rpc2->msgin.granted); EXPECT_EQ(0, rpc3->msgin.granted); - EXPECT_STREQ("homa_grant_recalc; xmit GRANT 10000@2; xmit GRANT 5000@1", - unit_log_get()); -} -TEST_F(homa_grant, homa_grant_check_rpc__grant_to_self) -{ - struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; - - rpc1 = test_rpc(self, 100, self->server_ip, 20000); - rpc2 = test_rpc(self, 100, self->server_ip, 30000); - rpc3 = test_rpc(self, 100, self->server_ip, 40000); - rpc4 = test_rpc(self, 100, self->server_ip, 50000); - atomic_set(&self->homa.total_incoming, self->homa.max_incoming); - homa_grant_recalc(&self->homa); - EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); - EXPECT_EQ(3, atomic_read(&rpc4->msgin.rank)); - - atomic_set(&self->homa.total_incoming, self->homa.max_incoming - 20000); - atomic_set(&self->homa.incoming_hit_limit, 0); - unit_log_clear(); - homa_grant_check_rpc(rpc3); - EXPECT_EQ(0, rpc1->msgin.granted); - EXPECT_EQ(0, rpc2->msgin.granted); - EXPECT_EQ(10000, rpc3->msgin.granted); - EXPECT_EQ(0, rpc4->msgin.granted); - EXPECT_STREQ("xmit GRANT 10000@1", unit_log_get()); -} -TEST_F(homa_grant, homa_grant_check_rpc__grant_to_self_and_recalc) -{ - struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; - - rpc1 = test_rpc(self, 100, self->server_ip, 5000); - rpc2 = test_rpc(self, 100, self->server_ip, 6000); - rpc3 = test_rpc(self, 100, self->server_ip, 10000); - rpc4 = test_rpc(self, 100, self->server_ip, 20000); - atomic_set(&self->homa.total_incoming, self->homa.max_incoming); - 
self->homa.max_overcommit = 3; - homa_grant_recalc(&self->homa); - EXPECT_EQ(2, atomic_read(&rpc3->msgin.rank)); - - atomic_set(&self->homa.total_incoming, self->homa.max_incoming - 10000); - atomic_set(&self->homa.incoming_hit_limit, 0); unit_log_clear(); - homa_grant_check_rpc(rpc3); - EXPECT_EQ(0, rpc1->msgin.granted); - EXPECT_EQ(0, rpc2->msgin.granted); - EXPECT_EQ(10000, rpc3->msgin.granted); - EXPECT_EQ(0, rpc4->msgin.granted); - EXPECT_EQ(2, atomic_read(&rpc4->msgin.rank)); - EXPECT_TRUE(list_empty(&rpc3->grantable_links)); - EXPECT_STREQ("xmit GRANT 10000@0; homa_grant_recalc", unit_log_get()); -} - -TEST_F(homa_grant, homa_grant_recalc__basics) -{ - struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; - - rpc1 = test_rpc(self, 100, self->server_ip, 20000); - rpc2 = test_rpc(self, 102, self->server_ip, 30000); - rpc3 = test_rpc(self, 104, self->server_ip+1, 25000); - rpc4 = test_rpc(self, 106, self->server_ip+1, 35000); - self->homa.max_incoming = 100000; - self->homa.max_overcommit = 3; - atomic_set(&self->homa.incoming_hit_limit, 1); - mock_ns_tick = 10; - - unit_log_clear(); - homa_grant_recalc(&self->homa); - EXPECT_STREQ("homa_grant_recalc; xmit GRANT 10000@2; " - "xmit GRANT 10000@1; " - "xmit GRANT 10000@0", unit_log_get()); - EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); - EXPECT_EQ(2, rpc1->msgin.priority); - EXPECT_EQ(10000, rpc1->msgin.granted); - EXPECT_EQ(20000, atomic_read(&self->homa.active_remaining[0])); - EXPECT_EQ(1, atomic_read(&self->homa.grant_recalc_count)); - EXPECT_EQ(0, atomic_read(&self->homa.incoming_hit_limit)); - - EXPECT_EQ(1, atomic_read(&rpc3->msgin.rank)); - EXPECT_EQ(1, rpc3->msgin.priority); - EXPECT_EQ(10000, rpc3->msgin.granted); - EXPECT_EQ(30000, atomic_read(&self->homa.active_remaining[2])); - - EXPECT_EQ(2, atomic_read(&rpc2->msgin.rank)); - EXPECT_EQ(0, rpc4->msgin.granted); - EXPECT_NE(0, homa_metrics_per_cpu()->grant_recalc_ns); + unit_log_grantables(&self->homa); + EXPECT_STREQ("active[0]: id 100 ungranted 10000; " + "active[1]: id 102 ungranted 25000; " + "active[2]: id 104 ungranted 40000", unit_log_get()); + EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_slow_path); + EXPECT_EQ(20000, self->homa.grant->next_recalc); + EXPECT_EQ(1, atomic_read(&self->homa.grant->incoming_hit_limit)); } -TEST_F(homa_grant, homa_grant_recalc__cant_acquire_grantable_lock) +TEST_F(homa_grant, homa_grant_check_rpc__skip_rpc_with_too_much_incoming) { - struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); + struct homa_rpc *rpc2, *rpc3; - unit_hook_register(grant_spinlock_hook); - hook_homa = &self->homa; - mock_trylock_errors = 0xff; + test_rpc_init(self, 100, self->server_ip, 20000); + rpc2 = test_rpc_init(self, 102, self->server_ip, 30000); + rpc3 = test_rpc_init(self, 104, self->server_ip, 40000); + rpc2->msgin.rec_incoming = 10000; + atomic_set(&self->homa.grant->total_incoming, + self->homa.grant->max_incoming - 15000); + atomic_set(&self->homa.grant->incoming_hit_limit, 1); + homa_rpc_lock(rpc3); + homa_grant_check_rpc(rpc3); + homa_rpc_unlock(rpc3); unit_log_clear(); - homa_grant_recalc(&self->homa); - EXPECT_STREQ("homa_grant_recalc", unit_log_get()); - EXPECT_EQ(0, rpc->msgin.granted); - EXPECT_EQ(2, atomic_read(&self->homa.grant_recalc_count)); - EXPECT_EQ(1, homa_metrics_per_cpu()->grant_recalc_skips); + unit_log_grantables(&self->homa); + EXPECT_STREQ("active[0]: id 100 ungranted 10000; " + "active[1]: id 102 ungranted 30000; " + "active[2]: id 104 ungranted 35000", unit_log_get()); } -TEST_F(homa_grant, 
homa_grant_recalc__clear_unused_entries_in_active_rpcs) +TEST_F(homa_grant, homa_grant_check_rpc__skip_dead_rpc) { struct homa_rpc *rpc1, *rpc2, *rpc3; + int saved_state; - rpc1 = test_rpc(self, 100, self->server_ip, 20000); - rpc2 = test_rpc(self, 102, self->server_ip, 30000); - rpc3 = test_rpc(self, 104, self->server_ip, 25000); - self->homa.max_incoming = 100000; - self->homa.max_overcommit = 3; - - homa_grant_recalc(&self->homa); - EXPECT_EQ(rpc1, self->homa.active_rpcs[0]); - EXPECT_EQ(rpc3, self->homa.active_rpcs[1]); - EXPECT_EQ(rpc2, self->homa.active_rpcs[2]); - - homa_rpc_end(rpc1); - homa_rpc_end(rpc2); - homa_grant_recalc(&self->homa); - EXPECT_EQ(rpc3, self->homa.active_rpcs[0]); - EXPECT_EQ(NULL, self->homa.active_rpcs[1]); - EXPECT_EQ(NULL, self->homa.active_rpcs[2]); -} -TEST_F(homa_grant, homa_grant_recalc__use_only_lowest_priorities) -{ - struct homa_rpc *rpc1, *rpc2; - - rpc1 = test_rpc(self, 100, self->server_ip, 20000); - rpc2 = test_rpc(self, 102, self->server_ip, 30000); - self->homa.max_incoming = 100000; - self->homa.max_sched_prio = 5; + rpc1 = test_rpc_init(self, 100, self->server_ip, 20000); + rpc2 = test_rpc_init(self, 102, self->server_ip, 30000); + rpc3 = test_rpc_init(self, 104, self->server_ip, 40000); + saved_state = rpc2->state; + rpc2->state = RPC_DEAD; + atomic_set(&self->homa.grant->total_incoming, + self->homa.grant->max_incoming - 15000); + atomic_set(&self->homa.grant->incoming_hit_limit, 1); unit_log_clear(); - homa_grant_recalc(&self->homa); - EXPECT_STREQ("homa_grant_recalc; xmit GRANT 10000@1; xmit GRANT 10000@0", - unit_log_get()); - EXPECT_EQ(1, rpc1->msgin.priority); - EXPECT_EQ(0, rpc2->msgin.priority); -} -TEST_F(homa_grant, homa_grant_recalc__share_lowest_priority_level) -{ - struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; - - rpc1 = test_rpc(self, 100, self->server_ip, 20000); - rpc2 = test_rpc(self, 102, self->server_ip, 30000); - rpc3 = test_rpc(self, 100, self->server_ip, 40000); - rpc4 = test_rpc(self, 102, self->server_ip, 50000); - self->homa.max_incoming = 100000; - self->homa.max_sched_prio = 2; - - unit_log_clear(); - homa_grant_recalc(&self->homa); - EXPECT_STREQ("homa_grant_recalc; " - "xmit GRANT 10000@2; " - "xmit GRANT 10000@1; " - "xmit GRANT 10000@0; " - "xmit GRANT 10000@0", unit_log_get()); - EXPECT_EQ(2, rpc1->msgin.priority); - EXPECT_EQ(1, rpc2->msgin.priority); - EXPECT_EQ(0, rpc3->msgin.priority); - EXPECT_EQ(0, rpc4->msgin.priority); -} -TEST_F(homa_grant, homa_grant_recalc__compute_window_size) -{ - struct homa_rpc *rpc1, *rpc2, *rpc3; - - rpc1 = test_rpc(self, 100, self->server_ip, 30000); - rpc2 = test_rpc(self, 102, self->server_ip, 40000); - rpc3 = test_rpc(self, 100, self->server_ip, 50000); - self->homa.max_incoming = 100000; - - /* First try: fixed window size. */ - self->homa.window_param = 5000; - homa_grant_recalc(&self->homa); - EXPECT_EQ(5000, self->homa.grant_window); - EXPECT_EQ(5000, rpc1->msgin.granted); - EXPECT_EQ(5000, rpc2->msgin.granted); - EXPECT_EQ(5000, rpc3->msgin.granted); - - /* Second try: dynamic window size. 
*/ - self->homa.window_param = 0; - homa_grant_recalc(&self->homa); - EXPECT_EQ(25000, self->homa.grant_window); - EXPECT_EQ(25000, rpc1->msgin.granted); - EXPECT_EQ(25000, rpc2->msgin.granted); - EXPECT_EQ(25000, rpc3->msgin.granted); -} -TEST_F(homa_grant, homa_grant_recalc__rpc_cant_be_granted) -{ - struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; - - rpc1 = test_rpc(self, 100, self->server_ip, 10000); - rpc2 = test_rpc(self, 102, self->server_ip, 10000); - rpc3 = test_rpc(self, 104, self->server_ip, 10000); - rpc4 = test_rpc(self, 106, self->server_ip, 10000); - self->homa.window_param = 5000; - self->homa.max_overcommit = 3; - rpc2->silent_ticks = 3; - - homa_grant_recalc(&self->homa); - EXPECT_EQ(5000, rpc1->msgin.granted); + homa_rpc_lock(rpc3); + homa_grant_check_rpc(rpc3); + homa_rpc_unlock(rpc3); + EXPECT_STREQ("xmit GRANT 10000@2; xmit GRANT 5000@0", unit_log_get()); + EXPECT_EQ(10000, rpc1->msgin.granted); EXPECT_EQ(0, rpc2->msgin.granted); EXPECT_EQ(5000, rpc3->msgin.granted); - EXPECT_EQ(0, rpc4->msgin.granted); - EXPECT_EQ(0, homa_metrics_per_cpu()->grant_recalc_loops); -} -TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted_so_recalc) -{ - struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; - - rpc1 = test_rpc(self, 100, self->server_ip, 10000); - rpc2 = test_rpc(self, 102, self->server_ip, 10000); - rpc3 = test_rpc(self, 104, self->server_ip, 10000); - rpc4 = test_rpc(self, 106, self->server_ip, 10000); - self->homa.max_incoming = 32000; - self->homa.max_overcommit = 2; - - homa_grant_recalc(&self->homa); - EXPECT_EQ(10000, rpc1->msgin.granted); - EXPECT_EQ(10000, rpc2->msgin.granted); - EXPECT_EQ(10000, rpc3->msgin.granted); - EXPECT_EQ(2000, rpc4->msgin.granted); - EXPECT_TRUE(list_empty(&rpc2->grantable_links)); - EXPECT_TRUE(list_empty(&rpc3->grantable_links)); - EXPECT_EQ(2, homa_metrics_per_cpu()->grant_recalc_loops); -} -TEST_F(homa_grant, homa_grant_recalc__rpc_fully_granted_but_cant_get_lock) -{ - struct homa_rpc *rpc1, *rpc2, *rpc3, *rpc4; - - rpc1 = test_rpc(self, 100, self->server_ip, 10000); - rpc2 = test_rpc(self, 102, self->server_ip, 10000); - rpc3 = test_rpc(self, 104, self->server_ip, 10000); - rpc4 = test_rpc(self, 106, self->server_ip, 10000); - self->homa.max_incoming = 32000; - self->homa.max_overcommit = 2; - unit_hook_register(grant_spinlock_hook); - hook_homa = &self->homa; - mock_trylock_errors = 0xe0; - EXPECT_EQ(0, homa_metrics_per_cpu()->grant_recalc_skips); - - homa_grant_recalc(&self->homa); - EXPECT_EQ(10000, rpc1->msgin.granted); - EXPECT_EQ(10000, rpc2->msgin.granted); - EXPECT_EQ(0, rpc3->msgin.granted); - EXPECT_EQ(0, rpc4->msgin.granted); - EXPECT_EQ(1, homa_metrics_per_cpu()->grant_recalc_loops); - EXPECT_EQ(1, homa_metrics_per_cpu()->grant_recalc_skips); -} - -TEST_F(homa_grant, homa_grant_pick_rpcs__basics) -{ - struct homa_rpc *rpcs[4]; - int count; - - test_rpc(self, 200, self->server_ip, 20000); - test_rpc(self, 300, self->server_ip, 50000); - test_rpc(self, 400, self->server_ip+1, 30000); - test_rpc(self, 500, self->server_ip+2, 40000); - - self->homa.max_rpcs_per_peer = 2; - count = homa_grant_pick_rpcs(&self->homa, rpcs, 4); - EXPECT_EQ(4, count); - EXPECT_STREQ("200 400 500 300", rpc_ids(rpcs, count)); -} -TEST_F(homa_grant, homa_grant_pick_rpcs__new_rpc_goes_in_middle_of_list) -{ - struct homa_rpc *rpcs[4]; - int count; - - test_rpc(self, 200, self->server_ip, 20000); - test_rpc(self, 300, self->server_ip, 30000); - test_rpc(self, 400, self->server_ip, 40000); - test_rpc(self, 500, self->server_ip+1, 25000); - - count = 
homa_grant_pick_rpcs(&self->homa, rpcs, 5); - EXPECT_EQ(4, count); - EXPECT_STREQ("200 500 300 400", rpc_ids(rpcs, count)); -} -TEST_F(homa_grant, homa_grant_pick_rpcs__new_rpc_goes_in_middle_of_list_with_overflow) -{ - struct homa_rpc *rpcs[4]; - int count; - - test_rpc(self, 200, self->server_ip, 20000); - test_rpc(self, 300, self->server_ip, 30000); - test_rpc(self, 400, self->server_ip, 40000); - test_rpc(self, 500, self->server_ip+1, 25000); - - count = homa_grant_pick_rpcs(&self->homa, rpcs, 3); - EXPECT_EQ(3, count); - EXPECT_STREQ("200 500 300", rpc_ids(rpcs, count)); -} -TEST_F(homa_grant, homa_grant_pick_rpcs__non_first_rpc_of_peer_doesnt_fit) -{ - struct homa_rpc *rpcs[4]; - int count; - - test_rpc(self, 200, self->server_ip, 20000); - test_rpc(self, 300, self->server_ip, 30000); - test_rpc(self, 400, self->server_ip, 40000); - test_rpc(self, 500, self->server_ip, 50000); - test_rpc(self, 600, self->server_ip+1, 25000); - - self->homa.max_rpcs_per_peer = 3; - count = homa_grant_pick_rpcs(&self->homa, rpcs, 3); - EXPECT_EQ(3, count); - EXPECT_STREQ("200 600 300", rpc_ids(rpcs, count)); + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("active[0]: id 100 ungranted 10000; " + "active[1]: id 102 ungranted 30000; " + "active[2]: id 104 ungranted 35000", unit_log_get()); + EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_slow_path); + rpc2->state = saved_state; } -TEST_F(homa_grant, homa_grant_pick_rpcs__max_rpcs_per_peer) -{ - struct homa_rpc *rpcs[4]; - int count; - - test_rpc(self, 200, self->server_ip, 20000); - test_rpc(self, 300, self->server_ip, 30000); - test_rpc(self, 400, self->server_ip, 40000); - test_rpc(self, 500, self->server_ip, 50000); - test_rpc(self, 600, self->server_ip+1, 60000); - self->homa.max_rpcs_per_peer = 2; - count = homa_grant_pick_rpcs(&self->homa, rpcs, 4); - EXPECT_EQ(3, count); - EXPECT_STREQ("200 300 600", rpc_ids(rpcs, count)); -} -TEST_F(homa_grant, homa_grant_pick_rpcs__first_rpc_of_peer_doesnt_fit) +TEST_F(homa_grant, homa_grant_fix_order) { - struct homa_rpc *rpcs[4]; - int count; + struct homa_rpc *rpc3, *rpc4; - test_rpc(self, 200, self->server_ip, 20000); - test_rpc(self, 300, self->server_ip, 30000); - test_rpc(self, 400, self->server_ip, 40000); - test_rpc(self, 400, self->server_ip+1, 50000); - test_rpc(self, 500, self->server_ip+2, 60000); + test_rpc_init(self, 100, self->server_ip, 20000); + test_rpc_init(self, 102, self->server_ip, 30000); + rpc3 = test_rpc_init(self, 104, self->server_ip, 40000); + rpc4 = test_rpc_init(self, 106, self->server_ip, 50000); + rpc3->msgin.granted = 25000; + rpc3->msgin.bytes_remaining = 15000; + rpc4->msgin.granted = 26000; + rpc4->msgin.bytes_remaining = 24000; - self->homa.max_rpcs_per_peer = 3; - count = homa_grant_pick_rpcs(&self->homa, rpcs, 3); - EXPECT_EQ(3, count); - EXPECT_STREQ("200 300 400", rpc_ids(rpcs, count)); + homa_grant_fix_order(self->homa.grant); + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("active[0]: id 104 ungranted 15000; " + "active[1]: id 100 ungranted 20000; " + "active[2]: id 106 ungranted 24000; " + "active[3]: id 102 ungranted 30000", unit_log_get()); + EXPECT_EQ(3, homa_metrics_per_cpu()->grant_priority_bumps); } +#if 0 TEST_F(homa_grant, homa_grant_find_oldest__basics) { mock_ns_tick = 10; @@ -1108,98 +1167,176 @@ TEST_F(homa_grant, homa_grant_find_oldest__no_good_candidates) homa_grant_find_oldest(&self->homa); EXPECT_EQ(NULL, self->homa.oldest_rpc); } +#endif -TEST_F(homa_grant, homa_grant_end_rpc__rpc_not_grantable) 
+TEST_F(homa_grant, homa_grant_cand_add__basics) { - struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->server_port, - 100, 1000, 2000); - atomic_set(&self->homa.total_incoming, 10000); - rpc->msgin.rec_incoming = 3000; - homa_grant_end_rpc(rpc); - EXPECT_EQ(7000, atomic_read(&self->homa.total_incoming)); -} -TEST_F(homa_grant, homa_grant_end_rpc__in_active_list) -{ - struct homa_rpc *rpc1, *rpc2, *rpc3; + struct homa_grant_candidates cand; + struct homa_rpc *rpc1, *rpc2; rpc1 = test_rpc(self, 100, self->server_ip, 20000); rpc2 = test_rpc(self, 102, self->server_ip, 30000); - rpc3 = test_rpc(self, 104, self->server_ip, 40000); - self->homa.max_overcommit = 2; - homa_grant_recalc(&self->homa); - EXPECT_EQ(rpc1, self->homa.active_rpcs[0]); - EXPECT_EQ(rpc2, self->homa.active_rpcs[1]); - EXPECT_EQ(20000, atomic_read(&self->homa.total_incoming)); - EXPECT_EQ(10000, rpc1->msgin.rec_incoming); - unit_log_clear(); - homa_grant_end_rpc(rpc1); - EXPECT_EQ(rpc2, self->homa.active_rpcs[0]); - EXPECT_EQ(rpc3, self->homa.active_rpcs[1]); - EXPECT_EQ(20000, atomic_read(&self->homa.total_incoming)); + homa_grant_cand_init(&cand); + homa_grant_cand_add(&cand, rpc2); + homa_grant_cand_add(&cand, rpc1); + EXPECT_EQ(2, cand.inserts); + EXPECT_EQ(0, cand.removes); + EXPECT_EQ(rpc2, cand.rpcs[0]); + EXPECT_EQ(rpc1, cand.rpcs[1]); + EXPECT_EQ(1, atomic_read(&rpc1->refs)); + homa_grant_cand_check(&cand, self->homa.grant); +} +TEST_F(homa_grant, homa_grant_cand_add__wrap_around) +{ + struct homa_grant_candidates cand; + int i; + + homa_grant_cand_init(&cand); + + /* Add so many RPCs that some have to be dropped. */ + for (i = 0; i < HOMA_MAX_CAND_RPCS + 2; i++) + homa_grant_cand_add(&cand, test_rpc(self, 100 + 2*i, + self->server_ip, 20000)); + EXPECT_EQ(HOMA_MAX_CAND_RPCS, cand.inserts); + EXPECT_EQ(0, cand.removes); + EXPECT_EQ(100, cand.rpcs[0]->id); + EXPECT_EQ(114, cand.rpcs[HOMA_MAX_CAND_RPCS-1]->id); + + /* Discard a couple of RPCs then add more. 
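+	 * The ring wraps: ids 200 and 202 reuse slots 0 and 1, while the
+	 * third add finds the ring full again and is dropped (slot 2
+	 * still holds id 104).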
*/ + homa_rpc_put(cand.rpcs[0]); + homa_rpc_put(cand.rpcs[1]); + cand.removes = 2; + for (i = 0; i < 3; i++) + homa_grant_cand_add(&cand, test_rpc(self, 200 + 2*i, + self->server_ip, 20000)); + EXPECT_EQ(HOMA_MAX_CAND_RPCS + 2, cand.inserts); + EXPECT_EQ(2, cand.removes); + EXPECT_EQ(200, cand.rpcs[0]->id); + EXPECT_EQ(202, cand.rpcs[1]->id); + EXPECT_EQ(104, cand.rpcs[2]->id); + homa_grant_cand_check(&cand, self->homa.grant); } -TEST_F(homa_grant, homa_grant_end_rpc__not_in_active_list) + +TEST_F(homa_grant, homa_grant_cand_check__basics) { + struct homa_grant_candidates cand; struct homa_rpc *rpc1, *rpc2, *rpc3; - rpc1 = test_rpc(self, 100, self->server_ip, 20000); - rpc2 = test_rpc(self, 102, self->server_ip, 30000); - rpc3 = test_rpc(self, 104, self->server_ip, 40000); - self->homa.max_overcommit = 2; - homa_grant_recalc(&self->homa); - EXPECT_EQ(0, atomic_read(&rpc1->msgin.rank)); - EXPECT_EQ(1, atomic_read(&rpc2->msgin.rank)); - EXPECT_EQ(20000, atomic_read(&self->homa.total_incoming)); - EXPECT_EQ(0, rpc3->msgin.rec_incoming); - EXPECT_FALSE(list_empty(&rpc3->grantable_links)); + rpc1 = test_rpc_init(self, 100, self->server_ip, 20000); + rpc2 = test_rpc_init(self, 102, self->server_ip, 20000); + rpc3 = test_rpc_init(self, 104, self->server_ip, 20000); - rpc3->msgin.rec_incoming = 5000; - homa_grant_end_rpc(rpc3); - EXPECT_TRUE(list_empty(&rpc3->grantable_links)); - EXPECT_EQ(15000, atomic_read(&self->homa.total_incoming)); + homa_grant_cand_init(&cand); + homa_grant_cand_add(&cand, rpc1); + homa_grant_cand_add(&cand, rpc2); + homa_grant_cand_add(&cand, rpc3); + rpc2->msgin.granted = 20000; + unit_log_clear(); + homa_grant_cand_check(&cand, self->homa.grant); + EXPECT_STREQ("xmit GRANT 10000@2; xmit GRANT 10000@0", unit_log_get()); + EXPECT_EQ(0, atomic_read(&rpc1->refs)); + EXPECT_EQ(0, atomic_read(&rpc2->refs)); + EXPECT_EQ(0, atomic_read(&rpc3->refs)); } +TEST_F(homa_grant, homa_grant_cand_check__rpc_dead) +{ + struct homa_grant_candidates cand; + struct homa_rpc *rpc; + int saved_state; + + rpc = test_rpc_init(self, 100, self->server_ip, 20000); + + homa_grant_cand_init(&cand); + homa_grant_cand_add(&cand, rpc); + saved_state = rpc->state; + rpc->state = RPC_DEAD; -TEST_F(homa_grant, homa_grant_lock_slow__basics) + unit_log_clear(); + homa_grant_cand_check(&cand, self->homa.grant); + EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(0, atomic_read(&rpc->refs)); + rpc->state = saved_state; +} +TEST_F(homa_grant, homa_grant_cand_check__rpc_becomes_fully_granted) { - mock_ns = 500; - unit_hook_register(grant_spinlock_hook); + struct homa_grant_candidates cand; + struct homa_rpc *rpc1, *rpc2; - EXPECT_EQ(1, homa_grant_lock_slow(&self->homa, 0)); - homa_grant_unlock(&self->homa); + self->homa.grant->max_rpcs_per_peer = 1; + rpc1 = test_rpc_init(self, 100, self->server_ip, 20000); + rpc2 = test_rpc_init(self, 102, self->server_ip, 30000); + EXPECT_EQ(0, rpc1->msgin.rank); + EXPECT_EQ(-1, rpc2->msgin.rank); + rpc1->msgin.bytes_remaining = 10000; - EXPECT_EQ(1, homa_metrics_per_cpu()->grant_lock_misses); - EXPECT_EQ(500, homa_metrics_per_cpu()->grant_lock_miss_ns); + homa_grant_cand_init(&cand); + homa_grant_cand_add(&cand, rpc1); + + unit_log_clear(); + homa_grant_cand_check(&cand, self->homa.grant); + EXPECT_STREQ("xmit GRANT 20000@1; xmit GRANT 10000@0", unit_log_get()); + EXPECT_EQ(-1, rpc1->msgin.rank); + EXPECT_EQ(0, rpc2->msgin.rank); + EXPECT_EQ(2, cand.removes); } -TEST_F(homa_grant, homa_grant_lock_slow__recalc_count) + +TEST_F(homa_grant, homa_grant_lock_slow) { mock_ns = 500; 
unit_hook_register(grant_spinlock_hook); - hook_homa = &self->homa; - mock_trylock_errors = 0xff; - EXPECT_EQ(0, homa_grant_lock_slow(&self->homa, 1)); - hook_homa = NULL; + homa_grant_lock_slow(self->homa.grant); + homa_grant_unlock(self->homa.grant); EXPECT_EQ(1, homa_metrics_per_cpu()->grant_lock_misses); EXPECT_EQ(500, homa_metrics_per_cpu()->grant_lock_miss_ns); - - /* Make sure the check only occurs if the recalc argument is set. */ - mock_trylock_errors = 0xff; - EXPECT_EQ(1, homa_grant_lock_slow(&self->homa, 0)); - EXPECT_EQ(2, homa_metrics_per_cpu()->grant_lock_misses); - homa_grant_unlock(&self->homa); } -/* Functions in homa_grant.h: - * -------------------------- - */ +TEST_F(homa_grant, homa_grant_update_sysctl_deps__max_overcommit) +{ + self->homa.grant->max_overcommit = 2; + homa_grant_update_sysctl_deps(self->homa.grant); + EXPECT_EQ(2, self->homa.grant->max_overcommit); + + self->homa.grant->max_overcommit = HOMA_MAX_GRANTS; + homa_grant_update_sysctl_deps(self->homa.grant); + EXPECT_EQ(HOMA_MAX_GRANTS, self->homa.grant->max_overcommit); + + self->homa.grant->max_overcommit = HOMA_MAX_GRANTS+1; + homa_grant_update_sysctl_deps(self->homa.grant); + EXPECT_EQ(HOMA_MAX_GRANTS, self->homa.grant->max_overcommit); +} +TEST_F(homa_grant, homa_grant_update_sysctl_deps__grant_fifo_fraction) +{ + self->homa.grant->fifo_fraction = 499; + homa_grant_update_sysctl_deps(self->homa.grant); + EXPECT_EQ(499, self->homa.grant->fifo_fraction); - TEST_F(homa_grant, homa_grant_needy_bit) - { - EXPECT_EQ(0x1, homa_grant_needy_bit(0)); - EXPECT_EQ(0x4, homa_grant_needy_bit(2)); - EXPECT_EQ(0x80, homa_grant_needy_bit(7)); - EXPECT_EQ(0, homa_grant_needy_bit(20)); - EXPECT_EQ(0, homa_grant_needy_bit(-1)); - } \ No newline at end of file + self->homa.grant->fifo_fraction = 501; + homa_grant_update_sysctl_deps(self->homa.grant); + EXPECT_EQ(500, self->homa.grant->fifo_fraction); +} +TEST_F(homa_grant, homa_grant_update_sysctl_deps__grant_nonfifo) +{ + self->homa.grant->fifo_grant_increment = 10000; + self->homa.grant->fifo_fraction = 0; + homa_grant_update_sysctl_deps(self->homa.grant); + EXPECT_EQ(0, self->homa.grant->grant_nonfifo); + + self->homa.grant->fifo_fraction = 100; + homa_grant_update_sysctl_deps(self->homa.grant); + EXPECT_EQ(90000, self->homa.grant->grant_nonfifo); +} +TEST_F(homa_grant, homa_grant_update_sysctl_deps__recalc_ns) +{ + self->homa.grant->recalc_usecs = 7; + homa_grant_update_sysctl_deps(self->homa.grant); + EXPECT_EQ(7000, self->homa.grant->recalc_ns); +} +TEST_F(homa_grant, homa_grant_update_sysctl_deps__grant_window) +{ + self->homa.grant->window_param = 30000; + homa_grant_update_sysctl_deps(self->homa.grant); + EXPECT_EQ(30000, self->homa.grant->window); +} diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 43419c65..f4092c12 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" +#include "homa_grant.h" #include "homa_interest.h" #include "homa_pacer.h" #include "homa_peer.h" @@ -96,9 +97,8 @@ FIXTURE_SETUP(homa_incoming) self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; self->homa.pacer->fifo_fraction = 0; #ifndef __STRIP__ /* See strip.py */ - self->homa.grant_fifo_fraction = 0; self->homa.unsched_bytes = 10000; - self->homa.window_param = 10000; + self->homa.grant->window = 10000; #endif /* See strip.py */ mock_sock_init(&self->hsk, &self->homa, 0); mock_sock_init(&self->hsk2, &self->homa, self->server_port); @@ -162,6 +162,7 @@ TEST_F(homa_incoming, 
homa_message_in_init__no_buffer_region) self->hsk.buffer_pool = homa_pool_new(&self->hsk); EXPECT_EQ(ENOMEM, -homa_message_in_init(crpc, HOMA_BPAGE_SIZE*2, 0)); EXPECT_EQ(0, crpc->msgin.num_bpages); + EXPECT_EQ(-1, crpc->msgin.length); } TEST_F(homa_incoming, homa_message_in_init__no_buffers_available) { @@ -183,13 +184,13 @@ TEST_F(homa_incoming, homa_message_in_init__update_metrics) UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); - EXPECT_EQ(0, homa_message_in_init(crpc, 140, 0)); - EXPECT_EQ(0, homa_message_in_init(crpc, 130, 0)); - EXPECT_EQ(0, homa_message_in_init(crpc, 0xfff, 0)); - EXPECT_EQ(0, homa_message_in_init(crpc, 0xfff0, 0)); - EXPECT_EQ(0, homa_message_in_init(crpc, 0x3000, 0)); - EXPECT_EQ(0, homa_message_in_init(crpc, 1000000, 0)); - EXPECT_EQ(0, homa_message_in_init(crpc, 900000, 0)); + EXPECT_EQ(0, homa_message_in_init(crpc, 140, 140)); + EXPECT_EQ(0, homa_message_in_init(crpc, 130, 130)); + EXPECT_EQ(0, homa_message_in_init(crpc, 0xfff, 0xfff)); + EXPECT_EQ(0, homa_message_in_init(crpc, 0xfff0, 0xfff0)); + EXPECT_EQ(0, homa_message_in_init(crpc, 0x3000, 0x3000)); + EXPECT_EQ(0, homa_message_in_init(crpc, 1000000, 1000000)); + EXPECT_EQ(0, homa_message_in_init(crpc, 900000, 900000)); EXPECT_EQ(270, homa_metrics_per_cpu()->small_msg_bytes[2]); EXPECT_EQ(0xfff, homa_metrics_per_cpu()->small_msg_bytes[63]); EXPECT_EQ(0x3000, homa_metrics_per_cpu()->medium_msg_bytes[11]); @@ -645,7 +646,9 @@ TEST_F(homa_incoming, homa_copy_to_user__basics) unit_log_clear(); mock_copy_to_user_dont_copy = -1; + homa_rpc_lock(crpc); EXPECT_EQ(0, -homa_copy_to_user(crpc)); + homa_rpc_unlock(crpc); EXPECT_STREQ("skb_copy_datagram_iter: 1400 bytes to 0x1000000: 0-1399; " "skb_copy_datagram_iter: 648 bytes to 0x1000578: 101000-101647; " "skb_copy_datagram_iter: 752 bytes to 0x1000800: 101648-102399; " @@ -691,7 +694,9 @@ TEST_F(homa_incoming, homa_copy_to_user__multiple_batches) unit_log_clear(); mock_copy_to_user_dont_copy = -1; + homa_rpc_lock(crpc); EXPECT_EQ(0, -homa_copy_to_user(crpc)); + homa_rpc_unlock(crpc); EXPECT_STREQ("skb_copy_datagram_iter: 1400 bytes to 0x1000000: 0-1399; " "skb_copy_datagram_iter: 1400 bytes to 0x1000578: 1400-2799; " "skb_copy_datagram_iter: 1400 bytes to 0x1000af0: 2800-4199; " @@ -716,7 +721,9 @@ TEST_F(homa_incoming, homa_copy_to_user__nothing_to_copy) /* First call finds packets to copy. 
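	 * (a single 1400-byte packet; afterward msgin.packets is empty).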
*/ unit_log_clear(); mock_copy_to_user_dont_copy = -1; + homa_rpc_lock(crpc); EXPECT_EQ(0, -homa_copy_to_user(crpc)); + homa_rpc_unlock(crpc); EXPECT_STREQ("skb_copy_datagram_iter: 1400 bytes to 0x1000000: 0-1399", unit_log_get()); EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); @@ -742,7 +749,9 @@ TEST_F(homa_incoming, homa_copy_to_user__many_chunks_for_one_skb) unit_log_clear(); mock_copy_to_user_dont_copy = -1; + homa_rpc_lock(crpc); EXPECT_EQ(0, -homa_copy_to_user(crpc)); + homa_rpc_unlock(crpc); EXPECT_STREQ("skb_copy_datagram_iter: 512 bytes to 0x1000000: 101000-101511; " "skb_copy_datagram_iter: 512 bytes to 0x1000200: 101512-102023; " "skb_copy_datagram_iter: 512 bytes to 0x1000400: 102024-102535; " @@ -768,7 +777,9 @@ TEST_F(homa_incoming, homa_copy_to_user__skb_data_extends_past_message_end) mock_copy_to_user_dont_copy = -1; h = (struct homa_data_hdr *)skb_peek(&crpc->msgin.packets)->data; h->seg.offset = htonl(4000); + homa_rpc_lock(crpc); EXPECT_EQ(0, -homa_copy_to_user(crpc)); + homa_rpc_unlock(crpc); EXPECT_STREQ("", unit_log_get()); } TEST_F(homa_incoming, homa_copy_to_user__error_in_import_ubuf) @@ -782,7 +793,9 @@ TEST_F(homa_incoming, homa_copy_to_user__error_in_import_ubuf) unit_log_clear(); mock_import_ubuf_errors = 1; + homa_rpc_lock(crpc); EXPECT_EQ(13, -homa_copy_to_user(crpc)); + homa_rpc_unlock(crpc); EXPECT_STREQ("", unit_log_get()); EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); } @@ -797,7 +810,9 @@ TEST_F(homa_incoming, homa_copy_to_user__error_in_skb_copy_datagram_iter) unit_log_clear(); mock_copy_data_errors = 1; + homa_rpc_lock(crpc); EXPECT_EQ(14, -homa_copy_to_user(crpc)); + homa_rpc_unlock(crpc); EXPECT_STREQ("", unit_log_get()); EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); } @@ -823,7 +838,9 @@ TEST_F(homa_incoming, homa_copy_to_user__timetrace_info) unit_log_clear(); mock_copy_to_user_dont_copy = -1; tt_init(NULL); + homa_rpc_lock(crpc); EXPECT_EQ(0, -homa_copy_to_user(crpc)); + homa_rpc_unlock(crpc); tt_get_messages(traces, sizeof(traces)); EXPECT_STREQ("starting copy to user space for id 1234; " "copied out bytes 0-1400 for id 1234; " @@ -1165,6 +1182,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__too_many_acks) EXPECT_STREQ("sk->sk_data_ready invoked; ack 1237; ack 1235", unit_log_get()); } +#if 0 #ifndef __STRIP__ /* See strip.py */ TEST_F(homa_incoming, homa_dispatch_pkts__invoke_homa_grant_check_rpc) { @@ -1177,6 +1195,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__invoke_homa_grant_check_rpc) EXPECT_SUBSTR("id 1235", unit_log_get()); } #endif /* See strip.py */ +#endif TEST_F(homa_incoming, homa_dispatch_pkts__forced_reap) { struct homa_rpc *dead = unit_client_rpc(&self->hsk, @@ -1431,7 +1450,9 @@ TEST_F(homa_incoming, homa_grant_pkt__basics) .resend_all = 0}; ASSERT_NE(NULL, srpc); + homa_rpc_lock(srpc); homa_xmit_data(srpc, false); + homa_rpc_unlock(srpc); unit_log_clear(); homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), @@ -1473,7 +1494,9 @@ TEST_F(homa_incoming, homa_grant_pkt__reset) .resend_all = 1}; ASSERT_NE(NULL, srpc); + homa_rpc_lock(srpc); homa_xmit_data(srpc, false); + homa_rpc_unlock(srpc); unit_log_clear(); EXPECT_EQ(10000, srpc->msgout.granted); EXPECT_EQ(10000, srpc->msgout.next_xmit_offset); @@ -1630,7 +1653,9 @@ TEST_F(homa_incoming, homa_resend_pkt__client_send_data) self->server_port, self->client_id, 2000, 100); ASSERT_NE(NULL, crpc); + homa_rpc_lock(crpc); homa_xmit_data(crpc, false); + homa_rpc_unlock(crpc); unit_log_clear(); mock_clear_xmit_prios(); @@ -1659,7 +1684,9 @@ 
TEST_F(homa_incoming, homa_resend_pkt__server_send_data) self->server_id, 100, 20000); ASSERT_NE(NULL, srpc); + homa_rpc_lock(srpc); homa_xmit_data(srpc, false); + homa_rpc_unlock(srpc); unit_log_clear(); mock_clear_xmit_prios(); @@ -1683,7 +1710,9 @@ TEST_F(homa_incoming, homa_unknown_pkt__client_resend_all) self->server_port, self->client_id, 2000, 2000); ASSERT_NE(NULL, crpc); + homa_rpc_lock(crpc); homa_xmit_data(crpc, false); + homa_rpc_unlock(crpc); unit_log_clear(); mock_xmit_log_verbose = 1; @@ -1714,7 +1743,9 @@ TEST_F(homa_incoming, homa_unknown_pkt__client_resend_part) #ifndef __STRIP__ /* See strip.py */ crpc->msgout.granted = 1400; #endif /* See strip.py */ + homa_rpc_lock(crpc); homa_xmit_data(crpc, false); + homa_rpc_unlock(crpc); unit_log_clear(); mock_xmit_log_verbose = 1; @@ -2226,7 +2257,9 @@ TEST_F(homa_incoming, homa_wait_private__available_immediately) ASSERT_NE(NULL, crpc); ASSERT_EQ(RPC_PKTS_READY, atomic_read(&crpc->flags)); atomic_or(RPC_PRIVATE, &crpc->flags); + homa_rpc_lock(crpc); EXPECT_EQ(0, homa_wait_private(crpc, 0)); + homa_rpc_unlock(crpc); ASSERT_EQ(RPC_PRIVATE, atomic_read(&crpc->flags)); IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_none)); } @@ -2240,7 +2273,9 @@ TEST_F(homa_incoming, homa_wait_private__rpc_has_error) ASSERT_EQ(RPC_PKTS_READY, atomic_read(&crpc->flags)); atomic_or(RPC_PRIVATE, &crpc->flags); crpc->error = -ENOENT; + homa_rpc_lock(crpc); EXPECT_EQ(ENOENT, -homa_wait_private(crpc, 0)); + homa_rpc_unlock(crpc); EXPECT_EQ(RPC_PKTS_READY, atomic_read(&crpc->flags) & RPC_PKTS_READY); } TEST_F(homa_incoming, homa_wait_private__copy_to_user_fails) @@ -2253,7 +2288,9 @@ TEST_F(homa_incoming, homa_wait_private__copy_to_user_fails) ASSERT_EQ(RPC_PKTS_READY, atomic_read(&crpc->flags)); atomic_or(RPC_PRIVATE, &crpc->flags); mock_copy_data_errors = 1; + homa_rpc_lock(crpc); EXPECT_EQ(EFAULT, -homa_wait_private(crpc, 0)); + homa_rpc_unlock(crpc); } TEST_F(homa_incoming, homa_wait_private__nonblocking) { @@ -2264,7 +2301,9 @@ TEST_F(homa_incoming, homa_wait_private__nonblocking) ASSERT_NE(NULL, crpc); atomic_or(RPC_PRIVATE, &crpc->flags); + homa_rpc_lock(crpc); EXPECT_EQ(EAGAIN, -homa_wait_private(crpc, 1)); + homa_rpc_unlock(crpc); IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_fast)); } TEST_F(homa_incoming, homa_wait_private__signal_notify_race) @@ -2278,10 +2317,12 @@ TEST_F(homa_incoming, homa_wait_private__signal_notify_race) IF_NO_STRIP(self->homa.poll_usecs = 0); unit_hook_register(handoff_hook); hook_rpc = crpc; - hook_count = 1; + hook_count = 2; mock_prepare_to_wait_errors = 1; + homa_rpc_lock(crpc); EXPECT_EQ(ENOENT, -homa_wait_private(crpc, 0)); + homa_rpc_unlock(crpc); IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_block)); EXPECT_EQ(0, mock_prepare_to_wait_errors); } @@ -2493,39 +2534,6 @@ TEST_F(homa_incoming, homa_rpc_handoff__queue_rpc_on_socket) } #ifndef __STRIP__ /* See strip.py */ -TEST_F(homa_incoming, homa_incoming_sysctl_changed__grant_nonfifo) -{ - self->homa.fifo_grant_increment = 10000; - self->homa.grant_fifo_fraction = 0; - homa_incoming_sysctl_changed(&self->homa); - EXPECT_EQ(0, self->homa.grant_nonfifo); - - self->homa.grant_fifo_fraction = 100; - homa_incoming_sysctl_changed(&self->homa); - EXPECT_EQ(90000, self->homa.grant_nonfifo); - - self->homa.grant_fifo_fraction = 500; - homa_incoming_sysctl_changed(&self->homa); - EXPECT_EQ(10000, self->homa.grant_nonfifo); - - self->homa.grant_fifo_fraction = 2000; - homa_incoming_sysctl_changed(&self->homa); - EXPECT_EQ(10000, self->homa.grant_nonfifo); 
-} -TEST_F(homa_incoming, homa_incoming_sysctl_changed__limit_on_max_overcommit) -{ - self->homa.max_overcommit = 2; - homa_incoming_sysctl_changed(&self->homa); - EXPECT_EQ(2, self->homa.max_overcommit); - - self->homa.max_overcommit = HOMA_MAX_GRANTS; - homa_incoming_sysctl_changed(&self->homa); - EXPECT_EQ(HOMA_MAX_GRANTS, self->homa.max_overcommit); - - self->homa.max_overcommit = HOMA_MAX_GRANTS+1; - homa_incoming_sysctl_changed(&self->homa); - EXPECT_EQ(HOMA_MAX_GRANTS, self->homa.max_overcommit); -} TEST_F(homa_incoming, homa_incoming_sysctl_changed__convert_usec_to_ns) { self->homa.busy_usecs = 53; diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index 6ae9d633..0bc5f3ee 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -44,7 +44,6 @@ FIXTURE_SETUP(homa_offload) mock_set_homa(&self->homa); self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; self->homa.unsched_bytes = 10000; - self->homa.window_param = 10000; mock_sock_init(&self->hsk, &self->homa, 99); self->ip = unit_get_in_addr("196.168.0.1"); memset(&self->header, 0, sizeof(self->header)); @@ -326,7 +325,9 @@ TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization) &client_ip, &server_ip, client_port, server_id, 100, 20000); ASSERT_NE(NULL, srpc); + homa_rpc_lock(srpc); homa_xmit_data(srpc, false); + homa_rpc_unlock(srpc); unit_log_clear(); h.common.sport = htons(srpc->dport); diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 0945199c..32f8b5ba 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" +#include "homa_grant.h" #include "homa_pacer.h" #include "homa_peer.h" #include "homa_rpc.h" @@ -85,7 +86,7 @@ FIXTURE_SETUP(homa_outgoing) self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; #ifndef __STRIP__ /* See strip.py */ self->homa.unsched_bytes = 10000; - self->homa.window_param = 10000; + self->homa.grant->window = 10000; self->homa.pacer->fifo_fraction = 0; #endif /* See strip.py */ mock_sock_init(&self->hsk, &self->homa, self->client_port); @@ -402,6 +403,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_geometry_hijacking) crpc2 = homa_rpc_new_client(&self->hsk, &self->server_addr); ASSERT_FALSE(crpc2 == NULL); + homa_rpc_unlock(crpc2); mock_set_ipv6(&self->hsk); self->hsk.sock.sk_protocol = IPPROTO_TCP; @@ -409,13 +411,16 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_geometry_hijacking) /* First try: not quite enough space for 3 packets in GSO. */ mock_net_device.gso_max_size = mock_mtu - 1 + 2 * UNIT_TEST_DATA_PER_PACKET; + homa_rpc_lock(crpc1); ASSERT_EQ(0, -homa_message_out_fill(crpc1, unit_iov_iter((void *) 1000, 10000), 0)); + homa_rpc_unlock(crpc1); EXPECT_SUBSTR("max_seg_data 1400, max_gso_data 2800", unit_log_get()); /* Second try: just barely enough space for 3 packets in GSO. 
*/ mock_net_device.gso_max_size += 1; unit_log_clear(); + homa_rpc_lock(crpc2); ASSERT_EQ(0, -homa_message_out_fill(crpc2, unit_iov_iter((void *) 1000, 10000), 0)); homa_rpc_unlock(crpc2); @@ -456,7 +461,6 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_force_software) struct homa_rpc *crpc2; ASSERT_FALSE(crpc1 == NULL); - homa_rpc_unlock(crpc1); mock_net_device.gso_max_size = 10000; mock_xmit_log_verbose = 1; self->homa.gso_force_software = 0; @@ -464,17 +468,18 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_force_software) unit_iov_iter((void *) 1000, 5000), 0)); unit_log_clear(); homa_xmit_data(crpc1, false); + homa_rpc_unlock(crpc1); EXPECT_SUBSTR("xmit DATA", unit_log_get()); EXPECT_NOSUBSTR("TSO disabled", unit_log_get()); crpc2 = homa_rpc_new_client(&self->hsk, &self->server_addr); ASSERT_FALSE(crpc2 == NULL); - homa_rpc_unlock(crpc2); self->homa.gso_force_software = 1; ASSERT_EQ(0, -homa_message_out_fill(crpc2, unit_iov_iter((void *) 1000, 5000), 0)); unit_log_clear(); homa_xmit_data(crpc2, false); + homa_rpc_unlock(crpc2); EXPECT_SUBSTR("TSO disabled", unit_log_get()); } TEST_F(homa_outgoing, homa_message_out_fill__gso_limit_less_than_mtu) @@ -778,7 +783,9 @@ TEST_F(homa_outgoing, homa_xmit_data__basics) #endif /* See strip.py */ unit_log_clear(); mock_clear_xmit_prios(); + homa_rpc_lock(crpc); homa_xmit_data(crpc, false); + homa_rpc_unlock(crpc); #ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("xmit DATA 1400@0; " "xmit DATA 1400@1400; " @@ -807,7 +814,9 @@ TEST_F(homa_outgoing, homa_xmit_data__stop_because_no_more_granted) unit_log_clear(); crpc->msgout.granted = 1000; + homa_rpc_lock(crpc); homa_xmit_data(crpc, false); + homa_rpc_unlock(crpc); EXPECT_STREQ("xmit DATA 1400@0", unit_log_get()); unit_log_clear(); unit_log_throttled(&self->homa); @@ -825,7 +834,9 @@ TEST_F(homa_outgoing, homa_xmit_data__below_throttle_min) self->homa.pacer->max_nic_queue_ns = 500; self->homa.pacer->throttle_min_bytes = 250; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; + homa_rpc_lock(crpc); homa_xmit_data(crpc, false); + homa_rpc_unlock(crpc); EXPECT_STREQ("xmit DATA 200@0", unit_log_get()); unit_log_clear(); unit_log_throttled(&self->homa); @@ -844,14 +855,18 @@ TEST_F(homa_outgoing, homa_xmit_data__force) atomic64_set(&self->homa.pacer->link_idle_time, 11000); self->homa.pacer->max_nic_queue_ns = 3000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; + homa_rpc_lock(crpc1); homa_xmit_data(crpc1, false); + homa_rpc_unlock(crpc1); unit_log_clear(); unit_log_throttled(&self->homa); EXPECT_STREQ("request id 1234, next_offset 2800", unit_log_get()); /* Now force transmission. 
*/ unit_log_clear(); + homa_rpc_lock(crpc2); homa_xmit_data(crpc2, true); + homa_rpc_unlock(crpc2); EXPECT_STREQ("xmit DATA 1400@0", unit_log_get()); unit_log_clear(); unit_log_throttled(&self->homa); @@ -869,7 +884,9 @@ TEST_F(homa_outgoing, homa_xmit_data__throttle) self->homa.pacer->max_nic_queue_ns = 3000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; + homa_rpc_lock(crpc); homa_xmit_data(crpc, false); + homa_rpc_unlock(crpc); EXPECT_STREQ("xmit DATA 1400@0; " "xmit DATA 1400@1400", unit_log_get()); unit_log_clear(); @@ -888,9 +905,11 @@ TEST_F(homa_outgoing, homa_xmit_data__rpc_freed) #endif /* See strip.py */ unit_log_clear(); + homa_rpc_lock(crpc); unit_hook_register(lock_free_hook); hook_rpc = crpc; homa_xmit_data(crpc, false); + homa_rpc_unlock(crpc); EXPECT_STREQ("xmit DATA 1400@0; homa_rpc_end invoked", unit_log_get()); EXPECT_EQ(1400, crpc->msgout.next_xmit_offset); diff --git a/test/unit_homa_pacer.c b/test/unit_homa_pacer.c index 74e824cd..fe0d8fba 100644 --- a/test/unit_homa_pacer.c +++ b/test/unit_homa_pacer.c @@ -124,7 +124,8 @@ TEST_F(homa_pacer, homa_pacer_destroy__basics) EXPECT_FALSE(IS_ERR(pacer)); unit_log_clear(); homa_pacer_destroy(pacer); - EXPECT_STREQ("kthread_stop", unit_log_get()); + EXPECT_STREQ("unregister_net_sysctl_table; kthread_stop", + unit_log_get()); } TEST_F(homa_pacer, homa_pacer_destroy__no_thread) { @@ -135,7 +136,7 @@ TEST_F(homa_pacer, homa_pacer_destroy__no_thread) pacer->kthread = NULL; unit_log_clear(); homa_pacer_destroy(pacer); - EXPECT_STREQ("", unit_log_get()); + EXPECT_STREQ("unregister_net_sysctl_table", unit_log_get()); } TEST_F(homa_pacer, homa_pacer_check_nic_q__success) diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index d98219c4..72415fa0 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" +#include "homa_grant.h" #include "homa_pool.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" @@ -22,7 +23,7 @@ FIXTURE_SETUP(homa_pool) mock_set_homa(&self->homa); #ifndef __STRIP__ /* See strip.py */ self->homa.unsched_bytes = 10000; - self->homa.window_param = 10000; + self->homa.grant->window = 10000; #endif /* See strip.py */ mock_sock_init(&self->hsk, &self->homa, 0); self->client_ip = unit_get_in_addr("196.168.0.1"); @@ -669,7 +670,8 @@ TEST_F(homa_pool, homa_pool_check_waiting__wake_up_waiting_rpc) atomic_set(&pool->free_bpages, 2); homa_pool_check_waiting(pool); EXPECT_EQ(2, crpc->msgin.num_bpages); - EXPECT_STREQ("homa_grant_recalc; xmit GRANT 10000@0 resend_all", + EXPECT_EQ(0, crpc->msgin.rank); + EXPECT_STREQ("xmit GRANT 10000@0 resend_all", unit_log_get()); } #endif /* See strip.py */ diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index c3001525..ffd4115b 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" +#include "homa_grant.h" #include "homa_pacer.h" #include "homa_peer.h" #include "homa_pool.h" @@ -71,7 +72,7 @@ FIXTURE_SETUP(homa_rpc) mock_set_homa(&self->homa); #ifndef __STRIP__ /* See strip.py */ self->homa.unsched_bytes = 10000; - self->homa.window_param = 10000; + self->homa.grant->window = 10000; #endif /* See strip.py */ mock_sock_init(&self->hsk, &self->homa, 0); memset(&self->data, 0, sizeof(self->data)); @@ -412,14 +413,14 @@ TEST_F(homa_rpc, homa_rpc_end__basics) self->server_port, self->client_id, 1000, 20000); #ifndef __STRIP__ /* See strip.py */ - EXPECT_EQ(1, self->homa.num_grantable_rpcs); + 
EXPECT_EQ(1, self->homa.grant->num_grantable_rpcs); #endif /* See strip.py */ ASSERT_NE(NULL, crpc); unit_log_clear(); mock_log_rcu_sched = 1; homa_rpc_end(crpc); #ifndef __STRIP__ /* See strip.py */ - EXPECT_EQ(0, self->homa.num_grantable_rpcs); + EXPECT_EQ(0, self->homa.grant->num_grantable_rpcs); #endif /* See strip.py */ EXPECT_EQ(NULL, homa_find_client_rpc(&self->hsk, crpc->id)); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); diff --git a/test/unit_homa_timer.c b/test/unit_homa_timer.c index 16d6181e..b612e2ea 100644 --- a/test/unit_homa_timer.c +++ b/test/unit_homa_timer.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" +#include "homa_grant.h" #include "homa_peer.h" #include "homa_rpc.h" #define KSELFTEST_NOT_MAIN 1 @@ -38,7 +39,7 @@ FIXTURE_SETUP(homa_timer) self->homa.timer_ticks = 100; #ifndef __STRIP__ /* See strip.py */ self->homa.unsched_bytes = 10000; - self->homa.window_param = 10000; + self->homa.grant->window = 10000; #endif /* See strip.py */ mock_sock_init(&self->hsk, &self->homa, 0); unit_log_clear(); @@ -59,6 +60,7 @@ TEST_F(homa_timer, homa_check_rpc__request_ack) self->homa.request_ack_ticks = 2; /* First call: do nothing (response not fully transmitted). */ + homa_rpc_lock(srpc); homa_check_rpc(srpc); EXPECT_EQ(0, srpc->done_timer_ticks); @@ -80,6 +82,7 @@ TEST_F(homa_timer, homa_check_rpc__request_ack) unit_log_clear(); self->homa.timer_ticks++; homa_check_rpc(srpc); + homa_rpc_unlock(srpc); EXPECT_EQ(100, srpc->done_timer_ticks); EXPECT_STREQ("xmit NEED_ACK", unit_log_get()); } @@ -94,7 +97,9 @@ TEST_F(homa_timer, homa_check_rpc__all_granted_bytes_received) unit_log_clear(); crpc->msgin.granted = 1400; crpc->silent_ticks = 10; - homa_check_rpc(crpc); + homa_rpc_lock(crpc); + homa_check_rpc(crpc); + homa_rpc_unlock(crpc); EXPECT_EQ(0, crpc->silent_ticks); EXPECT_STREQ("", unit_log_get()); } @@ -109,7 +114,9 @@ TEST_F(homa_timer, homa_check_rpc__no_buffer_space) unit_log_clear(); crpc->msgin.num_bpages = 0; crpc->silent_ticks = 10; + homa_rpc_lock(crpc); homa_check_rpc(crpc); + homa_rpc_unlock(crpc); EXPECT_EQ(0, crpc->silent_ticks); EXPECT_STREQ("", unit_log_get()); } @@ -122,7 +129,9 @@ TEST_F(homa_timer, homa_check_rpc__server_has_received_request) ASSERT_NE(NULL, srpc); unit_log_clear(); srpc->silent_ticks = 10; + homa_rpc_lock(srpc); homa_check_rpc(srpc); + homa_rpc_unlock(srpc); EXPECT_EQ(0, srpc->silent_ticks); EXPECT_STREQ("", unit_log_get()); } @@ -135,7 +144,9 @@ TEST_F(homa_timer, homa_check_rpc__granted_bytes_not_sent) ASSERT_NE(NULL, crpc); unit_log_clear(); crpc->silent_ticks = 10; + homa_rpc_lock(crpc); homa_check_rpc(crpc); + homa_rpc_unlock(crpc); EXPECT_EQ(0, crpc->silent_ticks); EXPECT_STREQ("", unit_log_get()); } @@ -148,6 +159,7 @@ TEST_F(homa_timer, homa_check_rpc__timeout) ASSERT_NE(NULL, crpc); unit_log_clear(); crpc->silent_ticks = self->homa.timeout_ticks-1; + homa_rpc_lock(crpc); homa_check_rpc(crpc); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(0, homa_metrics_per_cpu()->rpc_timeouts); @@ -155,6 +167,7 @@ TEST_F(homa_timer, homa_check_rpc__timeout) EXPECT_EQ(0, crpc->error); crpc->silent_ticks = self->homa.timeout_ticks; homa_check_rpc(crpc); + homa_rpc_unlock(crpc); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->rpc_timeouts); #endif /* See strip.py */ @@ -183,6 +196,7 @@ TEST_F(homa_timer, homa_check_rpc__issue_resend) /* Second call: resend_ticks. 
*/ crpc->silent_ticks = 3; unit_log_clear(); + homa_rpc_lock(crpc); homa_check_rpc(crpc); #ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("xmit RESEND 1400-4999@7", unit_log_get()); @@ -200,6 +214,7 @@ TEST_F(homa_timer, homa_check_rpc__issue_resend) crpc->silent_ticks = 5; unit_log_clear(); homa_check_rpc(crpc); + homa_rpc_unlock(crpc); #ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("xmit RESEND 1400-4999@7", unit_log_get()); #else /* See strip.py */ @@ -222,6 +237,7 @@ TEST_F(homa_timer, homa_check_rpc__request_first_bytes_of_message) /* First call: resend_ticks-1. */ crpc->silent_ticks = 2; unit_log_clear(); + homa_rpc_lock(crpc); homa_check_rpc(crpc); EXPECT_STREQ("", unit_log_get()); @@ -229,6 +245,7 @@ TEST_F(homa_timer, homa_check_rpc__request_first_bytes_of_message) crpc->silent_ticks = 3; unit_log_clear(); homa_check_rpc(crpc); + homa_rpc_unlock(crpc); #ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("xmit RESEND 0-99@7", unit_log_get()); #else /* See strip.py */ @@ -253,7 +270,9 @@ TEST_F(homa_timer, homa_check_rpc__call_homa_gap_retry) self->homa.resend_interval = 2; unit_log_clear(); + homa_rpc_lock(crpc); homa_check_rpc(crpc); + homa_rpc_unlock(crpc); #ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("xmit RESEND 7000-7999@7", unit_log_get()); #else /* See strip.py */ diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index 6503a06a..dc823d86 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -68,7 +68,7 @@ TEST_F(homa_utils, homa_init__kmalloc_failure_for_peers) struct homa homa2; memset(&homa2, 0, sizeof(homa2)); - mock_kmalloc_errors = 4; + mock_kmalloc_errors = 8; EXPECT_EQ(ENOMEM, -homa_init(&homa2, &mock_net)); EXPECT_NE(NULL, homa2.port_map); EXPECT_EQ(NULL, homa2.peers); @@ -80,7 +80,7 @@ TEST_F(homa_utils, homa_init__homa_skb_init_failure) struct homa homa2; memset(&homa2, 0, sizeof(homa2)); - mock_kmalloc_errors = 8; + mock_kmalloc_errors = 0x10; EXPECT_EQ(ENOMEM, -homa_init(&homa2, &mock_net)); EXPECT_SUBSTR("Couldn't initialize skb management (errno 12)", mock_printk_output); diff --git a/test/utils.c b/test/utils.c index 4fdfeec9..2472eda1 100644 --- a/test/utils.c +++ b/test/utils.c @@ -5,6 +5,7 @@ */ #include "homa_impl.h" +#include "homa_grant.h" #include "homa_pacer.h" #include "homa_peer.h" #include "homa_rpc.h" @@ -199,23 +200,33 @@ void unit_log_frag_list(struct sk_buff *skb, int verbose) #ifndef __STRIP__ /* See strip.py */ /** * unit_log_grantables() - Append to the test log information about all of - * the messages that are currently grantable. + * the messages under grant->grantable_peers. * @homa: Homa's overall state. */ void unit_log_grantables(struct homa *homa) { struct homa_peer *peer; struct homa_rpc *rpc; + int i; - list_for_each_entry(peer, &homa->grantable_peers, grantable_links) { + for (i = 0; i < homa->grant->num_active_rpcs; i++) { + rpc = homa->grant->active_rpcs[i]; + unit_log_printf("; ", "active[%d]: id %llu ungranted %d", + i, rpc->id, + rpc->msgin.length - rpc->msgin.granted); + if (rpc->msgin.rank != i) { + unit_log_printf(" ", "bad rank %d", rpc->msgin.rank); + } + } + list_for_each_entry(peer, &homa->grant->grantable_peers, + grantable_links) { + unit_log_printf("; ", "peer %s:", + homa_print_ipv6_addr(&peer->addr)); list_for_each_entry(rpc, &peer->grantable_rpcs, grantable_links) { - unit_log_printf("; ", "%s from %s, id %llu, remaining %d", - homa_is_client(rpc->id) ? 
"response" - : "request", - homa_print_ipv6_addr(&peer->addr), + unit_log_printf(" ", "id %llu ungranted %d", rpc->id, - rpc->msgin.bytes_remaining); + rpc->msgin.length - rpc->msgin.granted); } } } @@ -354,6 +365,8 @@ struct homa_rpc *unit_server_rpc(struct homa_sock *hsk, { int bytes_received, created; struct homa_data_hdr h; + int status; + memset(&h, 0, sizeof(h)); h.common = (struct homa_common_hdr){ .sport = htons(client_port), @@ -394,8 +407,11 @@ struct homa_rpc *unit_server_rpc(struct homa_sock *hsk, srpc->state = RPC_IN_SERVICE; if (state == UNIT_IN_SERVICE) return srpc; - if (homa_message_out_fill(srpc, - unit_iov_iter((void *) 2000, resp_length), 0) != 0) + homa_rpc_lock(srpc); + status = homa_message_out_fill(srpc, unit_iov_iter((void *) 2000, + resp_length), 0); + homa_rpc_unlock(srpc); + if (status != 0) goto error; srpc->state = RPC_OUTGOING; if (state == UNIT_OUTGOING) diff --git a/util/metrics.py b/util/metrics.py index 1220aaec..dc2f8170 100755 --- a/util/metrics.py +++ b/util/metrics.py @@ -433,10 +433,6 @@ def scale_number(number): print("Skb page alloc time: %5.2f usec/skb" % ( float(deltas["skb_page_alloc_ns"]) / 1000 / deltas["skb_page_allocs"])) - if deltas["grant_recalc_calls"] != 0: - print("homa_grant_recalc: %5.2f usec/call" % ( - float(deltas["grant_recalc_ns"]) / 1000 / - deltas["grant_recalc_calls"])) print("\nCanaries (possible problem indicators):") print("---------------------------------------") diff --git a/util/tthoma.py b/util/tthoma.py index df43bb95..f60b8bd7 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -1221,14 +1221,14 @@ def __sendmsg_done(self, trace, time, core, match, interests): }) def __recvmsg_done(self, trace, time, core, match, interests): - id = int(match.group(1)) - length = int(match.group(2)) + status = int(match.group(1)) + id = int(match.group(2)) for interest in interests: - interest.tt_recvmsg_done(trace, time, core, id, length) + interest.tt_recvmsg_done(trace, time, core, id, status) patterns.append({ 'name': 'recvmsg_done', - 'regexp': 'homa_recvmsg returning id ([0-9]+), length ([0-9]+)' + 'regexp': 'homa_recvmsg returning status ([0-9]+), id ([0-9]+)' }) def __copy_in_start(self, trace, time, core, match, interests): @@ -5833,7 +5833,7 @@ def tt_sendmsg_response(self, trace, t, core, id, length): rpcs[id]['sendmsg'] = t rpcs[id]['out_length'] = length - def tt_recvmsg_done(self, trace, t, core, id, length): + def tt_recvmsg_done(self, trace, t, core, id, status): global rpcs rpcs[id]['recvmsg_done'] = t From 59e662410d97dba911302351efe258d322555a01 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 2 May 2025 09:19:33 -0700 Subject: [PATCH 261/625] Fix issues related to sparse Reworked all of the __acquires/__releases/__must_hold declarations; many were missing or wrong. Also fixed issues related to RCU. 
--- homa_grant.c | 6 ++++-- homa_impl.h | 6 ++++++ homa_incoming.c | 34 +++++++++++++++++++++++----------- homa_interest.h | 3 ++- homa_outgoing.c | 23 +++++++++++++++-------- homa_pacer.c | 4 ++-- homa_pacer.h | 4 ++-- homa_peer.c | 2 +- homa_plumbing.c | 12 ++++++------ homa_rpc.c | 13 +++++++------ homa_rpc.h | 5 ++++- homa_sock.c | 5 +++-- homa_sock.h | 12 +++++++----- 13 files changed, 82 insertions(+), 47 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 86c00198..17f08de6 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -139,10 +139,11 @@ void homa_grant_destroy(struct homa_grant *grant) * homa_grant_init_rpc() - Initialize grant-related information for an * RPC's incoming message (may add the RPC to grant priority queues). * @rpc: RPC being initialized. Grant-related fields in msgin - * are assumed to be zero. + * are assumed to be zero. Must be locked by caller. * @unsched: Number of unscheduled bytes in the incoming message for @rpc. */ void homa_grant_init_rpc(struct homa_rpc *rpc, int unsched) + __must_hold(rpc_bucket_lock) { rpc->msgin.rank = -1; if (rpc->msgin.num_bpages == 0) @@ -165,6 +166,7 @@ void homa_grant_init_rpc(struct homa_rpc *rpc, int unsched) * may release and then reacquire the lock. */ void homa_grant_end_rpc(struct homa_rpc *rpc) + __must_hold(rpc_bucket_lock) { struct homa_grant *grant = rpc->hsk->homa->grant; struct homa_grant_candidates cand; @@ -878,7 +880,7 @@ void homa_grant_cand_check(struct homa_grant_candidates *cand, * @homa: Overall data about the Homa protocol implementation. */ void homa_grant_lock_slow(struct homa_grant *grant) - __acquires(&homa->grant_lock) + __acquires(&grant->lock) { u64 start = sched_clock(); diff --git a/homa_impl.h b/homa_impl.h index e6bc6fde..d2236866 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -83,6 +83,12 @@ void homa_throttle_lock_slow(struct homa *homa); #define sizeof32(type) ((int)(sizeof(type))) +#ifdef __CHECKER__ +#define __context__(x, y, z) __attribute__((context(x, y, z))) +#else +#define __context__(x, y, z) +#endif /* __CHECKER__ */ + /** * union sockaddr_in_union - Holds either an IPv4 or IPv6 address (smaller * and easier to use than sockaddr_storage). diff --git a/homa_incoming.c b/homa_incoming.c index 8a2e7a8e..e976ab24 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -40,6 +40,7 @@ int homa_message_in_init(struct homa_rpc *rpc, int length, int unsched) */ int homa_message_in_init(struct homa_rpc *rpc, int length) #endif /* See strip.py */ + __must_hold(rpc_bucket_lock) { int err; @@ -97,6 +98,7 @@ struct homa_gap *homa_gap_new(struct list_head *next, int start, int end) * @rpc: RPC to check; must be locked by caller. */ void homa_gap_retry(struct homa_rpc *rpc) + __must_hold(rpc_bucket_lock) { struct homa_resend_hdr resend; struct homa_gap *gap; @@ -121,6 +123,7 @@ void homa_gap_retry(struct homa_rpc *rpc) * (the packet will either be freed or added to rpc->msgin.packets). */ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) + __must_hold(rpc_bucket_lock) { struct homa_data_hdr *h = (struct homa_data_hdr *)skb->data; struct homa_gap *gap, *dummy, *gap2; @@ -240,8 +243,7 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) * if all available packets have been copied out. 
*/ int homa_copy_to_user(struct homa_rpc *rpc) - __releases(rpc->bucket_lock) - __acquires(rpc->bucket_lock) + __must_hold(rpc_bucket_lock) { #ifdef __UNIT_TEST__ #define MAX_SKBS 3 @@ -306,7 +308,7 @@ int homa_copy_to_user(struct homa_rpc *rpc) int buf_bytes, chunk_size; struct iov_iter iter; int copied = 0; - char *dst; + char __user *dst; /* Each iteration of this loop copies to one * user buffer. @@ -324,8 +326,8 @@ int homa_copy_to_user(struct homa_rpc *rpc) } chunk_size = buf_bytes; } - error = import_ubuf(READ, (void __user *)dst, - chunk_size, &iter); + error = import_ubuf(READ, dst, chunk_size, + &iter); if (error) goto free_skbs; error = skb_copy_datagram_iter(skbs[i], @@ -614,6 +616,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) * Must be locked by the caller. */ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) + __must_hold(rpc_bucket_lock) { struct homa_data_hdr *h = (struct homa_data_hdr *)skb->data; #ifndef __STRIP__ /* See strip.py */ @@ -712,8 +715,10 @@ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) * @skb: Incoming packet; size already verified large enough for header. * This function now owns the packet. * @rpc: Information about the RPC corresponding to this packet. + * Must be locked by caller. */ void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc) + __must_hold(rpc_bucket_lock) { struct homa_grant_hdr *h = (struct homa_grant_hdr *)skb->data; int new_offset = ntohl(h->offset); @@ -749,6 +754,7 @@ void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc) */ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, struct homa_sock *hsk) + __must_hold(rpc_bucket_lock) { struct homa_resend_hdr *h = (struct homa_resend_hdr *)skb->data; struct homa_busy_hdr busy; @@ -817,9 +823,11 @@ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, * homa_rpc_unknown_pkt() - Handler for incoming RPC_UNKNOWN packets. * @skb: Incoming packet; size known to be large enough for the header. * This function now owns the packet. - * @rpc: Information about the RPC corresponding to this packet. + * @rpc: Information about the RPC corresponding to this packet. Must + * be locked by caller. 
*/ void homa_rpc_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc) + __must_hold(rpc_bucket_lock) { tt_record3("Received unknown for id %llu, peer %x:%d", rpc->id, tt_addr(rpc->peer->addr), rpc->dport); @@ -904,6 +912,7 @@ void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk) */ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, struct homa_rpc *rpc) + __must_hold(rpc_bucket_lock) { struct homa_common_hdr *h = (struct homa_common_hdr *)skb->data; const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); @@ -967,7 +976,7 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, */ void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, struct homa_rpc *rpc) - __releases(rpc->bucket_lock) + __must_hold(rpc_bucket_lock) { const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); struct homa_ack_hdr *h = (struct homa_ack_hdr *)skb->data; @@ -986,12 +995,13 @@ void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, */ homa_rpc_hold(rpc); homa_rpc_unlock(rpc); - } - for (i = 0; i < count; i++) - homa_rpc_acked(hsk, &saddr, &h->acks[i]); - if (rpc) { + for (i = 0; i < count; i++) + homa_rpc_acked(hsk, &saddr, &h->acks[i]); homa_rpc_lock(rpc); homa_rpc_put(rpc); + } else { + for (i = 0; i < count; i++) + homa_rpc_acked(hsk, &saddr, &h->acks[i]); } } tt_record3("ACK received for id %d, peer 0x%x, with %d other acks", @@ -1093,6 +1103,7 @@ struct homa_rpc *homa_choose_fifo_grant(struct homa *homa) * we just free the RPC. */ void homa_rpc_abort(struct homa_rpc *rpc, int error) + __must_hold(rpc_bucket_lock) { if (!homa_is_client(rpc->id)) { INC_METRIC(server_rpc_discards, 1); @@ -1383,6 +1394,7 @@ struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking) * @rpc: RPC to handoff; must be locked. */ void homa_rpc_handoff(struct homa_rpc *rpc) + __must_hold(rpc_bucket_lock) { struct homa_sock *hsk = rpc->hsk; struct homa_interest *interest; diff --git a/homa_interest.h b/homa_interest.h index ce755e5a..e8607e18 100644 --- a/homa_interest.h +++ b/homa_interest.h @@ -20,7 +20,8 @@ struct homa_interest { * attention, or NULL if this is a shared interest and hsk has * been shutdown. If ready is not set, this will be NULL if the * interest is shared; if it's private, it holds the RPC the - * interest is associated with. + * interest is associated with. If non-NULL, a reference has been + * taken on the RPC. */ struct homa_rpc *rpc; diff --git a/homa_outgoing.c b/homa_outgoing.c index 2c60b938..6fe746f0 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -19,10 +19,12 @@ /** * homa_message_out_init() - Initialize rpc->msgout. - * @rpc: RPC whose output message should be initialized. + * @rpc: RPC whose output message should be initialized. Must be + * locked by caller. * @length: Number of bytes that will eventually be in rpc->msgout. */ void homa_message_out_init(struct homa_rpc *rpc, int length) + __must_hold(rpc_bucket_lock) { memset(&rpc->msgout, 0, sizeof(rpc->msgout)); rpc->msgout.length = length; @@ -41,7 +43,8 @@ void homa_message_out_init(struct homa_rpc *rpc, int length) * part of a data packet after the initial header, when GSO is being used * but TCP hijacking is not. As result, homa_seg_hdrs must be interleaved * with the data to provide the correct offset for each segment. - * @rpc: RPC whose output message is being created. + * @rpc: RPC whose output message is being created. Must be + * locked by caller. * @skb: The packet being filled. 
The initial homa_data_hdr was * created and initialized by the caller and the * homa_skb_info has been filled in with the packet geometry. @@ -55,7 +58,8 @@ void homa_message_out_init(struct homa_rpc *rpc, int length) * part of a data packet after the initial header, when GSO is being used. * homa_seg_hdrs must be interleaved with the data to provide the correct * offset for each segment. - * @rpc: RPC whose output message is being created. + * @rpc: RPC whose output message is being created. Must be + * locked by caller. * @skb: The packet being filled. The initial homa_data_hdr was * created and initialized by the caller and the * homa_skb_info has been filled in with the packet geometry. @@ -66,6 +70,7 @@ void homa_message_out_init(struct homa_rpc *rpc, int length) #endif /* See strip.py */ int homa_fill_data_interleaved(struct homa_rpc *rpc, struct sk_buff *skb, struct iov_iter *iter) + __must_hold(rpc_bucket_lock) { struct homa_skb_info *homa_info = homa_get_skb_info(skb); int seg_length = homa_info->seg_length; @@ -106,7 +111,7 @@ int homa_fill_data_interleaved(struct homa_rpc *rpc, struct sk_buff *skb, * data packet. The resulting packet will be a GSO packet that will eventually * be segmented by the NIC. * @rpc: RPC that packet will belong to (msgout must have been - * initialized). + * initialized). Must be locked by caller. * @iter: Describes location(s) of (remaining) message data in user * space. * @offset: Offset in the message of the first byte of data in this @@ -122,6 +127,7 @@ int homa_fill_data_interleaved(struct homa_rpc *rpc, struct sk_buff *skb, struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, struct iov_iter *iter, int offset, int length, int max_seg_data) + __must_hold(rpc_bucket_lock) { struct homa_skb_info *homa_info; struct homa_data_hdr *h; @@ -240,8 +246,7 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, * rpc->state will be RPC_DEAD. */ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) - __releases(rpc->bucket_lock) - __acquires(rpc->bucket_lock) + __must_hold(rpc_bucket_lock) { /* Geometry information for packets: * mtu: largest size for an on-the-wire packet (including @@ -406,6 +411,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) * @rpc: The packet will go to the socket that handles the other end * of this RPC. Addressing info for the packet, including all of * the fields of homa_common_hdr except type, will be set from this. + * Caller must hold either the lock or a reference. * * Return: Either zero (for success), or a negative errno value if there * was a problem. @@ -588,7 +594,7 @@ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk) * the NIC queue is sufficiently long. */ void homa_xmit_data(struct homa_rpc *rpc, bool force) - __must_hold(&rpc->bucket->lock) + __must_hold(rpc_bucket_lock) { struct homa *homa = rpc->hsk->homa; #ifndef __STRIP__ /* See strip.py */ @@ -739,7 +745,7 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc) * homa_resend_data() - This function is invoked as part of handling RESEND * requests. It retransmits the packet(s) containing a given range of bytes * from a message. - * @rpc: RPC for which data should be resent. + * @rpc: RPC for which data should be resent. Must be locked by caller. * @start: Offset within @rpc->msgout of the first byte to retransmit. * @end: Offset within @rpc->msgout of the byte just after the last one * to retransmit. 
@@ -759,6 +765,7 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end, */ void homa_resend_data(struct homa_rpc *rpc, int start, int end) #endif /* See strip.py */ + __must_hold(rpc_bucket_lock) { struct homa_skb_info *homa_info; struct sk_buff *skb; diff --git a/homa_pacer.c b/homa_pacer.c index 1946804f..6218bd72 100644 --- a/homa_pacer.c +++ b/homa_pacer.c @@ -325,7 +325,7 @@ void homa_pacer_xmit(struct homa_pacer *pacer) * sent because of NIC queue restrictions. Must be locked by caller. */ void homa_pacer_manage_rpc(struct homa_rpc *rpc) - __must_hold(&rpc->bucket->lock) + __must_hold(rpc_bucket_lock) { struct homa_pacer *pacer = rpc->hsk->homa->pacer; struct homa_rpc *candidate; @@ -375,7 +375,7 @@ void homa_pacer_manage_rpc(struct homa_rpc *rpc) * @rpc: RPC of interest. */ void homa_pacer_unmanage_rpc(struct homa_rpc *rpc) - __must_hold(&rpc->bucket->lock) + __must_hold(rpc_bucket_lock) { struct homa_pacer *pacer = rpc->hsk->homa->pacer; diff --git a/homa_pacer.h b/homa_pacer.h index d8eb8e7f..68f803bd 100644 --- a/homa_pacer.h +++ b/homa_pacer.h @@ -190,7 +190,7 @@ static inline void homa_pacer_check(struct homa_pacer *pacer) * @pacer: Pacer information for a Homa transport. */ static inline void homa_pacer_throttle_lock(struct homa_pacer *pacer) - __acquires(&homa->pacer.throttle_lock) + __acquires(&pacer->throttle_lock) { if (!spin_trylock_bh(&pacer->throttle_lock)) homa_pacer_throttle_lock_slow(pacer); @@ -201,7 +201,7 @@ static inline void homa_pacer_throttle_lock(struct homa_pacer *pacer) * @pacer: Pacer information for a Homa transport. */ static inline void homa_pacer_throttle_lock(struct homa_pacer *pacer) - __acquires(&homa->pacer.throttle_lock) + __acquires(&pacer->throttle_lock) { spin_lock_bh(&pacer->throttle_lock); } diff --git a/homa_peer.c b/homa_peer.c index e825f010..2dbfd07f 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -386,7 +386,7 @@ void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, int c2, /** * homa_peer_lock_slow() - This function implements the slow path for - * acquiring a peer's @unacked_lock. It is invoked when the lock isn't + * acquiring a peer's @ack_lock. It is invoked when the lock isn't * immediately available. It waits for the lock, but also records statistics * about the waiting time. * @peer: Peer to lock. 
diff --git a/homa_plumbing.c b/homa_plumbing.c index 29a1a5ec..ed0b5c58 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1561,7 +1561,7 @@ __poll_t homa_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { struct homa_sock *hsk = homa_sk(sock->sk); - u32 mask; + __poll_t mask; mask = 0; sock_poll_wait(file, sock, wait); @@ -1569,17 +1569,17 @@ __poll_t homa_poll(struct file *file, struct socket *sock, refcount_read(&hsk->sock.sk_wmem_alloc), hsk->sock.sk_sndbuf); if (homa_sock_wmem_avl(hsk)) - mask |= POLLOUT | POLLWRNORM; + mask |= EPOLLOUT | EPOLLWRNORM; else set_bit(SOCK_NOSPACE, &hsk->sock.sk_socket->flags); if (hsk->shutdown) - mask |= POLLIN; + mask |= EPOLLIN; if (!list_empty(&hsk->ready_rpcs)) - mask |= POLLIN | POLLRDNORM; - tt_record1("homa_poll returning mask 0x%x", mask); - return (__poll_t)mask; + mask |= EPOLLIN | EPOLLRDNORM; + tt_record1("homa_poll returning mask 0x%x", (__force int)mask); + return mask; } #ifndef __STRIP__ /* See strip.py */ diff --git a/homa_rpc.c b/homa_rpc.c index 6324210d..296c43fe 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -29,7 +29,7 @@ */ struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk, const union sockaddr_in_union *dest) - __acquires(&crpc->bucket->lock) + __acquires(rpc_bucket_lock) { struct in6_addr dest_addr_as_ipv6 = canonical_ipv6_addr(dest); struct homa_rpc_bucket *bucket; @@ -81,7 +81,9 @@ struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk, goto error; } hlist_add_head(&crpc->hash_links, &bucket->rpcs); + rcu_read_lock(); list_add_tail_rcu(&crpc->active_links, &hsk->active_rpcs); + rcu_read_unlock(); homa_sock_unlock(hsk); return crpc; @@ -110,7 +112,7 @@ struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk, struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, const struct in6_addr *source, struct homa_data_hdr *h, int *created) - __acquires(&srpc->bucket->lock) + __acquires(rpc_bucket_lock) { u64 id = homa_local_id(h->common.sender_id); struct homa_rpc_bucket *bucket; @@ -245,8 +247,7 @@ void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, * use the RPC except to unlock it. */ void homa_rpc_end(struct homa_rpc *rpc) - __acquires(&rpc->hsk->lock) - __releases(&rpc->hsk->lock) + __must_hold(rpc_bucket_lock) { /* The goal for this function is to make the RPC inaccessible, * so that no other code will ever access it again. However, don't @@ -499,7 +500,7 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) * by invoking homa_rpc_unlock. */ struct homa_rpc *homa_find_client_rpc(struct homa_sock *hsk, u64 id) - __acquires(&crpc->bucket->lock) + __cond_acquires(rpc_bucket_lock) { struct homa_rpc_bucket *bucket = homa_client_rpc_bucket(hsk, id); struct homa_rpc *crpc; @@ -526,7 +527,7 @@ struct homa_rpc *homa_find_client_rpc(struct homa_sock *hsk, u64 id) */ struct homa_rpc *homa_find_server_rpc(struct homa_sock *hsk, const struct in6_addr *saddr, u64 id) - __acquires(&srpc->bucket->lock) + __acquires(rpc_bucket_lock) { struct homa_rpc_bucket *bucket = homa_server_rpc_bucket(hsk, id); struct homa_rpc *srpc; diff --git a/homa_rpc.h b/homa_rpc.h index dcf08138..0efee70d 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -348,7 +348,7 @@ struct homa_rpc { * The next field will be LIST_POISON1 if this RPC hasn't yet been * linked into @hsk->active_rpcs. Access with RCU. */ - struct list_head __rcu active_links; + struct list_head active_links; /** @dead_links: For linking this object into @hsk->dead_rpcs. 
*/ struct list_head dead_links; @@ -452,6 +452,7 @@ int homa_validate_incoming(struct homa *homa, int verbose, * doing! See sync.txt for more info on locking. */ static inline void homa_rpc_lock(struct homa_rpc *rpc) + __acquires(rpc_bucket_lock) { homa_bucket_lock(rpc->bucket, rpc->id); } @@ -463,6 +464,7 @@ static inline void homa_rpc_lock(struct homa_rpc *rpc) * currently owned by someone else. */ static inline int homa_rpc_try_lock(struct homa_rpc *rpc) + __cond_acquires(rpc_bucket_lock) { if (!spin_trylock_bh(&rpc->bucket->lock)) return 0; @@ -474,6 +476,7 @@ static inline int homa_rpc_try_lock(struct homa_rpc *rpc) * @rpc: RPC to unlock. */ static inline void homa_rpc_unlock(struct homa_rpc *rpc) + __releases(rpc_bucket_lock) { homa_bucket_unlock(rpc->bucket, rpc->id); } diff --git a/homa_sock.c b/homa_sock.c index 24a49001..d0d0009b 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -88,7 +88,7 @@ struct homa_sock *homa_socktab_start_scan(struct homa_socktab *socktab, */ struct homa_sock *homa_socktab_next(struct homa_socktab_scan *scan) { - struct hlist_head __rcu *bucket; + struct hlist_head *bucket; struct hlist_node *next; rcu_read_lock(); @@ -215,6 +215,7 @@ int homa_sock_init(struct homa_sock *hsk, struct homa *homa) * homa_sock_unlink() - Unlinks a socket from its socktab and does * related cleanups. Once this method returns, the socket will not be * discoverable through the socktab. + * @hsk: Socket to unlink. */ void homa_sock_unlink(struct homa_sock *hsk) { @@ -429,7 +430,7 @@ void homa_sock_lock_slow(struct homa_sock *hsk) * Used only for metrics. */ void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, u64 id) - __acquires(&bucket->lock) + __acquires(rpc_bucket_lock) { u64 start = sched_clock(); diff --git a/homa_sock.h b/homa_sock.h index ff7d1f75..2f89839e 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -38,7 +38,7 @@ struct homa_socktab { * @buckets: Heads of chains for hash table buckets. Chains * consist of homa_sock objects. */ - struct hlist_head __rcu buckets[HOMA_SOCKTAB_BUCKETS]; + struct hlist_head buckets[HOMA_SOCKTAB_BUCKETS]; }; /** @@ -76,7 +76,7 @@ struct homa_rpc_bucket { * this bucket. This dual purpose permits clean and safe * deletion and garbage collection of RPCs. */ - spinlock_t lock; + spinlock_t lock __context__(rpc_bucket_lock, 1, 1); /** * @id: identifier for this bucket, used in error messages etc. @@ -169,7 +169,7 @@ struct homa_sock { int ip_header_length; /** @socktab_links: Links this socket into a homa_socktab bucket. */ - struct hlist_node __rcu socktab_links; + struct hlist_node socktab_links; /** * @active_rpcs: List of all existing RPCs related to this socket, @@ -180,7 +180,7 @@ struct homa_sock { * The list is sorted, with the oldest RPC first. Manipulate with * RCU so timer can access without locking. */ - struct list_head __rcu active_rpcs; + struct list_head active_rpcs; /** * @dead_rpcs: Contains RPCs for which homa_rpc_end has been @@ -358,6 +358,7 @@ static inline struct homa_rpc_bucket * Used only for metrics. */ static inline void homa_bucket_lock(struct homa_rpc_bucket *bucket, u64 id) + __acquires(rpc_bucket_lock) { if (!spin_trylock_bh(&bucket->lock)) homa_bucket_lock_slow(bucket, id); @@ -370,6 +371,7 @@ static inline void homa_bucket_lock(struct homa_rpc_bucket *bucket, u64 id) * Used only for metrics. 
*/ static inline void homa_bucket_lock(struct homa_rpc_bucket *bucket, u64 id) + __acquires(rpc_bucket_lock) { spin_lock_bh(&bucket->lock); } @@ -381,7 +383,7 @@ static inline void homa_bucket_lock(struct homa_rpc_bucket *bucket, u64 id) * @id: ID of the RPC that was using the lock. */ static inline void homa_bucket_unlock(struct homa_rpc_bucket *bucket, u64 id) - __releases(&bucket->lock) + __releases(rpc_bucket_lock) { spin_unlock_bh(&bucket->lock); } From 888cfc246fd25b7453236511ec8efba03bc9cbb1 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 2 May 2025 10:37:16 -0700 Subject: [PATCH 262/625] Update current Linux version in README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ddd6b247..af661785 100644 --- a/README.md +++ b/README.md @@ -25,14 +25,14 @@ This repo contains an implementation of the Homa transport protocol as a Linux k - Please contact me if you have any problems using this repo; I'm happy to provide advice and support. -- The head is known to work under Linux 6.10.6. In the past, Homa has +- The head is known to work under Linux 6.13.9. In the past, Homa has run under several earlier versions of Linux. There is a separate branch for each of these older versions, with names such as linux_4.15.18. Older branches are out of date feature-wise: recent commits have not been back-ported to them. Other versions of Linux have not been tested and - may require code changes (these upgrades rarely take more than a couple - of hours). If you get Homa working on some other version, please submit a + may require code changes (these upgrades rarely long). If you get Homa + working on some other version, please submit a pull request with the required code changes. - Related work that you may find useful: From 1b96a63a8780ec5c2333123fbf2c1e64d005fd75 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 2 May 2025 10:46:54 -0700 Subject: [PATCH 263/625] Fix issues from kernel-doc and checkpatch --- homa.h | 3 +++ homa_grant.c | 12 ++++++------ homa_grant.h | 11 +++++++---- homa_impl.h | 19 ++++--------------- homa_metrics.c | 4 +--- homa_metrics.h | 9 ++++++--- homa_offload.h | 2 +- homa_pacer.c | 4 ++-- homa_plumbing.c | 13 ------------- homa_timer.c | 8 ++++---- homa_utils.c | 2 +- 11 files changed, 35 insertions(+), 52 deletions(-) diff --git a/homa.h b/homa.h index 2db52801..26e52e15 100644 --- a/homa.h +++ b/homa.h @@ -159,7 +159,10 @@ struct homa_abort_args { */ int error; + /** @_pad1: Reserved. */ int _pad1; + + /** @_pad2: Reserved. */ __u64 _pad2[2]; }; diff --git a/homa_grant.c b/homa_grant.c index 17f08de6..d947981a 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -232,7 +232,7 @@ int homa_grant_outranks(struct homa_rpc *rpc1, struct homa_rpc *rpc2) grant_diff = (rpc1->msgin.length - rpc1->msgin.granted) - (rpc2->msgin.length - rpc2->msgin.granted); return grant_diff < 0 || ((grant_diff == 0) && - (rpc1->msgin.birth < rpc2->msgin.birth)); + (rpc1->msgin.birth < rpc2->msgin.birth)); } /** @@ -298,7 +298,7 @@ struct homa_rpc *homa_grant_insert_active(struct homa_rpc *rpc) /* All the other RPCs with the same peer are higher * priority than @rpc and we can't have any more RPCs * with the same peer, so bump @rpc. - */ + */ return rpc; /* Bump the lowest priority RPC from the same peer to make room @@ -672,7 +672,7 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) /* This function handles 4 different things: * 1. It generates new grant packets for @rpc if appropriate. This - * is the common case. 
+ * is the common case. * 2. If total_incoming had been exhausted, but headroom is now * available, it sends grants to the highest priority RPC that * needs them, which may not be @rpc. @@ -690,7 +690,7 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) * occasionally scans all the RPCs in active_rpcs to fix any priority * inversions that may have developed. The interval for these scans * is chosen so as not to create too much contention for the grant lock. - */ + */ now = sched_clock(); limit = atomic_xchg(&grant->incoming_hit_limit, false); recalc = now >= READ_ONCE(grant->next_recalc); @@ -830,7 +830,7 @@ void homa_grant_find_oldest(struct homa *homa) * be incremented */ void homa_grant_cand_add(struct homa_grant_candidates *cand, - struct homa_rpc *rpc) + struct homa_rpc *rpc) { if (cand->inserts < cand->removes + HOMA_MAX_CAND_RPCS) { homa_rpc_hold(rpc); @@ -877,7 +877,7 @@ void homa_grant_cand_check(struct homa_grant_candidates *cand, * acquiring the grant lock. It is invoked when the lock isn't immediately * available. It waits for the lock, but also records statistics about * the waiting time. - * @homa: Overall data about the Homa protocol implementation. + * @grant: Grant management information. */ void homa_grant_lock_slow(struct homa_grant *grant) __acquires(&grant->lock) diff --git a/homa_grant.h b/homa_grant.h index aa6beee7..af1456b4 100644 --- a/homa_grant.h +++ b/homa_grant.h @@ -80,7 +80,8 @@ struct homa_grant { /** * @num_grantable_rpcs: Total number of RPCs with incoming * messages that still need grants. Includes entries in both - * @active_rpcs and @grantable_peers. */ + * @active_rpcs and @grantable_peers. + */ int num_grantable_rpcs; /** @@ -203,7 +204,7 @@ struct homa_grant_candidates { * @removes: Total number of RPCs that have been removed from this * structure over its lifetime. Low-order bits give index of next * RPC to be checked for possible grant. - * */ + */ u32 removes; /* Maximum number of RPCs that can be stored in @rpcs. If space @@ -213,6 +214,8 @@ struct homa_grant_candidates { */ #define HOMA_MAX_CAND_RPCS 8 #define HOMA_CAND_MASK (HOMA_MAX_CAND_RPCS - 1) + + /** @rpcs: RPCs that should be considered for sending grants. */ struct homa_rpc *rpcs[HOMA_MAX_CAND_RPCS]; }; @@ -290,7 +293,7 @@ static inline void homa_grant_lock(struct homa_grant *grant) /** * homa_grant_unlock() - Release the grant lock. - * @homa: Overall data about the Homa protocol implementation. + * @grant: Grant management info. */ static inline void homa_grant_unlock(struct homa_grant *grant) __releases(&grant->grant_lock) @@ -299,4 +302,4 @@ static inline void homa_grant_unlock(struct homa_grant *grant) spin_unlock_bh(&grant->lock); } -#endif /* _HOMA_GRANT_H */ \ No newline at end of file +#endif /* _HOMA_GRANT_H */ diff --git a/homa_impl.h b/homa_impl.h index d2236866..1421171a 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -86,7 +86,7 @@ void homa_throttle_lock_slow(struct homa *homa); #ifdef __CHECKER__ #define __context__(x, y, z) __attribute__((context(x, y, z))) #else -#define __context__(x, y, z) +#define __context__(...) 
#endif /* __CHECKER__ */ /** @@ -180,7 +180,7 @@ struct homa { struct page **skb_pages_to_free; /** - * @pages_to_free_slot: Maximum number of pages that can be + * @pages_to_free_slots: Maximum number of pages that can be * stored in skb_pages_to_free; */ int pages_to_free_slots; @@ -398,7 +398,7 @@ struct homa { #define HOMA_GRO_NORMAL (HOMA_GRO_SAME_CORE | HOMA_GRO_GEN2 | \ HOMA_GRO_SHORT_BYPASS | HOMA_GRO_FAST_GRANTS) - /* + /** * @busy_usecs: if there has been activity on a core within the * last @busy_usecs, it is considered to be busy and Homa will * try to avoid scheduling other activities on the core. See @@ -409,7 +409,7 @@ struct homa { /** @busy_ns: Same as busy_usecs except in sched_clock() units. */ int busy_ns; - /* + /** * @gro_busy_usecs: if the gap between the completion of * homa_gro_receive and the next call to homa_gro_receive on the same * core is less than this, then GRO on that core is considered to be @@ -736,13 +736,8 @@ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk); struct homa_rpc *homa_choose_fifo_grant(struct homa *homa); void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk); -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) -int homa_dointvec(const struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -#else int homa_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -#endif void homa_incoming_sysctl_changed(struct homa *homa); int homa_ioc_abort(struct sock *sk, int *karg); int homa_message_in_init(struct homa_rpc *rpc, int length, @@ -750,11 +745,6 @@ int homa_message_in_init(struct homa_rpc *rpc, int length, void homa_prios_changed(struct homa *homa); void homa_resend_data(struct homa_rpc *rpc, int start, int end, int priority); -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) -int homa_sysctl_softirq_cores(const struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -#else int homa_sysctl_softirq_cores(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); @@ -764,7 +754,6 @@ int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors); void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority); -#endif #else /* See strip.py */ int homa_message_in_init(struct homa_rpc *rpc, int unsched); void homa_resend_data(struct homa_rpc *rpc, int start, int end); diff --git a/homa_metrics.c b/homa_metrics.c index ed1f1c47..67dd0416 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -72,10 +72,8 @@ void homa_metric_append(const char *format, ...) homa_mout.capacity = 4096; #endif homa_mout.output = kmalloc(homa_mout.capacity, GFP_KERNEL); - if (!homa_mout.output) { - pr_warn("%s couldn't allocate memory\n", __func__); + if (!homa_mout.output) return; - } homa_mout.length = 0; } diff --git a/homa_metrics.h b/homa_metrics.h index f4560ad2..f4d026a8 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -633,12 +633,12 @@ DECLARE_PER_CPU(struct homa_metrics, homa_metrics); */ struct homa_metrics_output { /** - * @metrics_mutex: Used to synchronize accesses to @active_opens + * @mutex: Used to synchronize accesses to @active_opens * and updates to @output. */ struct mutex mutex; - /* + /** * @output: a human-readable string containing recent values * for all the Homa performance metrics, as generated by * homa_append_metric. 
This string is kmalloc-ed; NULL means @@ -661,7 +661,10 @@ struct homa_metrics_output { */ int active_opens; - /* Used to remove /proc/net/homa_metrics when the module is unloaded. */ + /** + * @dir_entry: Used to remove /proc/net/homa_metrics when the + * module is unloaded. + */ struct proc_dir_entry *dir_entry; }; diff --git a/homa_offload.h b/homa_offload.h index f5d1d106..3e5562db 100644 --- a/homa_offload.h +++ b/homa_offload.h @@ -58,7 +58,7 @@ struct homa_offload_core { u64 last_app_active; /** - * held_skb: last packet buffer known to be available for + * @held_skb: last packet buffer known to be available for * merging other packets into on this core (note: may not still * be available), or NULL if none. */ diff --git a/homa_pacer.c b/homa_pacer.c index 6218bd72..3f44d4af 100644 --- a/homa_pacer.c +++ b/homa_pacer.c @@ -247,7 +247,7 @@ void homa_pacer_xmit(struct homa_pacer *pacer) while (1) { queue_ns = atomic64_read(&pacer->link_idle_time) - sched_clock(); if (queue_ns >= pacer->max_nic_queue_ns) - break; + break; if (list_empty(&pacer->throttled_rpcs)) break; @@ -301,7 +301,7 @@ void homa_pacer_xmit(struct homa_pacer *pacer) */ #ifndef __STRIP__ /* See strip.py */ if (!*rpc->msgout.next_xmit || rpc->msgout.next_xmit_offset >= - rpc->msgout.granted) { + rpc->msgout.granted) { #else /* See strip.py */ if (!*rpc->msgout.next_xmit) { #endif /* See strip.py */ diff --git a/homa_plumbing.c b/homa_plumbing.c index ed0b5c58..bdc22f58 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -392,9 +392,6 @@ static struct ctl_table homa_ctl_table[] = { .mode = 0644, .proc_handler = homa_dointvec }, -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) - {} -#endif }; #endif /* See strip.py */ @@ -1595,13 +1592,8 @@ __poll_t homa_poll(struct file *file, struct socket *sock, * * Return: 0 for success, nonzero for error. */ -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) -int homa_dointvec(const struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -#else int homa_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) -#endif { struct homa *homa = homa_from_net(current->nsproxy->net_ns); struct ctl_table table_copy; @@ -1685,13 +1677,8 @@ int homa_dointvec(const struct ctl_table *table, int write, * * Return: 0 for success, nonzero for error. 
*/ -#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 12, 0) -int homa_sysctl_softirq_cores(const struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -#else int homa_sysctl_softirq_cores(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) -#endif { struct homa_offload_core *offload_core; struct ctl_table table_copy; diff --git a/homa_timer.c b/homa_timer.c index 16521201..067ae82c 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -148,16 +148,16 @@ void homa_check_rpc(struct homa_rpc *rpc) rpc->dport, rpc->msgin.recv_end); /* Should be if (homa->verbose) */ // pr_notice("Homa client RESEND to %s:%d for id %llu, offset %d\n", - // homa_print_ipv6_addr(&rpc->peer->addr), - // rpc->dport, rpc->id, rpc->msgin.recv_end); + // homa_print_ipv6_addr(&rpc->peer->addr), + // rpc->dport, rpc->id, rpc->msgin.recv_end); } else { tt_record4("Sent RESEND for server RPC id %llu, client 0x%x:%d offset %d", rpc->id, tt_addr(rpc->peer->addr), rpc->dport, rpc->msgin.recv_end); /* Should be if (homa->verbose) */ // pr_notice("Homa server RESEND to %s:%d for id %llu, offset %d\n", - // homa_print_ipv6_addr(&rpc->peer->addr), - // rpc->dport, rpc->id, rpc->msgin.recv_end); + // homa_print_ipv6_addr(&rpc->peer->addr), + // rpc->dport, rpc->id, rpc->msgin.recv_end); } #endif /* See strip.py */ } diff --git a/homa_utils.c b/homa_utils.c index e5ce2814..9df14b39 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -33,7 +33,7 @@ int homa_init(struct homa *homa, struct net *net) int i; _Static_assert(HOMA_MAX_PRIORITIES >= 8, - "homa_init assumes at least 8 priority levels"); + "Homa requires at least 8 priority levels"); #endif /* See strip.py */ memset(homa, 0, sizeof(*homa)); From 320030ec947877d9ad25286d7dac5bc334195ce0 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 2 May 2025 10:58:57 -0700 Subject: [PATCH 264/625] Fix problems compiling with __STRIP__=1 --- homa_utils.c | 2 ++ test/unit_homa_pacer.c | 7 +++++++ test/unit_homa_utils.c | 2 +- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/homa_utils.c b/homa_utils.c index 9df14b39..e672db1a 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -143,10 +143,12 @@ void homa_destroy(struct homa *homa) kfree(homa->port_map); homa->port_map = NULL; } +#ifndef __STRIP__ /* See strip.py */ if (homa->grant) { homa_grant_destroy(homa->grant); homa->grant = NULL; } +#endif /* See strip.py */ if (homa->pacer) { homa_pacer_destroy(homa->pacer); homa->pacer = NULL; diff --git a/test/unit_homa_pacer.c b/test/unit_homa_pacer.c index fe0d8fba..5a766788 100644 --- a/test/unit_homa_pacer.c +++ b/test/unit_homa_pacer.c @@ -124,8 +124,13 @@ TEST_F(homa_pacer, homa_pacer_destroy__basics) EXPECT_FALSE(IS_ERR(pacer)); unit_log_clear(); homa_pacer_destroy(pacer); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("unregister_net_sysctl_table; kthread_stop", unit_log_get()); +#else /* See strip.py */ + EXPECT_STREQ("kthread_stop", + unit_log_get()); +#endif /* See strip.py */ } TEST_F(homa_pacer, homa_pacer_destroy__no_thread) { @@ -136,7 +141,9 @@ TEST_F(homa_pacer, homa_pacer_destroy__no_thread) pacer->kthread = NULL; unit_log_clear(); homa_pacer_destroy(pacer); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("unregister_net_sysctl_table", unit_log_get()); +#endif /* See strip.py */ } TEST_F(homa_pacer, homa_pacer_check_nic_q__success) diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index dc823d86..bf63f5bc 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -63,6 +63,7 @@ 
TEST_F(homa_utils, homa_init__kmalloc_failure_for_port_map)
 	EXPECT_EQ(NULL, homa2.port_map);
 	homa_destroy(&homa2);
 }
+#ifndef __STRIP__ /* See strip.py */
 TEST_F(homa_utils, homa_init__kmalloc_failure_for_peers)
 {
 	struct homa homa2;
@@ -74,7 +75,6 @@ TEST_F(homa_utils, homa_init__kmalloc_failure_for_peers)
 	EXPECT_EQ(NULL, homa2.peers);
 	homa_destroy(&homa2);
 }
-#ifndef __STRIP__ /* See strip.py */
 TEST_F(homa_utils, homa_init__homa_skb_init_failure)
 {
 	struct homa homa2;

From 06a878c95eda744dce8a6f503c877f1fb587a331 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 2 May 2025 14:13:10 -0700
Subject: [PATCH 265/625] Add Kconfig file for Linux kernel sources

---
 Kconfig  | 21 +++++++++++++++++++++
 Makefile |  1 +
 2 files changed, 22 insertions(+)
 create mode 100644 Kconfig

diff --git a/Kconfig b/Kconfig
new file mode 100644
index 00000000..8ce5fbf0
--- /dev/null
+++ b/Kconfig
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: BSD-2-Clause
+#
+# Homa transport protocol
+#
+
+menuconfig HOMA
+	tristate "The Homa transport protocol"
+	depends on INET
+	depends on IPV6
+
+	help
+	  Homa is a network transport protocol for communication within
+	  a datacenter. It provides significantly lower latency than TCP,
+	  particularly for workloads containing a mixture of large and small
+	  messages operating at high network utilization. At present, Homa
+	  has been only partially upstreamed; this version provides bare-bones
+	  functionality but is not performant. For more information see the
+	  homa(7) man page or check out the Homa Wiki at
+	  https://homa-transport.atlassian.net/wiki/spaces/HOMA/overview.
+
+	  If unsure, say N.
diff --git a/Makefile b/Makefile
index 9e8099e7..616a9373 100644
--- a/Makefile
+++ b/Makefile
@@ -66,6 +66,7 @@ CP_HDRS := homa_impl.h \
 CP_SRCS := $(patsubst %.o,%.c,$(filter-out homa_devel.o timetrace.o, $(HOMA_OBJS)))
 CP_EXTRAS := reap.txt \
 	     sync.txt \
+	     Kconfig \
 	     Makefile
 CP_TARGETS := $(patsubst %,$(HOMA_TARGET)/%,$(CP_HDRS) $(CP_SRCS) $(CP_EXTRAS))
 net-next: $(CP_TARGETS) $(LINUX_SRC_DIR)/include/uapi/linux/homa.h

From 05bb800fe5ba2b05c5b07fe5f7b74758598b5590 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 2 May 2025 15:08:04 -0700
Subject: [PATCH 266/625] Fix compilation issues under 6.15.0

---
 homa_offload.c  | 20 ++++++++++++++++++--
 homa_plumbing.c |  5 +++++
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/homa_offload.c b/homa_offload.c
index 1ee41a9b..0eab9739 100644
--- a/homa_offload.c
+++ b/homa_offload.c
@@ -367,13 +367,18 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list,
 	 */
 	hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
 	if (offload_core->held_skb) {
-		/* Reverse-engineer the location of the napi_struct, so we
+		/* Reverse-engineer the location of the gro_node, so we
 		 * can verify that held_skb is still valid.
 		 */
 		struct gro_list *gro_list = container_of(held_list,
 							 struct gro_list, list);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 15, 0)
 		struct napi_struct *napi = container_of(gro_list,
 				struct napi_struct, gro_hash[hash]);
+#else
+		struct gro_node *gro_node = container_of(gro_list,
+				struct gro_node, hash[hash]);
+#endif
 
 		/* Must verify that offload_core->held_skb points to a packet on
 		 * the list, and that the packet is a Homa packet.
@@ -383,7 +388,11 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list,
 		 * some other protocol).
*/ list_for_each_entry(held_skb, +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 15, 0) &napi->gro_hash[offload_core->held_bucket].list, +#else + &gro_node->hash[offload_core->held_bucket].list, +#endif list) { int protocol; @@ -418,7 +427,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, * returning skb as result is no longer * sufficient (as of 5.4.80) to push it up * the stack; the packet just gets queued on - * napi->rx_list. This code basically steals + * gro_node->rx_list. This code basically steals * the packet from dev_gro_receive and * pushes it upward. */ @@ -426,10 +435,17 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, homa_gro_complete(held_skb, 0); netif_receive_skb(held_skb); homa_send_ipis(); +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 15, 0) napi->gro_hash[offload_core->held_bucket].count--; if (napi->gro_hash[offload_core->held_bucket].count == 0) __clear_bit(offload_core->held_bucket, &napi->gro_bitmask); +#else + gro_node->hash[offload_core->held_bucket].count--; + if (gro_node->hash[offload_core->held_bucket].count == 0) + __clear_bit(offload_core->held_bucket, + &gro_node->bitmask); +#endif result = ERR_PTR(-EINPROGRESS); } goto done; diff --git a/homa_plumbing.c b/homa_plumbing.c index bdc22f58..8b9bf28d 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1768,8 +1768,13 @@ int homa_timer_main(void *transport) ktime_t tick_interval; u64 nsec; +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 15, 0) hrtimer_init(&homa->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); homa->hrtimer.function = &homa_hrtimer; +#else + hrtimer_setup(&homa->hrtimer, homa_hrtimer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); +#endif nsec = 1000000; /* 1 ms */ tick_interval = ns_to_ktime(nsec); while (1) { From e8d047e97c5cd47a1fb18dc231bee9be7f71f933 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 2 May 2025 15:43:34 -0700 Subject: [PATCH 267/625] Add homa_pacer.o to Makefile.upstream --- Makefile.upstream | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.upstream b/Makefile.upstream index 2b3c3aff..ed894eba 100644 --- a/Makefile.upstream +++ b/Makefile.upstream @@ -6,6 +6,7 @@ obj-$(CONFIG_HOMA) := homa.o homa-y:= homa_incoming.o \ homa_interest.o \ homa_outgoing.o \ + homa_pacer.o \ homa_peer.o \ homa_plumbing.o \ homa_pool.o \ From 35584f4227b1efca53f896cb28ec731c44b01ef8 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 2 May 2025 16:49:07 -0700 Subject: [PATCH 268/625] Minor fix to sparse annotation --- homa_rpc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/homa_rpc.c b/homa_rpc.c index 296c43fe..9882dcc5 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -527,7 +527,7 @@ struct homa_rpc *homa_find_client_rpc(struct homa_sock *hsk, u64 id) */ struct homa_rpc *homa_find_server_rpc(struct homa_sock *hsk, const struct in6_addr *saddr, u64 id) - __acquires(rpc_bucket_lock) + __cond_acquires(rpc_bucket_lock) { struct homa_rpc_bucket *bucket = homa_server_rpc_bucket(hsk, id); struct homa_rpc *srpc; From fb806cabf1ae5a05fa6607b3c45ca9f6580a5008 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 2 May 2025 16:49:22 -0700 Subject: [PATCH 269/625] Fix #include errors when stripped --- homa_sock.c | 1 - homa_utils.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/homa_sock.c b/homa_sock.c index d0d0009b..e3e6e175 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -3,7 +3,6 @@ /* This file manages homa_sock and homa_socktab objects. 
*/ #include "homa_impl.h" -#include "homa_grant.h" #include "homa_interest.h" #include "homa_peer.h" #include "homa_pool.h" diff --git a/homa_utils.c b/homa_utils.c index e672db1a..8b665538 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -5,11 +5,11 @@ */ #include "homa_impl.h" -#include "homa_grant.h" #include "homa_pacer.h" #include "homa_peer.h" #include "homa_rpc.h" #ifndef __STRIP__ /* See strip.py */ +#include "homa_grant.h" #include "homa_skb.h" #endif /* See strip.py */ From 427fb6a36de823a6880f2423362947aab8d8eb29 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Sun, 4 May 2025 17:39:20 -0700 Subject: [PATCH 270/625] Use wait_event_interruptible in homa_pacer_main (not wait_event) --- homa_pacer.c | 9 ++++++--- test/mock.c | 8 +++++++- test/mock.h | 1 + test/unit_homa_pacer.c | 7 +++++++ 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/homa_pacer.c b/homa_pacer.c index 3f44d4af..2cc0839b 100644 --- a/homa_pacer.c +++ b/homa_pacer.c @@ -193,6 +193,7 @@ int homa_pacer_check_nic_q(struct homa_pacer *pacer, struct sk_buff *skb, int homa_pacer_main(void *arg) { struct homa_pacer *pacer = arg; + int status; while (1) { if (pacer->exit) @@ -212,9 +213,11 @@ int homa_pacer_main(void *arg) } tt_record("pacer sleeping"); - wait_event(pacer->wait_queue, pacer->exit || - !list_empty(&pacer->throttled_rpcs)); - tt_record("pacer woke up"); + status = wait_event_interruptible(pacer->wait_queue, + pacer->exit || !list_empty(&pacer->throttled_rpcs)); + tt_record1("pacer woke up with status %d", status); + if (status != 0 && status != -ERESTARTSYS) + break; } kthread_complete_and_exit(&pacer->kthread_done, 0); return 0; diff --git a/test/mock.c b/test/mock.c index 89847026..c1984b0e 100644 --- a/test/mock.c +++ b/test/mock.c @@ -55,6 +55,11 @@ int mock_trylock_errors; int mock_vmalloc_errors; int mock_wait_intr_irq_errors; +/* The value that prepare_to_wait_event should return when + * mock_prepare_to_wait_errors is nonzero. + */ +int mock_prepare_to_wait_status = -ERESTARTSYS; + /* The return value from calls to signal_pending(). 
*/ int mock_signal_pending; @@ -1007,7 +1012,7 @@ long prepare_to_wait_event(struct wait_queue_head *wq_head, { UNIT_HOOK("prepare_to_wait"); if (mock_check_error(&mock_prepare_to_wait_errors)) - return -ERESTARTSYS; + return mock_prepare_to_wait_status; return 0; } @@ -2001,6 +2006,7 @@ void mock_teardown(void) mock_trylock_errors = 0; mock_vmalloc_errors = 0; memset(&mock_task, 0, sizeof(mock_task)); + mock_prepare_to_wait_status = -ERESTARTSYS; mock_signal_pending = 0; mock_xmit_log_verbose = 0; mock_xmit_log_homa_info = 0; diff --git a/test/mock.h b/test/mock.h index 776d36c0..5cb853a3 100644 --- a/test/mock.h +++ b/test/mock.h @@ -142,6 +142,7 @@ extern u64 mock_ns; extern u64 mock_ns_tick; extern int mock_numa_mask; extern int mock_page_nid_mask; +extern int mock_prepare_to_wait_status; extern char mock_printk_output[]; extern int mock_route_errors; extern int mock_signal_pending; diff --git a/test/unit_homa_pacer.c b/test/unit_homa_pacer.c index 5a766788..397b0ee1 100644 --- a/test/unit_homa_pacer.c +++ b/test/unit_homa_pacer.c @@ -304,6 +304,13 @@ TEST_F(homa_pacer, homa_pacer_main__rpc_arrives_while_sleeping) EXPECT_STREQ("xmit DATA 1400@0; xmit DATA 1400@1400; xmit DATA 1400@2800", unit_log_get()); } +TEST_F(homa_pacer, homa_pacer_main__exit_on_signal) +{ + mock_prepare_to_wait_errors = 1; + mock_prepare_to_wait_status = -EINVAL; + unit_log_clear(); + homa_pacer_main(self->homa.pacer); +} TEST_F(homa_pacer, homa_pacer_xmit__basics) { From 04cb7a14492e5bd799854b8799e954000333ae01 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Sun, 4 May 2025 17:56:05 -0700 Subject: [PATCH 271/625] Fix bugs in homa_grant_check_rpc * Don't do anything if the RPC is dead * Must invoke homa_grant_update_incoming before returning, even if rank < 0 (rec_incoming could still change) --- homa_grant.c | 6 +++++- test/unit_homa_grant.c | 5 ++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index d947981a..792696bc 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -662,8 +662,12 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) int i; if (rpc->msgin.length < 0 || rpc->msgin.num_bpages <= 0 || - rpc->msgin.rank < 0) + rpc->state == RPC_DEAD) return; + if (rpc->msgin.rank < 0) { + homa_grant_update_incoming(rpc, grant); + return; + } tt_record4("homa_grant_check_rpc starting for id %d, granted %d, recv_end %d, length %d", rpc->id, rpc->msgin.granted, rpc->msgin.recv_end, diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 68a4a2c8..f5ca2751 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -937,7 +937,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__msgin_not_initialized) EXPECT_EQ(0, atomic_read(&self->homa.grant->total_incoming)); EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_calls); } -TEST_F(homa_grant, homa_grant_check_rpc__rpc_not_active) +TEST_F(homa_grant, homa_grant_check_rpc__update_incoming_if_rpc_not_active) { struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, @@ -946,10 +946,13 @@ TEST_F(homa_grant, homa_grant_check_rpc__rpc_not_active) homa_message_in_init(rpc, 2000, 0); EXPECT_EQ(0, rpc->msgin.rank); rpc->msgin.rank = -1; + rpc->msgin.rec_incoming = 100; + atomic_set(&self->homa.grant->total_incoming, 1000); unit_log_clear(); homa_grant_check_rpc(rpc); EXPECT_STREQ("", unit_log_get()); EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_calls); + EXPECT_EQ(900, atomic_read(&self->homa.grant->total_incoming)); } TEST_F(homa_grant, 
homa_grant_check_rpc__fast_path)
 {

From d7ece03d0a466c404634a1b766e0f404ecdc5506 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 5 May 2025 09:15:36 -0700
Subject: [PATCH 272/625] Fix trivial coding style issue

---
 homa.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/homa.h b/homa.h
index 26e52e15..b5660f25 100644
--- a/homa.h
+++ b/homa.h
@@ -33,8 +33,8 @@
  * define HOMA_MAX_BPAGES - The largest number of bpages that will be required
  * to store an incoming message.
  */
-#define HOMA_MAX_BPAGES ((HOMA_MAX_MESSAGE_LENGTH + HOMA_BPAGE_SIZE - 1) \
-		>> HOMA_BPAGE_SHIFT)
+#define HOMA_MAX_BPAGES ((HOMA_MAX_MESSAGE_LENGTH + HOMA_BPAGE_SIZE - 1) >> \
+		HOMA_BPAGE_SHIFT)
 
 /**
  * define HOMA_MIN_DEFAULT_PORT - The 16 bit port space is divided into

From 0b280f272b767d63105c464492bdbd9f8710592d Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 5 May 2025 13:20:48 -0700
Subject: [PATCH 273/625] Copy homa_test and server executables in install_homa

---
 cloudlab/bin/install_homa | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cloudlab/bin/install_homa b/cloudlab/bin/install_homa
index 181310e1..a4528386 100755
--- a/cloudlab/bin/install_homa
+++ b/cloudlab/bin/install_homa
@@ -43,7 +43,7 @@ for ((i = $first ; i <= $last; i++)); do
 	echo '*** Installing Homa on' $node '***'
 	rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" -rtv ~/.bashrc ~/.bash_profile ~/.gdbinit $node:
 	rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" -rtv --exclude __pycache__ ~/bin/ $node:bin/
-	rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" -rtv $homa_ko $root/util/cp_node $root/util/homa_prio $root/util/*.py $node:bin/
+	rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" -rtv $homa_ko $root/util/cp_node $root/util/homa_prio $root/util/server $root/util/homa_test $root/util/*.py $node:bin/
 	ssh -4 $node 'sudo sysctl .kernel.printk="5 4 1 7"'
 	ssh -4 $node 'echo $PATH'
 	ssh -4 $node 'config default'

From a311b2e3a3b0ea54ea8b48b1590906285a5dd259 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 5 May 2025 15:35:26 -0700
Subject: [PATCH 274/625] Remove homa_api.c

Functions homa_abort, homa_reply, homa_replyv, homa_send, and homa_sendv
no longer exist.
---
 README.md         |  14 ++-
 homa.h            |  22 +----
 homa_api.c        | 222 ----------------------------------------------
 man/Makefile      |   3 -
 man/homa.7        |  65 ++++++++++++--
 man/homa_abort.3  |  93 -------------------
 man/homa_reply.3  |  79 -----------------
 man/homa_send.3   | 107 ----------------------
 man/recvmsg.2     |  21 ++---
 man/sendmsg.2     |  22 ++++-
 util/Makefile     |   5 +-
 util/cp_node.cc   |  68 ++++++++------
 util/homa_test.cc | 187 ++++++++++++++++++++++++--------------
 util/server.cc    |  51 ++++++-----
 util/test_utils.h |  31 +++++++
 15 files changed, 317 insertions(+), 673 deletions(-)
 delete mode 100644 homa_api.c
 delete mode 100644 man/homa_abort.3
 delete mode 100644 man/homa_reply.3
 delete mode 100644 man/homa_send.3

diff --git a/README.md b/README.md
index af661785..24c11303 100644
--- a/README.md
+++ b/README.md
@@ -124,7 +124,11 @@ This repo contains an implementation of the Homa transport protocol as a Linux k
 - Homa exports a collection of configuration parameters through the sysctl
   mechanism. For details, see the man page `homa.7`.
 
-## Significant recent improvements
+## Significant changes
+- May 2025: `homa_api.c` has been removed, so the functions `homa_abort`,
+  `homa_reply`, `homa_replyv`, `homa_send`, and `homa_sendv` no longer
+  exist.
+- April 2025: upgraded to Linux 6.13.9.
- March 2025: implemented private RPCs, resulting in API changes. HOMA_RECVMSG_REQUEST and HOMA_RECVMSG_RESPONSE flags no longer exist and struct homa_sendmsg_args now has a flags field with one defined @@ -132,6 +136,14 @@ This repo contains an implementation of the Homa transport protocol as a Linux k - February 2025: by default, incoming requests for a socket are rejected unless the socket has been bound. setsockopt can be used with SO_HOMA_SERVER to enable or disable incoming requests for any socket. +- October 2024: the process of upstreaming Homa into the Linux kernel has + begun. The reviewing process is likely to result in API changes. + Upstreaming will occur in stages, so the first version to appear in Linux + will not be either functionally complete or performant. The sources in + this repository contain '#ifndef __STRIP__' directives, which + separate functionality being upstreamed from functionality that is not + currently upstreamed (some things, such as development aids, + may never be upstreamed). - October 2024: Homa now has an official IANA IP protocol number (146). - August 2024: upgraded to Linux 6.10.6. - July 2024: introduced "TCP hijacking", where Homa packets are sent as diff --git a/homa.h b/homa.h index b5660f25..26fb3224 100644 --- a/homa.h +++ b/homa.h @@ -157,10 +157,10 @@ struct homa_abort_args { * @error: Zero means destroy and free RPCs; nonzero means complete * them with this error (recvmsg will return the RPCs). */ - int error; + __u32 error; /** @_pad1: Reserved. */ - int _pad1; + __u32 _pad1; /** @_pad2: Reserved. */ __u64 _pad2[2]; @@ -209,22 +209,4 @@ struct homa_rcvbuf_args { #endif /* See strip.py */ #define HOMAIOCFREEZE _IO(0x89, 0xef) -#ifndef __STRIP__ /* See strip.py */ -int homa_abort(int sockfd, __u64 id, int error); -#endif /* See strip.py */ -int homa_send(int sockfd, const void *message_buf, - size_t length, const struct sockaddr *dest_addr, - __u32 addrlen, __u64 *id, __u64 completion_cookie, - int flags); -int homa_sendv(int sockfd, const struct iovec *iov, - int iovcnt, const struct sockaddr *dest_addr, - __u32 addrlen, __u64 *id, __u64 completion_cookie, - int flags); -ssize_t homa_reply(int sockfd, const void *message_buf, - size_t length, const struct sockaddr *dest_addr, - __u32 addrlen, __u64 id); -ssize_t homa_replyv(int sockfd, const struct iovec *iov, - int iovcnt, const struct sockaddr *dest_addr, - __u32 addrlen, __u64 id); - #endif /* _UAPI_LINUX_HOMA_H */ diff --git a/homa_api.c b/homa_api.c deleted file mode 100644 index 296cba91..00000000 --- a/homa_api.c +++ /dev/null @@ -1,222 +0,0 @@ -// SPDX-License-Identifier: BSD-2-Clause - -/* This file contains functions that implement the Homa API visible to - * applications. It is intended to be part of the user-level run-time library. - */ - -#include -#include -#include -#include -#ifndef NDEBUG -#include -#endif -#include -#include - -#include "homa.h" - -/** - * homa_reply() - Send a response message for an RPC previously received - * with a call to recvmsg. - * @sockfd: File descriptor for the socket on which to send the message. - * @message_buf: First byte of buffer containing the response message. - * @length: Number of bytes in the message at @message_buf. - * @dest_addr: Address of the RPC's client (returned by recvmsg when - * the message was received). - * @addrlen: # bytes at *dest_addr. - * @id: Unique identifier for the request, as returned by recvmsg - * when the request was received. 
- * - * @dest_addr and @id must correspond to a previously-received request - * for which no reply has yet been sent; if there is no such active request, - * then this function does nothing. - * - * Return: 0 means the response has been accepted for delivery. If an - * error occurred, -1 is returned and errno is set appropriately. - */ -ssize_t homa_reply(int sockfd, const void *message_buf, size_t length, - const struct sockaddr *dest_addr, __u32 addrlen, - __u64 id) -{ - struct homa_sendmsg_args args; - struct msghdr hdr; - struct iovec vec; - int result; - - args.id = id; - args.completion_cookie = 0; - args.flags = 0; - args.reserved = 0; - - vec.iov_base = (void *)message_buf; - vec.iov_len = length; - - hdr.msg_name = (void *)dest_addr; - hdr.msg_namelen = addrlen; - hdr.msg_iov = &vec; - hdr.msg_iovlen = 1; - hdr.msg_control = &args; - hdr.msg_controllen = 0; - result = sendmsg(sockfd, &hdr, 0); - return result; -} - -/** - * homa_replyv() - Similar to homa_reply, except the response - * message can be divided among several chunks of memory. - * @sockfd: File descriptor for the socket on which to send the message. - * @iov: Pointer to array that describes the chunks of the response - * message. - * @iovcnt: Number of elements in @iov. - * @dest_addr: Address of the RPC's client (returned by recvmsg when - * the message was received). - * @addrlen: # bytes at *dest_addr. - * @id: Unique identifier for the request, as returned by recvmsg - * when the request was received. - * - * @dest_addr and @id must correspond to a previously-received request - * for which no reply has yet been sent; if there is no such active request, - * then this function does nothing. - * - * Return: 0 means the response has been accepted for delivery. If an - * error occurred, -1 is returned and errno is set appropriately. - */ -ssize_t homa_replyv(int sockfd, const struct iovec *iov, int iovcnt, - const struct sockaddr *dest_addr, __u32 addrlen, - __u64 id) -{ - struct homa_sendmsg_args args; - struct msghdr hdr; - int result; - - args.id = id; - args.completion_cookie = 0; - args.flags = 0; - args.reserved = 0; - - hdr.msg_name = (void *)dest_addr; - hdr.msg_namelen = addrlen; - hdr.msg_iov = (struct iovec *)iov; - hdr.msg_iovlen = iovcnt; - hdr.msg_control = &args; - hdr.msg_controllen = 0; - result = sendmsg(sockfd, &hdr, 0); - return result; -} - -/** - * homa_send() - Send a request message to initiate an RPC. - * @sockfd: File descriptor for the socket on which to send the - * message. - * @message_buf: First byte of buffer containing the request message. - * @length: Number of bytes at @message_buf. - * @dest_addr: Address of server to which the request should be sent. - * @addrlen: # bytes at *dest_addr. - * @id: A unique identifier for the request will be returned - * here; this can be used later to find the response for - * this request. - * @completion_cookie: Value to be returned by recvmsg when RPC completes. - * @flags: Flag bits to pass to the sendmsg kernel call, such - * as HOMA_SENDMSG_PRIVATE; see man page for complete info. - * - * Return: 0 means the request has been accepted for delivery. If an - * error occurred, -1 is returned and errno is set appropriately. 
- */ -int homa_send(int sockfd, const void *message_buf, size_t length, - const struct sockaddr *dest_addr, __u32 addrlen, - __u64 *id, __u64 completion_cookie, int flags) -{ - struct homa_sendmsg_args args; - struct msghdr hdr; - struct iovec vec; - int result; - - args.id = 0; - args.completion_cookie = completion_cookie; - args.flags = flags; - args.reserved = 0; - - vec.iov_base = (void *)message_buf; - vec.iov_len = length; - - hdr.msg_name = (void *)dest_addr; - hdr.msg_namelen = addrlen; - hdr.msg_iov = &vec; - hdr.msg_iovlen = 1; - hdr.msg_control = &args; - hdr.msg_controllen = 0; - result = sendmsg(sockfd, &hdr, 0); - if (result >= 0 && id) - *id = args.id; - return result; -} - -/** - * homa_sendv() - Same as homa_send, except that the request message can - * be divided among multiple disjoint chunks of memory. - * @sockfd: File descriptor for the socket on which to send the - * message. - * @iov: Pointer to array that describes the chunks of the request - * message. - * @iovcnt: Number of elements in @iov. - * @dest_addr: Address of server to which the request should be sent. - * @addrlen: # bytes at *dest_addr. - * @id: A unique identifier for the request will be returned - * here; this can be used later to find the response for - * this request. - * @completion_cookie: Value to be returned by recvmsg when RPC completes. - * @flags: Flag bits to pass to the sendmsg kernel call, such - * as HOMA_SENDMSG_PRIVATE; see man page for complete info. - * - * Return: 0 means the request has been accepted for delivery. If an - * error occurred, -1 is returned and errno is set appropriately. - */ -int homa_sendv(int sockfd, const struct iovec *iov, int iovcnt, - const struct sockaddr *dest_addr, __u32 addrlen, - __u64 *id, __u64 completion_cookie, int flags) -{ - struct homa_sendmsg_args args; - struct msghdr hdr; - int result; - - args.id = 0; - args.completion_cookie = completion_cookie; - args.flags = flags; - args.reserved = 0; - - hdr.msg_name = (void *)dest_addr; - hdr.msg_namelen = addrlen; - hdr.msg_iov = (struct iovec *)iov; - hdr.msg_iovlen = iovcnt; - hdr.msg_control = &args; - hdr.msg_controllen = 0; - result = sendmsg(sockfd, &hdr, 0); - if (result >= 0 && id) - *id = args.id; - return result; -} - -#ifndef __STRIP__ /* See strip.py */ -/** - * homa_abort() - Terminate the execution of an RPC. - * @sockfd: File descriptor for the socket associated with the RPC. - * @id: Unique identifier for a client RPC to abort (return value - * from previous call to homa_send). 0 means abort all client - * RPCs on this socket. - * @error: 0 means that the aborted RPCs should be destroyed - * immediately (they will never be returned by recvmsg). - * Nonzero means that the RPCs should be moved to the - * completed state; recvmsg will return an error for these - * RPCs, with @error as the errno value. - * - * Return: If an error occurred, -1 is returned and errno is set - * appropriately. Otherwise zero is returned. - */ -int homa_abort(int sockfd, __u64 id, int error) -{ - struct homa_abort_args args = {id, error}; - - return ioctl(sockfd, HOMAIOCABORT, &args); -} -#endif /* See strip.py */ diff --git a/man/Makefile b/man/Makefile index 8986d89b..c54a7281 100644 --- a/man/Makefile +++ b/man/Makefile @@ -1,9 +1,6 @@ # Makefile to build man pages for Homa. 
SRCS :=	homa.7 \
-	homa_abort.3 \
-	homa_reply.3 \
-	homa_send.3 \
 	recvmsg.2 \
 	sendmsg.2
 
diff --git a/man/homa.7 b/man/homa.7
index b090dd73..31a2340d 100644
--- a/man/homa.7
+++ b/man/homa.7
@@ -242,12 +242,62 @@ takes an integer argument, where any nonzero value enables incoming
 requests and zero disables them. The current setting can be retrieved
 with
 .BR getsockopt .
-.SH ABORTING REQUESTS
+.SH ABORTING RPCS
 .PP
-It is possible to abort RPCs that are in progress. This is done with
-the
-.B homa_abort
-function call, which is described in a separate manual page.
+It is possible for a client to abort RPCs that are in progress by invoking
+.B ioctl
+with the
+.B HOMAIOCABORT
+operation. One additional argument must be specified for
+.BR ioctl ,
+consisting of a pointer to the following structure:
+.in +4n
+.ps -1
+.vs -2
+.EX
+struct homa_abort_args {
+    __u64 id;        /* Id of RPC to abort or 0. */
+    __u32 error;     /* Errno to use for completion or 0. */
+    __u32 _pad1;     /* Must be zero. */
+    __u64 _pad2[2];  /* Must be zero. */
+};
+.EE
+.vs +2
+.ps +1
+.in
+.PP
+The
+.B id
+field contains the identifier for an RPC; if this RPC is active on the socket
+then it is aborted. If no such RPC exists then the
+.B ioctl
+returns without doing anything. If
+.B id
+is zero then all outgoing RPCs for the socket are aborted.
+.PP
+If
+.B error
+is 0 then the matching RPCs will be deleted and all state associated
+with them will be freed (the RPCs will not be returned by
+.BR recvmsg ).
+If
+.B error
+is nonzero then the RPC(s) will immediately be placed in the completed
+state so that they can be returned by
+.BR recvmsg ;
+.B recvmsg
+will return an error for each aborted RPC, with an
+.B errno
+value of
+.B error.
+Regardless of whether the RPC(s) are completed or freed, the
+servers for the RPCs
+are not notified of the abort. If a
+request has already been transmitted to the server at the time
+an abort is requested, it may still be executed on the server. Any response
+from the server will be discarded.
+.PP
+Only outgoing (client-side) RPCs may be aborted.
 .SH SHUTDOWN
 .PP
 The
@@ -655,7 +705,4 @@ the core number for the following lines. A few counters appear before the
 first "core" line: these are core-independent counters such as elapsed time.
 .SH SEE ALSO
 .BR recvmsg (2),
-.BR sendmsg (2),
-.BR homa_abort (3),
-.BR homa_reply (3),
-.BR homa_send (3)
+.BR sendmsg (2)
diff --git a/man/homa_abort.3 b/man/homa_abort.3
deleted file mode 100644
index 04972654..00000000
--- a/man/homa_abort.3
+++ /dev/null
@@ -1,93 +0,0 @@
-.TH HOMA_ABORT 3 2022-9-15 "Homa" "Linux Programmer's Manual"
-.SH NAME
-homa_abort \- terminate an outgoing RPC
-.SH SYNOPSIS
-.nf
-.B #include
-.PP
-.BI "int homa_abort(int " sockfd ", uint64_t " id ", int " error );
-.PP
-.BI "int homa_abortp(int " sockfd ", struct homa_abort_args *" args );
-.fi
-.SH DESCRIPTION
-These two functions will cancel the execution of one (or all) outgoing RPCs.
-They behave identically except that
-.BR homa_abort
-receives its arguments as separate parameters, whereas
-.BR homa_abortp
-packs all of the arguments into a structure:
-.PP
-.in +4n
-.ps -1
-.vs -2
-.EX
-struct homa_abort_args {
-    uint64_t id;
-    int error;
-};
-.EE
-.vs +2
-.ps +1
-.in
-.PP
-The
-.I id
-argument contains the identifier for an RPC; if this RPC is active on
-.IR sockfd
-then it is aborted.
-If
-.I id
-is 0 then all outgoing RPCs on
-.IR sockfd
-will be aborted.
-If -.I error -is 0, then the matching RPCs will be deleted and all state associated -with them will be freed (the RPCs will not -be returned by -.BR homa_recv ). -If -.I error -is nonzero, then the RPC(s) will immediately be placed in the completed -state so that they can be returned by -.BR homa_recv ; -the -.BR homa_recv -call will return an error, with an -.I errno -value of -.I error. -Regardless of whether the RPC(s) are completed or freed, the -servers for the RPCs -are not notified of the abort. If a -request has already been transmitted to the server at the time -.B homa_abort -is invoked, it may still be executed on the server. Any response -from the server will be discarded. - -.SH RETURN VALUE -On success, the return value is 0. -On error, \-1 is returned and -.I errno -is set appropriately. - -.SH ERRORS -.TP -.B EALREADY -.I error -and -.I id -were both nonzero, but the RPC was already in the completed state. In this -case the system call has no effect. -.TP -.B EFAULT -An invalid user space address was specified for an argument. -.TP -.B EINVAL -There is no RPC corresponding to -.IR id . -.SH SEE ALSO -.BR homa_recv (3), -.BR homa_reply (3), -.BR homa_send (3), -.BR homa (7) diff --git a/man/homa_reply.3 b/man/homa_reply.3 deleted file mode 100644 index f5956ac7..00000000 --- a/man/homa_reply.3 +++ /dev/null @@ -1,79 +0,0 @@ -.TH HOMA_REPLY 3 2024-11-11 "Homa" "Linux Programmer's Manual" -.SH NAME -homa_reply, homa_replyv \- send a Homa response message -.SH SYNOPSIS -.nf -.B #include -.PP -.BI "int homa_reply(int " sockfd ", const void *" message_buf ", size_t " \ -length , -.BI " const struct sockaddr *" dest_addr ", size_t " \ -addrlen , -.BI " uint64_t " id );> -.PP -.BI "int homa_replyv(int " sockfd ", const struct iovec *" iov ", size_t " \ -iovcnt , -.BI " const struct sockaddr *" dest_addr ", size_t " \ -addrlen , -.BI " uint64_t " id ); -.fi -.SH DESCRIPTION -.BR homa_reply -and -.BR homa_replyv -are convenience functions layered on top of the -.B sendmsg -system call. -Either may be used to transmit a response message using the Homa -transport protocol. -The argument -.I sockfd -is the file descriptor of a Homa socket to use for sending the response. -With -.BR homa_reply -the response message is stored in a single contiguous buffer pointed to by -.IR message_buf , -and the argument -.I length -gives the length of the message in bytes. -With -.BR homa_replyv -the response message consists of multiple disjoint chunks, specified -by -.I iovcnt -descriptors at -.IR iov . -In either case the total message length must not exceed -.BR HOMA_MAX_MESSAGE_LENGTH . -The destination for the response is given by -.IR dest_addr , -which can hold either an IPv4 or an IPv6 address. The length -of the address is given by -.IR addrlen . -The argument -.I id -is an identifier previously returned by -.BR recvmsg (2); -along with -.IR dest_addr , -it identifies the request for which this message is the response. -.PP -This function returns as soon as the response has been queued for -transmission. -.SH RETURN VALUE -On success, the return value is 0. -On error, \-1 is returned and -.I errno -is set appropriately. -.SH ERRORS -See -.BR sendmsg (2) -for details on the -.I errno -values returned after errors. 
-.SH SEE ALSO -.BR recvmsg (2), -.BR sendmsg (2), -.BR homa_abort (3), -.BR homa_send (3), -.BR homa (7) diff --git a/man/homa_send.3 b/man/homa_send.3 deleted file mode 100644 index f85bc3f2..00000000 --- a/man/homa_send.3 +++ /dev/null @@ -1,107 +0,0 @@ -.TH HOMA_SEND 3 2024-11-11 "Homa" "Linux Programmer's Manual" -.SH NAME -homa_send, homa_sendv \- send a request message -.SH SYNOPSIS -.nf -.B #include -.PP -.BI "int homa_send(int " sockfd ", const void *" message_buf ", size_t " length \ -", const struct sockaddr *" dest_addr ", -.BI " size_t " addrlen ", __u64 *" id ", __u64" \ -completion_cookie ", __u32 " flags ); -.PP -.BI "int homa_sendv(int " sockfd ", const struct iovec *" iov ", size_t " \ -iovcnt ", const sockaddr *" dest_addr , -.BI " size_t " addrlen ", __u64 *" id ", __u64 " \ -completion_cookie ", __u32 " flags ); -.fi -.SH DESCRIPTION -.BR homa_send -and -.BR homa_sendv -are convenience functions layered on top of the -.B sendmsg -system call. -Either may be used to transmit a request message using the Homa -transport protocol. -The argument -.I sockfd -is the file descriptor of the sending socket; this must be a Homa socket. -With -.BR homa_send -the request message is stored in a single contiguous buffer pointed to by -.IR message_buf , -and the argument -.I length -gives the length of the message in bytes. -With -.BR homa_sendv -the request message consists of multiple disjoint chunks, specified -by -.I iovcnt -descriptors at -.IR iov . -In either case, the total message length must not exceed -.BR HOMA_MAX_MESSAGE_LENGTH . -The destination socket for the request is given by -.IR dest_addr , -which can hold either an IPv4 or IPv6 address. The length of -the address is given by -.IR addrlen . -If -.I id -is not NULL, an identifier for the request is returned at -.IR *id. -The identifier will be unique among all requests issued on -.IR sockfd , -and can be used to match the request with a response returned later by -.BR homa_reply (3). -The -.I completion_cookie -argument provides application-specific identifying information about the RPC, -such as the address of a data structure used to manage the -RPC; it will be returned by -.BR recvmsg -when the RPC completes. -The -.I flags -argument is passed to -.BR sendmsg -as the -.IR flags -field of the -.B -homa_sendmsg_args -struct (see the man page for -.BR sendmsg -for details). -.PP -This function returns as soon as the message has been queued for -transmission. - -.SH RETURN VALUE -On success, the return value is 0 and an identifier for the request -is stored in -.I *id -(if -.I id -is not NULL). -The identifier can be used later to match the request -with the corresponding response, using -.BR recvmsg (2). -On error, \-1 is returned and -.I errno -is set appropriately. -.SH ERRORS -After an error return, -.I errno -will contain additional information about the cause of the error. -See -.BR sendmsg (2) -for details. -.SH SEE ALSO -.BR recvmsg (2), -.BR sendmsg (2), -.BR homa_abort (3), -.BR homa_reply (3), -.BR homa (7) diff --git a/man/recvmsg.2 b/man/recvmsg.2 index 6efe9a1a..7eb70a40 100644 --- a/man/recvmsg.2 +++ b/man/recvmsg.2 @@ -176,21 +176,15 @@ will be set to zero by the call. .PP .B recvmsg normally waits until a suitable message has arrived, but nonblocking -behavior may be requested in any of three ways. First, the -.BR HOMA_RECVMSG_NONBLOCKING -bit may be set in the -.B flags -field of the -.BR homa_recvmsg_args -struct. 
Second, the
+behavior may be requested either by setting the
 .BR MSG_DONTWAIT
-bit can be set in the
+bit in the
 .BR flags
 argument to
-.BR recvmsg .
-Third, the
+.BR recvmsg
+or by setting the
 .B O_NONBLOCK
-flag may be set for the socket using
+flag for the socket using
 .BR fcntl .
 If
 .B recvmsg
@@ -290,8 +284,5 @@ was sent.
 The socket has been disabled using
 .BR shutdown (2).
 .SH SEE ALSO
-.BR recvmsg (2),
-.BR homa_abort (3),
-.BR homa_reply (3),
-.BR homa_send (3),
+.BR sendmsg (2),
 .BR homa (7)
diff --git a/man/sendmsg.2 b/man/sendmsg.2
index 397433eb..e0784d69 100644
--- a/man/sendmsg.2
+++ b/man/sendmsg.2
@@ -115,6 +115,25 @@ cannot be used to determine when a private response has arrived.
 .PP
 .B sendmsg
 returns as soon as the message has been queued for transmission.
+.B sendmsg
+will block if there are so many outstanding messages on the socket that
+its limit on write buffer memory has been exceeded. However, if the
+.BR MSG_DONTWAIT
+bit is set in the
+.BR flags
+argument to
+.BR sendmsg
+or the
+.B O_NONBLOCK
+flag is set for the socket using
+.BR fcntl
+then
+.B sendmsg
+will fail with an
+.I errno
+value of
+.BR EAGAIN
+instead of blocking.
 .SH RETURN VALUE
 The return value is 0 for success and -1 if an error occurred.
 .SH ERRORS
@@ -159,7 +178,4 @@ The socket has been disabled using
 .BR shutdown (2).
 .SH SEE ALSO
 .BR recvmsg (2),
-.BR homa_abort (3),
-.BR homa_reply (3),
-.BR homa_send (3),
 .BR homa (7)
diff --git a/util/Makefile b/util/Makefile
index 618a2d1a..9e60b497 100644
--- a/util/Makefile
+++ b/util/Makefile
@@ -8,7 +8,7 @@ BINS := buffer_client buffer_server cp_node dist_test dist_to_proto \
 
 OBJS := $(patsubst %,%.o,$(BINS))
 
-LIB_SRCS := dist.cc homa_api.c test_utils.cc time_trace.cc
+LIB_SRCS := dist.cc test_utils.cc time_trace.cc
 LIB_OBJS := $(patsubst %.c,%.o,$(patsubst %.cc,%.o,$(LIB_SRCS)))
 LIB_OBJS += homa_receiver.o
 
@@ -39,9 +39,6 @@ homa_receiver.o: ../homa_receiver.cc ../homa_receiver.h
 %.o: %.c test_utils.h ../homa.h
 	cc -c $(CFLAGS) $< -o $@
 
-homa_api.o: ../homa_api.c ../homa.h
-	cc -c $(CFLAGS) $< -o $@
-
 clean:
 	rm -f $(BINS) $(OBJS) $(LIB_OBJS)
diff --git a/util/cp_node.cc b/util/cp_node.cc
index 7d164cf9..1209d842 100644
--- a/util/cp_node.cc
+++ b/util/cp_node.cc
@@ -1065,11 +1065,13 @@ homa_server::~homa_server()
  */
 void homa_server::server(int thread_id, server_metrics *metrics)
 {
-	message_header *header;
-	int length, num_vecs, result;
-	char thread_name[50];
 	homa::receiver receiver(fd, buf_region);
 	struct iovec vecs[HOMA_MAX_BPAGES];
+	struct homa_sendmsg_args homa_args;
+	int length, num_vecs, result;
+	message_header *header;
+	struct msghdr msghdr;
+	char thread_name[50];
 	int offset;
 
 	snprintf(thread_name, sizeof(thread_name), "S%d.%d", id, thread_id);
@@ -1121,9 +1123,11 @@ void homa_server::server(int thread_id, server_metrics *metrics)
 		offset += chunk_size;
 		num_vecs++;
 	}
-	result = homa_replyv(fd, vecs, num_vecs, receiver.src_addr(),
-			     sockaddr_size(receiver.src_addr()),
-			     receiver.id());
+	init_sendmsg_hdrs(&msghdr, &homa_args, vecs, num_vecs,
+			  receiver.src_addr(),
+			  sockaddr_size(receiver.src_addr()));
+	homa_args.id = receiver.id();
+	result = sendmsg(fd, &msghdr, 0);
 	if (result < 0) {
 		log(NORMAL, "FATAL: homa_reply failed for server "
 				"port %d: %s\n",
@@ -2131,13 +2135,16 @@ void homa_client::sender()
 	uint64_t next_start = rdtsc();
 	char thread_name[50];
 	homa::receiver receiver(fd, buf_region);
+	struct homa_sendmsg_args homa_args;
+	struct msghdr msghdr;
+	struct iovec vec[2];
+	int num_vecs;
 
 	snprintf(thread_name, 
sizeof(thread_name), "C%d", id); time_trace::thread_buffer thread_buffer(thread_name); while (1) { uint64_t now; - __u64 rpc_id; int server; int status; int slot = get_rinfo(); @@ -2173,28 +2180,29 @@ void homa_client::sender() header->msg_id = slot; tt("sending request, cid 0x%08x, id %u, length %d", header->cid, header->msg_id, header->length); + if (client_iovec && (header->length > 20)) { - struct iovec vec[2]; vec[0].iov_base = sender_buffer; vec[0].iov_len = 20; vec[1].iov_base = sender_buffer + 20; vec[1].iov_len = header->length - 20; - status = homa_sendv(fd, vec, 2, - &server_addrs[server].sa, - sockaddr_size(&server_addrs[server].sa), - &rpc_id, 0, 0); - } else - status = homa_send(fd, sender_buffer, header->length, - &server_addrs[server].sa, - sockaddr_size(&server_addrs[server].sa), - &rpc_id, 0, 0); + num_vecs = 2; + } else { + vec[0].iov_base = sender_buffer; + vec[0].iov_len = header->length; + num_vecs = 1; + } + init_sendmsg_hdrs(&msghdr, &homa_args, vec, num_vecs, + &server_addrs[server].sa, + sockaddr_size(&server_addrs[server].sa)); + status = sendmsg(fd, &msghdr, 0); if (status < 0) { - log(NORMAL, "FATAL: error in homa_send: %s (request " + log(NORMAL, "FATAL: error in Homa sendmsg: %s (request " "length %d)\n", strerror(errno), header->length); fatal(); } - rinfos[slot].id = rpc_id; + rinfos[slot].id = homa_args.id; requests[server]++; total_requests++; lag = now - next_start; @@ -2202,7 +2210,7 @@ void homa_client::sender() if (receivers_running == 0) { /* There isn't a separate receiver thread; wait for * the response here. */ - wait_response(&receiver, rpc_id); + wait_response(&receiver, homa_args.id); } } } @@ -2238,8 +2246,10 @@ uint64_t homa_client::measure_rtt(int server, int length, char *buffer, homa::receiver *receiver) { message_header *header = reinterpret_cast(buffer); + struct homa_sendmsg_args homa_args; + struct msghdr msghdr; + struct iovec vec; uint64_t start; - __u64 rpc_id; int status; header->length = length; @@ -2250,22 +2260,26 @@ uint64_t homa_client::measure_rtt(int server, int length, char *buffer, header->cid = server_conns[server]; header->cid.client_port = id; start = rdtsc(); - status = homa_send(fd, buffer, header->length, &server_addrs[server].sa, - sockaddr_size(&server_addrs[server].sa), &rpc_id, 0, - 0); + + vec.iov_base = buffer; + vec.iov_len = header->length; + init_sendmsg_hdrs(&msghdr, &homa_args, &vec, 1, + &server_addrs[server].sa, + sockaddr_size(&server_addrs[server].sa)); + status = sendmsg(fd, &msghdr, 0); if (status < 0) { - log(NORMAL, "FATAL: error in homa_send: %s (request " + log(NORMAL, "FATAL: error in Homa sendmsg: %s (request " "length %d)\n", strerror(errno), header->length); fatal(); } do { - status = receiver->receive(0, rpc_id); + status = receiver->receive(0, homa_args.id); } while ((status < 0) && ((errno == EAGAIN) || (errno == EINTR))); if (status < 0) { log(NORMAL, "FATAL: measure_rtt got error in recvmsg: %s " "(id %llu, server %s)\n", - strerror(errno), rpc_id, + strerror(errno), homa_args.id, print_address((union sockaddr_in_union *) receiver->src_addr())); fatal(); diff --git a/util/homa_test.cc b/util/homa_test.cc index 429e6332..32572d26 100644 --- a/util/homa_test.cc +++ b/util/homa_test.cc @@ -79,17 +79,21 @@ void close_fd(int fd) */ void send_fd(int fd, const sockaddr_in_union *addr, char *request) { - __u64 id; + struct homa_sendmsg_args homa_args; + struct msghdr msghdr; + struct iovec iov; int status; sleep(1); - status = homa_send(fd, request, length, &addr->sa, - sockaddr_size(&addr->sa), 
&id, 0, 0);
+	iov.iov_base = request;
+	iov.iov_len = length;
+	init_sendmsg_hdrs(&msghdr, &homa_args, &iov, 1, &addr->sa,
+			  sockaddr_size(&addr->sa));
+	status = sendmsg(fd, &msghdr, 0);
 	if (status < 0) {
-		printf("Error in homa_send: %s\n",
-				strerror(errno));
+		printf("Error in sendmsg: %s\n", strerror(errno));
 	} else {
-		printf("homa_send succeeded, id %llu\n", id);
+		printf("sendmsg succeeded, id %llu\n", homa_args.id);
 	}
 }
 
@@ -162,19 +166,25 @@ void test_close()
  */
 void test_fill_memory(int fd, const sockaddr_in_union *dest, char *request)
 {
-	__u64 id;
-	int status;
+	struct homa_sendmsg_args homa_args;
+	uint64_t start = rdtsc();
+	struct msghdr msghdr;
 	int completed = 0;
 	size_t total = 0;
-#define PRINT_INTERVAL 1000
 	ssize_t received;
-	uint64_t start = rdtsc();
+	struct iovec iov;
+	int status;
+
+#define PRINT_INTERVAL 1000
+	iov.iov_base = request;
+	iov.iov_len = length;
+	init_sendmsg_hdrs(&msghdr, &homa_args, &iov, 1, &dest->sa,
+			  sockaddr_size(&dest->sa));
 	for (int i = 1; i <= count; i++) {
-		status = homa_send(fd, request, length, &dest->sa,
-				   sockaddr_size(&dest->sa), &id, 0, 0);
+		status = sendmsg(fd, &msghdr, 0);
 		if (status < 0) {
-			printf("Error in homa_send: %s\n",
+			printf("Error in sendmsg: %s\n",
 				strerror(errno));
 			sleep(1);
 		}
@@ -191,7 +201,7 @@ void test_fill_memory(int fd, const sockaddr_in_union *dest, char *request)
 		received = recvmsg(fd, &recv_hdr, 0);
 		if (received < 0) {
 			printf("Error in recvmsg for id %llu: %s\n",
-				id, strerror(errno));
+				recv_args.id, strerror(errno));
 		} else {
 			total += received;
 			completed++;
@@ -216,17 +226,22 @@
  */
 void test_invoke(int fd, const sockaddr_in_union *dest, char *request)
 {
-	__u64 id;
-	int status;
+	struct homa_sendmsg_args homa_args;
+	struct msghdr msghdr;
 	ssize_t resp_length;
+	struct iovec iov;
+	int status;
 
-	status = homa_send(fd, request, length, &dest->sa,
-			   sockaddr_size(&dest->sa), &id, 0, 0);
+	iov.iov_base = request;
+	iov.iov_len = length;
+	init_sendmsg_hdrs(&msghdr, &homa_args, &iov, 1, &dest->sa,
+			  sockaddr_size(&dest->sa));
+	status = sendmsg(fd, &msghdr, 0);
 	if (status < 0) {
-		printf("Error in homa_send: %s\n", strerror(errno));
+		printf("Error in sendmsg: %s\n", strerror(errno));
 		return;
 	} else {
-		printf("homa_send succeeded, id %llu\n", id);
+		printf("sendmsg succeeded, id %llu\n", homa_args.id);
 	}
 	recv_args.id = 0;
 	recv_args.flags = 0;
@@ -341,19 +356,27 @@ void test_poll(int fd, char *request)
  */
 void test_private(int fd, const sockaddr_in_union *dest, char *request)
 {
-	__u64 ids[3];
-	int status, i;
+	struct homa_sendmsg_args homa_args;
+	struct msghdr msghdr;
 	ssize_t resp_length;
+	struct iovec iov;
+	int status, i;
+	__u64 ids[3];
 
+	iov.iov_base = request;
+	iov.iov_len = length;
+	init_sendmsg_hdrs(&msghdr, &homa_args, &iov, 1, &dest->sa,
+			  sockaddr_size(&dest->sa));
+	homa_args.flags = HOMA_SENDMSG_PRIVATE;
 	for (i = 0; i < 3; i++) {
-		status = homa_send(fd, request, length, &dest->sa,
-				   sockaddr_size(&dest->sa), &ids[i], 0,
-				   HOMA_SENDMSG_PRIVATE);
+		homa_args.id = 0;
+		status = sendmsg(fd, &msghdr, 0);
+		ids[i] = homa_args.id;
 		if (status < 0) {
-			printf("Error in homa_send: %s\n", strerror(errno));
+			printf("Error in sendmsg: %s\n", strerror(errno));
 			return;
 		} else {
-			printf("homa_send succeeded, id %llu\n", ids[i]);
+			printf("sendmsg succeeded, id %llu\n", ids[i]);
 		}
 	}
 
@@ -411,18 +434,23 @@ void test_read(int fd, int count)
  */
 void test_rtt(int fd, const sockaddr_in_union *dest, char *request)
 {
-	int status;
+	uint64_t 
*times = new uint64_t[count]; + struct homa_sendmsg_args homa_args; + struct msghdr msghdr; ssize_t resp_length; + struct iovec iov; uint64_t start; - uint64_t *times = new uint64_t[count]; + int status; + iov.iov_base = request; + iov.iov_len = length; for (int i = -10; i < count; i++) { start = rdtsc(); - status = homa_send(fd, request, length, &dest->sa, - sockaddr_size(&dest->sa), NULL, 0, 0); + init_sendmsg_hdrs(&msghdr, &homa_args, &iov, 1, &dest->sa, + sockaddr_size(&dest->sa)); + status = sendmsg(fd, &msghdr, 0); if (status < 0) { - printf("Error in homa_send: %s\n", - strerror(errno)); + printf("Error in sendmsg: %s\n", strerror(errno)); return; } recv_args.id = 0; @@ -453,26 +481,29 @@ void test_rtt(int fd, const sockaddr_in_union *dest, char *request) */ void test_send(int fd, const sockaddr_in_union *dest, char *request) { - __u64 id; + struct homa_sendmsg_args homa_args; + struct msghdr msghdr; + struct iovec iov; int status; - status = homa_send(fd, request, length, &dest->sa, - sockaddr_size(&dest->sa), &id, 0, 0); + iov.iov_base = request; + iov.iov_len = length; + init_sendmsg_hdrs(&msghdr, &homa_args, &iov, 1, &dest->sa, + sockaddr_size(&dest->sa)); + status = sendmsg(fd, &msghdr, 0); if (status < 0) { - printf("Error in homa_send: %s\n", - strerror(errno)); + printf("Error in sendmsg: %s\n", strerror(errno)); } else { - printf("Homa_send succeeded, id %llu\n", id); + printf("sendmsg succeeded, id %llu\n", homa_args.id); } } /** * test_set_buf() - Invoke homa_set_buf on a Homa socket. - * @fd: Homa socket. */ -void test_set_buf(int fd) +void test_set_buf(void) { - int status; + int status, fd; char *region = (char *) mmap(NULL, 64*HOMA_BPAGE_SIZE, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); struct homa_rcvbuf_args arg; @@ -482,6 +513,11 @@ void test_set_buf(int fd) return; } + fd = socket(inet_family, SOCK_DGRAM, IPPROTO_HOMA); + if (fd < 0) { + printf("Couldn't open Homa socket: %s\n", strerror(errno)); + } + arg.start = (uintptr_t)region; arg.length = 64*HOMA_BPAGE_SIZE; status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_RCVBUF, &arg, @@ -489,6 +525,7 @@ void test_set_buf(int fd) if (status < 0) printf("Error in setsockopt(SO_HOMA_RCVBUF): %s\n", strerror(errno)); + close(fd); } /** @@ -535,14 +572,16 @@ void test_shutdown(int fd) void test_stream(int fd, const sockaddr_in_union *dest) { #define MAX_RPCS 100 + struct homa_sendmsg_args homa_args; + uint64_t end_cycles, end_time; + uint64_t start_cycles = 0; + int64_t start_bytes = 0; int *buffers[MAX_RPCS]; + int64_t bytes_sent = 0; + struct msghdr msghdr; ssize_t resp_length; - __u64 id; - uint64_t start_cycles = 0; - uint64_t end_cycles, end_time; + struct iovec iov; int status, i; - int64_t bytes_sent = 0; - int64_t start_bytes = 0; double rate; end_time = rdtsc() + (uint64_t) (5*get_cycles_per_sec()); @@ -558,11 +597,14 @@ void test_stream(int fd, const sockaddr_in_union *dest) buffers[i][1] = 12; seed_buffer(buffers[i]+2, length - 2*sizeof32(int), 1000*i); } + iov.iov_len = length; for (i = 0; i < count; i++) { - status = homa_send(fd, buffers[i], length, &dest->sa, - sockaddr_size(&dest->sa), &id, 0, 0); + iov.iov_base = buffers[i]; + init_sendmsg_hdrs(&msghdr, &homa_args, &iov, 1, &dest->sa, + sockaddr_size(&dest->sa)); + status = sendmsg(fd, &msghdr, 0); if (status < 0) { - printf("Error in homa_send: %s\n", strerror(errno)); + printf("Error in sendmsg: %s\n", strerror(errno)); return; } } @@ -579,19 +621,19 @@ void test_stream(int fd, const sockaddr_in_union *dest) recv_hdr.msg_controllen = 
sizeof(recv_args); resp_length = recvmsg(fd, &recv_hdr, 0); if (resp_length < 0) { - printf("Error in recvmsg: %s\n", - strerror(errno)); + printf("Error in recvmsg: %s\n", strerror(errno)); return; } if (resp_length != 12) printf("Expected 12 bytes in response, received %ld\n", resp_length); response = (int *) (buf_region + recv_args.bpage_offsets[0]); - status = homa_send(fd, buffers[(response[2]/1000) %count], - length, &dest->sa, sockaddr_size(&dest->sa), - &id, 0, 0); + iov.iov_base = buffers[(response[2]/1000) % count]; + init_sendmsg_hdrs(&msghdr, &homa_args, &iov, 1, &dest->sa, + sockaddr_size(&dest->sa)); + status = sendmsg(fd, &msghdr, 0); if (status < 0) { - printf("Error in homa_send: %s\n", strerror(errno)); + printf("Error in sendmsg: %s\n", strerror(errno)); return; } bytes_sent += length; @@ -885,23 +927,28 @@ void recv_slow(int fd) /** * test_wmem() - Use two threads, a sender and a receiver, and make the * receiver go so slowly that the sender uses up all available tx packet - * memory and blocks. + * memory and blocks. Note: specify a large --length parameter. * @fd: Homa socket. * @dest: Where to send the request * @request: Request message. */ void test_wmem(int fd, const sockaddr_in_union *dest, char *request) { - __u64 id; + struct homa_sendmsg_args homa_args; + struct msghdr msghdr; + struct iovec iov; int status; std::thread thread(recv_slow, fd); + iov.iov_base = request; + iov.iov_len = length; for ( ; count > 0; count--) { - status = homa_send(fd, request, length, &dest->sa, - sockaddr_size(&dest->sa), &id, 0, 0); + init_sendmsg_hdrs(&msghdr, &homa_args, &iov, 1, &dest->sa, + sockaddr_size(&dest->sa)); + status = sendmsg(fd, &msghdr, 0); if (status < 0) { - printf("Error in homa_send: %s\n", strerror(errno)); + printf("Error in sendmsg: %s\n", strerror(errno)); break; } printf("Sent request with %d bytes\n", length); @@ -920,16 +967,20 @@ void test_wmem(int fd, const sockaddr_in_union *dest, char *request) */ void test_wmem_poll(int fd, const sockaddr_in_union *dest, char *request) { - __u64 id; - int status; + struct homa_sendmsg_args homa_args; struct pollfd poll_info = { .fd = fd, .events = POLLOUT, .revents = 0 }; + struct msghdr msghdr; + struct iovec iov; + int status; std::thread thread(recv_slow, fd); + iov.iov_base = request; + iov.iov_len = length; for ( ; count > 0; count--) { status = poll(&poll_info, 1, -1); if (status > 0) { @@ -938,11 +989,11 @@ void test_wmem_poll(int fd, const sockaddr_in_union *dest, char *request) printf("Poll failed: %s\n", strerror(errno)); break; } - status = homa_send(fd, request, length, &dest->sa, - sockaddr_size(&dest->sa), &id, 0, - HOMA_SENDMSG_NONBLOCKING); + init_sendmsg_hdrs(&msghdr, &homa_args, &iov, 1, &dest->sa, + sockaddr_size(&dest->sa)); + status = sendmsg(fd, &msghdr, MSG_DONTWAIT); if (status < 0) { - printf("Error in homa_send: %s\n", strerror(errno)); + printf("Error in sendmsg: %s\n", strerror(errno)); break; } printf("Sent request with %d bytes\n", length); @@ -1100,7 +1151,7 @@ int main(int argc, char** argv) } else if (strcmp(argv[next_arg], "shutdown") == 0) { test_shutdown(fd); } else if (strcmp(argv[next_arg], "set_buf") == 0) { - test_set_buf(fd); + test_set_buf(); } else if (strcmp(argv[next_arg], "stream") == 0) { test_stream(fd, &dest); } else if (strcmp(argv[next_arg], "tcp") == 0) { diff --git a/util/server.cc b/util/server.cc index ce418c24..fa7b076e 100644 --- a/util/server.cc +++ b/util/server.cc @@ -57,16 +57,18 @@ int inet_family = AF_INET; */ void homa_server(int port) { - int fd; - 
sockaddr_in_union addr; - sockaddr_in_union source; - int length; + struct homa_sendmsg_args reply_args; + struct iovec vecs[HOMA_MAX_BPAGES]; struct homa_recvmsg_args recv_args; - struct msghdr hdr; struct homa_rcvbuf_args arg; + struct msghdr reply_msghdr; + sockaddr_in_union source; + sockaddr_in_union addr; + struct msghdr hdr; char *buf_region; - struct iovec vecs[HOMA_MAX_BPAGES]; int num_vecs; + int length; + int fd; fd = socket(inet_family, SOCK_DGRAM, IPPROTO_HOMA); if (fd < 0) { @@ -78,7 +80,7 @@ void homa_server(int port) addr.in4.sin_port = htons(port); if (bind(fd, &addr.sa, sizeof(addr)) != 0) { printf("Couldn't bind socket to Homa port %d: %s\n", port, - strerror(errno)); + strerror(errno)); return; } if (verbose) @@ -86,7 +88,8 @@ void homa_server(int port) // Set up buffer region. buf_region = (char *) mmap(NULL, 1000*HOMA_BPAGE_SIZE, - PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); + PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); if (buf_region == MAP_FAILED) { printf("Couldn't mmap buffer region: %s\n", strerror(errno)); return; @@ -94,7 +97,7 @@ void homa_server(int port) arg.start = (uintptr_t)buf_region; arg.length = 1000*HOMA_BPAGE_SIZE; int status = setsockopt(fd, IPPROTO_HOMA, SO_HOMA_RCVBUF, &arg, - sizeof(arg)); + sizeof(arg)); if (status < 0) { printf("Error in setsockopt(SO_HOMA_RCVBUF): %s\n", strerror(errno)); @@ -120,21 +123,22 @@ void homa_server(int port) printf("recvmsg failed: %s\n", strerror(errno)); continue; } - int resp_length = ((int *) (buf_region + recv_args.bpage_offsets[0]))[1]; + int resp_length = ((int *) (buf_region + + recv_args.bpage_offsets[0]))[1]; if (validate) { seed = check_message(&recv_args, buf_region, length, - 2*sizeof32(int)); + 2*sizeof32(int)); if (verbose) printf("Received message from %s with %d bytes, " - "id %llu, seed %d, response length %d\n", - print_address(&source), length, - recv_args.id, seed, resp_length); + "id %llu, seed %d, response length %d\n", + print_address(&source), length, + recv_args.id, seed, resp_length); } else if (verbose) printf("Received message from %s with " - "%d bytes, id %llu, response length %d\n", - print_address(&source), length, - recv_args.id, resp_length); + "%d bytes, id %llu, response length %d\n", + print_address(&source), length, + recv_args.id, resp_length); /* Second word of the message indicates how large a * response to send. @@ -143,15 +147,18 @@ void homa_server(int port) while (resp_length > 0) { vecs[num_vecs].iov_len = (resp_length > HOMA_BPAGE_SIZE) ? HOMA_BPAGE_SIZE : resp_length; - vecs[num_vecs].iov_base = buf_region - + recv_args.bpage_offsets[num_vecs]; + vecs[num_vecs].iov_base = buf_region + + recv_args.bpage_offsets[num_vecs]; resp_length -= vecs[num_vecs].iov_len; num_vecs++; } - result = homa_replyv(fd, vecs, num_vecs, &source.sa, - sockaddr_size(&source.sa), recv_args.id); + init_sendmsg_hdrs(&reply_msghdr, &reply_args, vecs, num_vecs, + &source.sa, sockaddr_size(&source.sa)); + reply_args.id = recv_args.id; + result = sendmsg(fd, &reply_msghdr, 0); if (result < 0) { - printf("homa_reply failed: %s\n", strerror(errno)); + printf("sendmsg for reply failed: %s\n", + strerror(errno)); } } } diff --git a/util/test_utils.h b/util/test_utils.h index 2dc1b6df..53faa404 100644 --- a/util/test_utils.h +++ b/util/test_utils.h @@ -67,6 +67,37 @@ inline static uint64_t rdtsc(void) return (((uint64_t)hi << 32) | lo); } +/** + * init_sendmsg_hdrs(): Convenience function to initialize the two headers + * needed to invoke sendmsg for Homa. 
This initializes for the common case; + * callers may need to set some fields explicitly for less common cases. + * @hdr: msghdr argument to sendmsg: will be initialized here. + * @args: Homa's sendmsg arguments; will be initialized here. + * @iov: Describes outgoing message. + * @iovcnt: Number of entries in @iov. + * @dest_addr: Target for the message. + * @addrlen: Size of @dest_addr (bytes). + */ +inline static void init_sendmsg_hdrs(struct msghdr *hdr, + struct homa_sendmsg_args *args, + struct iovec *iov, int iovcnt, + const struct sockaddr *dest_addr, + __u32 addrlen) +{ + args->id = 0; + args->completion_cookie = 0; + args->flags = 0; + args->reserved = 0; + + hdr->msg_name = (struct sockaddr *)dest_addr; + hdr->msg_namelen = addrlen; + hdr->msg_iov = iov; + hdr->msg_iovlen = iovcnt; + hdr->msg_control = args; + hdr->msg_controllen = 0; + hdr->msg_flags = 0; +} + #ifdef __cplusplus } #endif From 070bc1710f2db8b6157ab41d44890d0532f4b6ef Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 5 May 2025 15:47:03 -0700 Subject: [PATCH 275/625] Remove HOMA_SENDMSG_NONBLOCKING and HOMA_RECVMSG_NONBLOCKING These aren't needed: MSG_DONTWAIT seems to provide all of the needed functionality. Also removed flags field from homa_recvmsg_args (no flags are currently defined). --- homa.h | 14 +------------- homa_plumbing.c | 14 +++++--------- homa_receiver.cc | 9 +++------ test/unit_homa_plumbing.c | 22 ++++++---------------- util/homa_test.cc | 12 ------------ 5 files changed, 15 insertions(+), 56 deletions(-) diff --git a/homa.h b/homa.h index 26fb3224..2863d2c3 100644 --- a/homa.h +++ b/homa.h @@ -84,8 +84,7 @@ _Static_assert(sizeof(struct homa_sendmsg_args) <= 24, /* Flag bits for homa_sendmsg_args.flags (see man page for documentation): */ #define HOMA_SENDMSG_PRIVATE 0x01 -#define HOMA_SENDMSG_NONBLOCKING 0x02 -#define HOMA_SENDMSG_VALID_FLAGS 0x03 +#define HOMA_SENDMSG_VALID_FLAGS 0x01 /** * struct homa_recvmsg_args - Provides information needed by Homa's @@ -106,12 +105,6 @@ struct homa_recvmsg_args { */ __u64 completion_cookie; - /** - * @flags: (in) OR-ed combination of bits that control the operation. - * See below for values. - */ - __u32 flags; - /** * @num_bpages: (in/out) Number of valid entries in @bpage_offsets. 
* Passes in bpages from previous messages that can now be @@ -139,11 +132,6 @@ _Static_assert(sizeof(struct homa_recvmsg_args) <= 88, "homa_recvmsg_args grew"); #endif -/* Flag bits for homa_recvmsg_args.flags (see man page for documentation): - */ -#define HOMA_RECVMSG_NONBLOCKING 0x01 -#define HOMA_RECVMSG_VALID_FLAGS 0x01 - #ifndef __STRIP__ /* See strip.py */ /** * struct homa_abort_args - Structure that passes arguments and results diff --git a/homa_plumbing.c b/homa_plumbing.c index 8b9bf28d..c490a726 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -969,9 +969,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) if (!homa_sock_wmem_avl(hsk)) { result = homa_sock_wait_wmem(hsk, - msg->msg_flags & MSG_DONTWAIT || - args.flags & - HOMA_SENDMSG_NONBLOCKING); + msg->msg_flags & MSG_DONTWAIT); if (result != 0) goto error; } @@ -1122,11 +1120,10 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, sizeof(control)))) return -EFAULT; control.completion_cookie = 0; - tt_record3("homa_recvmsg starting, port %d, pid %d, flags %d", - hsk->port, current->pid, control.flags); + tt_record2("homa_recvmsg starting, port %d, pid %d", + hsk->port, current->pid); - if (control.num_bpages > HOMA_MAX_BPAGES || - (control.flags & ~HOMA_RECVMSG_VALID_FLAGS)) { + if (control.num_bpages > HOMA_MAX_BPAGES) { result = -EINVAL; goto done; } @@ -1140,8 +1137,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, if (result != 0) goto done; - nonblocking = ((flags & MSG_DONTWAIT) || - (control.flags & HOMA_RECVMSG_NONBLOCKING)); + nonblocking = flags & MSG_DONTWAIT; if (control.id != 0) { rpc = homa_find_client_rpc(hsk, control.id); /* Locks RPC. */ if (!rpc) { diff --git a/homa_receiver.cc b/homa_receiver.cc index 9f6fa932..5839cb75 100644 --- a/homa_receiver.cc +++ b/homa_receiver.cc @@ -69,8 +69,7 @@ void homa::receiver::copy_out(void *dest, size_t offset, size_t count) const /** * homa::receiver::receive() - Release resources for the current message, if * any, and receive a new incoming message. - * @flags: Various OR'ed bits such as HOMA_RECVMSG_NONBLOCKING. See the - * Homa documentation for the flags field of recvmsg for details. + * @flags: Flag bits for the recvmsg invocation (e.g., MSG_DONTWAIT). * @id: Identifier of a private RPC whose result is desired, or 0 * to wait for a shared RPC. See the Homa documentation for the id * field of recvmsg for details. @@ -82,11 +81,10 @@ void homa::receiver::copy_out(void *dest, size_t offset, size_t count) const */ size_t homa::receiver::receive(int flags, uint64_t id) { - control.flags = flags; control.id = id; hdr.msg_namelen = sizeof(source); hdr.msg_controllen = sizeof(control); - msg_length = recvmsg(fd, &hdr, 0); + msg_length = recvmsg(fd, &hdr, flags); if (msg_length < 0) { control.num_bpages = 0; id = 0; @@ -105,9 +103,8 @@ void homa::receiver::release() return; /* This recvmsg request will do nothing except return buffer space. 
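	 * (Passing id == 0 with the bpages still recorded in control tells
	 * Homa to reclaim those pages; MSG_DONTWAIT makes the call return
	 * immediately.)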
*/ - control.flags = HOMA_RECVMSG_NONBLOCKING; control.id = 0; - recvmsg(fd, &hdr, 0); + recvmsg(fd, &hdr, MSG_DONTWAIT); control.num_bpages = 0; msg_length = -1; } \ No newline at end of file diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 39b695d0..b9785615 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -85,9 +85,8 @@ FIXTURE_SETUP(homa_plumbing) self->recvmsg_hdr.msg_namelen = 0; self->recvmsg_hdr.msg_control = &self->recvmsg_args; self->recvmsg_hdr.msg_controllen = sizeof(self->recvmsg_args); - self->recvmsg_hdr.msg_flags = 0; + self->recvmsg_hdr.msg_flags = MSG_DONTWAIT; memset(&self->recvmsg_args, 0, sizeof(self->recvmsg_args)); - self->recvmsg_args.flags = HOMA_RECVMSG_NONBLOCKING; self->send_vec[0].iov_base = self->buffer; self->send_vec[0].iov_len = 100; self->send_vec[1].iov_base = self->buffer + 1000; @@ -677,12 +676,6 @@ TEST_F(homa_plumbing, homa_recvmsg__num_bpages_too_large) EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, 0, 0, &self->recvmsg_hdr.msg_namelen)); } -TEST_F(homa_plumbing, homa_recvmsg__bogus_flags) -{ - self->recvmsg_args.flags = 1 << 10; - EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, &self->recvmsg_hdr.msg_namelen)); -} TEST_F(homa_plumbing, homa_recvmsg__no_buffer_pool) { struct homa_pool *saved_pool = self->hsk.buffer_pool; @@ -703,7 +696,7 @@ TEST_F(homa_plumbing, homa_recvmsg__release_buffers) self->recvmsg_args.bpage_offsets[1] = HOMA_BPAGE_SIZE; EXPECT_EQ(EAGAIN, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, MSG_DONTWAIT, &self->recvmsg_hdr.msg_namelen)); EXPECT_EQ(0, atomic_read(&self->hsk.buffer_pool->descriptors[0].refs)); EXPECT_EQ(0, atomic_read(&self->hsk.buffer_pool->descriptors[1].refs)); } @@ -714,7 +707,7 @@ TEST_F(homa_plumbing, homa_recvmsg__error_in_release_buffers) self->hsk.buffer_pool->num_bpages << HOMA_BPAGE_SHIFT; EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, MSG_DONTWAIT, &self->recvmsg_hdr.msg_namelen)); } TEST_F(homa_plumbing, homa_recvmsg__private_rpc_doesnt_exist) { @@ -733,18 +726,15 @@ TEST_F(homa_plumbing, homa_recvmsg__error_from_homa_wait_private) atomic_or(RPC_PRIVATE, &crpc->flags); self->recvmsg_args.id = crpc->id; - self->recvmsg_args.flags = HOMA_RECVMSG_NONBLOCKING; EXPECT_EQ(EAGAIN, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, MSG_DONTWAIT, &self->recvmsg_hdr.msg_namelen)); EXPECT_EQ(0, self->recvmsg_args.id); } TEST_F(homa_plumbing, homa_recvmsg__error_from_homa_wait_shared) { - self->recvmsg_args.flags = HOMA_RECVMSG_NONBLOCKING; - EXPECT_EQ(EAGAIN, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, MSG_DONTWAIT, &self->recvmsg_hdr.msg_namelen)); } TEST_F(homa_plumbing, homa_recvmsg__MSG_DONT_WAIT) { @@ -910,7 +900,7 @@ TEST_F(homa_plumbing, homa_recvmsg__copy_back_args_even_after_error) self->recvmsg_args.bpage_offsets[1] = HOMA_BPAGE_SIZE; EXPECT_EQ(EAGAIN, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, - 0, 0, &self->recvmsg_hdr.msg_namelen)); + 0, MSG_DONTWAIT, &self->recvmsg_hdr.msg_namelen)); EXPECT_EQ(0, self->recvmsg_args.num_bpages); } diff --git a/util/homa_test.cc b/util/homa_test.cc index 32572d26..c316967d 100644 --- a/util/homa_test.cc +++ b/util/homa_test.cc @@ -146,7 +146,6 @@ void test_close() } std::thread thread(close_fd, fd); recv_args.id = 0; - 
recv_args.flags = 0;
 	recv_hdr.msg_controllen = sizeof(recv_args);
 	result = recvmsg(fd, &recv_hdr, 0);
 	if (result > 0) {
@@ -196,7 +195,6 @@ void test_fill_memory(int fd, const sockaddr_in_union *dest, char *request)
 	total = 0;
 	for (int i = 1; i <= count; i++) {
 		recv_args.id = 0;
-		recv_args.flags = 0;
 		recv_hdr.msg_controllen = sizeof(recv_args);
 		received = recvmsg(fd, &recv_hdr, 0);
 		if (received < 0) {
@@ -244,7 +242,6 @@ void test_invoke(int fd, const sockaddr_in_union *dest, char *request)
 		printf("sendmsg succeeded, id %llu\n", homa_args.id);
 	}
 	recv_args.id = 0;
-	recv_args.flags = 0;
 	recv_hdr.msg_controllen = sizeof(recv_args);
 	resp_length = recvmsg(fd, &recv_hdr, 0);
 	if (resp_length < 0) {
@@ -337,7 +334,6 @@ void test_poll(int fd, char *request)
 	}

 	recv_args.id = 0;
-	recv_args.flags = 0;
 	recv_hdr.msg_controllen = sizeof(recv_args);
 	result = recvmsg(fd, &recv_hdr, 0);
 	if (result < 0)
@@ -382,7 +378,6 @@ void test_private(int fd, const sockaddr_in_union *dest, char *request)

 	for (i = 2; i >= 0; i--) {
 		recv_args.id = ids[i];
-		recv_args.flags = 0;
 		recv_hdr.msg_controllen = sizeof(recv_args);
 		resp_length = recvmsg(fd, &recv_hdr, 0);
 		if (resp_length < 0) {
@@ -454,7 +449,6 @@ void test_rtt(int fd, const sockaddr_in_union *dest, char *request)
 		return;
 	}
 	recv_args.id = 0;
-	recv_args.flags = 0;
 	recv_hdr.msg_controllen = sizeof(recv_args);
 	resp_length = recvmsg(fd, &recv_hdr, 0);
 	if (i >= 0)
@@ -539,7 +533,6 @@ void test_shutdown(int fd)
 	std::thread thread(shutdown_fd, fd);
 	thread.detach();
 	recv_args.id = 0;
-	recv_args.flags = 0;
 	recv_hdr.msg_controllen = sizeof(recv_args);
 	result = recvmsg(fd, &recv_hdr, 0);
 	if (result > 0) {
@@ -551,7 +544,6 @@ void test_shutdown(int fd)

 	/* Make sure that future reads also fail. */
 	recv_args.id = 0;
-	recv_args.flags = 0;
 	recv_hdr.msg_controllen = sizeof(recv_args);
 	result = recvmsg(fd, &recv_hdr, 0);
 	if (result < 0) {
@@ -617,7 +609,6 @@ void test_stream(int fd, const sockaddr_in_union *dest)
 		int *response;

 		recv_args.id = 0;
-		recv_args.flags = 0;
 		recv_hdr.msg_controllen = sizeof(recv_args);
 		resp_length = recvmsg(fd, &recv_hdr, 0);
 		if (resp_length < 0) {
@@ -861,7 +852,6 @@ void test_tmp(int fd, int count)
 		h.msg_controllen = sizeof(control);

 		memset(&control, 0, sizeof(control));
-		control.flags = HOMA_RECVMSG_NONBLOCKING;
 		int result = recvmsg(fd, &h, 0);

 		printf("recvmsg returned %d, addr %p, namelen %d, control %p, "
@@ -913,7 +903,6 @@ void recv_slow(int fd)
 	while (1) {
 		sleep(1);
 		recv_args.id = 0;
-		recv_args.flags = 0;
 		recv_hdr.msg_controllen = sizeof(recv_args);
 		status = recvmsg(fd, &recv_hdr, 0);
 		if (status < 0) {
@@ -1119,7 +1108,6 @@ int main(int argc, char** argv)
 		exit(1);
 	}
 	recv_args.id = 0;
-	recv_args.flags = 0;
 	recv_args.num_bpages = 0;
 	recv_hdr.msg_name = &source_addr;
 	recv_hdr.msg_namelen = sizeof32(source_addr);

From 9397617f3853f050d53cdd7e401419eac3c3fba2 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 5 May 2025 16:32:33 -0700
Subject: [PATCH 276/625] Replace BOGUS in enum homa_packet_type with MAX_OP

---
 homa_metrics.c            | 4 ++--
 homa_metrics.h            | 4 ++--
 homa_plumbing.c           | 2 +-
 homa_wire.h               | 4 ++--
 test/unit_homa_plumbing.c | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/homa_metrics.c b/homa_metrics.c
index 67dd0416..92d8acf0 100644
--- a/homa_metrics.c
+++ b/homa_metrics.c
@@ -143,13 +143,13 @@ char *homa_metrics_print(void)
 	  m->large_msg_bytes, lower);
 	M("sent_msg_bytes %15llu Total bytes in all outgoing messages\n",
 	  m->sent_msg_bytes);
-	for (i = DATA; i < BOGUS; i++) {
+	for (i = DATA; i <= MAX_OP; i++) {
		char *symbol = homa_symbol_for_type(i);

 		M("packets_sent_%-7s %15llu %s packets sent\n",
 		  symbol, m->packets_sent[i - DATA], symbol);
 	}
-	for (i = DATA; i < BOGUS; i++) {
+	for (i = DATA; i <= MAX_OP; i++) {
 		char *symbol = homa_symbol_for_type(i);

 		M("packets_rcvd_%-7s %15llu %s packets received\n",
diff --git a/homa_metrics.h b/homa_metrics.h
index f4d026a8..279572ad 100644
--- a/homa_metrics.h
+++ b/homa_metrics.h
@@ -59,13 +59,13 @@ struct homa_metrics {
 	 * @packets_sent: total number of packets sent for each packet type
 	 * (entry 0 corresponds to DATA, and so on).
 	 */
-	u64 packets_sent[BOGUS - DATA];
+	u64 packets_sent[MAX_OP + 1 - DATA];

 	/**
 	 * @packets_received: total number of packets received for each
 	 * packet type (entry 0 corresponds to DATA, and so on).
 	 */
-	u64 packets_received[BOGUS - DATA];
+	u64 packets_received[MAX_OP + 1 - DATA];

 	/** @priority_bytes: total bytes sent at each priority level. */
 	u64 priority_bytes[HOMA_MAX_PRIORITIES];
diff --git a/homa_plumbing.c b/homa_plumbing.c
index c490a726..e57c6155 100644
--- a/homa_plumbing.c
+++ b/homa_plumbing.c
@@ -1342,7 +1342,7 @@ int homa_softirq(struct sk_buff *skb)
 		/* Reject packets that are too short or have bogus types. */
 		h = (struct homa_common_hdr *)skb->data;
 		if (unlikely(skb->len < sizeof(struct homa_common_hdr) ||
-			     h->type < DATA || h->type >= BOGUS ||
+			     h->type < DATA || h->type > MAX_OP ||
 			     skb->len < header_lengths[h->type - DATA])) {
 #ifndef __STRIP__ /* See strip.py */
 			const struct in6_addr saddr =
diff --git a/homa_wire.h b/homa_wire.h
index dc324aa5..507db977 100644
--- a/homa_wire.h
+++ b/homa_wire.h
@@ -27,9 +27,9 @@ enum homa_packet_type {
 #endif /* See strip.py */
 	NEED_ACK = 0x17,
 	ACK = 0x18,
-	BOGUS = 0x19, /* Used only in unit tests. */
+	MAX_OP = 0x18,
 	/* If you add a new type here, you must also do the following:
-	 * 1. Change BOGUS so it is the highest opcode
+	 * 1. Change MAX_OP so it is the highest valid opcode
 	 * 2. Add support for the new opcode in homa_print_packet,
 	 *    homa_print_packet_short, homa_symbol_for_type, and mock_skb_new.
 	 * 3. Add the header length to header_lengths in homa_plumbing.c.
diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c
index b9785615..18eee10a 100644
--- a/test/unit_homa_plumbing.c
+++ b/test/unit_homa_plumbing.c
@@ -948,7 +948,7 @@ TEST_F(homa_plumbing, homa_softirq__bogus_packet_type)
 {
 	struct sk_buff *skb;

-	self->data.common.type = BOGUS;
+	self->data.common.type = MAX_OP + 1;
 	skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400);
 	homa_softirq(skb);
 	EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs));

From 0bf1d953822a949e47f0243ba6a1ee66757f0d32 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 5 May 2025 16:51:39 -0700
Subject: [PATCH 277/625] Remove HOMA_IPV6_HEADER_LENGTH and
 HOMA_IPV4_HEADER_LENGTH

Use sizeof(struct iphdr) and sizeof(struct ipv6hdr) instead. Also, change
HOMA_SKB_EXTRA to use MAX_TCP_HEADER.

---
 homa_skb.c                |  5 ++---
 homa_sock.c               |  2 +-
 homa_stub.h               |  4 ++--
 homa_wire.h               | 22 ++++++++++++----------
 test/mock.c               |  4 ++--
 test/unit_homa_outgoing.c |  3 +--
 test/unit_homa_sock.c     |  4 ++--
 7 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/homa_skb.c b/homa_skb.c
index d0c7a5a9..8a4b575c 100644
--- a/homa_skb.c
+++ b/homa_skb.c
@@ -124,11 +124,10 @@ struct sk_buff *homa_skb_new_tx(int length)
 	/* Note: allocate space for an IPv6 header, which is larger than
 	 * an IPv4 header.
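	 * (As of this commit, HOMA_SKB_EXTRA is based on MAX_TCP_HEADER and
 	 * already includes room for either IP header; see homa_wire.h.)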
*/ - skb = alloc_skb(HOMA_SKB_EXTRA + HOMA_IPV6_HEADER_LENGTH + - sizeof(struct homa_skb_info) + length, + skb = alloc_skb(HOMA_SKB_EXTRA + sizeof(struct homa_skb_info) + length, GFP_ATOMIC); if (likely(skb)) { - skb_reserve(skb, HOMA_SKB_EXTRA + HOMA_IPV6_HEADER_LENGTH); + skb_reserve(skb, HOMA_SKB_EXTRA); skb_reset_transport_header(skb); } INC_METRIC(skb_allocs, 1); diff --git a/homa_sock.c b/homa_sock.c index e3e6e175..7fe8e146 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -154,7 +154,7 @@ int homa_sock_init(struct homa_sock *hsk, struct homa *homa) atomic_set(&hsk->protect_count, 0); hsk->homa = homa; hsk->ip_header_length = (hsk->inet.sk.sk_family == AF_INET) - ? HOMA_IPV4_HEADER_LENGTH : HOMA_IPV6_HEADER_LENGTH; + ? sizeof(struct iphdr) :sizeof(struct ipv6hdr); hsk->is_server = false; hsk->shutdown = false; starting_port = homa->prev_default_port; diff --git a/homa_stub.h b/homa_stub.h index 3bfe7b8b..ab87f3cc 100644 --- a/homa_stub.h +++ b/homa_stub.h @@ -76,8 +76,8 @@ static inline struct sk_buff *homa_skb_new_tx(int length) { struct sk_buff *skb; - skb = alloc_skb(HOMA_SKB_EXTRA + HOMA_IPV6_HEADER_LENGTH + - sizeof(struct homa_skb_info) + length, GFP_ATOMIC); + skb = alloc_skb(HOMA_SKB_EXTRA + sizeof(struct homa_skb_info) + length, + GFP_ATOMIC); if (likely(skb)) { skb_reserve(skb, HOMA_SKB_EXTRA + HOMA_IPV6_HEADER_LENGTH); skb_reset_transport_header(skb); diff --git a/homa_wire.h b/homa_wire.h index 507db977..2cd64d5c 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -6,6 +6,14 @@ #define _HOMA_WIRE_H #include +#ifdef __UNIT_TEST__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" +#endif /* __UNIT_TEST__ */ +#include +#ifdef __UNIT_TEST__ +#pragma GCC diagnostic pop +#endif /* __UNIT_TEST__ */ /* Defines the possible types of Homa packets. * @@ -36,19 +44,13 @@ enum homa_packet_type { */ }; -/** define HOMA_IPV6_HEADER_LENGTH - Size of IP header (V6). */ -#define HOMA_IPV6_HEADER_LENGTH 40 - -/** define HOMA_IPV4_HEADER_LENGTH - Size of IP header (V4). */ -#define HOMA_IPV4_HEADER_LENGTH 20 - /** * define HOMA_SKB_EXTRA - How many bytes of additional space to allow at the - * beginning of each sk_buff, before the IP header. This includes room for a - * VLAN header and also includes some extra space, "just to be safe" (not - * really sure if this is needed). + * beginning of each sk_buff, before the Homa header. This includes room for + * either an IPV4 or IPV6 header, Ethernet header, VLAN header, etc. This is + * a bit of an overestimate, since it also includes space for a TCP header. 
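+ * (MAX_TCP_HEADER is defined in net/tcp.h as 128 bytes plus MAX_HEADER,
+ * so it covers the link-level headers as well.)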
*/ -#define HOMA_SKB_EXTRA 40 +#define HOMA_SKB_EXTRA MAX_TCP_HEADER /** * define HOMA_ETH_OVERHEAD - Number of bytes per Ethernet packet for Ethernet diff --git a/test/mock.c b/test/mock.c index c1984b0e..52d0e5ce 100644 --- a/test/mock.c +++ b/test/mock.c @@ -1768,8 +1768,8 @@ void mock_set_core(int num) void mock_set_ipv6(struct homa_sock *hsk) { mock_ipv6 = true; - mock_mtu -= hsk->ip_header_length - HOMA_IPV6_HEADER_LENGTH; - hsk->ip_header_length = HOMA_IPV6_HEADER_LENGTH; + mock_mtu -= hsk->ip_header_length - sizeof(struct ipv6hdr); + hsk->ip_header_length = sizeof(struct ipv6hdr); hsk->sock.sk_family = AF_INET6; } diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 32f8b5ba..5e590069 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -55,8 +55,7 @@ static void mock_resend_data(struct homa_rpc *rpc, int start, int end, static int true_size(int msg_bytes) { return SKB_TRUESIZE(msg_bytes + HOMA_SKB_EXTRA + - HOMA_IPV6_HEADER_LENGTH + sizeof(struct homa_skb_info) + - sizeof(struct homa_data_hdr)); + sizeof(struct homa_skb_info) + sizeof(struct homa_data_hdr)); } FIXTURE(homa_outgoing) { diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c index a13d7c58..a1d5a781 100644 --- a/test/unit_homa_sock.c +++ b/test/unit_homa_sock.c @@ -168,8 +168,8 @@ TEST_F(homa_sock, homa_sock_init__ip_header_length) mock_sock_init(&hsk_v4, &self->homa, 0); mock_ipv6 = true; mock_sock_init(&hsk_v6, &self->homa, 0); - EXPECT_EQ(HOMA_IPV4_HEADER_LENGTH, hsk_v4.ip_header_length); - EXPECT_EQ(HOMA_IPV6_HEADER_LENGTH, hsk_v6.ip_header_length); + EXPECT_EQ(sizeof(struct iphdr), hsk_v4.ip_header_length); + EXPECT_EQ(sizeof(struct ipv6hdr), hsk_v6.ip_header_length); homa_sock_destroy(&hsk_v4); homa_sock_destroy(&hsk_v6); } From 90ee81ac86689b9bf1fc108e64f1b896ab8d06f0 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 5 May 2025 17:15:36 -0700 Subject: [PATCH 278/625] Rename unit test functions to avoid conflicts with Linux --- test/unit_homa_offload.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index 0bc5f3ee..f482456c 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -11,16 +11,16 @@ #define cur_offload_core (&per_cpu(homa_offload_core, smp_processor_id())) -static struct sk_buff *tcp_gro_receive(struct list_head *held_list, +static struct sk_buff *test_tcp_gro_receive(struct list_head *held_list, struct sk_buff *skb) { - UNIT_LOG("; ", "tcp_gro_receive"); + UNIT_LOG("; ", "test_tcp_gro_receive"); return NULL; } -static struct sk_buff *tcp6_gro_receive(struct list_head *held_list, +static struct sk_buff *unit_tcp6_gro_receive(struct list_head *held_list, struct sk_buff *skb) { - UNIT_LOG("; ", "tcp6_gro_receive"); + UNIT_LOG("; ", "unit_tcp6_gro_receive"); return NULL; } @@ -79,9 +79,9 @@ FIXTURE_SETUP(homa_offload) list_add_tail(&self->skb->list, &self->napi.gro_hash[2].list); list_add_tail(&self->skb2->list, &self->napi.gro_hash[2].list); INIT_LIST_HEAD(&self->empty_list); - self->tcp_offloads.callbacks.gro_receive = tcp_gro_receive; + self->tcp_offloads.callbacks.gro_receive = test_tcp_gro_receive; inet_offloads[IPPROTO_TCP] = &self->tcp_offloads; - self->tcp6_offloads.callbacks.gro_receive = tcp6_gro_receive; + self->tcp6_offloads.callbacks.gro_receive = unit_tcp6_gro_receive; inet6_offloads[IPPROTO_TCP] = &self->tcp6_offloads; homa_offload_init(); @@ -115,16 +115,16 @@ TEST_F(homa_offload, homa_gro_hook_tcp) homa_gro_hook_tcp(); 
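	/* Unhooking right away should restore the original (mock) TCP
 	 * callbacks, which the checks below verify.
 	 */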
homa_gro_unhook_tcp(); - EXPECT_EQ(&tcp_gro_receive, + EXPECT_EQ(&test_tcp_gro_receive, inet_offloads[IPPROTO_TCP]->callbacks.gro_receive); - EXPECT_EQ(&tcp6_gro_receive, + EXPECT_EQ(&unit_tcp6_gro_receive, inet6_offloads[IPPROTO_TCP]->callbacks.gro_receive); /* Second unhook call should do nothing. */ homa_gro_unhook_tcp(); - EXPECT_EQ(&tcp_gro_receive, + EXPECT_EQ(&test_tcp_gro_receive, inet_offloads[IPPROTO_TCP]->callbacks.gro_receive); - EXPECT_EQ(&tcp6_gro_receive, + EXPECT_EQ(&unit_tcp6_gro_receive, inet6_offloads[IPPROTO_TCP]->callbacks.gro_receive); } @@ -139,7 +139,7 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_tcp) h = (struct homa_common_hdr *) skb_transport_header(skb); h->flags = 0; EXPECT_EQ(NULL, homa_tcp_gro_receive(&self->empty_list, skb)); - EXPECT_STREQ("tcp_gro_receive", unit_log_get()); + EXPECT_STREQ("test_tcp_gro_receive", unit_log_get()); kfree_skb(skb); unit_log_clear(); @@ -147,7 +147,7 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_tcp) h = (struct homa_common_hdr *)skb_transport_header(skb); h->urgent -= 1; EXPECT_EQ(NULL, homa_tcp_gro_receive(&self->empty_list, skb)); - EXPECT_STREQ("tcp_gro_receive", unit_log_get()); + EXPECT_STREQ("test_tcp_gro_receive", unit_log_get()); kfree_skb(skb); homa_gro_unhook_tcp(); } From e3fe78c644d9c30455405900637d250ea07e93ab Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 5 May 2025 21:12:04 -0700 Subject: [PATCH 279/625] Eliminate sizeof32 define: use sizeof instead --- homa_devel.c | 4 +-- homa_impl.h | 2 -- homa_outgoing.c | 12 ++++---- homa_plumbing.c | 58 +++++++++++++++++++-------------------- test/unit_homa_outgoing.c | 4 +-- test/unit_homa_plumbing.c | 28 +++++++++---------- test/unit_homa_utils.c | 6 ++-- 7 files changed, 56 insertions(+), 58 deletions(-) diff --git a/homa_devel.c b/homa_devel.c index 04980482..ba0d4268 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -166,7 +166,7 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) ", TSO disabled"); if (skb_shinfo(skb)->gso_segs <= 1) break; - pos = skb_transport_offset(skb) + sizeof32(*h) + seg_length; + pos = skb_transport_offset(skb) + sizeof(*h) + seg_length; used = homa_snprintf(buffer, buf_len, used, ", extra segs"); for (i = skb_shinfo(skb)->gso_segs - 1; i > 0; i--) { if (homa_info->seg_length < skb_shinfo(skb)->gso_size) { @@ -296,7 +296,7 @@ char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len) if (offset == -1) offset = ntohl(h->common.sequence); - pos = skb_transport_offset(skb) + sizeof32(*h) + seg_length; + pos = skb_transport_offset(skb) + sizeof(*h) + seg_length; used = homa_snprintf(buffer, buf_len, 0, "DATA%s %d@%d", h->retransmit ? " retrans" : "", seg_length, offset); diff --git a/homa_impl.h b/homa_impl.h index 1421171a..daf72df7 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -81,8 +81,6 @@ struct homa_sock; void homa_throttle_lock_slow(struct homa *homa); #endif /* See strip.py */ -#define sizeof32(type) ((int)(sizeof(type))) - #ifdef __CHECKER__ #define __context__(x, y, z) __attribute__((context(x, y, z))) #else diff --git a/homa_outgoing.c b/homa_outgoing.c index 6fe746f0..ca8f3267 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -140,10 +140,10 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, /* Initialize the overall skb. 
*/ #ifndef __STRIP__ /* See strip.py */ - skb = homa_skb_new_tx(sizeof32(struct homa_data_hdr)); + skb = homa_skb_new_tx(sizeof(struct homa_data_hdr)); #else /* See strip.py */ - skb = homa_skb_new_tx(sizeof32(struct homa_data_hdr) + length + - (segs - 1) * sizeof32(struct homa_seg_hdr)); + skb = homa_skb_new_tx(sizeof(struct homa_data_hdr) + length + + (segs - 1) * sizeof(struct homa_seg_hdr)); #endif /* See strip.py */ if (!skb) return ERR_PTR(-ENOMEM); @@ -195,7 +195,7 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, if (segs > 1) { #endif /* See strip.py */ homa_set_doff(h, sizeof(struct homa_data_hdr) - - sizeof32(struct homa_seg_hdr)); + sizeof(struct homa_seg_hdr)); #ifndef __STRIP__ /* See strip.py */ h->seg.offset = htonl(offset); #endif /* See strip.py */ @@ -788,7 +788,7 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end) continue; offset = homa_info->offset; - seg_offset = sizeof32(struct homa_data_hdr); + seg_offset = sizeof(struct homa_data_hdr); data_left = homa_info->data_bytes; if (skb_shinfo(skb)->gso_segs <= 1) { seg_length = data_left; @@ -829,7 +829,7 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end) goto resend_done; } h = __skb_put_data(new_skb, skb_transport_header(skb), - sizeof32(struct homa_data_hdr)); + sizeof(struct homa_data_hdr)); h->common.sequence = htonl(offset); h->seg.offset = htonl(offset); h->retransmit = 1; diff --git a/homa_plumbing.c b/homa_plumbing.c index e57c6155..9b25d885 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -398,27 +398,27 @@ static struct ctl_table homa_ctl_table[] = { /* Sizes of the headers for each Homa packet type, in bytes. */ #ifndef __STRIP__ /* See strip.py */ static __u16 header_lengths[] = { - sizeof32(struct homa_data_hdr), - sizeof32(struct homa_grant_hdr), - sizeof32(struct homa_resend_hdr), - sizeof32(struct homa_rpc_unknown_hdr), - sizeof32(struct homa_busy_hdr), - sizeof32(struct homa_cutoffs_hdr), - sizeof32(struct homa_freeze_hdr), - sizeof32(struct homa_need_ack_hdr), - sizeof32(struct homa_ack_hdr) + sizeof(struct homa_data_hdr), + sizeof(struct homa_grant_hdr), + sizeof(struct homa_resend_hdr), + sizeof(struct homa_rpc_unknown_hdr), + sizeof(struct homa_busy_hdr), + sizeof(struct homa_cutoffs_hdr), + sizeof(struct homa_freeze_hdr), + sizeof(struct homa_need_ack_hdr), + sizeof(struct homa_ack_hdr) }; #else /* See strip.py */ static __u16 header_lengths[] = { - sizeof32(struct homa_data_hdr), + sizeof(struct homa_data_hdr), 0, - sizeof32(struct homa_resend_hdr), - sizeof32(struct homa_rpc_unknown_hdr), - sizeof32(struct homa_busy_hdr), + sizeof(struct homa_resend_hdr), + sizeof(struct homa_rpc_unknown_hdr), + sizeof(struct homa_busy_hdr), 0, 0, - sizeof32(struct homa_need_ack_hdr), - sizeof32(struct homa_ack_hdr) + sizeof(struct homa_need_ack_hdr), + sizeof(struct homa_ack_hdr) }; #endif /* See strip.py */ @@ -434,20 +434,20 @@ int __init homa_load(void) pr_err("Homa module loading\n"); #ifndef __STRIP__ /* See strip.py */ - pr_notice("Homa structure sizes: homa_data_hdr %u, homa_seg_hdr %u, ack %u, peer %u, ip_hdr %u flowi %u ipv6_hdr %u, flowi6 %u tcp_sock %u homa_rpc %u sk_buff %u rcvmsg_control %u union sockaddr_in_union %u HOMA_MAX_BPAGES %u NR_CPUS %u nr_cpu_ids %u, MAX_NUMNODES %d\n", - sizeof32(struct homa_data_hdr), - sizeof32(struct homa_seg_hdr), - sizeof32(struct homa_ack), - sizeof32(struct homa_peer), - sizeof32(struct iphdr), - sizeof32(struct flowi), - sizeof32(struct ipv6hdr), - sizeof32(struct flowi6), - sizeof32(struct tcp_sock), - 
sizeof32(struct homa_rpc), - sizeof32(struct sk_buff), - sizeof32(struct homa_recvmsg_args), - sizeof32(union sockaddr_in_union), + pr_notice("Homa structure sizes: homa_data_hdr %lu, homa_seg_hdr %lu, ack %lu, peer %lu, ip_hdr %lu flowi %lu ipv6_hdr %lu, flowi6 %lu tcp_sock %lu homa_rpc %lu sk_buff %lu rcvmsg_control %lu union sockaddr_in_union %lu HOMA_MAX_BPAGES %u NR_CPUS %u nr_cpu_ids %u, MAX_NUMNODES %d\n", + sizeof(struct homa_data_hdr), + sizeof(struct homa_seg_hdr), + sizeof(struct homa_ack), + sizeof(struct homa_peer), + sizeof(struct iphdr), + sizeof(struct flowi), + sizeof(struct ipv6hdr), + sizeof(struct flowi6), + sizeof(struct tcp_sock), + sizeof(struct homa_rpc), + sizeof(struct sk_buff), + sizeof(struct homa_recvmsg_args), + sizeof(union sockaddr_in_union), HOMA_MAX_BPAGES, NR_CPUS, nr_cpu_ids, diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 5e590069..d0b4c330 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -147,8 +147,8 @@ TEST_F(homa_outgoing, homa_fill_data_interleaved) EXPECT_STREQ("DATA from 0.0.0.0:40000, dport 99, id 2, message_length 10000, offset 10000, data_length 1500, extra segs 1500@11500 1500@13000 500@14500", homa_print_packet(skb, buffer, sizeof(buffer))); #endif /* See strip.py */ - EXPECT_EQ(5000 + sizeof32(struct homa_data_hdr) - + 3*sizeof32(struct homa_seg_hdr), skb->len); + EXPECT_EQ(5000 + sizeof(struct homa_data_hdr) + + 3*sizeof(struct homa_seg_hdr), skb->len); kfree_skb(skb); } TEST_F(homa_outgoing, homa_fill_data_interleaved__error_copying_data) diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 18eee10a..c49a7157 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -363,7 +363,7 @@ TEST_F(homa_plumbing, homa_setsockopt__server_success) TEST_F(homa_plumbing, homa_getsockopt__recvbuf_success) { struct homa_rcvbuf_args val; - int size = sizeof32(val) + 10; + int size = sizeof(val) + 10; homa_pool_destroy(self->hsk.buffer_pool); self->hsk.buffer_pool = homa_pool_new(&self->hsk); @@ -374,12 +374,12 @@ TEST_F(homa_plumbing, homa_getsockopt__recvbuf_success) SO_HOMA_RCVBUF, (char *)&val, &size)); EXPECT_EQ(0x40000, val.start); EXPECT_EQ(10*HOMA_BPAGE_SIZE, val.length); - EXPECT_EQ(sizeof32(val), size); + EXPECT_EQ(sizeof(val), size); } TEST_F(homa_plumbing, homa_getsockopt__cant_read_size) { struct homa_rcvbuf_args val; - int size = sizeof32(val); + int size = sizeof(val); mock_copy_data_errors = 1; EXPECT_EQ(EFAULT, -homa_getsockopt(&self->hsk.sock, 0, SO_HOMA_RCVBUF, @@ -388,7 +388,7 @@ TEST_F(homa_plumbing, homa_getsockopt__cant_read_size) TEST_F(homa_plumbing, homa_getsockopt__bad_level) { struct homa_rcvbuf_args val; - int size = sizeof32(val); + int size = sizeof(val); EXPECT_EQ(ENOPROTOOPT, -homa_getsockopt(&self->hsk.sock, 0, SO_HOMA_RCVBUF, (char *)&val, &size)); @@ -396,7 +396,7 @@ TEST_F(homa_plumbing, homa_getsockopt__bad_level) TEST_F(homa_plumbing, homa_getsockopt__recvbuf_bad_length) { struct homa_rcvbuf_args val; - int size = sizeof32(val) - 1; + int size = sizeof(val) - 1; EXPECT_EQ(EINVAL, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, SO_HOMA_RCVBUF, (char *)&val, &size)); @@ -404,7 +404,7 @@ TEST_F(homa_plumbing, homa_getsockopt__recvbuf_bad_length) TEST_F(homa_plumbing, homa_getsockopt__server_bad_length) { int is_server; - int size = sizeof32(is_server) - 1; + int size = sizeof(is_server) - 1; EXPECT_EQ(EINVAL, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, SO_HOMA_SERVER, (char *)&is_server, &size)); @@ -412,7 +412,7 @@ 
TEST_F(homa_plumbing, homa_getsockopt__server_bad_length) TEST_F(homa_plumbing, homa_getsockopt__server_success) { int is_server; - int size = sizeof32(is_server); + int size = sizeof(is_server); self->hsk.is_server = 1; EXPECT_EQ(0, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, @@ -430,7 +430,7 @@ TEST_F(homa_plumbing, homa_getsockopt__server_success) TEST_F(homa_plumbing, homa_getsockopt__bad_optname) { struct homa_rcvbuf_args val; - int size = sizeof32(val); + int size = sizeof(val); EXPECT_EQ(ENOPROTOOPT, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, SO_HOMA_RCVBUF-1, (char *)&val, &size)); @@ -438,26 +438,26 @@ TEST_F(homa_plumbing, homa_getsockopt__bad_optname) TEST_F(homa_plumbing, homa_getsockopt__cant_copy_out_size) { struct homa_rcvbuf_args val = {.start = 0, .length = 0}; - int size = sizeof32(val) + 10; + int size = sizeof(val) + 10; mock_copy_to_user_errors = 1; EXPECT_EQ(EFAULT, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, SO_HOMA_RCVBUF, (char *)&val, &size)); EXPECT_EQ(0, val.start); - EXPECT_EQ(sizeof32(val) + 10, size); + EXPECT_EQ(sizeof(val) + 10, size); } TEST_F(homa_plumbing, homa_getsockopt__cant_copy_out_value) { struct homa_rcvbuf_args val = {.start = 0, .length = 0}; - int size = sizeof32(val) + 10; + int size = sizeof(val) + 10; mock_copy_to_user_errors = 2; EXPECT_EQ(EFAULT, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, SO_HOMA_RCVBUF, (char *)&val, &size)); EXPECT_EQ(0, val.start); - EXPECT_EQ(sizeof32(val), size); + EXPECT_EQ(sizeof(val), size); } TEST_F(homa_plumbing, homa_sendmsg__msg_name_null) @@ -773,7 +773,7 @@ TEST_F(homa_plumbing, homa_recvmsg__normal_completion_ipv4) EXPECT_EQ(AF_INET, self->addr.in4.sin_family); EXPECT_STREQ("1.2.3.4", homa_print_ipv4_addr( self->addr.in4.sin_addr.s_addr)); - EXPECT_EQ(sizeof32(struct sockaddr_in), + EXPECT_EQ(sizeof(struct sockaddr_in), self->recvmsg_hdr.msg_namelen); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); EXPECT_EQ(1, self->recvmsg_args.num_bpages); @@ -804,7 +804,7 @@ TEST_F(homa_plumbing, homa_recvmsg__normal_completion_ipv6) EXPECT_EQ(AF_INET6, self->addr.in6.sin6_family); EXPECT_STREQ("[1::3:5:7]", homa_print_ipv6_addr( &self->addr.in6.sin6_addr)); - EXPECT_EQ(sizeof32(struct sockaddr_in6), + EXPECT_EQ(sizeof(struct sockaddr_in6), self->recvmsg_hdr.msg_namelen); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); EXPECT_EQ(0, crpc->msgin.num_bpages); diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index bf63f5bc..e66c3ee4 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -113,18 +113,18 @@ TEST_F(homa_utils, homa_snprintf) char buffer[50]; int used = 0; - used = homa_snprintf(buffer, sizeof32(buffer), used, + used = homa_snprintf(buffer, sizeof(buffer), used, "Test message with values: %d and %d", 100, 1000); EXPECT_EQ(38, used); EXPECT_STREQ("Test message with values: 100 and 1000", buffer); - used = homa_snprintf(buffer, sizeof32(buffer), used, + used = homa_snprintf(buffer, sizeof(buffer), used, "; plus: %d", 123456); EXPECT_EQ(49, used); EXPECT_STREQ("Test message with values: 100 and 1000; plus: 123", buffer); - used = homa_snprintf(buffer, sizeof32(buffer), used, + used = homa_snprintf(buffer, sizeof(buffer), used, "more text, none of which fits"); EXPECT_EQ(49, used); EXPECT_STREQ("Test message with values: 100 and 1000; plus: 123", From 54e83b9cd151f1578fe7ebf395ac51b74cb490b2 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 5 May 2025 21:25:08 -0700 Subject: [PATCH 280/625] Delete unused is_homa_pkt function Doesn't work correctly 
anyway (no IPv6 support). --- homa_impl.h | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index daf72df7..0f2bf9d6 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -614,19 +614,6 @@ static inline struct in6_addr skb_canonical_ipv6_saddr(struct sk_buff *skb) return mapped; } -static inline bool is_homa_pkt(struct sk_buff *skb) -{ - struct iphdr *iph = ip_hdr(skb); - -#ifndef __STRIP__ /* See strip.py */ - return ((iph->protocol == IPPROTO_HOMA) || - ((iph->protocol == IPPROTO_TCP) && - (tcp_hdr(skb)->urg_ptr == htons(HOMA_TCP_URGENT)))); -#else /* See strip.py */ - return (iph->protocol == IPPROTO_HOMA); -#endif /* See strip.py */ -} - /** * homa_make_header_avl() - Invokes pskb_may_pull to make sure that all the * Homa header information for a packet is in the linear part of the skb From ecb8988a003fbf4c6c4e42cf9556b4848ae73191 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 5 May 2025 21:31:22 -0700 Subject: [PATCH 281/625] Fix stripping bug in homa_impl.h Code that was supposed to get stripped was slipping through to the upstream version. --- homa_impl.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/homa_impl.h b/homa_impl.h index 0f2bf9d6..919ed8e6 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -631,6 +631,7 @@ static inline bool homa_make_header_avl(struct sk_buff *skb) return pskb_may_pull(skb, pull_length); } +#ifndef __STRIP__ /* See strip.py */ #ifdef __UNIT_TEST__ void unit_log_printf(const char *separator, const char *format, ...) __printf(2, 3); @@ -641,6 +642,7 @@ void unit_hook(char *id); #define UNIT_LOG(...) #define UNIT_HOOK(...) #endif /* __UNIT_TEST__ */ +#endif /* See strip.py */ extern unsigned int homa_net_id; From 17f791fcbf813e20827fb512c71b9c31d670bfb2 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 6 May 2025 09:13:09 -0700 Subject: [PATCH 282/625] Fix issues from 'checkpatch.pl --strict' --- homa_devel.h | 4 ++-- homa_grant.c | 11 ++++++----- homa_incoming.c | 4 ++-- homa_interest.h | 2 +- homa_metrics.h | 2 +- homa_offload.c | 2 -- homa_outgoing.c | 4 ++-- homa_pacer.c | 14 +++++++------- homa_plumbing.c | 12 ++++++------ homa_pool.c | 9 +++++---- homa_rpc.h | 1 - homa_sock.c | 4 ++-- homa_stub.h | 2 +- 13 files changed, 35 insertions(+), 36 deletions(-) diff --git a/homa_devel.h b/homa_devel.h index ee4bf716..8e0638ae 100644 --- a/homa_devel.h +++ b/homa_devel.h @@ -52,7 +52,7 @@ static inline void check_addr_valid(void *addr, char *info) { #ifndef __UNIT_TEST__ #define HIGH_BITS 0xffff800000000000 - u64 int_addr = (u64) addr; + u64 int_addr = (u64)addr; if ((int_addr & HIGH_BITS) != HIGH_BITS) { pr_err("Bogus address 0x%px (%s))\n", addr, info); @@ -81,7 +81,7 @@ char *homa_print_ipv4_addr(__be32 addr); char *homa_print_ipv6_addr(const struct in6_addr *addr); char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len); char *homa_print_packet_short(struct sk_buff *skb, char *buffer, - int buf_len); + int buf_len); int homa_snprintf(char *buffer, int size, int used, const char *format, ...) __printf(4, 5); char *homa_symbol_for_type(uint8_t type); diff --git a/homa_grant.c b/homa_grant.c index 792696bc..4525f403 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -15,7 +15,7 @@ * @data fields are actually offsets within a struct homa_grant; these are * converted to pointers into a net-specific struct grant later. 
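 * (homa_grant_dointvec() performs that conversion: it adds the stored
 * offset to the address of the per-net struct homa_grant to recover the
 * field pointer; see table_copy.data below.)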
*/ -#define OFFSET(field) ((void *) offsetof(struct homa_grant, field)) +#define OFFSET(field) ((void *)offsetof(struct homa_grant, field)) static struct ctl_table grant_ctl_table[] = { { .procname = "fifo_grant_increment", @@ -154,7 +154,8 @@ void homa_grant_init_rpc(struct homa_rpc *rpc, int unsched) rpc->msgin.prev_grant = rpc->msgin.granted; return; } - rpc->msgin.granted = rpc->msgin.prev_grant = unsched; + rpc->msgin.granted = unsched; + rpc->msgin.prev_grant = unsched; homa_grant_manage_rpc(rpc); } @@ -611,8 +612,8 @@ bool homa_grant_update_granted(struct homa_rpc *rpc, struct homa_grant *grant) if (avl_incoming < incoming_delta) { atomic_set(&grant->incoming_hit_limit, 1); tt_record3("insufficient headroom for grant: needed %d, available %d, used %d", - incoming_delta, avl_incoming, - atomic_read(&grant->total_incoming)); + incoming_delta, avl_incoming, + atomic_read(&grant->total_incoming)); new_grant_offset -= incoming_delta - avl_incoming; } if (new_grant_offset <= rpc->msgin.granted) @@ -944,7 +945,7 @@ int homa_grant_dointvec(const struct ctl_table *table, int write, * net-specific struct homa. */ table_copy = *table; - table_copy.data = ((char *) grant) + (uintptr_t) table_copy.data; + table_copy.data = ((char *)grant) + (uintptr_t)table_copy.data; result = proc_dointvec(&table_copy, write, buffer, lenp, ppos); if (write) diff --git a/homa_incoming.c b/homa_incoming.c index e976ab24..c22a7437 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -1239,7 +1239,7 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) rpc->msgin.bytes_remaining == 0 && skb_queue_len(&rpc->msgin.packets) == 0) { tt_record2("homa_wait_private found rpc id %d, pid %d via null, blocked 0", - rpc->id, current->pid); + rpc->id, current->pid); break; } @@ -1259,7 +1259,7 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) atomic_andnot(APP_NEEDS_LOCK, &rpc->flags); homa_interest_unlink_private(&interest); tt_record3("homa_wait_private found rpc id %d, pid %d via handoff, blocked %d", - rpc->id, current->pid, interest.blocked); + rpc->id, current->pid, interest.blocked); /* If homa_interest_wait returned an error but the interest * actually got ready, then ignore the error. diff --git a/homa_interest.h b/homa_interest.h index e8607e18..d08adbbe 100644 --- a/homa_interest.h +++ b/homa_interest.h @@ -92,7 +92,7 @@ static inline void homa_interest_unlink_private(struct homa_interest *interest) } void homa_interest_init_shared(struct homa_interest *interest, - struct homa_sock *hsk); + struct homa_sock *hsk); int homa_interest_init_private(struct homa_interest *interest, struct homa_rpc *rpc); void homa_interest_notify_private(struct homa_rpc *rpc); diff --git a/homa_metrics.h b/homa_metrics.h index 279572ad..5fe7a97c 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -117,7 +117,7 @@ struct homa_metrics { /** * @wait_fast: total number of times that a message arrived for * a receiving thread while it was polling (i.e. the message - * wasn't immediatly available, but the thread never blocked). + * wasn't immediately available, but the thread never blocked). 
*/ u64 wait_fast; diff --git a/homa_offload.c b/homa_offload.c index 0eab9739..afc0915d 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -161,8 +161,6 @@ struct sk_buff *homa_tcp_gro_receive(struct list_head *held_list, } return homa_gro_receive(held_list, skb); } -#ifndef __STRIP__ /* See strip.py */ -#endif /* See strip.py */ /** * homa_set_softirq_cpu() - Arrange for SoftIRQ processing of a packet to diff --git a/homa_outgoing.c b/homa_outgoing.c index ca8f3267..0b0c3fdf 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -862,8 +862,8 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end) new_homa_info->offset = offset; tt_record3("retransmitting offset %d, length %d, id %d", offset, seg_length, rpc->id); - homa_pacer_check_nic_q(rpc->hsk->homa->pacer, - new_skb, true); + homa_pacer_check_nic_q(rpc->hsk->homa->pacer, new_skb, + true); #ifndef __STRIP__ /* See strip.py */ __homa_xmit_data(new_skb, rpc, priority); #else /* See strip.py */ diff --git a/homa_pacer.c b/homa_pacer.c index 2cc0839b..9462fc50 100644 --- a/homa_pacer.c +++ b/homa_pacer.c @@ -13,7 +13,7 @@ * @data fields are actually offsets within a struct homa_pacer; these are * converted to pointers into a net-specific struct homa later. */ -#define OFFSET(field) ((void *) offsetof(struct homa_pacer, field)) +#define OFFSET(field) ((void *)offsetof(struct homa_pacer, field)) static struct ctl_table pacer_ctl_table[] = { { .procname = "link_mbps", @@ -271,7 +271,7 @@ void homa_pacer_xmit(struct homa_pacer *pacer) pacer->fifo_count += 1000; rpc = NULL; list_for_each_entry(cur, &pacer->throttled_rpcs, - throttled_links) { + throttled_links) { if (cur->msgout.init_ns < oldest) { rpc = cur; oldest = cur->msgout.init_ns; @@ -312,7 +312,7 @@ void homa_pacer_xmit(struct homa_pacer *pacer) * (right now), so remove it from the throttled list. */ tt_record2("pacer removing id %d from throttled list, offset %d", - rpc->id, rpc->msgout.next_xmit_offset); + rpc->id, rpc->msgout.next_xmit_offset); homa_pacer_unmanage_rpc(rpc); } homa_rpc_unlock(rpc); @@ -347,7 +347,7 @@ void homa_pacer_manage_rpc(struct homa_rpc *rpc) bytes_left = rpc->msgout.length - rpc->msgout.next_xmit_offset; homa_pacer_throttle_lock(pacer); list_for_each_entry(candidate, &pacer->throttled_rpcs, - throttled_links) { + throttled_links) { int bytes_left_cand; checks++; @@ -359,7 +359,7 @@ void homa_pacer_manage_rpc(struct homa_rpc *rpc) candidate->msgout.next_xmit_offset; if (bytes_left_cand > bytes_left) { list_add_tail(&rpc->throttled_links, - &candidate->throttled_links); + &candidate->throttled_links); goto done; } } @@ -426,7 +426,7 @@ void homa_pacer_update_sysctl_deps(struct homa_pacer *pacer) * Return: 0 for success, nonzero for error. */ int homa_pacer_dointvec(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct homa_pacer *pacer = homa_from_net(current->nsproxy->net_ns)->pacer; struct ctl_table table_copy; @@ -436,7 +436,7 @@ int homa_pacer_dointvec(const struct ctl_table *table, int write, * net-specific struct homa. 
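	 * (table->data holds an offset produced by the OFFSET() macro at the
 	 * top of this file; adding it to the per-net pacer address yields
 	 * the actual field.)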
*/ table_copy = *table; - table_copy.data = ((char *) pacer) + (uintptr_t) table_copy.data; + table_copy.data = ((char *)pacer) + (uintptr_t)table_copy.data; result = proc_dointvec(&table_copy, write, buffer, lenp, ppos); if (write) diff --git a/homa_plumbing.c b/homa_plumbing.c index 9b25d885..42844b25 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -159,7 +159,7 @@ static struct inet6_protocol homav6_protocol = { * @data fields are actually offsets within a struct homa; these are converted * to pointers into a net-specific struct homa later. */ -#define OFFSET(field) ((void *) offsetof(struct homa, field)) +#define OFFSET(field) ((void *)offsetof(struct homa, field)) static struct ctl_table homa_ctl_table[] = { { .procname = "action", @@ -346,7 +346,7 @@ static struct ctl_table homa_ctl_table[] = { { .procname = "temp", .data = OFFSET(temp[0]), - .maxlen = sizeof(((struct homa *) 0)->temp), + .maxlen = sizeof(((struct homa *)0)->temp), .mode = 0644, .proc_handler = homa_dointvec }, @@ -579,7 +579,7 @@ int homa_net_init(struct net *net) #ifndef __STRIP__ /* See strip.py */ homa->sysctl_header = register_net_sysctl(net, "net/homa", - homa_ctl_table); + homa_ctl_table); if (!homa->sysctl_header) { pr_err("couldn't register Homa sysctl parameters\n"); status = -ENOMEM; @@ -836,7 +836,7 @@ int homa_setsockopt(struct sock *sk, int level, int optname, * first page of the region. */ if (copy_to_user(u64_to_user_ptr(args.start), &args, - sizeof(args))) + sizeof(args))) return -EFAULT; homa_sock_lock(hsk); @@ -962,7 +962,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) goto error; } if (args.flags & ~HOMA_SENDMSG_VALID_FLAGS || - (args.reserved != 0)) { + args.reserved != 0) { result = -EINVAL; goto error; } @@ -1599,7 +1599,7 @@ int homa_dointvec(const struct ctl_table *table, int write, * net-specific struct homa. */ table_copy = *table; - table_copy.data = ((char *) homa) + (uintptr_t) table_copy.data; + table_copy.data = ((char *)homa) + (uintptr_t)table_copy.data; result = proc_dointvec(&table_copy, write, buffer, lenp, ppos); if (write) { diff --git a/homa_pool.c b/homa_pool.c index d6e5927b..ee6755af 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -36,9 +36,10 @@ static void set_bpages_needed(struct homa_pool *pool) { struct homa_rpc *rpc = list_first_entry(&pool->hsk->waiting_for_bufs, - struct homa_rpc, buf_links); - pool->bpages_needed = (rpc->msgin.length + HOMA_BPAGE_SIZE - 1) - >> HOMA_BPAGE_SHIFT; + struct homa_rpc, buf_links); + + pool->bpages_needed = (rpc->msgin.length + HOMA_BPAGE_SIZE - 1) >> + HOMA_BPAGE_SHIFT; } /** @@ -68,7 +69,7 @@ struct homa_pool *homa_pool_new(struct homa_sock *hsk) * Return: Either zero (for success) or a negative errno for failure. */ int homa_pool_set_region(struct homa_pool *pool, void __user *region, - u64 region_size) + u64 region_size) { int i, result; diff --git a/homa_rpc.h b/homa_rpc.h index 0efee70d..d4827216 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -189,7 +189,6 @@ struct homa_message_in { */ int rec_incoming; - /** * @birth: sched_clock() time when homa_grant_manage_rpc was invoked * for this RPC. Managed by homa_grant.c. Only set if the RPC needs diff --git a/homa_sock.c b/homa_sock.c index 7fe8e146..b4eb9d6b 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -154,7 +154,7 @@ int homa_sock_init(struct homa_sock *hsk, struct homa *homa) atomic_set(&hsk->protect_count, 0); hsk->homa = homa; hsk->ip_header_length = (hsk->inet.sk.sk_family == AF_INET) - ? sizeof(struct iphdr) :sizeof(struct ipv6hdr); + ? 
sizeof(struct iphdr) : sizeof(struct ipv6hdr); hsk->is_server = false; hsk->shutdown = false; starting_port = homa->prev_default_port; @@ -300,7 +300,7 @@ void homa_sock_shutdown(struct homa_sock *hsk) tx_memory = refcount_read(&hsk->sock.sk_wmem_alloc); if (tx_memory != 1) { pr_err("%s found sk_wmem_alloc %llu bytes, port %d\n", - __func__, tx_memory, hsk->port); + __func__, tx_memory, hsk->port); #ifdef __UNIT_TEST__ FAIL(" sk_wmem_alloc %llu after shutdown for port %d", tx_memory, hsk->port); diff --git a/homa_stub.h b/homa_stub.h index ab87f3cc..206d2656 100644 --- a/homa_stub.h +++ b/homa_stub.h @@ -77,7 +77,7 @@ static inline struct sk_buff *homa_skb_new_tx(int length) struct sk_buff *skb; skb = alloc_skb(HOMA_SKB_EXTRA + sizeof(struct homa_skb_info) + length, - GFP_ATOMIC); + GFP_ATOMIC); if (likely(skb)) { skb_reserve(skb, HOMA_SKB_EXTRA + HOMA_IPV6_HEADER_LENGTH); skb_reset_transport_header(skb); From 1e06561db825fe1f36e261341cc03936099ef437 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 6 May 2025 10:05:04 -0700 Subject: [PATCH 283/625] Use "alloc" in function names instead of "new" Also use "free" instead of "destroy" Cleaned up a few other bad name choices as well --- homa_grant.c | 12 +- homa_grant.h | 6 +- homa_impl.h | 9 +- homa_incoming.c | 18 +-- homa_metrics.c | 2 +- homa_metrics.h | 6 +- homa_outgoing.c | 20 +-- homa_pacer.c | 10 +- homa_pacer.h | 4 +- homa_plumbing.c | 20 +-- homa_pool.c | 14 +-- homa_pool.h | 6 +- homa_rpc.c | 24 ++-- homa_rpc.h | 21 ++-- homa_skb.c | 8 +- homa_skb.h | 2 +- homa_sock.c | 4 +- homa_stub.h | 2 +- homa_timer.c | 6 +- homa_utils.c | 8 +- homa_wire.h | 2 +- test/mock.c | 8 +- test/mock.h | 4 +- test/unit_homa_grant.c | 26 ++-- test/unit_homa_incoming.c | 254 +++++++++++++++++++------------------- test/unit_homa_offload.c | 50 ++++---- test/unit_homa_outgoing.c | 92 +++++++------- test/unit_homa_pacer.c | 22 ++-- test/unit_homa_plumbing.c | 72 +++++------ test/unit_homa_pool.c | 62 +++++----- test/unit_homa_rpc.c | 104 ++++++++-------- test/unit_homa_skb.c | 22 ++-- test/unit_homa_timer.c | 54 ++++---- test/utils.c | 12 +- 34 files changed, 493 insertions(+), 493 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 4525f403..1175652b 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -77,12 +77,12 @@ static struct ctl_table grant_ctl_table[] = { #endif /* See strip.py */ /** - * homa_grant_new() - Allocate and initialize a new grant object, which + * homa_grant_alloc() - Allocate and initialize a new grant object, which * will hold grant management information for @homa. * @net: Network namespace that @homa is associated with. * Return: A pointer to the new struct grant, or a negative errno. */ -struct homa_grant *homa_grant_new(struct net *net) +struct homa_grant *homa_grant_alloc(struct net *net) { struct homa_grant *grant; int err; @@ -114,17 +114,17 @@ struct homa_grant *homa_grant_new(struct net *net) return grant; error: - homa_grant_destroy(grant); + homa_grant_free(grant); return ERR_PTR(err); } /** - * homa_grant_destroy() - Cleanup and destroy the grant object for a Homa + * homa_grant_free() - Cleanup and free the grant object for a Homa * transport. - * @grant: Object to destroy; caller must not reference the object + * @grant: Object to free; caller must not reference the object * again once this function returns. 
*/ -void homa_grant_destroy(struct homa_grant *grant) +void homa_grant_free(struct homa_grant *grant) { #ifndef __STRIP__ /* See strip.py */ if (grant->sysctl_header) { diff --git a/homa_grant.h b/homa_grant.h index af1456b4..37053252 100644 --- a/homa_grant.h +++ b/homa_grant.h @@ -220,17 +220,19 @@ struct homa_grant_candidates { }; +struct homa_grant + *homa_grant_alloc(struct net *net); void homa_grant_cand_add(struct homa_grant_candidates *cand, struct homa_rpc *rpc); void homa_grant_cand_check(struct homa_grant_candidates *cand, struct homa_grant *grant); void homa_grant_check_rpc(struct homa_rpc *rpc); -void homa_grant_destroy(struct homa_grant *grant); int homa_grant_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); void homa_grant_end_rpc(struct homa_rpc *rpc); void homa_grant_find_oldest(struct homa *homa); void homa_grant_fix_order(struct homa_grant *grant); +void homa_grant_free(struct homa_grant *grant); void homa_grant_init_rpc(struct homa_rpc *rpc, int unsched); struct homa_rpc *homa_grant_insert_active(struct homa_rpc *rpc); @@ -238,8 +240,6 @@ void homa_grant_insert_grantable(struct homa_rpc *rpc); void homa_grant_manage_rpc(struct homa_rpc *rpc); void homa_grant_lock_slow(struct homa_grant *grant); void homa_grant_log_tt(struct homa *homa); -struct homa_grant - *homa_grant_new(struct net *net); int homa_grant_outranks(struct homa_rpc *rpc1, struct homa_rpc *rpc2); void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc); diff --git a/homa_impl.h b/homa_impl.h index 919ed8e6..30805b45 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -667,7 +667,7 @@ int homa_err_handler_v6(struct sk_buff *skb, int offset, __be32 info); int homa_fill_data_interleaved(struct homa_rpc *rpc, struct sk_buff *skb, struct iov_iter *iter); -struct homa_gap *homa_gap_new(struct list_head *next, int start, int end); +struct homa_gap *homa_gap_alloc(struct list_head *next, int start, int end); void homa_gap_retry(struct homa_rpc *rpc); int homa_get_port(struct sock *sk, unsigned short snum); int homa_getsockopt(struct sock *sk, int level, int optname, @@ -682,9 +682,6 @@ int homa_message_out_fill(struct homa_rpc *rpc, void homa_message_out_init(struct homa_rpc *rpc, int length); void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, struct homa_rpc *rpc); -struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, - struct iov_iter *iter, int offset, - int length, int max_seg_data); int homa_net_init(struct net *net); void homa_net_exit(struct net *net); __poll_t homa_poll(struct file *file, struct socket *sock, @@ -705,7 +702,11 @@ int homa_shutdown(struct socket *sock, int how); int homa_softirq(struct sk_buff *skb); void homa_spin(int ns); void homa_timer(struct homa *homa); +void homa_timer_check_rpc(struct homa_rpc *rpc); int homa_timer_main(void *transport); +struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc, + struct iov_iter *iter, int offset, + int length, int max_seg_data); void homa_unhash(struct sock *sk); void homa_rpc_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc); void homa_unload(void); diff --git a/homa_incoming.c b/homa_incoming.c index c22a7437..a18974e0 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -51,7 +51,7 @@ int homa_message_in_init(struct homa_rpc *rpc, int length) skb_queue_head_init(&rpc->msgin.packets); INIT_LIST_HEAD(&rpc->msgin.gaps); rpc->msgin.bytes_remaining = length; - err = homa_pool_allocate(rpc); + err = homa_pool_alloc_msg(rpc); if (err != 0) { rpc->msgin.length = -1; return err; 
@@ -71,14 +71,14 @@ int homa_message_in_init(struct homa_rpc *rpc, int length) } /** - * homa_gap_new() - Create a new gap and add it to a list. + * homa_gap_alloc() - Allocate a new gap and add it to a gap list. * @next: Add the new gap just before this list element. * @start: Offset of first byte covered by the gap. * @end: Offset of byte just after the last one covered by the gap. * Return: Pointer to the new gap, or NULL if memory couldn't be allocated * for the gap object. */ -struct homa_gap *homa_gap_new(struct list_head *next, int start, int end) +struct homa_gap *homa_gap_alloc(struct list_head *next, int start, int end) { struct homa_gap *gap; @@ -145,7 +145,7 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) if (start > rpc->msgin.recv_end) { /* Packet creates a new gap. */ - if (!homa_gap_new(&rpc->msgin.gaps, + if (!homa_gap_alloc(&rpc->msgin.gaps, rpc->msgin.recv_end, start)) { pr_err("Homa couldn't allocate gap: insufficient memory\n"); tt_record2("Couldn't allocate gap for id %d (start %d): no memory", @@ -198,7 +198,7 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) } /* Packet is in the middle of the gap; must split the gap. */ - gap2 = homa_gap_new(&gap->links, gap->start, start); + gap2 = homa_gap_alloc(&gap->links, gap->start, start); if (!gap2) { pr_err("Homa couldn't allocate gap for split: insufficient memory\n"); tt_record2("Couldn't allocate gap for split for id %d (start %d): no memory", @@ -474,7 +474,7 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) /* Create a new RPC if one doesn't * already exist. */ - rpc = homa_rpc_new_server(hsk, &saddr, + rpc = homa_rpc_alloc_server(hsk, &saddr, h, &created); if (IS_ERR(rpc)) { pr_warn("homa_pkt_dispatch couldn't create server rpc: error %lu", @@ -484,11 +484,11 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) goto discard; } } else { - rpc = homa_find_server_rpc(hsk, &saddr, + rpc = homa_rpc_find_server(hsk, &saddr, id); } } else { - rpc = homa_find_client_rpc(hsk, id); + rpc = homa_rpc_find_client(hsk, id); } } if (unlikely(!rpc)) { @@ -645,7 +645,7 @@ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) #endif /* See strip.py */ goto discard; } else if (rpc->state != RPC_INCOMING) { - /* Must be server; note that homa_rpc_new_server already + /* Must be server; note that homa_rpc_alloc_server already * initialized msgin and allocated buffers. */ if (unlikely(rpc->msgin.length >= 0)) diff --git a/homa_metrics.c b/homa_metrics.c index 92d8acf0..17db997b 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -347,7 +347,7 @@ char *homa_metrics_print(void) m->ignored_need_acks); M("bpage_reuses %15llu Buffer page could be reused because ref count was zero\n", m->bpage_reuses); - M("buffer_alloc_failures %15llu homa_pool_allocate didn't find enough buffer space for an RPC\n", + M("buffer_alloc_failures %15llu homa_pool_alloc_msg didn't find enough buffer space for an RPC\n", m->buffer_alloc_failures); M("linux_pkt_alloc_bytes %15llu Bytes allocated in new packets by NIC driver due to cache overflows\n", m->linux_pkt_alloc_bytes); diff --git a/homa_metrics.h b/homa_metrics.h index 5fe7a97c..eab9ae58 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -74,11 +74,11 @@ struct homa_metrics { u64 priority_packets[HOMA_MAX_PRIORITIES]; /** - * @skb_allocs: total number of calls to homa_skb_new_tx. + * @skb_allocs: total number of calls to homa_skb_alloc_tx. */ u64 skb_allocs; - /** @skb_alloc_ns: total time spent in homa_skb_new_tx. 
*/ + /** @skb_alloc_ns: total time spent in homa_skb_alloc_tx. */ u64 skb_alloc_ns; /** @@ -576,7 +576,7 @@ struct homa_metrics { /** * @buffer_alloc_failures: total number of times that - * homa_pool_allocate was unable to allocate buffer space for + * homa_pool_alloc_msg was unable to allocate buffer space for * an incoming message. */ u64 buffer_alloc_failures; diff --git a/homa_outgoing.c b/homa_outgoing.c index 0b0c3fdf..e2181107 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -107,9 +107,9 @@ int homa_fill_data_interleaved(struct homa_rpc *rpc, struct sk_buff *skb, } /** - * homa_new_data_packet() - Allocate a new sk_buff and fill it with a Homa - * data packet. The resulting packet will be a GSO packet that will eventually - * be segmented by the NIC. + * homa_tx_data_pkt_alloc() - Allocate a new sk_buff and fill it with an + * outgoing Homa data packet. The resulting packet will be a GSO packet + * that will eventually be segmented by the NIC. * @rpc: RPC that packet will belong to (msgout must have been * initialized). Must be locked by caller. * @iter: Describes location(s) of (remaining) message data in user @@ -124,7 +124,7 @@ int homa_fill_data_interleaved(struct homa_rpc *rpc, struct sk_buff *skb, * a single segment of the GSO packet. * Return: A pointer to the new packet, or a negative errno. */ -struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, +struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc, struct iov_iter *iter, int offset, int length, int max_seg_data) __must_hold(rpc_bucket_lock) @@ -140,9 +140,9 @@ struct sk_buff *homa_new_data_packet(struct homa_rpc *rpc, /* Initialize the overall skb. */ #ifndef __STRIP__ /* See strip.py */ - skb = homa_skb_new_tx(sizeof(struct homa_data_hdr)); + skb = homa_skb_alloc_tx(sizeof(struct homa_data_hdr)); #else /* See strip.py */ - skb = homa_skb_new_tx(sizeof(struct homa_data_hdr) + length + + skb = homa_skb_alloc_tx(sizeof(struct homa_data_hdr) + length + (segs - 1) * sizeof(struct homa_seg_hdr)); #endif /* See strip.py */ if (!skb) @@ -354,7 +354,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) #endif /* See strip.py */ if (skb_data_bytes > bytes_left) skb_data_bytes = bytes_left; - skb = homa_new_data_packet(rpc, iter, offset, skb_data_bytes, + skb = homa_tx_data_pkt_alloc(rpc, iter, offset, skb_data_bytes, max_seg_data); if (IS_ERR(skb)) { err = PTR_ERR(skb); @@ -461,7 +461,7 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, int result; dst = homa_get_dst(peer, hsk); - skb = homa_skb_new_tx(HOMA_MAX_HEADER); + skb = homa_skb_alloc_tx(HOMA_MAX_HEADER); if (unlikely(!skb)) return -ENOBUFS; dst_hold(dst); @@ -813,10 +813,10 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end) /* This segment must be retransmitted. 
*/ #ifndef __STRIP__ /* See strip.py */ - new_skb = homa_skb_new_tx(sizeof(struct homa_data_hdr) + new_skb = homa_skb_alloc_tx(sizeof(struct homa_data_hdr) - sizeof(struct homa_seg_hdr)); #else /* See strip.py */ - new_skb = homa_skb_new_tx(sizeof(struct homa_data_hdr) + new_skb = homa_skb_alloc_tx(sizeof(struct homa_data_hdr) + seg_length); #endif /* See strip.py */ if (unlikely(!new_skb)) { diff --git a/homa_pacer.c b/homa_pacer.c index 9462fc50..e2ae5e2d 100644 --- a/homa_pacer.c +++ b/homa_pacer.c @@ -47,13 +47,13 @@ static struct ctl_table pacer_ctl_table[] = { #endif /* See strip.py */ /** - * homa_pacer_new() - Allocate and initialize a new pacer object, which + * homa_pacer_alloc() - Allocate and initialize a new pacer object, which * will hold pacer-related information for @homa. * @homa: Homa transport that the pacer will be associated with. * @net: Network namespace that @homa is associated with. * Return: A pointer to the new struct pacer, or a negative errno. */ -struct homa_pacer *homa_pacer_new(struct homa *homa, struct net *net) +struct homa_pacer *homa_pacer_alloc(struct homa *homa, struct net *net) { struct homa_pacer *pacer; int err; @@ -94,17 +94,17 @@ struct homa_pacer *homa_pacer_new(struct homa *homa, struct net *net) return pacer; error: - homa_pacer_destroy(pacer); + homa_pacer_free(pacer); return ERR_PTR(err); } /** - * homa_pacer_destroy() - Cleanup and destroy the pacer object for a Homa + * homa_pacer_free() - Cleanup and free the pacer object for a Homa * transport. * @pacer: Object to destroy; caller must not reference the object * again once this function returns. */ -void homa_pacer_destroy(struct homa_pacer *pacer) +void homa_pacer_free(struct homa_pacer *pacer) { pacer->exit = true; #ifndef __STRIP__ /* See strip.py */ diff --git a/homa_pacer.h b/homa_pacer.h index 68f803bd..09e17e78 100644 --- a/homa_pacer.h +++ b/homa_pacer.h @@ -145,16 +145,16 @@ struct homa_pacer { atomic64_t link_idle_time ____cacheline_aligned_in_smp; }; +struct homa_pacer *homa_pacer_alloc(struct homa *homa, struct net *net); int homa_pacer_check_nic_q(struct homa_pacer *pacer, struct sk_buff *skb, bool force); int homa_pacer_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -void homa_pacer_destroy(struct homa_pacer *pacer); +void homa_pacer_free(struct homa_pacer *pacer); void homa_pacer_unmanage_rpc(struct homa_rpc *rpc); void homa_pacer_log_throttled(struct homa_pacer *pacer); int homa_pacer_main(void *transport); void homa_pacer_manage_rpc(struct homa_rpc *rpc); -struct homa_pacer *homa_pacer_new(struct homa *homa, struct net *net); void homa_pacer_throttle_lock_slow(struct homa_pacer *pacer); void homa_pacer_update_sysctl_deps(struct homa_pacer *pacer); void homa_pacer_xmit(struct homa_pacer *pacer); diff --git a/homa_plumbing.c b/homa_plumbing.c index 42844b25..80de5fc8 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -736,14 +736,14 @@ int homa_ioc_abort(struct sock *sk, int *karg) return 0; } - rpc = homa_find_client_rpc(hsk, args.id); + rpc = homa_rpc_find_client(hsk, args.id); if (!rpc) return -EINVAL; if (args.error == 0) homa_rpc_end(rpc); else homa_rpc_abort(rpc, -args.error); - homa_rpc_unlock(rpc); /* Locked by homa_find_client_rpc. */ + homa_rpc_unlock(rpc); /* Locked by homa_rpc_find_client. */ return ret; } #endif /* See strip.py */ @@ -988,7 +988,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) if (!args.id) { /* This is a request message. 
*/ - rpc = homa_rpc_new_client(hsk, addr); + rpc = homa_rpc_alloc_client(hsk, addr); if (IS_ERR(rpc)) { result = PTR_ERR(rpc); rpc = NULL; @@ -1007,12 +1007,12 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) if (result) goto error; args.id = rpc->id; - homa_rpc_unlock(rpc); /* Locked by homa_rpc_new_client. */ + homa_rpc_unlock(rpc); /* Locked by homa_rpc_alloc_client. */ rpc = NULL; if (unlikely(copy_to_user((void __user *)msg->msg_control, &args, sizeof(args)))) { - rpc = homa_find_client_rpc(hsk, args.id); + rpc = homa_rpc_find_client(hsk, args.id); result = -EFAULT; goto error; } @@ -1034,7 +1034,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) } canonical_dest = canonical_ipv6_addr(addr); - rpc = homa_find_server_rpc(hsk, &canonical_dest, args.id); + rpc = homa_rpc_find_server(hsk, &canonical_dest, args.id); if (!rpc) { /* Return without an error if the RPC doesn't exist; * this could be totally valid (e.g. client is @@ -1051,7 +1051,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) if (rpc->state != RPC_IN_SERVICE) { tt_record2("homa_sendmsg error: RPC id %d in bad state %d", rpc->id, rpc->state); - /* Locked by homa_find_server_rpc. */ + /* Locked by homa_rpc_find_server. */ homa_rpc_unlock(rpc); rpc = NULL; result = -EINVAL; @@ -1062,7 +1062,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) result = homa_message_out_fill(rpc, &msg->msg_iter, 1); if (result && rpc->state != RPC_DEAD) goto error; - homa_rpc_unlock(rpc); /* Locked by homa_find_server_rpc. */ + homa_rpc_unlock(rpc); /* Locked by homa_rpc_find_server. */ #ifndef __STRIP__ /* See strip.py */ finish = sched_clock(); #endif /* See strip.py */ @@ -1074,7 +1074,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) error: if (rpc) { homa_rpc_end(rpc); - homa_rpc_unlock(rpc); /* Locked by homa_find_server_rpc. */ + homa_rpc_unlock(rpc); /* Locked by homa_rpc_find_server. */ } tt_record2("homa_sendmsg returning error %d for id %d", result, args.id); @@ -1139,7 +1139,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, nonblocking = flags & MSG_DONTWAIT; if (control.id != 0) { - rpc = homa_find_client_rpc(hsk, control.id); /* Locks RPC. */ + rpc = homa_rpc_find_client(hsk, control.id); /* Locks RPC. */ if (!rpc) { result = -EINVAL; goto done; diff --git a/homa_pool.c b/homa_pool.c index ee6755af..6307ea45 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -43,12 +43,12 @@ static void set_bpages_needed(struct homa_pool *pool) } /** - * homa_pool_new() - Allocate and initialize a new homa_pool (it will have + * homa_pool_alloc() - Allocate and initialize a new homa_pool (it will have * no region associated with it until homa_pool_set_region is invoked). * @hsk: Socket the pool will be associated with. * Return: A pointer to the new pool or a negative errno. */ -struct homa_pool *homa_pool_new(struct homa_sock *hsk) +struct homa_pool *homa_pool_alloc(struct homa_sock *hsk) { struct homa_pool *pool; @@ -122,11 +122,11 @@ int homa_pool_set_region(struct homa_pool *pool, void __user *region, } /** - * homa_pool_destroy() - Destructor for homa_pool. After this method + * homa_pool_free() - Destructor for homa_pool. After this method * returns, the object should not be used (it will be freed here). * @pool: Pool to destroy. 
*/ -void homa_pool_destroy(struct homa_pool *pool) +void homa_pool_free(struct homa_pool *pool) { if (pool->region) { kfree(pool->descriptors); @@ -266,7 +266,7 @@ int homa_pool_get_pages(struct homa_pool *pool, int num_pages, u32 *pages, } /** - * homa_pool_allocate() - Allocate buffer space for an RPC. + * homa_pool_alloc_msg() - Allocate buffer space for an incoming message. * @rpc: RPC that needs space allocated for its incoming message (space must * not already have been allocated). The fields @msgin->num_buffers * and @msgin->buffers are filled in. Must be locked by caller. @@ -275,7 +275,7 @@ int homa_pool_get_pages(struct homa_pool *pool, int num_pages, u32 *pages, * occurred, such as no buffer pool present, then a negative errno is * returned. */ -int homa_pool_allocate(struct homa_rpc *rpc) +int homa_pool_alloc_msg(struct homa_rpc *rpc) __must_hold(&rpc->bucket->lock) { struct homa_pool *pool = rpc->hsk->buffer_pool; @@ -507,7 +507,7 @@ void homa_pool_check_waiting(struct homa_pool *pool) rpc->id, rpc->msgin.length, atomic_read(&pool->free_bpages), pool->bpages_needed); - homa_pool_allocate(rpc); + homa_pool_alloc_msg(rpc); #ifndef __STRIP__ /* See strip.py */ if (rpc->msgin.num_bpages > 0) { /* Allocation succeeded; "wake up" the RPC. */ diff --git a/homa_pool.h b/homa_pool.h index 4463fa58..9f354e3a 100644 --- a/homa_pool.h +++ b/homa_pool.h @@ -138,16 +138,16 @@ struct homa_pool { }; bool homa_bpage_available(struct homa_bpage *bpage, u64 now); -int homa_pool_allocate(struct homa_rpc *rpc); +struct homa_pool *homa_pool_alloc(struct homa_sock *hsk); +int homa_pool_alloc_msg(struct homa_rpc *rpc); void homa_pool_check_waiting(struct homa_pool *pool); -void homa_pool_destroy(struct homa_pool *pool); +void homa_pool_free(struct homa_pool *pool); void __user *homa_pool_get_buffer(struct homa_rpc *rpc, int offset, int *available); int homa_pool_get_pages(struct homa_pool *pool, int num_pages, u32 *pages, int leave_locked); void homa_pool_get_rcvbuf(struct homa_pool *pool, struct homa_rcvbuf_args *args); -struct homa_pool *homa_pool_new(struct homa_sock *hsk); int homa_pool_release_buffers(struct homa_pool *pool, int num_buffers, u32 *buffers); int homa_pool_set_region(struct homa_pool *pool, void __user *region, diff --git a/homa_rpc.c b/homa_rpc.c index 9882dcc5..28c09117 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -17,9 +17,9 @@ #endif /* See strip.py */ /** - * homa_rpc_new_client() - Allocate and construct a client RPC (one that is used - * to issue an outgoing request). Doesn't send any packets. Invoked with no - * locks held. + * homa_rpc_alloc_client() - Allocate and initialize a client RPC (one that + * is used to issue an outgoing request). Doesn't send any packets. Invoked + * with no locks held. * @hsk: Socket to which the RPC belongs. * @dest: Address of host (ip and port) to which the RPC will be sent. * @@ -27,7 +27,7 @@ * errno if an error occurred. The RPC will be locked; the * caller must eventually unlock it. */ -struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk, +struct homa_rpc *homa_rpc_alloc_client(struct homa_sock *hsk, const union sockaddr_in_union *dest) __acquires(rpc_bucket_lock) { @@ -94,7 +94,7 @@ struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk, } /** - * homa_rpc_new_server() - Allocate and construct a server RPC (one that is + * homa_rpc_alloc_server() - Allocate and initialize a server RPC (one that is * used to manage an incoming request). 
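Read together, the homa_pool.h declarations above imply a simple lifecycle: homa_pool_alloc() creates an empty pool, homa_pool_set_region() attaches the user-space buffer region, homa_pool_alloc_msg() then carves out space for each incoming message, and homa_pool_free() tears everything down. A hedged sketch of that order (the wrapper name is illustrative, and the set_region arguments are elided rather than guessed):

static int pool_lifecycle_sketch(struct homa_sock *hsk)
{
        struct homa_pool *pool = homa_pool_alloc(hsk);

        if (IS_ERR(pool))
                return PTR_ERR(pool);
        /* Attach a region with homa_pool_set_region(); until that
         * succeeds, homa_pool_alloc_msg() has no bpages to hand out
         * (the no_buffer_region unit test below relies on this).
         */
        homa_pool_free(pool);   /* safe even with no region attached */
        return 0;
}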
If appropriate, the RPC will also * be handed off (we do it here, while we have the socket locked, to avoid * acquiring the socket lock a second time later for the handoff). @@ -109,7 +109,7 @@ struct homa_rpc *homa_rpc_new_client(struct homa_sock *hsk, * if an error occurred. If there is already an RPC corresponding * to h, then it is returned instead of creating a new RPC. */ -struct homa_rpc *homa_rpc_new_server(struct homa_sock *hsk, +struct homa_rpc *homa_rpc_alloc_server(struct homa_sock *hsk, const struct in6_addr *source, struct homa_data_hdr *h, int *created) __acquires(rpc_bucket_lock) @@ -228,11 +228,11 @@ void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, if (!hsk2) return; } - rpc = homa_find_server_rpc(hsk2, saddr, id); + rpc = homa_rpc_find_server(hsk2, saddr, id); if (rpc) { tt_record1("homa_rpc_acked freeing id %d", rpc->id); homa_rpc_end(rpc); - homa_rpc_unlock(rpc); /* Locked by homa_find_server_rpc. */ + homa_rpc_unlock(rpc); /* Locked by homa_rpc_find_server. */ } if (hsk->port != server_port) sock_put(&hsk2->sock); @@ -490,7 +490,7 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) } /** - * homa_find_client_rpc() - Locate client-side information about the RPC that + * homa_rpc_find_client() - Locate client-side information about the RPC that * a packet belongs to, if there is any. Thread-safe without socket lock. * @hsk: Socket via which packet was received. * @id: Unique identifier for the RPC. @@ -499,7 +499,7 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) * The RPC will be locked; the caller must eventually unlock it * by invoking homa_rpc_unlock. */ -struct homa_rpc *homa_find_client_rpc(struct homa_sock *hsk, u64 id) +struct homa_rpc *homa_rpc_find_client(struct homa_sock *hsk, u64 id) __cond_acquires(rpc_bucket_lock) { struct homa_rpc_bucket *bucket = homa_client_rpc_bucket(hsk, id); @@ -515,7 +515,7 @@ struct homa_rpc *homa_find_client_rpc(struct homa_sock *hsk, u64 id) } /** - * homa_find_server_rpc() - Locate server-side information about the RPC that + * homa_rpc_find_server() - Locate server-side information about the RPC that * a packet belongs to, if there is any. Thread-safe without socket lock. * @hsk: Socket via which packet was received. * @saddr: Address from which the packet was sent. @@ -525,7 +525,7 @@ struct homa_rpc *homa_find_client_rpc(struct homa_sock *hsk, u64 id) * if none. The RPC will be locked; the caller must eventually * unlock it by invoking homa_rpc_unlock. 
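Both lookup functions share the contract described here: on success the RPC comes back locked, so every call site pairs the find with an unlock, as the homa_plumbing.c hunks above show. A condensed sketch of the pattern (the wrapper name is illustrative; the calls are the patch's own):

static int poke_client_rpc_sketch(struct homa_sock *hsk, u64 id)
{
        struct homa_rpc *rpc = homa_rpc_find_client(hsk, id);

        if (!rpc)
                return -EINVAL;         /* no such RPC */
        /* ... inspect or modify rpc here ... */
        homa_rpc_unlock(rpc);           /* locked by homa_rpc_find_client */
        return 0;
}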
*/ -struct homa_rpc *homa_find_server_rpc(struct homa_sock *hsk, +struct homa_rpc *homa_rpc_find_server(struct homa_sock *hsk, const struct in6_addr *saddr, u64 id) __cond_acquires(rpc_bucket_lock) { diff --git a/homa_rpc.h b/homa_rpc.h index d4827216..91fc5a63 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -412,12 +412,18 @@ struct homa_rpc { u64 start_ns; }; -void homa_check_rpc(struct homa_rpc *rpc); struct homa_rpc - *homa_find_client_rpc(struct homa_sock *hsk, u64 id); + *homa_rpc_alloc_client(struct homa_sock *hsk, + const union sockaddr_in_union *dest); struct homa_rpc - *homa_find_server_rpc(struct homa_sock *hsk, - const struct in6_addr *saddr, u64 id); + *homa_rpc_alloc_server(struct homa_sock *hsk, + const struct in6_addr *source, + struct homa_data_hdr *h, int *created); +struct homa_rpc + *homa_rpc_find_client(struct homa_sock *hsk, u64 id); +struct homa_rpc + *homa_rpc_find_server(struct homa_sock *hsk, + const struct in6_addr *saddr, u64 id); void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, struct homa_ack *ack); void homa_rpc_end(struct homa_rpc *rpc); @@ -427,13 +433,6 @@ void homa_rpc_log_active(struct homa *homa, uint64_t id); void homa_rpc_log_active_tt(struct homa *homa, int freeze_count); void homa_rpc_log_tt(struct homa_rpc *rpc); #endif /* See strip.py */ -struct homa_rpc - *homa_rpc_new_client(struct homa_sock *hsk, - const union sockaddr_in_union *dest); -struct homa_rpc - *homa_rpc_new_server(struct homa_sock *hsk, - const struct in6_addr *source, - struct homa_data_hdr *h, int *created); int homa_rpc_reap(struct homa_sock *hsk, bool reap_all); #ifndef __UPSTREAM__ /* See strip.py */ int homa_validate_incoming(struct homa *homa, int verbose, diff --git a/homa_skb.c b/homa_skb.c index 8a4b575c..56e99fec 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -105,7 +105,7 @@ void homa_skb_cleanup(struct homa *homa) } /** - * homa_skb_new_tx() - Allocate a new sk_buff for outgoing data. + * homa_skb_alloc_tx() - Allocate a new sk_buff for outgoing data. * @length: Number of bytes of data that the caller would like to * have available in the linear part of the sk_buff for * the Homa header and additional data beyond that. This @@ -116,7 +116,7 @@ void homa_skb_cleanup(struct homa *homa) * skb_put will be for the transport (Homa) header. The * homa_skb_info is not initialized. */ -struct sk_buff *homa_skb_new_tx(int length) +struct sk_buff *homa_skb_alloc_tx(int length) { u64 start = sched_clock(); struct sk_buff *skb; @@ -434,7 +434,7 @@ int homa_skb_append_from_skb(struct homa *homa, struct sk_buff *dst_skb, * homa_skb_free_tx() - Release the storage for an sk_buff. * @homa: Overall data about the Homa protocol implementation. * @skb: sk_buff to free; should have been allocated by - * homa_skb_new_tx. + * homa_skb_alloc_tx. */ void homa_skb_free_tx(struct homa *homa, struct sk_buff *skb) { @@ -445,7 +445,7 @@ void homa_skb_free_tx(struct homa *homa, struct sk_buff *skb) * homa_skb_free_many_tx() - Release the storage for multiple sk_buffs. * @homa: Overall data about the Homa protocol implementation. * @skbs: Pointer to first entry in array of sk_buffs to free. All of - * these should have been allocated by homa_skb_new_tx. + * these should have been allocated by homa_skb_alloc_tx. * @count: Total number of sk_buffs to free. 
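Unlike the ERR_PTR-returning constructors elsewhere in this patch, homa_skb_alloc_tx() signals failure with plain NULL, as the __homa_xmit_control() hunk above checks. A sketch of the matching allocate/release pair (the wrapper name is illustrative; the calls and HOMA_MAX_HEADER sizing follow the hunks in this patch):

static int skb_roundtrip_sketch(struct homa *homa)
{
        struct sk_buff *skb = homa_skb_alloc_tx(HOMA_MAX_HEADER);

        if (unlikely(!skb))
                return -ENOBUFS;
        /* ... skb_put() the Homa header, hand off for transmission ... */
        homa_skb_free_tx(homa, skb);    /* single-skb form of free_many_tx */
        return 0;
}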
*/ void homa_skb_free_many_tx(struct homa *homa, struct sk_buff **skbs, int count) diff --git a/homa_skb.h b/homa_skb.h index ea7e0879..45428ef5 100644 --- a/homa_skb.h +++ b/homa_skb.h @@ -95,6 +95,7 @@ struct homa_skb_core { }; DECLARE_PER_CPU(struct homa_skb_core, homa_skb_core); +struct sk_buff *homa_skb_alloc_tx(int length); int homa_skb_append_from_iter(struct homa *homa, struct sk_buff *skb, struct iov_iter *iter, int length); @@ -115,7 +116,6 @@ void homa_skb_free_many_tx(struct homa *homa, struct sk_buff **skbs, void homa_skb_get(struct sk_buff *skb, void *dest, int offset, int length); int homa_skb_init(struct homa *homa); -struct sk_buff *homa_skb_new_tx(int length); bool homa_skb_page_alloc(struct homa *homa, struct homa_skb_core *core); void homa_skb_release_pages(struct homa *homa); diff --git a/homa_sock.c b/homa_sock.c index b4eb9d6b..0e335290 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -197,7 +197,7 @@ int homa_sock_init(struct homa_sock *hsk, struct homa *homa) bucket->id = i + 1000000; INIT_HLIST_HEAD(&bucket->rpcs); } - hsk->buffer_pool = homa_pool_new(hsk); + hsk->buffer_pool = homa_pool_alloc(hsk); if (IS_ERR(hsk->buffer_pool)) { result = PTR_ERR(hsk->buffer_pool); hsk->buffer_pool = NULL; @@ -308,7 +308,7 @@ void homa_sock_shutdown(struct homa_sock *hsk) } if (hsk->buffer_pool) { - homa_pool_destroy(hsk->buffer_pool); + homa_pool_free(hsk->buffer_pool); hsk->buffer_pool = NULL; } tt_record1("Finished shutdown for socket %d", hsk->port); diff --git a/homa_stub.h b/homa_stub.h index 206d2656..8dee617c 100644 --- a/homa_stub.h +++ b/homa_stub.h @@ -72,7 +72,7 @@ static inline void homa_skb_get(struct sk_buff *skb, void *dest, int offset, memcpy(dest, skb_transport_header(skb) + offset, length); } -static inline struct sk_buff *homa_skb_new_tx(int length) +static inline struct sk_buff *homa_skb_alloc_tx(int length) { struct sk_buff *skb; diff --git a/homa_timer.c b/homa_timer.c index 067ae82c..a102372d 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -17,14 +17,14 @@ #endif /* See strip.py */ /** - * homa_check_rpc() - Invoked for each RPC during each timer pass; does + * homa_timer_check_rpc() - Invoked for each RPC during each timer pass; does * most of the work of checking for time-related actions such as sending * resends, aborting RPCs for which there is no response, and sending * requests for acks. It is separate from homa_timer because homa_timer * got too long and deeply indented. * @rpc: RPC to check; must be locked by the caller. 
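The division of labor around homa_timer_check_rpc() can be seen in the homa_timer() hunk below: each timer pass bumps rpc->silent_ticks and then runs the check with the RPC lock held, while certain arriving packets (though not, e.g., CUTOFFS, per the reset_counters unit test later in this patch) reset silent_ticks to zero. One per-RPC step of the pass, assuming the lock is already held as the kerneldoc above requires:

rpc->silent_ticks++;            /* cleared when relevant packets arrive */
homa_timer_check_rpc(rpc);      /* may send resends or ack requests,
                                 * or abort an unresponsive RPC */
homa_rpc_unlock(rpc);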
*/ -void homa_check_rpc(struct homa_rpc *rpc) +void homa_timer_check_rpc(struct homa_rpc *rpc) __must_hold(&rpc->bucket->lock) { struct homa *homa = rpc->hsk->homa; @@ -265,7 +265,7 @@ void homa_timer(struct homa *homa) #endif /* See strip.py */ } rpc->silent_ticks++; - homa_check_rpc(rpc); + homa_timer_check_rpc(rpc); homa_rpc_unlock(rpc); rpc_count++; if (rpc_count >= 10) { diff --git a/homa_utils.c b/homa_utils.c index 8b665538..13fcd385 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -39,14 +39,14 @@ int homa_init(struct homa *homa, struct net *net) memset(homa, 0, sizeof(*homa)); atomic64_set(&homa->next_outgoing_id, 2); #ifndef __STRIP__ /* See strip.py */ - homa->grant = homa_grant_new(net); + homa->grant = homa_grant_alloc(net); if (IS_ERR(homa->grant)) { err = PTR_ERR(homa->grant); homa->grant = NULL; return err; } #endif /* See strip.py */ - homa->pacer = homa_pacer_new(homa, net); + homa->pacer = homa_pacer_alloc(homa, net); if (IS_ERR(homa->pacer)) { err = PTR_ERR(homa->pacer); homa->pacer = NULL; @@ -145,12 +145,12 @@ void homa_destroy(struct homa *homa) } #ifndef __STRIP__ /* See strip.py */ if (homa->grant) { - homa_grant_destroy(homa->grant); + homa_grant_free(homa->grant); homa->grant = NULL; } #endif /* See strip.py */ if (homa->pacer) { - homa_pacer_destroy(homa->pacer); + homa_pacer_free(homa->pacer); homa->pacer = NULL; } if (homa->peers) { diff --git a/homa_wire.h b/homa_wire.h index 2cd64d5c..43e6d9c6 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -39,7 +39,7 @@ enum homa_packet_type { /* If you add a new type here, you must also do the following: * 1. Change MAX_OP so it is the highest valid opcode * 2. Add support for the new opcode in homa_print_packet, - * homa_print_packet_short, homa_symbol_for_type, and mock_skb_new. + * homa_print_packet_short, homa_symbol_for_type, and mock_skb_alloc. * 3. Add the header length to header_lengths in homa_plumbing.c. */ }; diff --git a/test/mock.c b/test/mock.c index 52d0e5ce..d6c73c23 100644 --- a/test/mock.c +++ b/test/mock.c @@ -1338,11 +1338,11 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, memcpy(&h, skb_transport_header(head_skb), sizeof(h)); offset = ntohl(h.seg.offset); length = homa_data_len(head_skb); - skb1 = mock_skb_new(&ipv6_hdr(head_skb)->saddr, &h.common, length/2, + skb1 = mock_skb_alloc(&ipv6_hdr(head_skb)->saddr, &h.common, length/2, offset); offset += length/2; h.seg.offset = htonl(offset); - skb2 = mock_skb_new(&ipv6_hdr(head_skb)->saddr, &h.common, length/2, + skb2 = mock_skb_alloc(&ipv6_hdr(head_skb)->saddr, &h.common, length/2, offset); skb2->next = NULL; skb1->next = skb2; @@ -1774,7 +1774,7 @@ void mock_set_ipv6(struct homa_sock *hsk) } /** - * mock_skb_new() - Allocate and return a packet buffer. The buffer is + * mock_skb_alloc() - Allocate and return a packet buffer. The buffer is * initialized as if it just arrived from the network. * @saddr: IPv6 address to use as the sender of the packet, in * network byte order. @@ -1790,7 +1790,7 @@ void mock_set_ipv6(struct homa_sock *hsk) * Return: A packet buffer containing the information described above. * The caller owns this buffer and is responsible for freeing it. 
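Most of the unit-test churn below is mechanical: every mock_skb_new() call becomes mock_skb_alloc(), with arguments unchanged. The typical shape, taken from the tests that follow (fixture fields such as self->data and self->client_ip are the tests' own):

self->data.seg.offset = htonl(1400);
homa_add_packet(crpc, mock_skb_alloc(self->client_ip,
                &self->data.common, 1400, 1400));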
*/ -struct sk_buff *mock_skb_new(struct in6_addr *saddr, struct homa_common_hdr *h, +struct sk_buff *mock_skb_alloc(struct in6_addr *saddr, struct homa_common_hdr *h, int extra_bytes, int first_value) { int header_size, ip_size, data_size, shinfo_size; diff --git a/test/mock.h b/test/mock.h index 5cb853a3..1fea3a60 100644 --- a/test/mock.h +++ b/test/mock.h @@ -194,10 +194,10 @@ void mock_set_homa(struct homa *homa); void mock_set_ipv6(struct homa_sock *hsk); void mock_spin_lock(spinlock_t *lock); void mock_spin_unlock(spinlock_t *lock); -int mock_skb_count(void); struct sk_buff * - mock_skb_new(struct in6_addr *saddr, struct homa_common_hdr *h, + mock_skb_alloc(struct in6_addr *saddr, struct homa_common_hdr *h, int extra_bytes, int first_value); +int mock_skb_count(void); void mock_sock_destroy(struct homa_sock *hsk, struct homa_socktab *socktab); void mock_sock_hold(struct sock *sk); diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index f5ca2751..5921cba0 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -133,48 +133,48 @@ static struct homa_rpc *test_rpc_init(FIXTURE_DATA(homa_grant) *self, return rpc; } -TEST_F(homa_grant, homa_grant_new__success) +TEST_F(homa_grant, homa_grant_alloc__success) { struct homa_grant *grant; - grant = homa_grant_new(&mock_net); + grant = homa_grant_alloc(&mock_net); EXPECT_EQ(50, grant->fifo_fraction); - homa_grant_destroy(grant); + homa_grant_free(grant); } -TEST_F(homa_grant, homa_grant_new__cant_allocate_memory) +TEST_F(homa_grant, homa_grant_alloc__cant_allocate_memory) { struct homa_grant *grant; mock_kmalloc_errors = 1; - grant = homa_grant_new(&mock_net); + grant = homa_grant_alloc(&mock_net); EXPECT_TRUE(IS_ERR(grant)); EXPECT_EQ(ENOMEM, -PTR_ERR(grant)); } -TEST_F(homa_grant, homa_grant_new__cant_register_sysctls) +TEST_F(homa_grant, homa_grant_alloc__cant_register_sysctls) { struct homa_grant *grant; mock_register_sysctl_errors = 1; - grant = homa_grant_new(&mock_net); + grant = homa_grant_alloc(&mock_net); EXPECT_TRUE(IS_ERR(grant)); EXPECT_EQ(ENOMEM, -PTR_ERR(grant)); } -TEST_F(homa_grant, homa_grant_destroy__basics) +TEST_F(homa_grant, homa_grant_free__basics) { struct homa_grant *grant; - grant = homa_grant_new(&mock_net); - homa_grant_destroy(grant); + grant = homa_grant_alloc(&mock_net); + homa_grant_free(grant); EXPECT_STREQ("unregister_net_sysctl_table", unit_log_get()); } -TEST_F(homa_grant, homa_grant_destroy__sysctls_not_registered) +TEST_F(homa_grant, homa_grant_free__sysctls_not_registered) { struct homa_grant *grant; - grant = homa_grant_new(&mock_net); + grant = homa_grant_alloc(&mock_net); grant->sysctl_header = NULL; - homa_grant_destroy(grant); + homa_grant_free(grant); EXPECT_STREQ("", unit_log_get()); } diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index f4092c12..7d439ec4 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -147,7 +147,7 @@ TEST_F(homa_incoming, homa_message_in_init__message_too_long) int created; self->data.message_length = htonl(HOMA_MAX_MESSAGE_LENGTH+1); - srpc = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + srpc = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, &created); ASSERT_TRUE(IS_ERR(srpc)); EXPECT_EQ(EINVAL, -PTR_ERR(srpc)); @@ -158,8 +158,8 @@ TEST_F(homa_incoming, homa_message_in_init__no_buffer_region) UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); - homa_pool_destroy(self->hsk.buffer_pool); - self->hsk.buffer_pool = homa_pool_new(&self->hsk); + 
homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); EXPECT_EQ(ENOMEM, -homa_message_in_init(crpc, HOMA_BPAGE_SIZE*2, 0)); EXPECT_EQ(0, crpc->msgin.num_bpages); EXPECT_EQ(-1, crpc->msgin.length); @@ -205,9 +205,9 @@ TEST_F(homa_incoming, homa_gap_retry) self->client_ip, self->server_ip, self->client_port, self->server_id, 10000, 100); - homa_gap_new(&srpc->msgin.gaps, 1000, 2000); - homa_gap_new(&srpc->msgin.gaps, 4000, 6000); - homa_gap_new(&srpc->msgin.gaps, 7000, 8000); + homa_gap_alloc(&srpc->msgin.gaps, 1000, 2000); + homa_gap_alloc(&srpc->msgin.gaps, 4000, 6000); + homa_gap_alloc(&srpc->msgin.gaps, 7000, 8000); #ifndef __STRIP__ /* See strip.py */ self->homa.num_priorities = 8; #endif /* See strip.py */ @@ -237,25 +237,25 @@ TEST_F(homa_incoming, homa_add_packet__basics) unit_log_clear(); mock_ns = 5000; self->data.seg.offset = htonl(1400); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400)); self->data.seg.offset = htonl(4200); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 800, 4200)); EXPECT_STREQ("start 0, end 1400, time 5000; start 2800, end 4200, time 5000", unit_print_gaps(crpc)); unit_log_clear(); self->data.seg.offset = 0; - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0)); EXPECT_STREQ("start 2800, end 4200, time 5000", unit_print_gaps(crpc)); EXPECT_EQ(6400, crpc->msgin.bytes_remaining); unit_log_clear(); self->data.seg.offset = htonl(2800); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 2800)); EXPECT_STREQ("", unit_print_gaps(crpc)); unit_log_clear(); @@ -273,7 +273,7 @@ TEST_F(homa_incoming, homa_add_packet__packet_overlaps_message_end) homa_message_in_init(crpc, 10000, 0); unit_log_clear(); self->data.seg.offset = htonl(9000); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400)); EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); } @@ -285,15 +285,15 @@ TEST_F(homa_incoming, homa_add_packet__sequential_packets) homa_message_in_init(crpc, 10000, 0); unit_log_clear(); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0)); self->data.seg.offset = htonl(1400); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400)); self->data.seg.offset = htonl(2800); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 2800)); EXPECT_STREQ("", unit_print_gaps(crpc)); EXPECT_EQ(4200, crpc->msgin.recv_end); @@ -307,11 +307,11 @@ TEST_F(homa_incoming, homa_add_packet__new_gap) homa_message_in_init(crpc, 10000, 0); unit_log_clear(); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0)); self->data.seg.offset = htonl(4200); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 4200)); EXPECT_STREQ("start 1400, end 4200", unit_print_gaps(crpc)); EXPECT_EQ(5600, crpc->msgin.recv_end); @@ -325,12 +325,12 
@@ TEST_F(homa_incoming, homa_add_packet__no_memory_for_new_gap) homa_message_in_init(crpc, 10000, 0); unit_log_clear(); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0)); self->data.seg.offset = htonl(4200); mock_kmalloc_errors = 1; - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 4200)); EXPECT_STREQ("", unit_print_gaps(crpc)); EXPECT_EQ(1400, crpc->msgin.recv_end); @@ -345,16 +345,16 @@ TEST_F(homa_incoming, homa_add_packet__packet_before_gap) homa_message_in_init(crpc, 10000, 0); unit_log_clear(); self->data.seg.offset = htonl(0); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0)); self->data.seg.offset = htonl(4200); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 4200)); EXPECT_STREQ("start 1400, end 4200", unit_print_gaps(crpc)); self->data.seg.offset = htonl(0); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0)); EXPECT_EQ(2, skb_queue_len(&crpc->msgin.packets)); } @@ -367,16 +367,16 @@ TEST_F(homa_incoming, homa_add_packet__packet_straddles_start_of_gap) homa_message_in_init(crpc, 10000, 0); unit_log_clear(); self->data.seg.offset = htonl(0); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0)); self->data.seg.offset = htonl(4200); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 4200)); EXPECT_STREQ("start 1400, end 4200", unit_print_gaps(crpc)); self->data.seg.offset = htonl(1000); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1000)); EXPECT_EQ(2, skb_queue_len(&crpc->msgin.packets)); } @@ -389,16 +389,16 @@ TEST_F(homa_incoming, homa_add_packet__packet_extends_past_gap) homa_message_in_init(crpc, 10000, 0); unit_log_clear(); self->data.seg.offset = htonl(0); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0)); self->data.seg.offset = htonl(2000); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 2000)); EXPECT_STREQ("start 1400, end 2000", unit_print_gaps(crpc)); self->data.seg.offset = htonl(1400); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400)); EXPECT_EQ(2, skb_queue_len(&crpc->msgin.packets)); } @@ -411,16 +411,16 @@ TEST_F(homa_incoming, homa_add_packet__packet_at_start_of_gap) homa_message_in_init(crpc, 10000, 0); unit_log_clear(); self->data.seg.offset = htonl(0); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0)); self->data.seg.offset = htonl(4200); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 4200)); EXPECT_STREQ("start 1400, end 4200", unit_print_gaps(crpc)); self->data.seg.offset = htonl(1400); - homa_add_packet(crpc, 
mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400)); EXPECT_EQ(3, skb_queue_len(&crpc->msgin.packets)); unit_log_clear(); @@ -435,16 +435,16 @@ TEST_F(homa_incoming, homa_add_packet__packet_covers_entire_gap) homa_message_in_init(crpc, 10000, 0); unit_log_clear(); self->data.seg.offset = htonl(0); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0)); self->data.seg.offset = htonl(2800); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 2800)); EXPECT_STREQ("start 1400, end 2800", unit_print_gaps(crpc)); self->data.seg.offset = htonl(1400); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400)); EXPECT_EQ(3, skb_queue_len(&crpc->msgin.packets)); EXPECT_STREQ("", unit_print_gaps(crpc)); @@ -458,16 +458,16 @@ TEST_F(homa_incoming, homa_add_packet__packet_beyond_end_of_gap) homa_message_in_init(crpc, 10000, 0); unit_log_clear(); self->data.seg.offset = htonl(0); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0)); self->data.seg.offset = htonl(4200); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 4200)); EXPECT_STREQ("start 1400, end 4200", unit_print_gaps(crpc)); self->data.seg.offset = htonl(5000); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 5000)); EXPECT_EQ(2, skb_queue_len(&crpc->msgin.packets)); } @@ -480,16 +480,16 @@ TEST_F(homa_incoming, homa_add_packet__packet_straddles_end_of_gap) homa_message_in_init(crpc, 10000, 0); unit_log_clear(); self->data.seg.offset = htonl(0); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0)); self->data.seg.offset = htonl(4200); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 4200)); EXPECT_STREQ("start 1400, end 4200", unit_print_gaps(crpc)); self->data.seg.offset = htonl(4000); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 4000)); EXPECT_EQ(2, skb_queue_len(&crpc->msgin.packets)); } @@ -502,16 +502,16 @@ TEST_F(homa_incoming, homa_add_packet__packet_at_end_of_gap) homa_message_in_init(crpc, 10000, 0); unit_log_clear(); self->data.seg.offset = htonl(0); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0)); self->data.seg.offset = htonl(4200); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 4200)); EXPECT_STREQ("start 1400, end 4200", unit_print_gaps(crpc)); self->data.seg.offset = htonl(2800); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 2800)); EXPECT_EQ(3, skb_queue_len(&crpc->msgin.packets)); EXPECT_STREQ("start 1400, end 2800", unit_print_gaps(crpc)); @@ -526,18 +526,18 @@ TEST_F(homa_incoming, homa_add_packet__packet_in_middle_of_gap) 
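The expectations in the next hunk follow from the gap arithmetic: 1400-byte segments at offsets 0 and 4200 leave a single gap covering bytes 1400-4200, and a segment at offset 2000 then splits that gap via homa_gap_alloc(). Roughly:

/* Worked example for the test below (byte ranges):
 *   receive [0,1400) and [4200,5600)  ->  gap [1400,4200)
 *   receive [2000,3400)               ->  gaps [1400,2000) and [3400,4200)
 * homa_add_packet() allocates the left piece as a new gap and trims the
 * original down to the right piece; both pieces keep the original gap's
 * timestamp (time 1000 in the test, even though the split happens at
 * mock_ns 2000).
 */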
unit_log_clear(); mock_ns = 1000; self->data.seg.offset = htonl(0); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0)); self->data.seg.offset = htonl(4200); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 4200)); EXPECT_STREQ("start 1400, end 4200, time 1000", unit_print_gaps(crpc)); self->data.seg.offset = htonl(2000); mock_ns = 2000; - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 2000)); EXPECT_EQ(3, skb_queue_len(&crpc->msgin.packets)); EXPECT_STREQ("start 1400, end 2000, time 1000; start 3400, end 4200, time 1000", @@ -553,11 +553,11 @@ TEST_F(homa_incoming, homa_add_packet__kmalloc_failure_while_splitting_gap) unit_log_clear(); mock_ns = 1000; self->data.seg.offset = htonl(0); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0)); self->data.seg.offset = htonl(4200); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 4200)); EXPECT_STREQ("start 1400, end 4200, time 1000", unit_print_gaps(crpc)); @@ -565,7 +565,7 @@ TEST_F(homa_incoming, homa_add_packet__kmalloc_failure_while_splitting_gap) self->data.seg.offset = htonl(2000); mock_ns = 2000; mock_kmalloc_errors = 1; - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 2000)); EXPECT_EQ(2, skb_queue_len(&crpc->msgin.packets)); EXPECT_STREQ("start 1400, end 4200, time 1000", unit_print_gaps(crpc)); @@ -579,17 +579,17 @@ TEST_F(homa_incoming, homa_add_packet__scan_multiple_gaps) homa_message_in_init(crpc, 10000, 0); unit_log_clear(); self->data.seg.offset = htonl(1400); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0)); self->data.seg.offset = htonl(4200); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 4200)); EXPECT_STREQ("start 0, end 1400; start 2800, end 4200", unit_print_gaps(crpc)); self->data.seg.offset = htonl(2800); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 2800)); EXPECT_EQ(3, skb_queue_len(&crpc->msgin.packets)); EXPECT_STREQ("start 0, end 1400", unit_print_gaps(crpc)); @@ -604,21 +604,21 @@ TEST_F(homa_incoming, homa_add_packet__metrics) homa_message_in_init(crpc, 10000, 0); crpc->msgin.recv_end = 4200; self->data.seg.offset = htonl(0); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0)); EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); EXPECT_EQ(0, homa_metrics_per_cpu()->resent_discards); EXPECT_EQ(1, homa_metrics_per_cpu()->packet_discards); self->data.retransmit = 1; - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0)); EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); EXPECT_EQ(1, homa_metrics_per_cpu()->resent_discards); EXPECT_EQ(1, homa_metrics_per_cpu()->packet_discards); self->data.seg.offset = htonl(4200); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + 
homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 4200)); EXPECT_EQ(1, skb_queue_len(&crpc->msgin.packets)); EXPECT_EQ(1, homa_metrics_per_cpu()->resent_packets_used); @@ -637,10 +637,10 @@ TEST_F(homa_incoming, homa_copy_to_user__basics) ASSERT_NE(NULL, crpc); self->data.message_length = htonl(4000); self->data.seg.offset = htonl(1400); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, 1400, 101000), crpc); self->data.seg.offset = htonl(2800); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, 1200, 201800), crpc); EXPECT_NE(0, atomic_read(&crpc->flags) & RPC_PKTS_READY); @@ -687,7 +687,7 @@ TEST_F(homa_incoming, homa_copy_to_user__multiple_batches) self->data.message_length = htonl(20000); for (offset = 1400; offset < 1400*8; offset += 1400) { self->data.seg.offset = htonl(offset); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, 1400, offset), crpc); } EXPECT_EQ(8, skb_queue_len(&crpc->msgin.packets)); @@ -744,7 +744,7 @@ TEST_F(homa_incoming, homa_copy_to_user__many_chunks_for_one_skb) 1000, 4000); ASSERT_NE(NULL, crpc); self->data.message_length = htonl(4000); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, 3000, 101000), crpc); unit_log_clear(); @@ -770,7 +770,7 @@ TEST_F(homa_incoming, homa_copy_to_user__skb_data_extends_past_message_end) 1000, 4000); ASSERT_NE(NULL, crpc); self->data.message_length = htonl(4000); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, 3000, 101000), crpc); unit_log_clear(); @@ -830,7 +830,7 @@ TEST_F(homa_incoming, homa_copy_to_user__timetrace_info) self->data.message_length = htonl(20000); for (offset = 4200; offset < 1400*10; offset += 1400) { self->data.seg.offset = htonl(offset); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, 1400, offset), crpc); } EXPECT_EQ(8, skb_queue_len(&crpc->msgin.packets)); @@ -868,7 +868,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_ipv4) homa_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, &self->homa, 0); - skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); unit_log_clear(); homa_dispatch_pkts(skb, &self->homa); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); @@ -885,7 +885,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_ipv6) homa_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, &self->homa, 0); - skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); unit_log_clear(); homa_dispatch_pkts(skb, &self->homa); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); @@ -903,7 +903,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__server_not_enabled) mock_sock_init(&self->hsk, &self->homa, 0); self->hsk.is_server = false; - skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); unit_log_clear(); homa_dispatch_pkts(skb, &self->homa); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); @@ 
-920,9 +920,9 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_free_many_packets) homa_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, &self->homa, 0); - skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); - skb2 = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); - skb3 = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); + skb2 = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); + skb3 = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); skb->next = skb2; skb2->next = skb3; unit_log_clear(); @@ -932,7 +932,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_free_many_packets) } TEST_F(homa_incoming, homa_dispatch_pkts__new_server_rpc) { - homa_dispatch_pkts(mock_skb_new(self->client_ip, &self->data.common, + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0), &self->homa); EXPECT_EQ(1, unit_list_length(&self->hsk2.active_rpcs)); EXPECT_EQ(1, mock_skb_count()); @@ -940,7 +940,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__new_server_rpc) TEST_F(homa_incoming, homa_dispatch_pkts__cant_create_server_rpc) { mock_kmalloc_errors = 1; - homa_dispatch_pkts(mock_skb_new(self->client_ip, &self->data.common, + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0), &self->homa); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); EXPECT_EQ(0, mock_skb_count()); @@ -958,7 +958,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__existing_server_rpc) EXPECT_EQ(8600, srpc->msgin.bytes_remaining); self->data.seg.offset = htonl(1400); self->data.common.sender_id = cpu_to_be64(self->client_id); - homa_dispatch_pkts(mock_skb_new(self->client_ip, &self->data.common, + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0), &self->homa); EXPECT_EQ(7200, srpc->msgin.bytes_remaining); } @@ -982,7 +982,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__non_data_packet_for_existing_server_rp ASSERT_NE(NULL, srpc); unit_log_clear(); - homa_dispatch_pkts(mock_skb_new(self->client_ip, &resend.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &resend.common, 0, 0), &self->homa); EXPECT_STREQ("xmit BUSY", unit_log_get()); } @@ -999,7 +999,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__existing_client_rpc) crpc->msgout.next_xmit_offset = crpc->msgout.length; self->data.message_length = htonl(1600); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, 1400, 0), crpc); EXPECT_EQ(RPC_INCOMING, crpc->state); EXPECT_EQ(200, crpc->msgin.bytes_remaining); @@ -1012,7 +1012,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_client_rpc) .type = RPC_UNKNOWN}}; mock_xmit_log_verbose = 1; - homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_EQ(1, homa_metrics_per_cpu()->unknown_rpcs); } @@ -1024,7 +1024,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_server_rpc) .type = GRANT}}; mock_xmit_log_verbose = 1; - homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_EQ(0, homa_metrics_per_cpu()->unknown_rpcs); } @@ -1040,7 +1040,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__cutoffs_for_unknown_client_rpc) .cutoff_version = 400}; struct homa_peer *peer; - 
homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), &self->homa); peer = homa_peer_find(self->homa.peers, self->server_ip, &self->hsk.inet); @@ -1062,7 +1062,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__resend_for_unknown_server_rpc) .offset = 0, .length = 2000}; #endif /* See strip.py */ - homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_STREQ("xmit RPC_UNKNOWN", unit_log_get()); } @@ -1085,7 +1085,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__reset_counters) unit_log_clear(); crpc->silent_ticks = 5; crpc->peer->outstanding_resends = 2; - homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), &self->homa); EXPECT_EQ(0, crpc->silent_ticks); EXPECT_EQ(0, crpc->peer->outstanding_resends); @@ -1094,7 +1094,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__reset_counters) h.common.type = CUTOFFS; crpc->silent_ticks = 5; crpc->peer->outstanding_resends = 2; - homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), &self->homa); EXPECT_EQ(5, crpc->silent_ticks); EXPECT_EQ(0, crpc->peer->outstanding_resends); @@ -1115,9 +1115,9 @@ TEST_F(homa_incoming, homa_dispatch_pkts__multiple_ack_packets) ack.num_acks = htons(1); ack.acks[0].server_port = htons(self->server_port); ack.acks[0].client_id = cpu_to_be64(self->client_id + 4); - skb = mock_skb_new(self->client_ip, &ack.common, 0, 0); - skb2 = mock_skb_new(self->client_ip, &ack.common, 0, 0); - skb3 = mock_skb_new(self->client_ip, &ack.common, 0, 0); + skb = mock_skb_alloc(self->client_ip, &ack.common, 0, 0); + skb2 = mock_skb_alloc(self->client_ip, &ack.common, 0, 0); + skb3 = mock_skb_alloc(self->client_ip, &ack.common, 0, 0); skb->next = skb2; skb2->next = skb3; @@ -1140,7 +1140,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_type) struct homa_common_hdr h = {.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = 99}; - homa_dispatch_pkts(mock_skb_new(self->client_ip, &h, 0, 0), &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h, 0, 0), &self->homa); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->unknown_packet_types); #endif /* See strip.py */ @@ -1157,7 +1157,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__handle_ack) .client_id = cpu_to_be64(self->client_id)}; self->data.common.sender_id = cpu_to_be64(self->client_id+10); unit_log_clear(); - homa_dispatch_pkts(mock_skb_new(self->client_ip, &self->data.common, + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0), &self->homa); EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc)); EXPECT_SUBSTR("ack 1235", unit_log_get()); @@ -1171,11 +1171,11 @@ TEST_F(homa_incoming, homa_dispatch_pkts__too_many_acks) .client_id = cpu_to_be64(self->client_id)}; self->data.common.sender_id = cpu_to_be64(self->client_id+10); unit_log_clear(); - skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); self->data.ack.client_id = cpu_to_be64(self->client_id+2); - skb2 = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); + skb2 = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); self->data.ack.client_id = 
cpu_to_be64(self->client_id+4); - skb3 = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); + skb3 = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); skb->next = skb2; skb2->next = skb3; homa_dispatch_pkts(skb, &self->homa); @@ -1218,7 +1218,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__forced_reap) /* First packet: below the threshold for reaps. */ self->data.common.dport = htons(self->hsk.port); - homa_dispatch_pkts(mock_skb_new(self->client_ip, &self->data.common, + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0), &self->homa); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(31, self->hsk.dead_skbs); @@ -1232,7 +1232,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__forced_reap) /* Second packet: must reap. */ self->homa.dead_buffs_limit = 15; self->homa.reap_limit = 10; - homa_dispatch_pkts(mock_skb_new(self->client_ip, &self->data.common, + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0), &self->homa); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(21, self->hsk.dead_skbs); @@ -1254,7 +1254,7 @@ TEST_F(homa_incoming, homa_data_pkt__basics) unit_log_clear(); crpc->msgout.next_xmit_offset = crpc->msgout.length; self->data.message_length = htonl(1600); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, 1400, 0), crpc); EXPECT_EQ(RPC_INCOMING, crpc->state); EXPECT_EQ(1, unit_list_length(&self->hsk.ready_rpcs)); @@ -1275,7 +1275,7 @@ TEST_F(homa_incoming, homa_data_pkt__wrong_client_rpc_state) crpc->state = RPC_DEAD; self->data.message_length = htonl(2000); self->data.seg.offset = htonl(1400); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, 600, 1400), crpc); EXPECT_EQ(600, crpc->msgin.bytes_remaining); EXPECT_EQ(1, skb_queue_len(&crpc->msgin.packets)); @@ -1289,7 +1289,7 @@ TEST_F(homa_incoming, homa_data_pkt__initialize_msgin) ASSERT_NE(NULL, crpc); self->data.message_length = htonl(1600); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, 1400, 0), crpc); EXPECT_EQ(200, crpc->msgin.bytes_remaining); #ifndef __STRIP__ /* See strip.py */ @@ -1303,10 +1303,10 @@ TEST_F(homa_incoming, homa_data_pkt__no_buffer_pool) self->server_port, self->client_id, 1000, 1600); ASSERT_NE(NULL, crpc); - homa_pool_destroy(self->hsk.buffer_pool); - self->hsk.buffer_pool = homa_pool_new(&self->hsk); + homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); unit_log_clear(); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, 1400, 0), crpc); EXPECT_STREQ("homa_data_pkt discarded packet", unit_log_get()); } @@ -1318,7 +1318,7 @@ TEST_F(homa_incoming, homa_data_pkt__wrong_server_rpc_state) ASSERT_NE(NULL, srpc); unit_log_clear(); - homa_data_pkt(mock_skb_new(self->client_ip, &self->data.common, + homa_data_pkt(mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0), srpc); EXPECT_EQ(RPC_OUTGOING, srpc->state); EXPECT_STREQ("homa_data_pkt discarded packet", unit_log_get()); @@ -1333,7 +1333,7 @@ TEST_F(homa_incoming, homa_data_pkt__no_buffers) unit_log_clear(); atomic_set(&self->hsk.buffer_pool->free_bpages, 0); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, 1400, 0), crpc); 
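The msgin accounting these homa_data_pkt tests check is straightforward: bytes_remaining starts at message_length and each accepted packet subtracts its payload, while a packet dropped for lack of buffer space subtracts nothing and instead adds its payload size to the dropped_data_no_bufs metric, as the lines that follow verify. In short:

/* Accounting sketch for the homa_data_pkt tests:
 *   msgin.bytes_remaining = message_length - sum(accepted payloads)
 *   e.g. 1600 - 1400 = 200 (the initialize_msgin expectation above)
 * A drop with no bpages available leaves bytes_remaining untouched and
 * adds 1400 to homa_metrics_per_cpu()->dropped_data_no_bufs.
 */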
#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1400, homa_metrics_per_cpu()->dropped_data_no_bufs); @@ -1354,17 +1354,17 @@ TEST_F(homa_incoming, homa_data_pkt__update_delta) #ifndef __STRIP__ /* See strip.py */ self->data.incoming = htonl(4000); #endif /* See strip.py */ - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, 1400, 0), crpc); /* Total incoming drops on subsequent packet. */ self->data.seg.offset = htonl(2800); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, 1400, 2800), crpc); /* Duplicate packet should have no effect. */ self->data.seg.offset = htonl(2800); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, 1400, 2800), crpc); } TEST_F(homa_incoming, homa_data_pkt__handoff) @@ -1380,7 +1380,7 @@ TEST_F(homa_incoming, homa_data_pkt__handoff) /* First packet triggers handoff. */ self->data.message_length = htonl(3000); self->data.seg.offset = htonl(1400); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, 1400, 0), crpc); EXPECT_EQ(1, unit_list_length(&self->hsk.ready_rpcs)); EXPECT_TRUE(atomic_read(&crpc->flags) & RPC_PKTS_READY); @@ -1394,7 +1394,7 @@ TEST_F(homa_incoming, homa_data_pkt__handoff) self->data.message_length = htonl(3000); self->data.seg.offset = htonl(2800); unit_log_clear(); - homa_data_pkt(mock_skb_new(self->server_ip, &self->data.common, + homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, 200, 0), crpc); EXPECT_STREQ("", unit_log_get()); } @@ -1412,7 +1412,7 @@ TEST_F(homa_incoming, homa_data_pkt__send_cutoffs) self->homa.unsched_cutoffs[7] = 12; self->data.message_length = htonl(5000); mock_xmit_log_verbose = 1; - homa_dispatch_pkts(mock_skb_new(self->client_ip, &self->data.common, + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0), &self->homa); EXPECT_SUBSTR("cutoffs 19 18 17 16 15 14 13 12, version 2", unit_log_get()); @@ -1423,7 +1423,7 @@ TEST_F(homa_incoming, homa_data_pkt__send_cutoffs) unit_log_clear(); self->homa.cutoff_version = 3; self->data.seg.offset = 1400; - homa_dispatch_pkts(mock_skb_new(self->client_ip, &self->data.common, + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0), &self->homa); EXPECT_STREQ("", unit_log_get()); } @@ -1431,7 +1431,7 @@ TEST_F(homa_incoming, homa_data_pkt__cutoffs_up_to_date) { self->homa.cutoff_version = 123; self->data.cutoff_version = htons(123); - homa_dispatch_pkts(mock_skb_new(self->client_ip, &self->data.common, + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0), &self->homa); EXPECT_STREQ("sk->sk_data_ready invoked", unit_log_get()); } @@ -1455,7 +1455,7 @@ TEST_F(homa_incoming, homa_grant_pkt__basics) homa_rpc_unlock(srpc); unit_log_clear(); - homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_EQ(11000, srpc->msgout.granted); EXPECT_STREQ("xmit DATA 1400@10000", unit_log_get()); @@ -1463,7 +1463,7 @@ TEST_F(homa_incoming, homa_grant_pkt__basics) /* Don't let grant offset go backwards. 
*/ h.offset = htonl(10000); unit_log_clear(); - homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_EQ(11000, srpc->msgout.granted); EXPECT_STREQ("", unit_log_get()); @@ -1472,7 +1472,7 @@ TEST_F(homa_incoming, homa_grant_pkt__basics) h.offset = htonl(20000); srpc->state = RPC_INCOMING; unit_log_clear(); - homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_EQ(11000, srpc->msgout.granted); EXPECT_STREQ("", unit_log_get()); @@ -1501,7 +1501,7 @@ TEST_F(homa_incoming, homa_grant_pkt__reset) EXPECT_EQ(10000, srpc->msgout.granted); EXPECT_EQ(10000, srpc->msgout.next_xmit_offset); - homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_EQ(10000, srpc->msgout.granted); EXPECT_EQ(10000, srpc->msgout.next_xmit_offset); @@ -1528,7 +1528,7 @@ TEST_F(homa_incoming, homa_grant_pkt__grant_past_end_of_message) ASSERT_NE(NULL, crpc); unit_log_clear(); - homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_EQ(20000, crpc->msgout.granted); } @@ -1543,7 +1543,7 @@ TEST_F(homa_incoming, homa_resend_pkt__unknown_rpc) .offset = htonl(100), .length = htonl(200)}; - homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_STREQ("xmit RPC_UNKNOWN", unit_log_get()); } @@ -1562,7 +1562,7 @@ TEST_F(homa_incoming, homa_resend_pkt__rpc_in_service_server_sends_busy) ASSERT_NE(NULL, srpc); unit_log_clear(); - homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_STREQ("xmit BUSY", unit_log_get()); } @@ -1587,7 +1587,7 @@ TEST_F(homa_incoming, homa_resend_pkt__rpc_incoming_server_sends_busy) #endif /* See strip.py */ unit_log_clear(); - homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), &self->homa); // The server might send a GRANT right after BUSY so just check substr EXPECT_SUBSTR("xmit BUSY", unit_log_get()); @@ -1610,7 +1610,7 @@ TEST_F(homa_incoming, homa_resend_pkt__client_not_outgoing) ASSERT_NE(NULL, crpc); unit_log_clear(); - homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), &self->homa); EXPECT_STREQ("xmit DATA retrans 1400@0", unit_log_get()); } @@ -1630,7 +1630,7 @@ TEST_F(homa_incoming, homa_resend_pkt__send_busy_instead_of_data) ASSERT_NE(NULL, crpc); unit_log_clear(); - homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), &self->homa); EXPECT_SUBSTR("xmit BUSY", unit_log_get()); } @@ -1659,7 +1659,7 @@ TEST_F(homa_incoming, homa_resend_pkt__client_send_data) unit_log_clear(); mock_clear_xmit_prios(); - homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), &self->homa); EXPECT_SUBSTR("xmit DATA retrans 1400@0", unit_log_get()); #ifndef __STRIP__ /* See strip.py */ @@ -1690,7 +1690,7 @@ TEST_F(homa_incoming, homa_resend_pkt__server_send_data) 
unit_log_clear(); mock_clear_xmit_prios(); - homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_STREQ("xmit DATA retrans 1400@0; " "xmit DATA retrans 1400@1400", unit_log_get()); @@ -1716,7 +1716,7 @@ TEST_F(homa_incoming, homa_unknown_pkt__client_resend_all) unit_log_clear(); mock_xmit_log_verbose = 1; - homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), &self->homa); #ifndef __STRIP__ /* See strip.py */ EXPECT_SUBSTR("xmit DATA from 0.0.0.0:32768, dport 99, id 1234, message_length 2000, offset 0, data_length 1400, incoming 2000, RETRANSMIT; " @@ -1749,7 +1749,7 @@ TEST_F(homa_incoming, homa_unknown_pkt__client_resend_part) unit_log_clear(); mock_xmit_log_verbose = 1; - homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), &self->homa); #ifndef __STRIP__ /* See strip.py */ EXPECT_SUBSTR("xmit DATA from 0.0.0.0:32768, dport 99, id 1234, message_length 2000, offset 0, data_length 1400, incoming 1400, RETRANSMIT", @@ -1773,7 +1773,7 @@ TEST_F(homa_incoming, homa_unknown_pkt__free_server_rpc) ASSERT_NE(NULL, srpc); unit_log_clear(); - homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc)); } @@ -1796,7 +1796,7 @@ TEST_F(homa_incoming, homa_cutoffs_pkt_basics) EXPECT_EQ(10000, crpc->msgout.granted); unit_log_clear(); - homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), &self->homa); EXPECT_EQ(400, crpc->peer->cutoff_version); EXPECT_EQ(9, crpc->peer->unsched_cutoffs[1]); @@ -1811,7 +1811,7 @@ TEST_F(homa_incoming, homa_cutoffs__cant_find_peer) .unsched_cutoffs = {htonl(10), htonl(9), htonl(8), htonl(7), htonl(6), htonl(5), htonl(4), htonl(3)}, .cutoff_version = 400}; - struct sk_buff *skb = mock_skb_new(self->server_ip, &h.common, 0, 0); + struct sk_buff *skb = mock_skb_alloc(self->server_ip, &h.common, 0, 0); struct homa_peer *peer; mock_kmalloc_errors = 1; @@ -1838,7 +1838,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_fully_received) ASSERT_NE(NULL, crpc); unit_log_clear(); mock_xmit_log_verbose = 1; - homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), &self->homa); EXPECT_STREQ("xmit ACK from 0.0.0.0:32768, dport 99, id 1234, acks", unit_log_get()); @@ -1861,7 +1861,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_not_fully_received) ASSERT_NE(NULL, crpc); unit_log_clear(); mock_xmit_log_verbose = 1; - homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), &self->homa); EXPECT_STREQ("", unit_log_get()); #ifndef __STRIP__ /* See strip.py */ @@ -1883,7 +1883,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_not_incoming) ASSERT_NE(NULL, crpc); unit_log_clear(); mock_xmit_log_verbose = 1; - homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), &self->homa); EXPECT_STREQ("", unit_log_get()); #ifndef __STRIP__ /* See strip.py */ @@ -1905,7 +1905,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_doesnt_exist) 
peer->acks[0].client_id = cpu_to_be64(self->client_id+2); peer->num_acks = 1; mock_xmit_log_verbose = 1; - homa_dispatch_pkts(mock_skb_new(self->server_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), &self->homa); EXPECT_STREQ("xmit ACK from 0.0.0.0:32768, dport 99, id 1234, acks [sp 99, id 1236]", unit_log_get()); @@ -1927,7 +1927,7 @@ TEST_F(homa_incoming, homa_ack_pkt__target_rpc_exists_no_extras) EXPECT_EQ(1, unit_list_length(&self->hsk2.active_rpcs)); unit_log_clear(); mock_xmit_log_verbose = 1; - homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_EQ(0, unit_list_length(&self->hsk2.active_rpcs)); #ifndef __STRIP__ /* See strip.py */ @@ -1962,7 +1962,7 @@ TEST_F(homa_incoming, homa_ack_pkt__target_rpc_exists_plus_extras) .client_id = cpu_to_be64(self->server_id+1)}; h.acks[1] = (struct homa_ack) {.server_port = htons(self->server_port), .client_id = cpu_to_be64(self->server_id+3)}; - homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_EQ(0, unit_list_length(&self->hsk2.active_rpcs)); EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc1)); @@ -1993,7 +1993,7 @@ TEST_F(homa_incoming, homa_ack_pkt__target_rpc_doesnt_exist) .client_id = cpu_to_be64(self->server_id+5)}; h.acks[1] = (struct homa_ack) {.server_port = htons(self->server_port), .client_id = cpu_to_be64(self->server_id+1)}; - homa_dispatch_pkts(mock_skb_new(self->client_ip, &h.common, 0, 0), + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), &self->homa); EXPECT_EQ(1, unit_list_length(&self->hsk2.active_rpcs)); EXPECT_STREQ("OUTGOING", homa_symbol_for_state(srpc1)); diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index f482456c..bcb00f8d 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -63,14 +63,14 @@ FIXTURE_SETUP(homa_offload) } self->napi.gro_bitmask = 0; - self->skb = mock_skb_new(&self->ip, &self->header.common, 1400, 2000); + self->skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 2000); NAPI_GRO_CB(self->skb)->same_flow = 0; NAPI_GRO_CB(self->skb)->last = self->skb; NAPI_GRO_CB(self->skb)->count = 1; self->header.seg.offset = htonl(4000); self->header.common.dport = htons(88); self->header.common.sender_id = cpu_to_be64(1002); - self->skb2 = mock_skb_new(&self->ip, &self->header.common, 1400, 0); + self->skb2 = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); NAPI_GRO_CB(self->skb2)->same_flow = 0; NAPI_GRO_CB(self->skb2)->last = self->skb2; NAPI_GRO_CB(self->skb2)->count = 1; @@ -135,7 +135,7 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_tcp) homa_gro_hook_tcp(); self->header.seg.offset = htonl(6000); - skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); h = (struct homa_common_hdr *) skb_transport_header(skb); h->flags = 0; EXPECT_EQ(NULL, homa_tcp_gro_receive(&self->empty_list, skb)); @@ -143,7 +143,7 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_tcp) kfree_skb(skb); unit_log_clear(); - skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); h = (struct homa_common_hdr *)skb_transport_header(skb); h->urgent -= 1; EXPECT_EQ(NULL, homa_tcp_gro_receive(&self->empty_list, skb)); @@ -159,7 +159,7 @@ TEST_F(homa_offload, 
homa_tcp_gro_receive__pass_to_homa_ipv6) mock_ipv6 = true; homa_gro_hook_tcp(); self->header.seg.offset = htonl(6000); - skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); ip_hdr(skb)->protocol = IPPROTO_TCP; h = (struct homa_common_hdr *)skb_transport_header(skb); h->flags = HOMA_TCP_FLAGS; @@ -182,7 +182,7 @@ TEST_F(homa_offload, homa_tcp_gro_receive__pass_to_homa_ipv4) mock_ipv6 = false; homa_gro_hook_tcp(); self->header.seg.offset = htonl(6000); - skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); ip_hdr(skb)->protocol = IPPROTO_TCP; h = (struct homa_common_hdr *)skb_transport_header(skb); h->flags = HOMA_TCP_FLAGS; @@ -205,7 +205,7 @@ TEST_F(homa_offload, homa_gso_segment_set_ip_ids) int version; mock_ipv6 = false; - skb = mock_skb_new(&self->ip, &self->header.common, 1400, 2000); + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 2000); version = ip_hdr(skb)->version; EXPECT_EQ(4, version); segs = homa_gso_segment(skb, 0); @@ -227,7 +227,7 @@ TEST_F(homa_offload, homa_gro_receive__update_offset_from_sequence) /* First call: copy offset from sequence number. */ self->header.common.sequence = htonl(6000); self->header.seg.offset = -1; - skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); NAPI_GRO_CB(skb)->same_flow = 0; cur_offload_core->held_skb = NULL; cur_offload_core->held_bucket = 99; @@ -238,7 +238,7 @@ TEST_F(homa_offload, homa_gro_receive__update_offset_from_sequence) /* Second call: offset already valid. */ self->header.common.sequence = htonl(6000); self->header.seg.offset = ntohl(5000); - skb2 = mock_skb_new(&self->ip, &self->header.common, 1400, 0); + skb2 = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); NAPI_GRO_CB(skb2)->same_flow = 0; EXPECT_EQ(NULL, homa_gro_receive(&self->empty_list, skb2)); h = (struct homa_data_hdr *)skb_transport_header(skb2); @@ -275,7 +275,7 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS) unit_log_clear(); /* First attempt: HOMA_GRO_SHORT_BYPASS not enabled. */ - skb = mock_skb_new(&self->ip, &h.common, 1400, 2000); + skb = mock_skb_alloc(&self->ip, &h.common, 1400, 2000); result = homa_gro_receive(&self->empty_list, skb); EXPECT_EQ(0, -PTR_ERR(result)); EXPECT_EQ(0, homa_metrics_per_cpu()->gro_data_bypasses); @@ -285,7 +285,7 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS) */ self->homa.gro_policy |= HOMA_GRO_SHORT_BYPASS; cur_offload_core->last_gro = 400; - skb2 = mock_skb_new(&self->ip, &h.common, 1400, 2000); + skb2 = mock_skb_alloc(&self->ip, &h.common, 1400, 2000); result = homa_gro_receive(&self->empty_list, skb2); EXPECT_EQ(0, -PTR_ERR(result)); EXPECT_EQ(0, homa_metrics_per_cpu()->gro_data_bypasses); @@ -294,14 +294,14 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS) h.message_length = htonl(1400); h.incoming = htonl(1400); cur_offload_core->last_gro = 400; - skb3 = mock_skb_new(&self->ip, &h.common, 1400, 4000); + skb3 = mock_skb_alloc(&self->ip, &h.common, 1400, 4000); result = homa_gro_receive(&self->empty_list, skb3); EXPECT_EQ(EINPROGRESS, -PTR_ERR(result)); EXPECT_EQ(1, homa_metrics_per_cpu()->gro_data_bypasses); /* Third attempt: no bypass because core busy. 
*/
 	cur_offload_core->last_gro = 600;
-	skb4 = mock_skb_new(&self->ip, &h.common, 1400, 4000);
+	skb4 = mock_skb_alloc(&self->ip, &h.common, 1400, 4000);
 	result = homa_gro_receive(&self->empty_list, skb4);
 	EXPECT_EQ(0, -PTR_ERR(result));
 	EXPECT_EQ(1, homa_metrics_per_cpu()->gro_data_bypasses);
@@ -340,7 +340,7 @@ TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization)
 
 	/* First attempt: HOMA_GRO_FAST_GRANTS not enabled. */
 	self->homa.gro_policy = 0;
-	skb = mock_skb_new(&client_ip, &h.common, 0, 0);
+	skb = mock_skb_alloc(&client_ip, &h.common, 0, 0);
 	result = homa_gro_receive(&self->empty_list, skb);
 	EXPECT_EQ(0, -PTR_ERR(result));
 	EXPECT_EQ(0, homa_metrics_per_cpu()->gro_grant_bypasses);
@@ -349,7 +349,7 @@ TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization)
 	/* Second attempt: HOMA_FAST_GRANTS is enabled. */
 	self->homa.gro_policy = HOMA_GRO_FAST_GRANTS;
 	cur_offload_core->last_gro = 400;
-	skb2 = mock_skb_new(&client_ip, &h.common, 0, 0);
+	skb2 = mock_skb_alloc(&client_ip, &h.common, 0, 0);
 	result = homa_gro_receive(&self->empty_list, skb2);
 	EXPECT_EQ(EINPROGRESS, -PTR_ERR(result));
 	EXPECT_EQ(1, homa_metrics_per_cpu()->gro_grant_bypasses);
@@ -357,7 +357,7 @@ TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization)
 
 	/* Third attempt: core is too busy for fast grants. */
 	cur_offload_core->last_gro = 600;
-	skb3 = mock_skb_new(&client_ip, &h.common, 0, 0);
+	skb3 = mock_skb_alloc(&client_ip, &h.common, 0, 0);
 	result = homa_gro_receive(&self->empty_list, skb3);
 	EXPECT_EQ(0, -PTR_ERR(result));
 	EXPECT_EQ(1, homa_metrics_per_cpu()->gro_grant_bypasses);
@@ -370,7 +370,7 @@ TEST_F(homa_offload, homa_gro_receive__no_held_skb)
 	int same_flow;
 
 	self->header.seg.offset = htonl(6000);
-	skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0);
+	skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0);
 	skb->hash = 2;
 	NAPI_GRO_CB(skb)->same_flow = 0;
 	cur_offload_core->held_skb = NULL;
@@ -388,7 +388,7 @@ TEST_F(homa_offload, homa_gro_receive__empty_merge_list)
 	int same_flow;
 
 	self->header.seg.offset = htonl(6000);
-	skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0);
+	skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0);
 	skb->hash = 2;
 	NAPI_GRO_CB(skb)->same_flow = 0;
 	cur_offload_core->held_skb = self->skb;
@@ -406,7 +406,7 @@ TEST_F(homa_offload, homa_gro_receive__held_skb_not_in_merge_list)
 	int same_flow;
 
 	self->header.seg.offset = htonl(6000);
-	skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0);
+	skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0);
 	skb->hash = 3;
 	NAPI_GRO_CB(skb)->same_flow = 0;
 	cur_offload_core->held_skb = skb;
@@ -424,7 +424,7 @@ TEST_F(homa_offload, homa_gro_receive__held_skb__in_merge_list_but_wrong_proto)
 	int same_flow;
 
 	self->header.seg.offset = htonl(6000);
-	skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0);
+	skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0);
 	skb->hash = 3;
 	NAPI_GRO_CB(skb)->same_flow = 0;
 	cur_offload_core->held_skb = self->skb;
@@ -450,7 +450,7 @@ TEST_F(homa_offload, homa_gro_receive__merge)
 
 	self->header.seg.offset = htonl(6000);
 	self->header.common.sender_id = cpu_to_be64(1002);
-	skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0);
+	skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0);
 	NAPI_GRO_CB(skb)->same_flow = 0;
 	EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro_hash[3].list, skb));
 	same_flow = NAPI_GRO_CB(skb)->same_flow;
@@ -459,7 +459,7 @@ TEST_F(homa_offload, homa_gro_receive__merge)
 
 	self->header.seg.offset = htonl(7000);
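	/* Second packet: it hashes to the bucket this core already holds,
	 * so homa_gro_receive should merge it into the held skb and return
	 * NULL instead of passing it up the stack.
	 */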
self->header.common.sender_id = cpu_to_be64(1004); - skb2 = mock_skb_new(&self->ip, &self->header.common, 1400, 0); + skb2 = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); NAPI_GRO_CB(skb2)->same_flow = 0; EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro_hash[3].list, skb2)); same_flow = NAPI_GRO_CB(skb)->same_flow; @@ -480,14 +480,14 @@ TEST_F(homa_offload, homa_gro_receive__max_gro_skbs) cur_offload_core->held_skb = self->skb2; cur_offload_core->held_bucket = 2; self->header.seg.offset = htonl(6000); - skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); homa_gro_receive(&self->napi.gro_hash[3].list, skb); EXPECT_EQ(2, NAPI_GRO_CB(self->skb2)->count); EXPECT_EQ(2, self->napi.gro_hash[2].count); // Second packet hits the limit. self->header.common.sport = htons(40001); - skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); unit_log_clear(); EXPECT_EQ(EINPROGRESS, -PTR_ERR(homa_gro_receive( &self->napi.gro_hash[3].list, skb))); @@ -503,7 +503,7 @@ TEST_F(homa_offload, homa_gro_receive__max_gro_skbs) // to become empty. self->homa.max_gro_skbs = 2; cur_offload_core->held_skb = self->skb; - skb = mock_skb_new(&self->ip, &self->header.common, 1400, 0); + skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); unit_log_clear(); EXPECT_EQ(EINPROGRESS, -PTR_ERR(homa_gro_receive( &self->napi.gro_hash[3].list, skb))); diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index d0b4c330..34d47b8a 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -124,7 +124,7 @@ TEST_F(homa_outgoing, set_priority__priority_mapping) TEST_F(homa_outgoing, homa_fill_data_interleaved) { - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); char buffer[1000]; @@ -133,7 +133,7 @@ TEST_F(homa_outgoing, homa_fill_data_interleaved) homa_message_out_init(crpc, 10000); unit_log_clear(); - struct sk_buff *skb = homa_new_data_packet(crpc, iter, 10000, 5000, + struct sk_buff *skb = homa_tx_data_pkt_alloc(crpc, iter, 10000, 5000, 1500); EXPECT_STREQ("_copy_from_iter 1500 bytes at 1000; " "_copy_from_iter 1500 bytes at 2500; " @@ -153,7 +153,7 @@ TEST_F(homa_outgoing, homa_fill_data_interleaved) } TEST_F(homa_outgoing, homa_fill_data_interleaved__error_copying_data) { - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); struct sk_buff *skb; @@ -163,14 +163,14 @@ TEST_F(homa_outgoing, homa_fill_data_interleaved__error_copying_data) unit_log_clear(); mock_copy_data_errors = 1; - skb = homa_new_data_packet(crpc, iter, 10000, 5000, 1500); + skb = homa_tx_data_pkt_alloc(crpc, iter, 10000, 5000, 1500); EXPECT_EQ(EFAULT, -PTR_ERR(skb)); } -TEST_F(homa_outgoing, homa_new_data_packet__one_segment) +TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__one_segment) { struct iov_iter *iter = unit_iov_iter((void *) 1000, 5000); - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); struct sk_buff *skb; char buffer[1000]; @@ -179,7 +179,7 @@ TEST_F(homa_outgoing, homa_new_data_packet__one_segment) homa_message_out_init(crpc, 500); unit_log_clear(); - skb = homa_new_data_packet(crpc, 
iter, 5000, 500, 2000); + skb = homa_tx_data_pkt_alloc(crpc, iter, 5000, 500, 2000); EXPECT_STREQ("_copy_from_iter 500 bytes at 1000", unit_log_get()); #ifndef __STRIP__ /* See strip.py */ @@ -193,10 +193,10 @@ TEST_F(homa_outgoing, homa_new_data_packet__one_segment) EXPECT_EQ(0, skb_shinfo(skb)->gso_segs); kfree_skb(skb); } -TEST_F(homa_outgoing, homa_new_data_packet__cant_allocate_skb) +TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__cant_allocate_skb) { struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); struct sk_buff *skb; @@ -205,14 +205,14 @@ TEST_F(homa_outgoing, homa_new_data_packet__cant_allocate_skb) unit_log_clear(); mock_alloc_skb_errors = 1; - skb = homa_new_data_packet(crpc, iter, 0, 500, 2000); + skb = homa_tx_data_pkt_alloc(crpc, iter, 0, 500, 2000); EXPECT_TRUE(IS_ERR(skb)); EXPECT_EQ(ENOMEM, -PTR_ERR(skb)); } -TEST_F(homa_outgoing, homa_new_data_packet__multiple_segments_homa_fill_data_interleaved) +TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__multiple_segments_homa_fill_data_interleaved) { struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); struct sk_buff *skb; char buffer[1000]; @@ -221,7 +221,7 @@ TEST_F(homa_outgoing, homa_new_data_packet__multiple_segments_homa_fill_data_int homa_message_out_init(crpc, 10000); unit_log_clear(); - skb = homa_new_data_packet(crpc, iter, 10000, 5000, 1500); + skb = homa_tx_data_pkt_alloc(crpc, iter, 10000, 5000, 1500); EXPECT_STREQ("_copy_from_iter 1500 bytes at 1000; " "_copy_from_iter 1500 bytes at 2500; " "_copy_from_iter 1500 bytes at 4000; " @@ -241,10 +241,10 @@ TEST_F(homa_outgoing, homa_new_data_packet__multiple_segments_homa_fill_data_int kfree_skb(skb); } #ifndef __STRIP__ /* See strip.py */ -TEST_F(homa_outgoing, homa_new_data_packet__error_in_homa_fill_data_interleaved) +TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__error_in_homa_fill_data_interleaved) { struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); homa_rpc_unlock(crpc); @@ -252,12 +252,12 @@ TEST_F(homa_outgoing, homa_new_data_packet__error_in_homa_fill_data_interleaved) unit_log_clear(); mock_alloc_page_errors = -1; - struct sk_buff *skb = homa_new_data_packet(crpc, iter, 10000, 5000, + struct sk_buff *skb = homa_tx_data_pkt_alloc(crpc, iter, 10000, 5000, 1500); EXPECT_TRUE(IS_ERR(skb)); EXPECT_EQ(ENOMEM, -PTR_ERR(skb)); } -TEST_F(homa_outgoing, homa_new_data_packet__multiple_segments_tcp_hijacking) +TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__multiple_segments_tcp_hijacking) { struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); struct homa_rpc *crpc; @@ -267,12 +267,12 @@ TEST_F(homa_outgoing, homa_new_data_packet__multiple_segments_tcp_hijacking) self->homa.hijack_tcp = 1; mock_sock_init(&hsk, &self->homa, self->client_port+1); - crpc = homa_rpc_new_client(&hsk, &self->server_addr); + crpc = homa_rpc_alloc_client(&hsk, &self->server_addr); homa_rpc_unlock(crpc); homa_message_out_init(crpc, 10000); unit_log_clear(); - skb = homa_new_data_packet(crpc, iter, 10000, 5000, 1500); + skb = homa_tx_data_pkt_alloc(crpc, iter, 10000, 5000, 1500); EXPECT_STREQ("_copy_from_iter 5000 bytes at 1000", unit_log_get()); EXPECT_STREQ("DATA 
from 0.0.0.0:40001, dport 99, id 2, message_length 10000, offset 10000, data_length 1500, incoming 10000, extra segs 1500@11500 1500@13000 500@14500", @@ -280,10 +280,10 @@ TEST_F(homa_outgoing, homa_new_data_packet__multiple_segments_tcp_hijacking) kfree_skb(skb); homa_sock_destroy(&hsk); } -TEST_F(homa_outgoing, homa_new_data_packet__error_copying_data_hijacking_path) +TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__error_copying_data_hijacking_path) { struct iov_iter *iter = unit_iov_iter((void *) 1000, 5000); - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); struct sk_buff *skb; @@ -292,15 +292,15 @@ TEST_F(homa_outgoing, homa_new_data_packet__error_copying_data_hijacking_path) unit_log_clear(); mock_copy_data_errors = 1; - skb = homa_new_data_packet(crpc, iter, 5000, 500, 2000); + skb = homa_tx_data_pkt_alloc(crpc, iter, 5000, 500, 2000); EXPECT_TRUE(IS_ERR(skb)); EXPECT_EQ(EFAULT, -PTR_ERR(skb)); } #endif /* See strip.py */ -TEST_F(homa_outgoing, homa_new_data_packet__gso_information) +TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__gso_information) { struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); struct sk_buff *skb; @@ -308,7 +308,7 @@ TEST_F(homa_outgoing, homa_new_data_packet__gso_information) homa_message_out_init(crpc, 10000); unit_log_clear(); - skb = homa_new_data_packet(crpc, iter, 10000, 5000, 1500); + skb = homa_tx_data_pkt_alloc(crpc, iter, 10000, 5000, 1500); EXPECT_EQ(4, skb_shinfo(skb)->gso_segs); EXPECT_EQ(1500 + sizeof(struct homa_seg_hdr), @@ -316,10 +316,10 @@ TEST_F(homa_outgoing, homa_new_data_packet__gso_information) EXPECT_EQ(SKB_GSO_TCPV6, skb_shinfo(skb)->gso_type); kfree_skb(skb); } -TEST_F(homa_outgoing, homa_new_data_packet__gso_force_software) +TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__gso_force_software) { struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); struct sk_buff *skb; @@ -328,14 +328,14 @@ TEST_F(homa_outgoing, homa_new_data_packet__gso_force_software) self->homa.gso_force_software = 1; unit_log_clear(); - skb = homa_new_data_packet(crpc, iter, 10000, 5000, 1500); + skb = homa_tx_data_pkt_alloc(crpc, iter, 10000, 5000, 1500); EXPECT_EQ(13, skb_shinfo(skb)->gso_type); kfree_skb(skb); } TEST_F(homa_outgoing, homa_message_out_fill__basics) { - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); mock_set_ipv6(&self->hsk); @@ -370,7 +370,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__basics) } TEST_F(homa_outgoing, homa_message_out_fill__message_too_long) { - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); ASSERT_FALSE(crpc == NULL); @@ -383,7 +383,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__message_too_long) } TEST_F(homa_outgoing, homa_message_out_fill__zero_length_message) { - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); ASSERT_FALSE(crpc == NULL); @@ -396,11 +396,11 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_geometry_hijacking) { struct homa_rpc *crpc1, *crpc2; - crpc1 = 
homa_rpc_new_client(&self->hsk, &self->server_addr); + crpc1 = homa_rpc_alloc_client(&self->hsk, &self->server_addr); ASSERT_FALSE(crpc1 == NULL); homa_rpc_unlock(crpc1); - crpc2 = homa_rpc_new_client(&self->hsk, &self->server_addr); + crpc2 = homa_rpc_alloc_client(&self->hsk, &self->server_addr); ASSERT_FALSE(crpc2 == NULL); homa_rpc_unlock(crpc2); @@ -430,7 +430,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_geometry_no_hijacking) { struct homa_rpc *crpc1, *crpc2; - crpc1 = homa_rpc_new_client(&self->hsk, &self->server_addr); + crpc1 = homa_rpc_alloc_client(&self->hsk, &self->server_addr); ASSERT_FALSE(crpc1 == NULL); mock_set_ipv6(&self->hsk); @@ -444,7 +444,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_geometry_no_hijacking) EXPECT_SUBSTR("max_seg_data 1400, max_gso_data 2800", unit_log_get()); /* Second try: just barely enough space for 3 packets in GSO. */ - crpc2 = homa_rpc_new_client(&self->hsk, &self->server_addr); + crpc2 = homa_rpc_alloc_client(&self->hsk, &self->server_addr); ASSERT_FALSE(crpc2 == NULL); mock_net_device.gso_max_size += 1; unit_log_clear(); @@ -455,7 +455,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_geometry_no_hijacking) } TEST_F(homa_outgoing, homa_message_out_fill__gso_force_software) { - struct homa_rpc *crpc1 = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc1 = homa_rpc_alloc_client(&self->hsk, &self->server_addr); struct homa_rpc *crpc2; @@ -471,7 +471,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_force_software) EXPECT_SUBSTR("xmit DATA", unit_log_get()); EXPECT_NOSUBSTR("TSO disabled", unit_log_get()); - crpc2 = homa_rpc_new_client(&self->hsk, &self->server_addr); + crpc2 = homa_rpc_alloc_client(&self->hsk, &self->server_addr); ASSERT_FALSE(crpc2 == NULL); self->homa.gso_force_software = 1; ASSERT_EQ(0, -homa_message_out_fill(crpc2, @@ -483,7 +483,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_force_software) } TEST_F(homa_outgoing, homa_message_out_fill__gso_limit_less_than_mtu) { - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); ASSERT_FALSE(crpc == NULL); @@ -497,7 +497,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_limit_less_than_mtu) } TEST_F(homa_outgoing, homa_message_out_fill__include_acks) { - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); struct homa_data_hdr h; @@ -515,7 +515,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__include_acks) } TEST_F(homa_outgoing, homa_message_out_fill__multiple_segs_per_skbuff) { - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); ASSERT_FALSE(crpc == NULL); @@ -543,7 +543,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__multiple_segs_per_skbuff) } TEST_F(homa_outgoing, homa_message_out_fill__error_in_homa_new_data_packet) { - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); ASSERT_FALSE(crpc == NULL); @@ -561,7 +561,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__error_in_homa_new_data_packet) } TEST_F(homa_outgoing, homa_message_out_fill__rpc_freed_during_copy) { - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); ASSERT_FALSE(crpc == NULL); @@ -577,7 +577,7 @@ TEST_F(homa_outgoing, 
homa_message_out_fill__rpc_freed_during_copy) } TEST_F(homa_outgoing, homa_message_out_fill__add_to_throttled) { - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); ASSERT_FALSE(crpc == NULL); @@ -596,7 +596,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__add_to_throttled) } TEST_F(homa_outgoing, homa_message_out_fill__too_short_for_pipelining) { - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); ASSERT_FALSE(crpc == NULL); @@ -610,7 +610,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__too_short_for_pipelining) } TEST_F(homa_outgoing, homa_message_out_fill__packet_memory_accounting) { - struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk, + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); mock_set_ipv6(&self->hsk); @@ -759,7 +759,7 @@ TEST_F(homa_outgoing, homa_xmit_unknown) struct sk_buff *skb; mock_xmit_log_verbose = 1; - skb = mock_skb_new(self->client_ip, &h.common, 0, 0); + skb = mock_skb_alloc(self->client_ip, &h.common, 0, 0); homa_xmit_unknown(skb, &self->hsk); EXPECT_STREQ("xmit RPC_UNKNOWN from 0.0.0.0:99, dport 40000, id 99991", unit_log_get()); diff --git a/test/unit_homa_pacer.c b/test/unit_homa_pacer.c index 397b0ee1..b00a0a8d 100644 --- a/test/unit_homa_pacer.c +++ b/test/unit_homa_pacer.c @@ -81,17 +81,17 @@ TEST_F(homa_pacer, homa_pacer_new__success) { struct homa_pacer *pacer; - pacer = homa_pacer_new(&self->homa, &mock_net); + pacer = homa_pacer_alloc(&self->homa, &mock_net); EXPECT_FALSE(IS_ERR(pacer)); EXPECT_EQ(&self->homa, pacer->homa); - homa_pacer_destroy(pacer); + homa_pacer_free(pacer); } TEST_F(homa_pacer, homa_pacer_new__cant_allocate_memory) { struct homa_pacer *pacer; mock_kmalloc_errors = 1; - pacer = homa_pacer_new(&self->homa, &mock_net); + pacer = homa_pacer_alloc(&self->homa, &mock_net); EXPECT_TRUE(IS_ERR(pacer)); EXPECT_EQ(ENOMEM, -PTR_ERR(pacer)); } @@ -100,7 +100,7 @@ TEST_F(homa_pacer, homa_pacer_new__cant_create_pacer_thread) struct homa_pacer *pacer; mock_kthread_create_errors = 1; - pacer = homa_pacer_new(&self->homa, &mock_net); + pacer = homa_pacer_alloc(&self->homa, &mock_net); EXPECT_TRUE(IS_ERR(pacer)); EXPECT_EQ(EACCES, -PTR_ERR(pacer)); } @@ -110,20 +110,20 @@ TEST_F(homa_pacer, homa_pacer_new__cant_register_sysctls) struct homa_pacer *pacer; mock_register_sysctl_errors = 1; - pacer = homa_pacer_new(&self->homa, &mock_net); + pacer = homa_pacer_alloc(&self->homa, &mock_net); EXPECT_TRUE(IS_ERR(pacer)); EXPECT_EQ(ENOMEM, -PTR_ERR(pacer)); } #endif /* See strip.py */ -TEST_F(homa_pacer, homa_pacer_destroy__basics) +TEST_F(homa_pacer, homa_pacer_free__basics) { struct homa_pacer *pacer; - pacer = homa_pacer_new(&self->homa, &mock_net); + pacer = homa_pacer_alloc(&self->homa, &mock_net); EXPECT_FALSE(IS_ERR(pacer)); unit_log_clear(); - homa_pacer_destroy(pacer); + homa_pacer_free(pacer); #ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("unregister_net_sysctl_table; kthread_stop", unit_log_get()); @@ -132,15 +132,15 @@ TEST_F(homa_pacer, homa_pacer_destroy__basics) unit_log_get()); #endif /* See strip.py */ } -TEST_F(homa_pacer, homa_pacer_destroy__no_thread) +TEST_F(homa_pacer, homa_pacer_free__no_thread) { struct homa_pacer *pacer; - pacer = homa_pacer_new(&self->homa, &mock_net); + pacer = homa_pacer_alloc(&self->homa, &mock_net); EXPECT_FALSE(IS_ERR(pacer)); pacer->kthread = NULL; unit_log_clear(); - homa_pacer_destroy(pacer); + 
homa_pacer_free(pacer); #ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("unregister_net_sysctl_table", unit_log_get()); #endif /* See strip.py */ diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index c49a7157..773a5c37 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -322,8 +322,8 @@ TEST_F(homa_plumbing, homa_setsockopt__recvbuf_success) & ~(PAGE_SIZE - 1)); args.length = 64*HOMA_BPAGE_SIZE; self->optval.user = &args; - homa_pool_destroy(self->hsk.buffer_pool); - self->hsk.buffer_pool = homa_pool_new(&self->hsk); + homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); EXPECT_EQ(0, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, SO_HOMA_RCVBUF, self->optval, sizeof(struct homa_rcvbuf_args))); @@ -365,8 +365,8 @@ TEST_F(homa_plumbing, homa_getsockopt__recvbuf_success) struct homa_rcvbuf_args val; int size = sizeof(val) + 10; - homa_pool_destroy(self->hsk.buffer_pool); - self->hsk.buffer_pool = homa_pool_new(&self->hsk); + homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); EXPECT_EQ(0, -homa_pool_set_region(self->hsk.buffer_pool, (void *)0x40000, 10*HOMA_BPAGE_SIZE + 1000)); @@ -552,7 +552,7 @@ TEST_F(homa_plumbing, homa_sendmsg__request_sent_successfully) EXPECT_SUBSTR("xmit DATA 200@0", unit_log_get()); EXPECT_EQ(1234L, self->sendmsg_args.id); ASSERT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); - crpc = homa_find_client_rpc(&self->hsk, self->sendmsg_args.id); + crpc = homa_rpc_find_client(&self->hsk, self->sendmsg_args.id); ASSERT_NE(NULL, crpc); EXPECT_EQ(88888, crpc->completion_cookie); homa_rpc_unlock(crpc); @@ -908,7 +908,7 @@ TEST_F(homa_plumbing, homa_softirq__basics) { struct sk_buff *skb; - skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); homa_softirq(skb); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); } @@ -916,7 +916,7 @@ TEST_F(homa_plumbing, homa_softirq__cant_pull_header) { struct sk_buff *skb; - skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); skb->data_len = skb->len - 20; homa_softirq(skb); EXPECT_STREQ("pskb discard", unit_log_get()); @@ -925,7 +925,7 @@ TEST_F(homa_plumbing, homa_softirq__remove_extra_headers) { struct sk_buff *skb; - skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); __skb_push(skb, 10); homa_softirq(skb); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); @@ -936,7 +936,7 @@ TEST_F(homa_plumbing, homa_softirq__packet_too_short) struct homa_ack_hdr h; h.common.type = ACK; - skb = mock_skb_new(self->client_ip, &h.common, 0, 0); + skb = mock_skb_alloc(self->client_ip, &h.common, 0, 0); skb->len -= 1; homa_softirq(skb); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); @@ -949,7 +949,7 @@ TEST_F(homa_plumbing, homa_softirq__bogus_packet_type) struct sk_buff *skb; self->data.common.type = MAX_OP + 1; - skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 1400); + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); homa_softirq(skb); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); #ifndef __STRIP__ /* See strip.py */ @@ -962,18 +962,18 @@ TEST_F(homa_plumbing, homa_softirq__process_short_messages_first) self->data.common.sender_id = cpu_to_be64(2000); self->data.message_length = htonl(2000); - skb = 
mock_skb_new(self->client_ip, &self->data.common, 1400, 0); + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); self->data.common.sender_id = cpu_to_be64(300); self->data.message_length = htonl(300); - skb2 = mock_skb_new(self->client_ip, &self->data.common, 300, 0); + skb2 = mock_skb_alloc(self->client_ip, &self->data.common, 300, 0); self->data.common.sender_id = cpu_to_be64(200); self->data.message_length = htonl(1600); self->data.seg.offset = htonl(1400); - skb3 = mock_skb_new(self->client_ip, &self->data.common, 200, 0); + skb3 = mock_skb_alloc(self->client_ip, &self->data.common, 200, 0); self->data.common.sender_id = cpu_to_be64(5000); self->data.message_length = htonl(5000); self->data.seg.offset = 0; - skb4 = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); + skb4 = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); skb_shinfo(skb)->frag_list = skb2; skb2->next = skb3; skb3->next = skb4; @@ -995,8 +995,8 @@ TEST_F(homa_plumbing, homa_softirq__process_control_first) self->data.common.sender_id = cpu_to_be64(2000); self->data.message_length = htonl(2000); - skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); - skb2 = mock_skb_new(self->client_ip, &unknown, 0, 0); + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); + skb2 = mock_skb_alloc(self->client_ip, &unknown, 0, 0); skb_shinfo(skb)->frag_list = skb2; skb2->next = NULL; @@ -1010,13 +1010,13 @@ TEST_F(homa_plumbing, homa_softirq__nothing_to_reorder) self->data.common.sender_id = cpu_to_be64(2000); self->data.message_length = htonl(2000); - skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); self->data.common.sender_id = cpu_to_be64(3000); self->data.message_length = htonl(3000); - skb2 = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); + skb2 = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); self->data.common.sender_id = cpu_to_be64(5000); self->data.message_length = htonl(5000); - skb3 = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); + skb3 = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); skb_shinfo(skb)->frag_list = skb2; skb2->next = skb3; skb3->next = NULL; @@ -1031,44 +1031,44 @@ TEST_F(homa_plumbing, homa_softirq__per_rpc_batching) self->data.common.sender_id = cpu_to_be64(2000); self->data.message_length = htonl(10000); - skb = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); + skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); tail = skb; self->data.common.sender_id = cpu_to_be64(2002); - tail->next = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); + tail->next = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); tail = tail->next; self->data.common.sender_id = cpu_to_be64(2004); - tail->next = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); + tail->next = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); tail = tail->next; self->data.common.sender_id = cpu_to_be64(2002); self->data.seg.offset = htonl(1400); - tail->next = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); + tail->next = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); tail = tail->next; self->data.common.sender_id = cpu_to_be64(2004); - tail->next = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); + tail->next = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); tail = tail->next; self->data.common.sender_id = cpu_to_be64(2002); 
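	/* The frag_list assembled below interleaves packets of ids 2002 and
	 * 2004 out of order; homa_softirq is expected to regroup the chain
	 * so that all packets of a given RPC are processed consecutively.
	 */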
self->data.seg.offset = htonl(4200); - tail->next = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); + tail->next = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); tail = tail->next; self->data.common.sender_id = cpu_to_be64(2002); self->data.seg.offset = htonl(2800); - tail->next = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); + tail->next = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); tail = tail->next; self->data.common.sender_id = cpu_to_be64(2004); self->data.seg.offset = htonl(5600); - tail->next = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); + tail->next = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); tail = tail->next; self->data.common.sender_id = cpu_to_be64(2002); self->data.seg.offset = htonl(7000); - tail->next = mock_skb_new(self->client_ip, &self->data.common, 1400, 0); + tail->next = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); tail = tail->next; skb_shinfo(skb)->frag_list = skb->next; @@ -1096,10 +1096,10 @@ TEST_F(homa_plumbing, homa_err_handler_v4__port_unreachable) self->client_id, 100, 100); ASSERT_NE(NULL, crpc); - failed = mock_skb_new(self->server_ip, &self->data.common, 100, 0); + failed = mock_skb_alloc(self->server_ip, &self->data.common, 100, 0); ip_hdr(failed)->daddr = ipv6_to_ipv4(self->server_ip[0]); - icmp = mock_skb_new(self->server_ip, NULL, 1000, 0); + icmp = mock_skb_alloc(self->server_ip, NULL, 1000, 0); icmph = skb_put(icmp, sizeof *icmph); icmph->type = ICMP_DEST_UNREACH; icmph->code = ICMP_PORT_UNREACH; @@ -1124,10 +1124,10 @@ TEST_F(homa_plumbing, homa_err_handler_v4__host_unreachable) self->client_id, 100, 100); ASSERT_NE(NULL, crpc); - failed = mock_skb_new(self->server_ip, &self->data.common, 100, 0); + failed = mock_skb_alloc(self->server_ip, &self->data.common, 100, 0); ip_hdr(failed)->daddr = ipv6_to_ipv4(self->server_ip[0]); - icmp = mock_skb_new(self->server_ip, NULL, 1000, 0); + icmp = mock_skb_alloc(self->server_ip, NULL, 1000, 0); icmph = skb_put(icmp, sizeof *icmph); icmph->type = ICMP_DEST_UNREACH; icmph->code = ICMP_HOST_UNKNOWN; @@ -1151,10 +1151,10 @@ TEST_F(homa_plumbing, homa_err_handler_v6__port_unreachable) self->client_id, 100, 100); ASSERT_NE(NULL, crpc); - failed = mock_skb_new(self->server_ip, &self->data.common, 100, 0); + failed = mock_skb_alloc(self->server_ip, &self->data.common, 100, 0); ipv6_hdr(failed)->daddr = self->server_ip[0]; - icmp = mock_skb_new(self->server_ip, NULL, 1000, 0); + icmp = mock_skb_alloc(self->server_ip, NULL, 1000, 0); memcpy(skb_put(icmp, failed->len), failed->head, failed->len); EXPECT_EQ(0, homa_err_handler_v6(icmp, NULL, ICMPV6_DEST_UNREACH, @@ -1174,10 +1174,10 @@ TEST_F(homa_plumbing, homa_err_handler_v6__protocol_not_supported) self->client_id, 100, 100); ASSERT_NE(NULL, crpc); - failed = mock_skb_new(self->server_ip, &self->data.common, 100, 0); + failed = mock_skb_alloc(self->server_ip, &self->data.common, 100, 0); ipv6_hdr(failed)->daddr = self->server_ip[0]; - icmp = mock_skb_new(self->server_ip, NULL, 1000, 0); + icmp = mock_skb_alloc(self->server_ip, NULL, 1000, 0); memcpy(skb_put(icmp, failed->len), failed->head, failed->len); EXPECT_EQ(0, homa_err_handler_v6(icmp, NULL, ICMPV6_PARAMPROB, diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index 72415fa0..779637f3 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -83,72 +83,72 @@ TEST_F(homa_pool, set_bpages_needed) EXPECT_EQ(2, pool->bpages_needed); } -TEST_F(homa_pool, homa_pool_init) +TEST_F(homa_pool, 
homa_pool_alloc) { struct homa_pool *pool; /* Success */ - pool = homa_pool_new(&self->hsk); + pool = homa_pool_alloc(&self->hsk); EXPECT_FALSE(IS_ERR(pool)); EXPECT_EQ(pool->hsk, &self->hsk); - homa_pool_destroy(pool); + homa_pool_free(pool); /* Can't allocate memory. */ mock_kmalloc_errors = 1; - pool = homa_pool_new(&self->hsk); + pool = homa_pool_alloc(&self->hsk); EXPECT_TRUE(IS_ERR(pool)); EXPECT_EQ(ENOMEM, -PTR_ERR(pool)); } TEST_F(homa_pool, homa_pool_set_region__basics) { - struct homa_pool *pool = homa_pool_new(&self->hsk); + struct homa_pool *pool = homa_pool_alloc(&self->hsk); EXPECT_EQ(0, -homa_pool_set_region(pool, (void *) 0x100000, 78*HOMA_BPAGE_SIZE)); EXPECT_EQ(78, pool->num_bpages); EXPECT_EQ(-1, pool->descriptors[69].owner); - homa_pool_destroy(pool); + homa_pool_free(pool); } TEST_F(homa_pool, homa_pool_set_region__region_not_page_aligned) { - struct homa_pool *pool = homa_pool_new(&self->hsk); + struct homa_pool *pool = homa_pool_alloc(&self->hsk); EXPECT_EQ(EINVAL, -homa_pool_set_region(pool, ((char *) 0x1000000) + 10, 100*HOMA_BPAGE_SIZE)); - homa_pool_destroy(pool); + homa_pool_free(pool); } TEST_F(homa_pool, homa_pool_set_region__region_too_small) { - struct homa_pool *pool = homa_pool_new(&self->hsk); + struct homa_pool *pool = homa_pool_alloc(&self->hsk); EXPECT_EQ(EINVAL, -homa_pool_set_region(pool, (void *) 0x1000000, HOMA_BPAGE_SIZE)); - homa_pool_destroy(pool); + homa_pool_free(pool); } TEST_F(homa_pool, homa_pool_set_region__cant_allocate_descriptors) { - struct homa_pool *pool = homa_pool_new(&self->hsk); + struct homa_pool *pool = homa_pool_alloc(&self->hsk); mock_kmalloc_errors = 1; EXPECT_EQ(ENOMEM, -homa_pool_set_region(pool, (void *) 0x100000, 100*HOMA_BPAGE_SIZE)); - homa_pool_destroy(pool); + homa_pool_free(pool); } TEST_F(homa_pool, homa_pool_set_region__cant_allocate_core_info) { - struct homa_pool *pool = homa_pool_new(&self->hsk); + struct homa_pool *pool = homa_pool_alloc(&self->hsk); mock_kmalloc_errors = 2; EXPECT_EQ(ENOMEM, -homa_pool_set_region(pool, (void *) 0x100000, 100*HOMA_BPAGE_SIZE)); - homa_pool_destroy(pool); + homa_pool_free(pool); } TEST_F(homa_pool, homa_pool_get_rcvbuf) { - struct homa_pool *pool = homa_pool_new(&self->hsk); + struct homa_pool *pool = homa_pool_alloc(&self->hsk); struct homa_rcvbuf_args args; EXPECT_EQ(0, -homa_pool_set_region(pool, (void *)0x40000, @@ -156,7 +156,7 @@ TEST_F(homa_pool, homa_pool_get_rcvbuf) homa_pool_get_rcvbuf(pool, &args); EXPECT_EQ(0x40000, args.start); EXPECT_EQ(10*HOMA_BPAGE_SIZE, args.length); - homa_pool_destroy(pool); + homa_pool_free(pool); } TEST_F(homa_pool, homa_pool_get_pages__basics) @@ -273,7 +273,7 @@ TEST_F(homa_pool, homa_pool_get_pages__set_owner) EXPECT_EQ(2, atomic_read(&pool->descriptors[1].refs)); } -TEST_F(homa_pool, homa_pool_allocate__basics) +TEST_F(homa_pool, homa_pool_alloc_msg__basics) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, @@ -289,7 +289,7 @@ TEST_F(homa_pool, homa_pool_allocate__basics) EXPECT_EQ(150000 - 2*HOMA_BPAGE_SIZE, pool->cores[smp_processor_id()].allocated); } -TEST_F(homa_pool, homa_pool_allocate__no_buffer_pool) +TEST_F(homa_pool, homa_pool_alloc_msg__no_buffer_pool) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, @@ -297,12 +297,12 @@ TEST_F(homa_pool, homa_pool_allocate__no_buffer_pool) ASSERT_NE(NULL, crpc); - homa_pool_destroy(self->hsk.buffer_pool); - self->hsk.buffer_pool = homa_pool_new(&self->hsk); + 
homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); - EXPECT_EQ(ENOMEM, -homa_pool_allocate(crpc)); + EXPECT_EQ(ENOMEM, -homa_pool_alloc_msg(crpc)); } -TEST_F(homa_pool, homa_pool_allocate__cant_allocate_full_bpages) +TEST_F(homa_pool, homa_pool_alloc_msg__cant_allocate_full_bpages) { struct homa_pool *pool = self->hsk.buffer_pool; struct homa_rpc *crpc; @@ -316,7 +316,7 @@ TEST_F(homa_pool, homa_pool_allocate__cant_allocate_full_bpages) EXPECT_FALSE(list_empty(&crpc->buf_links)); EXPECT_EQ(1, atomic_read(&pool->free_bpages)); } -TEST_F(homa_pool, homa_pool_allocate__no_partial_page) +TEST_F(homa_pool, homa_pool_alloc_msg__no_partial_page) { struct homa_pool *pool = self->hsk.buffer_pool; struct homa_rpc *crpc; @@ -332,7 +332,7 @@ TEST_F(homa_pool, homa_pool_allocate__no_partial_page) EXPECT_EQ(HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[1]); EXPECT_EQ(0, atomic_read(&pool->free_bpages)); } -TEST_F(homa_pool, homa_pool_allocate__owned_page_locked_and_page_stolen) +TEST_F(homa_pool, homa_pool_alloc_msg__owned_page_locked_and_page_stolen) { struct homa_pool *pool = self->hsk.buffer_pool; struct homa_rpc *crpc; @@ -351,7 +351,7 @@ TEST_F(homa_pool, homa_pool_allocate__owned_page_locked_and_page_stolen) crpc->msgin.num_bpages = 0; mock_trylock_errors = 1; unit_hook_register(change_owner_hook); - EXPECT_EQ(0, homa_pool_allocate(crpc)); + EXPECT_EQ(0, homa_pool_alloc_msg(crpc)); EXPECT_EQ(1, crpc->msgin.num_bpages); EXPECT_EQ(3*HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[0]); EXPECT_EQ(3, pool->cores[smp_processor_id()].page_hint); @@ -360,7 +360,7 @@ TEST_F(homa_pool, homa_pool_allocate__owned_page_locked_and_page_stolen) EXPECT_EQ(1, pool->descriptors[3].owner); EXPECT_EQ(38, atomic_read(&pool->free_bpages)); } -TEST_F(homa_pool, homa_pool_allocate__page_wrap_around) +TEST_F(homa_pool, homa_pool_alloc_msg__page_wrap_around) { struct homa_pool *pool = self->hsk.buffer_pool; struct homa_rpc *crpc; @@ -382,7 +382,7 @@ TEST_F(homa_pool, homa_pool_allocate__page_wrap_around) EXPECT_EQ(1, homa_metrics_per_cpu()->bpage_reuses); #endif /* See strip.py */ } -TEST_F(homa_pool, homa_pool_allocate__owned_page_overflow) +TEST_F(homa_pool, homa_pool_alloc_msg__owned_page_overflow) { struct homa_pool *pool = self->hsk.buffer_pool; struct homa_rpc *crpc; @@ -396,7 +396,7 @@ TEST_F(homa_pool, homa_pool_allocate__owned_page_overflow) EXPECT_EQ(2, pool->cores[smp_processor_id()].page_hint); crpc->msgin.num_bpages = 0; pool->cores[smp_processor_id()].allocated = HOMA_BPAGE_SIZE-1900; - EXPECT_EQ(0, homa_pool_allocate(crpc)); + EXPECT_EQ(0, homa_pool_alloc_msg(crpc)); EXPECT_EQ(1, crpc->msgin.num_bpages); EXPECT_EQ(3*HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[0]); EXPECT_EQ(3, pool->cores[smp_processor_id()].page_hint); @@ -406,7 +406,7 @@ TEST_F(homa_pool, homa_pool_allocate__owned_page_overflow) EXPECT_EQ(1, pool->descriptors[3].owner); EXPECT_EQ(48, atomic_read(&pool->free_bpages)); } -TEST_F(homa_pool, homa_pool_allocate__reuse_owned_page) +TEST_F(homa_pool, homa_pool_alloc_msg__reuse_owned_page) { struct homa_pool *pool = self->hsk.buffer_pool; struct homa_rpc *crpc1, *crpc2; @@ -427,7 +427,7 @@ TEST_F(homa_pool, homa_pool_allocate__reuse_owned_page) EXPECT_EQ(2, pool->cores[smp_processor_id()].page_hint); EXPECT_EQ(5000, pool->cores[smp_processor_id()].allocated); } -TEST_F(homa_pool, homa_pool_allocate__cant_allocate_partial_bpage) +TEST_F(homa_pool, homa_pool_alloc_msg__cant_allocate_partial_bpage) { struct homa_pool *pool = self->hsk.buffer_pool; struct homa_rpc 
*crpc;
@@ -444,7 +444,7 @@ TEST_F(homa_pool, homa_pool_allocate__cant_allocate_partial_bpage)
 	EXPECT_EQ(0, atomic_read(&pool->descriptors[4].refs));
 	EXPECT_EQ(5, atomic_read(&pool->free_bpages));
 }
-TEST_F(homa_pool, homa_pool_allocate__out_of_space)
+TEST_F(homa_pool, homa_pool_alloc_msg__out_of_space)
 {
 	struct homa_pool *pool = self->hsk.buffer_pool;
 	struct homa_rpc *rpc;
diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c
index ffd4115b..e6e03e0c 100644
--- a/test/unit_homa_rpc.c
+++ b/test/unit_homa_rpc.c
@@ -112,74 +112,74 @@ static const char *dead_rpcs(struct homa_sock *hsk)
 	return unit_log_get();
 }
 
-TEST_F(homa_rpc, homa_rpc_new_client__normal)
+TEST_F(homa_rpc, homa_rpc_alloc_client__normal)
 {
-	struct homa_rpc *crpc = homa_rpc_new_client(&self->hsk,
+	struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk,
 			&self->server_addr);
 
 	ASSERT_FALSE(IS_ERR(crpc));
 	homa_rpc_end(crpc);
 	homa_rpc_unlock(crpc);
 }
-TEST_F(homa_rpc, homa_rpc_new_client__malloc_error)
+TEST_F(homa_rpc, homa_rpc_alloc_client__malloc_error)
 {
 	struct homa_rpc *crpc;
 
 	mock_kmalloc_errors = 1;
-	crpc = homa_rpc_new_client(&self->hsk, &self->server_addr);
+	crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr);
 	EXPECT_TRUE(IS_ERR(crpc));
 	EXPECT_EQ(ENOMEM, -PTR_ERR(crpc));
 }
-TEST_F(homa_rpc, homa_rpc_new_client__route_error)
+TEST_F(homa_rpc, homa_rpc_alloc_client__route_error)
 {
 	struct homa_rpc *crpc;
 
 	mock_route_errors = 1;
-	crpc = homa_rpc_new_client(&self->hsk, &self->server_addr);
+	crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr);
 	EXPECT_TRUE(IS_ERR(crpc));
 	EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(crpc));
 }
-TEST_F(homa_rpc, homa_rpc_new_client__socket_shutdown)
+TEST_F(homa_rpc, homa_rpc_alloc_client__socket_shutdown)
 {
 	struct homa_rpc *crpc;
 
 	self->hsk.shutdown = 1;
-	crpc = homa_rpc_new_client(&self->hsk, &self->server_addr);
+	crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr);
 	EXPECT_TRUE(IS_ERR(crpc));
 	EXPECT_EQ(ESHUTDOWN, -PTR_ERR(crpc));
 	self->hsk.shutdown = 0;
 }
-TEST_F(homa_rpc, homa_rpc_new_server__normal)
+TEST_F(homa_rpc, homa_rpc_alloc_server__normal)
 {
 	struct homa_rpc *srpc;
 	int created;
 
-	srpc = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data,
+	srpc = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data,
 			&created);
 	ASSERT_FALSE(IS_ERR(srpc));
 	homa_rpc_unlock(srpc);
 	self->data.message_length = N(1600);
-	homa_data_pkt(mock_skb_new(self->client_ip, &self->data.common,
+	homa_data_pkt(mock_skb_alloc(self->client_ip, &self->data.common,
 			1400, 0), srpc);
 	EXPECT_EQ(RPC_INCOMING, srpc->state);
 	EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs));
 	EXPECT_EQ(1, created);
 	homa_rpc_end(srpc);
 }
-TEST_F(homa_rpc, homa_rpc_new_server__already_exists)
+TEST_F(homa_rpc, homa_rpc_alloc_server__already_exists)
 {
 	struct homa_rpc *srpc1, *srpc2, *srpc3;
 	int created;
 
-	srpc1 = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data,
+	srpc1 = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data,
 			&created);
 	ASSERT_FALSE(IS_ERR(srpc1));
 	homa_rpc_unlock(srpc1);
 	self->data.common.sender_id = cpu_to_be64(
 			be64_to_cpu(self->data.common.sender_id) +
 			2*HOMA_SERVER_RPC_BUCKETS);
-	srpc2 = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data,
+	srpc2 = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data,
 			&created);
 	ASSERT_FALSE(IS_ERR(srpc2));
 	EXPECT_EQ(1, created);
@@ -188,81 +188,81 @@ TEST_F(homa_rpc, homa_rpc_new_server__already_exists)
 	self->data.common.sender_id = cpu_to_be64(
 			be64_to_cpu(self->data.common.sender_id) -
2*HOMA_SERVER_RPC_BUCKETS); - srpc3 = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + srpc3 = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, &created); ASSERT_FALSE(IS_ERR(srpc3)); EXPECT_EQ(0, created); homa_rpc_unlock(srpc3); EXPECT_EQ(srpc3, srpc1); } -TEST_F(homa_rpc, homa_rpc_new_server__malloc_error) +TEST_F(homa_rpc, homa_rpc_alloc_server__malloc_error) { struct homa_rpc *srpc; int created; mock_kmalloc_errors = 1; - srpc = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + srpc = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, &created); EXPECT_TRUE(IS_ERR(srpc)); EXPECT_EQ(ENOMEM, -PTR_ERR(srpc)); } -TEST_F(homa_rpc, homa_rpc_new_server__addr_error) +TEST_F(homa_rpc, homa_rpc_alloc_server__addr_error) { struct homa_rpc *srpc; int created; mock_route_errors = 1; - srpc = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + srpc = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, &created); EXPECT_TRUE(IS_ERR(srpc)); EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(srpc)); } -TEST_F(homa_rpc, homa_rpc_new_server__socket_shutdown) +TEST_F(homa_rpc, homa_rpc_alloc_server__socket_shutdown) { struct homa_rpc *srpc; int created; self->hsk.shutdown = 1; - srpc = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + srpc = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, &created); EXPECT_TRUE(IS_ERR(srpc)); EXPECT_EQ(ESHUTDOWN, -PTR_ERR(srpc)); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); self->hsk.shutdown = 0; } -TEST_F(homa_rpc, homa_rpc_new_server__allocate_buffers) +TEST_F(homa_rpc, homa_rpc_alloc_server__allocate_buffers) { struct homa_rpc *srpc; int created; self->data.message_length = N(3*HOMA_BPAGE_SIZE); - srpc = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + srpc = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, &created); ASSERT_FALSE(IS_ERR(srpc)); homa_rpc_unlock(srpc); EXPECT_EQ(3, srpc->msgin.num_bpages); homa_rpc_end(srpc); } -TEST_F(homa_rpc, homa_rpc_new_server__no_buffer_pool) +TEST_F(homa_rpc, homa_rpc_alloc_server__no_buffer_pool) { struct homa_rpc *srpc; int created; self->data.message_length = N(1400); - homa_pool_destroy(self->hsk.buffer_pool); - self->hsk.buffer_pool = homa_pool_new(&self->hsk); - srpc = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); + srpc = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, &created); ASSERT_TRUE(IS_ERR(srpc)); EXPECT_EQ(ENOMEM, -PTR_ERR(srpc)); } -TEST_F(homa_rpc, homa_rpc_new_server__handoff_rpc) +TEST_F(homa_rpc, homa_rpc_alloc_server__handoff_rpc) { struct homa_rpc *srpc; int created; self->data.message_length = N(1400); - srpc = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + srpc = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, &created); ASSERT_FALSE(IS_ERR(srpc)); homa_rpc_unlock(srpc); @@ -271,28 +271,28 @@ TEST_F(homa_rpc, homa_rpc_new_server__handoff_rpc) EXPECT_EQ(1, unit_list_length(&self->hsk.ready_rpcs)); homa_rpc_end(srpc); } -TEST_F(homa_rpc, homa_rpc_new_server__dont_handoff_no_buffers) +TEST_F(homa_rpc, homa_rpc_alloc_server__dont_handoff_no_buffers) { struct homa_rpc *srpc; int created; self->data.message_length = N(1400); atomic_set(&self->hsk.buffer_pool->free_bpages, 0); - srpc = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + srpc = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, 
&created); ASSERT_FALSE(IS_ERR(srpc)); homa_rpc_unlock(srpc); EXPECT_EQ(0, unit_list_length(&self->hsk.ready_rpcs)); homa_rpc_end(srpc); } -TEST_F(homa_rpc, homa_rpc_new_server__dont_handoff_rpc) +TEST_F(homa_rpc, homa_rpc_alloc_server__dont_handoff_rpc) { struct homa_rpc *srpc; int created; self->data.message_length = N(2800); self->data.seg.offset = N(1400); - srpc = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + srpc = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, &created); ASSERT_FALSE(IS_ERR(srpc)); homa_rpc_unlock(srpc); @@ -309,11 +309,11 @@ TEST_F(homa_rpc, homa_bucket_lock_slow) int created; mock_ns_tick = 10; - crpc = homa_rpc_new_client(&self->hsk, &self->server_addr); + crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); ASSERT_FALSE(IS_ERR(crpc)); homa_rpc_end(crpc); homa_rpc_unlock(crpc); - srpc = homa_rpc_new_server(&self->hsk, self->client_ip, &self->data, + srpc = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, &created); ASSERT_FALSE(IS_ERR(srpc)); homa_rpc_unlock(srpc); @@ -422,7 +422,7 @@ TEST_F(homa_rpc, homa_rpc_end__basics) #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(0, self->homa.grant->num_grantable_rpcs); #endif /* See strip.py */ - EXPECT_EQ(NULL, homa_find_client_rpc(&self->hsk, crpc->id)); + EXPECT_EQ(NULL, homa_rpc_find_client(&self->hsk, crpc->id)); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); EXPECT_EQ(1, unit_list_length(&self->hsk.dead_rpcs)); } @@ -476,11 +476,11 @@ TEST_F(homa_rpc, homa_rpc_end__free_gaps) #endif /* See strip.py */ unit_log_clear(); self->data.seg.offset = htonl(1400); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400)); self->data.seg.offset = htonl(4200); - homa_add_packet(crpc, mock_skb_new(self->client_ip, + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 4200)); EXPECT_STREQ("start 0, end 1400; start 2800, end 4200", unit_print_gaps(crpc)); @@ -724,9 +724,9 @@ TEST_F(homa_rpc, homa_rpc_reap__free_gaps) 4000, 98, 1000, 150000); ASSERT_NE(NULL, crpc); - homa_gap_new(&crpc->msgin.gaps, 1000, 2000); + homa_gap_alloc(&crpc->msgin.gaps, 1000, 2000); mock_ns = 1000; - homa_gap_new(&crpc->msgin.gaps, 5000, 6000); + homa_gap_alloc(&crpc->msgin.gaps, 5000, 6000); EXPECT_STREQ("start 1000, end 2000; start 5000, end 6000, time 1000", unit_print_gaps(crpc)); @@ -752,7 +752,7 @@ TEST_F(homa_rpc, homa_rpc_reap__nothing_to_reap) EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); } -TEST_F(homa_rpc, homa_find_client_rpc) +TEST_F(homa_rpc, homa_rpc_find_client) { struct homa_rpc *crpc1, *crpc2, *crpc3, *crpc4; @@ -774,22 +774,22 @@ TEST_F(homa_rpc, homa_find_client_rpc) self->server_ip, self->server_port, self->client_id+6, 10000, 1000); - EXPECT_EQ(crpc1, homa_find_client_rpc(&self->hsk, crpc1->id)); + EXPECT_EQ(crpc1, homa_rpc_find_client(&self->hsk, crpc1->id)); homa_rpc_unlock(crpc1); - EXPECT_EQ(crpc2, homa_find_client_rpc(&self->hsk, crpc2->id)); + EXPECT_EQ(crpc2, homa_rpc_find_client(&self->hsk, crpc2->id)); homa_rpc_unlock(crpc2); - EXPECT_EQ(crpc3, homa_find_client_rpc(&self->hsk, crpc3->id)); + EXPECT_EQ(crpc3, homa_rpc_find_client(&self->hsk, crpc3->id)); homa_rpc_unlock(crpc3); - EXPECT_EQ(crpc4, homa_find_client_rpc(&self->hsk, crpc4->id)); + EXPECT_EQ(crpc4, homa_rpc_find_client(&self->hsk, crpc4->id)); homa_rpc_unlock(crpc4); - EXPECT_EQ(NULL, homa_find_client_rpc(&self->hsk, 15)); + EXPECT_EQ(NULL, homa_rpc_find_client(&self->hsk, 15)); 
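	/* Each successful homa_rpc_find_client call above returns with the
	 * RPC still locked, which is why every lookup is paired with a
	 * homa_rpc_unlock before the next one; the lookup of a nonexistent
	 * id (15) yields NULL and needs no unlock.
	 */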
homa_rpc_end(crpc1); homa_rpc_end(crpc2); homa_rpc_end(crpc3); homa_rpc_end(crpc4); } -TEST_F(homa_rpc, homa_find_server_rpc) +TEST_F(homa_rpc, homa_rpc_find_server) { struct homa_rpc *srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, @@ -810,17 +810,17 @@ TEST_F(homa_rpc, homa_find_server_rpc) ASSERT_NE(NULL, srpc2); ASSERT_NE(NULL, srpc3); ASSERT_NE(NULL, srpc4); - EXPECT_EQ(srpc1, homa_find_server_rpc(&self->hsk, self->client_ip, + EXPECT_EQ(srpc1, homa_rpc_find_server(&self->hsk, self->client_ip, srpc1->id)); homa_rpc_unlock(srpc1); - EXPECT_EQ(srpc2, homa_find_server_rpc(&self->hsk, self->client_ip, + EXPECT_EQ(srpc2, homa_rpc_find_server(&self->hsk, self->client_ip, srpc2->id)); homa_rpc_unlock(srpc2); - EXPECT_EQ(srpc3, homa_find_server_rpc(&self->hsk, self->client_ip, + EXPECT_EQ(srpc3, homa_rpc_find_server(&self->hsk, self->client_ip, srpc3->id)); homa_rpc_unlock(srpc3); - EXPECT_EQ(srpc4, homa_find_server_rpc(&self->hsk, self->client_ip, + EXPECT_EQ(srpc4, homa_rpc_find_server(&self->hsk, self->client_ip, srpc4->id)); homa_rpc_unlock(srpc4); - EXPECT_EQ(NULL, homa_find_server_rpc(&self->hsk, self->client_ip, 3)); + EXPECT_EQ(NULL, homa_rpc_find_server(&self->hsk, self->client_ip, 3)); } diff --git a/test/unit_homa_skb.c b/test/unit_homa_skb.c index d2a9f6a5..ccf5963c 100644 --- a/test/unit_homa_skb.c +++ b/test/unit_homa_skb.c @@ -19,7 +19,7 @@ static inline struct homa_skb_core *get_skb_core(int core) static struct sk_buff *test_skb(struct homa *homa) { struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); - struct sk_buff *skb = homa_skb_new_tx(100); + struct sk_buff *skb = homa_skb_alloc_tx(100); int32_t data[1000]; char *src; int i; @@ -293,7 +293,7 @@ TEST_F(homa_skb, homa_skb_page_alloc__free_previous_page) TEST_F(homa_skb, homa_skb_page_alloc__reuse_existing_page) { struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); - struct sk_buff *skb = homa_skb_new_tx(100); + struct sk_buff *skb = homa_skb_alloc_tx(100); struct page *page; int length = 100; @@ -452,7 +452,7 @@ TEST_F(homa_skb, homa_skb_append_from_iter__no_memory) TEST_F(homa_skb, homa_skb_append_from_skb__header_only) { struct sk_buff *src_skb = test_skb(&self->homa); - struct sk_buff *dst_skb = homa_skb_new_tx(100); + struct sk_buff *dst_skb = homa_skb_alloc_tx(100); int32_t data[500]; EXPECT_EQ(0, homa_skb_append_from_skb(&self->homa, dst_skb, src_skb, @@ -469,7 +469,7 @@ TEST_F(homa_skb, homa_skb_append_from_skb__error_copying_header) { struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); struct sk_buff *src_skb = test_skb(&self->homa); - struct sk_buff *dst_skb = homa_skb_new_tx(100); + struct sk_buff *dst_skb = homa_skb_alloc_tx(100); mock_alloc_page_errors = -1; skb_core->page_inuse = skb_core->page_size; @@ -482,7 +482,7 @@ TEST_F(homa_skb, homa_skb_append_from_skb__error_copying_header) TEST_F(homa_skb, homa_skb_append_from_skb__header_and_first_frag) { struct sk_buff *src_skb = test_skb(&self->homa); - struct sk_buff *dst_skb = homa_skb_new_tx(100); + struct sk_buff *dst_skb = homa_skb_alloc_tx(100); struct skb_shared_info *dst_shinfo; int32_t data[500]; @@ -502,7 +502,7 @@ TEST_F(homa_skb, homa_skb_append_from_skb__header_and_first_frag) TEST_F(homa_skb, homa_skb_append_from_skb__multiple_frags) { struct sk_buff *src_skb = test_skb(&self->homa); - struct sk_buff *dst_skb = homa_skb_new_tx(100); + struct sk_buff *dst_skb = homa_skb_alloc_tx(100); struct skb_shared_info *dst_shinfo; int32_t data[500]; @@ -522,7 
+522,7 @@ TEST_F(homa_skb, homa_skb_append_from_skb__multiple_frags) TEST_F(homa_skb, homa_skb_append_from_skb__dst_runs_out_of_frags) { struct sk_buff *src_skb = test_skb(&self->homa); - struct sk_buff *dst_skb = homa_skb_new_tx(100); + struct sk_buff *dst_skb = homa_skb_alloc_tx(100); struct skb_shared_info *dst_shinfo; int i, err; @@ -547,14 +547,14 @@ TEST_F(homa_skb, homa_skb_free_many_tx__basics) struct sk_buff *skbs[2]; int i, length; - skbs[0] = homa_skb_new_tx(100); + skbs[0] = homa_skb_alloc_tx(100); for (i = 0; i < 3; i++) { length = 2*HOMA_SKB_PAGE_SIZE; homa_skb_extend_frags(&self->homa, skbs[0], &length); } EXPECT_EQ(HOMA_SKB_PAGE_SIZE, length); - skbs[1] = homa_skb_new_tx(100); + skbs[1] = homa_skb_alloc_tx(100); length = 2 * HOMA_SKB_PAGE_SIZE; homa_skb_extend_frags(&self->homa, skbs[1], &length); @@ -567,7 +567,7 @@ TEST_F(homa_skb, homa_skb_free_many_tx__skb_ref_count_not_one) struct page *page; int length; - skb = homa_skb_new_tx(100); + skb = homa_skb_alloc_tx(100); length = HOMA_SKB_PAGE_SIZE; homa_skb_extend_frags(&self->homa, skb, &length); EXPECT_EQ(HOMA_SKB_PAGE_SIZE, length); @@ -586,7 +586,7 @@ TEST_F(homa_skb, homa_skb_free_many_tx__check_page_order) struct sk_buff *skb; int i, length; - skb = homa_skb_new_tx(100); + skb = homa_skb_alloc_tx(100); for (i = 0; i < 4; i++) { length = 2 * HOMA_SKB_PAGE_SIZE; homa_skb_extend_frags(&self->homa, skb, &length); diff --git a/test/unit_homa_timer.c b/test/unit_homa_timer.c index b612e2ea..b127a9ad 100644 --- a/test/unit_homa_timer.c +++ b/test/unit_homa_timer.c @@ -50,7 +50,7 @@ FIXTURE_TEARDOWN(homa_timer) unit_teardown(); } -TEST_F(homa_timer, homa_check_rpc__request_ack) +TEST_F(homa_timer, homa_timer_check_rpc__request_ack) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, @@ -61,33 +61,33 @@ TEST_F(homa_timer, homa_check_rpc__request_ack) /* First call: do nothing (response not fully transmitted). */ homa_rpc_lock(srpc); - homa_check_rpc(srpc); + homa_timer_check_rpc(srpc); EXPECT_EQ(0, srpc->done_timer_ticks); /* Second call: set done_timer_ticks. */ homa_xmit_data(srpc, false); unit_log_clear(); - homa_check_rpc(srpc); + homa_timer_check_rpc(srpc); EXPECT_EQ(100, srpc->done_timer_ticks); EXPECT_STREQ("", unit_log_get()); /* Third call: haven't hit request_ack_ticks yet. */ unit_log_clear(); self->homa.timer_ticks++; - homa_check_rpc(srpc); + homa_timer_check_rpc(srpc); EXPECT_EQ(100, srpc->done_timer_ticks); EXPECT_STREQ("", unit_log_get()); /* Fourth call: request ack. 
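	 * The sequence of calls above implies the underlying rule: NEED_ACK
	 * is solicited only once the response has been fully transmitted and
	 * homa->timer_ticks has advanced beyond done_timer_ticks by the
	 * fixture's request_ack_ticks threshold.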
*/ unit_log_clear(); self->homa.timer_ticks++; - homa_check_rpc(srpc); + homa_timer_check_rpc(srpc); homa_rpc_unlock(srpc); EXPECT_EQ(100, srpc->done_timer_ticks); EXPECT_STREQ("xmit NEED_ACK", unit_log_get()); } #ifndef __STRIP__ /* See strip.py */ -TEST_F(homa_timer, homa_check_rpc__all_granted_bytes_received) +TEST_F(homa_timer, homa_timer_check_rpc__all_granted_bytes_received) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, @@ -98,13 +98,13 @@ TEST_F(homa_timer, homa_check_rpc__all_granted_bytes_received) crpc->msgin.granted = 1400; crpc->silent_ticks = 10; homa_rpc_lock(crpc); - homa_check_rpc(crpc); + homa_timer_check_rpc(crpc); homa_rpc_unlock(crpc); EXPECT_EQ(0, crpc->silent_ticks); EXPECT_STREQ("", unit_log_get()); } #endif /* See strip.py */ -TEST_F(homa_timer, homa_check_rpc__no_buffer_space) +TEST_F(homa_timer, homa_timer_check_rpc__no_buffer_space) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, @@ -115,12 +115,12 @@ TEST_F(homa_timer, homa_check_rpc__no_buffer_space) crpc->msgin.num_bpages = 0; crpc->silent_ticks = 10; homa_rpc_lock(crpc); - homa_check_rpc(crpc); + homa_timer_check_rpc(crpc); homa_rpc_unlock(crpc); EXPECT_EQ(0, crpc->silent_ticks); EXPECT_STREQ("", unit_log_get()); } -TEST_F(homa_timer, homa_check_rpc__server_has_received_request) +TEST_F(homa_timer, homa_timer_check_rpc__server_has_received_request) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, self->server_ip, self->client_port, @@ -130,12 +130,12 @@ TEST_F(homa_timer, homa_check_rpc__server_has_received_request) unit_log_clear(); srpc->silent_ticks = 10; homa_rpc_lock(srpc); - homa_check_rpc(srpc); + homa_timer_check_rpc(srpc); homa_rpc_unlock(srpc); EXPECT_EQ(0, srpc->silent_ticks); EXPECT_STREQ("", unit_log_get()); } -TEST_F(homa_timer, homa_check_rpc__granted_bytes_not_sent) +TEST_F(homa_timer, homa_timer_check_rpc__granted_bytes_not_sent) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, @@ -145,12 +145,12 @@ TEST_F(homa_timer, homa_check_rpc__granted_bytes_not_sent) unit_log_clear(); crpc->silent_ticks = 10; homa_rpc_lock(crpc); - homa_check_rpc(crpc); + homa_timer_check_rpc(crpc); homa_rpc_unlock(crpc); EXPECT_EQ(0, crpc->silent_ticks); EXPECT_STREQ("", unit_log_get()); } -TEST_F(homa_timer, homa_check_rpc__timeout) +TEST_F(homa_timer, homa_timer_check_rpc__timeout) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, @@ -160,20 +160,20 @@ TEST_F(homa_timer, homa_check_rpc__timeout) unit_log_clear(); crpc->silent_ticks = self->homa.timeout_ticks-1; homa_rpc_lock(crpc); - homa_check_rpc(crpc); + homa_timer_check_rpc(crpc); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(0, homa_metrics_per_cpu()->rpc_timeouts); #endif /* See strip.py */ EXPECT_EQ(0, crpc->error); crpc->silent_ticks = self->homa.timeout_ticks; - homa_check_rpc(crpc); + homa_timer_check_rpc(crpc); homa_rpc_unlock(crpc); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->rpc_timeouts); #endif /* See strip.py */ EXPECT_EQ(ETIMEDOUT, -crpc->error); } -TEST_F(homa_timer, homa_check_rpc__issue_resend) +TEST_F(homa_timer, homa_timer_check_rpc__issue_resend) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, @@ -190,14 +190,14 @@ TEST_F(homa_timer, homa_check_rpc__issue_resend) /* First call: 
resend_ticks-1. */ crpc->silent_ticks = 2; unit_log_clear(); - homa_check_rpc(crpc); + homa_timer_check_rpc(crpc); EXPECT_STREQ("", unit_log_get()); /* Second call: resend_ticks. */ crpc->silent_ticks = 3; unit_log_clear(); homa_rpc_lock(crpc); - homa_check_rpc(crpc); + homa_timer_check_rpc(crpc); #ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("xmit RESEND 1400-4999@7", unit_log_get()); #else /* See strip.py */ @@ -207,13 +207,13 @@ TEST_F(homa_timer, homa_check_rpc__issue_resend) /* Third call: not yet time for next resend. */ crpc->silent_ticks = 4; unit_log_clear(); - homa_check_rpc(crpc); + homa_timer_check_rpc(crpc); EXPECT_STREQ("", unit_log_get()); /* Fourth call: time for second resend. */ crpc->silent_ticks = 5; unit_log_clear(); - homa_check_rpc(crpc); + homa_timer_check_rpc(crpc); homa_rpc_unlock(crpc); #ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("xmit RESEND 1400-4999@7", unit_log_get()); @@ -221,7 +221,7 @@ TEST_F(homa_timer, homa_check_rpc__issue_resend) EXPECT_STREQ("xmit RESEND 1400-9999", unit_log_get()); #endif /* See strip.py */ } -TEST_F(homa_timer, homa_check_rpc__request_first_bytes_of_message) +TEST_F(homa_timer, homa_timer_check_rpc__request_first_bytes_of_message) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, @@ -238,13 +238,13 @@ TEST_F(homa_timer, homa_check_rpc__request_first_bytes_of_message) crpc->silent_ticks = 2; unit_log_clear(); homa_rpc_lock(crpc); - homa_check_rpc(crpc); + homa_timer_check_rpc(crpc); EXPECT_STREQ("", unit_log_get()); /* Second call: resend_ticks. */ crpc->silent_ticks = 3; unit_log_clear(); - homa_check_rpc(crpc); + homa_timer_check_rpc(crpc); homa_rpc_unlock(crpc); #ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("xmit RESEND 0-99@7", unit_log_get()); @@ -252,7 +252,7 @@ TEST_F(homa_timer, homa_check_rpc__request_first_bytes_of_message) EXPECT_STREQ("xmit RESEND 0-99", unit_log_get()); #endif /* See strip.py */ } -TEST_F(homa_timer, homa_check_rpc__call_homa_gap_retry) +TEST_F(homa_timer, homa_timer_check_rpc__call_homa_gap_retry) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, @@ -265,13 +265,13 @@ TEST_F(homa_timer, homa_check_rpc__call_homa_gap_retry) crpc->msgin.recv_end = 10000; #endif /* See strip.py */ crpc->msgin.bytes_remaining = 15000; - homa_gap_new(&crpc->msgin.gaps, 7000, 8000); + homa_gap_alloc(&crpc->msgin.gaps, 7000, 8000); self->homa.resend_ticks = 3; self->homa.resend_interval = 2; unit_log_clear(); homa_rpc_lock(crpc); - homa_check_rpc(crpc); + homa_timer_check_rpc(crpc); homa_rpc_unlock(crpc); #ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("xmit RESEND 7000-7999@7", unit_log_get()); diff --git a/test/utils.c b/test/utils.c index 2472eda1..edce3af9 100644 --- a/test/utils.c +++ b/test/utils.c @@ -45,7 +45,7 @@ struct homa_rpc *unit_client_rpc(struct homa_sock *hsk, server_addr.in6.sin6_port = htons(server_port); if (id != 0) atomic64_set(&hsk->homa->next_outgoing_id, id); - crpc = homa_rpc_new_client(hsk, &server_addr); + crpc = homa_rpc_alloc_client(hsk, &server_addr); if (IS_ERR(crpc)) return NULL; if (homa_message_out_fill(crpc, unit_iov_iter(NULL, req_length), 0)) { @@ -75,7 +75,7 @@ struct homa_rpc *unit_client_rpc(struct homa_sock *hsk, this_size = (resp_length > UNIT_TEST_DATA_PER_PACKET) ? 
UNIT_TEST_DATA_PER_PACKET : resp_length; - homa_dispatch_pkts(mock_skb_new(server_ip, &h.common, this_size, 0), + homa_dispatch_pkts(mock_skb_alloc(server_ip, &h.common, this_size, 0), hsk->homa); if (state == UNIT_RCVD_ONE_PKT) return crpc; @@ -86,7 +86,7 @@ struct homa_rpc *unit_client_rpc(struct homa_sock *hsk, if (this_size > UNIT_TEST_DATA_PER_PACKET) this_size = UNIT_TEST_DATA_PER_PACKET; h.seg.offset = htonl(bytes_received); - homa_dispatch_pkts(mock_skb_new(server_ip, &h.common, + homa_dispatch_pkts(mock_skb_alloc(server_ip, &h.common, this_size, 0), hsk->homa); } if (state == UNIT_RCVD_MSG) @@ -378,14 +378,14 @@ struct homa_rpc *unit_server_rpc(struct homa_sock *hsk, #ifndef __STRIP__ /* See strip.py */ h.incoming = htonl(10000); #endif /* See strip.py */ - struct homa_rpc *srpc = homa_rpc_new_server(hsk, client_ip, &h, + struct homa_rpc *srpc = homa_rpc_alloc_server(hsk, client_ip, &h, &created); if (IS_ERR(srpc)) return NULL; EXPECT_EQ(srpc->completion_cookie, 0); homa_rpc_unlock(srpc); - homa_dispatch_pkts(mock_skb_new(client_ip, &h.common, + homa_dispatch_pkts(mock_skb_alloc(client_ip, &h.common, (req_length > UNIT_TEST_DATA_PER_PACKET) ? UNIT_TEST_DATA_PER_PACKET : req_length, 0), hsk->homa); @@ -399,7 +399,7 @@ struct homa_rpc *unit_server_rpc(struct homa_sock *hsk, if (this_size > UNIT_TEST_DATA_PER_PACKET) this_size = UNIT_TEST_DATA_PER_PACKET; h.seg.offset = htonl(bytes_received); - homa_dispatch_pkts(mock_skb_new(client_ip, &h.common, + homa_dispatch_pkts(mock_skb_alloc(client_ip, &h.common, this_size, 0), hsk->homa); } if (state == UNIT_RCVD_MSG) From 6ee7558932165b157e58203e3614991490a4f0d8 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 6 May 2025 11:27:56 -0700 Subject: [PATCH 284/625] Allow memory for homa_pool to be allocated without GFP_ATOMIC Requires that socket locking be moved into homa_pool_set_region, so memory can be allocated before the lock is acquired. --- homa_plumbing.c | 5 +-- homa_pool.c | 73 ++++++++++++++++++++++----------------- homa_pool.h | 2 +- homa_sock.c | 23 ++++++++---- homa_sock.h | 2 +- test/mock.c | 2 +- test/unit_homa_plumbing.c | 3 +- test/unit_homa_pool.c | 58 +++++++++++++++++-------------- test/unit_homa_sock.c | 16 ++++----- 9 files changed, 102 insertions(+), 82 deletions(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index 80de5fc8..715756bd 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -839,11 +839,8 @@ int homa_setsockopt(struct sock *sk, int level, int optname, sizeof(args))) return -EFAULT; - homa_sock_lock(hsk); - ret = homa_pool_set_region(hsk->buffer_pool, - u64_to_user_ptr(args.start), + ret = homa_pool_set_region(hsk, u64_to_user_ptr(args.start), args.length); - homa_sock_unlock(hsk); INC_METRIC(so_set_buf_calls, 1); INC_METRIC(so_set_buf_ns, sched_clock() - start); } else if (optname == SO_HOMA_SERVER) { diff --git a/homa_pool.c b/homa_pool.c index 6307ea45..f27bec70 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -52,7 +52,7 @@ struct homa_pool *homa_pool_alloc(struct homa_sock *hsk) { struct homa_pool *pool; - pool = kzalloc(sizeof(*pool), GFP_ATOMIC); + pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) return ERR_PTR(-ENOMEM); pool->hsk = hsk; @@ -61,63 +61,72 @@ struct homa_pool *homa_pool_alloc(struct homa_sock *hsk) /** * homa_pool_set_region() - Associate a region of memory with a pool. - * @pool: Pool the region will be associated with. Must not currently + * @hsk: Socket whose pool the region will be associated with. 
+ * Must not be locked, and the pool must not currently * have a region associated with it. * @region: First byte of the memory region for the pool, allocated * by the application; must be page-aligned. * @region_size: Total number of bytes available at @buf_region. * Return: Either zero (for success) or a negative errno for failure. */ -int homa_pool_set_region(struct homa_pool *pool, void __user *region, +int homa_pool_set_region(struct homa_sock *hsk, void __user *region, u64 region_size) { - int i, result; + struct homa_pool_core __percpu *cores; + struct homa_bpage *descriptors; + int i, result, num_bpages; + struct homa_pool *pool; - if (pool->region) + if (((uintptr_t)region) & ~PAGE_MASK) return -EINVAL; - if (((uintptr_t)region) & ~PAGE_MASK) + /* Allocate memory before locking the socket, so we can allocate + * without GFP_ATOMIC. + */ + num_bpages = region_size >> HOMA_BPAGE_SHIFT; + if (num_bpages < MIN_POOL_SIZE) { return -EINVAL; - pool->region = (char __user *)region; - pool->num_bpages = region_size >> HOMA_BPAGE_SHIFT; - pool->descriptors = NULL; - pool->cores = NULL; - if (pool->num_bpages < MIN_POOL_SIZE) { - result = -EINVAL; - goto error; } - pool->descriptors = kmalloc_array(pool->num_bpages, - sizeof(struct homa_bpage), - GFP_ATOMIC | __GFP_ZERO); - if (!pool->descriptors) { + descriptors = kmalloc_array(num_bpages, sizeof(struct homa_bpage), + __GFP_ZERO); + if (!descriptors) + return -ENOMEM; + cores = alloc_percpu_gfp(struct homa_pool_core, __GFP_ZERO); + if (!cores) { result = -ENOMEM; goto error; } - for (i = 0; i < pool->num_bpages; i++) { - struct homa_bpage *bp = &pool->descriptors[i]; - spin_lock_init(&bp->lock); - bp->owner = -1; + homa_sock_lock(hsk); + pool = hsk->buffer_pool; + if (pool->region) { + result = -EINVAL; + homa_sock_unlock(hsk); + goto error; } + + pool->region = (char __user *)region; + pool->num_bpages = num_bpages; + pool->descriptors = descriptors; atomic_set(&pool->free_bpages, pool->num_bpages); pool->bpages_needed = INT_MAX; - - /* Allocate and initialize core-specific data. 
*/ - pool->cores = alloc_percpu_gfp(struct homa_pool_core, - GFP_ATOMIC | __GFP_ZERO); - if (!pool->cores) { - result = -ENOMEM; - goto error; - } + pool->cores = cores; pool->num_cores = nr_cpu_ids; pool->check_waiting_invoked = 0; + for (i = 0; i < pool->num_bpages; i++) { + struct homa_bpage *bp = &pool->descriptors[i]; + + spin_lock_init(&bp->lock); + bp->owner = -1; + } + + homa_sock_unlock(hsk); return 0; error: - kfree(pool->descriptors); - free_percpu(pool->cores); - pool->region = NULL; + kfree(descriptors); + free_percpu(cores); return result; } diff --git a/homa_pool.h b/homa_pool.h index 9f354e3a..f21722e7 100644 --- a/homa_pool.h +++ b/homa_pool.h @@ -150,7 +150,7 @@ void homa_pool_get_rcvbuf(struct homa_pool *pool, struct homa_rcvbuf_args *args); int homa_pool_release_buffers(struct homa_pool *pool, int num_buffers, u32 *buffers); -int homa_pool_set_region(struct homa_pool *pool, void __user *region, +int homa_pool_set_region(struct homa_sock *hsk, void __user *region, u64 region_size); #endif /* _HOMA_POOL_H */ diff --git a/homa_sock.c b/homa_sock.c index 0e335290..ab99b4de 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -139,6 +139,7 @@ void homa_socktab_end_scan(struct homa_socktab_scan *scan) int homa_sock_init(struct homa_sock *hsk, struct homa *homa) { struct homa_socktab *socktab = homa->port_map; + struct homa_pool *buffer_pool; struct homa_sock *other; int starting_port; int result = 0; @@ -147,6 +148,13 @@ int homa_sock_init(struct homa_sock *hsk, struct homa *homa) /* Initialize fields outside the Homa part. */ hsk->sock.sk_sndbuf = homa->wmem_max; + /* Do things requiring memory allocation before locking the socket, + * so that GFP_ATOMIC is not needed. + */ + buffer_pool = homa_pool_alloc(hsk); + if (IS_ERR(buffer_pool)) + return PTR_ERR(buffer_pool); + /* Initialize Homa-specific fields. */ spin_lock_bh(&socktab->write_lock); atomic_set(&hsk->protect_count, 0); @@ -169,7 +177,8 @@ int homa_sock_init(struct homa_sock *hsk, struct homa *homa) if (homa->prev_default_port == starting_port) { spin_unlock_bh(&socktab->write_lock); hsk->shutdown = true; - return -EADDRNOTAVAIL; + result = -EADDRNOTAVAIL; + goto error; } } hsk->port = homa->prev_default_port; @@ -197,17 +206,17 @@ int homa_sock_init(struct homa_sock *hsk, struct homa *homa) bucket->id = i + 1000000; INIT_HLIST_HEAD(&bucket->rpcs); } - hsk->buffer_pool = homa_pool_alloc(hsk); - if (IS_ERR(hsk->buffer_pool)) { - result = PTR_ERR(hsk->buffer_pool); - hsk->buffer_pool = NULL; - } + hsk->buffer_pool = buffer_pool; #ifndef __STRIP__ /* See strip.py */ if (homa->hijack_tcp) hsk->sock.sk_protocol = IPPROTO_TCP; #endif /* See strip.py */ spin_unlock_bh(&socktab->write_lock); return result; + +error: + homa_pool_free(buffer_pool); + return result; } /* @@ -242,7 +251,7 @@ void homa_sock_shutdown(struct homa_sock *hsk) tt_record1("Starting shutdown for socket %d", hsk->port); homa_sock_lock(hsk); - if (hsk->shutdown) { + if (hsk->shutdown || !hsk->homa) { homa_sock_unlock(hsk); return; } diff --git a/homa_sock.h b/homa_sock.h index 2f89839e..4f78ba94 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -139,7 +139,7 @@ struct homa_sock { /** * @homa: Overall state about the Homa implementation. NULL - * means this socket has been deleted. + * means this socket was never initialized or has been deleted. 
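	 * (The homa_sock_shutdown change earlier in this patch depends on
	 * this convention: it now returns immediately when hsk->shutdown is
	 * set or hsk->homa is NULL.)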
*/ struct homa *homa; diff --git a/test/mock.c b/test/mock.c index d6c73c23..b3dd6b3f 100644 --- a/test/mock.c +++ b/test/mock.c @@ -1945,7 +1945,7 @@ int mock_sock_init(struct homa_sock *hsk, struct homa *homa, int port) mock_mtu = UNIT_TEST_DATA_PER_PACKET + hsk->ip_header_length + sizeof(struct homa_data_hdr); mock_net_device.gso_max_size = mock_mtu; - err = homa_pool_set_region(hsk->buffer_pool, (void *) 0x1000000, + err = homa_pool_set_region(hsk, (void *) 0x1000000, 100*HOMA_BPAGE_SIZE); return err; } diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 773a5c37..7ce0a695 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -367,8 +367,7 @@ TEST_F(homa_plumbing, homa_getsockopt__recvbuf_success) homa_pool_free(self->hsk.buffer_pool); self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); - EXPECT_EQ(0, -homa_pool_set_region(self->hsk.buffer_pool, - (void *)0x40000, + EXPECT_EQ(0, -homa_pool_set_region(&self->hsk, (void *)0x40000, 10*HOMA_BPAGE_SIZE + 1000)); EXPECT_EQ(0, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, SO_HOMA_RCVBUF, (char *)&val, &size)); diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index 779637f3..ca9d2444 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -100,63 +100,69 @@ TEST_F(homa_pool, homa_pool_alloc) EXPECT_EQ(ENOMEM, -PTR_ERR(pool)); } -TEST_F(homa_pool, homa_pool_set_region__basics) -{ - struct homa_pool *pool = homa_pool_alloc(&self->hsk); - - EXPECT_EQ(0, -homa_pool_set_region(pool, (void *) 0x100000, - 78*HOMA_BPAGE_SIZE)); - EXPECT_EQ(78, pool->num_bpages); - EXPECT_EQ(-1, pool->descriptors[69].owner); - homa_pool_free(pool); -} TEST_F(homa_pool, homa_pool_set_region__region_not_page_aligned) { - struct homa_pool *pool = homa_pool_alloc(&self->hsk); + homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); - EXPECT_EQ(EINVAL, -homa_pool_set_region(pool, + EXPECT_EQ(EINVAL, -homa_pool_set_region(&self->hsk, ((char *) 0x1000000) + 10, 100*HOMA_BPAGE_SIZE)); - homa_pool_free(pool); } TEST_F(homa_pool, homa_pool_set_region__region_too_small) { - struct homa_pool *pool = homa_pool_alloc(&self->hsk); + homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); - EXPECT_EQ(EINVAL, -homa_pool_set_region(pool, (void *) 0x1000000, + EXPECT_EQ(EINVAL, -homa_pool_set_region(&self->hsk, (void *) 0x1000000, HOMA_BPAGE_SIZE)); - homa_pool_free(pool); } TEST_F(homa_pool, homa_pool_set_region__cant_allocate_descriptors) { - struct homa_pool *pool = homa_pool_alloc(&self->hsk); + homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); mock_kmalloc_errors = 1; - EXPECT_EQ(ENOMEM, -homa_pool_set_region(pool, (void *) 0x100000, + EXPECT_EQ(ENOMEM, -homa_pool_set_region(&self->hsk, (void *) 0x100000, 100*HOMA_BPAGE_SIZE)); - homa_pool_free(pool); } TEST_F(homa_pool, homa_pool_set_region__cant_allocate_core_info) { - struct homa_pool *pool = homa_pool_alloc(&self->hsk); + homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); mock_kmalloc_errors = 2; - EXPECT_EQ(ENOMEM, -homa_pool_set_region(pool, (void *) 0x100000, + EXPECT_EQ(ENOMEM, -homa_pool_set_region(&self->hsk, (void *) 0x100000, + 100*HOMA_BPAGE_SIZE)); +} +TEST_F(homa_pool, homa_pool_set_region__pool_already_has_region) +{ + EXPECT_EQ(EINVAL, -homa_pool_set_region(&self->hsk, (void *) 0x100000, 100*HOMA_BPAGE_SIZE)); - homa_pool_free(pool); +} +TEST_F(homa_pool, homa_pool_set_region__success) +{ + 
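	/* With a fresh pool installed on the socket, set_region should
	 * succeed, carve the region into 78 bpages, and leave each
	 * descriptor unowned (owner == -1).
	 */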
homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); + + EXPECT_EQ(0, -homa_pool_set_region(&self->hsk, (void *) 0x100000, + 78*HOMA_BPAGE_SIZE)); + EXPECT_EQ(78, self->hsk.buffer_pool->num_bpages); + EXPECT_EQ(-1, self->hsk.buffer_pool->descriptors[69].owner); } TEST_F(homa_pool, homa_pool_get_rcvbuf) { - struct homa_pool *pool = homa_pool_alloc(&self->hsk); struct homa_rcvbuf_args args; - EXPECT_EQ(0, -homa_pool_set_region(pool, (void *)0x40000, + homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = homa_pool_alloc(&self->hsk); + + EXPECT_EQ(0, -homa_pool_set_region(&self->hsk, (void *)0x40000, 10*HOMA_BPAGE_SIZE + 1000)); - homa_pool_get_rcvbuf(pool, &args); + homa_pool_get_rcvbuf(self->hsk.buffer_pool, &args); EXPECT_EQ(0x40000, args.start); EXPECT_EQ(10*HOMA_BPAGE_SIZE, args.length); - homa_pool_free(pool); } TEST_F(homa_pool, homa_pool_get_pages__basics) diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c index a1d5a781..cc37b5ad 100644 --- a/test/unit_homa_sock.c +++ b/test/unit_homa_sock.c @@ -133,6 +133,14 @@ TEST_F(homa_sock, homa_socktab_end_scan) EXPECT_EQ(0, mock_sock_holds); } +TEST_F(homa_sock, homa_sock_init__cant_allocate_buffer_pool) +{ + struct homa_sock sock; + + mock_kmalloc_errors = 1; + EXPECT_EQ(ENOMEM, -homa_sock_init(&sock, &self->homa)); + homa_sock_destroy(&sock); +} TEST_F(homa_sock, homa_sock_init__skip_port_in_use) { struct homa_sock hsk2, hsk3; @@ -173,14 +181,6 @@ TEST_F(homa_sock, homa_sock_init__ip_header_length) homa_sock_destroy(&hsk_v4); homa_sock_destroy(&hsk_v6); } -TEST_F(homa_sock, homa_sock_init__kzalloc_failure) -{ - struct homa_sock sock; - - mock_kmalloc_errors = 1; - EXPECT_EQ(ENOMEM, -homa_sock_init(&sock, &self->homa)); - homa_sock_destroy(&sock); -} #ifndef __STRIP__ /* See strip.py */ TEST_F(homa_sock, homa_sock_init__hijack_tcp) { From f0e02ab97bb72d94355a5b03055ff92d2d47ce21 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 6 May 2025 11:53:42 -0700 Subject: [PATCH 285/625] Remove pool->num_cores: no longer used --- homa_pool.c | 1 - homa_pool.h | 3 --- 2 files changed, 4 deletions(-) diff --git a/homa_pool.c b/homa_pool.c index f27bec70..842dccc0 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -111,7 +111,6 @@ int homa_pool_set_region(struct homa_sock *hsk, void __user *region, atomic_set(&pool->free_bpages, pool->num_bpages); pool->bpages_needed = INT_MAX; pool->cores = cores; - pool->num_cores = nr_cpu_ids; pool->check_waiting_invoked = 0; for (i = 0; i < pool->num_bpages; i++) { diff --git a/homa_pool.h b/homa_pool.h index f21722e7..6005fb4e 100644 --- a/homa_pool.h +++ b/homa_pool.h @@ -127,9 +127,6 @@ struct homa_pool { /** @cores: core-specific info; dynamically allocated. */ struct homa_pool_core __percpu *cores; - /** @num_cores: number of elements in @cores. */ - int num_cores; - /** * @check_waiting_invoked: incremented during unit tests when * homa_pool_check_waiting is invoked. From 6c784133d76228ea61cf06c0e76414888bd43e76 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 6 May 2025 13:34:43 -0700 Subject: [PATCH 286/625] Fix checkpatch.pl issues on homa_pool.c --- homa_pool.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/homa_pool.c b/homa_pool.c index 842dccc0..0de34806 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -84,9 +84,8 @@ int homa_pool_set_region(struct homa_sock *hsk, void __user *region, * without GFP_ATOMIC. 
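	 * (The socket lock is a spinlock, so allocations that may sleep are
	 * not permitted while it is held; doing all allocations first allows
	 * ordinary GFP_KERNEL-style allocations, which are more reliable
	 * than GFP_ATOMIC ones.)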
*/ num_bpages = region_size >> HOMA_BPAGE_SHIFT; - if (num_bpages < MIN_POOL_SIZE) { + if (num_bpages < MIN_POOL_SIZE) return -EINVAL; - } descriptors = kmalloc_array(num_bpages, sizeof(struct homa_bpage), __GFP_ZERO); if (!descriptors) @@ -219,8 +218,8 @@ int homa_pool_get_pages(struct homa_pool *pool, int num_pages, u32 *pages, if (limit == 0) { int extra; - limit = pool->num_bpages - - atomic_read(&pool->free_bpages); + limit = pool->num_bpages - + atomic_read(&pool->free_bpages); extra = limit >> 2; limit += (extra < MIN_EXTRA) ? MIN_EXTRA : extra; if (limit > pool->num_bpages) From 29249864fcfe0fd0dac3ccd572ac3d4e191ef830 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 6 May 2025 13:45:25 -0700 Subject: [PATCH 287/625] Add comment for spin_trylock in homa_pool.c --- homa_pool.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/homa_pool.c b/homa_pool.c index 0de34806..b0dce441 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -249,6 +249,10 @@ int homa_pool_get_pages(struct homa_pool *pool, int num_pages, u32 *pages, if (!homa_bpage_available(bpage, now)) continue; if (!spin_trylock_bh(&bpage->lock)) + /* Rather than wait for a locked page to become free, + * just go on to the next page. If the page is locked, + * it probably won't turn out to be available anyway. + */ continue; if (!homa_bpage_available(bpage, now)) { spin_unlock_bh(&bpage->lock); From 5fb5c43dae7419619ac5929898f9dc4d98dcebb8 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 6 May 2025 13:58:28 -0700 Subject: [PATCH 288/625] Fix ugliness in stripped code in homa_pool.c --- homa_pool.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/homa_pool.c b/homa_pool.c index b0dce441..0ab087cd 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -319,11 +319,15 @@ int homa_pool_alloc_msg(struct homa_rpc *rpc) core_id = smp_processor_id(); core = this_cpu_ptr(pool->cores); bpage = &pool->descriptors[core->page_hint]; +#ifndef __STRIP__ /* See strip.py */ if (!spin_trylock_bh(&bpage->lock)) { tt_record("beginning wait for bpage lock"); spin_lock_bh(&bpage->lock); tt_record("ending wait for bpage lock"); } +#else /* See strip.py */ + spin_lock_bh(&bpage->lock); +#endif /* See strip.py */ if (bpage->owner != core_id) { spin_unlock_bh(&bpage->lock); goto new_page; From 4a402e2230fd809398c4c876a91650ccb295899c Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 6 May 2025 14:01:35 -0700 Subject: [PATCH 289/625] Improve comments in homa_pool.c --- homa_pool.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/homa_pool.c b/homa_pool.c index 0ab087cd..e0c77281 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -385,7 +385,8 @@ int homa_pool_alloc_msg(struct homa_rpc *rpc) return 0; /* We get here if there wasn't enough buffer space for this - * message; add the RPC to hsk->waiting_for_bufs. + * message; add the RPC to hsk->waiting_for_bufs. The list is sorted + * by RPC length in order to implement SRPT. */ out_of_space: INC_METRIC(buffer_alloc_failures, 1); @@ -504,9 +505,9 @@ void homa_pool_check_waiting(struct homa_pool *pool) struct homa_rpc, buf_links); if (!homa_rpc_try_lock(rpc)) { /* Can't just spin on the RPC lock because we're - * holding the socket lock (see sync.txt). Instead, - * release the socket lock and try the entire - * operation again. + * holding the socket lock and the lock order is + * rpc->socket (see sync.txt). Instead, release the + * socket lock and try the entire operation again. 
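	 * A rough sketch of the resulting pattern, using the names from
	 * this function (the retry label is illustrative, not the literal
	 * code):
	 *
	 *	homa_sock_lock(pool->hsk);
	 *	rpc = list_first_entry(..., struct homa_rpc, buf_links);
	 *	if (!homa_rpc_try_lock(rpc)) {
	 *		homa_sock_unlock(pool->hsk);
	 *		goto retry;	// take the socket lock again, rescan
	 *	}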
*/ homa_sock_unlock(pool->hsk); UNIT_LOG("; ", "rpc lock unavailable in %s", __func__); From 2f8d7529b209dc0bc442a3690ff83935956013c6 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 6 May 2025 16:17:55 -0700 Subject: [PATCH 290/625] Use a cleaner way to ensure cache line alignment for homa_bpages --- homa_pool.h | 57 ++++++++++++++++++++++------------------------------- 1 file changed, 24 insertions(+), 33 deletions(-) diff --git a/homa_pool.h b/homa_pool.h index 6005fb4e..5a1bb6bf 100644 --- a/homa_pool.h +++ b/homa_pool.h @@ -15,39 +15,30 @@ * a buffer pool. */ struct homa_bpage { - union { - /** - * @cache_line: Ensures that each homa_bpage object - * is exactly one cache line long. - */ - char cache_line[L1_CACHE_BYTES]; - struct { - /** @lock: to synchronize shared access. */ - spinlock_t lock; - - /** - * @refs: Counts number of distinct uses of this - * bpage (1 tick for each message that is using - * this page, plus an additional tick if the @owner - * field is set). - */ - atomic_t refs; - - /** - * @owner: kernel core that currently owns this page - * (< 0 if none). - */ - int owner; - - /** - * @expiration: time (in sched_clock() units) after - * which it's OK to steal this page from its current - * owner (if @refs is 1). - */ - u64 expiration; - }; - }; -}; + /** @lock: to synchronize shared access. */ + spinlock_t lock; + + /** + * @refs: Counts number of distinct uses of this + * bpage (1 tick for each message that is using + * this page, plus an additional tick if the @owner + * field is set). + */ + atomic_t refs; + + /** + * @owner: kernel core that currently owns this page + * (< 0 if none). + */ + int owner; + + /** + * @expiration: time (in sched_clock() units) after + * which it's OK to steal this page from its current + * owner (if @refs is 1). + */ + u64 expiration; +} ____cacheline_aligned_in_smp; #ifndef __STRIP__ /* See strip.py */ #ifndef CONFIG_LOCKDEP From 12be288a3b8a561b161993f7a7709a9b7c69aa55 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 6 May 2025 16:21:40 -0700 Subject: [PATCH 291/625] Various cleanups in sync.txt --- sync.txt | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/sync.txt b/sync.txt index eb3c6ffb..1f063f3d 100644 --- a/sync.txt +++ b/sync.txt @@ -28,27 +28,23 @@ This file describes the synchronization strategy used for Homa. * There are several other locks in Homa besides RPC locks. When multiple locks are held, they must always be acquired in a consistent order, in - order to prevent deadlock. For each lock, here are the other locks that - may be acquired while holding the given lock. - * RPC: socket, grant, throttle, peer->ack_lock - * Socket: port_map.write_lock - Any lock not listed above must be a "leaf" lock: no other lock will be - acquired while holding the lock. + order to prevent deadlock. Overall rules: + * RPC locks are "top level" + * Other locks may be acquired while holding an RPC lock + * It is not safe to wait on an RPC lock while holding any other locks. + * It is safe to acquire port_map.write_lock while holding a socket lock + * Other than these rules, all locks should be considered "leaf" locks: + don't acquire other locks while holding them. * Homa's approach means that socket shutdown and deletion can potentially - occur while operations are underway that hold RPC locks but not the socket - lock. This creates several potential problems: - * A socket might be deleted and its memory reclaimed while an RPC still - has access to it. 
Homa assumes that Linux will prevent socket deletion - while the kernel call is executing. In situations outside kernel call - handling, Homa uses rcu_read_lock and/or socket references to prevent - socket deletion. - * A socket might be shut down while there are active operations on - RPCs. For example, a new RPC creation might be underway when a socket - is shut down, which could add the new RPC after all of its RPCs - have supposedly been deleted. Handling this requires careful ordering - of operations during shutdown, plus the rest of Homa must be careful - never to add new RPCs to a socket that has been shut down. + begin while operations are underway that hold RPC locks but not the socket + lock. For example, a new RPC creation might be underway when a socket + is shut down, which could attempt to add the new RPC after homa_sock_shutdown + thinks it has deleted all RPCs. Handling this requires careful checking + of hsk->shutdown. For example, during new RPC creation the socket lock + must be acquired to add the new RPC to those for the socket; after acquiring + the lock, it must check hsk->shutdown and abort the RPC creation if the + socket has been shutdown. * There are a few places where Homa needs to process RPCs on lists associated with a socket, such as the timer. Such code must first lock From ade5d0391a842606ece81ff303f3b886029df96d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 7 May 2025 09:12:27 -0700 Subject: [PATCH 292/625] Add missing memory barrier in homa_peer.c --- homa_peer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/homa_peer.c b/homa_peer.c index 2dbfd07f..d9bcee21 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -231,6 +231,7 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab, INIT_LIST_HEAD(&peer->grantable_rpcs); INIT_LIST_HEAD(&peer->grantable_links); #endif /* See strip.py */ + smp_wmb(); hlist_add_head_rcu(&peer->peertab_links, &peertab->buckets[bucket]); peer->current_ticks = -1; spin_lock_init(&peer->ack_lock); From 00cbbbf3af4ca70b69799e0403dfc973395fe9d2 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 7 May 2025 09:35:03 -0700 Subject: [PATCH 293/625] Delete duplicated line in homa_sock_init --- homa_sock.c | 1 - 1 file changed, 1 deletion(-) diff --git a/homa_sock.c b/homa_sock.c index ab99b4de..d38a90b6 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -157,7 +157,6 @@ int homa_sock_init(struct homa_sock *hsk, struct homa *homa) /* Initialize Homa-specific fields. 
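	 * (These fields are written while holding socktab->write_lock,
	 * which is acquired just below and released before homa_sock_init
	 * returns.)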
*/ spin_lock_bh(&socktab->write_lock); - atomic_set(&hsk->protect_count, 0); spin_lock_init(&hsk->lock); atomic_set(&hsk->protect_count, 0); hsk->homa = homa; From 5d1f16ca77c32ab02b777acb4a90bd91cb8df106 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 7 May 2025 09:51:31 -0700 Subject: [PATCH 294/625] Cleanup sk_wmem_alloc check in homa_sock_shutdown --- homa_sock.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/homa_sock.c b/homa_sock.c index d38a90b6..c9cd7f07 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -243,7 +243,6 @@ void homa_sock_shutdown(struct homa_sock *hsk) { struct homa_interest *interest; struct homa_rpc *rpc; - u64 tx_memory; #ifndef __STRIP__ /* See strip.py */ int i = 0; #endif /* See strip.py */ @@ -305,15 +304,16 @@ void homa_sock_shutdown(struct homa_sock *hsk) homa_rpc_reap(hsk, 1000); #endif /* See strip.py */ - tx_memory = refcount_read(&hsk->sock.sk_wmem_alloc); - if (tx_memory != 1) { - pr_err("%s found sk_wmem_alloc %llu bytes, port %d\n", - __func__, tx_memory, hsk->port); + WARN_ON_ONCE(refcount_read(&hsk->sock.sk_wmem_alloc) != 1); #ifdef __UNIT_TEST__ - FAIL(" sk_wmem_alloc %llu after shutdown for port %d", tx_memory, - hsk->port); -#endif /* __UNIT_TEST__ */ + { + u64 tx_memory = refcount_read(&hsk->sock.sk_wmem_alloc); + + if (tx_memory != 1) + FAIL(" sk_wmem_alloc %llu after shutdown for port %d", + tx_memory, hsk->port); } +#endif /* __UNIT_TEST__ */ if (hsk->buffer_pool) { homa_pool_free(hsk->buffer_pool); From 07510f68f0da508c51877e7333c151592036907d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 7 May 2025 09:53:38 -0700 Subject: [PATCH 295/625] Set SOCK_RCU_FREE in homa_sock_init, not homa_sock_shutdown --- homa_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/homa_sock.c b/homa_sock.c index c9cd7f07..7fdadb22 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -147,6 +147,7 @@ int homa_sock_init(struct homa_sock *hsk, struct homa *homa) /* Initialize fields outside the Homa part. */ hsk->sock.sk_sndbuf = homa->wmem_max; + sock_set_flag(&hsk->inet.sk, SOCK_RCU_FREE); /* Do things requiring memory allocation before locking the socket, * so that GFP_ATOMIC is not needed. @@ -330,7 +331,6 @@ void homa_sock_shutdown(struct homa_sock *hsk) void homa_sock_destroy(struct homa_sock *hsk) { homa_sock_shutdown(hsk); - sock_set_flag(&hsk->inet.sk, SOCK_RCU_FREE); } /** From 2a82b8c1a59756d87c3df17a52894f908eaca578 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 7 May 2025 11:36:14 -0700 Subject: [PATCH 296/625] Improve documentation for struct homa_interest --- homa_interest.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/homa_interest.h b/homa_interest.h index d08adbbe..104d5d41 100644 --- a/homa_interest.h +++ b/homa_interest.h @@ -9,10 +9,12 @@ #include "homa_sock.h" /** - * struct homa_interest - Used by homa_wait_private and homa_wait_shared to - * wait for incoming message data to arrive for an RPC. An interest can - * be either private (if referenced by an rpc->private_interest) or shared - * (if present on hsk->interests). + * struct homa_interest - Holds info that allows applications to wait for + * incoming RPC messages. 
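 * For example, homa_wait_private uses a private interest to wait for
 * one particular RPC's response, while homa_wait_shared uses a shared
 * interest to receive whichever non-private message becomes ready
 * first.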
An interest can be either private, in which case + * the application is waiting for a single specific RPC response and the + * interest is referenced by an rpc->private_interest, or shared, in which + * case the application is waiting for any incoming message that isn't + * private and the interest is present on hsk->interests. */ struct homa_interest { /** From c98c56a289c8cf9be69bd83b73839f58771b38c9 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 7 May 2025 11:42:50 -0700 Subject: [PATCH 297/625] Remove homa_interest->core from stripped version --- homa_interest.c | 2 +- homa_interest.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/homa_interest.c b/homa_interest.c index 17d26fb9..b75e2c0b 100644 --- a/homa_interest.c +++ b/homa_interest.c @@ -23,7 +23,7 @@ void homa_interest_init_shared(struct homa_interest *interest, { interest->rpc = NULL; atomic_set(&interest->ready, 0); - interest->core = raw_smp_processor_id(); + IF_NO_STRIP(interest->core = raw_smp_processor_id()); interest->blocked = 0; init_waitqueue_head(&interest->wait_queue); interest->hsk = hsk; diff --git a/homa_interest.h b/homa_interest.h index 104d5d41..4ba81250 100644 --- a/homa_interest.h +++ b/homa_interest.h @@ -33,11 +33,13 @@ struct homa_interest { */ atomic_t ready; +#ifndef __STRIP__ /* See strip.py */ /** * @core: Core on which homa_wait_*was invoked. This is a hint * used for load balancing (see balance.txt). */ int core; +#endif /* See strip.py */ /** * @blocked: Zero means a handoff was received without the thread From 5f00f2cd5951d68bfbc431017f257a658c7f3d7e Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 7 May 2025 12:59:04 -0700 Subject: [PATCH 298/625] Remove homa_backlog_rcv: not needed --- homa_impl.h | 1 - homa_plumbing.c | 18 ------------------ 2 files changed, 19 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 30805b45..8f7058b5 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -652,7 +652,6 @@ void homa_abort_sock_rpcs(struct homa_sock *hsk, int error); void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, struct homa_rpc *rpc); void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb); -int homa_backlog_rcv(struct sock *sk, struct sk_buff *skb); int homa_bind(struct socket *sk, struct sockaddr *addr, int addr_len); void homa_close(struct sock *sock, long timeout); diff --git a/homa_plumbing.c b/homa_plumbing.c index 715756bd..98f3e227 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -93,7 +93,6 @@ static struct proto homa_prot = { .getsockopt = homa_getsockopt, .sendmsg = homa_sendmsg, .recvmsg = homa_recvmsg, - .backlog_rcv = homa_backlog_rcv, .hash = homa_hash, .unhash = homa_unhash, .get_port = homa_get_port, @@ -114,7 +113,6 @@ static struct proto homav6_prot = { .getsockopt = homa_getsockopt, .sendmsg = homa_sendmsg, .recvmsg = homa_recvmsg, - .backlog_rcv = homa_backlog_rcv, .hash = homa_hash, .unhash = homa_unhash, .get_port = homa_get_port, @@ -1443,22 +1441,6 @@ int homa_softirq(struct sk_buff *skb) return 0; } -/** - * homa_backlog_rcv() - Invoked to handle packets saved on a socket's - * backlog because it was locked when the packets first arrived. - * @sk: Homa socket that owns the packet's destination port. - * @skb: The incoming packet. This function takes ownership of the packet - * (we'll delete it). - * - * Return: Always returns 0. 
- */ -int homa_backlog_rcv(struct sock *sk, struct sk_buff *skb) -{ - pr_warn_once("unimplemented backlog_rcv invoked on Homa socket\n"); - kfree_skb(skb); - return 0; -} - /** * homa_err_handler_v4() - Invoked by IP to handle an incoming error * packet, such as ICMP UNREACHABLE. From 9c78c61d0a7e0ac949dd501d55a7369cdbd9a74a Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 7 May 2025 15:09:29 -0700 Subject: [PATCH 299/625] Move (obsolete) homa_choose_fifo_grant from homa_incoming.c to homa_grant.c --- homa_grant.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++ homa_impl.h | 2 -- homa_incoming.c | 85 ------------------------------------------------- 3 files changed, 85 insertions(+), 87 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 1175652b..c7ed8b98 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -825,6 +825,91 @@ void homa_grant_find_oldest(struct homa *homa) homa->grant->oldest_rpc = oldest; } +#ifndef __STRIP__ /* See strip.py */ +#if 0 +/** + * homa_choose_fifo_grant() - This function is invoked occasionally to give + * a high-priority grant to the oldest incoming message. We do this in + * order to reduce the starvation that SRPT can cause for long messages. + * Note: this method is obsolete and should never be invoked; its code is + * being retained until fifo grants are reimplemented using the new grant + * mechanism. + * @homa: Overall data about the Homa protocol implementation. The + * grant lock must be held by the caller. + * Return: An RPC to which to send a FIFO grant, or NULL if there is + * no appropriate RPC. This method doesn't actually send a grant, + * but it updates @msgin.granted to reflect the desired grant. + * Also updates homa->total_incoming. + */ +struct homa_rpc *homa_choose_fifo_grant(struct homa *homa) +{ + struct homa_rpc *rpc, *oldest; + u64 oldest_birth; + int granted; + + oldest = NULL; + oldest_birth = ~0; + + /* Find the oldest message that doesn't currently have an + * outstanding "pity grant". + */ + list_for_each_entry(rpc, &homa->grantable_rpcs, grantable_links) { + int received, on_the_way; + + if (rpc->msgin.birth >= oldest_birth) + continue; + + received = (rpc->msgin.length + - rpc->msgin.bytes_remaining); + on_the_way = rpc->msgin.granted - received; + if (on_the_way > homa->unsched_bytes) { + /* The last "pity" grant hasn't been used + * up yet. + */ + continue; + } + oldest = rpc; + oldest_birth = rpc->msgin.birth; + } + if (!oldest) + return NULL; + INC_METRIC(fifo_grants, 1); + if ((oldest->msgin.length - oldest->msgin.bytes_remaining) + == oldest->msgin.granted) + INC_METRIC(fifo_grants_no_incoming, 1); + + oldest->silent_ticks = 0; + granted = homa->fifo_grant_increment; + oldest->msgin.granted += granted; + if (oldest->msgin.granted >= oldest->msgin.length) { + granted -= oldest->msgin.granted - oldest->msgin.length; + oldest->msgin.granted = oldest->msgin.length; + // homa_remove_grantable_locked(homa, oldest); + } + + /* Try to update homa->total_incoming; if we can't lock + * the RPC, just skip it (waiting could deadlock), and it + * will eventually get updated elsewhere. + */ + if (homa_rpc_try_lock(oldest)) { + homa_grant_update_incoming(oldest, homa); + homa_rpc_unlock(oldest); + } + + if (oldest->msgin.granted < (oldest->msgin.length + - oldest->msgin.bytes_remaining)) { + /* We've already received all of the bytes in the new + * grant; most likely this means that the sender sent extra + * data after the last fifo grant (e.g. by rounding up to a + * TSO packet). Don't send this grant. 
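	 * (Received bytes can run ahead of granted bytes because senders
	 * transmit whole packets; the check above keeps us from granting
	 * bytes that have already arrived.)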
+ */ + return NULL; + } + return oldest; +} +#endif +#endif /* See strip.py */ + /** * homa_grant_cand_add() - Add an RPC into the struct, if there is * space. After this function is called, homa_grant_cand_check must diff --git a/homa_impl.h b/homa_impl.h index 8f7058b5..a49f6416 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -720,8 +720,6 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force); void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk); #ifndef __STRIP__ /* See strip.py */ -struct homa_rpc - *homa_choose_fifo_grant(struct homa *homa); void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk); int homa_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); diff --git a/homa_incoming.c b/homa_incoming.c index a18974e0..e2211219 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -1009,91 +1009,6 @@ void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, kfree_skb(skb); } -#ifndef __STRIP__ /* See strip.py */ -#if 0 -/** - * homa_choose_fifo_grant() - This function is invoked occasionally to give - * a high-priority grant to the oldest incoming message. We do this in - * order to reduce the starvation that SRPT can cause for long messages. - * Note: this method is obsolete and should never be invoked; it's code is - * being retained until fifo grants are reimplemented using the new grant - * mechanism. - * @homa: Overall data about the Homa protocol implementation. The - * grant lock must be held by the caller. - * Return: An RPC to which to send a FIFO grant, or NULL if there is - * no appropriate RPC. This method doesn't actually send a grant, - * but it updates @msgin.granted to reflect the desired grant. - * Also updates homa->total_incoming. - */ -struct homa_rpc *homa_choose_fifo_grant(struct homa *homa) -{ - struct homa_rpc *rpc, *oldest; - u64 oldest_birth; - int granted; - - oldest = NULL; - oldest_birth = ~0; - - /* Find the oldest message that doesn't currently have an - * outstanding "pity grant". - */ - list_for_each_entry(rpc, &homa->grantable_rpcs, grantable_links) { - int received, on_the_way; - - if (rpc->msgin.birth >= oldest_birth) - continue; - - received = (rpc->msgin.length - - rpc->msgin.bytes_remaining); - on_the_way = rpc->msgin.granted - received; - if (on_the_way > homa->unsched_bytes) { - /* The last "pity" grant hasn't been used - * up yet. - */ - continue; - } - oldest = rpc; - oldest_birth = rpc->msgin.birth; - } - if (!oldest) - return NULL; - INC_METRIC(fifo_grants, 1); - if ((oldest->msgin.length - oldest->msgin.bytes_remaining) - == oldest->msgin.granted) - INC_METRIC(fifo_grants_no_incoming, 1); - - oldest->silent_ticks = 0; - granted = homa->fifo_grant_increment; - oldest->msgin.granted += granted; - if (oldest->msgin.granted >= oldest->msgin.length) { - granted -= oldest->msgin.granted - oldest->msgin.length; - oldest->msgin.granted = oldest->msgin.length; - // homa_remove_grantable_locked(homa, oldest); - } - - /* Try to update homa->total_incoming; if we can't lock - * the RPC, just skip it (waiting could deadlock), and it - * will eventually get updated elsewhere. - */ - if (homa_rpc_try_lock(oldest)) { - homa_grant_update_incoming(oldest, homa); - homa_rpc_unlock(oldest); - } - - if (oldest->msgin.granted < (oldest->msgin.length - - oldest->msgin.bytes_remaining)) { - /* We've already received all of the bytes in the new - * grant; most likely this means that the sender sent extra - * data after the last fifo grant (e.g. by rounding up to a - * TSO packet). 
Don't send this grant. - */ - return NULL; - } - return oldest; -} -#endif -#endif /* See strip.py */ - /** * homa_rpc_abort() - Terminate an RPC. * @rpc: RPC to be terminated. Must be locked by caller. From 75efd5a3fa5cf7886743ecf91f66c12429609e0b Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 7 May 2025 15:16:51 -0700 Subject: [PATCH 300/625] Move functions from homa_incoming.c to homa_rpc.c --- homa_impl.h | 7 ---- homa_incoming.c | 105 ------------------------------------------------ homa_rpc.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++ homa_rpc.h | 5 +++ 4 files changed, 110 insertions(+), 112 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index a49f6416..4508a82b 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -646,9 +646,6 @@ void unit_hook(char *id); extern unsigned int homa_net_id; -void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, - int port, int error); -void homa_abort_sock_rpcs(struct homa_sock *hsk, int error); void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, struct homa_rpc *rpc); void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb); @@ -689,10 +686,6 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, struct homa_sock *hsk); -void homa_rpc_abort(struct homa_rpc *crpc, int error); -void homa_rpc_acked(struct homa_sock *hsk, - const struct in6_addr *saddr, struct homa_ack *ack); -void homa_rpc_end(struct homa_rpc *rpc); void homa_rpc_handoff(struct homa_rpc *rpc); int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); int homa_setsockopt(struct sock *sk, int level, int optname, diff --git a/homa_incoming.c b/homa_incoming.c index e2211219..890f6947 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -1009,111 +1009,6 @@ void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, kfree_skb(skb); } -/** - * homa_rpc_abort() - Terminate an RPC. - * @rpc: RPC to be terminated. Must be locked by caller. - * @error: A negative errno value indicating the error that caused the abort. - * If this is a client RPC, the error will be returned to the - * application; if it's a server RPC, the error is ignored and - * we just free the RPC. - */ -void homa_rpc_abort(struct homa_rpc *rpc, int error) - __must_hold(rpc_bucket_lock) -{ - if (!homa_is_client(rpc->id)) { - INC_METRIC(server_rpc_discards, 1); - tt_record3("aborting server RPC: peer 0x%x, id %d, error %d", - tt_addr(rpc->peer->addr), rpc->id, error); - homa_rpc_end(rpc); - return; - } - tt_record3("aborting client RPC: peer 0x%x, id %d, error %d", - tt_addr(rpc->peer->addr), rpc->id, error); - rpc->error = error; - homa_rpc_handoff(rpc); -} - -/** - * homa_abort_rpcs() - Abort all RPCs to/from a particular peer. - * @homa: Overall data about the Homa protocol implementation. - * @addr: Address (network order) of the destination whose RPCs are - * to be aborted. - * @port: If nonzero, then RPCs will only be aborted if they were - * targeted at this server port. - * @error: Negative errno value indicating the reason for the abort. - */ -void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, - int port, int error) -{ - struct homa_socktab_scan scan; - struct homa_rpc *rpc; - struct homa_sock *hsk; - - for (hsk = homa_socktab_start_scan(homa->port_map, &scan); hsk; - hsk = homa_socktab_next(&scan)) { - /* Skip the (expensive) lock acquisition if there's no - * work to do. 
- */ - if (list_empty(&hsk->active_rpcs)) - continue; - if (!homa_protect_rpcs(hsk)) - continue; - rcu_read_lock(); - list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { - if (!ipv6_addr_equal(&rpc->peer->addr, addr)) - continue; - if (port && rpc->dport != port) - continue; - homa_rpc_lock(rpc); - homa_rpc_abort(rpc, error); - homa_rpc_unlock(rpc); - } - rcu_read_unlock(); - homa_unprotect_rpcs(hsk); - } - homa_socktab_end_scan(&scan); -} - -/** - * homa_abort_sock_rpcs() - Abort all outgoing (client-side) RPCs on a given - * socket. - * @hsk: Socket whose RPCs should be aborted. - * @error: Zero means that the aborted RPCs should be freed immediately. - * A nonzero value means that the RPCs should be marked - * complete, so that they can be returned to the application; - * this value (a negative errno) will be returned from - * recvmsg. - */ -void homa_abort_sock_rpcs(struct homa_sock *hsk, int error) -{ - struct homa_rpc *rpc; - - if (list_empty(&hsk->active_rpcs)) - return; - if (!homa_protect_rpcs(hsk)) - return; - rcu_read_lock(); - list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { - if (!homa_is_client(rpc->id)) - continue; - homa_rpc_lock(rpc); - if (rpc->state == RPC_DEAD) { - homa_rpc_unlock(rpc); - continue; - } - tt_record4("homa_abort_sock_rpcs aborting id %u on port %d, peer 0x%x, error %d", - rpc->id, hsk->port, - tt_addr(rpc->peer->addr), error); - if (error) - homa_rpc_abort(rpc, error); - else - homa_rpc_end(rpc); - homa_rpc_unlock(rpc); - } - rcu_read_unlock(); - homa_unprotect_rpcs(hsk); -} - /** * homa_wait_private() - Waits until the response has been received for * a specific RPC or the RPC has failed with an error. diff --git a/homa_rpc.c b/homa_rpc.c index 28c09117..b726a20f 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -316,6 +316,71 @@ void homa_rpc_end(struct homa_rpc *rpc) homa_pacer_unmanage_rpc(rpc); } +/** + * homa_rpc_abort() - Terminate an RPC. + * @rpc: RPC to be terminated. Must be locked by caller. + * @error: A negative errno value indicating the error that caused the abort. + * If this is a client RPC, the error will be returned to the + * application; if it's a server RPC, the error is ignored and + * we just free the RPC. + */ +void homa_rpc_abort(struct homa_rpc *rpc, int error) + __must_hold(rpc_bucket_lock) +{ + if (!homa_is_client(rpc->id)) { + INC_METRIC(server_rpc_discards, 1); + tt_record3("aborting server RPC: peer 0x%x, id %d, error %d", + tt_addr(rpc->peer->addr), rpc->id, error); + homa_rpc_end(rpc); + return; + } + tt_record3("aborting client RPC: peer 0x%x, id %d, error %d", + tt_addr(rpc->peer->addr), rpc->id, error); + rpc->error = error; + homa_rpc_handoff(rpc); +} + +/** + * homa_abort_rpcs() - Abort all RPCs to/from a particular peer. + * @homa: Overall data about the Homa protocol implementation. + * @addr: Address (network order) of the destination whose RPCs are + * to be aborted. + * @port: If nonzero, then RPCs will only be aborted if they were + * targeted at this server port. + * @error: Negative errno value indicating the reason for the abort. + */ +void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, + int port, int error) +{ + struct homa_socktab_scan scan; + struct homa_rpc *rpc; + struct homa_sock *hsk; + + for (hsk = homa_socktab_start_scan(homa->port_map, &scan); hsk; + hsk = homa_socktab_next(&scan)) { + /* Skip the (expensive) lock acquisition if there's no + * work to do. 
+ */ + if (list_empty(&hsk->active_rpcs)) + continue; + if (!homa_protect_rpcs(hsk)) + continue; + rcu_read_lock(); + list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { + if (!ipv6_addr_equal(&rpc->peer->addr, addr)) + continue; + if (port && rpc->dport != port) + continue; + homa_rpc_lock(rpc); + homa_rpc_abort(rpc, error); + homa_rpc_unlock(rpc); + } + rcu_read_unlock(); + homa_unprotect_rpcs(hsk); + } + homa_socktab_end_scan(&scan); +} + /** * homa_rpc_reap() - Invoked to release resources associated with dead * RPCs for a given socket. For a large RPC, it can take a long time to @@ -489,6 +554,46 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) return result; } +/** + * homa_abort_sock_rpcs() - Abort all outgoing (client-side) RPCs on a given + * socket. + * @hsk: Socket whose RPCs should be aborted. + * @error: Zero means that the aborted RPCs should be freed immediately. + * A nonzero value means that the RPCs should be marked + * complete, so that they can be returned to the application; + * this value (a negative errno) will be returned from + * recvmsg. + */ +void homa_abort_sock_rpcs(struct homa_sock *hsk, int error) +{ + struct homa_rpc *rpc; + + if (list_empty(&hsk->active_rpcs)) + return; + if (!homa_protect_rpcs(hsk)) + return; + rcu_read_lock(); + list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { + if (!homa_is_client(rpc->id)) + continue; + homa_rpc_lock(rpc); + if (rpc->state == RPC_DEAD) { + homa_rpc_unlock(rpc); + continue; + } + tt_record4("homa_abort_sock_rpcs aborting id %u on port %d, peer 0x%x, error %d", + rpc->id, hsk->port, + tt_addr(rpc->peer->addr), error); + if (error) + homa_rpc_abort(rpc, error); + else + homa_rpc_end(rpc); + homa_rpc_unlock(rpc); + } + rcu_read_unlock(); + homa_unprotect_rpcs(hsk); +} + /** * homa_rpc_find_client() - Locate client-side information about the RPC that * a packet belongs to, if there is any. Thread-safe without socket lock. diff --git a/homa_rpc.h b/homa_rpc.h index 91fc5a63..8cbed235 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -412,6 +412,10 @@ struct homa_rpc { u64 start_ns; }; +void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, + int port, int error); +void homa_abort_sock_rpcs(struct homa_sock *hsk, int error); +void homa_rpc_abort(struct homa_rpc *crpc, int error); struct homa_rpc *homa_rpc_alloc_client(struct homa_sock *hsk, const union sockaddr_in_union *dest); @@ -419,6 +423,7 @@ struct homa_rpc *homa_rpc_alloc_server(struct homa_sock *hsk, const struct in6_addr *source, struct homa_data_hdr *h, int *created); +void homa_rpc_end(struct homa_rpc *rpc); struct homa_rpc *homa_rpc_find_client(struct homa_sock *hsk, u64 id); struct homa_rpc From 4bf149543bc9eb20e42ce03c2e6c53776de25671 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 7 May 2025 15:18:40 -0700 Subject: [PATCH 301/625] Remove homa_get_port and homa_disconnect functions These functions were never needed. 
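
(Both were effectively stubs: homa_get_port had nothing to do because
Homa assigns a port as soon as a socket is created, and homa_disconnect
only logged a warning and returned -EINVAL; see the deleted bodies
below.)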
--- homa_impl.h | 2 -- homa_plumbing.c | 33 --------------------------------- 2 files changed, 35 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 4508a82b..5e98e4f2 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -655,7 +655,6 @@ void homa_close(struct sock *sock, long timeout); int homa_copy_to_user(struct homa_rpc *rpc); void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc); void homa_destroy(struct homa *homa); -int homa_disconnect(struct sock *sk, int flags); void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa); int homa_err_handler_v4(struct sk_buff *skb, u32 info); int homa_err_handler_v6(struct sk_buff *skb, @@ -665,7 +664,6 @@ int homa_fill_data_interleaved(struct homa_rpc *rpc, struct sk_buff *skb, struct iov_iter *iter); struct homa_gap *homa_gap_alloc(struct list_head *next, int start, int end); void homa_gap_retry(struct homa_rpc *rpc); -int homa_get_port(struct sock *sk, unsigned short snum); int homa_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int homa_hash(struct sock *sk); diff --git a/homa_plumbing.c b/homa_plumbing.c index 98f3e227..e8436987 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -85,7 +85,6 @@ static struct proto homa_prot = { .owner = THIS_MODULE, .close = homa_close, .connect = ip4_datagram_connect, - .disconnect = homa_disconnect, .ioctl = homa_ioctl, .init = homa_socket, .destroy = NULL, @@ -95,7 +94,6 @@ static struct proto homa_prot = { .recvmsg = homa_recvmsg, .hash = homa_hash, .unhash = homa_unhash, - .get_port = homa_get_port, .obj_size = sizeof(struct homa_sock), .no_autobind = 1, }; @@ -105,7 +103,6 @@ static struct proto homav6_prot = { .owner = THIS_MODULE, .close = homa_close, .connect = ip6_datagram_connect, - .disconnect = homa_disconnect, .ioctl = homa_ioctl, .init = homa_socket, .destroy = NULL, @@ -115,7 +112,6 @@ static struct proto homav6_prot = { .recvmsg = homa_recvmsg, .hash = homa_hash, .unhash = homa_unhash, - .get_port = homa_get_port, .obj_size = sizeof(struct homa_v6_sock), .ipv6_pinfo_offset = offsetof(struct homa_v6_sock, inet6), @@ -694,20 +690,6 @@ int homa_shutdown(struct socket *sock, int how) return 0; } -/** - * homa_disconnect() - Invoked when disconnect system call is invoked on a - * Homa socket. - * @sk: Socket to disconnect - * @flags: ?? - * - * Return: 0 on success, otherwise a negative errno. - */ -int homa_disconnect(struct sock *sk, int flags) -{ - pr_warn("unimplemented disconnect invoked on Homa socket\n"); - return -EINVAL; -} - #ifndef __STRIP__ /* See strip.py */ /** * homa_ioc_abort() - The top-level function for the ioctl that implements @@ -1267,21 +1249,6 @@ void homa_unhash(struct sock *sk) { } -/** - * homa_get_port() - It appears that this function is called to assign a - * default port for a socket. - * @sk: Socket for the operation - * @snum: Unclear what this is. - * Return: Zero for success, or a negative errno for an error. - */ -int homa_get_port(struct sock *sk, unsigned short snum) -{ - /* Homa always assigns ports immediately when a socket is created, - * so there is nothing to do here. - */ - return 0; -} - /** * homa_softirq() - This function is invoked at SoftIRQ level to handle * incoming packets. 
From f3429433f8afee2f5ecf46b8bf7930b1d60dc477 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Wed, 7 May 2025 15:24:17 -0700
Subject: [PATCH 302/625] Create strip_decl.py

---
 Makefile           |   5 +-
 homa_impl.h        |   3 +-
 util/strip_decl.py | 135 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 140 insertions(+), 3 deletions(-)
 create mode 100755 util/strip_decl.py

diff --git a/Makefile b/Makefile
index 616a9373..05075d31 100644
--- a/Makefile
+++ b/Makefile
@@ -67,7 +67,8 @@ CP_SRCS := $(patsubst %.o,%.c,$(filter-out homa_devel.o timetrace.o, $(HOMA_OBJS
 CP_EXTRAS := reap.txt \
 	sync.txt \
 	Kconfig \
-	Makefile
+	Makefile \
+	strip_decl.py
 CP_TARGETS := $(patsubst %,$(HOMA_TARGET)/%,$(CP_HDRS) $(CP_SRCS) $(CP_EXTRAS))
 net-next: $(CP_TARGETS) $(LINUX_SRC_DIR)/include/uapi/linux/homa.h
 $(HOMA_TARGET)/%: % util/strip.py
@@ -76,6 +77,8 @@ $(HOMA_TARGET)/%.txt: %.txt
 	cp $< $@
 $(HOMA_TARGET)/Makefile: Makefile.upstream
 	cp $< $@
+$(HOMA_TARGET)/strip_decl.py: util/strip_decl.py
+	cp $< $@
 $(LINUX_SRC_DIR)/include/uapi/linux/homa.h: homa.h util/strip.py
 	util/strip.py $< > $@
 
diff --git a/homa_impl.h b/homa_impl.h
index 5e98e4f2..25b35f0c 100644
--- a/homa_impl.h
+++ b/homa_impl.h
@@ -701,8 +701,7 @@ void homa_unhash(struct sock *sk);
 void homa_rpc_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc);
 void homa_unload(void);
 int homa_wait_private(struct homa_rpc *rpc, int nonblocking);
-struct homa_rpc
-	*homa_wait_shared(struct homa_sock *hsk, int nonblocking);
+struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking);
 int homa_xmit_control(enum homa_packet_type type, void *contents,
 		      size_t length, struct homa_rpc *rpc);
 int __homa_xmit_control(void *contents, size_t length,
diff --git a/util/strip_decl.py b/util/strip_decl.py
new file mode 100755
index 00000000..f6c3c8b5
--- /dev/null
+++ b/util/strip_decl.py
@@ -0,0 +1,135 @@
+#!/usr/bin/python3
+
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+This script is used to make a copy of homa_impl.h that selectively omits
+certain function declarations, depending on which patch in a patch series
+is being generated.
+
+Usage: strip_decl.py src dst patch
+
+Src gives the file to read, dst names the file to (over)write, and patch
+identifies the specific patch that is being generated (it must be one
+of the initial values in a sublist of symbols below).
+"""
+
+from collections import defaultdict
+from glob import glob
+import math
+import os
+import re
+import string
+import sys
+
+# Each list element is a list containing a patch name followed by any number
+# of line prefixes. The lists are in patch order: a line will be excluded
+# from the output file if it starts with one of the prefixes for a patch
+# *after* the one specified on the command line. The "none" patch includes
+# no symbols, "all" includes all symbols. 
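+#
+# Hypothetical example: "strip_decl.py homa_impl.h out.h utils" keeps the
+# declarations listed under 'outgoing' and 'utils', but omits those listed
+# under 'incoming', 'timer', and 'plumbing', since those patches come
+# after 'utils' in the series.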
+symbols = [ + ['none'], + ['outgoing', + 'int homa_fill_data_interleaved(', + 'int homa_message_out_fill(', + 'void homa_message_out_init(', + 'struct sk_buff *homa_tx_data_pkt_alloc(', + 'int homa_xmit_control(', + 'int __homa_xmit_control(', + 'void homa_xmit_data(', + 'void __homa_xmit_data(', + 'void homa_xmit_unknown(' + ], + ['utils', + 'void homa_destroy(', + 'int homa_init(', + 'void homa_spin(' + ], + ['incoming', + 'void homa_ack_pkt(', + 'void homa_add_packet(', + 'int homa_copy_to_user(', + 'void homa_data_pkt(', + 'void homa_dispatch_pkts(', + 'struct homa_gap *homa_gap_alloc(', + 'void homa_gap_retry(', + 'int homa_message_in_init(', + 'void homa_need_ack_pkt(', + 'void homa_resend_data(', + 'void homa_resend_pkt(', + 'void homa_rpc_handoff(', + 'void homa_rpc_unknown_pkt(', + 'int homa_wait_private(', + 'struct homa_rpc *homa_wait_shared(' + ], + ['timer', + 'void homa_timer(', + 'void homa_timer_check_rpc(', + 'int homa_timer_main(' + ], + ['plumbing', + 'int homa_bind(', + 'void homa_close(', + 'int homa_err_handler_v4(', + 'int homa_err_handler_v6(', + 'int homa_getsockopt(', + 'int homa_hash(', + 'enum hrtimer_restart homa_hrtimer(', + 'int homa_ioctl(', + 'int homa_load(', + 'int homa_net_init(', + 'void homa_net_exit(', + '__poll_t homa_poll(', + 'int homa_recvmsg(', + 'int homa_sendmsg(', + 'int homa_setsockopt(', + 'int homa_shutdown(', + 'int homa_softirq(', + 'void homa_unhash(', + 'void homa_unload(' + ], + ['all'] +] + + +if len(sys.argv) != 4: + print('Usage: strip_decl.py src dst patch') + exit(1) + +src = open(sys.argv[1]) +dst = open(sys.argv[2], 'w') +patch_name = sys.argv[3] +skipping_to_semi = False +prev_line_empty = False +for line in src: + if skipping_to_semi: + if line.endswith(';\n'): + skipping_to_semi = False + continue + + found_patch = False + omit = False + for patch in symbols: + if found_patch: + for prefix in patch[1:]: + if line.startswith(prefix): + omit = True + break + if omit: + break + if patch_name == patch[0]: + found_patch = True + if omit: + if not line.endswith(';\n'): + skipping_to_semi = True + else: + if line == '\n': + prev_line_empty = True + else: + if prev_line_empty: + print('', file=dst,) + print(line, file=dst, end='') + prev_line_empty = False + +dst.close() +src.close() \ No newline at end of file From fa7ae207b1ddf741bbc6f54f8a0805154b2ae07c Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 8 May 2025 11:11:39 -0700 Subject: [PATCH 303/625] Add clock measurements to perf.txt --- perf.txt | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/perf.txt b/perf.txt index 55458fe8..339cb309 100644 --- a/perf.txt +++ b/perf.txt @@ -2,6 +2,17 @@ This file contains various notes and lessons learned concerning performance of the Homa Linux kernel module. The notes are in reverse chronological order. +59. (May 2025) Measured overhead to read various clocks on 2.4 GHz +Xeon E5-2640 (note: measured when CPU is active, hence running in fastest +mode): + +Function Units Overhead +----------------------------------------------- +rdtsc cycles 8 ns +sched_clock ns 9 ns +ktime_get_mono_fast_ns ns 24 ns +ktime_get_raw_fast_ns ns 24 ns + 58. (September 2024): Interference between Homa and TCP when both run concurrently on the same nodes (no special kernel code to mitigate interference) From ea9ec193706c1852908a12d68dcdcc14cbec5b49 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 9 May 2025 12:01:24 -0700 Subject: [PATCH 304/625] Fix bug in ttsync.py Crashed if there was incomplete freeze data. 
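
Specifically, find_min_delays_alt indexed recv_freeze with a node
number that had no entry whenever that node's freeze record was absent
from the traces (presumably raising a KeyError, assuming recv_freeze is
a plain dict keyed by node number). The fix below simply skips such
nodes; a minimal sketch of the guard:

    # Tolerate nodes with no freeze data instead of indexing blindly.
    if frecv_node not in recv_freeze:
        continue
    recv_time = recv_freeze[frecv_node][0]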
---
 util/ttsync.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/util/ttsync.py b/util/ttsync.py
index bb747049..6ad17f85 100755
--- a/util/ttsync.py
+++ b/util/ttsync.py
@@ -306,6 +306,8 @@ def find_min_delays_alt(num_nodes):
             if not recv_addr in addr_node_num:
                 continue
             frecv_node = addr_node_num[recv_addr]
+            if not frecv_node in recv_freeze:
+                continue
             recv_time = recv_freeze[frecv_node][0]
             freeze_delay = recv_time - send_time
             if freeze_delay < min_delays[fsend_node][frecv_node]:

From 76369b4979babfca3e6502124e9f3a76befa1537 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 9 May 2025 12:09:23 -0700
Subject: [PATCH 305/625] Introduce homa_clock for fine-grain clock

Previously used sched_clock, but this does not guarantee monotonicity
or consistency across cores. homa_clock() makes it easy to change the
clock source in the future; for now, use the Intel TSC counter when
available, ktime otherwise.
---
 homa_grant.c              |  16 ++---
 homa_grant.h              |  20 +++---
 homa_impl.h               | 122 ++++++++++++++++++++++++++++++---
 homa_incoming.c           |  19 +++---
 homa_interest.c           |  14 ++--
 homa_metrics.c            | 132 +++++++++++++++++++------------------
 homa_metrics.h            | 110 +++++++++++++++----------------
 homa_offload.c            |  30 ++++-----
 homa_offload.h            |  13 ++--
 homa_outgoing.c           |   2 +-
 homa_pacer.c              |  57 ++++++++--------
 homa_pacer.h              |  37 ++++++-----
 homa_peer.c               |  10 +--
 homa_peer.h               |   5 +-
 homa_plumbing.c           |  30 ++++-----
 homa_pool.c               |  12 ++--
 homa_pool.h               |   5 +-
 homa_rpc.c                |   4 +-
 homa_rpc.h                |  16 ++---
 homa_skb.c                |  18 +++---
 homa_sock.c               |  10 +--
 homa_timer.c              |  10 +--
 homa_utils.c              |   4 +-
 test/mock.c               |  59 +++++++----------
 test/mock.h               |   8 +--
 test/unit_homa_grant.c    |  26 ++++----
 test/unit_homa_incoming.c |  30 +++++----
 test/unit_homa_interest.c |  32 ++++-----
 test/unit_homa_offload.c  |  20 +++---
 test/unit_homa_outgoing.c |  10 +--
 test/unit_homa_pacer.c    | 100 +++++++++++++++--------------
 test/unit_homa_peer.c     |  20 +++---
 test/unit_homa_pool.c     |  24 +++----
 test/unit_homa_rpc.c      |  12 ++--
 test/unit_homa_skb.c      |  20 +++---
 test/unit_homa_sock.c     |   6 +-
 test/unit_timetrace.c     |  18 +++---
 timetrace.c               |   8 ++-
 timetrace.h               |  40 ++++++------
 util/metrics.py           |  95 +++++++++++++-------------
 40 files changed, 666 insertions(+), 558 deletions(-)

diff --git a/homa_grant.c b/homa_grant.c
index c7ed8b98..7972dcde 100644
--- a/homa_grant.c
+++ b/homa_grant.c
@@ -110,7 +110,7 @@ struct homa_grant *homa_grant_alloc(struct net *net)
 	}
 #endif /* See strip.py */
 	homa_grant_update_sysctl_deps(grant);
-	grant->next_recalc = sched_clock() + grant->recalc_ns;
+	grant->next_recalc = homa_clock() + grant->recalc_cycles;
 	return grant;
 
 error:
@@ -414,7 +414,7 @@ void homa_grant_manage_rpc(struct homa_rpc *rpc)
 {
 	struct homa_grant *grant = rpc->hsk->homa->grant;
 	struct homa_rpc *bumped;
-	u64 time = sched_clock();
+	u64 time = homa_clock();
 
 	BUG_ON(rpc->msgin.rank >= 0 || !list_empty(&rpc->grantable_links));
 
@@ -540,7 +540,7 @@ void homa_grant_unmanage_rpc(struct homa_rpc *rpc,
 	__must_hold(&rpc->bucket->lock)
 {
 	struct homa_grant *grant = rpc->hsk->homa->grant;
-	u64 time = sched_clock();
+	u64 time = homa_clock();
 
 	homa_grant_lock(grant);
 
@@ -696,7 +696,7 @@ void homa_grant_check_rpc(struct homa_rpc *rpc)
 	 * inversions that may have developed. The interval for these scans
 	 * is chosen so as not to create too much contention for the grant lock. 
 	 */
-	now = sched_clock();
+	now = homa_clock();
 	limit = atomic_xchg(&grant->incoming_hit_limit, false);
 	recalc = now >= READ_ONCE(grant->next_recalc);
 	if (!recalc && !limit) {
@@ -728,7 +728,7 @@ void homa_grant_check_rpc(struct homa_rpc *rpc)
 	homa_grant_lock(grant);
 
 	if (recalc) {
 		/* Case 4. */
-		grant->next_recalc = now + grant->recalc_ns;
+		grant->next_recalc = now + grant->recalc_cycles;
 		homa_grant_fix_order(grant);
 	}
@@ -972,13 +972,13 @@ void homa_grant_cand_check(struct homa_grant_candidates *cand,
 void homa_grant_lock_slow(struct homa_grant *grant)
 	__acquires(&grant->lock)
 {
-	u64 start = sched_clock();
+	u64 start = homa_clock();
 
 	tt_record("beginning wait for grant lock");
 	spin_lock_bh(&grant->lock);
 	tt_record("ending wait for grant lock");
 	INC_METRIC(grant_lock_misses, 1);
-	INC_METRIC(grant_lock_miss_ns, sched_clock() - start);
+	INC_METRIC(grant_lock_miss_cycles, homa_clock() - start);
 }
 
 /**
@@ -1001,7 +1001,7 @@ void homa_grant_update_sysctl_deps(struct homa_grant *grant)
 			grant->fifo_grant_increment;
 	grant->grant_nonfifo = tmp;
 
-	grant->recalc_ns = grant->recalc_usecs * 1000;
+	grant->recalc_cycles = homa_usecs_to_cycles(grant->recalc_usecs);
 
 	grant->window = homa_grant_window(grant);
 }
diff --git a/homa_grant.h b/homa_grant.h
index 37053252..d645f568 100644
--- a/homa_grant.h
+++ b/homa_grant.h
@@ -51,7 +51,7 @@ struct homa_grant {
 	spinlock_t lock ____cacheline_aligned_in_smp;
 
 	/**
-	 * @lock_time: sched_clock() time when lock was last locked. Used
+	 * @lock_time: homa_clock() time when lock was last locked. Used
 	 * for computing statistics.
 	 */
 	u64 lock_time;
@@ -85,7 +85,7 @@ struct homa_grant {
 	int num_grantable_rpcs;
 
 	/**
-	 * @last_grantable_change: The sched_clock() time of the most recent
+	 * @last_grantable_change: The homa_clock() time of the most recent
 	 * increment or decrement of num_grantable_rpcs; used for computing
 	 * statistics.
 	 */
@@ -134,12 +134,14 @@ struct homa_grant {
 	 */
 	int recalc_usecs;
 
-	/** @recalc_ns: Same as @recalc_usec except in units of nanoseconds. */
-	int recalc_ns;
+	/**
+	 * @recalc_cycles: Same as @recalc_usecs except in homa_clock() units.
+	 */
+	int recalc_cycles;
 
 	/**
-	 * @next_recalc: Time (in sched_clock() nanoseconds) at which
-	 * priorities should be recalculated.
+	 * @next_recalc: Time in homa_clock() units at which priorities
+	 * should be recalculated.
 	 */
 	u64 next_recalc;
@@ -171,7 +173,7 @@ struct homa_grant {
 	int grant_nonfifo_left;
 
 	/**
-	 * @oldest_rpc: The RPC with incoming data whose start_ns is
+	 * @oldest_rpc: The RPC with incoming data whose start_cycles is
 	 * farthest in the past). NULL means either there are no incoming
 	 * RPCs or the oldest needs to be recomputed. Must hold grant_lock
 	 * to update. 
@@ -288,7 +290,7 @@ static inline void homa_grant_lock(struct homa_grant *grant)
 {
 	if (!spin_trylock_bh(&grant->lock))
 		homa_grant_lock_slow(grant);
-	grant->lock_time = sched_clock();
+	grant->lock_time = homa_clock();
 }
 
 /**
@@ -298,7 +300,7 @@ static inline void homa_grant_lock(struct homa_grant *grant)
 static inline void homa_grant_unlock(struct homa_grant *grant)
 	__releases(&grant->grant_lock)
 {
-	INC_METRIC(grant_lock_ns, sched_clock() - grant->lock_time);
+	INC_METRIC(grant_lock_cycles, homa_clock() - grant->lock_time);
 	spin_unlock_bh(&grant->lock);
 }
 
diff --git a/homa_impl.h b/homa_impl.h
index 25b35f0c..e831eb25 100644
--- a/homa_impl.h
+++ b/homa_impl.h
@@ -31,7 +31,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -184,8 +183,8 @@ struct homa {
 	int pages_to_free_slots;
 
 	/**
-	 * @skb_page_free_time: Time (in sched_clock() units) when the
-	 * next sk_buff page should be freed. Could be in the past.
+	 * @skb_page_free_time: homa_clock() time when the next sk_buff
+	 * page should be freed. Could be in the past.
 	 */
 	u64 skb_page_free_time;
 
@@ -217,6 +216,9 @@ struct homa {
 	 */
 	int poll_usecs;
 
+	/** @poll_cycles: Same as poll_usecs except in homa_clock() units. */
+	u64 poll_cycles;
+
 	/**
 	 * @num_priorities: The total number of priority levels available for
 	 * Homa's use. Internally, Homa will use priorities from 0 to
@@ -404,8 +406,8 @@ struct homa {
 	 */
 	int busy_usecs;
 
-	/** @busy_ns: Same as busy_usecs except in sched_clock() units. */
-	int busy_ns;
+	/** @busy_cycles: Same as busy_usecs except in homa_clock() units. */
+	int busy_cycles;
 
 	/**
 	 * @gro_busy_usecs: if the gap between the completion of
@@ -417,8 +419,10 @@ struct homa {
 	 */
 	int gro_busy_usecs;
 
-	/** @gro_busy_ns: Same as busy_usecs except in sched_clock() units. */
-	int gro_busy_ns;
+	/**
+	 * @gro_busy_cycles: Same as @gro_busy_usecs except in homa_clock()
+	 * units.
+	 */
+	int gro_busy_cycles;
 #endif /* See strip.py */
 
 	/**
@@ -447,6 +451,12 @@ struct homa {
 	 */
 	int bpage_lease_usecs;
 
+	/**
+	 * @bpage_lease_cycles: Same as bpage_lease_usecs except in
+	 * homa_clock() units.
+	 */
+	int bpage_lease_cycles;
+
 	/**
 	 * @next_id: Set via sysctl; causes next_outgoing_id to be set to
 	 * this value; always reads as zero. Typically used while debugging to
@@ -739,7 +749,7 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc);
  * homa_from_net() - Return the struct homa associated with a particular
  * struct net.
  * @net: Get the struct homa for this net namespace.
- * Return: see above
+ * Return: see above.
  */
 static inline struct homa *homa_from_net(struct net *net)
 {
@@ -750,7 +760,7 @@ static inline struct homa *homa_from_net(struct net *net)
  * homa_from_sock() - Return the struct homa associated with a particular
  * struct sock.
  * @sock: Get the struct homa for this socket.
- * Return: see above
+ * Return: see above.
  */
 static inline struct homa *homa_from_sock(struct sock *sock)
 {
@@ -761,11 +771,103 @@ static inline struct homa *homa_from_sock(struct sock *sock)
  * homa_from_skb() - Return the struct homa associated with a particular
  * sk_buff.
  * @skb: Get the struct homa for this packet buffer.
- * Return: see above
+ * Return: see above.
  */
 static inline struct homa *homa_from_skb(struct sk_buff *skb)
 {
 	return (struct homa *)net_generic(dev_net(skb->dev), homa_net_id);
 }
 
+/**
+ * homa_clock() - Return a fine-grain clock value that is monotonic and
+ * consistent across cores.
+ * Return: see above. 
+ */
+static inline u64 homa_clock(void)
+{
+	/* As of May 2025 there does not appear to be a portable API that
+	 * meets Homa's needs:
+	 * - The Intel X86 TSC works well but is not portable.
+	 * - sched_clock() does not guarantee monotonicity or consistency
+	 * - ktime_get_mono_fast_ns and ktime_get_raw_fast_ns are very slow
+	 *   (27 ns to read, vs 8 ns for TSC).
+	 * Thus we use a hybrid approach that uses TSC (via get_cycles) where
+	 * available (which should be just about everywhere Homa runs).
+	 */
+#ifdef __UNIT_TEST__
+	u64 mock_get_clock(void);
+	return mock_get_clock();
+#else /* __UNIT_TEST__ */
+#ifdef CONFIG_X86_TSC
+	return get_cycles();
+#else
+	return ktime_get_mono_fast_ns();
+#endif /* CONFIG_X86_TSC */
+#endif /* __UNIT_TEST__ */
+}
+
+/**
+ * homa_clock_khz() - Return the frequency of the values returned by
+ * homa_clock(), in units of KHz.
+ * Return: see above.
+ */
+static inline u64 homa_clock_khz(void)
+{
+#ifdef __UNIT_TEST__
+	return 1000000;
+#else /* __UNIT_TEST__ */
+#ifdef CONFIG_X86_TSC
+	return cpu_khz;
+#else
+	return 1000000;
+#endif /* CONFIG_X86_TSC */
+#endif /* __UNIT_TEST__ */
+}
+
+/**
+ * homa_ns_to_cycles() - Convert from units of nanoseconds to units of
+ * homa_clock().
+ * @ns: A time measurement in nanoseconds.
+ * Return: The time in homa_clock() units corresponding to @ns.
+ */
+static inline u64 homa_ns_to_cycles(u64 ns)
+{
+#ifdef __UNIT_TEST__
+	return ns;
+#else /* __UNIT_TEST__ */
+#ifdef CONFIG_X86_TSC
+	u64 tmp;
+
+	tmp = ns * cpu_khz;
+	do_div(tmp, 1000000);
+	return tmp;
+#else
+	return ns;
+#endif /* CONFIG_X86_TSC */
+#endif /* __UNIT_TEST__ */
+}
+
+/**
+ * homa_usecs_to_cycles() - Convert from units of microseconds to units of
+ * homa_clock().
+ * @usecs: A time measurement in microseconds.
+ * Return: The time in homa_clock() units corresponding to @usecs.
+ */
+static inline u64 homa_usecs_to_cycles(u64 usecs)
+{
+#ifdef __UNIT_TEST__
+	return usecs * 1000;
+#else /* __UNIT_TEST__ */
+#ifdef CONFIG_X86_TSC
+	u64 tmp;
+
+	tmp = usecs * cpu_khz;
+	do_div(tmp, 1000);
+	return tmp;
+#else
+	return usecs * 1000;
+#endif /* CONFIG_X86_TSC */
+#endif /* __UNIT_TEST__ */
+}
+
 #endif /* _HOMA_IMPL_H */
diff --git a/homa_incoming.c b/homa_incoming.c
index 890f6947..38511c93 100644
--- a/homa_incoming.c
+++ b/homa_incoming.c
@@ -87,7 +87,7 @@ struct homa_gap *homa_gap_alloc(struct list_head *next, int start, int end)
 		return NULL;
 	gap->start = start;
 	gap->end = end;
-	gap->time = sched_clock();
+	gap->time = homa_clock();
 	list_add_tail(&gap->links, next);
 	return gap;
 }
@@ -359,11 +359,11 @@ int homa_copy_to_user(struct homa_rpc *rpc)
 	}
 #endif /* See strip.py */
 #ifndef __STRIP__ /* See strip.py */
-	start = sched_clock();
+	start = homa_clock();
 #endif /* See strip.py */
 	for (i = 0; i < n; i++)
 		kfree_skb(skbs[i]);
-	INC_METRIC(skb_free_ns, sched_clock() - start);
+	INC_METRIC(skb_free_cycles, homa_clock() - start);
 	INC_METRIC(skb_frees, n);
 	tt_record2("finished freeing %d skbs for id %d",
 		   n, rpc->id);
@@ -598,12 +598,12 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa)
 		 * RPCs. See reap.txt for details. 
*/ #ifndef __STRIP__ /* See strip.py */ - u64 start = sched_clock(); + u64 start = homa_clock(); #endif /* See strip.py */ tt_record("homa_data_pkt calling homa_rpc_reap"); homa_rpc_reap(hsk, false); - INC_METRIC(data_pkt_reap_ns, sched_clock() - start); + INC_METRIC(data_pkt_reap_cycles, homa_clock() - start); } sock_put(&hsk->sock); } @@ -1238,7 +1238,7 @@ void homa_rpc_handoff(struct homa_rpc *rpc) * Homa will try to avoid assigning any work there. */ per_cpu(homa_offload_core, interest->core).last_app_active = - sched_clock(); + homa_clock(); #endif /* See strip.py */ } else if (list_empty(&rpc->ready_links)) { list_add_tail(&rpc->ready_links, &hsk->ready_rpcs); @@ -1257,7 +1257,10 @@ void homa_rpc_handoff(struct homa_rpc *rpc) */ void homa_incoming_sysctl_changed(struct homa *homa) { - homa->busy_ns = homa->busy_usecs * 1000; - homa->gro_busy_ns = homa->gro_busy_usecs * 1000; + homa->poll_cycles = homa_usecs_to_cycles(homa->poll_usecs); + homa->busy_cycles = homa_usecs_to_cycles(homa->busy_usecs); + homa->gro_busy_cycles = homa_usecs_to_cycles(homa->gro_busy_usecs); + homa->bpage_lease_cycles = + homa_usecs_to_cycles(homa->bpage_lease_usecs); } #endif /* See strip.py */ diff --git a/homa_interest.c b/homa_interest.c index b75e2c0b..caf2a3a1 100644 --- a/homa_interest.c +++ b/homa_interest.c @@ -79,7 +79,7 @@ int homa_interest_wait(struct homa_interest *interest, int nonblocking) #ifndef __STRIP__ /* See strip.py */ u64 start, block_start, blocked_time, now; - start = sched_clock(); + start = homa_clock(); blocked_time = 0; #endif /* See strip.py */ interest->blocked = 0; @@ -103,10 +103,10 @@ int homa_interest_wait(struct homa_interest *interest, int nonblocking) } #ifndef __STRIP__ /* See strip.py */ - now = sched_clock(); + now = homa_clock(); per_cpu(homa_offload_core, raw_smp_processor_id()).last_app_active = now; - if (now - start >= 1000 * hsk->homa->poll_usecs) + if (now - start >= hsk->homa->poll_cycles) break; #else /* See strip.py */ break; @@ -117,15 +117,15 @@ int homa_interest_wait(struct homa_interest *interest, int nonblocking) IF_NO_STRIP(block_start = now); wait_err = wait_event_interruptible_exclusive(interest->wait_queue, atomic_read_acquire(&interest->ready) != 0); - IF_NO_STRIP(blocked_time = sched_clock() - block_start); + IF_NO_STRIP(blocked_time = homa_clock() - block_start); if (wait_err == -ERESTARTSYS) result = -EINTR; done: #ifndef __STRIP__ /* See strip.py */ if (interest->blocked) - INC_METRIC(blocked_ns, blocked_time); - INC_METRIC(poll_ns, sched_clock() - start - blocked_time); + INC_METRIC(blocked_cycles, blocked_time); + INC_METRIC(poll_cycles, homa_clock() - start - blocked_time); #endif /* See strip.py */ return result; } @@ -158,7 +158,7 @@ void homa_interest_notify_private(struct homa_rpc *rpc) struct homa_interest *homa_choose_interest(struct homa_sock *hsk) __must_hold(&hsk->lock) { - u64 busy_time = sched_clock() - hsk->homa->busy_ns; + u64 busy_time = homa_clock() - hsk->homa->busy_cycles; struct homa_interest *interest, *first; first = list_first_entry(&hsk->interests, struct homa_interest, diff --git a/homa_metrics.c b/homa_metrics.c index 17db997b..55cabe56 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -116,13 +116,15 @@ char *homa_metrics_print(void) homa_mout.length = 0; #define M(...) 
homa_metric_append(__VA_ARGS__) - M("time_ns %20llu sched_clock() time when metrics were gathered\n", - sched_clock()); + M("time_cycles %20llu homa_clock() time when metrics were gathered\n", + homa_clock()); + M("cpu_khz %15llu Clock rate in khz\n", + homa_clock_khz()); for (core = 0; core < nr_cpu_ids; core++) { struct homa_metrics *m = &per_cpu(homa_metrics, core); s64 delta; - M("core %15d Core id for following metrics\n", + M("core %15d Core id for following metrics\n", core); for (i = 0; i < HOMA_NUM_SMALL_COUNTS; i++) { M("msg_bytes_%-9d %15llu Bytes in incoming messages containing %d-%d bytes\n", @@ -141,7 +143,7 @@ char *homa_metrics_print(void) m->large_msg_count, lower); M("large_msg_bytes %15llu Bytes in incoming messages >= %d bytes\n", m->large_msg_bytes, lower); - M("sent_msg_bytes %15llu otal bytes in all outgoing messages\n", + M("sent_msg_bytes %15llu Total bytes in all outgoing messages\n", m->sent_msg_bytes); for (i = DATA; i <= MAX_OP; i++) { char *symbol = homa_symbol_for_type(i); @@ -165,16 +167,16 @@ char *homa_metrics_print(void) } M("skb_allocs %15llu sk_buffs allocated\n", m->skb_allocs); - M("skb_alloc_ns %15llu Time spent allocating sk_buffs\n", - m->skb_alloc_ns); + M("skb_alloc_cycles %15llu Time spent allocating sk_buffs\n", + m->skb_alloc_cycles); M("skb_frees %15llu Data sk_buffs freed in normal paths\n", m->skb_frees); - M("skb_free_ns %15llu Time spent freeing data sk_buffs\n", - m->skb_free_ns); + M("skb_free_cycles %15llu Time spent freeing data sk_buffs\n", + m->skb_free_cycles); M("skb_page_allocs %15llu Pages allocated for sk_buff frags\n", m->skb_page_allocs); - M("skb_page_alloc_ns %15llu Time spent allocating pages for sk_buff frags\n", - m->skb_page_alloc_ns); + M("skb_page_alloc_cycles %15llu Time spent allocating pages for sk_buff frags\n", + m->skb_page_alloc_cycles); M("requests_received %15llu Incoming request messages\n", m->requests_received); M("responses_received %15llu Incoming response messages\n", @@ -189,74 +191,74 @@ char *homa_metrics_print(void) m->handoffs_thread_waiting); M("handoffs_alt_thread %15llu RPC handoffs not to first on list (avoid busy core)\n", m->handoffs_alt_thread); - M("poll_ns %15llu Time spent polling for incoming messages\n", - m->poll_ns); + M("poll_cycles %15llu Time spent polling for incoming messages\n", + m->poll_cycles); M("softirq_calls %15llu Calls to homa_softirq (i.e. 
# GRO pkts received)\n",
 	  m->softirq_calls);
-	M("softirq_ns              %15llu  Time spent in homa_softirq during SoftIRQ\n",
-	  m->softirq_ns);
-	M("bypass_softirq_ns       %15llu  Time spent in homa_softirq during bypass from GRO\n",
-	  m->bypass_softirq_ns);
-	M("linux_softirq_ns        %15llu  Time spent in all Linux SoftIRQ\n",
-	  m->linux_softirq_ns);
-	M("napi_ns                 %15llu  Time spent in NAPI-level packet handling\n",
-	  m->napi_ns);
-	M("send_ns                 %15llu  Time spent in homa_sendmsg for requests\n",
-	  m->send_ns);
+	M("softirq_cycles          %15llu  Time spent in homa_softirq during SoftIRQ\n",
+	  m->softirq_cycles);
+	M("bypass_softirq_cycles   %15llu  Time spent in homa_softirq during bypass from GRO\n",
+	  m->bypass_softirq_cycles);
+	M("linux_softirq_cycles    %15llu  Time spent in all Linux SoftIRQ\n",
+	  m->linux_softirq_cycles);
+	M("napi_cycles             %15llu  Time spent in NAPI-level packet handling\n",
+	  m->napi_cycles);
+	M("send_cycles             %15llu  Time spent in homa_sendmsg for requests\n",
+	  m->send_cycles);
 	M("send_calls              %15llu  Total invocations of homa_sendmsg for equests\n",
 	  m->send_calls);
 	// It is possible for us to get here at a time when a
 	// thread has been blocked for a long time and has
-	// recorded blocked_ns, but hasn't finished the
-	// system call so recv_nss hasn't been incremented
+	// recorded blocked_cycles, but hasn't finished the
+	// system call so recv_cycles hasn't been incremented
 	// yet. If that happens, just record 0 to prevent
 	// underflow errors.
-	delta = m->recv_ns - m->blocked_ns;
+	delta = m->recv_cycles - m->blocked_cycles;
 	if (delta < 0)
 		delta = 0;
-	M("recv_ns                 %15llu  Unblocked time spent in recvmsg kernel call\n",
+	M("recv_cycles             %15llu  Unblocked time spent in recvmsg kernel call\n",
 	  delta);
 	M("recv_calls              %15llu  Total invocations of recvmsg kernel call\n",
 	  m->recv_calls);
-	M("blocked_ns              %15llu  Time spent blocked in homa_recvmsg\n",
-	  m->blocked_ns);
-	M("reply_ns                %15llu  Time spent in homa_sendmsg for responses\n",
-	  m->reply_ns);
+	M("blocked_cycles          %15llu  Time spent blocked in homa_recvmsg\n",
+	  m->blocked_cycles);
+	M("reply_cycles            %15llu  Time spent in homa_sendmsg for responses\n",
+	  m->reply_cycles);
 	M("reply_calls             %15llu  Total invocations of homa_sendmsg for responses\n",
 	  m->reply_calls);
-	M("abort_ns                %15llu  Time spent in homa_ioc_abort kernel call\n",
-	  m->reply_ns);
+	M("abort_cycles            %15llu  Time spent in homa_ioc_abort kernel call\n",
+	  m->abort_cycles);
 	M("abort_calls             %15llu  Total invocations of abort kernel call\n",
 	  m->reply_calls);
-	M("so_set_buf_ns           %15llu  Time spent in setsockopt SO_HOMA_RCVBUF\n",
-	  m->so_set_buf_ns);
+	M("so_set_buf_cycles       %15llu  Time spent in setsockopt SO_HOMA_RCVBUF\n",
+	  m->so_set_buf_cycles);
 	M("so_set_buf_calls        %15llu  Total invocations of setsockopt SO_HOMA_RCVBUF\n",
 	  m->so_set_buf_calls);
-	M("grant_lock_ns           %15llu  Time spent with grant lock locked\n",
-	  m->grant_lock_ns);
-	M("timer_ns                %15llu  Time spent in homa_timer\n",
-	  m->timer_ns);
-	M("timer_reap_ns           %15llu  Time in homa_timer spent reaping RPCs\n",
-	  m->timer_reap_ns);
-	M("data_pkt_reap_ns        %15llu  Time in homa_data_pkt spent reaping RPCs\n",
-	  m->data_pkt_reap_ns);
-	M("pacer_ns                %15llu  Time spent in homa_pacer_main\n",
-	  m->pacer_ns);
-	M("homa_ns                 %15llu  Total time in all Homa-related functions\n",
-	  m->softirq_ns + m->napi_ns +
-	  m->send_ns + m->recv_ns +
-	  m->reply_ns - m->blocked_ns +
-	  m->timer_ns + m->pacer_ns);
-	M("pacer_lost_ns           %15llu  Lost transmission time because pacer was slow\n",
-	  m->pacer_lost_ns);
+	M("grant_lock_cycles       %15llu  Time spent with grant lock locked\n",
+	  m->grant_lock_cycles);
+	
M("timer_cycles %15llu Time spent in homa_timer\n", + m->timer_cycles); + M("timer_reap_cycles %15llu Time in homa_timer spent reaping RPCs\n", + m->timer_reap_cycles); + M("data_pkt_reap_cycles %15llu Time in homa_data_pkt spent reaping RPCs\n", + m->data_pkt_reap_cycles); + M("pacer_cycles %15llu Time spent in homa_pacer_main\n", + m->pacer_cycles); + M("homa_cycles %15llu Total time in all Homa-related functions\n", + m->softirq_cycles + m->napi_cycles + + m->send_cycles + m->recv_cycles + + m->reply_cycles - m->blocked_cycles + + m->timer_cycles + m->pacer_cycles); + M("pacer_lost_cycles %15llu Lost transmission time because pacer was slow\n", + m->pacer_lost_cycles); M("pacer_bytes %15llu Bytes transmitted when the pacer was active\n", m->pacer_bytes); M("pacer_skipped_rpcs %15llu Pacer aborts because of locked RPCs\n", m->pacer_skipped_rpcs); M("pacer_needed_help %15llu homa_pacer_xmit invocations from homa_check_pacer\n", m->pacer_needed_help); - M("throttled_ns %15llu Time when the throttled queue was nonempty\n", - m->throttled_ns); + M("throttled_cycles %15llu Time when the throttled queue was nonempty\n", + m->throttled_cycles); M("resent_packets %15llu DATA packets sent in response to RESENDs\n", m->resent_packets); M("peer_hash_links %15llu Hash chain link traversals in peer table\n", @@ -293,28 +295,28 @@ char *homa_metrics_print(void) m->server_rpcs_unknown); M("client_lock_misses %15llu Bucket lock misses for client RPCs\n", m->client_lock_misses); - M("client_lock_miss_ns %15llu Time lost waiting for client bucket locks\n", - m->client_lock_miss_ns); + M("client_lock_miss_cycles %15llu Time lost waiting for client bucket locks\n", + m->client_lock_miss_cycles); M("server_lock_misses %15llu Bucket lock misses for server RPCs\n", m->server_lock_misses); - M("server_lock_miss_ns %15llu Time lost waiting for server bucket locks\n", - m->server_lock_miss_ns); + M("server_lock_miss_cycles %15llu Time lost waiting for server bucket locks\n", + m->server_lock_miss_cycles); M("socket_lock_misses %15llu Socket lock misses\n", m->socket_lock_misses); - M("socket_lock_miss_ns %15llu Time lost waiting for socket locks\n", - m->socket_lock_miss_ns); + M("socket_lock_miss_cycles %15llu Time lost waiting for socket locks\n", + m->socket_lock_miss_cycles); M("throttle_lock_misses %15llu Throttle lock misses\n", m->throttle_lock_misses); - M("throttle_lock_miss_ns %15llu Time lost waiting for throttle locks\n", - m->throttle_lock_miss_ns); + M("throttle_lock_miss_cycles %15llu Time lost waiting for throttle locks\n", + m->throttle_lock_miss_cycles); M("peer_ack_lock_misses %15llu Misses on peer ack locks\n", m->peer_ack_lock_misses); - M("peer_ack_lock_miss_ns %15llu Time lost waiting for peer ack locks\n", - m->peer_ack_lock_miss_ns); + M("peer_ack_lock_miss_cycles %15llu Time lost waiting for peer ack locks\n", + m->peer_ack_lock_miss_cycles); M("grant_lock_misses %15llu Grant lock misses\n", m->grant_lock_misses); - M("grant_lock_miss_ns %15llu Time lost waiting for grant lock\n", - m->grant_lock_miss_ns); + M("grant_lock_miss_cycles %15llu Time lost waiting for grant lock\n", + m->grant_lock_miss_cycles); M("grantable_rpcs_integral %15llu Integral of homa->num_grantable_rpcs*dt\n", m->grantable_rpcs_integral); M("grant_check_calls %15llu Number of calls to homa_grant_check_rpc\n", diff --git a/homa_metrics.h b/homa_metrics.h index eab9ae58..8c9f42bf 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -78,8 +78,8 @@ struct homa_metrics { */ u64 skb_allocs; - /** @skb_alloc_ns: total time 
spent in homa_skb_alloc_tx. */ - u64 skb_alloc_ns; + /** @skb_alloc_cycles: total time spent in homa_skb_alloc_tx. */ + u64 skb_alloc_cycles; /** * @skb_frees: total number of sk_buffs for data packets that have @@ -87,16 +87,16 @@ struct homa_metrics { */ u64 skb_frees; - /** @skb_free_ns: total time spent freeing sk_buffs. */ - u64 skb_free_ns; + /** @skb_free_cycles: total time spent freeing sk_buffs. */ + u64 skb_free_cycles; /** * @skb_page_allocs: total number of calls to homa_skb_page_alloc. */ u64 skb_page_allocs; - /** @skb_page_alloc_ns: total time spent in homa_skb_page_alloc. */ - u64 skb_page_alloc_ns; + /** @skb_page_alloc_cycles: total time spent in homa_skb_page_alloc. */ + u64 skb_page_alloc_cycles; /** * @requests_received: total number of request messages received. @@ -141,10 +141,10 @@ struct homa_metrics { u64 handoffs_alt_thread; /** - * @poll_ns: total time spent in the polling loop in + * @poll_cycles: total time spent in the polling loop in * homa_wait_for_message. */ - u64 poll_ns; + u64 poll_cycles; /** * @softirq_calls: total number of calls to homa_softirq (i.e., @@ -154,36 +154,36 @@ struct homa_metrics { u64 softirq_calls; /** - * @softirq_ns: total time spent executing homa_softirq when + * @softirq_cycles: total time spent executing homa_softirq when * invoked under Linux's SoftIRQ handler. */ - u64 softirq_ns; + u64 softirq_cycles; /** - * @bypass_softirq_ns: total time spent executing homa_softirq when + * @bypass_softirq_cycles: total time spent executing homa_softirq when * invoked during GRO, bypassing the SoftIRQ mechanism. */ - u64 bypass_softirq_ns; + u64 bypass_softirq_cycles; /** - * @linux_softirq_ns: total time spent executing all softirq + * @linux_softirq_cycles: total time spent executing all softirq * activities, as measured by the linux softirq module. Only * available with modified Linux kernels. */ - u64 linux_softirq_ns; + u64 linux_softirq_cycles; /** - * @napi_ns: total time spent executing all NAPI activities, as + * @napi_cycles: total time spent executing all NAPI activities, as * measured by the linux softirq module. Only available with modified * Linux kernels. */ - u64 napi_ns; + u64 napi_cycles; /** - * @send_ns: total time spent executing the homa_sendmsg kernel + * @send_cycles: total time spent executing the homa_sendmsg kernel * call handler to send requests. */ - u64 send_ns; + u64 send_cycles; /** * @send_calls: total number of invocations of homa_semdmsg @@ -192,25 +192,25 @@ struct homa_metrics { u64 send_calls; /** - * @recv_ns: total time spent executing homa_recvmsg (including + * @recv_cycles: total time spent executing homa_recvmsg (including * time when the thread is blocked). */ - u64 recv_ns; + u64 recv_cycles; /** @recv_calls: total number of invocations of homa_recvmsg. */ u64 recv_calls; /** - * @blocked_ns: total time spent by threads in blocked state + * @blocked_cycles: total time spent by threads in blocked state * while executing the homa_recvmsg kernel call handler. */ - u64 blocked_ns; + u64 blocked_cycles; /** - * @reply_ns: total time spent executing the homa_sendmsg kernel + * @reply_cycles: total time spent executing the homa_sendmsg kernel * call handler to send responses. */ - u64 reply_ns; + u64 reply_cycles; /** * @reply_calls: total number of invocations of homa_semdmsg @@ -219,10 +219,10 @@ struct homa_metrics { u64 reply_calls; /** - * @abort_ns: total time spent executing the homa_ioc_abort + * @abort_cycles: total time spent executing the homa_ioc_abort * kernel call handler. 
*/ - u64 abort_ns; + u64 abort_cycles; /** * @abort_calls: total number of invocations of the homa_ioc_abort @@ -231,10 +231,10 @@ struct homa_metrics { u64 abort_calls; /** - * @so_set_buf_ns: total time spent executing the homa_ioc_set_buf + * @so_set_buf_cycles: total time spent executing the homa_ioc_set_buf * kernel call handler. */ - u64 so_set_buf_ns; + u64 so_set_buf_cycles; /** * @so_set_buf_calls: total number of invocations of the homa_ioc_set_buf @@ -242,36 +242,36 @@ struct homa_metrics { */ u64 so_set_buf_calls; - /** @grant_lock_ns: total time spent with the grant lock locked. */ - u64 grant_lock_ns; + /** @grant_lock_cycles: total time spent with the grant lock locked. */ + u64 grant_lock_cycles; - /** @timer_ns: total time spent in homa_timer. */ - u64 timer_ns; + /** @timer_cycles: total time spent in homa_timer. */ + u64 timer_cycles; /** - * @timer_reap_ns: total time spent by homa_timer to reap dead - * RPCs. This time is included in @timer_ns. + * @timer_reap_cycles: total time spent by homa_timer to reap dead + * RPCs. This time is included in @timer_cycles. */ - u64 timer_reap_ns; + u64 timer_reap_cycles; /** - * @data_pkt_reap_ns: total time spent by homa_data_pkt to reap + * @data_pkt_reap_cycles: total time spent by homa_data_pkt to reap * dead RPCs. */ - u64 data_pkt_reap_ns; + u64 data_pkt_reap_cycles; /** - * @pacer_ns: total time spent executing in homa_pacer_main + * @pacer_cycles: total time spent executing in homa_pacer_main * (not including blocked time). */ - u64 pacer_ns; + u64 pacer_cycles; /** - * @pacer_lost_ns: unnecessary delays in transmitting packets + * @pacer_lost_cycles: unnecessary delays in transmitting packets * (i.e. wasted output bandwidth) because the pacer was slow or got * descheduled. */ - u64 pacer_lost_ns; + u64 pacer_lost_cycles; /** * @pacer_bytes: total number of bytes transmitted when @@ -293,10 +293,10 @@ struct homa_metrics { u64 pacer_needed_help; /** - * @throttled_ns: total amount of time that @homa->throttled_rpcs + * @throttled_cycles: total amount of time that @homa->throttled_rpcs * is nonempty. */ - u64 throttled_ns; + u64 throttled_cycles; /** * @resent_packets: total number of data packets issued in response to @@ -411,10 +411,10 @@ struct homa_metrics { u64 client_lock_misses; /** - * @client_lock_miss_ns: total time spent waiting for client + * @client_lock_miss_cycles: total time spent waiting for client * bucket lock misses. */ - u64 client_lock_miss_ns; + u64 client_lock_miss_cycles; /** * @server_lock_misses: total number of times that Homa had to wait @@ -423,16 +423,16 @@ struct homa_metrics { u64 server_lock_misses; /** - * @server_lock_miss_ns: total time spent waiting for server + * @server_lock_miss_cycles: total time spent waiting for server * bucket lock misses. */ - u64 server_lock_miss_ns; + u64 server_lock_miss_cycles; /** - * @socket_lock_miss_ns: total time spent waiting for socket + * @socket_lock_miss_cycles: total time spent waiting for socket * lock misses. */ - u64 socket_lock_miss_ns; + u64 socket_lock_miss_cycles; /** * @socket_lock_misses: total number of times that Homa had to wait @@ -441,10 +441,10 @@ struct homa_metrics { u64 socket_lock_misses; /** - * @throttle_lock_miss_ns: total time spent waiting for throttle + * @throttle_lock_miss_cycles: total time spent waiting for throttle * lock misses. 
*/ - u64 throttle_lock_miss_ns; + u64 throttle_lock_miss_cycles; /** * @throttle_lock_misses: total number of times that Homa had to wait @@ -453,9 +453,9 @@ struct homa_metrics { u64 throttle_lock_misses; /** - * @peer_ack_lock_miss_ns: total time spent waiting for peer lock misses. + * @peer_ack_lock_miss_cycles: total time spent waiting for peer lock misses. */ - u64 peer_ack_lock_miss_ns; + u64 peer_ack_lock_miss_cycles; /** * @peer_ack_lock_misses: total number of times that Homa had to wait @@ -464,10 +464,10 @@ struct homa_metrics { u64 peer_ack_lock_misses; /** - * @grant_lock_miss_ns: total time spent waiting for grant lock + * @grant_lock_miss_cycles: total time spent waiting for grant lock * misses. */ - u64 grant_lock_miss_ns; + u64 grant_lock_miss_cycles; /** * @grant_lock_misses: total number of times that Homa had to wait diff --git a/homa_offload.c b/homa_offload.c index afc0915d..ce1b0dc7 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -287,14 +287,14 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, * gro_list by the caller, so it will be considered for merges * in the future. */ - u64 saved_softirq_metric, softirq_ns; + u64 saved_softirq_metric, softirq_cycles; struct homa_offload_core *offload_core; struct homa *homa = homa_from_skb(skb); struct sk_buff *result = NULL; struct homa_data_hdr *h_new; - u64 *softirq_ns_metric; + u64 *softirq_cycles_metric; struct sk_buff *held_skb; - u64 now = sched_clock(); + u64 now = homa_clock(); int priority; u32 saddr; u32 hash; @@ -305,7 +305,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, h_new = (struct homa_data_hdr *)skb_transport_header(skb); offload_core = &per_cpu(homa_offload_core, smp_processor_id()); - busy = (now - offload_core->last_gro) < homa->gro_busy_ns; + busy = (now - offload_core->last_gro) < homa->gro_busy_cycles; offload_core->last_active = now; if (skb_is_ipv6(skb)) { priority = ipv6_hdr(skb)->priority; @@ -465,20 +465,20 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, done: homa_pacer_check(homa->pacer); - offload_core->last_gro = sched_clock(); + offload_core->last_gro = homa_clock(); return result; bypass: /* Record SoftIRQ cycles in a different metric to reflect that * they happened during bypass. */ - softirq_ns_metric = &homa_metrics_per_cpu()->softirq_ns; - saved_softirq_metric = *softirq_ns_metric; + softirq_cycles_metric = &homa_metrics_per_cpu()->softirq_cycles; + saved_softirq_metric = *softirq_cycles_metric; homa_softirq(skb); - softirq_ns = *softirq_ns_metric - saved_softirq_metric; - *softirq_ns_metric = saved_softirq_metric; - INC_METRIC(bypass_softirq_ns, softirq_ns); - offload_core->last_gro = sched_clock(); + softirq_cycles = *softirq_cycles_metric - saved_softirq_metric; + *softirq_cycles_metric = saved_softirq_metric; + INC_METRIC(bypass_softirq_cycles, softirq_cycles); + offload_core->last_gro = homa_clock(); /* This return value indicates that we have freed skb. 
 */
 	return ERR_PTR(-EINPROGRESS);
 }
 
@@ -507,7 +507,7 @@ void homa_gro_gen2(struct homa *homa, struct sk_buff *skb)
 	int this_core = smp_processor_id();
 	struct homa_offload_core *offload_core;
 	int candidate = this_core;
-	u64 now = sched_clock();
+	u64 now = homa_clock();
 	int i;
 
 	for (i = CORES_TO_CHECK; i > 0; i--) {
@@ -517,7 +517,7 @@ void homa_gro_gen2(struct homa *homa, struct sk_buff *skb)
 		offload_core = &per_cpu(homa_offload_core, candidate);
 		if (atomic_read(&offload_core->softirq_backlog) > 0)
 			continue;
-		if ((offload_core->last_gro + homa->busy_ns) > now)
+		if ((offload_core->last_gro + homa->busy_cycles) > now)
 			continue;
 		tt_record3("homa_gro_gen2 chose core %d for id %d offset %d",
 			   candidate, homa_local_id(h->common.sender_id),
@@ -567,8 +567,8 @@ void homa_gro_gen3(struct homa *homa, struct sk_buff *skb)
 
 	candidates = per_cpu(homa_offload_core,
 			     smp_processor_id()).gen3_softirq_cores;
-	now = sched_clock();
-	busy_time = now - homa->busy_ns;
+	now = homa_clock();
+	busy_time = now - homa->busy_cycles;
 
 	core = candidates[0];
 	for (i = 0; i < NUM_GEN3_SOFTIRQ_CORES; i++) {
diff --git a/homa_offload.h b/homa_offload.h
index 3e5562db..cf9904b1 100644
--- a/homa_offload.h
+++ b/homa_offload.h
@@ -13,14 +13,13 @@
  */
 struct homa_offload_core {
 	/**
-	 * @last_active: the last time (in sched_clock() units) that
-	 * there was system activity, such NAPI or SoftIRQ, on this
-	 * core. Used for load balancing.
+	 * @last_active: homa_clock() time of the last known activity
+	 * on this core, such as NAPI or SoftIRQ. Used for load balancing.
 	 */
 	u64 last_active;
 
 	/**
-	 * @last_gro: the last time (in sched_clock() units) that
+	 * @last_gro: the most recent homa_clock() time when
 	 * homa_gro_receive returned on this core. Used to determine
 	 * whether GRO is keeping a core busy.
 	 */
 	u64 last_gro;
@@ -50,9 +49,9 @@ struct homa_offload_core {
 	int gen3_softirq_cores[NUM_GEN3_SOFTIRQ_CORES];
 
 	/**
-	 * @last_app_active: the most recent time (sched_clock() units)
-	 * when an application was actively using Homa on this core (e.g.,
-	 * by sending or receiving messages). Used for load balancing
+	 * @last_app_active: the most recent homa_clock() time when an
+	 * application was actively using Homa on this core (e.g., by
+	 * sending or receiving messages). Used for load balancing
 	 * (see balance.txt). 
*/ u64 last_app_active; diff --git a/homa_outgoing.c b/homa_outgoing.c index e2181107..27538844 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -34,7 +34,7 @@ void homa_message_out_init(struct homa_rpc *rpc, int length) if (rpc->msgout.unscheduled > length) rpc->msgout.unscheduled = length; #endif /* See strip.py */ - rpc->msgout.init_ns = sched_clock(); + rpc->msgout.init_time = homa_clock(); } #ifndef __STRIP__ /* See strip.py */ diff --git a/homa_pacer.c b/homa_pacer.c index e2ae5e2d..8f5e997c 100644 --- a/homa_pacer.c +++ b/homa_pacer.c @@ -79,7 +79,7 @@ struct homa_pacer *homa_pacer_alloc(struct homa *homa, struct net *net) goto error; } init_completion(&pacer->kthread_done); - atomic64_set(&pacer->link_idle_time, sched_clock()); + atomic64_set(&pacer->link_idle_time, homa_clock()); #ifndef __STRIP__ /* See strip.py */ pacer->sysctl_header = register_net_sysctl(net, "net/homa", @@ -141,17 +141,17 @@ void homa_pacer_free(struct homa_pacer *pacer) int homa_pacer_check_nic_q(struct homa_pacer *pacer, struct sk_buff *skb, bool force) { - u64 idle, new_idle, clock, ns_for_packet; + u64 idle, new_idle, clock, cycles_for_packet; int bytes; bytes = homa_get_skb_info(skb)->wire_bytes; - ns_for_packet = pacer->ns_per_mbyte; - ns_for_packet *= bytes; - do_div(ns_for_packet, 1000000); + cycles_for_packet = pacer->cycles_per_mbyte; + cycles_for_packet *= bytes; + do_div(cycles_for_packet, 1000000); while (1) { - clock = sched_clock(); + clock = homa_clock(); idle = atomic64_read(&pacer->link_idle_time); - if ((clock + pacer->max_nic_queue_ns) < idle && !force && + if ((clock + pacer->max_nic_queue_cycles) < idle && !force && !(pacer->homa->flags & HOMA_FLAG_DONT_THROTTLE)) return 0; #ifndef __STRIP__ /* See strip.py */ @@ -162,18 +162,18 @@ int homa_pacer_check_nic_q(struct homa_pacer *pacer, struct sk_buff *skb, u64 lost = (pacer->wake_time > idle) ? clock - pacer->wake_time : clock - idle; - INC_METRIC(pacer_lost_ns, lost); + INC_METRIC(pacer_lost_cycles, lost); tt_record1("pacer lost %d cycles", lost); } - new_idle = clock + ns_for_packet; + new_idle = clock + cycles_for_packet; } else { - new_idle = idle + ns_for_packet; + new_idle = idle + cycles_for_packet; } #else /* See strip.py */ if (idle < clock) - new_idle = clock + ns_for_packet; + new_idle = clock + cycles_for_packet; else - new_idle = idle + ns_for_packet; + new_idle = idle + cycles_for_packet; #endif /* See strip.py */ /* This method must be thread-safe. */ @@ -198,9 +198,9 @@ int homa_pacer_main(void *arg) while (1) { if (pacer->exit) break; - pacer->wake_time = sched_clock(); + pacer->wake_time = homa_clock(); homa_pacer_xmit(pacer); - INC_METRIC(pacer_ns, sched_clock() - pacer->wake_time); + INC_METRIC(pacer_cycles, homa_clock() - pacer->wake_time); pacer->wake_time = 0; if (!list_empty(&pacer->throttled_rpcs)) { /* NIC queue is full; before calling pacer again, @@ -241,15 +241,15 @@ int homa_pacer_main(void *arg) void homa_pacer_xmit(struct homa_pacer *pacer) { struct homa_rpc *rpc; - s64 queue_ns; + s64 queue_cycles; /* Make sure only one instance of this function executes at a time. 
*/ if (!spin_trylock_bh(&pacer->mutex)) return; while (1) { - queue_ns = atomic64_read(&pacer->link_idle_time) - sched_clock(); - if (queue_ns >= pacer->max_nic_queue_ns) + queue_cycles = atomic64_read(&pacer->link_idle_time) - homa_clock(); + if (queue_cycles >= pacer->max_nic_queue_cycles) break; if (list_empty(&pacer->throttled_rpcs)) break; @@ -272,9 +272,9 @@ void homa_pacer_xmit(struct homa_pacer *pacer) rpc = NULL; list_for_each_entry(cur, &pacer->throttled_rpcs, throttled_links) { - if (cur->msgout.init_ns < oldest) { + if (cur->msgout.init_time < oldest) { rpc = cur; - oldest = cur->msgout.init_ns; + oldest = cur->msgout.init_time; } } } else { @@ -338,10 +338,10 @@ void homa_pacer_manage_rpc(struct homa_rpc *rpc) if (!list_empty(&rpc->throttled_links)) return; - IF_NO_STRIP(now = sched_clock()); + IF_NO_STRIP(now = homa_clock()); #ifndef __STRIP__ /* See strip.py */ if (!list_empty(&pacer->throttled_rpcs)) - INC_METRIC(throttled_ns, now - pacer->throttle_add); + INC_METRIC(throttled_cycles, now - pacer->throttle_add); pacer->throttle_add = now; #endif /* See strip.py */ bytes_left = rpc->msgout.length - rpc->msgout.next_xmit_offset; @@ -388,7 +388,7 @@ void homa_pacer_unmanage_rpc(struct homa_rpc *rpc) list_del_init(&rpc->throttled_links); #ifndef __STRIP__ /* See strip.py */ if (list_empty(&pacer->throttled_rpcs)) - INC_METRIC(throttled_ns, sched_clock() + INC_METRIC(throttled_cycles, homa_clock() - pacer->throttle_add); #endif /* See strip.py */ homa_pacer_throttle_unlock(pacer); @@ -405,12 +405,13 @@ void homa_pacer_update_sysctl_deps(struct homa_pacer *pacer) { u64 tmp; - tmp = 8 * 1000ULL * 1000ULL * 1000ULL; + pacer->max_nic_queue_cycles = + homa_ns_to_cycles(pacer->max_nic_queue_ns); /* Underestimate link bandwidth (overestimate time) by 1%. */ - tmp = tmp * 101 / 100; - do_div(tmp, pacer->link_mbps); - pacer->ns_per_mbyte = tmp; + tmp = 101 * 8000 * (__u64)homa_clock_khz(); + do_div(tmp, pacer->link_mbps * 100); + pacer->cycles_per_mbyte = tmp; } #ifndef __STRIP__ /* See strip.py */ @@ -485,12 +486,12 @@ void homa_pacer_log_throttled(struct homa_pacer *pacer) void homa_pacer_throttle_lock_slow(struct homa_pacer *pacer) __acquires(&pacer->throttle_lock) { - u64 start = sched_clock(); + u64 start = homa_clock(); tt_record("beginning wait for throttle lock"); spin_lock_bh(&pacer->throttle_lock); tt_record("ending wait for throttle lock"); INC_METRIC(throttle_lock_misses, 1); - INC_METRIC(throttle_lock_miss_ns, sched_clock() - start); + INC_METRIC(throttle_lock_miss_cycles, homa_clock() - start); } #endif /* See strip.py */ diff --git a/homa_pacer.h b/homa_pacer.h index 09e17e78..21641bc3 100644 --- a/homa_pacer.h +++ b/homa_pacer.h @@ -36,8 +36,8 @@ struct homa_pacer { int fifo_count; /** - * @wake_time: time (in sched_clock units) when the pacer last - * woke up (if the pacer is running) or 0 if the pacer is sleeping. + * @wake_time: homa_clock() time when the pacer woke up (if the pacer + * is running) or 0 if the pacer is sleeping. */ u64 wake_time; @@ -56,8 +56,8 @@ struct homa_pacer { #ifndef __STRIP__ /* See strip.py */ /** - * @throttle_add: The time (in sched_clock() units) when the most - * recent RPC was added to @throttled_rpcs. + * @throttle_add: The most recent homa_clock() time when an RPC was + * added to @throttled_rpcs. */ u64 throttle_add; #endif /* See strip.py */ @@ -77,6 +77,12 @@ struct homa_pacer { */ int max_nic_queue_ns; + /** + * @max_nic_queue_cycles: Same as max_nic_queue_ns except in + * homa_clock() units. 
+ */ + int max_nic_queue_cycles; + /** * @link_mbps: The raw bandwidth of the network uplink, in * units of 1e06 bits per second. Set externally via sysctl. @@ -84,12 +90,12 @@ struct homa_pacer { int link_mbps; /** - * @ns_per_mbyte: the number of ns that it takes to transmit - * 10**6 bytes on our uplink. This is actually a slight overestimate - * of the value, to ensure that we don't underestimate NIC queue - * length and queue too many packets. + * @cycles_per_mbyte: the number of homa_clock() cycles that it takes to + * transmit 10**6 bytes on our uplink. This is actually a slight + * overestimate of the value, to ensure that we don't underestimate + * NIC queue length and queue too many packets. */ - u32 ns_per_mbyte; + u32 cycles_per_mbyte; /** * @throttle_min_bytes: If a packet has fewer bytes than this, then it @@ -135,12 +141,11 @@ struct homa_pacer { struct completion kthread_done; /** - * @link_idle_time: The time, measured by sched_clock, at which we - * estimate that all of the packets we have passed to the NIC for - * transmission will have been transmitted. May be in the past. - * This estimate assumes that only Homa is transmitting data, so - * it could be a severe underestimate if there is competing traffic - * from, say, TCP. Access only with atomic ops. + * @link_idle_time: The homa_clock() time at which we estimate + * that all of the packets we have passed to the NIC for transmission + * will have been transmitted. May be in the past. This estimate + * assumes that only Homa is transmitting data, so it could be a + * severe underestimate if there is competing traffic from, say, TCP. */ atomic64_t link_idle_time ____cacheline_aligned_in_smp; }; @@ -175,7 +180,7 @@ static inline void homa_pacer_check(struct homa_pacer *pacer) * to queue new packets; if the NIC queue becomes more than half * empty, then we will help out here. */ - if ((sched_clock() + (pacer->max_nic_queue_ns >> 1)) < + if ((homa_clock() + (pacer->max_nic_queue_cycles >> 1)) < atomic64_read(&pacer->link_idle_time)) return; tt_record("homa_check_pacer calling homa_pacer_xmit"); diff --git a/homa_peer.c b/homa_peer.c index d9bcee21..fbbd3453 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -132,7 +132,7 @@ struct homa_peer **homa_peertab_get_peers(struct homa_peertab *peertab, * homa_peertab_gc_dsts() - Invoked to free unused dst_entries, if it is * safe to do so. * @peertab: The table in which to free entries. - * @now: Current time, in sched_clock() units; entries with expiration + * @now: Current time, in homa_clock() units; entries with expiration * dates no later than this will be freed. Specify ~0 to * free all entries. 
*/ @@ -278,9 +278,9 @@ void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, } spin_lock_bh(&peertab->write_lock); - now = sched_clock(); + now = homa_clock(); save_dead->dst = peer->dst; - save_dead->gc_time = now + 100000000; /* 100 ms */ + save_dead->gc_time = now + (homa_clock_khz() << 7); /* ~128 ms */ list_add_tail(&save_dead->dst_links, &peertab->dead_dsts); homa_peertab_gc_dsts(peertab, now); peer->dst = dst; @@ -395,13 +395,13 @@ void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, int c2, void homa_peer_lock_slow(struct homa_peer *peer) __acquires(&peer->ack_lock) { - u64 start = sched_clock(); + u64 start = homa_clock(); tt_record("beginning wait for peer lock"); spin_lock_bh(&peer->ack_lock); tt_record("ending wait for peer lock"); INC_METRIC(peer_ack_lock_misses, 1); - INC_METRIC(peer_ack_lock_miss_ns, sched_clock() - start); + INC_METRIC(peer_ack_lock_miss_cycles, homa_clock() - start); } #endif /* See strip.py */ diff --git a/homa_peer.h b/homa_peer.h index 947832fc..f2fe6458 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -22,10 +22,7 @@ struct homa_dead_dst { /** @dst: Entry that is no longer used by a struct homa_peer. */ struct dst_entry *dst; - /** - * @gc_time: Time (in units of sched_clock()) when it is safe - * to free @dst. - */ + /** @gc_time: homa_clock() time when it is safe to free @dst. */ u64 gc_time; /** @dst_links: Used to link together entries in peertab->dead_dsts. */ diff --git a/homa_plumbing.c b/homa_plumbing.c index e8436987..31b3b1ed 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -741,12 +741,12 @@ int homa_ioctl(struct sock *sk, int cmd, int *karg) { #ifndef __STRIP__ /* See strip.py */ int result; - u64 start = sched_clock(); + u64 start = homa_clock(); if (cmd == HOMAIOCABORT) { result = homa_ioc_abort(sk, karg); INC_METRIC(abort_calls, 1); - INC_METRIC(abort_ns, sched_clock() - start); + INC_METRIC(abort_cycles, homa_clock() - start); } else if (cmd == HOMAIOCFREEZE) { tt_record1("Freezing timetrace because of HOMAIOCFREEZE ioctl, pid %d", current->pid); @@ -803,7 +803,7 @@ int homa_setsockopt(struct sock *sk, int level, int optname, if (optname == SO_HOMA_RCVBUF) { struct homa_rcvbuf_args args; #ifndef __STRIP__ /* See strip.py */ - u64 start = sched_clock(); + u64 start = homa_clock(); #endif /* See strip.py */ if (optlen != sizeof(struct homa_rcvbuf_args)) @@ -822,7 +822,7 @@ int homa_setsockopt(struct sock *sk, int level, int optname, ret = homa_pool_set_region(hsk, u64_to_user_ptr(args.start), args.length); INC_METRIC(so_set_buf_calls, 1); - INC_METRIC(so_set_buf_ns, sched_clock() - start); + INC_METRIC(so_set_buf_cycles, homa_clock() - start); } else if (optname == SO_HOMA_SERVER) { int arg; @@ -911,7 +911,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) struct homa_sendmsg_args args; union sockaddr_in_union *addr; #ifndef __STRIP__ /* See strip.py */ - u64 start = sched_clock(); + u64 start = homa_clock(); #endif /* See strip.py */ struct homa_rpc *rpc = NULL; int result = 0; @@ -994,9 +994,9 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) goto error; } #ifndef __STRIP__ /* See strip.py */ - finish = sched_clock(); + finish = homa_clock(); #endif /* See strip.py */ - INC_METRIC(send_ns, finish - start); + INC_METRIC(send_cycles, finish - start); } else { /* This is a response message. 
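The refreshed dst lease above trades the old exact 100 ms constant for a cheap power-of-two approximation: homa_clock_khz() is cycles per millisecond, so shifting it left by 7 yields 128 ms worth of cycles. Under the tests' 1 GHz mock clock that is 128,000,000 cycles, which is why the gc test later in this patch now passes 130000000 rather than 110000000 (only the dst refreshed at time 0, whose lease expires at 128,000,000, should be freed):

    /* khz cycles/ms << 7 == 128 * khz cycles, roughly 128 ms. */
    save_dead->gc_time = now + (homa_clock_khz() << 7);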
*/ struct in6_addr canonical_dest; @@ -1041,9 +1041,9 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) goto error; homa_rpc_unlock(rpc); /* Locked by homa_rpc_find_server. */ #ifndef __STRIP__ /* See strip.py */ - finish = sched_clock(); + finish = homa_clock(); #endif /* See strip.py */ - INC_METRIC(reply_ns, finish - start); + INC_METRIC(reply_cycles, finish - start); } tt_record1("homa_sendmsg finished, id %d", args.id); return 0; @@ -1076,7 +1076,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, struct homa_rpc *rpc; int nonblocking; #ifndef __STRIP__ /* See strip.py */ - u64 start = sched_clock(); + u64 start = homa_clock(); u64 finish; #endif /* See strip.py */ int result; @@ -1145,7 +1145,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, * for performance debugging). */ if (rpc->hsk->homa->freeze_type == SLOW_RPC) { - u64 elapsed = (sched_clock() - rpc->start_ns) >> 10; + u64 elapsed = (homa_clock() - rpc->start_time) >> 10; if (elapsed <= hsk->homa->temp[1] && elapsed >= hsk->homa->temp[0] && @@ -1223,9 +1223,9 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, } #ifndef __STRIP__ /* See strip.py */ - finish = sched_clock(); + finish = homa_clock(); #endif /* See strip.py */ - INC_METRIC(recv_ns, finish - start); + INC_METRIC(recv_cycles, finish - start); tt_record2("homa_recvmsg returning status %d, id %d", result, control.id); return result; @@ -1265,7 +1265,7 @@ int homa_softirq(struct sk_buff *skb) #ifndef __STRIP__ /* See strip.py */ u64 start; - start = sched_clock(); + start = homa_clock(); per_cpu(homa_offload_core, raw_smp_processor_id()).last_active = start; #endif /* See strip.py */ INC_METRIC(softirq_calls, 1); @@ -1404,7 +1404,7 @@ int homa_softirq(struct sk_buff *skb) #ifndef __STRIP__ /* See strip.py */ atomic_dec(&per_cpu(homa_offload_core, raw_smp_processor_id()).softirq_backlog); #endif /* See strip.py */ - INC_METRIC(softirq_ns, sched_clock() - start); + INC_METRIC(softirq_cycles, homa_clock() - start); return 0; } diff --git a/homa_pool.c b/homa_pool.c index e0c77281..3143fd12 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -159,7 +159,7 @@ void homa_pool_get_rcvbuf(struct homa_pool *pool, /** * homa_bpage_available() - Check whether a bpage is available for use. * @bpage: Bpage to check - * @now: Current time (sched_clock() units) + * @now: Current time (homa_clock() units) * Return: True if the bpage is free or if it can be stolen, otherwise * false. 
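In the homa_pool.c hunks just below, a bpage owner's lease becomes now + homa->bpage_lease_cycles instead of now + 1000 * bpage_lease_usecs in ns, so the scaling is no longer redone on every allocation. The conversion itself is outside these hunks; judging from the homa_incoming_sysctl_changed test later in the patch (bpage_lease_usecs = 700 yields bpage_lease_cycles = 700,000 under the 1 GHz mock clock), it presumably reduces to something like:

    /* Sketch only; the real code lives with the other sysctl-derived
     * conversions. Cycles per usec = khz / 1000.
     */
    u64 tmp = (u64)homa->bpage_lease_usecs * homa_clock_khz();

    do_div(tmp, 1000);
    homa->bpage_lease_cycles = tmp;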
*/ @@ -189,7 +189,7 @@ int homa_pool_get_pages(struct homa_pool *pool, int num_pages, u32 *pages, { int core_num = smp_processor_id(); struct homa_pool_core *core; - u64 now = sched_clock(); + u64 now = homa_clock(); int alloced = 0; int limit = 0; @@ -263,8 +263,8 @@ int homa_pool_get_pages(struct homa_pool *pool, int num_pages, u32 *pages, if (set_owner) { atomic_set(&bpage->refs, 2); bpage->owner = core_num; - bpage->expiration = now + 1000 * - pool->hsk->homa->bpage_lease_usecs; + bpage->expiration = now + + pool->hsk->homa->bpage_lease_cycles; } else { atomic_set(&bpage->refs, 1); bpage->owner = -1; @@ -355,8 +355,8 @@ int homa_pool_alloc_msg(struct homa_rpc *rpc) goto new_page; } } - bpage->expiration = sched_clock() + - 1000 * pool->hsk->homa->bpage_lease_usecs; + bpage->expiration = homa_clock() + + pool->hsk->homa->bpage_lease_cycles; atomic_inc(&bpage->refs); spin_unlock_bh(&bpage->lock); goto allocate_partial; diff --git a/homa_pool.h b/homa_pool.h index 5a1bb6bf..237cad1d 100644 --- a/homa_pool.h +++ b/homa_pool.h @@ -33,9 +33,8 @@ struct homa_bpage { int owner; /** - * @expiration: time (in sched_clock() units) after - * which it's OK to steal this page from its current - * owner (if @refs is 1). + * @expiration: homa_clock() time after which it's OK to steal this + * page from its current owner (if @refs is 1). */ u64 expiration; } ____cacheline_aligned_in_smp; diff --git a/homa_rpc.c b/homa_rpc.c index b726a20f..57a5fa84 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -65,7 +65,7 @@ struct homa_rpc *homa_rpc_alloc_client(struct homa_sock *hsk, INIT_LIST_HEAD(&crpc->throttled_links); crpc->resend_timer_ticks = hsk->homa->timer_ticks; crpc->magic = HOMA_RPC_MAGIC; - crpc->start_ns = sched_clock(); + crpc->start_time = homa_clock(); /* Initialize fields that require locking. This allows the most * expensive work, such as copying in the message from user space, @@ -163,7 +163,7 @@ struct homa_rpc *homa_rpc_alloc_server(struct homa_sock *hsk, INIT_LIST_HEAD(&srpc->throttled_links); srpc->resend_timer_ticks = hsk->homa->timer_ticks; srpc->magic = HOMA_RPC_MAGIC; - srpc->start_ns = sched_clock(); + srpc->start_time = homa_clock(); #ifndef __STRIP__ /* See strip.py */ tt_record2("Incoming message for id %d has %d unscheduled bytes", srpc->id, ntohl(h->incoming)); diff --git a/homa_rpc.h b/homa_rpc.h index 8cbed235..cbc649e6 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -88,10 +88,10 @@ struct homa_message_out { #endif /* See strip.py */ /** - * @init_ns: Time in sched_clock units when this structure was - * initialized. Used to find the oldest outgoing message. + * @init_time: homa_clock() time when this structure was initialized. + * Used to find the oldest outgoing message. */ - u64 init_ns; + u64 init_time; }; /** @@ -106,7 +106,7 @@ struct homa_gap { int end; /** - * @time: time (in sched_clock units) when the gap was first detected. + * @time: homa_clock() time when the gap was first detected. * As of 7/2024 this isn't used for anything. */ u64 time; @@ -190,7 +190,7 @@ struct homa_message_in { int rec_incoming; /** - * @birth: sched_clock() time when homa_grant_manage_rpc was invoked + * @birth: homa_clock() time when homa_grant_manage_rpc was invoked * for this RPC. Managed by homa_grant.c. Only set if the RPC needs * grants. */ @@ -406,10 +406,10 @@ struct homa_rpc { int magic; /** - * @start_ns: time (from sched_clock()) when this RPC was created. - * Used (sometimes) for testing. + * @start_time: homa_clock() time when this RPC was created. Used + * occasionally for testing. 
*/ - u64 start_ns; + u64 start_time; }; void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, diff --git a/homa_skb.c b/homa_skb.c index 56e99fec..e22850b8 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -118,7 +118,7 @@ void homa_skb_cleanup(struct homa *homa) */ struct sk_buff *homa_skb_alloc_tx(int length) { - u64 start = sched_clock(); + u64 start = homa_clock(); struct sk_buff *skb; /* Note: allocate space for an IPv6 header, which is larger than @@ -131,7 +131,7 @@ struct sk_buff *homa_skb_alloc_tx(int length) skb_reset_transport_header(skb); } INC_METRIC(skb_allocs, 1); - INC_METRIC(skb_alloc_ns, sched_clock() - start); + INC_METRIC(skb_alloc_cycles, homa_clock() - start); return skb; } @@ -286,17 +286,17 @@ bool homa_skb_page_alloc(struct homa *homa, struct homa_skb_core *skb_core) /* Step 3: can we allocate a new big page? */ INC_METRIC(skb_page_allocs, 1); - start = sched_clock(); + start = homa_clock(); skb_core->skb_page = alloc_pages(GFP_ATOMIC | __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY, HOMA_SKB_PAGE_ORDER); if (likely(skb_core->skb_page)) { - INC_METRIC(skb_page_alloc_ns, sched_clock() - start); + INC_METRIC(skb_page_alloc_cycles, homa_clock() - start); goto success; } /* Step 4: can we allocate a normal page? */ skb_core->skb_page = alloc_page(GFP_ATOMIC); - INC_METRIC(skb_page_alloc_ns, sched_clock() - start); + INC_METRIC(skb_page_alloc_cycles, homa_clock() - start); if (likely(skb_core->skb_page)) { skb_core->page_size = PAGE_SIZE; goto success; @@ -456,7 +456,7 @@ void homa_skb_free_many_tx(struct homa *homa, struct sk_buff **skbs, int count) #define MAX_PAGES_AT_ONCE 50 #endif struct page *pages_to_cache[MAX_PAGES_AT_ONCE]; - u64 start = sched_clock(); + u64 start = homa_clock(); int num_pages = 0; int i, j; @@ -496,7 +496,7 @@ void homa_skb_free_many_tx(struct homa *homa, struct sk_buff **skbs, int count) if (num_pages > 0) homa_skb_cache_pages(homa, pages_to_cache, num_pages); INC_METRIC(skb_frees, count); - INC_METRIC(skb_free_ns, sched_clock() - start); + INC_METRIC(skb_free_cycles, homa_clock() - start); } /** @@ -588,13 +588,13 @@ void homa_skb_release_pages(struct homa *homa) { int i, max_low_mark, min_pages, release, release_max; struct homa_page_pool *max_pool; - u64 now = sched_clock(); + u64 now = homa_clock(); if (now < homa->skb_page_free_time) return; /* Free pages every 0.5 second. 
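The replacement on the next lines applies the same kHz convention to the 0.5 s page-freeing interval: 500 ms is 500 * homa_clock_khz() cycles (500,000,000 under the 1 GHz mock clock), replacing the literal 500000000ULL ns:

    /* 0.5 s in homa_clock() cycles; khz is cycles per millisecond. */
    homa->skb_page_free_time = now + (500 * homa_clock_khz());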
*/ - homa->skb_page_free_time = now + 500000000ULL; + homa->skb_page_free_time = now + (500 * homa_clock_khz()); release_max = homa->skb_page_frees_per_sec / 2; if (homa->pages_to_free_slots < release_max) { struct page **old = homa->skb_pages_to_free; diff --git a/homa_sock.c b/homa_sock.c index 7fdadb22..72ed9167 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -418,13 +418,13 @@ struct homa_sock *homa_sock_find(struct homa_socktab *socktab, __u16 port) void homa_sock_lock_slow(struct homa_sock *hsk) __acquires(&hsk->lock) { - u64 start = sched_clock(); + u64 start = homa_clock(); tt_record("beginning wait for socket lock"); spin_lock_bh(&hsk->lock); tt_record("ending wait for socket lock"); INC_METRIC(socket_lock_misses, 1); - INC_METRIC(socket_lock_miss_ns, sched_clock() - start); + INC_METRIC(socket_lock_miss_cycles, homa_clock() - start); } /** @@ -439,7 +439,7 @@ void homa_sock_lock_slow(struct homa_sock *hsk) void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, u64 id) __acquires(rpc_bucket_lock) { - u64 start = sched_clock(); + u64 start = homa_clock(); tt_record2("beginning wait for rpc lock, id %d, (bucket %d)", id, bucket->id); @@ -448,10 +448,10 @@ void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, u64 id) id, bucket->id); if (homa_is_client(id)) { INC_METRIC(client_lock_misses, 1); - INC_METRIC(client_lock_miss_ns, sched_clock() - start); + INC_METRIC(client_lock_miss_cycles, homa_clock() - start); } else { INC_METRIC(server_lock_misses, 1); - INC_METRIC(server_lock_miss_ns, sched_clock() - start); + INC_METRIC(server_lock_miss_cycles, homa_clock() - start); } } #endif /* See strip.py */ diff --git a/homa_timer.c b/homa_timer.c index a102372d..1159b7b2 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -195,7 +195,7 @@ void homa_timer(struct homa *homa) homa->timer_ticks++; #ifndef __STRIP__ /* See strip.py */ - start = sched_clock(); + start = homa_clock(); total_grants = 0; for (core = 0; core < nr_cpu_ids; core++) { struct homa_metrics *m = homa_metrics_per_cpu(); @@ -233,13 +233,13 @@ void homa_timer(struct homa *homa) * out. See reap.txt for more info. */ #ifndef __STRIP__ /* See strip.py */ - u64 rpc_start = sched_clock(); + u64 rpc_start = homa_clock(); #endif /* See strip.py */ tt_record("homa_timer calling homa_rpc_reap"); if (homa_rpc_reap(hsk, false) == 0) break; - INC_METRIC(timer_reap_ns, sched_clock() - rpc_start); + INC_METRIC(timer_reap_cycles, homa_clock() - rpc_start); } if (list_empty(&hsk->active_rpcs) || hsk->shutdown) @@ -289,7 +289,7 @@ void homa_timer(struct homa *homa) #endif /* See strip.py */ homa_skb_release_pages(homa); #ifndef __STRIP__ /* See strip.py */ - end = sched_clock(); - INC_METRIC(timer_ns, end - start); + end = homa_clock(); + INC_METRIC(timer_cycles, end - start); #endif /* See strip.py */ } diff --git a/homa_utils.c b/homa_utils.c index 13fcd385..781cb66f 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -211,8 +211,8 @@ void homa_spin(int ns) { u64 end; - end = sched_clock() + ns; - while (sched_clock() < end) + end = homa_clock() + homa_ns_to_cycles(ns); + while (homa_clock() < end) /* Empty loop body.*/ ; } diff --git a/test/mock.c b/test/mock.c index b3dd6b3f..53e5408d 100644 --- a/test/mock.c +++ b/test/mock.c @@ -164,26 +164,23 @@ int mock_rpc_holds; */ static int mock_preempt_disables; -/* Used as the return value for calls to get_cycles. A value of ~0 means - * return actual clock time. Shouldn't be used much anymore (get_cycles - * shouldn't be used). 
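homa_spin() above is the one caller in this patch that needs a general ns-to-cycles conversion rather than a precomputed sysctl value. homa_ns_to_cycles() itself is not part of this diff; under the kHz convention used everywhere else in the patch it would presumably reduce to a sketch like:

    /* Sketch only: khz cycles/ms divided by 10^6 ns/ms gives cycles
     * per ns (exactly 1 under the tests' 1 GHz mock clock, which is
     * why the updated tests treat cycles and ns interchangeably).
     */
    static inline u64 homa_ns_to_cycles(u64 ns)
    {
            u64 tmp = ns * homa_clock_khz();

            do_div(tmp, 1000000);
            return tmp;
    }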
- */ -cycles_t mock_cycles; - -/* Used as the return value for calls to sched_clock. */ -u64 mock_ns; +/* Used as the return value for calls to homa_clock. */ +u64 mock_clock; -/* Add this value to mock_ns every time sched_clock is invoked. */ -u64 mock_ns_tick; +/* Add this value to mock_clock every time homa_clock is invoked. */ +u64 mock_clock_tick; -/* If values are present here, use then as the return values from - * sched_clock, without considering mock_ns or mock_ns_ticks. +/* If values are present here, use them as the return values from + * homa_clock, without considering mock_clock or mock_clock_tick. */ #define MAX_CLOCK_VALS 10 u64 mock_clock_vals[MAX_CLOCK_VALS]; int mock_next_clock_val = 0; int mock_num_clock_vals = 0; +/* Used as the return value for tt_get_cycles. */ +u64 mock_tt_cycles; + /* Indicates whether we should be simulation IPv6 or IPv4 in the * current test. Can be overridden by a test. */ @@ -248,7 +245,6 @@ struct net_offload tcp_offload; struct net_offload tcp_v6_offload; static struct hrtimer_clock_base clock_base; -unsigned int cpu_khz = 1000000; struct task_struct *current_task = &mock_task; unsigned long ex_handler_refcount; struct net init_net; @@ -1198,16 +1194,6 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) {} -u64 sched_clock(void) -{ - if (mock_next_clock_val < mock_num_clock_vals) { - mock_next_clock_val++; - return mock_clock_vals[mock_next_clock_val - 1]; - } - mock_ns += mock_ns_tick; - return mock_ns; -} - void schedule(void) { UNIT_HOOK("schedule"); @@ -1551,18 +1537,17 @@ void mock_data_ready(struct sock *sk) } /** - * mock_get_cycles() - Replacement for get_cycles; allows time to be - * hard-while using mock_cycles variable. + * mock_get_clock() - Replacement for homa_clock; allows time to be + * controlled by unit tests. */ -cycles_t mock_get_cycles(void) +u64 mock_get_clock(void) { - if (mock_cycles == ~0) { - uint32_t lo, hi; - - __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi)); - return (((uint64_t)hi << 32) | lo); + if (mock_next_clock_val < mock_num_clock_vals) { + mock_next_clock_val++; + return mock_clock_vals[mock_next_clock_val - 1]; } - return mock_cycles; + mock_clock += mock_clock_tick; + return mock_clock; } /** @@ -1727,7 +1712,7 @@ void mock_set_homa(struct homa *homa) /** * mock_set_clock_vals() - Specify one or more clock values to be returned - * by the next calls to sched_clock(). The list of arguments must be + * by the next calls to homa_clock(). The list of arguments must be * terminated by a zero value (which will not be used as a clock value). * @t: The first clock reading to return. 
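Taken together, the mock clock gives a test three controls, all feeding mock_get_clock(): assign mock_clock directly, let it auto-advance by mock_clock_tick on every homa_clock() call, or queue a finite sequence of exact readings with mock_set_clock_vals(). A typical setup, mirroring the poll-then-block interest test later in this patch:

    mock_set_clock_vals(1000, 2000, 3999, 4000, 0); /* next four reads */
    mock_clock = 4000;     /* returned once the queued values run out */
    mock_clock_tick = 0;   /* no auto-advance per reading */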
*/ @@ -1973,18 +1958,17 @@ void mock_teardown(void) pcpu_hot.cpu_number = 1; pcpu_hot.current_task = &mock_task; - cpu_khz = 1000000; mock_alloc_page_errors = 0; mock_alloc_skb_errors = 0; mock_copy_data_errors = 0; mock_copy_to_iter_errors = 0; mock_copy_to_user_errors = 0; mock_cpu_idle = 0; - mock_cycles = 0; - mock_ns = 0; - mock_ns_tick = 0; + mock_clock = 0; + mock_clock_tick = 0; mock_next_clock_val = 0; mock_num_clock_vals = 0; + mock_tt_cycles = 0; mock_ipv6 = mock_ipv6_default; mock_import_ubuf_errors = 0; mock_import_iovec_errors = 0; diff --git a/test/mock.h b/test/mock.h index 1fea3a60..725d3008 100644 --- a/test/mock.h +++ b/test/mock.h @@ -110,12 +110,13 @@ extern int mock_alloc_page_errors; extern int mock_alloc_skb_errors; extern int mock_bpage_size; extern int mock_bpage_shift; +extern u64 mock_clock; +extern u64 mock_clock_tick; extern int mock_compound_order_mask; extern int mock_copy_data_errors; extern int mock_copy_to_user_dont_copy; extern int mock_copy_to_user_errors; extern int mock_cpu_idle; -extern cycles_t mock_cycles; extern int mock_import_iovec_errors; extern int mock_import_ubuf_errors; extern int mock_ip6_xmit_errors; @@ -138,8 +139,6 @@ extern int mock_mtu; extern struct net mock_net; extern struct net_device mock_net_device; -extern u64 mock_ns; -extern u64 mock_ns_tick; extern int mock_numa_mask; extern int mock_page_nid_mask; extern int mock_prepare_to_wait_status; @@ -150,8 +149,9 @@ extern int mock_sock_holds; extern int mock_spin_lock_held; extern struct task_struct mock_task; -extern int mock_total_spin_locks; +extern int mock_total_spin_locks; extern int mock_trylock_errors; +extern u64 mock_tt_cycles; extern int mock_vmalloc_errors; extern int mock_xmit_log_verbose; extern int mock_xmit_log_homa_info; diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 5921cba0..fd9ff277 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -33,7 +33,7 @@ static void grant_spinlock_hook(char *id) { if (strcmp(id, "spin_lock") != 0) return; - mock_ns = 1000; + mock_clock = 1000; hook_spinlock_count++; } @@ -70,7 +70,7 @@ FIXTURE_SETUP(homa_grant) homa_init(&self->homa, &mock_net); mock_set_homa(&self->homa); self->homa.num_priorities = 1; - self->homa.poll_usecs = 0; + self->homa.poll_cycles = 0; self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; self->homa.pacer->fifo_fraction = 0; self->homa.grant->fifo_fraction = 0; @@ -518,7 +518,7 @@ TEST_F(homa_grant, homa_grant_manage_rpc__update_metrics) { self->homa.grant->last_grantable_change = 50; self->homa.grant->num_grantable_rpcs = 3; - mock_ns = 200; + mock_clock = 200; homa_grant_manage_rpc(test_rpc(self, 100, self->server_ip, 100000)); EXPECT_EQ(4, self->homa.grant->num_grantable_rpcs); EXPECT_EQ(450, homa_metrics_per_cpu()->grantable_rpcs_integral); @@ -542,9 +542,9 @@ TEST_F(homa_grant, homa_grant_manage_rpc__insert_and_bump_to_grantables) self->homa.grant->max_overcommit = 1; self->homa.grant->last_grantable_change = 50; self->homa.grant->num_grantable_rpcs = 3; - mock_ns = 200; + mock_clock = 200; homa_grant_manage_rpc(rpc1); - mock_ns = 300; + mock_clock = 300; homa_grant_manage_rpc(rpc2); EXPECT_EQ(5, self->homa.grant->max_grantable_rpcs); EXPECT_EQ(850, homa_metrics_per_cpu()->grantable_rpcs_integral); @@ -793,7 +793,7 @@ TEST_F(homa_grant, homa_grant_unmanage_rpc) EXPECT_EQ(30000, self->homa.grant->window); self->homa.grant->last_grantable_change = 100; - mock_ns = 250; + mock_clock = 250; homa_grant_unmanage_rpc(rpc, &self->cand); unit_log_clear(); @@ -1008,7
+1008,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__fix_order) rpc3->msgin.bytes_remaining = 15000; atomic_set(&self->homa.grant->total_incoming, self->homa.grant->max_incoming - 15000); - mock_ns = self->homa.grant->next_recalc; + mock_clock = self->homa.grant->next_recalc; unit_log_clear(); homa_rpc_lock(rpc2); @@ -1132,7 +1132,7 @@ TEST_F(homa_grant, homa_grant_fix_order) #if 0 TEST_F(homa_grant, homa_grant_find_oldest__basics) { - mock_ns_tick = 10; + mock_clock_tick = 10; unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, 11, 40000, 100); unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+1, @@ -1149,7 +1149,7 @@ TEST_F(homa_grant, homa_grant_find_oldest__fifo_grant_unused) { struct homa_rpc *srpc1, *srpc2; - mock_ns_tick = 10; + mock_clock_tick = 10; srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, 11, 400000, 100); srpc2 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+1, @@ -1286,14 +1286,14 @@ TEST_F(homa_grant, homa_grant_cand_check__rpc_becomes_fully_granted) TEST_F(homa_grant, homa_grant_lock_slow) { - mock_ns = 500; + mock_clock = 500; unit_hook_register(grant_spinlock_hook); homa_grant_lock_slow(self->homa.grant); homa_grant_unlock(self->homa.grant); EXPECT_EQ(1, homa_metrics_per_cpu()->grant_lock_misses); - EXPECT_EQ(500, homa_metrics_per_cpu()->grant_lock_miss_ns); + EXPECT_EQ(500, homa_metrics_per_cpu()->grant_lock_miss_cycles); } TEST_F(homa_grant, homa_grant_update_sysctl_deps__max_overcommit) @@ -1331,11 +1331,11 @@ TEST_F(homa_grant, homa_grant_update_sysctl_deps__grant_nonfifo) homa_grant_update_sysctl_deps(self->homa.grant); EXPECT_EQ(90000, self->homa.grant->grant_nonfifo); } -TEST_F(homa_grant, homa_grant_update_sysctl_deps__recalc_ns) +TEST_F(homa_grant, homa_grant_update_sysctl_deps__recalc_cycles) { self->homa.grant->recalc_usecs = 7; homa_grant_update_sysctl_deps(self->homa.grant); - EXPECT_EQ(7000, self->homa.grant->recalc_ns); + EXPECT_EQ(7000, self->homa.grant->recalc_cycles); } TEST_F(homa_grant, homa_grant_update_sysctl_deps__grant_window) { diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 7d439ec4..992e7ae4 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -92,7 +92,7 @@ FIXTURE_SETUP(homa_incoming) mock_set_homa(&self->homa); #ifndef __STRIP__ /* See strip.py */ self->homa.num_priorities = 1; - self->homa.poll_usecs = 0; + self->homa.poll_cycles = 0; #endif /* See strip.py */ self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; self->homa.pacer->fifo_fraction = 0; @@ -235,7 +235,7 @@ TEST_F(homa_incoming, homa_add_packet__basics) homa_message_in_init(crpc, 10000, 0); unit_log_clear(); - mock_ns = 5000; + mock_clock = 5000; self->data.seg.offset = htonl(1400); homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400)); @@ -524,7 +524,7 @@ TEST_F(homa_incoming, homa_add_packet__packet_in_middle_of_gap) homa_message_in_init(crpc, 10000, 0); unit_log_clear(); - mock_ns = 1000; + mock_clock = 1000; self->data.seg.offset = htonl(0); homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0)); @@ -536,7 +536,7 @@ TEST_F(homa_incoming, homa_add_packet__packet_in_middle_of_gap) unit_print_gaps(crpc)); self->data.seg.offset = htonl(2000); - mock_ns = 2000; + mock_clock = 2000; homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 2000)); EXPECT_EQ(3, skb_queue_len(&crpc->msgin.packets)); @@ -551,7 +551,7 @@ 
TEST_F(homa_incoming, homa_add_packet__kmalloc_failure_while_splitting_gap) homa_message_in_init(crpc, 10000, 0); unit_log_clear(); - mock_ns = 1000; + mock_clock = 1000; self->data.seg.offset = htonl(0); homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0)); @@ -563,7 +563,7 @@ TEST_F(homa_incoming, homa_add_packet__kmalloc_failure_while_splitting_gap) unit_print_gaps(crpc)); self->data.seg.offset = htonl(2000); - mock_ns = 2000; + mock_clock = 2000; mock_kmalloc_errors = 1; homa_add_packet(crpc, mock_skb_alloc(self->client_ip, &self->data.common, 1400, 2000)); @@ -1202,7 +1202,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__forced_reap) UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 20000, 20000); struct homa_rpc *srpc; - mock_ns_tick = 10; + mock_clock_tick = 10; homa_rpc_end(dead); #ifndef __STRIP__ /* See strip.py */ @@ -1226,7 +1226,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__forced_reap) EXPECT_EQ(30, self->hsk.dead_skbs); #endif /* See strip.py */ #ifndef __STRIP__ /* See strip.py */ - EXPECT_EQ(0, homa_metrics_per_cpu()->data_pkt_reap_ns); + EXPECT_EQ(0, homa_metrics_per_cpu()->data_pkt_reap_cycles); #endif /* See strip.py */ /* Second packet: must reap. */ @@ -1240,7 +1240,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__forced_reap) EXPECT_EQ(20, self->hsk.dead_skbs); #endif /* See strip.py */ #ifndef __STRIP__ /* See strip.py */ - EXPECT_NE(0, homa_metrics_per_cpu()->data_pkt_reap_ns); + EXPECT_NE(0, homa_metrics_per_cpu()->data_pkt_reap_cycles); #endif /* See strip.py */ } @@ -2314,7 +2314,7 @@ TEST_F(homa_incoming, homa_wait_private__signal_notify_race) ASSERT_NE(NULL, crpc); atomic_or(RPC_PRIVATE, &crpc->flags); - IF_NO_STRIP(self->homa.poll_usecs = 0); + IF_NO_STRIP(self->homa.poll_cycles = 0); unit_hook_register(handoff_hook); hook_rpc = crpc; hook_count = 2; @@ -2534,12 +2534,16 @@ TEST_F(homa_incoming, homa_rpc_handoff__queue_rpc_on_socket) } #ifndef __STRIP__ /* See strip.py */ -TEST_F(homa_incoming, homa_incoming_sysctl_changed__convert_usec_to_ns) +TEST_F(homa_incoming, homa_incoming_sysctl_changed__convert_usec_to_cycles) { + self->homa.poll_usecs = 27; self->homa.busy_usecs = 53; self->homa.gro_busy_usecs = 140; + self->homa.bpage_lease_usecs = 700; homa_incoming_sysctl_changed(&self->homa); - EXPECT_EQ(53000, self->homa.busy_ns); - EXPECT_EQ(140000, self->homa.gro_busy_ns); + EXPECT_EQ(27000, self->homa.poll_cycles); + EXPECT_EQ(53000, self->homa.busy_cycles); + EXPECT_EQ(140000, self->homa.gro_busy_cycles); + EXPECT_EQ(700000, self->homa.bpage_lease_cycles); } #endif /* See strip.py */ diff --git a/test/unit_homa_interest.c b/test/unit_homa_interest.c index bf8a0541..74afbd44 100644 --- a/test/unit_homa_interest.c +++ b/test/unit_homa_interest.c @@ -155,7 +155,7 @@ TEST_F(homa_interest, homa_interest_wait__call_schedule) homa_interest_init_shared(&interest, &self->hsk); - self->homa.poll_usecs = 100; + self->homa.poll_cycles = 100; unit_hook_register(log_hook); unit_hook_register(notify_hook); hook_interest = &interest; @@ -179,7 +179,7 @@ TEST_F(homa_interest, homa_interest_wait__call_homa_rpc_reap) homa_rpc_end(crpc); EXPECT_EQ(15, self->hsk.dead_skbs); homa_interest_init_shared(&interest, &self->hsk); - IF_NO_STRIP(self->homa.poll_usecs = 0); + IF_NO_STRIP(self->homa.poll_cycles = 0); EXPECT_EQ(EAGAIN, -homa_interest_wait(&interest, 1)); EXPECT_EQ(0, self->hsk.dead_skbs); @@ -190,7 +190,7 @@ TEST_F(homa_interest, homa_interest_wait__nonblocking) struct homa_interest interest; 
homa_interest_init_shared(&interest, &self->hsk); - IF_NO_STRIP(self->homa.poll_usecs = 100); + IF_NO_STRIP(self->homa.poll_cycles = 100000); EXPECT_EQ(EAGAIN, -homa_interest_wait(&interest, 1)); EXPECT_EQ(0, interest.blocked); @@ -201,17 +201,17 @@ TEST_F(homa_interest, homa_interest_wait__poll_then_block) struct homa_interest interest; homa_interest_init_shared(&interest, &self->hsk); - IF_NO_STRIP(self->homa.poll_usecs = 3); + IF_NO_STRIP(self->homa.poll_cycles = 3000); mock_set_clock_vals(1000, 2000, 3999, 4000, 0); - mock_ns = 4000; + mock_clock = 4000; unit_hook_register(notify_hook); hook_interest = &interest; hook_count = 4; EXPECT_EQ(0, -homa_interest_wait(&interest, 0)); #ifndef __STRIP__ /* See strip.py */ - EXPECT_EQ(3000, homa_metrics_per_cpu()->poll_ns); - EXPECT_EQ(0, homa_metrics_per_cpu()->blocked_ns); + EXPECT_EQ(3000, homa_metrics_per_cpu()->poll_cycles); + EXPECT_EQ(0, homa_metrics_per_cpu()->blocked_cycles); EXPECT_EQ(1, interest.blocked); #endif /* See strip.py */ homa_interest_unlink_shared(&interest); @@ -222,7 +222,7 @@ TEST_F(homa_interest, homa_interest_wait__interrupted_by_signal) homa_interest_init_shared(&interest, &self->hsk); mock_prepare_to_wait_errors = 1; - IF_NO_STRIP(self->homa.poll_usecs = 0); + IF_NO_STRIP(self->homa.poll_cycles = 0); EXPECT_EQ(EINTR, -homa_interest_wait(&interest, 0)); EXPECT_EQ(1, interest.blocked); @@ -233,16 +233,16 @@ TEST_F(homa_interest, homa_interest_wait__time_metrics) struct homa_interest interest; homa_interest_init_shared(&interest, &self->hsk); - IF_NO_STRIP(self->homa.poll_usecs = 0); + IF_NO_STRIP(self->homa.poll_cycles = 0); mock_set_clock_vals(1000, 1500, 3000, 3200, 0); - mock_ns = 4000; + mock_clock = 4000; unit_hook_register(notify_hook); hook_interest = &interest; hook_count = 4; EXPECT_EQ(0, -homa_interest_wait(&interest, 0)); - IF_NO_STRIP(EXPECT_EQ(700, homa_metrics_per_cpu()->poll_ns)); - IF_NO_STRIP(EXPECT_EQ(1500, homa_metrics_per_cpu()->blocked_ns)); + IF_NO_STRIP(EXPECT_EQ(700, homa_metrics_per_cpu()->poll_cycles)); + IF_NO_STRIP(EXPECT_EQ(1500, homa_metrics_per_cpu()->blocked_cycles)); homa_interest_unlink_shared(&interest); } @@ -285,8 +285,8 @@ TEST_F(homa_interest, homa_choose_interest__find_idle_core) homa_interest_init_shared(&interest3, &self->hsk); interest3.core = 3; - mock_ns = 5000; - self->homa.busy_ns = 1000; + mock_clock = 5000; + self->homa.busy_cycles = 1000; per_cpu(homa_offload_core, 1).last_active = 2000; per_cpu(homa_offload_core, 2).last_active = 3500; per_cpu(homa_offload_core, 3).last_active = 4100; @@ -308,8 +308,8 @@ TEST_F(homa_interest, homa_choose_interest__all_cores_busy) homa_interest_init_shared(&interest3, &self->hsk); interest3.core = 3; - mock_ns = 5000; - self->homa.busy_ns = 1000; + mock_clock = 5000; + self->homa.busy_cycles = 1000; per_cpu(homa_offload_core, 1).last_active = 4100; per_cpu(homa_offload_core, 2).last_active = 4001; per_cpu(homa_offload_core, 3).last_active = 4800; diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index bcb00f8d..8997f530 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -88,8 +88,8 @@ FIXTURE_SETUP(homa_offload) unit_log_clear(); /* Configure so core isn't considered too busy for bypasses. 
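The offload fixture values on the next lines are chosen so that the core just misses the "too busy" test: a core counts as busy for GRO-bypass purposes only if it last ran GRO within gro_busy_cycles of the current time, and 1000 - 400 = 600 exceeds the 500-cycle threshold. Assuming the check keeps the shape of its old ns version, it amounts to a sketch like:

    /* Sketch: the core is NOT busy here, so bypass is permitted.
     * mock_clock = 1000, last_gro = 400, gro_busy_cycles = 500:
     * 1000 - 400 = 600 >= 500.
     */
    busy = (homa_clock() - offload_core->last_gro) <
                    homa->gro_busy_cycles;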
*/ - mock_ns = 1000; - self->homa.gro_busy_ns = 500; + mock_clock = 1000; + self->homa.gro_busy_cycles = 500; cur_offload_core->last_gro = 400; } FIXTURE_TEARDOWN(homa_offload) @@ -518,8 +518,8 @@ TEST_F(homa_offload, homa_gro_receive__max_gro_skbs) TEST_F(homa_offload, homa_gro_gen2) { self->homa.gro_policy = HOMA_GRO_GEN2; - mock_ns = 1000; - self->homa.busy_ns = 100; + mock_clock = 1000; + self->homa.busy_cycles = 100; mock_set_core(5); atomic_set(&per_cpu(homa_offload_core, 6).softirq_backlog, 1); per_cpu(homa_offload_core, 6).last_gro = 0; @@ -566,8 +566,8 @@ TEST_F(homa_offload, homa_gro_gen3__basics) offload3->last_app_active = 4100; offload7->last_app_active = 3900; offload5->last_app_active = 2000; - mock_ns = 5000; - self->homa.busy_ns = 1000; + mock_clock = 5000; + self->homa.busy_cycles = 1000; homa_gro_complete(self->skb, 0); EXPECT_EQ(7, self->skb->hash - 32); @@ -584,8 +584,8 @@ TEST_F(homa_offload, homa_gro_gen3__stop_on_negative_core_id) offload_core->gen3_softirq_cores[2] = 5; per_cpu(homa_offload_core, 3).last_app_active = 4100; per_cpu(homa_offload_core, 5).last_app_active = 2000; - mock_ns = 5000; - self->homa.busy_ns = 1000; + mock_clock = 5000; + self->homa.busy_cycles = 1000; homa_gro_complete(self->skb, 0); EXPECT_EQ(3, self->skb->hash - 32); @@ -602,8 +602,8 @@ TEST_F(homa_offload, homa_gro_gen3__all_cores_busy_so_pick_first) per_cpu(homa_offload_core, 3).last_app_active = 4100; per_cpu(homa_offload_core, 7).last_app_active = 4001; per_cpu(homa_offload_core, 5).last_app_active = 4500; - mock_ns = 5000; - self->homa.busy_ns = 1000; + mock_clock = 5000; + self->homa.busy_cycles = 1000; homa_gro_complete(self->skb, 0); EXPECT_EQ(3, self->skb->hash - 32); diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 34d47b8a..c20ac626 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -80,8 +80,8 @@ FIXTURE_SETUP(homa_outgoing) self->server_id = 1235; homa_init(&self->homa, &mock_net); mock_set_homa(&self->homa); - mock_ns = 10000; - self->homa.pacer->ns_per_mbyte = 1000000; + mock_clock = 10000; + self->homa.pacer->cycles_per_mbyte = 1000000; self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; #ifndef __STRIP__ /* See strip.py */ self->homa.unsched_bytes = 10000; @@ -830,7 +830,7 @@ TEST_F(homa_outgoing, homa_xmit_data__below_throttle_min) unit_log_clear(); atomic64_set(&self->homa.pacer->link_idle_time, 11000); - self->homa.pacer->max_nic_queue_ns = 500; + self->homa.pacer->max_nic_queue_cycles = 500; self->homa.pacer->throttle_min_bytes = 250; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; homa_rpc_lock(crpc); @@ -852,7 +852,7 @@ TEST_F(homa_outgoing, homa_xmit_data__force) /* First, get an RPC on the throttled list. 
*/ atomic64_set(&self->homa.pacer->link_idle_time, 11000); - self->homa.pacer->max_nic_queue_ns = 3000; + self->homa.pacer->max_nic_queue_cycles = 3000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; homa_rpc_lock(crpc1); homa_xmit_data(crpc1, false); @@ -880,7 +880,7 @@ TEST_F(homa_outgoing, homa_xmit_data__throttle) unit_log_clear(); atomic64_set(&self->homa.pacer->link_idle_time, 11000); - self->homa.pacer->max_nic_queue_ns = 3000; + self->homa.pacer->max_nic_queue_cycles = 3000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; homa_rpc_lock(crpc); diff --git a/test/unit_homa_pacer.c b/test/unit_homa_pacer.c index b00a0a8d..e9cbb466 100644 --- a/test/unit_homa_pacer.c +++ b/test/unit_homa_pacer.c @@ -21,17 +21,17 @@ static void unmanage_hook(char *id) { homa_pacer_unmanage_rpc(hook_rpc); } -static u64 hook_exit_ns; +static u64 hook_exit_cycles; static struct homa_pacer *hook_pacer; static void exit_hook(char *id) { - mock_ns += mock_ns_tick; - if (mock_ns >= hook_exit_ns) + mock_clock += mock_clock_tick; + if (mock_clock >= hook_exit_cycles) hook_pacer->exit = true; } static void exit_idle_hook(char *id) { if (strcmp(id, "schedule") == 0) - unit_log_printf("; ", "time %llu", mock_ns); + unit_log_printf("; ", "time %llu", mock_clock); if (list_empty(&hook_pacer->throttled_rpcs)) hook_pacer->exit = true; } @@ -64,7 +64,7 @@ FIXTURE_SETUP(homa_pacer) self->server_id = 1235; homa_init(&self->homa, &mock_net); mock_set_homa(&self->homa); - self->homa.pacer->ns_per_mbyte = 1000000; + self->homa.pacer->cycles_per_mbyte = 1000000; self->homa.pacer->throttle_min_bytes = 0; #ifndef __STRIP__ /* See strip.py */ self->homa.pacer->fifo_fraction = 0; @@ -157,8 +157,8 @@ TEST_F(homa_pacer, homa_pacer_check_nic_q__success) homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; unit_log_clear(); atomic64_set(&self->homa.pacer->link_idle_time, 9000); - mock_ns = 8000; - self->homa.pacer->max_nic_queue_ns = 1000; + mock_clock = 8000; + self->homa.pacer->max_nic_queue_cycles = 1000; EXPECT_EQ(1, homa_pacer_check_nic_q(self->homa.pacer, crpc->msgout.packets, false)); EXPECT_EQ(9500, atomic64_read(&self->homa.pacer->link_idle_time)); @@ -174,8 +174,8 @@ TEST_F(homa_pacer, homa_pacer_check_nic_q__queue_full) homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; unit_log_clear(); atomic64_set(&self->homa.pacer->link_idle_time, 9000); - mock_ns = 7999; - self->homa.pacer->max_nic_queue_ns = 1000; + mock_clock = 7999; + self->homa.pacer->max_nic_queue_cycles = 1000; EXPECT_EQ(0, homa_pacer_check_nic_q(self->homa.pacer, crpc->msgout.packets, false)); EXPECT_EQ(9000, atomic64_read(&self->homa.pacer->link_idle_time)); @@ -191,8 +191,8 @@ TEST_F(homa_pacer, homa_pacer_check_nic_q__queue_full_but_force) homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; unit_log_clear(); atomic64_set(&self->homa.pacer->link_idle_time, 9000); - mock_ns = 7999; - self->homa.pacer->max_nic_queue_ns = 1000; + mock_clock = 7999; + self->homa.pacer->max_nic_queue_cycles = 1000; EXPECT_EQ(1, homa_pacer_check_nic_q(self->homa.pacer, crpc->msgout.packets, true)); EXPECT_EQ(9500, atomic64_read(&self->homa.pacer->link_idle_time)); @@ -210,14 +210,14 @@ TEST_F(homa_pacer, homa_pacer_check_nic_q__pacer_metrics) unit_log_clear(); atomic64_set(&self->homa.pacer->link_idle_time, 9000); self->homa.pacer->wake_time = 9800; - mock_ns = 10000; - self->homa.pacer->max_nic_queue_ns = 1000; + mock_clock = 10000; + self->homa.pacer->max_nic_queue_cycles = 1000; EXPECT_EQ(1, homa_pacer_check_nic_q(self->homa.pacer, crpc->msgout.packets, true)); 
EXPECT_EQ(10500, atomic64_read(&self->homa.pacer->link_idle_time)); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(500, homa_metrics_per_cpu()->pacer_bytes); - EXPECT_EQ(200, homa_metrics_per_cpu()->pacer_lost_ns); + EXPECT_EQ(200, homa_metrics_per_cpu()->pacer_lost_cycles); #endif /* See strip.py */ } TEST_F(homa_pacer, homa_pacer_check_nic_q__queue_empty) @@ -231,8 +231,8 @@ TEST_F(homa_pacer, homa_pacer_check_nic_q__queue_empty) homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; unit_log_clear(); atomic64_set(&self->homa.pacer->link_idle_time, 9000); - mock_ns = 10000; - self->homa.pacer->max_nic_queue_ns = 1000; + mock_clock = 10000; + self->homa.pacer->max_nic_queue_cycles = 1000; EXPECT_EQ(1, homa_pacer_check_nic_q(self->homa.pacer, crpc->msgout.packets, true)); EXPECT_EQ(10500, atomic64_read(&self->homa.pacer->link_idle_time)); @@ -242,10 +242,10 @@ TEST_F(homa_pacer, homa_pacer_main__exit) { unit_hook_register(exit_hook); hook_pacer = self->homa.pacer; - hook_exit_ns = 5000; - mock_ns_tick = 200; + hook_exit_cycles = 5000; + mock_clock_tick = 200; homa_pacer_main(self->homa.pacer); - EXPECT_TRUE(mock_ns >= 5000); + EXPECT_TRUE(mock_clock >= 5000); } TEST_F(homa_pacer, homa_pacer_main__xmit_data) { @@ -260,8 +260,8 @@ TEST_F(homa_pacer, homa_pacer_main__xmit_data) homa_pacer_manage_rpc(crpc1); homa_pacer_manage_rpc(crpc2); - self->homa.pacer->max_nic_queue_ns = 3000; - mock_ns_tick = 200; + self->homa.pacer->max_nic_queue_cycles = 3000; + mock_clock_tick = 200; unit_hook_register(exit_idle_hook); hook_pacer = self->homa.pacer; unit_log_clear(); @@ -293,11 +293,11 @@ TEST_F(homa_pacer, homa_pacer_main__rpc_arrives_while_sleeping) unit_hook_register(exit_hook); hook_pacer = self->homa.pacer; - hook_exit_ns = 5000; - mock_ns_tick = 200; + hook_exit_cycles = 5000; + mock_clock_tick = 200; unit_hook_register(manage_hook); hook_rpc = crpc; - self->homa.pacer->max_nic_queue_ns = 2000; + self->homa.pacer->max_nic_queue_cycles = 2000; unit_log_clear(); homa_pacer_main(self->homa.pacer); @@ -329,7 +329,7 @@ TEST_F(homa_pacer, homa_pacer_xmit__basics) homa_pacer_manage_rpc(crpc1); homa_pacer_manage_rpc(crpc2); homa_pacer_manage_rpc(crpc3); - self->homa.pacer->max_nic_queue_ns = 2000; + self->homa.pacer->max_nic_queue_cycles = 2000; unit_log_clear(); homa_pacer_xmit(self->homa.pacer); EXPECT_STREQ("xmit DATA 1400@0; xmit DATA 1400@1400", @@ -349,7 +349,7 @@ TEST_F(homa_pacer, homa_pacer_xmit__pacer_already_active) self->client_id, 10000, 1000); homa_pacer_manage_rpc(crpc); - self->homa.pacer->max_nic_queue_ns = 2000; + self->homa.pacer->max_nic_queue_cycles = 2000; mock_trylock_errors = 1; unit_log_clear(); homa_pacer_xmit(self->homa.pacer); @@ -367,8 +367,8 @@ TEST_F(homa_pacer, homa_pacer_xmit__nic_queue_fills) self->client_id, 10000, 1000); homa_pacer_manage_rpc(crpc); - self->homa.pacer->max_nic_queue_ns = 2001; - mock_ns = 10000; + self->homa.pacer->max_nic_queue_cycles = 2001; + mock_clock = 10000; atomic64_set(&self->homa.pacer->link_idle_time, 12000); unit_log_clear(); homa_pacer_xmit(self->homa.pacer); @@ -381,7 +381,7 @@ TEST_F(homa_pacer, homa_pacer_xmit__nic_queue_fills) } TEST_F(homa_pacer, homa_pacer_xmit__queue_empty) { - self->homa.pacer->max_nic_queue_ns = 2000; + self->homa.pacer->max_nic_queue_cycles = 2000; unit_log_clear(); homa_pacer_xmit(self->homa.pacer); unit_log_throttled(&self->homa); @@ -391,15 +391,15 @@ TEST_F(homa_pacer, homa_pacer_xmit__xmit_fifo) { struct homa_rpc *crpc1, *crpc2, *crpc3; - mock_ns = 10000; + mock_clock = 10000; crpc1 = 
unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 2, 20000, 1000); - mock_ns = 11000; + mock_clock = 11000; crpc2 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 4, 10000, 1000); - mock_ns = 12000; + mock_clock = 12000; crpc3 = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 6, 30000, 1000); @@ -408,10 +408,10 @@ TEST_F(homa_pacer, homa_pacer_xmit__xmit_fifo) homa_pacer_manage_rpc(crpc3); /* First attempt: pacer->fifo_count doesn't reach zero. */ - self->homa.pacer->max_nic_queue_ns = 1300; + self->homa.pacer->max_nic_queue_cycles = 1300; self->homa.pacer->fifo_count = 200; self->homa.pacer->fifo_fraction = 150; - mock_ns= 13000; + mock_clock= 13000; atomic64_set(&self->homa.pacer->link_idle_time, 10000); unit_log_clear(); mock_xmit_log_verbose = 1; @@ -447,7 +447,7 @@ TEST_F(homa_pacer, homa_pacer_xmit__rpc_removed_from_queue_before_locked) self->client_id, 10000, 1000); homa_pacer_manage_rpc(crpc); - self->homa.pacer->max_nic_queue_ns = 10000; + self->homa.pacer->max_nic_queue_cycles = 10000; unit_log_clear(); unit_hook_register(unmanage_hook); hook_rpc = crpc; @@ -468,7 +468,7 @@ TEST_F(homa_pacer, homa_pacer_xmit__rpc_locked) self->client_id, 5000, 1000); homa_pacer_manage_rpc(crpc); - self->homa.pacer->max_nic_queue_ns = 2000; + self->homa.pacer->max_nic_queue_cycles = 2000; unit_log_clear(); mock_trylock_errors = ~1; homa_pacer_xmit(self->homa.pacer); @@ -495,7 +495,7 @@ TEST_F(homa_pacer, homa_pacer_xmit__remove_from_queue) homa_pacer_manage_rpc(crpc1); homa_pacer_manage_rpc(crpc2); - self->homa.pacer->max_nic_queue_ns = 2000; + self->homa.pacer->max_nic_queue_cycles = 2000; unit_log_clear(); /* First call completes id 2, but id 4 is still in the queue. */ @@ -509,7 +509,7 @@ TEST_F(homa_pacer, homa_pacer_xmit__remove_from_queue) /* Second call completes id 4, queue now empty. 
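When the pacer owes a FIFO slot (pacer->fifo_count runs down to zero), it scans throttled_rpcs for the smallest msgout.init_time rather than taking the SRPT head, which is why the xmit_fifo test above stamps each RPC with a distinct mock_clock before creating it: crpc1 (init_time 10000) is the oldest and wins the slot even though crpc2 carries the shorter message. The selection loop, from the homa_pacer.c hunk earlier in this patch:

    /* Oldest outgoing message wins the FIFO slot. */
    list_for_each_entry(cur, &pacer->throttled_rpcs, throttled_links) {
            if (cur->msgout.init_time < oldest) {
                    rpc = cur;
                    oldest = cur->msgout.init_time;
            }
    }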
*/ unit_log_clear(); - self->homa.pacer->max_nic_queue_ns = 10000; + self->homa.pacer->max_nic_queue_cycles = 10000; homa_pacer_xmit(self->homa.pacer); EXPECT_STREQ("xmit DATA 600@1400; removing id 4 from throttled list", unit_log_get()); @@ -637,39 +637,41 @@ TEST_F(homa_pacer, homa_pacer_unmanage_rpc__metrics) self->server_ip, self->server_port, self->client_id+2, 5000, 1000); - mock_ns = 1000; + mock_clock = 1000; homa_pacer_manage_rpc(crpc1); EXPECT_EQ(1000, self->homa.pacer->throttle_add); - EXPECT_EQ(0, homa_metrics_per_cpu()->throttled_ns); + EXPECT_EQ(0, homa_metrics_per_cpu()->throttled_cycles); - mock_ns = 3000; + mock_clock = 3000; homa_pacer_manage_rpc(crpc2); EXPECT_EQ(3000, self->homa.pacer->throttle_add); - EXPECT_EQ(2000, homa_metrics_per_cpu()->throttled_ns); + EXPECT_EQ(2000, homa_metrics_per_cpu()->throttled_cycles); - mock_ns = 7000; + mock_clock = 7000; homa_pacer_unmanage_rpc(crpc1); EXPECT_EQ(3000, self->homa.pacer->throttle_add); - EXPECT_EQ(2000, homa_metrics_per_cpu()->throttled_ns); + EXPECT_EQ(2000, homa_metrics_per_cpu()->throttled_cycles); - mock_ns = 8000; + mock_clock = 8000; homa_pacer_unmanage_rpc(crpc2); EXPECT_EQ(3000, self->homa.pacer->throttle_add); - EXPECT_EQ(7000, homa_metrics_per_cpu()->throttled_ns); + EXPECT_EQ(7000, homa_metrics_per_cpu()->throttled_cycles); } #endif /* See strip.py */ TEST_F(homa_pacer, homa_pacer_update_sysctl_deps) { + self->homa.pacer->max_nic_queue_ns = 6000; self->homa.pacer->link_mbps = 10000; homa_pacer_update_sysctl_deps(self->homa.pacer); - EXPECT_EQ(808000, self->homa.pacer->ns_per_mbyte); + EXPECT_EQ(6000, self->homa.pacer->max_nic_queue_cycles); + EXPECT_EQ(808000, self->homa.pacer->cycles_per_mbyte); self->homa.pacer->link_mbps = 1000; homa_pacer_update_sysctl_deps(self->homa.pacer); - EXPECT_EQ(8080000, self->homa.pacer->ns_per_mbyte); + EXPECT_EQ(8080000, self->homa.pacer->cycles_per_mbyte); self->homa.pacer->link_mbps = 40000; homa_pacer_update_sysctl_deps(self->homa.pacer); - EXPECT_EQ(202000, self->homa.pacer->ns_per_mbyte); + EXPECT_EQ(202000, self->homa.pacer->cycles_per_mbyte); } \ No newline at end of file diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index 0e29b56c..a19e0460 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -78,7 +78,7 @@ static void peer_spinlock_hook(char *id) { if (strcmp(id, "spin_lock") != 0) return; - mock_ns += 1000; + mock_clock += 1000; } #endif /* See strip.py */ @@ -138,15 +138,15 @@ TEST_F(homa_peer, homa_peertab_gc_dsts) struct homa_peer *peer; peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); - mock_ns = 0; + mock_clock = 0; homa_dst_refresh(&self->peertab, peer, &self->hsk); - mock_ns = 50000000; + mock_clock = 50000000; homa_dst_refresh(&self->peertab, peer, &self->hsk); - mock_ns = 90000000; + mock_clock = 90000000; homa_dst_refresh(&self->peertab, peer, &self->hsk); EXPECT_EQ(3, dead_count(&self->peertab)); - homa_peertab_gc_dsts(&self->peertab, 110000000); + homa_peertab_gc_dsts(&self->peertab, 130000000); EXPECT_EQ(2, dead_count(&self->peertab)); homa_peertab_gc_dsts(&self->peertab, ~0); EXPECT_EQ(0, dead_count(&self->peertab)); @@ -336,11 +336,11 @@ TEST_F(homa_peer, homa_dst_refresh__free_old_dsts) ASSERT_NE(NULL, peer); EXPECT_EQ_IP(*ip1111, peer->addr); - mock_ns = 0; + mock_clock = 0; homa_dst_refresh(self->homa.peers, peer, &self->hsk); homa_dst_refresh(self->homa.peers, peer, &self->hsk); EXPECT_EQ(2, dead_count(self->homa.peers)); - mock_ns = 500000000; + mock_clock = 500000000; homa_dst_refresh(self->homa.peers, peer, 
&self->hsk); EXPECT_EQ(1, dead_count(self->homa.peers)); } @@ -409,17 +409,17 @@ TEST_F(homa_peer, homa_peer_lock_slow) &self->hsk.inet); ASSERT_NE(NULL, peer); - mock_ns = 10000; + mock_clock = 10000; homa_peer_lock(peer); EXPECT_EQ(0, homa_metrics_per_cpu()->peer_ack_lock_misses); - EXPECT_EQ(0, homa_metrics_per_cpu()->peer_ack_lock_miss_ns); + EXPECT_EQ(0, homa_metrics_per_cpu()->peer_ack_lock_miss_cycles); homa_peer_unlock(peer); mock_trylock_errors = 1; unit_hook_register(peer_spinlock_hook); homa_peer_lock(peer); EXPECT_EQ(1, homa_metrics_per_cpu()->peer_ack_lock_misses); - EXPECT_EQ(1000, homa_metrics_per_cpu()->peer_ack_lock_miss_ns); + EXPECT_EQ(1000, homa_metrics_per_cpu()->peer_ack_lock_miss_cycles); homa_peer_unlock(peer); } #endif /* See strip.py */ diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index ca9d2444..02d2a695 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -50,11 +50,11 @@ static void steal_bpages_hook(char *id) case 2: atomic_set(&cur_pool->descriptors[1].refs, 1); cur_pool->descriptors[1].owner = 3; - cur_pool->descriptors[1].expiration = mock_ns + 1; + cur_pool->descriptors[1].expiration = mock_clock + 1; case 3: atomic_set(&cur_pool->descriptors[2].refs, 1); cur_pool->descriptors[2].owner = 3; - cur_pool->descriptors[2].expiration = mock_ns - 1; + cur_pool->descriptors[2].expiration = mock_clock - 1; case 4: atomic_set(&cur_pool->descriptors[3].refs, 1); } @@ -215,14 +215,14 @@ TEST_F(homa_pool, homa_pool_get_pages__skip_unusable_bpages) struct homa_pool *pool = self->hsk.buffer_pool; u32 pages[10]; - mock_ns = 1000; + mock_clock = 1000; atomic_set(&pool->descriptors[0].refs, 2); atomic_set(&pool->descriptors[1].refs, 1); pool->descriptors[1].owner = 3; - pool->descriptors[1].expiration = mock_ns + 1; + pool->descriptors[1].expiration = mock_clock + 1; atomic_set(&pool->descriptors[2].refs, 1); pool->descriptors[2].owner = 3; - pool->descriptors[2].expiration = mock_ns - 1; + pool->descriptors[2].expiration = mock_clock - 1; atomic_set(&pool->descriptors[3].refs, 1); EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); EXPECT_EQ(2, pages[0]); @@ -233,7 +233,7 @@ TEST_F(homa_pool, homa_pool_get_pages__cant_lock_pages) struct homa_pool *pool = self->hsk.buffer_pool; u32 pages[10]; - mock_ns = 1000; + mock_clock = 1000; mock_trylock_errors = 3; EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); EXPECT_EQ(2, pages[0]); @@ -244,7 +244,7 @@ TEST_F(homa_pool, homa_pool_get_pages__state_changes_while_locking) struct homa_pool *pool = self->hsk.buffer_pool; u32 pages[10]; - mock_ns = 1000; + mock_clock = 1000; unit_hook_register(steal_bpages_hook); EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); EXPECT_EQ(2, pages[0]); @@ -256,8 +256,8 @@ TEST_F(homa_pool, homa_pool_get_pages__steal_expired_page) u32 pages[10]; pool->descriptors[0].owner = 5; - mock_ns = 5000; - pool->descriptors[0].expiration = mock_ns - 1; + mock_clock = 5000; + pool->descriptors[0].expiration = mock_clock - 1; atomic_set(&pool->free_bpages, 20); EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 0)); EXPECT_EQ(0, pages[0]); @@ -270,11 +270,11 @@ TEST_F(homa_pool, homa_pool_get_pages__set_owner) struct homa_pool *pool = self->hsk.buffer_pool; u32 pages[10]; - self->homa.bpage_lease_usecs = 1; - mock_ns = 5000; + self->homa.bpage_lease_cycles = 1000; + mock_clock = 5000; EXPECT_EQ(0, homa_pool_get_pages(pool, 2, pages, 1)); EXPECT_EQ(1, pool->descriptors[pages[0]].owner); - EXPECT_EQ(mock_ns + 1000, + EXPECT_EQ(mock_clock + 1000, 
pool->descriptors[pages[1]].expiration); EXPECT_EQ(2, atomic_read(&pool->descriptors[1].refs)); } diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index e6e03e0c..180b501d 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -308,7 +308,7 @@ TEST_F(homa_rpc, homa_bucket_lock_slow) struct homa_rpc *crpc, *srpc; int created; - mock_ns_tick = 10; + mock_clock_tick = 10; crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); ASSERT_FALSE(IS_ERR(crpc)); homa_rpc_end(crpc); @@ -319,17 +319,17 @@ TEST_F(homa_rpc, homa_bucket_lock_slow) homa_rpc_unlock(srpc); EXPECT_EQ(0, homa_metrics_per_cpu()->client_lock_misses); - EXPECT_EQ(0, homa_metrics_per_cpu()->client_lock_miss_ns); + EXPECT_EQ(0, homa_metrics_per_cpu()->client_lock_miss_cycles); homa_bucket_lock_slow(crpc->bucket, crpc->id); homa_rpc_unlock(crpc); EXPECT_EQ(1, homa_metrics_per_cpu()->client_lock_misses); - EXPECT_NE(0, homa_metrics_per_cpu()->client_lock_miss_ns); + EXPECT_NE(0, homa_metrics_per_cpu()->client_lock_miss_cycles); EXPECT_EQ(0, homa_metrics_per_cpu()->server_lock_misses); - EXPECT_EQ(0, homa_metrics_per_cpu()->server_lock_miss_ns); + EXPECT_EQ(0, homa_metrics_per_cpu()->server_lock_miss_cycles); homa_bucket_lock_slow(srpc->bucket, srpc->id); homa_rpc_unlock(srpc); EXPECT_EQ(1, homa_metrics_per_cpu()->server_lock_misses); - EXPECT_EQ(10, homa_metrics_per_cpu()->server_lock_miss_ns); + EXPECT_EQ(10, homa_metrics_per_cpu()->server_lock_miss_cycles); } #endif /* See strip.py */ @@ -725,7 +725,7 @@ TEST_F(homa_rpc, homa_rpc_reap__free_gaps) ASSERT_NE(NULL, crpc); homa_gap_alloc(&crpc->msgin.gaps, 1000, 2000); - mock_ns = 1000; + mock_clock = 1000; homa_gap_alloc(&crpc->msgin.gaps, 5000, 6000); EXPECT_STREQ("start 1000, end 2000; start 5000, end 6000, time 1000", diff --git a/test/unit_homa_skb.c b/test/unit_homa_skb.c index ccf5963c..fe26f084 100644 --- a/test/unit_homa_skb.c +++ b/test/unit_homa_skb.c @@ -349,20 +349,20 @@ TEST_F(homa_skb, homa_skb_page_alloc__new_large_page) { struct homa_skb_core *skb_core = get_skb_core(smp_processor_id()); - mock_ns_tick = 100; + mock_clock_tick = 100; EXPECT_EQ(0, skb_core->pool->avail); EXPECT_EQ(0, skb_core->num_stashed_pages); EXPECT_TRUE(homa_skb_page_alloc(&self->homa, skb_core)); EXPECT_NE(NULL, skb_core->skb_page); EXPECT_EQ(HOMA_SKB_PAGE_SIZE, skb_core->page_size); EXPECT_EQ(1, homa_metrics_per_cpu()->skb_page_allocs); - EXPECT_EQ(100, homa_metrics_per_cpu()->skb_page_alloc_ns); + EXPECT_EQ(100, homa_metrics_per_cpu()->skb_page_alloc_cycles); } TEST_F(homa_skb, homa_skb_page_alloc__high_order_page_not_available) { struct homa_skb_core *skb_core = get_skb_core(2); - mock_ns_tick = 50; + mock_clock_tick = 50; mock_alloc_page_errors = 1; EXPECT_TRUE(homa_skb_page_alloc(&self->homa, skb_core)); EXPECT_NE(NULL, skb_core->skb_page); @@ -370,7 +370,7 @@ TEST_F(homa_skb, homa_skb_page_alloc__high_order_page_not_available) EXPECT_EQ(PAGE_SIZE, skb_core->page_size); EXPECT_EQ(0, skb_core->page_inuse); EXPECT_EQ(1, homa_metrics_per_cpu()->skb_page_allocs); - EXPECT_EQ(50, homa_metrics_per_cpu()->skb_page_alloc_ns); + EXPECT_EQ(50, homa_metrics_per_cpu()->skb_page_alloc_cycles); } TEST_F(homa_skb, homa_skb_page_alloc__no_pages_available) { @@ -672,7 +672,7 @@ TEST_F(homa_skb, homa_skb_get) TEST_F(homa_skb, homa_skb_release_pages__basics) { EXPECT_EQ(0UL, self->homa.skb_page_free_time); - mock_ns = 1000000; + mock_clock = 1000000; self->homa.skb_page_free_time = 500000; self->homa.skb_page_frees_per_sec = 10; self->homa.skb_page_pool_min_kb = 0; @@ -689,7 +689,7 @@ 
TEST_F(homa_skb, homa_skb_release_pages__basics) TEST_F(homa_skb, homa_skb_release_pages__not_time_to_free) { EXPECT_EQ(0UL, self->homa.skb_page_free_time); - mock_ns = 1000000; + mock_clock = 1000000; self->homa.skb_page_free_time = 1000001; self->homa.skb_page_frees_per_sec = 10; self->homa.skb_page_pool_min_kb = 0; @@ -701,7 +701,7 @@ TEST_F(homa_skb, homa_skb_release_pages__not_time_to_free) TEST_F(homa_skb, homa_skb_release_pages__allocate_skb_pages_to_free) { EXPECT_EQ(0, self->homa.pages_to_free_slots); - mock_ns= 1000000; + mock_clock= 1000000; self->homa.skb_page_frees_per_sec = 10; self->homa.skb_page_free_time = 500000; @@ -720,7 +720,7 @@ TEST_F(homa_skb, homa_skb_release_pages__cant_reallocate_skb_pages_to_free) struct homa_page_pool *pool; EXPECT_EQ(0UL, self->homa.skb_page_free_time); - mock_ns = 1000000; + mock_clock = 1000000; self->homa.skb_page_free_time = 500000; self->homa.skb_page_frees_per_sec = 20; self->homa.skb_page_pool_min_kb = 0; @@ -741,7 +741,7 @@ TEST_F(homa_skb, homa_skb_release_pages__cant_reallocate_skb_pages_to_free) TEST_F(homa_skb, homa_skb_release_pages__limited_by_min_kb) { EXPECT_EQ(0UL, self->homa.skb_page_free_time); - mock_ns = 1000000; + mock_clock = 1000000; self->homa.skb_page_free_time = 500000; self->homa.skb_page_frees_per_sec = 20; self->homa.skb_page_pool_min_kb = (5 * HOMA_SKB_PAGE_SIZE) / 1000; @@ -754,7 +754,7 @@ TEST_F(homa_skb, homa_skb_release_pages__limited_by_min_kb) TEST_F(homa_skb, homa_skb_release_pages__empty_pool) { EXPECT_EQ(0UL, self->homa.skb_page_free_time); - mock_ns= 2000000; + mock_clock= 2000000; self->homa.skb_page_free_time = 500000; self->homa.skb_page_frees_per_sec = 1000; self->homa.skb_page_pool_min_kb = 0; diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c index cc37b5ad..5a7d8cdf 100644 --- a/test/unit_homa_sock.c +++ b/test/unit_homa_sock.c @@ -372,17 +372,17 @@ TEST_F(homa_sock, homa_sock_find__long_hash_chain) #ifndef __STRIP__ /* See strip.py */ TEST_F(homa_sock, homa_sock_lock_slow) { - mock_ns_tick = 100; + mock_clock_tick = 100; homa_sock_lock(&self->hsk); EXPECT_EQ(0, homa_metrics_per_cpu()->socket_lock_misses); - EXPECT_EQ(0, homa_metrics_per_cpu()->socket_lock_miss_ns); + EXPECT_EQ(0, homa_metrics_per_cpu()->socket_lock_miss_cycles); homa_sock_unlock(&self->hsk); mock_trylock_errors = 1; homa_sock_lock(&self->hsk); EXPECT_EQ(1, homa_metrics_per_cpu()->socket_lock_misses); - EXPECT_EQ(100, homa_metrics_per_cpu()->socket_lock_miss_ns); + EXPECT_EQ(100, homa_metrics_per_cpu()->socket_lock_miss_cycles); homa_sock_unlock(&self->hsk); } #endif /* See strip.py */ diff --git a/test/unit_timetrace.c b/test/unit_timetrace.c index ee7e64d2..99693220 100644 --- a/test/unit_timetrace.c +++ b/test/unit_timetrace.c @@ -16,7 +16,7 @@ FIXTURE_SETUP(timetrace) tt_buffer_size = 64; tt_test_no_khz = true; tt_init("tt"); - mock_cycles = 1000; + mock_tt_cycles = 1000; } FIXTURE_TEARDOWN(timetrace) { @@ -46,13 +46,13 @@ TEST_F(timetrace, tt_record__basics) memset(buffer, 0, sizeof(buffer)); tt_record("Message with no args"); - mock_cycles++; + mock_tt_cycles++; tt_record1("Message with 1 arg: %d", 99); - mock_cycles++; + mock_tt_cycles++; tt_record2("Message with 2 args: %d %d %d %d", 100, 200); - mock_cycles++; + mock_tt_cycles++; tt_record3("Message with 3 args: %d %d %d %d", 10, 20, 30); - mock_cycles++; + mock_tt_cycles++; tt_record4("Message with 4 args: %d %d %d %d", 1, 2, 3, 4); tt_proc_open(NULL, &self->file); tt_proc_read(&self->file, buffer, sizeof(buffer), 0); @@ -72,13 +72,13 @@ TEST_F(timetrace, 
tt_record_buf__wraparound) memset(buffer, 0, sizeof(buffer)); tt_buffer_size = 4; tt_record("Message 1"); - mock_cycles++; + mock_tt_cycles++; tt_record("Message 2"); - mock_cycles++; + mock_tt_cycles++; tt_record("Message 3"); - mock_cycles++; + mock_tt_cycles++; tt_record("Message 4"); - mock_cycles++; + mock_tt_cycles++; tt_record("Message 5"); tt_proc_open(NULL, &self->file); tt_proc_read(&self->file, buffer, sizeof(buffer), 0); diff --git a/timetrace.c b/timetrace.c index a47a6737..793234da 100644 --- a/timetrace.c +++ b/timetrace.c @@ -92,6 +92,10 @@ int tt_pf_storage = TT_PF_BUF_SIZE; /* Set during tests to disable "cpu_khz" line in trace output. */ bool tt_test_no_khz; +#ifdef __UNIT_TEST__ +unsigned int cpu_khz = 1000000; +#endif + #define MAX_IDS 10 #define MAX_CORES 50 static atomic_t id_counts[MAX_CORES][MAX_IDS]; @@ -913,8 +917,8 @@ void tt_inc_metric(int metric, u64 count) * for the legal values of metric. */ static int offsets[] = { - offsetof(struct homa_metrics, napi_ns), - offsetof(struct homa_metrics, linux_softirq_ns), + offsetof(struct homa_metrics, napi_cycles), + offsetof(struct homa_metrics, linux_softirq_cycles), offsetof(struct homa_metrics, linux_pkt_alloc_bytes), }; u64 *metric_addr = (u64 *)(((char *)homa_metrics_per_cpu()) diff --git a/timetrace.h b/timetrace.h index 9e95dc9a..0cef6744 100644 --- a/timetrace.h +++ b/timetrace.h @@ -8,7 +8,7 @@ #ifdef __UNIT_TEST__ #undef get_cycles #define get_cycles mock_get_cycles -cycles_t mock_get_cycles(void); +u64 mock_get_cycles(void); #endif /* __UNIT_TEST__ */ // Change 1 -> 0 in the following line to disable time tracing globally. @@ -131,16 +131,18 @@ extern s64 tt_debug_int64[100]; extern void *tt_debug_ptr[100]; /** - * tt_rdtsc(): return the current value of the fine-grain CPU cycle counter - * (accessed via the RDTSC instruction). - * Return: see above + * tt_get_cycles(): return the current value of the fine-grain CPU cycle + * counter. + * Return: see above. 
*/ -static inline u64 tt_rdtsc(void) +static inline u64 tt_get_cycles(void) { - u32 lo, hi; - - __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi)); - return (((u64)hi << 32) | lo); +#ifdef __UNIT_TEST__ + extern u64 mock_tt_cycles; + return mock_tt_cycles; +#else /* __UNIT_TEST__ */ + return get_cycles(); +#endif /* __UNIT_TEST__ */ } /* @@ -164,8 +166,8 @@ static inline void tt_record4(const char *format, u32 arg0, u32 arg1, u32 arg2, u32 arg3) { #if ENABLE_TIME_TRACE - tt_record_buf(tt_buffers[raw_smp_processor_id()], get_cycles(), format, - arg0, arg1, arg2, arg3); + tt_record_buf(tt_buffers[raw_smp_processor_id()], tt_get_cycles(), + format, arg0, arg1, arg2, arg3); #endif } @@ -173,32 +175,32 @@ static inline void tt_record3(const char *format, u32 arg0, u32 arg1, u32 arg2) { #if ENABLE_TIME_TRACE - tt_record_buf(tt_buffers[raw_smp_processor_id()], get_cycles(), format, - arg0, arg1, arg2, 0); + tt_record_buf(tt_buffers[raw_smp_processor_id()], tt_get_cycles(), + format, arg0, arg1, arg2, 0); #endif } static inline void tt_record2(const char *format, u32 arg0, u32 arg1) { #if ENABLE_TIME_TRACE - tt_record_buf(tt_buffers[raw_smp_processor_id()], get_cycles(), format, - arg0, arg1, 0, 0); + tt_record_buf(tt_buffers[raw_smp_processor_id()], tt_get_cycles(), + format, arg0, arg1, 0, 0); #endif } static inline void tt_record1(const char *format, u32 arg0) { #if ENABLE_TIME_TRACE - tt_record_buf(tt_buffers[raw_smp_processor_id()], get_cycles(), format, - arg0, 0, 0, 0); + tt_record_buf(tt_buffers[raw_smp_processor_id()], tt_get_cycles(), + format, arg0, 0, 0, 0); #endif } static inline void tt_record(const char *format) { #if ENABLE_TIME_TRACE - tt_record_buf(tt_buffers[raw_smp_processor_id()], get_cycles(), format, - 0, 0, 0, 0); + tt_record_buf(tt_buffers[raw_smp_processor_id()], tt_get_cycles(), + format, 0, 0, 0, 0); #endif } diff --git a/util/metrics.py b/util/metrics.py index dc2f8170..734e8370 100755 --- a/util/metrics.py +++ b/util/metrics.py @@ -115,7 +115,7 @@ def scale_number(number): # Sum all of the individual core counts for both the new and old data and # compute the difference in "deltas" for symbol in symbols: - if (symbol == "time_ns") or (symbol == "core"): + if (symbol == "time_cycles") or (symbol == "cpu_khz") or (symbol == "core"): # This symbol shouldn't be summed. 
continue total_cur = 0 @@ -134,21 +134,22 @@ def scale_number(number): elapsed_secs = 0 reaper_calls = 0 pad = "" +cpu_khz = float(cur[0]["cpu_khz"]) if len(prev) > 0: - time_delta = cur[0]["time_ns"] - prev[0]["time_ns"] - elapsed_secs = float(time_delta)*1e-09 + time_delta = cur[0]["time_cycles"] - prev[0]["time_cycles"] + elapsed_secs = float(time_delta)/(cpu_khz * 1000.0) pad = pad.ljust(13) secs = "(%.1f s)" % (elapsed_secs) secs = secs.ljust(12) - print("%-28s %15d %s %s" % ("time_ns", time_delta, secs, - docs["time_ns"])) + print("%-28s %15d %s %s" % ("time_cycles", time_delta, secs, + docs["time_cycles"])) else: - print("%-15s %28d %s%s" % ("time_ns", cur[0]["time_ns"], - "", docs["time_ns"])) + print("%-15s %28d %s%s" % ("time_cycles", cur[0]["time_cycles"], + "", docs["time_cycles"])) for symbol in symbols: - if (symbol == "time_ns"): + if (symbol == "time_cycles") or (symbol == "cpu_khz"): # This symbol is handled specially above continue delta = deltas[symbol] @@ -160,7 +161,7 @@ def scale_number(number): rate_info = ("(%s/s) " % (scale_number(rate))).ljust(13) if ("msg_bytes" in symbol) and (symbol != "sent_msg_bytes"): total_received_bytes += delta - if symbol.endswith("_ns") and (time_delta != 0): + if symbol.endswith("_cycles") and (time_delta != 0): percent = "(%.1f%%)" % (100.0*delta/time_delta) percent = percent.ljust(12) print("%-28s %15d %s %s" % (symbol, delta, percent, doc)) @@ -181,10 +182,10 @@ def scale_number(number): if (symbol == "reaper_dead_skbs") and ("reaper_calls" in deltas): print("%-28s %6.1f %sAvg. hsk->dead_skbs in reaper" % ( "avg_dead_skbs", delta/deltas["reaper_calls"], pad)) - if symbol.endswith("_miss_ns") and (time_delta != 0): + if symbol.endswith("_miss_cycles") and (time_delta != 0): prefix = symbol[:-12] if ((prefix + "_misses") in deltas) and (deltas[prefix + "_misses"] != 0): - ns = (delta/deltas[prefix + "_misses"]) + ns = (delta/deltas[prefix + "_misses"])/(cpu_khz * 1e-06) print("%-28s %6.1f %sAvg. 
wait time per %s miss (ns)" % ( prefix + "_miss_delay", ns, pad, prefix)) if (symbol == "large_msg_bytes") and (total_received_bytes != 0) \ @@ -223,9 +224,9 @@ def scale_number(number): for where in ["napi", "softirq", "send", "recv", "reply", "timer", "pacer"]: if where == "softirq": - symbol = "linux_softirq_ns" + symbol = "linux_softirq_cycles" else: - symbol = where + "_ns" + symbol = where + "_cycles" line = "%-10s " % (where) for core in range(first_core, end_core): frac = float(cur[core][symbol] - prev[core][symbol]) / float( @@ -262,7 +263,7 @@ def scale_number(number): total_cores_used = 0.0 total_syscalls = 0 - time = float(deltas["send_ns"]) + time = float(deltas["send_cycles"]) cores = time/time_delta total_cores_used += cores calls = float(deltas["send_calls"]) @@ -270,10 +271,10 @@ def scale_number(number): if calls == 0: us_per = 0 else: - us_per = (time/calls)/1000 + us_per = (time/calls)/(cpu_khz/1e03) print("send syscall %6.2f %7.2f us/syscall" % (cores, us_per)) - time = float(deltas["recv_ns"]) - float(deltas["poll_ns"]) + time = float(deltas["recv_cycles"]) - float(deltas["poll_cycles"]) cores = time/time_delta total_cores_used += cores calls = float(deltas["recv_calls"]) @@ -281,10 +282,10 @@ def scale_number(number): if calls == 0: us_per = 0 else: - us_per = (time/calls)/1000 + us_per = (time/calls)/(cpu_khz/1e03) print("recv syscall (-poll) %6.2f %7.2f us/syscall" % (cores, us_per)) - time = float(deltas["reply_ns"]) + time = float(deltas["reply_cycles"]) cores = time/time_delta total_cores_used += cores calls = float(deltas["reply_calls"]) @@ -292,13 +293,13 @@ def scale_number(number): if calls == 0: us_per = 0 else: - us_per = (time/calls)/1000 + us_per = (time/calls)/(cpu_khz/1e03) print("reply syscall %6.2f %7.2f us/syscall" % (cores, us_per)) - for print_name, symbol in [["NAPI", "napi_ns"], - [" Bypass homa_softirq", "bypass_softirq_ns"], - ["Linux SoftIRQ", "linux_softirq_ns"], - [" Normal homa_softirq", "softirq_ns"]]: + for print_name, symbol in [["NAPI", "napi_cycles"], + [" Bypass homa_softirq", "bypass_softirq_cycles"], + ["Linux SoftIRQ", "linux_softirq_cycles"], + [" Normal homa_softirq", "softirq_cycles"]]: cpu_time = float(deltas[symbol]) cores = cpu_time/time_delta if packets_received > 0: @@ -306,17 +307,17 @@ def scale_number(number): cores, (cpu_time/packets_received) / 1000)) else: print("%s %6.2f" % (print_name.ljust(22), cores)) - cpu_time = float(deltas["napi_ns"]) + cpu_time = float(deltas["napi_cycles"]) if cpu_time == 0: - cpu_time = float(deltas["bypass_softirq_ns"]) + cpu_time = float(deltas["bypass_softirq_cycles"]) total_cores_used += cpu_time/time_delta - cpu_time = float(deltas["linux_softirq_ns"]) + cpu_time = float(deltas["linux_softirq_cycles"]) if cpu_time == 0: - cpu_time = float(deltas["softirq_ns"]) + cpu_time = float(deltas["softirq_cycles"]) total_cores_used += cpu_time/time_delta - for print_name, symbol in [["Pacer", "pacer_ns"], - ["Timer handler", "timer_ns"]]: + for print_name, symbol in [["Pacer", "pacer_cycles"], + ["Timer handler", "timer_cycles"]]: cpu_time = float(deltas[symbol]) cores = cpu_time/time_delta total_cores_used += cores @@ -325,7 +326,7 @@ def scale_number(number): print("----------------------------------") print("Total Core Utilization %6.2f" % (total_cores_used)) - time = float(deltas["poll_ns"]) + time = float(deltas["poll_cycles"]) cores = time/time_delta calls = float(deltas["recv_calls"]) if calls == 0: @@ -339,31 +340,31 @@ def scale_number(number): if calls == 0: us_per = 0 else: - us_per = 
(deltas["skb_alloc_ns"]/calls)/1000 + us_per = (deltas["skb_alloc_cycles"]/calls)/1000 print("Skb allocation %6.2f %7.2f us/skb" % ( - deltas["skb_alloc_ns"]/time_delta, us_per)) + deltas["skb_alloc_cycles"]/time_delta, us_per)) calls = deltas["skb_frees"] if calls == 0: us_per = 0 else: - us_per = (deltas["skb_free_ns"]/calls)/1000 + us_per = (deltas["skb_free_cycles"]/calls)/1000 print("Skb freeing %6.2f %7.2f us/skb" % ( - deltas["skb_free_ns"]/time_delta, us_per)) + deltas["skb_free_cycles"]/time_delta, us_per)) print("\nLock Misses:") print("------------") print(" Misses/sec. ns/Miss %CPU") for lock in ["client", "server", "socket", "grant", "throttle", "peer_ack"]: misses = float(deltas[lock + "_lock_misses"]) - ns = float(deltas[lock + "_lock_miss_ns"]) + cycles = float(deltas[lock + "_lock_miss_cycles"]) if misses == 0: - ns_per_miss = 0.0 + cycles_per_miss = 0.0 else: - ns_per_miss = ns/misses + cycles_per_miss = cycles/misses print("%-10s %s %6.1f %5.1f" % (lock, scale_number(misses/elapsed_secs), - ns_per_miss, 100.0*ns/time_delta)) + cycles_per_miss/(cpu_khz/1e06), 100.0*cycles/time_delta)) total_messages = float(deltas["requests_received"] + deltas["responses_received"]) @@ -417,21 +418,21 @@ def scale_number(number): print(" %5.2f Gbps/core (goodput)" % ( 8e-9*(total_received_bytes + float(deltas["sent_msg_bytes"])) /(total_cores_used * elapsed_secs))) - if deltas["pacer_ns"] != 0: - pacer_secs = float(deltas["pacer_ns"])/1000 + if deltas["pacer_cycles"] != 0: + pacer_secs = float(deltas["pacer_cycles"])/(cpu_khz * 1000.0) print("Pacer throughput: %6.2f Gbps (pacer output when pacer running)" % ( deltas["pacer_bytes"]*8e-09/pacer_secs)) - if deltas["throttled_ns"] != 0: - throttled_secs = float(deltas["throttled_ns"])/1000 + if deltas["throttled_cycles"] != 0: + throttled_secs = float(deltas["throttled_cycles"])/(cpu_khz * 1000.0) print("Throttled throughput: %5.2f Gbps (pacer output when throttled)" % ( deltas["pacer_bytes"]*8e-09/throttled_secs)) if deltas["skb_allocs"] != 0: print("Skb alloc time: %4.2f usec/skb" % ( - float(deltas["skb_alloc_ns"]) / 1000 / + float(deltas["skb_alloc_cycles"]) / (cpu_khz / 1000.0) / deltas["skb_allocs"])) if deltas["skb_page_allocs"] != 0: print("Skb page alloc time: %5.2f usec/skb" % ( - float(deltas["skb_page_alloc_ns"]) / 1000 / + float(deltas["skb_page_alloc_cycles"]) / (cpu_khz / 1000.0) / deltas["skb_page_allocs"])) print("\nCanaries (possible problem indicators):") @@ -450,8 +451,8 @@ def scale_number(number): rate_info = ("(%s/s) " % (scale_number(rate))).ljust(13) print("%-28s %15d %s%s" % (symbol, deltas[symbol], rate_info, docs[symbol])) - for symbol in ["pacer_lost_ns", "timer_reap_ns", "data_pkt_reap_ns", - "grant_lock_ns"]: + for symbol in ["pacer_lost_cycles", "timer_reap_cycles", + "data_pkt_reap_cycles", "grant_lock_cycles"]: delta = deltas[symbol] if delta == 0 or time_delta == 0: continue @@ -466,7 +467,7 @@ def scale_number(number): if deltas["responses_received"] > 0: print("%-28s %15.1f ACK packets sent per 1000 client RPCs" - % ("acks_per_rpc", 1000.0 * deltas["packets_sent_ACK"] + % ("acks_per_krpc", 1000.0 * deltas["packets_sent_ACK"] / deltas["responses_received"])) if avg_grantable_rpcs > 1.0: From 23265f2d044b56e9107a4a8d110853f94b1745df Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 9 May 2025 15:57:52 -0700 Subject: [PATCH 306/625] Automatically include kselftest_harness.h in Homa files --- homa_devel.h | 7 +++++++ homa_sock.c | 5 ----- test/main.c | 1 + test/mock.c | 3 --- 4 files changed, 8 
From 23265f2d044b56e9107a4a8d110853f94b1745df Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 9 May 2025 15:57:52 -0700
Subject: [PATCH 306/625] Automatically include kselftest_harness.h in Homa
 files

---
 homa_devel.h | 7 +++++++
 homa_sock.c  | 5 -----
 test/main.c  | 1 +
 test/mock.c  | 3 ---
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/homa_devel.h b/homa_devel.h
index 8e0638ae..9b2e6d76 100644
--- a/homa_devel.h
+++ b/homa_devel.h
@@ -7,6 +7,13 @@
 #ifndef _HOMA_DEVEL_H
 #define _HOMA_DEVEL_H
 
+#ifdef __UNIT_TEST__
+#ifndef __NO_KSELFTEST__
+#define KSELFTEST_NOT_MAIN 1
+#include "kselftest_harness.h"
+#endif /* __NO_KSELFTEST__ */
+#endif /* __UNIT_TEST__ */
+
 #include "timetrace.h"
 
 #ifdef __STRIP__
diff --git a/homa_sock.c b/homa_sock.c
index 72ed9167..74c4ef40 100644
--- a/homa_sock.c
+++ b/homa_sock.c
@@ -11,11 +11,6 @@
 #include "homa_grant.h"
 #endif /* See strip.py */
 
-#ifdef __UNIT_TEST__
-#define KSELFTEST_NOT_MAIN 1
-#include "test/kselftest_harness.h"
-#endif /* __UNIT_TEST__ */
-
 /**
  * homa_socktab_init() - Constructor for homa_socktabs.
  * @socktab:  The object to initialize; previous contents are discarded.
diff --git a/test/main.c b/test/main.c
index 73ae9c30..1196faae 100644
--- a/test/main.c
+++ b/test/main.c
@@ -2,6 +2,7 @@
 
 /* Main program for running Homa unit tests. */
 
+#define __NO_KSELFTEST__ 1
 #include "homa_impl.h"
 #include "kselftest_harness.h"
 #include "mock.h"
diff --git a/test/mock.c b/test/mock.c
index 53e5408d..8e977f85 100644
--- a/test/mock.c
+++ b/test/mock.c
@@ -14,9 +14,6 @@
 #include "mock.h"
 #include "utils.h"
 
-#define KSELFTEST_NOT_MAIN 1
-#include "kselftest_harness.h"
-
 /* It isn't safe to include some header files, such as stdlib, because
  * they conflict with kernel header files. The explicit declarations
  * below replace those header files.

From e3be255fae56b3139a270d823764cf5b5bcf58c0 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 9 May 2025 16:01:36 -0700
Subject: [PATCH 307/625] Introduce struct homa_shared

Right now it contains nothing except a list of struct homas.
---
 homa_impl.h            | 33 ++++++++++++++++++++++--
 homa_utils.c           | 57 ++++++++++++++++++++++++++++++++++++++++++
 test/unit_homa_utils.c | 26 +++++++++++++++++++
 3 files changed, 114 insertions(+), 2 deletions(-)

diff --git a/homa_impl.h b/homa_impl.h
index e831eb25..9d56736f 100644
--- a/homa_impl.h
+++ b/homa_impl.h
@@ -71,6 +71,7 @@
 struct homa;
 struct homa_peer;
 struct homa_rpc;
 struct homa_sock;
+struct homa_shared;
 
 #ifndef __STRIP__ /* See strip.py */
 #include "timetrace.h"
@@ -102,10 +103,17 @@ union sockaddr_in_union {
 };
 
 /**
- * struct homa - Stores overall information about an implementation of
- * the Homa transport. One of these objects exists for each network namespace.
+ * struct homa - Stores overall information about the implementation of
+ * Homa for a particular network namespace (there is a logically separate
+ * implementation of Homa for each namespace).
  */
 struct homa {
+	/** @shared: information shared across all struct homas. */
+	struct homa_shared *shared;
+
+	/** @shared_links: used to link this struct into shared->homas. */
+	struct list_head shared_links;
+
 	/**
	 * @next_outgoing_id: Id to use for next outgoing RPC request.
	 * This is always even: it's used only to generate client-side ids.
@@ -503,6 +511,24 @@ struct homa {
 #endif /* See strip.py */
 };
 
+/**
+ * struct homa_shared - Contains "global" information that is shared
+ * across all instances of struct homa.
+ */
+struct homa_shared {
+	/**
+	 * @lock: used when exclusive access is needed, such as when
+	 * updating @homas.
+	 */
+	spinlock_t lock;
+
+	/**
+	 * @homas: contains all of the existing struct homas, linked
+	 * through their shared_links fields. Managed with RCU.
+	 */
+	struct list_head homas;
+};
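Because @homas is documented as managed with RCU, a reader would traverse the list inside an RCU read-side critical section while writers serialize on @lock; a minimal sketch (editor's illustration, not part of this patch):

    struct homa *h;

    rcu_read_lock();
    list_for_each_entry_rcu(h, &homa_shared->homas, shared_links) {
            /* Examine h; don't retain the pointer past rcu_read_unlock(). */
    }
    rcu_read_unlock();

The writers added in homa_init and homa_destroy below update the list while holding shared->lock.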
+
 /**
  * struct homa_skb_info - Additional information needed by Homa for each
  * outbound DATA packet. Space is allocated for this at the very end of the
@@ -655,6 +681,7 @@ void unit_hook(char *id);
 #endif /* See strip.py */
 
 extern unsigned int homa_net_id;
+extern struct homa_shared *homa_shared;
 
 void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk,
		  struct homa_rpc *rpc);
@@ -698,6 +725,8 @@ void homa_rpc_handoff(struct homa_rpc *rpc);
 int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
 int homa_setsockopt(struct sock *sk, int level, int optname,
		    sockptr_t optval, unsigned int optlen);
+struct homa_shared *homa_shared_alloc(void);
+void homa_shared_free(struct homa_shared *shared);
 int homa_shutdown(struct socket *sock, int how);
 int homa_softirq(struct sk_buff *skb);
 void homa_spin(int ns);
diff --git a/homa_utils.c b/homa_utils.c
index 781cb66f..78b92f68 100644
--- a/homa_utils.c
+++ b/homa_utils.c
@@ -17,6 +17,38 @@
 #include "homa_stub.h"
 #endif /* See strip.py */
 
+/* Pointer to the singleton homa_shared object, or NULL if there are
+ * currently no struct homa objects in existence.
+ */
+struct homa_shared *homa_shared;
+
+/**
+ * homa_shared_alloc() - Allocate and initialize a new homa_shared
+ * object.
+ * Return: the new homa_shared object, or NULL if memory allocation failed.
+ */
+struct homa_shared *homa_shared_alloc(void)
+{
+	struct homa_shared *shared;
+
+	shared = kmalloc(sizeof(*shared), GFP_KERNEL);
+	if (!shared)
+		return NULL;
+	spin_lock_init(&shared->lock);
+	INIT_LIST_HEAD(&shared->homas);
+	return shared;
+}
+
+/**
+ * homa_shared_free() - Clean up and free a homa_shared object.
+ */
+void homa_shared_free(struct homa_shared *shared)
+{
+	kfree(shared);
+	if (shared == homa_shared)
+		homa_shared = NULL;
+}
+
 /**
  * homa_init() - Constructor for homa objects.
  * @homa:   Object to initialize.
@@ -37,6 +69,17 @@ int homa_init(struct homa *homa, struct net *net)
 #endif /* See strip.py */
 
 	memset(homa, 0, sizeof(*homa));
+
+	if (!homa_shared) {
+		homa_shared = homa_shared_alloc();
+		if (!homa_shared)
+			return -ENOMEM;
+	}
+	homa->shared = homa_shared;
+	spin_lock_bh(&homa_shared->lock);
+	list_add_tail(&homa->shared_links, &homa_shared->homas);
+	spin_unlock_bh(&homa_shared->lock);
+
 	atomic64_set(&homa->next_outgoing_id, 2);
 #ifndef __STRIP__ /* See strip.py */
 	homa->grant = homa_grant_alloc(net);
@@ -137,6 +180,20 @@ void homa_destroy(struct homa *homa)
 	unit_homa_destroy(homa);
 #endif /* __UNIT_TEST__ */
 
+	if (homa->shared) {
+		struct homa_shared *shared = homa->shared;
+
+		spin_lock_bh(&shared->lock);
+		__list_del_entry(&homa->shared_links);
+		if (list_empty(&homa->shared->homas)) {
+			spin_unlock_bh(&shared->lock);
+			homa_shared_free(homa->shared);
+		} else {
+			spin_unlock_bh(&shared->lock);
+		}
+		homa->shared = NULL;
+	}
+
 	/* The order of the following statements matters!
	 */
 	if (homa->port_map) {
 		homa_socktab_destroy(homa->port_map);
diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c
index e66c3ee4..99b9ef94 100644
--- a/test/unit_homa_utils.c
+++ b/test/unit_homa_utils.c
@@ -53,6 +53,32 @@ static void set_cutoffs(struct homa *homa, int c0, int c1, int c2,
 }
 #endif /* See strip.py */
 
+TEST_F(homa_utils, homa_shared_alloc__kmalloc_failure)
+{
+	mock_kmalloc_errors = 1;
+	EXPECT_EQ(NULL, homa_shared_alloc());
+}
+TEST_F(homa_utils, homa_shared_alloc__success)
+{
+	struct homa_shared *shared;
+
+	shared = homa_shared_alloc();
+	EXPECT_NE(NULL, shared);
+	EXPECT_EQ(1, list_empty(&shared->homas));
+	homa_shared_free(shared);
+}
+
+TEST_F(homa_utils, homa_shared_free__clear_global_variable)
+{
+	struct homa_shared *saved;
+
+	saved = homa_shared;
+	homa_shared = homa_shared_alloc();
+	homa_shared_free(homa_shared);
+	EXPECT_EQ(NULL, homa_shared);
+	homa_shared = saved;
+}
+
 TEST_F(homa_utils, homa_init__kmalloc_failure_for_port_map)
 {
 	struct homa homa2;

From 9902087161bc3c265496c851b8ecf11064d99cf1 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Sat, 10 May 2025 16:17:21 -0700
Subject: [PATCH 308/625] Add reference counts to homa_peers

This is in preparation for releasing peers if they are using too much
memory.
---
 homa_incoming.c           |  2 ++
 homa_outgoing.c           |  3 ++-
 homa_peer.c               | 23 ++++++++++++++++----
 homa_peer.h               | 27 ++++++++++++++++++++++++
 homa_rpc.c                | 32 ++++++++++++++++++----------
 homa_rpc.h                |  1 +
 test/unit_homa_incoming.c |  3 +++
 test/unit_homa_outgoing.c | 44 +++++++++++++++++++++++----------------
 test/unit_homa_peer.c     | 31 ++++++++++++++++++++++-----
 test/unit_homa_plumbing.c |  4 +++-
 test/unit_homa_rpc.c      | 16 ++++++++++++++
 11 files changed, 147 insertions(+), 40 deletions(-)

diff --git a/homa_incoming.c b/homa_incoming.c
index 38511c93..b89dc364 100644
--- a/homa_incoming.c
+++ b/homa_incoming.c
@@ -897,6 +897,7 @@ void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk)
 		for (i = 1; i < HOMA_MAX_PRIORITIES; i++)
 			peer->unsched_cutoffs[i] = ntohl(h->unsched_cutoffs[i]);
 		peer->cutoff_version = h->cutoff_version;
+		homa_peer_put(peer);
 	}
 	kfree_skb(skb);
 }
@@ -961,6 +962,7 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk,
 	__homa_xmit_control(&ack, sizeof(ack), peer, hsk);
 	tt_record3("Responded to NEED_ACK for id %d, peer %0x%x with %d other acks",
		   id, tt_addr(saddr), ntohs(ack.num_acks));
+	homa_peer_put(peer);
 
 done:
 	kfree_skb(skb);
diff --git a/homa_outgoing.c b/homa_outgoing.c
index 27538844..a4059fcc 100644
--- a/homa_outgoing.c
+++ b/homa_outgoing.c
@@ -576,6 +576,8 @@ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk)
 	peer = homa_peer_find(hsk->homa->peers, &saddr, &hsk->inet);
-	if (!IS_ERR(peer))
+	if (!IS_ERR(peer)) {
 		__homa_xmit_control(&unknown, sizeof(unknown), peer, hsk);
+		homa_peer_put(peer);
+	}
 }
 
 /**
diff --git a/homa_peer.c b/homa_peer.c
index fbbd3453..fa08684e 100644
--- a/homa_peer.c
+++ b/homa_peer.c
@@ -53,6 +53,17 @@ void homa_peertab_destroy(struct homa_peertab *peertab)
 	for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) {
 		hlist_for_each_entry_safe(peer, next, &peertab->buckets[i],
					  peertab_links) {
+			if (atomic_read(&peer->refs) != 0)
+#ifdef __UNIT_TEST__
+				FAIL(" %s found peer %s with reference count %d",
+				     __func__,
+				     homa_print_ipv6_addr(&peer->addr),
+				     atomic_read(&peer->refs));
+
+#else /* __UNIT_TEST__ */
+				pr_err("%s found peer with reference count %d",
+				       __func__, atomic_read(&peer->refs));
+#endif
 			dst_release(peer->dst);
 			kfree(peer);
 		}
@@ -160,9 +171,9 @@ void homa_peertab_gc_dsts(struct homa_peertab *peertab, u64 now)
  * @inet:    Socket that will be used for sending packets.
  *
  * Return:   The peer associated with @addr, or a negative errno if an
- *           error occurred. The caller can retain this pointer
- *           indefinitely: peer entries are never deleted except in
- *           homa_peertab_destroy.
+ *           error occurred. On a successful return the reference count
+ *           will be incremented for the returned peer. The caller must
+ *           eventually call homa_peer_put to release the reference.
  */
 struct homa_peer *homa_peer_find(struct homa_peertab *peertab,
				 const struct in6_addr *addr,
@@ -191,6 +202,7 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab,
 	hlist_for_each_entry_rcu(peer, &peertab->buckets[bucket],
				 peertab_links) {
 		if (ipv6_addr_equal(&peer->addr, addr)) {
+			homa_peer_hold(peer);
 			rcu_read_unlock();
 			return peer;
 		}
@@ -207,8 +219,10 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab,
 	spin_lock_bh(&peertab->write_lock);
 	hlist_for_each_entry(peer, &peertab->buckets[bucket],
			     peertab_links) {
-		if (ipv6_addr_equal(&peer->addr, addr))
+		if (ipv6_addr_equal(&peer->addr, addr)) {
+			homa_peer_hold(peer);
 			goto done;
+		}
 	}
 	peer = kmalloc(sizeof(*peer), GFP_ATOMIC | __GFP_ZERO);
 	if (!peer) {
@@ -216,6 +230,7 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab,
 		INC_METRIC(peer_kmalloc_errors, 1);
 		goto done;
 	}
+	atomic_set(&peer->refs, 1);
 	peer->addr = *addr;
 	dst = homa_peer_get_dst(peer, inet);
 	if (IS_ERR(dst)) {
diff --git a/homa_peer.h b/homa_peer.h
index f2fe6458..980355d4 100644
--- a/homa_peer.h
+++ b/homa_peer.h
@@ -75,6 +75,12 @@ struct homa_peertab {
  * have communicated with (either as client or server).
  */
 struct homa_peer {
+	/**
+	 * @refs: Number of unmatched calls to homa_peer_hold; it's not safe
+	 * to free this object until the reference count is zero.
+	 */
+	atomic_t refs;
+
 	/**
	 * @addr: IPv6 address for the machine (IPv4 addresses are stored
	 * as IPv4-mapped IPv6 addresses).
@@ -277,4 +283,25 @@ static inline struct dst_entry *homa_get_dst(struct homa_peer *peer,
 	return peer->dst;
 }
 
+/**
+ * homa_peer_hold() - Increment the reference count on a peer, which will
+ * prevent it from being freed until homa_peer_put() is called.
+ * @peer: Object on which to take a reference.
+ */
+static inline void homa_peer_hold(struct homa_peer *peer)
+{
+	atomic_inc(&peer->refs);
+}
+
+/**
+ * homa_peer_put() - Release a reference on a peer (cancels the effect of
+ * a previous call to homa_peer_hold). If the reference count becomes zero
+ * then the peer may be deleted at any time.
+ * @peer: Object to release.
+ */
+static inline void homa_peer_put(struct homa_peer *peer)
+{
+	atomic_dec(&peer->refs);
+}
+
 #endif /* _HOMA_PEER_H */
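The hold/put pair above gives callers of homa_peer_find a simple discipline, which the call sites changed below follow: every successful lookup must eventually be balanced by homa_peer_put. A minimal caller sketch (editor's illustration mirroring those call sites, not code from the patch):

    struct homa_peer *peer;

    peer = homa_peer_find(hsk->homa->peers, &saddr, &hsk->inet);
    if (IS_ERR(peer))
            return PTR_ERR(peer);
    /* ... use peer; it can't be freed while this reference is held ... */
    homa_peer_put(peer);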
diff --git a/homa_rpc.c b/homa_rpc.c
index 57a5fa84..7f334a42 100644
--- a/homa_rpc.c
+++ b/homa_rpc.c
@@ -51,6 +51,7 @@ struct homa_rpc *homa_rpc_alloc_client(struct homa_sock *hsk,
 	if (IS_ERR(crpc->peer)) {
 		tt_record("error in homa_peer_find");
 		err = PTR_ERR(crpc->peer);
+		crpc->peer = NULL;
 		goto error;
 	}
 	crpc->dport = ntohs(dest->in6.sin6_port);
@@ -89,6 +90,8 @@ struct homa_rpc *homa_rpc_alloc_client(struct homa_sock *hsk,
 	return crpc;
 
 error:
+	if (crpc->peer)
+		homa_peer_put(crpc->peer);
 	kfree(crpc);
 	return ERR_PTR(err);
 }
@@ -148,6 +151,7 @@ struct homa_rpc *homa_rpc_alloc_server(struct homa_sock *hsk,
 	srpc->peer = homa_peer_find(hsk->homa->peers, source, &hsk->inet);
 	if (IS_ERR(srpc->peer)) {
 		err = PTR_ERR(srpc->peer);
+		srpc->peer = NULL;
 		goto error;
 	}
 	srpc->dport = ntohs(h->common.sport);
@@ -197,6 +201,8 @@ struct homa_rpc *homa_rpc_alloc_server(struct homa_sock *hsk,
 
 error:
 	homa_bucket_unlock(bucket, id);
+	if (srpc && srpc->peer)
+		homa_peer_put(srpc->peer);
 	kfree(srpc);
 	return ERR_PTR(err);
 }
@@ -251,17 +257,17 @@ void homa_rpc_end(struct homa_rpc *rpc)
 {
 	/* The goal for this function is to make the RPC inaccessible,
	 * so that no other code will ever access it again. However, don't
-	 * actually release resources; leave that to homa_rpc_reap, which
-	 * runs later. There are two reasons for this. First, releasing
-	 * resources may be expensive, so we don't want to keep the caller
-	 * waiting; homa_rpc_reap will run in situations where there is time
-	 * to spare. Second, there may be other code that currently has
-	 * pointers to this RPC but temporarily released the lock (e.g. to
-	 * copy data to/from user space). It isn't safe to clean up until
-	 * that code has finished its work and released any pointers to the
-	 * RPC (homa_rpc_reap will ensure that this has happened). So, this
-	 * function should only make changes needed to make the RPC
-	 * inaccessible.
+	 * actually release resources or tear down the internal structure
+	 * of the RPC; leave that to homa_rpc_reap, which runs later. There
+	 * are two reasons for this. First, releasing resources may be
+	 * expensive, so we don't want to keep the caller waiting; homa_rpc_reap
+	 * will run in situations where there is time to spare. Second, there
+	 * may be other code that currently has pointers to this RPC but
+	 * temporarily released the lock (e.g. to copy data to/from user space).
+	 * It isn't safe to clean up until that code has finished its work and
+	 * released any pointers to the RPC (homa_rpc_reap will ensure that
+	 * this has happened). So, this function should only make changes
+	 * needed to make the RPC inaccessible.
	 */
 	if (!rpc || rpc->state == RPC_DEAD)
 		return;
@@ -538,6 +544,10 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all)
 			kfree(gap);
 		}
 	}
+	if (rpc->peer) {
+		homa_peer_put(rpc->peer);
+		rpc->peer = NULL;
+	}
 	tt_record2("homa_rpc_reap finished reaping id %d, socket %d",
		   rpc->id, rpc->hsk->port);
 	rpc->state = 0;
diff --git a/homa_rpc.h b/homa_rpc.h
index cbc649e6..ca1f80b3 100644
--- a/homa_rpc.h
+++ b/homa_rpc.h
@@ -282,6 +282,7 @@ struct homa_rpc {
 	/**
	 * @peer: Information about the other machine (the server, if
	 * this is a client RPC, or the client, if this is a server RPC).
+	 * If non-NULL then we own a reference on the object.
*/ struct homa_peer *peer; diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 992e7ae4..f2e0ae6a 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -1048,6 +1048,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__cutoffs_for_unknown_client_rpc) EXPECT_EQ(400, peer->cutoff_version); EXPECT_EQ(9, peer->unsched_cutoffs[1]); EXPECT_EQ(3, peer->unsched_cutoffs[7]); + homa_peer_put(peer); } #endif /* See strip.py */ TEST_F(homa_incoming, homa_dispatch_pkts__resend_for_unknown_server_rpc) @@ -1821,6 +1822,7 @@ TEST_F(homa_incoming, homa_cutoffs__cant_find_peer) &self->hsk.inet); ASSERT_FALSE(IS_ERR(peer)); EXPECT_EQ(0, peer->cutoff_version); + homa_peer_put(peer); } #endif /* See strip.py */ @@ -1909,6 +1911,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_doesnt_exist) &self->homa); EXPECT_STREQ("xmit ACK from 0.0.0.0:32768, dport 99, id 1234, acks [sp 99, id 1236]", unit_log_get()); + homa_peer_put(peer); } TEST_F(homa_incoming, homa_ack_pkt__target_rpc_exists_no_extras) diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index c20ac626..51131304 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -98,6 +98,7 @@ FIXTURE_SETUP(homa_outgoing) } FIXTURE_TEARDOWN(homa_outgoing) { + homa_peer_put(self->peer); homa_destroy(&self->homa); unit_teardown(); } @@ -209,6 +210,31 @@ TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__cant_allocate_skb) EXPECT_TRUE(IS_ERR(skb)); EXPECT_EQ(ENOMEM, -PTR_ERR(skb)); } +TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__include_acks) +{ + struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, + &self->server_addr); + struct homa_data_hdr h; + struct sk_buff *skb; + + ASSERT_NE(NULL, crpc); + homa_rpc_unlock(crpc); + + crpc->peer->acks[0] = (struct homa_ack) { + .server_port = htons(200), + .client_id = cpu_to_be64(1000)}; + crpc->peer->num_acks = 1; + + homa_message_out_init(crpc, 500); + skb = homa_tx_data_pkt_alloc(crpc, iter, 0, 500, 2000); + ASSERT_NE(NULL, skb); + + homa_skb_get(skb, &h, 0, sizeof(h)); + EXPECT_STREQ("server_port 200, client_id 1000", + unit_ack_string(&h.ack)); + kfree_skb(skb); +} TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__multiple_segments_homa_fill_data_interleaved) { struct iov_iter *iter = unit_iov_iter((void *)1000, 5000); @@ -495,24 +521,6 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_limit_less_than_mtu) homa_rpc_unlock(crpc); EXPECT_SUBSTR("max_seg_data 1400, max_gso_data 1400;", unit_log_get()); } -TEST_F(homa_outgoing, homa_message_out_fill__include_acks) -{ - struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, - &self->server_addr); - struct homa_data_hdr h; - - ASSERT_FALSE(crpc == NULL); - crpc->peer->acks[0] = (struct homa_ack) { - .server_port = htons(200), - .client_id = cpu_to_be64(1000)}; - crpc->peer->num_acks = 1; - ASSERT_EQ(0, -homa_message_out_fill(crpc, - unit_iov_iter((void *) 1000, 500), 0)); - homa_rpc_unlock(crpc); - homa_skb_get(crpc->msgout.packets, &h, 0, sizeof(h)); - EXPECT_STREQ("server_port 200, client_id 1000", - unit_ack_string(&h.ack)); -} TEST_F(homa_outgoing, homa_message_out_fill__multiple_segs_per_skbuff) { struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index a19e0460..8dadd199 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -30,7 +30,8 @@ static void kmalloc_hook(char *id) snprintf(addr_string, sizeof(addr_string), "10.0.0.%d", i); addr = 
unit_get_in_addr(addr_string); - homa_peer_find(hook_peertab, &addr, &hook_hsk->inet); + homa_peer_put(homa_peer_find(hook_peertab, &addr, + &hook_hsk->inet)); } } #endif /* See strip.py */ @@ -96,13 +97,18 @@ TEST_F(homa_peer, homa_peer_find__basics) peer2 = homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); EXPECT_EQ(peer, peer2); + EXPECT_EQ(2, atomic_read(&peer->refs)); peer2 = homa_peer_find(&self->peertab, ip2222, &self->hsk.inet); EXPECT_NE(peer, peer2); + EXPECT_EQ(1, atomic_read(&peer2->refs)); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(2, homa_metrics_per_cpu()->peer_new_entries); #endif /* See strip.py */ + homa_peer_put(peer); + homa_peer_put(peer); + homa_peer_put(peer2); } static struct _test_data_homa_peer *test_data; @@ -120,6 +126,7 @@ static void peer_lock_hook(char *id) */ conflicting_peer = homa_peer_find(&test_data->peertab, ip3333, &test_data->hsk.inet); + homa_peer_put(conflicting_peer); } TEST_F(homa_peer, homa_peertab_init__vmalloc_failed) @@ -150,6 +157,7 @@ TEST_F(homa_peer, homa_peertab_gc_dsts) EXPECT_EQ(2, dead_count(&self->peertab)); homa_peertab_gc_dsts(&self->peertab, ~0); EXPECT_EQ(0, dead_count(&self->peertab)); + homa_peer_put(peer); } #ifndef __STRIP__ /* See strip.py */ @@ -190,6 +198,7 @@ TEST_F(homa_peer, homa_peertab_get_peers__one_peer) EXPECT_EQ(1, num_peers); EXPECT_EQ(peer, peers[0]); kfree(peers); + homa_peer_put(peer); } TEST_F(homa_peer, homa_peertab_get_peers__multiple_peers) { @@ -210,14 +219,17 @@ TEST_F(homa_peer, homa_peertab_get_peers__multiple_peers) EXPECT_TRUE((peers[0] == peer3) || (peers[1] == peer3) || (peers[2] == peer3)); kfree(peers); + homa_peer_put(peer1); + homa_peer_put(peer2); + homa_peer_put(peer3); } TEST_F(homa_peer, homa_peertab_get_peers__a_few_new_peers_created_concurrently) { struct homa_peer **peers; int num_peers = 45; - homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); - homa_peer_find(&self->peertab, ip2222, &self->hsk.inet); + homa_peer_put(homa_peer_find(&self->peertab, ip1111, &self->hsk.inet)); + homa_peer_put(homa_peer_find(&self->peertab, ip2222, &self->hsk.inet)); unit_hook_register(kmalloc_hook); hook_hsk = &self->hsk; hook_peertab = &self->peertab; @@ -232,8 +244,8 @@ TEST_F(homa_peer, homa_peertab_get_peers__many_new_peers_created_concurrently) struct homa_peer **peers; int num_peers = 45; - homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); - homa_peer_find(&self->peertab, ip2222, &self->hsk.inet); + homa_peer_put(homa_peer_find(&self->peertab, ip1111, &self->hsk.inet)); + homa_peer_put(homa_peer_find(&self->peertab, ip2222, &self->hsk.inet)); unit_hook_register(kmalloc_hook); hook_hsk = &self->hsk; hook_peertab = &self->peertab; @@ -255,6 +267,7 @@ TEST_F(homa_peer, homa_peer_find__conflicting_creates) peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); EXPECT_NE(NULL, conflicting_peer); EXPECT_EQ(conflicting_peer, peer); + homa_peer_put(peer); } TEST_F(homa_peer, homa_peer_find__kmalloc_error) { @@ -294,6 +307,7 @@ TEST_F(homa_peer, homa_dst_refresh__basics) homa_dst_refresh(self->homa.peers, peer, &self->hsk); EXPECT_NE(old_dst, peer->dst); EXPECT_EQ(1, dead_count(self->homa.peers)); + homa_peer_put(peer); } TEST_F(homa_peer, homa_dst_refresh__malloc_error) { @@ -309,6 +323,7 @@ TEST_F(homa_peer, homa_dst_refresh__malloc_error) homa_dst_refresh(self->homa.peers, peer, &self->hsk); EXPECT_EQ(old_dst, peer->dst); EXPECT_EQ(0, dead_count(self->homa.peers)); + homa_peer_put(peer); } TEST_F(homa_peer, homa_dst_refresh__routing_error) { @@ -327,6 +342,7 @@ 
TEST_F(homa_peer, homa_dst_refresh__routing_error) EXPECT_EQ(1, homa_metrics_per_cpu()->peer_route_errors); #endif /* See strip.py */ EXPECT_EQ(0, dead_count(self->homa.peers)); + homa_peer_put(peer); } TEST_F(homa_peer, homa_dst_refresh__free_old_dsts) { @@ -343,6 +359,7 @@ TEST_F(homa_peer, homa_dst_refresh__free_old_dsts) mock_clock = 500000000; homa_dst_refresh(self->homa.peers, peer, &self->hsk); EXPECT_EQ(1, dead_count(self->homa.peers)); + homa_peer_put(peer); } #ifndef __STRIP__ /* See strip.py */ @@ -376,6 +393,7 @@ TEST_F(homa_peer, homa_peer_get_dst_ipv4) dst_release(dst); EXPECT_STREQ("196.168.0.1", homa_print_ipv4_addr(peer->flow.u.ip4.daddr)); + homa_peer_put(peer); } TEST_F(homa_peer, homa_peer_get_dst_ipv6) { @@ -400,6 +418,7 @@ TEST_F(homa_peer, homa_peer_get_dst_ipv6) (addr >> 16) & 0xff, (addr >> 8) & 0xff, addr & 0xff); EXPECT_STREQ("[1::1:1:1]", homa_print_ipv6_addr(&peer->flow.u.ip6.daddr)); + homa_peer_put(peer); } #ifndef __STRIP__ /* See strip.py */ @@ -421,6 +440,7 @@ TEST_F(homa_peer, homa_peer_lock_slow) EXPECT_EQ(1, homa_metrics_per_cpu()->peer_ack_lock_misses); EXPECT_EQ(1000, homa_metrics_per_cpu()->peer_ack_lock_miss_cycles); homa_peer_unlock(peer); + homa_peer_put(peer); } #endif /* See strip.py */ @@ -506,4 +526,5 @@ TEST_F(homa_peer, homa_peer_get_acks) EXPECT_EQ(1, homa_peer_get_acks(peer, 2, acks)); EXPECT_STREQ("server_port 5000, client_id 100", unit_ack_string(&acks[0])); + homa_peer_put(peer); } diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 7ce0a695..c7a6a2e0 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -835,14 +835,16 @@ TEST_F(homa_plumbing, homa_recvmsg__add_ack) struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, self->client_id, 100, 2000); + struct homa_peer *peer; EXPECT_NE(NULL, crpc); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); crpc->completion_cookie = 44444; + peer = crpc->peer; EXPECT_EQ(2000, homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, 0, 0, &self->recvmsg_hdr.msg_namelen)); - EXPECT_EQ(1, crpc->peer->num_acks); + EXPECT_EQ(1, peer->num_acks); } TEST_F(homa_plumbing, homa_recvmsg__server_normal_completion) { diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 180b501d..362ee176 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -735,6 +735,22 @@ TEST_F(homa_rpc, homa_rpc_reap__free_gaps) homa_rpc_reap(&self->hsk, false); // Test framework will complain if memory not freed. } +TEST_F(homa_rpc, homa_rpc_reap__release_peer_ref) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + 4000, 98, 1000, 150000); + struct homa_peer *peer; + + ASSERT_NE(NULL, crpc); + peer = crpc->peer; + EXPECT_EQ(1, atomic_read(&peer->refs)); + + homa_rpc_end(crpc); + homa_rpc_reap(&self->hsk, false); + EXPECT_EQ(0, atomic_read(&peer->refs)); + EXPECT_EQ(NULL, crpc->peer); +} TEST_F(homa_rpc, homa_rpc_reap__call_homa_sock_wakeup_wmem) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, From 1996a423e7919d52d88524e6b75452d2486e8f93 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Sun, 11 May 2025 21:05:15 -0700 Subject: [PATCH 309/625] Remove homa_peertab_gc_dsts mechanism Never should have been needed in the first place. Also, change homa_get_dst to take a reference on the returned dst_entry. 
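With this change, homa_get_dst returns a dst_entry carrying a reference that the caller owns, so the caller must either hand the reference off or release it. A sketch of the two resulting patterns (editor's illustration based on the call sites below, not code from the patch):

    struct dst_entry *dst;
    unsigned int gso_size;

    /* Pattern 1: transfer ownership of the reference to an skb. */
    skb_dst_set(skb, homa_get_dst(rpc->peer, rpc->hsk));

    /* Pattern 2: use the dst briefly, then drop the reference. */
    dst = homa_get_dst(rpc->peer, rpc->hsk);
    gso_size = dst->dev->gso_max_size;
    dst_release(dst);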
--- homa_outgoing.c | 11 ++----- homa_peer.c | 45 +--------------------------- homa_peer.h | 11 ++----- test/unit_homa_peer.c | 69 ++----------------------------------------- 4 files changed, 9 insertions(+), 127 deletions(-) diff --git a/homa_outgoing.c b/homa_outgoing.c index a4059fcc..bca01104 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -288,6 +288,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) gso_size = dst->dev->gso_max_size; if (gso_size > rpc->hsk->homa->max_gso_size) gso_size = rpc->hsk->homa->max_gso_size; + dst_release(dst); #ifndef __STRIP__ /* See strip.py */ /* Round gso_size down to an even # of mtus; calculation depends @@ -452,7 +453,6 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, struct netdev_queue *txq; #endif /* See strip.py */ struct homa_common_hdr *h; - struct dst_entry *dst; struct sk_buff *skb; int extra_bytes; #ifndef __STRIP__ /* See strip.py */ @@ -460,12 +460,10 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, #endif /* See strip.py */ int result; - dst = homa_get_dst(peer, hsk); skb = homa_skb_alloc_tx(HOMA_MAX_HEADER); if (unlikely(!skb)) return -ENOBUFS; - dst_hold(dst); - skb_dst_set(skb, dst); + skb_dst_set(skb, homa_get_dst(peer, hsk)); h = skb_put(skb, length); memcpy(h, contents, length); @@ -683,7 +681,6 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority) void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc) #endif /* See strip.py */ { - struct dst_entry *dst; #ifndef __STRIP__ /* See strip.py */ int err; @@ -694,9 +691,7 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc) rpc->peer->cutoff_version; #endif /* See strip.py */ - dst = homa_get_dst(rpc->peer, rpc->hsk); - dst_hold(dst); - skb_dst_set(skb, dst); + skb_dst_set(skb, homa_get_dst(rpc->peer, rpc->hsk)); skb->ooo_okay = 1; skb->ip_summed = CHECKSUM_PARTIAL; diff --git a/homa_peer.c b/homa_peer.c index fa08684e..3f7cf097 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -23,7 +23,6 @@ int homa_peertab_init(struct homa_peertab *peertab) int i; spin_lock_init(&peertab->write_lock); - INIT_LIST_HEAD(&peertab->dead_dsts); peertab->buckets = vmalloc(HOMA_PEERTAB_BUCKETS * sizeof(*peertab->buckets)); if (!peertab->buckets) @@ -69,7 +68,6 @@ void homa_peertab_destroy(struct homa_peertab *peertab) } } vfree(peertab->buckets); - homa_peertab_gc_dsts(peertab, ~0); spin_unlock_bh(&peertab->write_lock); } @@ -139,29 +137,6 @@ struct homa_peer **homa_peertab_get_peers(struct homa_peertab *peertab, } #endif /* See strip.py */ -/** - * homa_peertab_gc_dsts() - Invoked to free unused dst_entries, if it is - * safe to do so. - * @peertab: The table in which to free entries. - * @now: Current time, in homa_clock() units; entries with expiration - * dates no later than this will be freed. Specify ~0 to - * free all entries. - */ -void homa_peertab_gc_dsts(struct homa_peertab *peertab, u64 now) - __must_hold(&peer_tab->write_lock) -{ - while (!list_empty(&peertab->dead_dsts)) { - struct homa_dead_dst *dead = - list_first_entry(&peertab->dead_dsts, - struct homa_dead_dst, dst_links); - if (dead->gc_time > now) - break; - dst_release(dead->dst); - list_del(&dead->dst_links); - kfree(dead); - } -} - /** * homa_peer_find() - Returns the peer associated with a given host; creates * a new homa_peer if one doesn't already exist. 
@@ -267,17 +242,7 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab, void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, struct homa_sock *hsk) { - struct homa_dead_dst *save_dead; struct dst_entry *dst; - u64 now; - - /* Need to keep around the current entry for a while in case - * someone is using it. If we can't do that, then don't update - * the entry. - */ - save_dead = kmalloc(sizeof(*save_dead), GFP_ATOMIC); - if (unlikely(!save_dead)) - return; dst = homa_peer_get_dst(peer, &hsk->inet); if (IS_ERR(dst)) { @@ -288,18 +253,10 @@ void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, __func__, PTR_ERR(dst)); INC_METRIC(peer_route_errors, 1); #endif /* See strip.py */ - kfree(save_dead); return; } - - spin_lock_bh(&peertab->write_lock); - now = homa_clock(); - save_dead->dst = peer->dst; - save_dead->gc_time = now + (homa_clock_khz() << 7); /* ~128 ms */ - list_add_tail(&save_dead->dst_links, &peertab->dead_dsts); - homa_peertab_gc_dsts(peertab, now); + dst_release(peer->dst); peer->dst = dst; - spin_unlock_bh(&peertab->write_lock); } #ifndef __STRIP__ /* See strip.py */ diff --git a/homa_peer.h b/homa_peer.h index 980355d4..b3d244d1 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -56,12 +56,6 @@ struct homa_peertab { */ spinlock_t write_lock; - /** - * @dead_dsts: List of dst_entries that are waiting to be deleted. - * Hold @write_lock when manipulating. - */ - struct list_head dead_dsts; - /** * @buckets: Pointer to heads of chains of homa_peers for each bucket. * Malloc-ed, and must eventually be freed. NULL means this structure @@ -232,7 +226,6 @@ void homa_peer_lock_slow(struct homa_peer *peer); void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7); #endif /* See strip.py */ -void homa_peertab_gc_dsts(struct homa_peertab *peertab, u64 now); #ifndef __STRIP__ /* See strip.py */ /** @@ -273,13 +266,15 @@ static inline void homa_peer_unlock(struct homa_peer *peer) * updating it if the cached information is stale. * @peer: Peer whose destination information is desired. * @hsk: Homa socket; needed by lower-level code to recreate the dst. - * Return: Up-to-date destination for peer. + * Return: Up-to-date destination for peer; a reference has been taken + * on this dst_entry, which the caller must eventually release. 
*/ static inline struct dst_entry *homa_get_dst(struct homa_peer *peer, struct homa_sock *hsk) { if (unlikely(peer->dst->obsolete > 0)) homa_dst_refresh(hsk->homa->peers, peer, hsk); + dst_hold(peer->dst); return peer->dst; } diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index 8dadd199..267cce29 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -64,16 +64,6 @@ FIXTURE_TEARDOWN(homa_peer) unit_teardown(); } -static int dead_count(struct homa_peertab *peertab) -{ - struct list_head *pos; - int count = 0; - - list_for_each(pos, &peertab->dead_dsts) - count++; - return count; -} - #ifndef __STRIP__ /* See strip.py */ static void peer_spinlock_hook(char *id) { @@ -140,26 +130,6 @@ TEST_F(homa_peer, homa_peertab_init__vmalloc_failed) homa_peertab_destroy(&table); } -TEST_F(homa_peer, homa_peertab_gc_dsts) -{ - struct homa_peer *peer; - - peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); - mock_clock = 0; - homa_dst_refresh(&self->peertab, peer, &self->hsk); - mock_clock = 50000000; - homa_dst_refresh(&self->peertab, peer, &self->hsk); - mock_clock = 90000000; - homa_dst_refresh(&self->peertab, peer, &self->hsk); - EXPECT_EQ(3, dead_count(&self->peertab)); - - homa_peertab_gc_dsts(&self->peertab, 130000000); - EXPECT_EQ(2, dead_count(&self->peertab)); - homa_peertab_gc_dsts(&self->peertab, ~0); - EXPECT_EQ(0, dead_count(&self->peertab)); - homa_peer_put(peer); -} - #ifndef __STRIP__ /* See strip.py */ TEST_F(homa_peer, homa_peertab_get_peers__not_init) { @@ -303,26 +273,9 @@ TEST_F(homa_peer, homa_dst_refresh__basics) ASSERT_NE(NULL, peer); EXPECT_EQ_IP(*ip1111, peer->addr); - old_dst = homa_get_dst(peer, &self->hsk); + old_dst = peer->dst; homa_dst_refresh(self->homa.peers, peer, &self->hsk); EXPECT_NE(old_dst, peer->dst); - EXPECT_EQ(1, dead_count(self->homa.peers)); - homa_peer_put(peer); -} -TEST_F(homa_peer, homa_dst_refresh__malloc_error) -{ - struct dst_entry *old_dst; - struct homa_peer *peer; - - peer = homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); - ASSERT_NE(NULL, peer); - EXPECT_EQ_IP(*ip1111, peer->addr); - - old_dst = homa_get_dst(peer, &self->hsk); - mock_kmalloc_errors = 1; - homa_dst_refresh(self->homa.peers, peer, &self->hsk); - EXPECT_EQ(old_dst, peer->dst); - EXPECT_EQ(0, dead_count(self->homa.peers)); homa_peer_put(peer); } TEST_F(homa_peer, homa_dst_refresh__routing_error) @@ -334,31 +287,13 @@ TEST_F(homa_peer, homa_dst_refresh__routing_error) ASSERT_NE(NULL, peer); EXPECT_EQ_IP(*ip1111, peer->addr); - old_dst = homa_get_dst(peer, &self->hsk); + old_dst = peer->dst; mock_route_errors = 1; homa_dst_refresh(self->homa.peers, peer, &self->hsk); EXPECT_EQ(old_dst, peer->dst); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->peer_route_errors); #endif /* See strip.py */ - EXPECT_EQ(0, dead_count(self->homa.peers)); - homa_peer_put(peer); -} -TEST_F(homa_peer, homa_dst_refresh__free_old_dsts) -{ - struct homa_peer *peer; - - peer = homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); - ASSERT_NE(NULL, peer); - EXPECT_EQ_IP(*ip1111, peer->addr); - - mock_clock = 0; - homa_dst_refresh(self->homa.peers, peer, &self->hsk); - homa_dst_refresh(self->homa.peers, peer, &self->hsk); - EXPECT_EQ(2, dead_count(self->homa.peers)); - mock_clock = 500000000; - homa_dst_refresh(self->homa.peers, peer, &self->hsk); - EXPECT_EQ(1, dead_count(self->homa.peers)); homa_peer_put(peer); } From 9c14cdcc7b97ab1e94c1f271553f8424fc262bf6 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 12 May 2025 08:58:10 -0700 
Subject: [PATCH 310/625] Add rhashtable.c and rhashtable.h from Linux kernel

Needed for unit tests.
---
 test/rhashtable.c | 1255 +++++++++++++++++++++++++++++++++++++++++++
 test/rhashtable.h | 1286 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 2541 insertions(+)
 create mode 100644 test/rhashtable.c
 create mode 100644 test/rhashtable.h

diff --git a/test/rhashtable.c b/test/rhashtable.c
new file mode 100644
index 00000000..3e555d01
--- /dev/null
+++ b/test/rhashtable.c
@@ -0,0 +1,1255 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Resizable, Scalable, Concurrent Hash Table
+ *
+ * Copyright (c) 2015 Herbert Xu
+ * Copyright (c) 2014-2015 Thomas Graf
+ * Copyright (c) 2008-2014 Patrick McHardy
+ *
+ * Code partially derived from nft_hash
+ * Rewritten with rehash code from br_multicast plus single list
+ * pointer as suggested by Josh Triplett
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define HASH_DEFAULT_SIZE	64UL
+#define HASH_MIN_SIZE		4U
+
+union nested_table {
+	union nested_table __rcu *table;
+	struct rhash_lock_head __rcu *bucket;
+};
+
+static u32 head_hashfn(struct rhashtable *ht,
+		       const struct bucket_table *tbl,
+		       const struct rhash_head *he)
+{
+	return rht_head_hashfn(ht, tbl, he, ht->p);
+}
+
+#ifdef CONFIG_PROVE_LOCKING
+#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
+
+int lockdep_rht_mutex_is_held(struct rhashtable *ht)
+{
+	return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
+}
+EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
+
+int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
+{
+	if (!debug_locks)
+		return 1;
+	if (unlikely(tbl->nest))
+		return 1;
+	return bit_spin_is_locked(0, (unsigned long *)&tbl->buckets[hash]);
+}
+EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
+#else
+#define ASSERT_RHT_MUTEX(HT)
+#endif
+
+static inline union nested_table *nested_table_top(
+	const struct bucket_table *tbl)
+{
+	/* The top-level bucket entry does not need RCU protection
+	 * because it's set at the same time as tbl->nest.
+ */ + return (void *)rcu_dereference_protected(tbl->buckets[0], 1); +} + +static void nested_table_free(union nested_table *ntbl, unsigned int size) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + const unsigned int len = 1 << shift; + unsigned int i; + + ntbl = rcu_dereference_protected(ntbl->table, 1); + if (!ntbl) + return; + + if (size > len) { + size >>= shift; + for (i = 0; i < len; i++) + nested_table_free(ntbl + i, size); + } + + kfree(ntbl); +} + +static void nested_bucket_table_free(const struct bucket_table *tbl) +{ + unsigned int size = tbl->size >> tbl->nest; + unsigned int len = 1 << tbl->nest; + union nested_table *ntbl; + unsigned int i; + + ntbl = nested_table_top(tbl); + + for (i = 0; i < len; i++) + nested_table_free(ntbl + i, size); + + kfree(ntbl); +} + +static void bucket_table_free(const struct bucket_table *tbl) +{ + if (tbl->nest) + nested_bucket_table_free(tbl); + + kvfree(tbl); +} + +static void bucket_table_free_rcu(struct rcu_head *head) +{ + bucket_table_free(container_of(head, struct bucket_table, rcu)); +} + +static union nested_table *nested_table_alloc(struct rhashtable *ht, + union nested_table __rcu **prev, + bool leaf) +{ + union nested_table *ntbl; + int i; + + ntbl = rcu_dereference(*prev); + if (ntbl) + return ntbl; + + ntbl = alloc_hooks_tag(ht->alloc_tag, + kmalloc_noprof(PAGE_SIZE, GFP_ATOMIC|__GFP_ZERO)); + + if (ntbl && leaf) { + for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++) + INIT_RHT_NULLS_HEAD(ntbl[i].bucket); + } + + if (cmpxchg((union nested_table **)prev, NULL, ntbl) == NULL) + return ntbl; + /* Raced with another thread. */ + kfree(ntbl); + return rcu_dereference(*prev); +} + +static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht, + size_t nbuckets, + gfp_t gfp) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + struct bucket_table *tbl; + size_t size; + + if (nbuckets < (1 << (shift + 1))) + return NULL; + + size = sizeof(*tbl) + sizeof(tbl->buckets[0]); + + tbl = alloc_hooks_tag(ht->alloc_tag, + kmalloc_noprof(size, gfp|__GFP_ZERO)); + if (!tbl) + return NULL; + + if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets, + false)) { + kfree(tbl); + return NULL; + } + + tbl->nest = (ilog2(nbuckets) - 1) % shift + 1; + + return tbl; +} + +static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, + size_t nbuckets, + gfp_t gfp) +{ + struct bucket_table *tbl = NULL; + size_t size; + int i; + static struct lock_class_key __key; + + tbl = alloc_hooks_tag(ht->alloc_tag, + kvmalloc_node_noprof(struct_size(tbl, buckets, nbuckets), + gfp|__GFP_ZERO, NUMA_NO_NODE)); + + size = nbuckets; + + if (tbl == NULL && !gfpflags_allow_blocking(gfp)) { + tbl = nested_bucket_table_alloc(ht, nbuckets, gfp); + nbuckets = 0; + } + + if (tbl == NULL) + return NULL; + + lockdep_init_map(&tbl->dep_map, "rhashtable_bucket", &__key, 0); + + tbl->size = size; + + rcu_head_init(&tbl->rcu); + INIT_LIST_HEAD(&tbl->walkers); + + tbl->hash_rnd = get_random_u32(); + + for (i = 0; i < nbuckets; i++) + INIT_RHT_NULLS_HEAD(tbl->buckets[i]); + + return tbl; +} + +static struct bucket_table *rhashtable_last_table(struct rhashtable *ht, + struct bucket_table *tbl) +{ + struct bucket_table *new_tbl; + + do { + new_tbl = tbl; + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + } while (tbl); + + return new_tbl; +} + +static int rhashtable_rehash_one(struct rhashtable *ht, + struct rhash_lock_head __rcu **bkt, + unsigned int old_hash) +{ + struct bucket_table *old_tbl = 
rht_dereference(ht->tbl, ht);
+	struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl);
+	int err = -EAGAIN;
+	struct rhash_head *head, *next, *entry;
+	struct rhash_head __rcu **pprev = NULL;
+	unsigned int new_hash;
+	unsigned long flags;
+
+	if (new_tbl->nest)
+		goto out;
+
+	err = -ENOENT;
+
+	rht_for_each_from(entry, rht_ptr(bkt, old_tbl, old_hash),
+			  old_tbl, old_hash) {
+		err = 0;
+		next = rht_dereference_bucket(entry->next, old_tbl, old_hash);
+
+		if (rht_is_a_nulls(next))
+			break;
+
+		pprev = &entry->next;
+	}
+
+	if (err)
+		goto out;
+
+	new_hash = head_hashfn(ht, new_tbl, entry);
+
+	flags = rht_lock_nested(new_tbl, &new_tbl->buckets[new_hash],
+				SINGLE_DEPTH_NESTING);
+
+	head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash);
+
+	RCU_INIT_POINTER(entry->next, head);
+
+	rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry, flags);
+
+	if (pprev)
+		rcu_assign_pointer(*pprev, next);
+	else
+		/* Need to preserve the bit lock. */
+		rht_assign_locked(bkt, next);
+
+out:
+	return err;
+}
+
+static int rhashtable_rehash_chain(struct rhashtable *ht,
+				   unsigned int old_hash)
+{
+	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
+	struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash);
+	unsigned long flags;
+	int err;
+
+	if (!bkt)
+		return 0;
+	flags = rht_lock(old_tbl, bkt);
+
+	while (!(err = rhashtable_rehash_one(ht, bkt, old_hash)))
+		;
+
+	if (err == -ENOENT)
+		err = 0;
+	rht_unlock(old_tbl, bkt, flags);
+
+	return err;
+}
+
+static int rhashtable_rehash_attach(struct rhashtable *ht,
+				    struct bucket_table *old_tbl,
+				    struct bucket_table *new_tbl)
+{
+	/* Make insertions go into the new, empty table right away. Deletions
+	 * and lookups will be attempted in both tables until we synchronize.
+	 * As cmpxchg() provides strong barriers, we do not need
+	 * rcu_assign_pointer().
+	 */
+
+	if (cmpxchg((struct bucket_table **)&old_tbl->future_tbl, NULL,
+		    new_tbl) != NULL)
+		return -EEXIST;
+
+	return 0;
+}
+
+static int rhashtable_rehash_table(struct rhashtable *ht)
+{
+	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
+	struct bucket_table *new_tbl;
+	struct rhashtable_walker *walker;
+	unsigned int old_hash;
+	int err;
+
+	new_tbl = rht_dereference(old_tbl->future_tbl, ht);
+	if (!new_tbl)
+		return 0;
+
+	for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
+		err = rhashtable_rehash_chain(ht, old_hash);
+		if (err)
+			return err;
+		cond_resched();
+	}
+
+	/* Publish the new table pointer. */
+	rcu_assign_pointer(ht->tbl, new_tbl);
+
+	spin_lock(&ht->lock);
+	list_for_each_entry(walker, &old_tbl->walkers, list)
+		walker->tbl = NULL;
+
+	/* Wait for readers. All new readers will see the new
+	 * table, and thus no references to the old table will
+	 * remain.
+	 * We do this inside the locked region so that
+	 * rhashtable_walk_stop() can use rcu_head_after_call_rcu()
+	 * to check if it should not re-link the table.
+	 */
+	call_rcu(&old_tbl->rcu, bucket_table_free_rcu);
+	spin_unlock(&ht->lock);
+
+	return rht_dereference(new_tbl->future_tbl, ht) ?
-EAGAIN : 0; +} + +static int rhashtable_rehash_alloc(struct rhashtable *ht, + struct bucket_table *old_tbl, + unsigned int size) +{ + struct bucket_table *new_tbl; + int err; + + ASSERT_RHT_MUTEX(ht); + + new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); + if (new_tbl == NULL) + return -ENOMEM; + + err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); + if (err) + bucket_table_free(new_tbl); + + return err; +} + +/** + * rhashtable_shrink - Shrink hash table while allowing concurrent lookups + * @ht: the hash table to shrink + * + * This function shrinks the hash table to fit, i.e., the smallest + * size would not cause it to expand right away automatically. + * + * The caller must ensure that no concurrent resizing occurs by holding + * ht->mutex. + * + * The caller must ensure that no concurrent table mutations take place. + * It is however valid to have concurrent lookups if they are RCU protected. + * + * It is valid to have concurrent insertions and deletions protected by per + * bucket locks or concurrent RCU protected lookups and traversals. + */ +static int rhashtable_shrink(struct rhashtable *ht) +{ + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + unsigned int nelems = atomic_read(&ht->nelems); + unsigned int size = 0; + + if (nelems) + size = roundup_pow_of_two(nelems * 3 / 2); + if (size < ht->p.min_size) + size = ht->p.min_size; + + if (old_tbl->size <= size) + return 0; + + if (rht_dereference(old_tbl->future_tbl, ht)) + return -EEXIST; + + return rhashtable_rehash_alloc(ht, old_tbl, size); +} + +static void rht_deferred_worker(struct work_struct *work) +{ + struct rhashtable *ht; + struct bucket_table *tbl; + int err = 0; + + ht = container_of(work, struct rhashtable, run_work); + mutex_lock(&ht->mutex); + + tbl = rht_dereference(ht->tbl, ht); + tbl = rhashtable_last_table(ht, tbl); + + if (rht_grow_above_75(ht, tbl)) + err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2); + else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl)) + err = rhashtable_shrink(ht); + else if (tbl->nest) + err = rhashtable_rehash_alloc(ht, tbl, tbl->size); + + if (!err || err == -EEXIST) { + int nerr; + + nerr = rhashtable_rehash_table(ht); + err = err ?: nerr; + } + + mutex_unlock(&ht->mutex); + + if (err) + schedule_work(&ht->run_work); +} + +static int rhashtable_insert_rehash(struct rhashtable *ht, + struct bucket_table *tbl) +{ + struct bucket_table *old_tbl; + struct bucket_table *new_tbl; + unsigned int size; + int err; + + old_tbl = rht_dereference_rcu(ht->tbl, ht); + + size = tbl->size; + + err = -EBUSY; + + if (rht_grow_above_75(ht, tbl)) + size *= 2; + /* Do not schedule more than one rehash */ + else if (old_tbl != tbl) + goto fail; + + err = -ENOMEM; + + new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN); + if (new_tbl == NULL) + goto fail; + + err = rhashtable_rehash_attach(ht, tbl, new_tbl); + if (err) { + bucket_table_free(new_tbl); + if (err == -EEXIST) + err = 0; + } else + schedule_work(&ht->run_work); + + return err; + +fail: + /* Do not fail the insert if someone else did a rehash. */ + if (likely(rcu_access_pointer(tbl->future_tbl))) + return 0; + + /* Schedule async rehash to retry allocation in process context. 
*/ + if (err == -ENOMEM) + schedule_work(&ht->run_work); + + return err; +} + +static void *rhashtable_lookup_one(struct rhashtable *ht, + struct rhash_lock_head __rcu **bkt, + struct bucket_table *tbl, unsigned int hash, + const void *key, struct rhash_head *obj) +{ + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + struct rhash_head __rcu **pprev = NULL; + struct rhash_head *head; + int elasticity; + + elasticity = RHT_ELASTICITY; + rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) { + struct rhlist_head *list; + struct rhlist_head *plist; + + elasticity--; + if (!key || + (ht->p.obj_cmpfn ? + ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) : + rhashtable_compare(&arg, rht_obj(ht, head)))) { + pprev = &head->next; + continue; + } + + if (!ht->rhlist) + return rht_obj(ht, head); + + list = container_of(obj, struct rhlist_head, rhead); + plist = container_of(head, struct rhlist_head, rhead); + + RCU_INIT_POINTER(list->next, plist); + head = rht_dereference_bucket(head->next, tbl, hash); + RCU_INIT_POINTER(list->rhead.next, head); + if (pprev) + rcu_assign_pointer(*pprev, obj); + else + /* Need to preserve the bit lock */ + rht_assign_locked(bkt, obj); + + return NULL; + } + + if (elasticity <= 0) + return ERR_PTR(-EAGAIN); + + return ERR_PTR(-ENOENT); +} + +static struct bucket_table *rhashtable_insert_one( + struct rhashtable *ht, struct rhash_lock_head __rcu **bkt, + struct bucket_table *tbl, unsigned int hash, struct rhash_head *obj, + void *data) +{ + struct bucket_table *new_tbl; + struct rhash_head *head; + + if (!IS_ERR_OR_NULL(data)) + return ERR_PTR(-EEXIST); + + if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); + + new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (new_tbl) + return new_tbl; + + if (PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); + + if (unlikely(rht_grow_above_max(ht, tbl))) + return ERR_PTR(-E2BIG); + + if (unlikely(rht_grow_above_100(ht, tbl))) + return ERR_PTR(-EAGAIN); + + head = rht_ptr(bkt, tbl, hash); + + RCU_INIT_POINTER(obj->next, head); + if (ht->rhlist) { + struct rhlist_head *list; + + list = container_of(obj, struct rhlist_head, rhead); + RCU_INIT_POINTER(list->next, NULL); + } + + /* bkt is always the head of the list, so it holds + * the lock, which we need to preserve + */ + rht_assign_locked(bkt, obj); + + return NULL; +} + +static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + struct bucket_table *new_tbl; + struct bucket_table *tbl; + struct rhash_lock_head __rcu **bkt; + unsigned long flags; + unsigned int hash; + void *data; + + new_tbl = rcu_dereference(ht->tbl); + + do { + tbl = new_tbl; + hash = rht_head_hashfn(ht, tbl, obj, ht->p); + if (rcu_access_pointer(tbl->future_tbl)) + /* Failure is OK */ + bkt = rht_bucket_var(tbl, hash); + else + bkt = rht_bucket_insert(ht, tbl, hash); + if (bkt == NULL) { + new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); + data = ERR_PTR(-EAGAIN); + } else { + bool inserted; + + flags = rht_lock(tbl, bkt); + data = rhashtable_lookup_one(ht, bkt, tbl, + hash, key, obj); + new_tbl = rhashtable_insert_one(ht, bkt, tbl, + hash, obj, data); + inserted = data && !new_tbl; + if (inserted) + atomic_inc(&ht->nelems); + if (PTR_ERR(new_tbl) != -EEXIST) + data = ERR_CAST(new_tbl); + + rht_unlock(tbl, bkt, flags); + + if (inserted && rht_grow_above_75(ht, tbl)) + schedule_work(&ht->run_work); + } + } while (!IS_ERR_OR_NULL(new_tbl)); + + if (PTR_ERR(data) == -EAGAIN) + data = 
ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?: + -EAGAIN); + + return data; +} + +void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + void *data; + + do { + rcu_read_lock(); + data = rhashtable_try_insert(ht, key, obj); + rcu_read_unlock(); + } while (PTR_ERR(data) == -EAGAIN); + + return data; +} +EXPORT_SYMBOL_GPL(rhashtable_insert_slow); + +/** + * rhashtable_walk_enter - Initialise an iterator + * @ht: Table to walk over + * @iter: Hash table Iterator + * + * This function prepares a hash table walk. + * + * Note that if you restart a walk after rhashtable_walk_stop you + * may see the same object twice. Also, you may miss objects if + * there are removals in between rhashtable_walk_stop and the next + * call to rhashtable_walk_start. + * + * For a completely stable walk you should construct your own data + * structure outside the hash table. + * + * This function may be called from any process context, including + * non-preemptible context, but cannot be called from softirq or + * hardirq context. + * + * You must call rhashtable_walk_exit after this function returns. + */ +void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter) +{ + iter->ht = ht; + iter->p = NULL; + iter->slot = 0; + iter->skip = 0; + iter->end_of_table = 0; + + spin_lock(&ht->lock); + iter->walker.tbl = + rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock)); + list_add(&iter->walker.list, &iter->walker.tbl->walkers); + spin_unlock(&ht->lock); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_enter); + +/** + * rhashtable_walk_exit - Free an iterator + * @iter: Hash table Iterator + * + * This function frees resources allocated by rhashtable_walk_enter. + */ +void rhashtable_walk_exit(struct rhashtable_iter *iter) +{ + spin_lock(&iter->ht->lock); + if (iter->walker.tbl) + list_del(&iter->walker.list); + spin_unlock(&iter->ht->lock); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_exit); + +/** + * rhashtable_walk_start_check - Start a hash table walk + * @iter: Hash table iterator + * + * Start a hash table walk at the current iterator position. Note that we take + * the RCU lock in all cases including when we return an error. So you must + * always call rhashtable_walk_stop to clean up. + * + * Returns zero if successful. + * + * Returns -EAGAIN if resize event occurred. Note that the iterator + * will rewind back to the beginning and you may use it immediately + * by calling rhashtable_walk_next. + * + * rhashtable_walk_start is defined as an inline variant that returns + * void. This is preferred in cases where the caller would ignore + * resize events and always continue. 
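+ *
+ * A minimal walk loop, sketched here purely for illustration ("obj" and
+ * "iter" are placeholders for the caller's own variables; error handling
+ * is elided):
+ *
+ *	rhashtable_walk_enter(&ht, &iter);
+ *	rhashtable_walk_start(&iter);
+ *	while ((obj = rhashtable_walk_next(&iter)) != NULL) {
+ *		if (IS_ERR(obj))
+ *			continue;	(-EAGAIN: the iterator has rewound)
+ *		(examine obj here)
+ *	}
+ *	rhashtable_walk_stop(&iter);
+ *	rhashtable_walk_exit(&iter);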
+ */ +int rhashtable_walk_start_check(struct rhashtable_iter *iter) + __acquires(RCU) +{ + struct rhashtable *ht = iter->ht; + bool rhlist = ht->rhlist; + + rcu_read_lock(); + + spin_lock(&ht->lock); + if (iter->walker.tbl) + list_del(&iter->walker.list); + spin_unlock(&ht->lock); + + if (iter->end_of_table) + return 0; + if (!iter->walker.tbl) { + iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht); + iter->slot = 0; + iter->skip = 0; + return -EAGAIN; + } + + if (iter->p && !rhlist) { + /* + * We need to validate that 'p' is still in the table, and + * if so, update 'skip' + */ + struct rhash_head *p; + int skip = 0; + rht_for_each_rcu(p, iter->walker.tbl, iter->slot) { + skip++; + if (p == iter->p) { + iter->skip = skip; + goto found; + } + } + iter->p = NULL; + } else if (iter->p && rhlist) { + /* Need to validate that 'list' is still in the table, and + * if so, update 'skip' and 'p'. + */ + struct rhash_head *p; + struct rhlist_head *list; + int skip = 0; + rht_for_each_rcu(p, iter->walker.tbl, iter->slot) { + for (list = container_of(p, struct rhlist_head, rhead); + list; + list = rcu_dereference(list->next)) { + skip++; + if (list == iter->list) { + iter->p = p; + iter->skip = skip; + goto found; + } + } + } + iter->p = NULL; + } +found: + return 0; +} +EXPORT_SYMBOL_GPL(rhashtable_walk_start_check); + +/** + * __rhashtable_walk_find_next - Find the next element in a table (or the first + * one in case of a new walk). + * + * @iter: Hash table iterator + * + * Returns the found object or NULL when the end of the table is reached. + * + * Returns -EAGAIN if resize event occurred. + */ +static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter) +{ + struct bucket_table *tbl = iter->walker.tbl; + struct rhlist_head *list = iter->list; + struct rhashtable *ht = iter->ht; + struct rhash_head *p = iter->p; + bool rhlist = ht->rhlist; + + if (!tbl) + return NULL; + + for (; iter->slot < tbl->size; iter->slot++) { + int skip = iter->skip; + + rht_for_each_rcu(p, tbl, iter->slot) { + if (rhlist) { + list = container_of(p, struct rhlist_head, + rhead); + do { + if (!skip) + goto next; + skip--; + list = rcu_dereference(list->next); + } while (list); + + continue; + } + if (!skip) + break; + skip--; + } + +next: + if (!rht_is_a_nulls(p)) { + iter->skip++; + iter->p = p; + iter->list = list; + return rht_obj(ht, rhlist ? &list->rhead : p); + } + + iter->skip = 0; + } + + iter->p = NULL; + + /* Ensure we see any new tables. */ + smp_rmb(); + + iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (iter->walker.tbl) { + iter->slot = 0; + iter->skip = 0; + return ERR_PTR(-EAGAIN); + } else { + iter->end_of_table = true; + } + + return NULL; +} + +/** + * rhashtable_walk_next - Return the next object and advance the iterator + * @iter: Hash table iterator + * + * Note that you must call rhashtable_walk_stop when you are finished + * with the walk. + * + * Returns the next object or NULL when the end of the table is reached. + * + * Returns -EAGAIN if resize event occurred. Note that the iterator + * will rewind back to the beginning and you may continue to use it. 
+ */ +void *rhashtable_walk_next(struct rhashtable_iter *iter) +{ + struct rhlist_head *list = iter->list; + struct rhashtable *ht = iter->ht; + struct rhash_head *p = iter->p; + bool rhlist = ht->rhlist; + + if (p) { + if (!rhlist || !(list = rcu_dereference(list->next))) { + p = rcu_dereference(p->next); + list = container_of(p, struct rhlist_head, rhead); + } + if (!rht_is_a_nulls(p)) { + iter->skip++; + iter->p = p; + iter->list = list; + return rht_obj(ht, rhlist ? &list->rhead : p); + } + + /* At the end of this slot, switch to next one and then find + * next entry from that point. + */ + iter->skip = 0; + iter->slot++; + } + + return __rhashtable_walk_find_next(iter); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_next); + +/** + * rhashtable_walk_peek - Return the next object but don't advance the iterator + * @iter: Hash table iterator + * + * Returns the next object or NULL when the end of the table is reached. + * + * Returns -EAGAIN if resize event occurred. Note that the iterator + * will rewind back to the beginning and you may continue to use it. + */ +void *rhashtable_walk_peek(struct rhashtable_iter *iter) +{ + struct rhlist_head *list = iter->list; + struct rhashtable *ht = iter->ht; + struct rhash_head *p = iter->p; + + if (p) + return rht_obj(ht, ht->rhlist ? &list->rhead : p); + + /* No object found in current iter, find next one in the table. */ + + if (iter->skip) { + /* A nonzero skip value points to the next entry in the table + * beyond that last one that was found. Decrement skip so + * we find the current value. __rhashtable_walk_find_next + * will restore the original value of skip assuming that + * the table hasn't changed. + */ + iter->skip--; + } + + return __rhashtable_walk_find_next(iter); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_peek); + +/** + * rhashtable_walk_stop - Finish a hash table walk + * @iter: Hash table iterator + * + * Finish a hash table walk. Does not reset the iterator to the start of the + * hash table. + */ +void rhashtable_walk_stop(struct rhashtable_iter *iter) + __releases(RCU) +{ + struct rhashtable *ht; + struct bucket_table *tbl = iter->walker.tbl; + + if (!tbl) + goto out; + + ht = iter->ht; + + spin_lock(&ht->lock); + if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu)) + /* This bucket table is being freed, don't re-link it. */ + iter->walker.tbl = NULL; + else + list_add(&iter->walker.list, &tbl->walkers); + spin_unlock(&ht->lock); + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_stop); + +static size_t rounded_hashtable_size(const struct rhashtable_params *params) +{ + size_t retsize; + + if (params->nelem_hint) + retsize = max(roundup_pow_of_two(params->nelem_hint * 4 / 3), + (unsigned long)params->min_size); + else + retsize = max(HASH_DEFAULT_SIZE, + (unsigned long)params->min_size); + + return retsize; +} + +static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) +{ + return jhash2(key, length, seed); +} + +/** + * rhashtable_init - initialize a new hash table + * @ht: hash table to be initialized + * @params: configuration parameters + * + * Initializes a new hash table based on the provided configuration + * parameters. 
A table can be configured either with a variable or + * fixed length key: + * + * Configuration Example 1: Fixed length keys + * struct test_obj { + * int key; + * void * my_member; + * struct rhash_head node; + * }; + * + * struct rhashtable_params params = { + * .head_offset = offsetof(struct test_obj, node), + * .key_offset = offsetof(struct test_obj, key), + * .key_len = sizeof(int), + * .hashfn = jhash, + * }; + * + * Configuration Example 2: Variable length keys + * struct test_obj { + * [...] + * struct rhash_head node; + * }; + * + * u32 my_hash_fn(const void *data, u32 len, u32 seed) + * { + * struct test_obj *obj = data; + * + * return [... hash ...]; + * } + * + * struct rhashtable_params params = { + * .head_offset = offsetof(struct test_obj, node), + * .hashfn = jhash, + * .obj_hashfn = my_hash_fn, + * }; + */ +int rhashtable_init_noprof(struct rhashtable *ht, + const struct rhashtable_params *params) +{ + struct bucket_table *tbl; + size_t size; + + if ((!params->key_len && !params->obj_hashfn) || + (params->obj_hashfn && !params->obj_cmpfn)) + return -EINVAL; + + memset(ht, 0, sizeof(*ht)); + mutex_init(&ht->mutex); + spin_lock_init(&ht->lock); + memcpy(&ht->p, params, sizeof(*params)); + + alloc_tag_record(ht->alloc_tag); + + if (params->min_size) + ht->p.min_size = roundup_pow_of_two(params->min_size); + + /* Cap total entries at 2^31 to avoid nelems overflow. */ + ht->max_elems = 1u << 31; + + if (params->max_size) { + ht->p.max_size = rounddown_pow_of_two(params->max_size); + if (ht->p.max_size < ht->max_elems / 2) + ht->max_elems = ht->p.max_size * 2; + } + + ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); + + size = rounded_hashtable_size(&ht->p); + + ht->key_len = ht->p.key_len; + if (!params->hashfn) { + ht->p.hashfn = jhash; + + if (!(ht->key_len & (sizeof(u32) - 1))) { + ht->key_len /= sizeof(u32); + ht->p.hashfn = rhashtable_jhash2; + } + } + + /* + * This is api initialization and thus we need to guarantee the + * initial rhashtable allocation. Upon failure, retry with the + * smallest possible size with __GFP_NOFAIL semantics. + */ + tbl = bucket_table_alloc(ht, size, GFP_KERNEL); + if (unlikely(tbl == NULL)) { + size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); + tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL); + } + + atomic_set(&ht->nelems, 0); + + RCU_INIT_POINTER(ht->tbl, tbl); + + INIT_WORK(&ht->run_work, rht_deferred_worker); + + return 0; +} +EXPORT_SYMBOL_GPL(rhashtable_init_noprof); + +/** + * rhltable_init - initialize a new hash list table + * @hlt: hash list table to be initialized + * @params: configuration parameters + * + * Initializes a new hash list table. + * + * See documentation for rhashtable_init. 
+ */
+int rhltable_init_noprof(struct rhltable *hlt, const struct rhashtable_params *params)
+{
+	int err;
+
+	err = rhashtable_init_noprof(&hlt->ht, params);
+	hlt->ht.rhlist = true;
+	return err;
+}
+EXPORT_SYMBOL_GPL(rhltable_init_noprof);
+
+static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj,
+				void (*free_fn)(void *ptr, void *arg),
+				void *arg)
+{
+	struct rhlist_head *list;
+
+	if (!ht->rhlist) {
+		free_fn(rht_obj(ht, obj), arg);
+		return;
+	}
+
+	list = container_of(obj, struct rhlist_head, rhead);
+	do {
+		obj = &list->rhead;
+		list = rht_dereference(list->next, ht);
+		free_fn(rht_obj(ht, obj), arg);
+	} while (list);
+}
+
+/**
+ * rhashtable_free_and_destroy - free elements and destroy hash table
+ * @ht: the hash table to destroy
+ * @free_fn: callback to release resources of element
+ * @arg: pointer passed to free_fn
+ *
+ * Stops an eventual async resize. If defined, invokes free_fn for each
+ * element to release resources. Please note that RCU protected
+ * readers may still be accessing the elements. Releasing of resources
+ * must occur in a compatible manner. Then frees the bucket array.
+ *
+ * This function will eventually sleep to wait for an async resize
+ * to complete. The caller is responsible that no further write operations
+ * occur in parallel.
+ */
+void rhashtable_free_and_destroy(struct rhashtable *ht,
+				 void (*free_fn)(void *ptr, void *arg),
+				 void *arg)
+{
+	struct bucket_table *tbl, *next_tbl;
+	unsigned int i;
+
+	cancel_work_sync(&ht->run_work);
+
+	mutex_lock(&ht->mutex);
+	tbl = rht_dereference(ht->tbl, ht);
+restart:
+	if (free_fn) {
+		for (i = 0; i < tbl->size; i++) {
+			struct rhash_head *pos, *next;
+
+			cond_resched();
+			for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)),
+			     next = !rht_is_a_nulls(pos) ?
+					rht_dereference(pos->next, ht) : NULL;
+			     !rht_is_a_nulls(pos);
+			     pos = next,
+			     next = !rht_is_a_nulls(pos) ?
+					rht_dereference(pos->next, ht) : NULL)
+				rhashtable_free_one(ht, pos, free_fn, arg);
+		}
+	}
+
+	next_tbl = rht_dereference(tbl->future_tbl, ht);
+	bucket_table_free(tbl);
+	if (next_tbl) {
+		tbl = next_tbl;
+		goto restart;
+	}
+	mutex_unlock(&ht->mutex);
+}
+EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy);
+
+void rhashtable_destroy(struct rhashtable *ht)
+{
+	return rhashtable_free_and_destroy(ht, NULL, NULL);
+}
+EXPORT_SYMBOL_GPL(rhashtable_destroy);
+
+struct rhash_lock_head __rcu **__rht_bucket_nested(
+	const struct bucket_table *tbl, unsigned int hash)
+{
+	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+	unsigned int index = hash & ((1 << tbl->nest) - 1);
+	unsigned int size = tbl->size >> tbl->nest;
+	unsigned int subhash = hash;
+	union nested_table *ntbl;
+
+	ntbl = nested_table_top(tbl);
+	ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash);
+	subhash >>= tbl->nest;
+
+	while (ntbl && size > (1 << shift)) {
+		index = subhash & ((1 << shift) - 1);
+		ntbl = rht_dereference_bucket_rcu(ntbl[index].table,
+						  tbl, hash);
+		size >>= shift;
+		subhash >>= shift;
+	}
+
+	if (!ntbl)
+		return NULL;
+
+	return &ntbl[subhash].bucket;
+
+}
+EXPORT_SYMBOL_GPL(__rht_bucket_nested);
+
+struct rhash_lock_head __rcu **rht_bucket_nested(
+	const struct bucket_table *tbl, unsigned int hash)
+{
+	static struct rhash_lock_head __rcu *rhnull;
+
+	if (!rhnull)
+		INIT_RHT_NULLS_HEAD(rhnull);
+	return __rht_bucket_nested(tbl, hash) ?: &rhnull;
+}
+EXPORT_SYMBOL_GPL(rht_bucket_nested);
+
+struct rhash_lock_head __rcu **rht_bucket_nested_insert(
+	struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
+{
+	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+	unsigned int index = hash & ((1 << tbl->nest) - 1);
+	unsigned int size = tbl->size >> tbl->nest;
+	union nested_table *ntbl;
+
+	ntbl = nested_table_top(tbl);
+	hash >>= tbl->nest;
+	ntbl = nested_table_alloc(ht, &ntbl[index].table,
+				  size <= (1 << shift));
+
+	while (ntbl && size > (1 << shift)) {
+		index = hash & ((1 << shift) - 1);
+		size >>= shift;
+		hash >>= shift;
+		ntbl = nested_table_alloc(ht, &ntbl[index].table,
+					  size <= (1 << shift));
+	}
+
+	if (!ntbl)
+		return NULL;
+
+	return &ntbl[hash].bucket;
+
+}
+EXPORT_SYMBOL_GPL(rht_bucket_nested_insert);
diff --git a/test/rhashtable.h b/test/rhashtable.h
new file mode 100644
index 00000000..6c85b28e
--- /dev/null
+++ b/test/rhashtable.h
@@ -0,0 +1,1286 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Resizable, Scalable, Concurrent Hash Table
+ *
+ * Copyright (c) 2015-2016 Herbert Xu <herbert@gondor.apana.org.au>
+ * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
+ * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
+ *
+ * Code partially derived from nft_hash
+ * Rewritten with rehash code from br_multicast plus single list
+ * pointer as suggested by Josh Triplett
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _LINUX_RHASHTABLE_H
+#define _LINUX_RHASHTABLE_H
+
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/jhash.h>
+#include <linux/list_nulls.h>
+#include <linux/workqueue.h>
+#include <linux/rculist.h>
+#include <linux/bit_spinlock.h>
+
+#include <linux/rhashtable-types.h>
+/*
+ * Objects in an rhashtable have an embedded struct rhash_head
+ * which is linked into a hash chain from the hash table - or one
+ * of two or more hash tables when the rhashtable is being resized.
+ * The end of the chain is marked with a special nulls marker which has
+ * the least significant bit set but otherwise stores the address of
+ * the hash bucket. This allows us to be sure we've found the end
+ * of the right list.
+ * The value stored in the hash bucket has BIT(0) used as a lock bit.
+ * This bit must be atomically set before any changes are made to
+ * the chain. To avoid dereferencing this pointer without clearing
+ * the bit first, we use an opaque 'struct rhash_lock_head *' for the
+ * pointer stored in the bucket. This struct needs to be defined so
+ * that rcu_dereference() works on it, but it has no content so a
+ * cast is needed for it to be useful. This ensures it isn't
+ * used by mistake without clearing the lock bit first.
+ */
+struct rhash_lock_head {};
+
+/* Maximum chain length before rehash
+ *
+ * The maximum (not average) chain length grows with the size of the hash
+ * table, at a rate of (log N)/(log log N).
+ *
+ * The value of 16 is selected so that even if the hash table grew to
+ * 2^32 you would not expect the maximum chain length to exceed it
+ * unless we are under attack (or extremely unlucky).
+ *
+ * As this limit is only to detect attacks, we don't need to set it to a
+ * lower value as you'd need the chain length to vastly exceed 16 to have
+ * any real effect on the system.
+ */
+#define RHT_ELASTICITY	16u
+
+/**
+ * struct bucket_table - Table of hash buckets
+ * @size: Number of hash buckets
+ * @nest: Number of bits of first-level nested table.
+ * @rehash: Current bucket being rehashed
+ * @hash_rnd: Random seed to fold into hash
+ * @walkers: List of active walkers
+ * @rcu: RCU structure for freeing the table
+ * @future_tbl: Table under construction during rehashing
+ * @ntbl: Nested table used when out of memory.
+ * @buckets: size * hash buckets
+ */
+struct bucket_table {
+	unsigned int		size;
+	unsigned int		nest;
+	u32			hash_rnd;
+	struct list_head	walkers;
+	struct rcu_head		rcu;
+
+	struct bucket_table __rcu *future_tbl;
+
+	struct lockdep_map	dep_map;
+
+	struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp;
+};
+
+/*
+ * NULLS_MARKER() expects a hash value with the low
+ * bits most likely to be significant, and it discards
+ * the msb.
+ * We give it an address, in which the bottom bit is
+ * always 0, and the msb might be significant.
+ * So we shift the address down one bit to align with
+ * expectations and avoid losing a significant bit.
+ *
+ * We never store the NULLS_MARKER in the hash table
+ * itself as we need the lsb for locking.
+ * Instead we store a NULL
+ */
+#define	RHT_NULLS_MARKER(ptr)	\
+	((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1))
+#define INIT_RHT_NULLS_HEAD(ptr)	\
+	((ptr) = NULL)
+
+static inline bool rht_is_a_nulls(const struct rhash_head *ptr)
+{
+	return ((unsigned long) ptr & 1);
+}
+
+static inline void *rht_obj(const struct rhashtable *ht,
+			    const struct rhash_head *he)
+{
+	return (char *)he - ht->p.head_offset;
+}
+
+static inline unsigned int rht_bucket_index(const struct bucket_table *tbl,
+					    unsigned int hash)
+{
+	return hash & (tbl->size - 1);
+}
+
+static inline unsigned int rht_key_get_hash(struct rhashtable *ht,
+	const void *key, const struct rhashtable_params params,
+	unsigned int hash_rnd)
+{
+	unsigned int hash;
+
+	/* params must be equal to ht->p if it isn't constant.
*/ + if (!__builtin_constant_p(params.key_len)) + hash = ht->p.hashfn(key, ht->key_len, hash_rnd); + else if (params.key_len) { + unsigned int key_len = params.key_len; + + if (params.hashfn) + hash = params.hashfn(key, key_len, hash_rnd); + else if (key_len & (sizeof(u32) - 1)) + hash = jhash(key, key_len, hash_rnd); + else + hash = jhash2(key, key_len / sizeof(u32), hash_rnd); + } else { + unsigned int key_len = ht->p.key_len; + + if (params.hashfn) + hash = params.hashfn(key, key_len, hash_rnd); + else + hash = jhash(key, key_len, hash_rnd); + } + + return hash; +} + +static inline unsigned int rht_key_hashfn( + struct rhashtable *ht, const struct bucket_table *tbl, + const void *key, const struct rhashtable_params params) +{ + unsigned int hash = rht_key_get_hash(ht, key, params, tbl->hash_rnd); + + return rht_bucket_index(tbl, hash); +} + +static inline unsigned int rht_head_hashfn( + struct rhashtable *ht, const struct bucket_table *tbl, + const struct rhash_head *he, const struct rhashtable_params params) +{ + const char *ptr = rht_obj(ht, he); + + return likely(params.obj_hashfn) ? + rht_bucket_index(tbl, params.obj_hashfn(ptr, params.key_len ?: + ht->p.key_len, + tbl->hash_rnd)) : + rht_key_hashfn(ht, tbl, ptr + params.key_offset, params); +} + +/** + * rht_grow_above_75 - returns true if nelems > 0.75 * table-size + * @ht: hash table + * @tbl: current table + */ +static inline bool rht_grow_above_75(const struct rhashtable *ht, + const struct bucket_table *tbl) +{ + /* Expand table when exceeding 75% load */ + return atomic_read(&ht->nelems) > (tbl->size / 4 * 3) && + (!ht->p.max_size || tbl->size < ht->p.max_size); +} + +/** + * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size + * @ht: hash table + * @tbl: current table + */ +static inline bool rht_shrink_below_30(const struct rhashtable *ht, + const struct bucket_table *tbl) +{ + /* Shrink table beneath 30% load */ + return atomic_read(&ht->nelems) < (tbl->size * 3 / 10) && + tbl->size > ht->p.min_size; +} + +/** + * rht_grow_above_100 - returns true if nelems > table-size + * @ht: hash table + * @tbl: current table + */ +static inline bool rht_grow_above_100(const struct rhashtable *ht, + const struct bucket_table *tbl) +{ + return atomic_read(&ht->nelems) > tbl->size && + (!ht->p.max_size || tbl->size < ht->p.max_size); +} + +/** + * rht_grow_above_max - returns true if table is above maximum + * @ht: hash table + * @tbl: current table + */ +static inline bool rht_grow_above_max(const struct rhashtable *ht, + const struct bucket_table *tbl) +{ + return atomic_read(&ht->nelems) >= ht->max_elems; +} + +#ifdef CONFIG_PROVE_LOCKING +int lockdep_rht_mutex_is_held(struct rhashtable *ht); +int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash); +#else +static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht) +{ + return 1; +} + +static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, + u32 hash) +{ + return 1; +} +#endif /* CONFIG_PROVE_LOCKING */ + +void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, + struct rhash_head *obj); + +void rhashtable_walk_enter(struct rhashtable *ht, + struct rhashtable_iter *iter); +void rhashtable_walk_exit(struct rhashtable_iter *iter); +int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU); + +static inline void rhashtable_walk_start(struct rhashtable_iter *iter) +{ + (void)rhashtable_walk_start_check(iter); +} + +void *rhashtable_walk_next(struct rhashtable_iter *iter); +void 
*rhashtable_walk_peek(struct rhashtable_iter *iter); +void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU); + +void rhashtable_free_and_destroy(struct rhashtable *ht, + void (*free_fn)(void *ptr, void *arg), + void *arg); +void rhashtable_destroy(struct rhashtable *ht); + +struct rhash_lock_head __rcu **rht_bucket_nested( + const struct bucket_table *tbl, unsigned int hash); +struct rhash_lock_head __rcu **__rht_bucket_nested( + const struct bucket_table *tbl, unsigned int hash); +struct rhash_lock_head __rcu **rht_bucket_nested_insert( + struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash); + +#define rht_dereference(p, ht) \ + rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht)) + +#define rht_dereference_rcu(p, ht) \ + rcu_dereference_check(p, lockdep_rht_mutex_is_held(ht)) + +#define rht_dereference_bucket(p, tbl, hash) \ + rcu_dereference_protected(p, lockdep_rht_bucket_is_held(tbl, hash)) + +#define rht_dereference_bucket_rcu(p, tbl, hash) \ + rcu_dereference_check(p, lockdep_rht_bucket_is_held(tbl, hash)) + +#define rht_entry(tpos, pos, member) \ + ({ tpos = container_of(pos, typeof(*tpos), member); 1; }) + +static inline struct rhash_lock_head __rcu *const *rht_bucket( + const struct bucket_table *tbl, unsigned int hash) +{ + return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) : + &tbl->buckets[hash]; +} + +static inline struct rhash_lock_head __rcu **rht_bucket_var( + struct bucket_table *tbl, unsigned int hash) +{ + return unlikely(tbl->nest) ? __rht_bucket_nested(tbl, hash) : + &tbl->buckets[hash]; +} + +static inline struct rhash_lock_head __rcu **rht_bucket_insert( + struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) +{ + return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) : + &tbl->buckets[hash]; +} + +/* + * We lock a bucket by setting BIT(0) in the pointer - this is always + * zero in real pointers. The NULLS mark is never stored in the bucket, + * rather we store NULL if the bucket is empty. + * bit_spin_locks do not handle contention well, but the whole point + * of the hashtable design is to achieve minimum per-bucket contention. + * A nested hash table might not have a bucket pointer. In that case + * we cannot get a lock. For remove and replace the bucket cannot be + * interesting and doesn't need locking. + * For insert we allocate the bucket if this is the last bucket_table, + * and then take the lock. + * Sometimes we unlock a bucket by writing a new pointer there. In that + * case we don't need to unlock, but we do need to reset state such as + * local_bh. For that we have rht_assign_unlock(). As rcu_assign_pointer() + * provides the same release semantics that bit_spin_unlock() provides, + * this is safe. + * When we write to a bucket without unlocking, we use rht_assign_locked(). 
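+ *
+ * The resulting update pattern, sketched for illustration only ("new_head"
+ * stands for whatever the caller wants the chain to point at; tbl, hash
+ * and bkt are as in the helpers below):
+ *
+ *	flags = rht_lock(tbl, bkt);
+ *	head = rht_ptr(bkt, tbl, hash);
+ *	(splice entries into or out of the chain starting at head)
+ *	rht_assign_unlock(tbl, bkt, new_head, flags);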
+ */ + +static inline unsigned long rht_lock(struct bucket_table *tbl, + struct rhash_lock_head __rcu **bkt) +{ + unsigned long flags; + + local_irq_save(flags); + bit_spin_lock(0, (unsigned long *)bkt); + lock_map_acquire(&tbl->dep_map); + return flags; +} + +static inline unsigned long rht_lock_nested(struct bucket_table *tbl, + struct rhash_lock_head __rcu **bucket, + unsigned int subclass) +{ + unsigned long flags; + + local_irq_save(flags); + bit_spin_lock(0, (unsigned long *)bucket); + lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_); + return flags; +} + +static inline void rht_unlock(struct bucket_table *tbl, + struct rhash_lock_head __rcu **bkt, + unsigned long flags) +{ + lock_map_release(&tbl->dep_map); + bit_spin_unlock(0, (unsigned long *)bkt); + local_irq_restore(flags); +} + +static inline struct rhash_head *__rht_ptr( + struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt) +{ + return (struct rhash_head *) + ((unsigned long)p & ~BIT(0) ?: + (unsigned long)RHT_NULLS_MARKER(bkt)); +} + +/* + * Where 'bkt' is a bucket and might be locked: + * rht_ptr_rcu() dereferences that pointer and clears the lock bit. + * rht_ptr() dereferences in a context where the bucket is locked. + * rht_ptr_exclusive() dereferences in a context where exclusive + * access is guaranteed, such as when destroying the table. + */ +static inline struct rhash_head *rht_ptr_rcu( + struct rhash_lock_head __rcu *const *bkt) +{ + return __rht_ptr(rcu_dereference(*bkt), bkt); +} + +static inline struct rhash_head *rht_ptr( + struct rhash_lock_head __rcu *const *bkt, + struct bucket_table *tbl, + unsigned int hash) +{ + return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt); +} + +static inline struct rhash_head *rht_ptr_exclusive( + struct rhash_lock_head __rcu *const *bkt) +{ + return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt); +} + +static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, + struct rhash_head *obj) +{ + if (rht_is_a_nulls(obj)) + obj = NULL; + rcu_assign_pointer(*bkt, (void *)((unsigned long)obj | BIT(0))); +} + +static inline void rht_assign_unlock(struct bucket_table *tbl, + struct rhash_lock_head __rcu **bkt, + struct rhash_head *obj, + unsigned long flags) +{ + if (rht_is_a_nulls(obj)) + obj = NULL; + lock_map_release(&tbl->dep_map); + rcu_assign_pointer(*bkt, (void *)obj); + preempt_enable(); + __release(bitlock); + local_irq_restore(flags); +} + +/** + * rht_for_each_from - iterate over hash chain from given head + * @pos: the &struct rhash_head to use as a loop cursor. + * @head: the &struct rhash_head to start from + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + */ +#define rht_for_each_from(pos, head, tbl, hash) \ + for (pos = head; \ + !rht_is_a_nulls(pos); \ + pos = rht_dereference_bucket((pos)->next, tbl, hash)) + +/** + * rht_for_each - iterate over hash chain + * @pos: the &struct rhash_head to use as a loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + */ +#define rht_for_each(pos, tbl, hash) \ + rht_for_each_from(pos, rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ + tbl, hash) + +/** + * rht_for_each_entry_from - iterate over hash chain from given head + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. 
+ * @head: the &struct rhash_head to start from + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. + */ +#define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member) \ + for (pos = head; \ + (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ + pos = rht_dereference_bucket((pos)->next, tbl, hash)) + +/** + * rht_for_each_entry - iterate over hash chain of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. + */ +#define rht_for_each_entry(tpos, pos, tbl, hash, member) \ + rht_for_each_entry_from(tpos, pos, \ + rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ + tbl, hash, member) + +/** + * rht_for_each_entry_safe - safely iterate over hash chain of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @next: the &struct rhash_head to use as next in loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. + * + * This hash chain list-traversal primitive allows for the looped code to + * remove the loop cursor from the list. + */ +#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ + for (pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ + next = !rht_is_a_nulls(pos) ? \ + rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ + (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ + pos = next, \ + next = !rht_is_a_nulls(pos) ? \ + rht_dereference_bucket(pos->next, tbl, hash) : NULL) + +/** + * rht_for_each_rcu_from - iterate over rcu hash chain from given head + * @pos: the &struct rhash_head to use as a loop cursor. + * @head: the &struct rhash_head to start from + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * + * This hash chain list-traversal primitive may safely run concurrently with + * the _rcu mutation primitives such as rhashtable_insert() as long as the + * traversal is guarded by rcu_read_lock(). + */ +#define rht_for_each_rcu_from(pos, head, tbl, hash) \ + for (({barrier(); }), \ + pos = head; \ + !rht_is_a_nulls(pos); \ + pos = rcu_dereference_raw(pos->next)) + +/** + * rht_for_each_rcu - iterate over rcu hash chain + * @pos: the &struct rhash_head to use as a loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * + * This hash chain list-traversal primitive may safely run concurrently with + * the _rcu mutation primitives such as rhashtable_insert() as long as the + * traversal is guarded by rcu_read_lock(). + */ +#define rht_for_each_rcu(pos, tbl, hash) \ + for (({barrier(); }), \ + pos = rht_ptr_rcu(rht_bucket(tbl, hash)); \ + !rht_is_a_nulls(pos); \ + pos = rcu_dereference_raw(pos->next)) + +/** + * rht_for_each_entry_rcu_from - iterated over rcu hash chain from given head + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @head: the &struct rhash_head to start from + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. 
+ * + * This hash chain list-traversal primitive may safely run concurrently with + * the _rcu mutation primitives such as rhashtable_insert() as long as the + * traversal is guarded by rcu_read_lock(). + */ +#define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \ + for (({barrier(); }), \ + pos = head; \ + (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ + pos = rht_dereference_bucket_rcu(pos->next, tbl, hash)) + +/** + * rht_for_each_entry_rcu - iterate over rcu hash chain of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. + * + * This hash chain list-traversal primitive may safely run concurrently with + * the _rcu mutation primitives such as rhashtable_insert() as long as the + * traversal is guarded by rcu_read_lock(). + */ +#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ + rht_for_each_entry_rcu_from(tpos, pos, \ + rht_ptr_rcu(rht_bucket(tbl, hash)), \ + tbl, hash, member) + +/** + * rhl_for_each_rcu - iterate over rcu hash table list + * @pos: the &struct rlist_head to use as a loop cursor. + * @list: the head of the list + * + * This hash chain list-traversal primitive should be used on the + * list returned by rhltable_lookup. + */ +#define rhl_for_each_rcu(pos, list) \ + for (pos = list; pos; pos = rcu_dereference_raw(pos->next)) + +/** + * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rlist_head to use as a loop cursor. + * @list: the head of the list + * @member: name of the &struct rlist_head within the hashable struct. + * + * This hash chain list-traversal primitive should be used on the + * list returned by rhltable_lookup. + */ +#define rhl_for_each_entry_rcu(tpos, pos, list, member) \ + for (pos = list; pos && rht_entry(tpos, pos, member); \ + pos = rcu_dereference_raw(pos->next)) + +static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, + const void *obj) +{ + struct rhashtable *ht = arg->ht; + const char *ptr = obj; + + return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len); +} + +/* Internal function, do not use. */ +static inline struct rhash_head *__rhashtable_lookup( + struct rhashtable *ht, const void *key, + const struct rhashtable_params params) +{ + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + struct rhash_lock_head __rcu *const *bkt; + struct bucket_table *tbl; + struct rhash_head *he; + unsigned int hash; + + tbl = rht_dereference_rcu(ht->tbl, ht); +restart: + hash = rht_key_hashfn(ht, tbl, key, params); + bkt = rht_bucket(tbl, hash); + do { + rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) { + if (params.obj_cmpfn ? + params.obj_cmpfn(&arg, rht_obj(ht, he)) : + rhashtable_compare(&arg, rht_obj(ht, he))) + continue; + return he; + } + /* An object might have been moved to a different hash chain, + * while we walk along it - better check and retry. + */ + } while (he != RHT_NULLS_MARKER(bkt)); + + /* Ensure we see any new tables. 
*/ + smp_rmb(); + + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (unlikely(tbl)) + goto restart; + + return NULL; +} + +/** + * rhashtable_lookup - search hash table + * @ht: hash table + * @key: the pointer to the key + * @params: hash table parameters + * + * Computes the hash value for the key and traverses the bucket chain looking + * for an entry with an identical key. The first matching entry is returned. + * + * This must only be called under the RCU read lock. + * + * Returns the first entry on which the compare function returned true. + */ +static inline void *rhashtable_lookup( + struct rhashtable *ht, const void *key, + const struct rhashtable_params params) +{ + struct rhash_head *he = __rhashtable_lookup(ht, key, params); + + return he ? rht_obj(ht, he) : NULL; +} + +/** + * rhashtable_lookup_fast - search hash table, without RCU read lock + * @ht: hash table + * @key: the pointer to the key + * @params: hash table parameters + * + * Computes the hash value for the key and traverses the bucket chain looking + * for an entry with an identical key. The first matching entry is returned. + * + * Only use this function when you have other mechanisms guaranteeing + * that the object won't go away after the RCU read lock is released. + * + * Returns the first entry on which the compare function returned true. + */ +static inline void *rhashtable_lookup_fast( + struct rhashtable *ht, const void *key, + const struct rhashtable_params params) +{ + void *obj; + + rcu_read_lock(); + obj = rhashtable_lookup(ht, key, params); + rcu_read_unlock(); + + return obj; +} + +/** + * rhltable_lookup - search hash list table + * @hlt: hash table + * @key: the pointer to the key + * @params: hash table parameters + * + * Computes the hash value for the key and traverses the bucket chain looking + * for an entry with an identical key. All matching entries are returned + * in a list. + * + * This must only be called under the RCU read lock. + * + * Returns the list of entries that match the given key. + */ +static inline struct rhlist_head *rhltable_lookup( + struct rhltable *hlt, const void *key, + const struct rhashtable_params params) +{ + struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params); + + return he ? container_of(he, struct rhlist_head, rhead) : NULL; +} + +/* Internal function, please use rhashtable_insert_fast() instead. This + * function returns the existing element already in hashes if there is a clash, + * otherwise it returns an error via ERR_PTR(). 
+ */ +static inline void *__rhashtable_insert_fast( + struct rhashtable *ht, const void *key, struct rhash_head *obj, + const struct rhashtable_params params, bool rhlist) +{ + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + struct rhash_lock_head __rcu **bkt; + struct rhash_head __rcu **pprev; + struct bucket_table *tbl; + struct rhash_head *head; + unsigned long flags; + unsigned int hash; + int elasticity; + void *data; + + rcu_read_lock(); + + tbl = rht_dereference_rcu(ht->tbl, ht); + hash = rht_head_hashfn(ht, tbl, obj, params); + elasticity = RHT_ELASTICITY; + bkt = rht_bucket_insert(ht, tbl, hash); + data = ERR_PTR(-ENOMEM); + if (!bkt) + goto out; + pprev = NULL; + flags = rht_lock(tbl, bkt); + + if (unlikely(rcu_access_pointer(tbl->future_tbl))) { +slow_path: + rht_unlock(tbl, bkt, flags); + rcu_read_unlock(); + return rhashtable_insert_slow(ht, key, obj); + } + + rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) { + struct rhlist_head *plist; + struct rhlist_head *list; + + elasticity--; + if (!key || + (params.obj_cmpfn ? + params.obj_cmpfn(&arg, rht_obj(ht, head)) : + rhashtable_compare(&arg, rht_obj(ht, head)))) { + pprev = &head->next; + continue; + } + + data = rht_obj(ht, head); + + if (!rhlist) + goto out_unlock; + + + list = container_of(obj, struct rhlist_head, rhead); + plist = container_of(head, struct rhlist_head, rhead); + + RCU_INIT_POINTER(list->next, plist); + head = rht_dereference_bucket(head->next, tbl, hash); + RCU_INIT_POINTER(list->rhead.next, head); + if (pprev) { + rcu_assign_pointer(*pprev, obj); + rht_unlock(tbl, bkt, flags); + } else + rht_assign_unlock(tbl, bkt, obj, flags); + data = NULL; + goto out; + } + + if (elasticity <= 0) + goto slow_path; + + data = ERR_PTR(-E2BIG); + if (unlikely(rht_grow_above_max(ht, tbl))) + goto out_unlock; + + if (unlikely(rht_grow_above_100(ht, tbl))) + goto slow_path; + + /* Inserting at head of list makes unlocking free. */ + head = rht_ptr(bkt, tbl, hash); + + RCU_INIT_POINTER(obj->next, head); + if (rhlist) { + struct rhlist_head *list; + + list = container_of(obj, struct rhlist_head, rhead); + RCU_INIT_POINTER(list->next, NULL); + } + + atomic_inc(&ht->nelems); + rht_assign_unlock(tbl, bkt, obj, flags); + + if (rht_grow_above_75(ht, tbl)) + schedule_work(&ht->run_work); + + data = NULL; +out: + rcu_read_unlock(); + + return data; + +out_unlock: + rht_unlock(tbl, bkt, flags); + goto out; +} + +/** + * rhashtable_insert_fast - insert object into hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Will take the per bucket bitlock to protect against mutual mutations + * on the same bucket. Multiple insertions may occur in parallel unless + * they map to the same bucket. + * + * It is safe to call this function from atomic context. + * + * Will trigger an automatic deferred table resizing if residency in the + * table grows beyond 70%. + */ +static inline int rhashtable_insert_fast( + struct rhashtable *ht, struct rhash_head *obj, + const struct rhashtable_params params) +{ + void *ret; + + ret = __rhashtable_insert_fast(ht, NULL, obj, params, false); + if (IS_ERR(ret)) + return PTR_ERR(ret); + + return ret == NULL ? 
0 : -EEXIST; +} + +/** + * rhltable_insert_key - insert object into hash list table + * @hlt: hash list table + * @key: the pointer to the key + * @list: pointer to hash list head inside object + * @params: hash table parameters + * + * Will take the per bucket bitlock to protect against mutual mutations + * on the same bucket. Multiple insertions may occur in parallel unless + * they map to the same bucket. + * + * It is safe to call this function from atomic context. + * + * Will trigger an automatic deferred table resizing if residency in the + * table grows beyond 70%. + */ +static inline int rhltable_insert_key( + struct rhltable *hlt, const void *key, struct rhlist_head *list, + const struct rhashtable_params params) +{ + return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead, + params, true)); +} + +/** + * rhltable_insert - insert object into hash list table + * @hlt: hash list table + * @list: pointer to hash list head inside object + * @params: hash table parameters + * + * Will take the per bucket bitlock to protect against mutual mutations + * on the same bucket. Multiple insertions may occur in parallel unless + * they map to the same bucket. + * + * It is safe to call this function from atomic context. + * + * Will trigger an automatic deferred table resizing if residency in the + * table grows beyond 70%. + */ +static inline int rhltable_insert( + struct rhltable *hlt, struct rhlist_head *list, + const struct rhashtable_params params) +{ + const char *key = rht_obj(&hlt->ht, &list->rhead); + + key += params.key_offset; + + return rhltable_insert_key(hlt, key, list, params); +} + +/** + * rhashtable_lookup_insert_fast - lookup and insert object into hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * This lookup function may only be used for fixed key hash table (key_len + * parameter set). It will BUG() if used inappropriately. + * + * It is safe to call this function from atomic context. + * + * Will trigger an automatic deferred table resizing if residency in the + * table grows beyond 70%. + */ +static inline int rhashtable_lookup_insert_fast( + struct rhashtable *ht, struct rhash_head *obj, + const struct rhashtable_params params) +{ + const char *key = rht_obj(ht, obj); + void *ret; + + BUG_ON(ht->p.obj_hashfn); + + ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, + false); + if (IS_ERR(ret)) + return PTR_ERR(ret); + + return ret == NULL ? 0 : -EEXIST; +} + +/** + * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Just like rhashtable_lookup_insert_fast(), but this function returns the + * object if it exists, NULL if it did not and the insertion was successful, + * and an ERR_PTR otherwise. + */ +static inline void *rhashtable_lookup_get_insert_fast( + struct rhashtable *ht, struct rhash_head *obj, + const struct rhashtable_params params) +{ + const char *key = rht_obj(ht, obj); + + BUG_ON(ht->p.obj_hashfn); + + return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, + false); +} + +/** + * rhashtable_lookup_insert_key - search and insert object to hash table + * with explicit key + * @ht: hash table + * @key: key + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Lookups may occur in parallel with hashtable mutations and resizing. 
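+ *
+ * A sketch, for illustration only ("obj" is a placeholder object that
+ * embeds a struct rhash_head named "node"; params must supply obj_hashfn
+ * and obj_cmpfn, as key-based insertion requires):
+ *
+ *	err = rhashtable_lookup_insert_key(&ht, key, &obj->node, params);
+ *	if (err == -EEXIST)
+ *		(an object with the same key was already present)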
+ * + * Will trigger an automatic deferred table resizing if residency in the + * table grows beyond 70%. + * + * Returns zero on success. + */ +static inline int rhashtable_lookup_insert_key( + struct rhashtable *ht, const void *key, struct rhash_head *obj, + const struct rhashtable_params params) +{ + void *ret; + + BUG_ON(!ht->p.obj_hashfn || !key); + + ret = __rhashtable_insert_fast(ht, key, obj, params, false); + if (IS_ERR(ret)) + return PTR_ERR(ret); + + return ret == NULL ? 0 : -EEXIST; +} + +/** + * rhashtable_lookup_get_insert_key - lookup and insert object into hash table + * @ht: hash table + * @key: key + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Just like rhashtable_lookup_insert_key(), but this function returns the + * object if it exists, NULL if it does not and the insertion was successful, + * and an ERR_PTR otherwise. + */ +static inline void *rhashtable_lookup_get_insert_key( + struct rhashtable *ht, const void *key, struct rhash_head *obj, + const struct rhashtable_params params) +{ + BUG_ON(!ht->p.obj_hashfn || !key); + + return __rhashtable_insert_fast(ht, key, obj, params, false); +} + +/* Internal function, please use rhashtable_remove_fast() instead */ +static inline int __rhashtable_remove_fast_one( + struct rhashtable *ht, struct bucket_table *tbl, + struct rhash_head *obj, const struct rhashtable_params params, + bool rhlist) +{ + struct rhash_lock_head __rcu **bkt; + struct rhash_head __rcu **pprev; + struct rhash_head *he; + unsigned long flags; + unsigned int hash; + int err = -ENOENT; + + hash = rht_head_hashfn(ht, tbl, obj, params); + bkt = rht_bucket_var(tbl, hash); + if (!bkt) + return -ENOENT; + pprev = NULL; + flags = rht_lock(tbl, bkt); + + rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { + struct rhlist_head *list; + + list = container_of(he, struct rhlist_head, rhead); + + if (he != obj) { + struct rhlist_head __rcu **lpprev; + + pprev = &he->next; + + if (!rhlist) + continue; + + do { + lpprev = &list->next; + list = rht_dereference_bucket(list->next, + tbl, hash); + } while (list && obj != &list->rhead); + + if (!list) + continue; + + list = rht_dereference_bucket(list->next, tbl, hash); + RCU_INIT_POINTER(*lpprev, list); + err = 0; + break; + } + + obj = rht_dereference_bucket(obj->next, tbl, hash); + err = 1; + + if (rhlist) { + list = rht_dereference_bucket(list->next, tbl, hash); + if (list) { + RCU_INIT_POINTER(list->rhead.next, obj); + obj = &list->rhead; + err = 0; + } + } + + if (pprev) { + rcu_assign_pointer(*pprev, obj); + rht_unlock(tbl, bkt, flags); + } else { + rht_assign_unlock(tbl, bkt, obj, flags); + } + goto unlocked; + } + + rht_unlock(tbl, bkt, flags); +unlocked: + if (err > 0) { + atomic_dec(&ht->nelems); + if (unlikely(ht->p.automatic_shrinking && + rht_shrink_below_30(ht, tbl))) + schedule_work(&ht->run_work); + err = 0; + } + + return err; +} + +/* Internal function, please use rhashtable_remove_fast() instead */ +static inline int __rhashtable_remove_fast( + struct rhashtable *ht, struct rhash_head *obj, + const struct rhashtable_params params, bool rhlist) +{ + struct bucket_table *tbl; + int err; + + rcu_read_lock(); + + tbl = rht_dereference_rcu(ht->tbl, ht); + + /* Because we have already taken (and released) the bucket + * lock in old_tbl, if we find that future_tbl is not yet + * visible then that guarantees the entry to still be in + * the old tbl if it exists. 
+ */ + while ((err = __rhashtable_remove_fast_one(ht, tbl, obj, params, + rhlist)) && + (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) + ; + + rcu_read_unlock(); + + return err; +} + +/** + * rhashtable_remove_fast - remove object from hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Since the hash chain is single linked, the removal operation needs to + * walk the bucket chain upon removal. The removal operation is thus + * considerable slow if the hash table is not correctly sized. + * + * Will automatically shrink the table if permitted when residency drops + * below 30%. + * + * Returns zero on success, -ENOENT if the entry could not be found. + */ +static inline int rhashtable_remove_fast( + struct rhashtable *ht, struct rhash_head *obj, + const struct rhashtable_params params) +{ + return __rhashtable_remove_fast(ht, obj, params, false); +} + +/** + * rhltable_remove - remove object from hash list table + * @hlt: hash list table + * @list: pointer to hash list head inside object + * @params: hash table parameters + * + * Since the hash chain is single linked, the removal operation needs to + * walk the bucket chain upon removal. The removal operation is thus + * considerably slower if the hash table is not correctly sized. + * + * Will automatically shrink the table if permitted when residency drops + * below 30% + * + * Returns zero on success, -ENOENT if the entry could not be found. + */ +static inline int rhltable_remove( + struct rhltable *hlt, struct rhlist_head *list, + const struct rhashtable_params params) +{ + return __rhashtable_remove_fast(&hlt->ht, &list->rhead, params, true); +} + +/* Internal function, please use rhashtable_replace_fast() instead */ +static inline int __rhashtable_replace_fast( + struct rhashtable *ht, struct bucket_table *tbl, + struct rhash_head *obj_old, struct rhash_head *obj_new, + const struct rhashtable_params params) +{ + struct rhash_lock_head __rcu **bkt; + struct rhash_head __rcu **pprev; + struct rhash_head *he; + unsigned long flags; + unsigned int hash; + int err = -ENOENT; + + /* Minimally, the old and new objects must have same hash + * (which should mean identifiers are the same). + */ + hash = rht_head_hashfn(ht, tbl, obj_old, params); + if (hash != rht_head_hashfn(ht, tbl, obj_new, params)) + return -EINVAL; + + bkt = rht_bucket_var(tbl, hash); + if (!bkt) + return -ENOENT; + + pprev = NULL; + flags = rht_lock(tbl, bkt); + + rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { + if (he != obj_old) { + pprev = &he->next; + continue; + } + + rcu_assign_pointer(obj_new->next, obj_old->next); + if (pprev) { + rcu_assign_pointer(*pprev, obj_new); + rht_unlock(tbl, bkt, flags); + } else { + rht_assign_unlock(tbl, bkt, obj_new, flags); + } + err = 0; + goto unlocked; + } + + rht_unlock(tbl, bkt, flags); + +unlocked: + return err; +} + +/** + * rhashtable_replace_fast - replace an object in hash table + * @ht: hash table + * @obj_old: pointer to hash head inside object being replaced + * @obj_new: pointer to hash head inside object which is new + * @params: hash table parameters + * + * Replacing an object doesn't affect the number of elements in the hash table + * or bucket, so we don't need to worry about shrinking or expanding the + * table here. + * + * Returns zero on success, -ENOENT if the entry could not be found, + * -EINVAL if hash is not the same for the old and new objects. 
+ */ +static inline int rhashtable_replace_fast( + struct rhashtable *ht, struct rhash_head *obj_old, + struct rhash_head *obj_new, + const struct rhashtable_params params) +{ + struct bucket_table *tbl; + int err; + + rcu_read_lock(); + + tbl = rht_dereference_rcu(ht->tbl, ht); + + /* Because we have already taken (and released) the bucket + * lock in old_tbl, if we find that future_tbl is not yet + * visible then that guarantees the entry to still be in + * the old tbl if it exists. + */ + while ((err = __rhashtable_replace_fast(ht, tbl, obj_old, + obj_new, params)) && + (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) + ; + + rcu_read_unlock(); + + return err; +} + +/** + * rhltable_walk_enter - Initialise an iterator + * @hlt: Table to walk over + * @iter: Hash table Iterator + * + * This function prepares a hash table walk. + * + * Note that if you restart a walk after rhashtable_walk_stop you + * may see the same object twice. Also, you may miss objects if + * there are removals in between rhashtable_walk_stop and the next + * call to rhashtable_walk_start. + * + * For a completely stable walk you should construct your own data + * structure outside the hash table. + * + * This function may be called from any process context, including + * non-preemptable context, but cannot be called from softirq or + * hardirq context. + * + * You must call rhashtable_walk_exit after this function returns. + */ +static inline void rhltable_walk_enter(struct rhltable *hlt, + struct rhashtable_iter *iter) +{ + rhashtable_walk_enter(&hlt->ht, iter); +} + +/** + * rhltable_free_and_destroy - free elements and destroy hash list table + * @hlt: the hash list table to destroy + * @free_fn: callback to release resources of element + * @arg: pointer passed to free_fn + * + * See documentation for rhashtable_free_and_destroy. 
+ */ +static inline void rhltable_free_and_destroy(struct rhltable *hlt, + void (*free_fn)(void *ptr, + void *arg), + void *arg) +{ + rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); +} + +static inline void rhltable_destroy(struct rhltable *hlt) +{ + rhltable_free_and_destroy(hlt, NULL, NULL); +} + +#endif /* _LINUX_RHASHTABLE_H */ From 10e9735532acfe9b5960579518f8e22a03fb65bf Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 12 May 2025 11:29:09 -0700 Subject: [PATCH 311/625] Make tests build with rhashtable.c from kernel --- test/Makefile | 29 ++++++++++++++++++++++++-- test/mock.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/test/Makefile b/test/Makefile index 48150271..a470a252 100644 --- a/test/Makefile +++ b/test/Makefile @@ -2,6 +2,7 @@ LINUX_VERSION ?= $(shell uname -r) KDIR ?= /lib/modules/$(LINUX_VERSION)/build +LINUX_SRC_DIR ?= /ouster/linux-stable CC ?= gcc CXX ?= g++ PERL ?= perl @@ -36,7 +37,7 @@ endif WARNS := -Wall -Wundef -Wno-trigraphs -Wno-sign-compare \ -Wno-strict-aliasing -Wunused-but-set-variable -Werror -CFLAGS := $(WARNS) -Wstrict-prototypes -MD -g $(CINCLUDES) $(DEFS) +CFLAGS := $(WARNS) -Wstrict-prototypes -MD -no-pie -g $(CINCLUDES) $(DEFS) CCFLAGS := -std=c++11 $(WARNS) -MD -g $(CCINCLUDES) $(DEFS) -fsanitize=address TEST_SRCS := unit_homa_incoming.c \ @@ -78,7 +79,7 @@ HOMA_SRCS += homa_grant.c \ homa_offload.c \ homa_skb.c endif -HOMA_OBJS := $(patsubst %.c,%.o,$(HOMA_SRCS)) +HOMA_OBJS := $(patsubst %.c,%.o,$(HOMA_SRCS)) rhashtable.o OTHER_SRCS := ccutils.cc \ main.c \ @@ -107,6 +108,30 @@ CLEANS = unit $(OBJS) *.d .deps %.e: %.cc $(CXX) -E $(CCFLAGS) $< -o $@ +dummyFile: + $(CXX) -c $(CCFLAGS) \ + -I $(LINUX_SRC_DIR)/include \ + -I $(LINUX_SRC_DIR)/arch/x86/include \ + -I $(LINUX_SRC_DIR)/arch/x86/include/generated \ + -include $(LINUX_SRC_DIR)/include/linux/kconfig.h \ + -O2 $< -o $@ + +# Note: Without -O2 there will be strange compiler errors such as +# 'asm operand 2 probably does not match constraints'. +rhashtable.o: rhashtable.c + gcc \ + -I .. -I . 
\ + -I $(LINUX_SRC_DIR)/arch/x86/include \ + -I $(LINUX_SRC_DIR)/arch/x86/include/generated \ + -I $(LINUX_SRC_DIR)/include \ + -I $(LINUX_SRC_DIR)/include/uapi \ + -include $(LINUX_SRC_DIR)/include/linux/kconfig.h \ + -D__KERNEL__ -D__UNIT_TEST__ -O2 -g -std=gnu11 \ + -fno-strict-aliasing \ + -DKBUILD_MODFILE='"lib/rhashtable"' -DKBUILD_BASENAME='"rhashtable"' \ + -DKBUILD_MODNAME='"rhashtable"' -D__KBUILD_MODNAME=kmod_rhashtable \ + -c $< -o $@ + unit: $(OBJS) $(CXX) $(CFLAGS) $^ -o $@ -lasan diff --git a/test/mock.c b/test/mock.c index 8e977f85..ca2cf82a 100644 --- a/test/mock.c +++ b/test/mock.c @@ -261,7 +261,11 @@ struct net_hotdata net_hotdata = { .rps_sock_flow_table = (struct rps_sock_flow_table *) sock_flow_table }; int debug_locks; +struct static_call_key __SCK__cond_resched; struct static_call_key __SCK__might_resched; +struct static_call_key __SCK__preempt_schedule; +struct paravirt_patch_template pv_ops; +struct workqueue_struct *system_wq; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map rcu_lock_map; @@ -312,6 +316,19 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, return 0; } +void BUG_func(void) +{} + +void call_rcu(struct rcu_head *head, void free_func(struct rcu_head *head)) +{ + free_func(head); +} + +bool cancel_work_sync(struct work_struct *work) +{ + return false; +} + void __check_object_size(const void *ptr, unsigned long n, bool to_user) {} size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *iter) @@ -437,6 +454,11 @@ void get_random_bytes(void *buf, size_t nbytes) memset(buf, 0, nbytes); } +u32 get_random_u32(void) +{ + return 0; +} + int hrtimer_cancel(struct hrtimer *timer) { return 0; @@ -894,6 +916,16 @@ void *__kmalloc_noprof(size_t size, gfp_t flags) return mock_kmalloc(size, flags); } +void kvfree(const void *addr) +{ + kfree(addr); +} + +void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) +{ + return mock_kmalloc(size, flags); +} + struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], @@ -1000,6 +1032,12 @@ int netif_receive_skb(struct sk_buff *skb) return 0; } +void preempt_count_add(int val) +{} + +void preempt_count_sub(int val) +{} + long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) { @@ -1087,6 +1125,12 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) return NULL; } +bool queue_work_on(int cpu, struct workqueue_struct *wq, + struct work_struct *work) +{ + return true; +} + void _raw_spin_lock(raw_spinlock_t *lock) { mock_record_locked(lock); @@ -1121,6 +1165,12 @@ int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) return 1; } +void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) +{ + UNIT_HOOK("unlock"); + mock_record_unlocked(lock); +} + void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) { UNIT_HOOK("unlock"); @@ -1204,11 +1254,19 @@ signed long schedule_timeout(signed long timeout) return timeout - 1; } +int __SCT__cond_resched(void) +{ + return 0; +} + int __SCT__might_resched(void) { return 0; } +void __SCT__preempt_schedule(void) +{} + void security_sk_classify_flow(const struct sock *sk, struct flowi_common *flic) {} From ae500abca5f29fad54715ceb92a76f260a693e4a Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 12 May 2025 15:31:26 -0700 Subject: [PATCH 312/625] Refactor homa_peer to use rhashtable --- homa_devel.c | 35 +- homa_incoming.c | 4 +- homa_metrics.c | 2 - homa_metrics.h | 5 - homa_outgoing.c | 2 +- 
homa_peer.c | 287 ++++----- homa_peer.h | 111 +++- homa_rpc.c | 4 +- test/Makefile | 8 +- test/mock.c | 33 +- test/mock.h | 8 + test/rhashtable.h | 1286 ------------------------------------- test/unit_homa_incoming.c | 9 +- test/unit_homa_outgoing.c | 5 +- test/unit_homa_peer.c | 326 +++++----- test/unit_homa_utils.c | 2 +- 16 files changed, 440 insertions(+), 1687 deletions(-) delete mode 100644 test/rhashtable.h diff --git a/homa_devel.c b/homa_devel.c index ba0d4268..66fb4ade 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -377,9 +377,10 @@ void homa_freeze_peers(struct homa *homa) { struct homa_socktab_scan scan; struct homa_freeze_hdr freeze; - struct homa_peer **peers; - int num_peers, i, err; + struct rhashtable_iter iter; + struct homa_peer *peer; struct homa_sock *hsk; + int err; /* Find a socket to use (any will do). */ rcu_read_lock(); @@ -390,25 +391,35 @@ void homa_freeze_peers(struct homa *homa) goto done; } - peers = homa_peertab_get_peers(homa->peers, &num_peers); - if (!peers) { - tt_record("homa_freeze_peers couldn't find peers to freeze"); - goto done; - } freeze.common.type = FREEZE; freeze.common.sport = htons(hsk->port); freeze.common.dport = 0; IF_NO_STRIP(freeze.common.flags = HOMA_TCP_FLAGS); IF_NO_STRIP(freeze.common.urgent = htons(HOMA_TCP_URGENT)); freeze.common.sender_id = 0; - for (i = 0; i < num_peers; i++) { - tt_record1("Sending freeze to 0x%x", tt_addr(peers[i]->addr)); - err = __homa_xmit_control(&freeze, sizeof(freeze), peers[i], hsk); + + rhashtable_walk_enter(&homa->peers->ht, &iter); + rhashtable_walk_start(&iter); + while (true) { + peer = rhashtable_walk_next(&iter); + if (!peer) + break; + if (IS_ERR(peer)) + /* Resize event occurred and walk will restart; + * that could result in duplicate freezes, but + * that's OK. 
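+			 * (In this case rhashtable_walk_next returns
+			 * ERR_PTR(-EAGAIN) and the walk rewinds to the
+			 * beginning of the table.)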
+ */ + continue; + if (peer->ht_key.homa != homa) + continue; + tt_record1("Sending freeze to 0x%x", tt_addr(peer->addr)); + err = __homa_xmit_control(&freeze, sizeof(freeze), peer, hsk); if (err != 0) tt_record2("homa_freeze_peers got error %d in xmit to 0x%x\n", - err, tt_addr(peers[i]->addr)); + err, tt_addr(peer->addr)); } - kfree(peers); + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); done: rcu_read_unlock(); diff --git a/homa_incoming.c b/homa_incoming.c index b89dc364..3565d123 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -891,7 +891,7 @@ void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk) struct homa_peer *peer; int i; - peer = homa_peer_find(hsk->homa->peers, &saddr, &hsk->inet); + peer = homa_peer_find(hsk->homa, &saddr, &hsk->inet); if (!IS_ERR(peer)) { peer->unsched_cutoffs[0] = INT_MAX; for (i = 1; i < HOMA_MAX_PRIORITIES; i++) @@ -939,7 +939,7 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, #endif /* See strip.py */ goto done; } else { - peer = homa_peer_find(hsk->homa->peers, &saddr, &hsk->inet); + peer = homa_peer_find(hsk->homa, &saddr, &hsk->inet); if (IS_ERR(peer)) goto done; } diff --git a/homa_metrics.c b/homa_metrics.c index 55cabe56..84569578 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -261,8 +261,6 @@ char *homa_metrics_print(void) m->throttled_cycles); M("resent_packets %15llu DATA packets sent in response to RESENDs\n", m->resent_packets); - M("peer_hash_links %15llu Hash chain link traversals in peer table\n", - m->peer_hash_links); M("peer_new_entries %15llu New entries created in peer table\n", m->peer_new_entries); M("peer_kmalloc_errors %15llu kmalloc failures creating peer table entries\n", diff --git a/homa_metrics.h b/homa_metrics.h index 8c9f42bf..228ec5c2 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -304,11 +304,6 @@ struct homa_metrics { */ u64 resent_packets; - /** - * @peer_hash_links: total # of link traversals in homa_peer_find. - */ - u64 peer_hash_links; - /** * @peer_new_entries: total # of new entries created in Homa's * peer table (this value doesn't increment if the desired peer is diff --git a/homa_outgoing.c b/homa_outgoing.c index bca01104..1ba7bc97 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -571,7 +571,7 @@ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk) unknown.common.urgent = htons(HOMA_TCP_URGENT); #endif /* See strip.py */ unknown.common.sender_id = cpu_to_be64(homa_local_id(h->sender_id)); - peer = homa_peer_find(hsk->homa->peers, &saddr, &hsk->inet); + peer = homa_peer_find(hsk->homa, &saddr, &hsk->inet); if (!IS_ERR(peer)) __homa_xmit_control(&unknown, sizeof(unknown), peer, hsk); homa_peer_put(peer); diff --git a/homa_peer.c b/homa_peer.c index 3f7cf097..5964cfd0 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -8,6 +8,15 @@ #include "homa_peer.h" #include "homa_rpc.h" +const struct rhashtable_params ht_params = { + .key_len = sizeof(struct homa_peer_key), + .key_offset = offsetof(struct homa_peer, ht_key), + .head_offset = offsetof(struct homa_peer, ht_linkage), + .nelem_hint = 10000, + .hashfn = homa_peer_hash, + .obj_cmpfn = homa_peer_compare +}; + /** * homa_peertab_init() - Constructor for homa_peertabs. * @peertab: The object to initialize; previous contents are discarded. @@ -20,199 +29,88 @@ int homa_peertab_init(struct homa_peertab *peertab) * safe to call homa_peertab_destroy, even if this function returns * an error. 
*/ - int i; + int status; - spin_lock_init(&peertab->write_lock); - peertab->buckets = vmalloc(HOMA_PEERTAB_BUCKETS * - sizeof(*peertab->buckets)); - if (!peertab->buckets) - return -ENOMEM; - for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) - INIT_HLIST_HEAD(&peertab->buckets[i]); - return 0; + status = rhashtable_init(&peertab->ht, &ht_params); + peertab->live = (status == 0); + return status; } /** - * homa_peertab_destroy() - Destructor for homa_peertabs. After this - * function returns, it is unsafe to use any results from previous calls - * to homa_peer_find, since all existing homa_peer objects will have been - * destroyed. - * @peertab: The table to destroy. + * homa_peertab_free_fn() - This function is invoked for each entry in + * the peer hash table by the rhashtable code when the table is being + * deleted. It frees its argument. + * @object: struct homa_peer to free. + * @dummy: Not used. */ -void homa_peertab_destroy(struct homa_peertab *peertab) +void homa_peertab_free_fn(void *object, void *dummy) { - struct hlist_node *next; - struct homa_peer *peer; - int i; + struct homa_peer *peer = object; - if (!peertab->buckets) - return; - - spin_lock_bh(&peertab->write_lock); - for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) { - hlist_for_each_entry_safe(peer, next, &peertab->buckets[i], - peertab_links) { - if (atomic_read(&peer->refs) != 0) #ifdef __UNIT_TEST__ - FAIL(" %s found peer %s with reference count %d", - __func__, - homa_print_ipv6_addr(&peer->addr), - atomic_read(&peer->refs)); - + if (atomic_read(&peer->refs) != 0) { + if (!mock_peertab_free_fn_no_complain) + FAIL(" %s found peer %s with reference count %d", + __func__, + homa_print_ipv6_addr(&peer->addr), + atomic_read(&peer->refs)); + } #else /* __UNIT_TEST__ */ - pr_err("%s found peer with reference count %d", - __func__, atomic_read(&peer->refs)); + if (atomic_read(&peer->refs) != 0) + pr_err("%s found peer with reference count %d", + __func__, atomic_read(&peer->refs)); #endif - dst_release(peer->dst); - kfree(peer); - } - } - vfree(peertab->buckets); - spin_unlock_bh(&peertab->write_lock); + else + homa_peer_free(peer); } -#ifndef __UPSTREAM__ /* See strip.py */ /** - * homa_peertab_get_peers() - Return information about all of the peers - * currently known - * @peertab: The table to search for peers. - * @num_peers: Modified to hold the number of peers returned. - * Return: kmalloced array holding pointers to all known peers. The - * caller must free this. If there is an error, or if there - * are no peers, NULL is returned. Note: if a large number - * of new peers are created while this function executes, - * then the results may not be complete. + * homa_peertab_destroy() - Destructor for homa_peertabs. After this + * function returns, it is unsafe to use any results from previous calls + * to homa_peer_find, since all existing homa_peer objects will have been + * destroyed. + * @peertab: The table to destroy. */ -struct homa_peer **homa_peertab_get_peers(struct homa_peertab *peertab, - int *num_peers) +void homa_peertab_destroy(struct homa_peertab *peertab) { - int i, slots, next_slot; - struct homa_peer **result; - struct homa_peer *peer; - - /* Note: RCU must be used in the iterators below to ensure safety - * with concurrent insertions. Technically, rcu_read_lock and - * rcu_read_unlock shouldn't be necessary because we don't have to - * worry about concurrent deletions. But without them, some sanity - * checkers will complain. - */ - rcu_read_lock(); - - /* Figure out how large an array to allocate. 
*/ - slots = 0; - next_slot = 0; - result = NULL; - if (peertab->buckets) { - for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) { - hlist_for_each_entry_rcu(peer, &peertab->buckets[i], - peertab_links) - slots++; - } + if (peertab->live) { + rhashtable_free_and_destroy(&peertab->ht, homa_peertab_free_fn, + NULL); + peertab->live = false; } - if (slots == 0) - goto done; - - /* Allocate extra space in case new peers got created while we - * were counting. - */ - slots += 10; - result = kmalloc_array(slots, sizeof(peer), GFP_ATOMIC); - if (!result) - goto done; - for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) { - hlist_for_each_entry_rcu(peer, &peertab->buckets[i], - peertab_links) { - result[next_slot] = peer; - next_slot++; - - /* We might not have allocated enough extra space. */ - if (next_slot >= slots) - goto done; - } - } -done: - rcu_read_unlock(); - *num_peers = next_slot; - return result; } -#endif /* See strip.py */ /** - * homa_peer_find() - Returns the peer associated with a given host; creates - * a new homa_peer if one doesn't already exist. - * @peertab: Peer table in which to perform lookup. + * homa_peer_alloc() - Allocate and initialize a new homa_peer object. + * @homa: Homa context in which the peer will be used. * @addr: Address of the desired host: IPv4 addresses are represented * as IPv4-mapped IPv6 addresses. * @inet: Socket that will be used for sending packets. - * * Return: The peer associated with @addr, or a negative errno if an * error occurred. On a successful return the reference count - * will be incremented for the returned peer. The caller must - * eventually call homa_peer_put to release the reference. + * will be incremented for the returned peer. */ -struct homa_peer *homa_peer_find(struct homa_peertab *peertab, - const struct in6_addr *addr, - struct inet_sock *inet) +struct homa_peer *homa_peer_alloc(struct homa *homa, + const struct in6_addr *addr, + struct inet_sock *inet) { struct homa_peer *peer; struct dst_entry *dst; - // Should use siphash or jhash here: - u32 bucket = hash_32((__force u32)addr->in6_u.u6_addr32[0], - HOMA_PEERTAB_BUCKET_BITS); - - bucket ^= hash_32((__force u32)addr->in6_u.u6_addr32[1], - HOMA_PEERTAB_BUCKET_BITS); - bucket ^= hash_32((__force u32)addr->in6_u.u6_addr32[2], - HOMA_PEERTAB_BUCKET_BITS); - bucket ^= hash_32((__force u32)addr->in6_u.u6_addr32[3], - HOMA_PEERTAB_BUCKET_BITS); - - /* Use RCU operators to ensure safety even if a concurrent call is - * adding a new entry. The calls to rcu_read_lock and rcu_read_unlock - * shouldn't actually be needed, since we don't need to protect - * against concurrent deletion. - */ - rcu_read_lock(); - hlist_for_each_entry_rcu(peer, &peertab->buckets[bucket], - peertab_links) { - if (ipv6_addr_equal(&peer->addr, addr)) { - homa_peer_hold(peer); - rcu_read_unlock(); - return peer; - } - INC_METRIC(peer_hash_links, 1); - } - rcu_read_unlock(); - - /* No existing entry; create a new one. - * - * Note: after we acquire the lock, we have to check again to - * make sure the entry still doesn't exist (it might have been - * created by a concurrent invocation of this function). 
- */ - spin_lock_bh(&peertab->write_lock); - hlist_for_each_entry(peer, &peertab->buckets[bucket], - peertab_links) { - if (ipv6_addr_equal(&peer->addr, addr)) { - homa_peer_hold(peer); - goto done; - } - } peer = kmalloc(sizeof(*peer), GFP_ATOMIC | __GFP_ZERO); if (!peer) { - peer = (struct homa_peer *)ERR_PTR(-ENOMEM); INC_METRIC(peer_kmalloc_errors, 1); - goto done; + return (struct homa_peer *)ERR_PTR(-ENOMEM); } + peer->ht_key.addr = *addr; + peer->ht_key.homa = homa; atomic_set(&peer->refs, 1); peer->addr = *addr; dst = homa_peer_get_dst(peer, inet); if (IS_ERR(dst)) { - kfree(peer); - peer = (struct homa_peer *)PTR_ERR(dst); INC_METRIC(peer_route_errors, 1); - goto done; + kfree(peer); + return (struct homa_peer *)dst; } peer->dst = dst; #ifndef __STRIP__ /* See strip.py */ @@ -221,14 +119,83 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab, INIT_LIST_HEAD(&peer->grantable_rpcs); INIT_LIST_HEAD(&peer->grantable_links); #endif /* See strip.py */ - smp_wmb(); - hlist_add_head_rcu(&peer->peertab_links, &peertab->buckets[bucket]); peer->current_ticks = -1; spin_lock_init(&peer->ack_lock); INC_METRIC(peer_new_entries, 1); + return peer; +} + +/** + * homa_peer_free() - Release any resources in a peer and free the homa_peer + * struct. + * @peer: Structure to free. Must not currently be linked into + * peertab->ht. + */ +void homa_peer_free(struct homa_peer *peer) +{ + dst_release(peer->dst); + kfree(peer); +} + +/** + * homa_peer_find() - Returns the peer associated with a given host; creates + * a new homa_peer if one doesn't already exist. + * @homa: Homa context in which the peer will be used. + * @addr: Address of the desired host: IPv4 addresses are represented + * as IPv4-mapped IPv6 addresses. + * @inet: Socket that will be used for sending packets. + * + * Return: The peer associated with @addr, or a negative errno if an + * error occurred. On a successful return the reference count + * will be incremented for the returned peer. The caller must + * eventually call homa_peer_put to release the reference. + */ +struct homa_peer *homa_peer_find(struct homa *homa, + const struct in6_addr *addr, + struct inet_sock *inet) +{ + struct homa_peer *peer, *other; + struct homa_peer_key key; + u64 start = homa_clock(); -done: - spin_unlock_bh(&peertab->write_lock); + key.addr = *addr; + key.homa = homa; + rcu_read_lock(); + peer = rhashtable_lookup(&homa->peers->ht, &key, ht_params); + if (peer) { + homa_peer_hold(peer); + rcu_read_unlock(); + tt_record1("homa_peer_find took %d cycles to find existing peer", + homa_clock() - start); + return peer; + } + + /* No existing entry, so we have to create a new one. */ + peer = homa_peer_alloc(homa, addr, inet); + if (IS_ERR(peer)) { + rcu_read_unlock(); + return peer; + } +#ifdef __UNIT_TEST__ + other = mock_rht_lookup_get_insert_fast(&homa->peers->ht, + &peer->ht_linkage, ht_params); +#else /* __UNIT_TEST__ */ + other = rhashtable_lookup_get_insert_fast(&homa->peers->ht, + &peer->ht_linkage, ht_params); +#endif /* __UNIT_TEST__ */ + if (IS_ERR(other)) { + homa_peer_free(peer); + rcu_read_unlock(); + return other; + } + if (other) { + /* Someone else already created the desired peer. */ + homa_peer_hold(other); + rcu_read_unlock(); + homa_peer_free(peer); + return other; + } + rcu_read_unlock(); return peer; } @@ -288,7 +255,7 @@ int homa_unsched_priority(struct homa *homa, struct homa_peer *peer, * @peer: The peer for which a dst is needed. Note: this peer's flow * struct will be overwritten. 
 * @inet:    Socket that will be used for sending packets.
- * Return:   The dst structure (or an ERR_PTR).
+ * Return:   The dst structure (or an ERR_PTR); a reference has been taken.
  */
 struct dst_entry *homa_peer_get_dst(struct homa_peer *peer,
				    struct inet_sock *inet)
diff --git a/homa_peer.h b/homa_peer.h
index b3d244d1..dd21afdf 100644
--- a/homa_peer.h
+++ b/homa_peer.h
@@ -10,6 +10,8 @@
 #include "homa_wire.h"
 #include "homa_sock.h"
 
+#include <linux/rhashtable.h>
+
 struct homa_rpc;
 
 /**
@@ -50,18 +52,28 @@ struct homa_dead_dst {
  * permit efficient lookups.
  */
 struct homa_peertab {
+	/** @ht: Hash table that stores all struct homa_peer objects. */
+	struct rhashtable ht;
+
 	/**
-	 * @write_lock: Synchronizes addition of new entries; not needed
-	 * for lookups (RCU is used instead).
+	 * @live: True means ht has been successfully initialized and
+	 * not yet destroyed.
 	 */
-	spinlock_t write_lock;
+	bool live;
+};
 
+/**
+ * struct homa_peer_key - Used to look up homa_peer structs in an rhashtable.
+ */
+struct homa_peer_key {
 	/**
-	 * @buckets: Pointer to heads of chains of homa_peers for each bucket.
-	 * Malloc-ed, and must eventually be freed. NULL means this structure
-	 * has not been initialized.
+	 * @addr: Address of the desired host. IPv4 addresses are represented
+	 * with IPv4-mapped IPv6 addresses.
 	 */
-	struct hlist_head *buckets;
+	struct in6_addr addr;
+
+	/** @homa: The context in which the peer will be used. */
+	struct homa *homa;
 };
 
 /**
@@ -69,17 +81,25 @@ struct homa_peertab {
  * have communicated with (either as client or server).
  */
 struct homa_peer {
+	/** @ht_key: The hash table key for this peer in peertab->ht. */
+	struct homa_peer_key ht_key;
+
+	/**
+	 * @ht_linkage: Used by the rhashtable implementation to link this
+	 * peer into peertab->ht.
+	 */
+	struct rhash_head ht_linkage;
+
 	/**
 	 * @refs: Number of unmatched calls to homa_peer_hold; it's not safe
 	 * to free this object until the reference count is zero.
 	 */
-	atomic_t refs;
+	atomic_t refs ____cacheline_aligned_in_smp;
 
 	/**
 	 * @addr: IPv6 address for the machine (IPv4 addresses are stored
 	 * as IPv4-mapped IPv6 addresses).
 	 */
-	struct in6_addr addr;
+	struct in6_addr addr ____cacheline_aligned_in_smp;
 
 	/** @flow: Addressing info needed to send packets. */
 	struct flowi flow;
@@ -125,7 +145,8 @@ struct homa_peer {
 	 * involving this peer that are not in homa->active_rpcs but
 	 * whose msgins eventually need more grants. The list is sorted in
 	 * priority order (head has fewest ungranted bytes). Managed by
-	 * homa_grant.c under the grant lock.
+	 * homa_grant.c under the grant lock. If this list is nonempty
+	 * then refs will be nonzero.
 	 */
 	struct list_head grantable_rpcs;
 
@@ -133,17 +154,11 @@
 	 * @grantable_links: Used to link this peer into homa->grantable_peers.
 	 * If this RPC is not linked into homa->grantable_peers, this is an
 	 * empty list pointing to itself. Managed by homa_grant.c under the
-	 * grant lock.abort
+	 * grant lock. If this list is nonempty then refs will be nonzero.
 	 */
 	struct list_head grantable_links;
 #endif /* See strip.py */
 
-	/**
-	 * @peertab_links: Links this object into a bucket of its
-	 * homa_peertab.
-	 */
-	struct hlist_node peertab_links;
-
 	/**
 	 * @outstanding_resends: the number of resend requests we have
 	 * sent to this server (spaced @homa.resend_interval apart) since
@@ -205,22 +220,20 @@ struct homa_peer {
 void homa_dst_refresh(struct homa_peertab *peertab,
		      struct homa_peer *peer, struct homa_sock *hsk);
 void homa_peertab_destroy(struct homa_peertab *peertab);
-#ifndef __UPSTREAM__ /* See strip.py */
-struct homa_peer **
-	homa_peertab_get_peers(struct homa_peertab *peertab,
-			       int *num_peers);
-#endif /* See strip.py */
+void homa_peertab_free_fn(void *object, void *dummy);
 int homa_peertab_init(struct homa_peertab *peertab);
 void homa_peer_add_ack(struct homa_rpc *rpc);
 struct homa_peer
-	*homa_peer_find(struct homa_peertab *peertab,
-			const struct in6_addr *addr,
-			struct inet_sock *inet);
+	*homa_peer_alloc(struct homa *homa, const struct in6_addr *addr,
+			 struct inet_sock *inet);
+struct homa_peer
+	*homa_peer_find(struct homa *homa, const struct in6_addr *addr,
+			struct inet_sock *inet);
+void homa_peer_free(struct homa_peer *peer);
 int homa_peer_get_acks(struct homa_peer *peer, int count,
		       struct homa_ack *dst);
 struct dst_entry
-	*homa_peer_get_dst(struct homa_peer *peer,
-			   struct inet_sock *inet);
+	*homa_peer_get_dst(struct homa_peer *peer, struct inet_sock *inet);
 #ifndef __STRIP__ /* See strip.py */
 void homa_peer_lock_slow(struct homa_peer *peer);
 void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1,
@@ -299,4 +312,51 @@ static inline void homa_peer_put(struct homa_peer *peer)
	atomic_dec(&peer->refs);
 }
 
+static inline u32 homa_peer_hash(const void *data, u32 dummy, u32 seed)
+{
+	/* This is MurmurHash3, used instead of the jhash default because it
+	 * is faster (25 ns vs. 40 ns as of May 2025).
+	 */
+	const u32 len = sizeof(struct homa_peer_key) >> 2;
+	const u32 c1 = 0xcc9e2d51;
+	const u32 c2 = 0x1b873593;
+	const u32 *key = data;
+	u32 h = seed;
+
+	BUILD_BUG_ON(sizeof(struct homa_peer_key) & 0x3);
+
+	for (size_t i = 0; i < len; i++) {
+		u32 k = key[i];
+		k *= c1;
+		k = (k << 15) | (k >> (32 - 15));
+		k *= c2;
+
+		h ^= k;
+		h = (h << 13) | (h >> (32 - 13));
+		h = h * 5 + 0xe6546b64;
+	}
+
+	h ^= len * 4;	/* Total number of input bytes. */
+
+	h ^= h >> 16;
+	h *= 0x85ebca6b;
+	h ^= h >> 13;
+	h *= 0xc2b2ae35;
+	h ^= h >> 16;
+	return h;
+}
+
+/* Returns zero if @obj matches @arg->key (the convention required of an
+ * rhashtable obj_cmpfn), nonzero otherwise.
+ */
+static inline int homa_peer_compare(struct rhashtable_compare_arg *arg,
+				    const void *obj)
+{
+	const struct homa_peer *peer = obj;
+	const struct homa_peer_key *key = arg->key;
+
+	return !ipv6_addr_equal(&key->addr, &peer->ht_key.addr) ||
+	       peer->ht_key.homa != key->homa;
+}
+
 #endif /* _HOMA_PEER_H */
diff --git a/homa_rpc.c b/homa_rpc.c
index 7f334a42..c524c48d 100644
--- a/homa_rpc.c
+++ b/homa_rpc.c
@@ -46,7 +46,7 @@ struct homa_rpc *homa_rpc_alloc_client(struct homa_sock *hsk,
	bucket = homa_client_rpc_bucket(hsk, crpc->id);
	crpc->bucket = bucket;
	crpc->state = RPC_OUTGOING;
-	crpc->peer = homa_peer_find(hsk->homa->peers, &dest_addr_as_ipv6,
+	crpc->peer = homa_peer_find(hsk->homa, &dest_addr_as_ipv6,
				    &hsk->inet);
	if (IS_ERR(crpc->peer)) {
		tt_record("error in homa_peer_find");
@@ -148,7 +148,7 @@ struct homa_rpc *homa_rpc_alloc_server(struct homa_sock *hsk,
	srpc->hsk = hsk;
	srpc->bucket = bucket;
	srpc->state = RPC_INCOMING;
-	srpc->peer = homa_peer_find(hsk->homa->peers, source, &hsk->inet);
+	srpc->peer = homa_peer_find(hsk->homa, source, &hsk->inet);
	if (IS_ERR(srpc->peer)) {
		err = PTR_ERR(srpc->peer);
		srpc->peer = NULL;
diff --git a/test/Makefile b/test/Makefile
index a470a252..0b04acf8 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -119,13 +119,7 @@ dummyFile:
 # Note: Without -O2 there will be strange compiler errors such as
 # 'asm operand 2 probably does not match constraints'.
 rhashtable.o: rhashtable.c
-	gcc \
-	-I .. -I . \
-	-I $(LINUX_SRC_DIR)/arch/x86/include \
-	-I $(LINUX_SRC_DIR)/arch/x86/include/generated \
-	-I $(LINUX_SRC_DIR)/include \
-	-I $(LINUX_SRC_DIR)/include/uapi \
-	-include $(LINUX_SRC_DIR)/include/linux/kconfig.h \
+	gcc $(CINCLUDES) \
	-D__KERNEL__ -D__UNIT_TEST__ -O2 -g -std=gnu11 \
	-fno-strict-aliasing \
	-DKBUILD_MODFILE='"lib/rhashtable"' -DKBUILD_BASENAME='"rhashtable"' \
diff --git a/test/mock.c b/test/mock.c
index ca2cf82a..858aad01 100644
--- a/test/mock.c
+++ b/test/mock.c
@@ -14,6 +14,8 @@
 #include "mock.h"
 #include "utils.h"
 
+#include <linux/rhashtable.h>
+
 /* It isn't safe to include some header files, such as stdlib, because
  * they conflict with kernel header files. The explicit declarations
  * below replace those header files.
@@ -46,6 +48,7 @@ int mock_kthread_create_errors;
 int mock_prepare_to_wait_errors;
 int mock_register_protosw_errors;
 int mock_register_sysctl_errors;
+int mock_rht_insert_errors;
 int mock_route_errors;
 int mock_spin_lock_held;
 int mock_trylock_errors;
@@ -228,6 +231,11 @@ static struct socket mock_socket;
 static struct homa *mock_homa;
 struct net mock_net;
 
+/* Nonzero means don't generate an error message in homa_peertab_free_fn
+ * if the reference count isn't zero.
+ */
+int mock_peertab_free_fn_no_complain;
+
 struct dst_ops mock_dst_ops = {.mtu = mock_get_mtu};
 struct netdev_queue mock_net_queue = {.state = 0};
 struct net_device mock_net_device = {
@@ -1033,10 +1041,20 @@ int netif_receive_skb(struct sk_buff *skb)
 }
 
 void preempt_count_add(int val)
-{}
+{
+	int i;
+
+	for (i = 0; i < val; i++)
+		preempt_disable();
+}
 
 void preempt_count_sub(int val)
-{}
+{
+	int i;
+
+	for (i = 0; i < val; i++)
+		preempt_enable();
+}
 
 long prepare_to_wait_event(struct wait_queue_head *wq_head,
			   struct wait_queue_entry *wq_entry, int state)
@@ -1241,6 +1259,15 @@ void remove_wait_queue(struct wait_queue_head *wq_head,
		       struct wait_queue_entry *wq_entry)
 {}
 
+void *mock_rht_lookup_get_insert_fast(struct rhashtable *ht,
+				      struct rhash_head *obj,
+				      const struct rhashtable_params params)
+{
+	if (mock_check_error(&mock_rht_insert_errors))
+		return ERR_PTR(-EINVAL);
+	return rhashtable_lookup_get_insert_fast(ht, obj, params);
+}
+
 void schedule(void)
 {
	UNIT_HOOK("schedule");
@@ -2035,6 +2062,7 @@ void mock_teardown(void)
	mock_prepare_to_wait_errors = 0;
	mock_register_protosw_errors = 0;
	mock_register_sysctl_errors = 0;
+	mock_rht_insert_errors = 0;
	mock_wait_intr_irq_errors = 0;
	mock_copy_to_user_dont_copy = 0;
	mock_bpage_size = 0x10000;
@@ -2060,6 +2088,7 @@ void mock_teardown(void)
	mock_min_default_port = 0x8000;
	mock_homa = NULL;
	homa_net_id = 0;
+	mock_peertab_free_fn_no_complain = 0;
	mock_net_device.gso_max_size = 0;
	mock_net_device.gso_max_segs = 1000;
	memset(inet_offloads, 0, sizeof(inet_offloads));
diff --git a/test/mock.h b/test/mock.h
index 725d3008..9f73eddc 100644
--- a/test/mock.h
+++ b/test/mock.h
@@ -50,6 +50,13 @@
 #define kthread_complete_and_exit(...)
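+/* rht_lock() in the kernel's rhashtable.h calls local_irq_save, which
+ * would otherwise leave its flags argument uninitialized here; interrupts
+ * don't exist in user-space unit tests, so just zero the flags.
+ */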
+#undef local_irq_save +#define local_irq_save(flags) (flags) = 0 + #define net_generic(net, id) mock_net_generic(net, id) #ifdef page_address @@ -141,8 +144,10 @@ extern struct net_device mock_net_device; extern int mock_numa_mask; extern int mock_page_nid_mask; +extern int mock_peertab_free_fn_no_complain; extern int mock_prepare_to_wait_status; extern char mock_printk_output[]; +extern int mock_rht_insert_errors; extern int mock_route_errors; extern int mock_signal_pending; extern int mock_sock_holds; @@ -186,6 +191,9 @@ struct ctl_table_header * mock_register_net_sysctl(struct net *net, const char *path, struct ctl_table *table); +void *mock_rht_lookup_get_insert_fast(struct rhashtable *ht, + struct rhash_head *obj, + const struct rhashtable_params params); void mock_rpc_hold(struct homa_rpc *rpc); void mock_rpc_put(struct homa_rpc *rpc); void mock_set_clock_vals(u64 t, ...); diff --git a/test/rhashtable.h b/test/rhashtable.h deleted file mode 100644 index 6c85b28e..00000000 --- a/test/rhashtable.h +++ /dev/null @@ -1,1286 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Resizable, Scalable, Concurrent Hash Table - * - * Copyright (c) 2015-2016 Herbert Xu - * Copyright (c) 2014-2015 Thomas Graf - * Copyright (c) 2008-2014 Patrick McHardy - * - * Code partially derived from nft_hash - * Rewritten with rehash code from br_multicast plus single list - * pointer as suggested by Josh Triplett - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#ifndef _LINUX_RHASHTABLE_H -#define _LINUX_RHASHTABLE_H - -#include -#include -#include -#include -#include -#include -#include - -#include -/* - * Objects in an rhashtable have an embedded struct rhash_head - * which is linked into as hash chain from the hash table - or one - * of two or more hash tables when the rhashtable is being resized. - * The end of the chain is marked with a special nulls marks which has - * the least significant bit set but otherwise stores the address of - * the hash bucket. This allows us to be sure we've found the end - * of the right list. - * The value stored in the hash bucket has BIT(0) used as a lock bit. - * This bit must be atomically set before any changes are made to - * the chain. To avoid dereferencing this pointer without clearing - * the bit first, we use an opaque 'struct rhash_lock_head *' for the - * pointer stored in the bucket. This struct needs to be defined so - * that rcu_dereference() works on it, but it has no content so a - * cast is needed for it to be useful. This ensures it isn't - * used by mistake with clearing the lock bit first. - */ -struct rhash_lock_head {}; - -/* Maximum chain length before rehash - * - * The maximum (not average) chain length grows with the size of the hash - * table, at a rate of (log N)/(log log N). - * - * The value of 16 is selected so that even if the hash table grew to - * 2^32 you would not expect the maximum chain length to exceed it - * unless we are under attack (or extremely unlucky). - * - * As this limit is only to detect attacks, we don't need to set it to a - * lower value as you'd need the chain length to vastly exceed 16 to have - * any real effect on the system. - */ -#define RHT_ELASTICITY 16u - -/** - * struct bucket_table - Table of hash buckets - * @size: Number of hash buckets - * @nest: Number of bits of first-level nested table. 
- * @rehash: Current bucket being rehashed - * @hash_rnd: Random seed to fold into hash - * @walkers: List of active walkers - * @rcu: RCU structure for freeing the table - * @future_tbl: Table under construction during rehashing - * @ntbl: Nested table used when out of memory. - * @buckets: size * hash buckets - */ -struct bucket_table { - unsigned int size; - unsigned int nest; - u32 hash_rnd; - struct list_head walkers; - struct rcu_head rcu; - - struct bucket_table __rcu *future_tbl; - - struct lockdep_map dep_map; - - struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp; -}; - -/* - * NULLS_MARKER() expects a hash value with the low - * bits mostly likely to be significant, and it discards - * the msb. - * We give it an address, in which the bottom bit is - * always 0, and the msb might be significant. - * So we shift the address down one bit to align with - * expectations and avoid losing a significant bit. - * - * We never store the NULLS_MARKER in the hash table - * itself as we need the lsb for locking. - * Instead we store a NULL - */ -#define RHT_NULLS_MARKER(ptr) \ - ((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1)) -#define INIT_RHT_NULLS_HEAD(ptr) \ - ((ptr) = NULL) - -static inline bool rht_is_a_nulls(const struct rhash_head *ptr) -{ - return ((unsigned long) ptr & 1); -} - -static inline void *rht_obj(const struct rhashtable *ht, - const struct rhash_head *he) -{ - return (char *)he - ht->p.head_offset; -} - -static inline unsigned int rht_bucket_index(const struct bucket_table *tbl, - unsigned int hash) -{ - return hash & (tbl->size - 1); -} - -static inline unsigned int rht_key_get_hash(struct rhashtable *ht, - const void *key, const struct rhashtable_params params, - unsigned int hash_rnd) -{ - unsigned int hash; - - /* params must be equal to ht->p if it isn't constant. */ - if (!__builtin_constant_p(params.key_len)) - hash = ht->p.hashfn(key, ht->key_len, hash_rnd); - else if (params.key_len) { - unsigned int key_len = params.key_len; - - if (params.hashfn) - hash = params.hashfn(key, key_len, hash_rnd); - else if (key_len & (sizeof(u32) - 1)) - hash = jhash(key, key_len, hash_rnd); - else - hash = jhash2(key, key_len / sizeof(u32), hash_rnd); - } else { - unsigned int key_len = ht->p.key_len; - - if (params.hashfn) - hash = params.hashfn(key, key_len, hash_rnd); - else - hash = jhash(key, key_len, hash_rnd); - } - - return hash; -} - -static inline unsigned int rht_key_hashfn( - struct rhashtable *ht, const struct bucket_table *tbl, - const void *key, const struct rhashtable_params params) -{ - unsigned int hash = rht_key_get_hash(ht, key, params, tbl->hash_rnd); - - return rht_bucket_index(tbl, hash); -} - -static inline unsigned int rht_head_hashfn( - struct rhashtable *ht, const struct bucket_table *tbl, - const struct rhash_head *he, const struct rhashtable_params params) -{ - const char *ptr = rht_obj(ht, he); - - return likely(params.obj_hashfn) ? 
- rht_bucket_index(tbl, params.obj_hashfn(ptr, params.key_len ?: - ht->p.key_len, - tbl->hash_rnd)) : - rht_key_hashfn(ht, tbl, ptr + params.key_offset, params); -} - -/** - * rht_grow_above_75 - returns true if nelems > 0.75 * table-size - * @ht: hash table - * @tbl: current table - */ -static inline bool rht_grow_above_75(const struct rhashtable *ht, - const struct bucket_table *tbl) -{ - /* Expand table when exceeding 75% load */ - return atomic_read(&ht->nelems) > (tbl->size / 4 * 3) && - (!ht->p.max_size || tbl->size < ht->p.max_size); -} - -/** - * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size - * @ht: hash table - * @tbl: current table - */ -static inline bool rht_shrink_below_30(const struct rhashtable *ht, - const struct bucket_table *tbl) -{ - /* Shrink table beneath 30% load */ - return atomic_read(&ht->nelems) < (tbl->size * 3 / 10) && - tbl->size > ht->p.min_size; -} - -/** - * rht_grow_above_100 - returns true if nelems > table-size - * @ht: hash table - * @tbl: current table - */ -static inline bool rht_grow_above_100(const struct rhashtable *ht, - const struct bucket_table *tbl) -{ - return atomic_read(&ht->nelems) > tbl->size && - (!ht->p.max_size || tbl->size < ht->p.max_size); -} - -/** - * rht_grow_above_max - returns true if table is above maximum - * @ht: hash table - * @tbl: current table - */ -static inline bool rht_grow_above_max(const struct rhashtable *ht, - const struct bucket_table *tbl) -{ - return atomic_read(&ht->nelems) >= ht->max_elems; -} - -#ifdef CONFIG_PROVE_LOCKING -int lockdep_rht_mutex_is_held(struct rhashtable *ht); -int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash); -#else -static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht) -{ - return 1; -} - -static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, - u32 hash) -{ - return 1; -} -#endif /* CONFIG_PROVE_LOCKING */ - -void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, - struct rhash_head *obj); - -void rhashtable_walk_enter(struct rhashtable *ht, - struct rhashtable_iter *iter); -void rhashtable_walk_exit(struct rhashtable_iter *iter); -int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU); - -static inline void rhashtable_walk_start(struct rhashtable_iter *iter) -{ - (void)rhashtable_walk_start_check(iter); -} - -void *rhashtable_walk_next(struct rhashtable_iter *iter); -void *rhashtable_walk_peek(struct rhashtable_iter *iter); -void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU); - -void rhashtable_free_and_destroy(struct rhashtable *ht, - void (*free_fn)(void *ptr, void *arg), - void *arg); -void rhashtable_destroy(struct rhashtable *ht); - -struct rhash_lock_head __rcu **rht_bucket_nested( - const struct bucket_table *tbl, unsigned int hash); -struct rhash_lock_head __rcu **__rht_bucket_nested( - const struct bucket_table *tbl, unsigned int hash); -struct rhash_lock_head __rcu **rht_bucket_nested_insert( - struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash); - -#define rht_dereference(p, ht) \ - rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht)) - -#define rht_dereference_rcu(p, ht) \ - rcu_dereference_check(p, lockdep_rht_mutex_is_held(ht)) - -#define rht_dereference_bucket(p, tbl, hash) \ - rcu_dereference_protected(p, lockdep_rht_bucket_is_held(tbl, hash)) - -#define rht_dereference_bucket_rcu(p, tbl, hash) \ - rcu_dereference_check(p, lockdep_rht_bucket_is_held(tbl, hash)) - -#define rht_entry(tpos, pos, member) \ - ({ tpos = 
container_of(pos, typeof(*tpos), member); 1; }) - -static inline struct rhash_lock_head __rcu *const *rht_bucket( - const struct bucket_table *tbl, unsigned int hash) -{ - return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) : - &tbl->buckets[hash]; -} - -static inline struct rhash_lock_head __rcu **rht_bucket_var( - struct bucket_table *tbl, unsigned int hash) -{ - return unlikely(tbl->nest) ? __rht_bucket_nested(tbl, hash) : - &tbl->buckets[hash]; -} - -static inline struct rhash_lock_head __rcu **rht_bucket_insert( - struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) -{ - return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) : - &tbl->buckets[hash]; -} - -/* - * We lock a bucket by setting BIT(0) in the pointer - this is always - * zero in real pointers. The NULLS mark is never stored in the bucket, - * rather we store NULL if the bucket is empty. - * bit_spin_locks do not handle contention well, but the whole point - * of the hashtable design is to achieve minimum per-bucket contention. - * A nested hash table might not have a bucket pointer. In that case - * we cannot get a lock. For remove and replace the bucket cannot be - * interesting and doesn't need locking. - * For insert we allocate the bucket if this is the last bucket_table, - * and then take the lock. - * Sometimes we unlock a bucket by writing a new pointer there. In that - * case we don't need to unlock, but we do need to reset state such as - * local_bh. For that we have rht_assign_unlock(). As rcu_assign_pointer() - * provides the same release semantics that bit_spin_unlock() provides, - * this is safe. - * When we write to a bucket without unlocking, we use rht_assign_locked(). - */ - -static inline unsigned long rht_lock(struct bucket_table *tbl, - struct rhash_lock_head __rcu **bkt) -{ - unsigned long flags; - - local_irq_save(flags); - bit_spin_lock(0, (unsigned long *)bkt); - lock_map_acquire(&tbl->dep_map); - return flags; -} - -static inline unsigned long rht_lock_nested(struct bucket_table *tbl, - struct rhash_lock_head __rcu **bucket, - unsigned int subclass) -{ - unsigned long flags; - - local_irq_save(flags); - bit_spin_lock(0, (unsigned long *)bucket); - lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_); - return flags; -} - -static inline void rht_unlock(struct bucket_table *tbl, - struct rhash_lock_head __rcu **bkt, - unsigned long flags) -{ - lock_map_release(&tbl->dep_map); - bit_spin_unlock(0, (unsigned long *)bkt); - local_irq_restore(flags); -} - -static inline struct rhash_head *__rht_ptr( - struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt) -{ - return (struct rhash_head *) - ((unsigned long)p & ~BIT(0) ?: - (unsigned long)RHT_NULLS_MARKER(bkt)); -} - -/* - * Where 'bkt' is a bucket and might be locked: - * rht_ptr_rcu() dereferences that pointer and clears the lock bit. - * rht_ptr() dereferences in a context where the bucket is locked. - * rht_ptr_exclusive() dereferences in a context where exclusive - * access is guaranteed, such as when destroying the table. 
- */ -static inline struct rhash_head *rht_ptr_rcu( - struct rhash_lock_head __rcu *const *bkt) -{ - return __rht_ptr(rcu_dereference(*bkt), bkt); -} - -static inline struct rhash_head *rht_ptr( - struct rhash_lock_head __rcu *const *bkt, - struct bucket_table *tbl, - unsigned int hash) -{ - return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt); -} - -static inline struct rhash_head *rht_ptr_exclusive( - struct rhash_lock_head __rcu *const *bkt) -{ - return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt); -} - -static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, - struct rhash_head *obj) -{ - if (rht_is_a_nulls(obj)) - obj = NULL; - rcu_assign_pointer(*bkt, (void *)((unsigned long)obj | BIT(0))); -} - -static inline void rht_assign_unlock(struct bucket_table *tbl, - struct rhash_lock_head __rcu **bkt, - struct rhash_head *obj, - unsigned long flags) -{ - if (rht_is_a_nulls(obj)) - obj = NULL; - lock_map_release(&tbl->dep_map); - rcu_assign_pointer(*bkt, (void *)obj); - preempt_enable(); - __release(bitlock); - local_irq_restore(flags); -} - -/** - * rht_for_each_from - iterate over hash chain from given head - * @pos: the &struct rhash_head to use as a loop cursor. - * @head: the &struct rhash_head to start from - * @tbl: the &struct bucket_table - * @hash: the hash value / bucket index - */ -#define rht_for_each_from(pos, head, tbl, hash) \ - for (pos = head; \ - !rht_is_a_nulls(pos); \ - pos = rht_dereference_bucket((pos)->next, tbl, hash)) - -/** - * rht_for_each - iterate over hash chain - * @pos: the &struct rhash_head to use as a loop cursor. - * @tbl: the &struct bucket_table - * @hash: the hash value / bucket index - */ -#define rht_for_each(pos, tbl, hash) \ - rht_for_each_from(pos, rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ - tbl, hash) - -/** - * rht_for_each_entry_from - iterate over hash chain from given head - * @tpos: the type * to use as a loop cursor. - * @pos: the &struct rhash_head to use as a loop cursor. - * @head: the &struct rhash_head to start from - * @tbl: the &struct bucket_table - * @hash: the hash value / bucket index - * @member: name of the &struct rhash_head within the hashable struct. - */ -#define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member) \ - for (pos = head; \ - (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ - pos = rht_dereference_bucket((pos)->next, tbl, hash)) - -/** - * rht_for_each_entry - iterate over hash chain of given type - * @tpos: the type * to use as a loop cursor. - * @pos: the &struct rhash_head to use as a loop cursor. - * @tbl: the &struct bucket_table - * @hash: the hash value / bucket index - * @member: name of the &struct rhash_head within the hashable struct. - */ -#define rht_for_each_entry(tpos, pos, tbl, hash, member) \ - rht_for_each_entry_from(tpos, pos, \ - rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ - tbl, hash, member) - -/** - * rht_for_each_entry_safe - safely iterate over hash chain of given type - * @tpos: the type * to use as a loop cursor. - * @pos: the &struct rhash_head to use as a loop cursor. - * @next: the &struct rhash_head to use as next in loop cursor. - * @tbl: the &struct bucket_table - * @hash: the hash value / bucket index - * @member: name of the &struct rhash_head within the hashable struct. - * - * This hash chain list-traversal primitive allows for the looped code to - * remove the loop cursor from the list. 
- */ -#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ - for (pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ - next = !rht_is_a_nulls(pos) ? \ - rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ - (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ - pos = next, \ - next = !rht_is_a_nulls(pos) ? \ - rht_dereference_bucket(pos->next, tbl, hash) : NULL) - -/** - * rht_for_each_rcu_from - iterate over rcu hash chain from given head - * @pos: the &struct rhash_head to use as a loop cursor. - * @head: the &struct rhash_head to start from - * @tbl: the &struct bucket_table - * @hash: the hash value / bucket index - * - * This hash chain list-traversal primitive may safely run concurrently with - * the _rcu mutation primitives such as rhashtable_insert() as long as the - * traversal is guarded by rcu_read_lock(). - */ -#define rht_for_each_rcu_from(pos, head, tbl, hash) \ - for (({barrier(); }), \ - pos = head; \ - !rht_is_a_nulls(pos); \ - pos = rcu_dereference_raw(pos->next)) - -/** - * rht_for_each_rcu - iterate over rcu hash chain - * @pos: the &struct rhash_head to use as a loop cursor. - * @tbl: the &struct bucket_table - * @hash: the hash value / bucket index - * - * This hash chain list-traversal primitive may safely run concurrently with - * the _rcu mutation primitives such as rhashtable_insert() as long as the - * traversal is guarded by rcu_read_lock(). - */ -#define rht_for_each_rcu(pos, tbl, hash) \ - for (({barrier(); }), \ - pos = rht_ptr_rcu(rht_bucket(tbl, hash)); \ - !rht_is_a_nulls(pos); \ - pos = rcu_dereference_raw(pos->next)) - -/** - * rht_for_each_entry_rcu_from - iterated over rcu hash chain from given head - * @tpos: the type * to use as a loop cursor. - * @pos: the &struct rhash_head to use as a loop cursor. - * @head: the &struct rhash_head to start from - * @tbl: the &struct bucket_table - * @hash: the hash value / bucket index - * @member: name of the &struct rhash_head within the hashable struct. - * - * This hash chain list-traversal primitive may safely run concurrently with - * the _rcu mutation primitives such as rhashtable_insert() as long as the - * traversal is guarded by rcu_read_lock(). - */ -#define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \ - for (({barrier(); }), \ - pos = head; \ - (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ - pos = rht_dereference_bucket_rcu(pos->next, tbl, hash)) - -/** - * rht_for_each_entry_rcu - iterate over rcu hash chain of given type - * @tpos: the type * to use as a loop cursor. - * @pos: the &struct rhash_head to use as a loop cursor. - * @tbl: the &struct bucket_table - * @hash: the hash value / bucket index - * @member: name of the &struct rhash_head within the hashable struct. - * - * This hash chain list-traversal primitive may safely run concurrently with - * the _rcu mutation primitives such as rhashtable_insert() as long as the - * traversal is guarded by rcu_read_lock(). - */ -#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ - rht_for_each_entry_rcu_from(tpos, pos, \ - rht_ptr_rcu(rht_bucket(tbl, hash)), \ - tbl, hash, member) - -/** - * rhl_for_each_rcu - iterate over rcu hash table list - * @pos: the &struct rlist_head to use as a loop cursor. - * @list: the head of the list - * - * This hash chain list-traversal primitive should be used on the - * list returned by rhltable_lookup. 
- */ -#define rhl_for_each_rcu(pos, list) \ - for (pos = list; pos; pos = rcu_dereference_raw(pos->next)) - -/** - * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type - * @tpos: the type * to use as a loop cursor. - * @pos: the &struct rlist_head to use as a loop cursor. - * @list: the head of the list - * @member: name of the &struct rlist_head within the hashable struct. - * - * This hash chain list-traversal primitive should be used on the - * list returned by rhltable_lookup. - */ -#define rhl_for_each_entry_rcu(tpos, pos, list, member) \ - for (pos = list; pos && rht_entry(tpos, pos, member); \ - pos = rcu_dereference_raw(pos->next)) - -static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, - const void *obj) -{ - struct rhashtable *ht = arg->ht; - const char *ptr = obj; - - return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len); -} - -/* Internal function, do not use. */ -static inline struct rhash_head *__rhashtable_lookup( - struct rhashtable *ht, const void *key, - const struct rhashtable_params params) -{ - struct rhashtable_compare_arg arg = { - .ht = ht, - .key = key, - }; - struct rhash_lock_head __rcu *const *bkt; - struct bucket_table *tbl; - struct rhash_head *he; - unsigned int hash; - - tbl = rht_dereference_rcu(ht->tbl, ht); -restart: - hash = rht_key_hashfn(ht, tbl, key, params); - bkt = rht_bucket(tbl, hash); - do { - rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) { - if (params.obj_cmpfn ? - params.obj_cmpfn(&arg, rht_obj(ht, he)) : - rhashtable_compare(&arg, rht_obj(ht, he))) - continue; - return he; - } - /* An object might have been moved to a different hash chain, - * while we walk along it - better check and retry. - */ - } while (he != RHT_NULLS_MARKER(bkt)); - - /* Ensure we see any new tables. */ - smp_rmb(); - - tbl = rht_dereference_rcu(tbl->future_tbl, ht); - if (unlikely(tbl)) - goto restart; - - return NULL; -} - -/** - * rhashtable_lookup - search hash table - * @ht: hash table - * @key: the pointer to the key - * @params: hash table parameters - * - * Computes the hash value for the key and traverses the bucket chain looking - * for an entry with an identical key. The first matching entry is returned. - * - * This must only be called under the RCU read lock. - * - * Returns the first entry on which the compare function returned true. - */ -static inline void *rhashtable_lookup( - struct rhashtable *ht, const void *key, - const struct rhashtable_params params) -{ - struct rhash_head *he = __rhashtable_lookup(ht, key, params); - - return he ? rht_obj(ht, he) : NULL; -} - -/** - * rhashtable_lookup_fast - search hash table, without RCU read lock - * @ht: hash table - * @key: the pointer to the key - * @params: hash table parameters - * - * Computes the hash value for the key and traverses the bucket chain looking - * for an entry with an identical key. The first matching entry is returned. - * - * Only use this function when you have other mechanisms guaranteeing - * that the object won't go away after the RCU read lock is released. - * - * Returns the first entry on which the compare function returned true. 
- */ -static inline void *rhashtable_lookup_fast( - struct rhashtable *ht, const void *key, - const struct rhashtable_params params) -{ - void *obj; - - rcu_read_lock(); - obj = rhashtable_lookup(ht, key, params); - rcu_read_unlock(); - - return obj; -} - -/** - * rhltable_lookup - search hash list table - * @hlt: hash table - * @key: the pointer to the key - * @params: hash table parameters - * - * Computes the hash value for the key and traverses the bucket chain looking - * for an entry with an identical key. All matching entries are returned - * in a list. - * - * This must only be called under the RCU read lock. - * - * Returns the list of entries that match the given key. - */ -static inline struct rhlist_head *rhltable_lookup( - struct rhltable *hlt, const void *key, - const struct rhashtable_params params) -{ - struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params); - - return he ? container_of(he, struct rhlist_head, rhead) : NULL; -} - -/* Internal function, please use rhashtable_insert_fast() instead. This - * function returns the existing element already in hashes if there is a clash, - * otherwise it returns an error via ERR_PTR(). - */ -static inline void *__rhashtable_insert_fast( - struct rhashtable *ht, const void *key, struct rhash_head *obj, - const struct rhashtable_params params, bool rhlist) -{ - struct rhashtable_compare_arg arg = { - .ht = ht, - .key = key, - }; - struct rhash_lock_head __rcu **bkt; - struct rhash_head __rcu **pprev; - struct bucket_table *tbl; - struct rhash_head *head; - unsigned long flags; - unsigned int hash; - int elasticity; - void *data; - - rcu_read_lock(); - - tbl = rht_dereference_rcu(ht->tbl, ht); - hash = rht_head_hashfn(ht, tbl, obj, params); - elasticity = RHT_ELASTICITY; - bkt = rht_bucket_insert(ht, tbl, hash); - data = ERR_PTR(-ENOMEM); - if (!bkt) - goto out; - pprev = NULL; - flags = rht_lock(tbl, bkt); - - if (unlikely(rcu_access_pointer(tbl->future_tbl))) { -slow_path: - rht_unlock(tbl, bkt, flags); - rcu_read_unlock(); - return rhashtable_insert_slow(ht, key, obj); - } - - rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) { - struct rhlist_head *plist; - struct rhlist_head *list; - - elasticity--; - if (!key || - (params.obj_cmpfn ? - params.obj_cmpfn(&arg, rht_obj(ht, head)) : - rhashtable_compare(&arg, rht_obj(ht, head)))) { - pprev = &head->next; - continue; - } - - data = rht_obj(ht, head); - - if (!rhlist) - goto out_unlock; - - - list = container_of(obj, struct rhlist_head, rhead); - plist = container_of(head, struct rhlist_head, rhead); - - RCU_INIT_POINTER(list->next, plist); - head = rht_dereference_bucket(head->next, tbl, hash); - RCU_INIT_POINTER(list->rhead.next, head); - if (pprev) { - rcu_assign_pointer(*pprev, obj); - rht_unlock(tbl, bkt, flags); - } else - rht_assign_unlock(tbl, bkt, obj, flags); - data = NULL; - goto out; - } - - if (elasticity <= 0) - goto slow_path; - - data = ERR_PTR(-E2BIG); - if (unlikely(rht_grow_above_max(ht, tbl))) - goto out_unlock; - - if (unlikely(rht_grow_above_100(ht, tbl))) - goto slow_path; - - /* Inserting at head of list makes unlocking free. 
*/ - head = rht_ptr(bkt, tbl, hash); - - RCU_INIT_POINTER(obj->next, head); - if (rhlist) { - struct rhlist_head *list; - - list = container_of(obj, struct rhlist_head, rhead); - RCU_INIT_POINTER(list->next, NULL); - } - - atomic_inc(&ht->nelems); - rht_assign_unlock(tbl, bkt, obj, flags); - - if (rht_grow_above_75(ht, tbl)) - schedule_work(&ht->run_work); - - data = NULL; -out: - rcu_read_unlock(); - - return data; - -out_unlock: - rht_unlock(tbl, bkt, flags); - goto out; -} - -/** - * rhashtable_insert_fast - insert object into hash table - * @ht: hash table - * @obj: pointer to hash head inside object - * @params: hash table parameters - * - * Will take the per bucket bitlock to protect against mutual mutations - * on the same bucket. Multiple insertions may occur in parallel unless - * they map to the same bucket. - * - * It is safe to call this function from atomic context. - * - * Will trigger an automatic deferred table resizing if residency in the - * table grows beyond 70%. - */ -static inline int rhashtable_insert_fast( - struct rhashtable *ht, struct rhash_head *obj, - const struct rhashtable_params params) -{ - void *ret; - - ret = __rhashtable_insert_fast(ht, NULL, obj, params, false); - if (IS_ERR(ret)) - return PTR_ERR(ret); - - return ret == NULL ? 0 : -EEXIST; -} - -/** - * rhltable_insert_key - insert object into hash list table - * @hlt: hash list table - * @key: the pointer to the key - * @list: pointer to hash list head inside object - * @params: hash table parameters - * - * Will take the per bucket bitlock to protect against mutual mutations - * on the same bucket. Multiple insertions may occur in parallel unless - * they map to the same bucket. - * - * It is safe to call this function from atomic context. - * - * Will trigger an automatic deferred table resizing if residency in the - * table grows beyond 70%. - */ -static inline int rhltable_insert_key( - struct rhltable *hlt, const void *key, struct rhlist_head *list, - const struct rhashtable_params params) -{ - return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead, - params, true)); -} - -/** - * rhltable_insert - insert object into hash list table - * @hlt: hash list table - * @list: pointer to hash list head inside object - * @params: hash table parameters - * - * Will take the per bucket bitlock to protect against mutual mutations - * on the same bucket. Multiple insertions may occur in parallel unless - * they map to the same bucket. - * - * It is safe to call this function from atomic context. - * - * Will trigger an automatic deferred table resizing if residency in the - * table grows beyond 70%. - */ -static inline int rhltable_insert( - struct rhltable *hlt, struct rhlist_head *list, - const struct rhashtable_params params) -{ - const char *key = rht_obj(&hlt->ht, &list->rhead); - - key += params.key_offset; - - return rhltable_insert_key(hlt, key, list, params); -} - -/** - * rhashtable_lookup_insert_fast - lookup and insert object into hash table - * @ht: hash table - * @obj: pointer to hash head inside object - * @params: hash table parameters - * - * This lookup function may only be used for fixed key hash table (key_len - * parameter set). It will BUG() if used inappropriately. - * - * It is safe to call this function from atomic context. - * - * Will trigger an automatic deferred table resizing if residency in the - * table grows beyond 70%. 
- */ -static inline int rhashtable_lookup_insert_fast( - struct rhashtable *ht, struct rhash_head *obj, - const struct rhashtable_params params) -{ - const char *key = rht_obj(ht, obj); - void *ret; - - BUG_ON(ht->p.obj_hashfn); - - ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, - false); - if (IS_ERR(ret)) - return PTR_ERR(ret); - - return ret == NULL ? 0 : -EEXIST; -} - -/** - * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table - * @ht: hash table - * @obj: pointer to hash head inside object - * @params: hash table parameters - * - * Just like rhashtable_lookup_insert_fast(), but this function returns the - * object if it exists, NULL if it did not and the insertion was successful, - * and an ERR_PTR otherwise. - */ -static inline void *rhashtable_lookup_get_insert_fast( - struct rhashtable *ht, struct rhash_head *obj, - const struct rhashtable_params params) -{ - const char *key = rht_obj(ht, obj); - - BUG_ON(ht->p.obj_hashfn); - - return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, - false); -} - -/** - * rhashtable_lookup_insert_key - search and insert object to hash table - * with explicit key - * @ht: hash table - * @key: key - * @obj: pointer to hash head inside object - * @params: hash table parameters - * - * Lookups may occur in parallel with hashtable mutations and resizing. - * - * Will trigger an automatic deferred table resizing if residency in the - * table grows beyond 70%. - * - * Returns zero on success. - */ -static inline int rhashtable_lookup_insert_key( - struct rhashtable *ht, const void *key, struct rhash_head *obj, - const struct rhashtable_params params) -{ - void *ret; - - BUG_ON(!ht->p.obj_hashfn || !key); - - ret = __rhashtable_insert_fast(ht, key, obj, params, false); - if (IS_ERR(ret)) - return PTR_ERR(ret); - - return ret == NULL ? 0 : -EEXIST; -} - -/** - * rhashtable_lookup_get_insert_key - lookup and insert object into hash table - * @ht: hash table - * @key: key - * @obj: pointer to hash head inside object - * @params: hash table parameters - * - * Just like rhashtable_lookup_insert_key(), but this function returns the - * object if it exists, NULL if it does not and the insertion was successful, - * and an ERR_PTR otherwise. 
- */ -static inline void *rhashtable_lookup_get_insert_key( - struct rhashtable *ht, const void *key, struct rhash_head *obj, - const struct rhashtable_params params) -{ - BUG_ON(!ht->p.obj_hashfn || !key); - - return __rhashtable_insert_fast(ht, key, obj, params, false); -} - -/* Internal function, please use rhashtable_remove_fast() instead */ -static inline int __rhashtable_remove_fast_one( - struct rhashtable *ht, struct bucket_table *tbl, - struct rhash_head *obj, const struct rhashtable_params params, - bool rhlist) -{ - struct rhash_lock_head __rcu **bkt; - struct rhash_head __rcu **pprev; - struct rhash_head *he; - unsigned long flags; - unsigned int hash; - int err = -ENOENT; - - hash = rht_head_hashfn(ht, tbl, obj, params); - bkt = rht_bucket_var(tbl, hash); - if (!bkt) - return -ENOENT; - pprev = NULL; - flags = rht_lock(tbl, bkt); - - rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { - struct rhlist_head *list; - - list = container_of(he, struct rhlist_head, rhead); - - if (he != obj) { - struct rhlist_head __rcu **lpprev; - - pprev = &he->next; - - if (!rhlist) - continue; - - do { - lpprev = &list->next; - list = rht_dereference_bucket(list->next, - tbl, hash); - } while (list && obj != &list->rhead); - - if (!list) - continue; - - list = rht_dereference_bucket(list->next, tbl, hash); - RCU_INIT_POINTER(*lpprev, list); - err = 0; - break; - } - - obj = rht_dereference_bucket(obj->next, tbl, hash); - err = 1; - - if (rhlist) { - list = rht_dereference_bucket(list->next, tbl, hash); - if (list) { - RCU_INIT_POINTER(list->rhead.next, obj); - obj = &list->rhead; - err = 0; - } - } - - if (pprev) { - rcu_assign_pointer(*pprev, obj); - rht_unlock(tbl, bkt, flags); - } else { - rht_assign_unlock(tbl, bkt, obj, flags); - } - goto unlocked; - } - - rht_unlock(tbl, bkt, flags); -unlocked: - if (err > 0) { - atomic_dec(&ht->nelems); - if (unlikely(ht->p.automatic_shrinking && - rht_shrink_below_30(ht, tbl))) - schedule_work(&ht->run_work); - err = 0; - } - - return err; -} - -/* Internal function, please use rhashtable_remove_fast() instead */ -static inline int __rhashtable_remove_fast( - struct rhashtable *ht, struct rhash_head *obj, - const struct rhashtable_params params, bool rhlist) -{ - struct bucket_table *tbl; - int err; - - rcu_read_lock(); - - tbl = rht_dereference_rcu(ht->tbl, ht); - - /* Because we have already taken (and released) the bucket - * lock in old_tbl, if we find that future_tbl is not yet - * visible then that guarantees the entry to still be in - * the old tbl if it exists. - */ - while ((err = __rhashtable_remove_fast_one(ht, tbl, obj, params, - rhlist)) && - (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) - ; - - rcu_read_unlock(); - - return err; -} - -/** - * rhashtable_remove_fast - remove object from hash table - * @ht: hash table - * @obj: pointer to hash head inside object - * @params: hash table parameters - * - * Since the hash chain is single linked, the removal operation needs to - * walk the bucket chain upon removal. The removal operation is thus - * considerable slow if the hash table is not correctly sized. - * - * Will automatically shrink the table if permitted when residency drops - * below 30%. - * - * Returns zero on success, -ENOENT if the entry could not be found. 
- */ -static inline int rhashtable_remove_fast( - struct rhashtable *ht, struct rhash_head *obj, - const struct rhashtable_params params) -{ - return __rhashtable_remove_fast(ht, obj, params, false); -} - -/** - * rhltable_remove - remove object from hash list table - * @hlt: hash list table - * @list: pointer to hash list head inside object - * @params: hash table parameters - * - * Since the hash chain is single linked, the removal operation needs to - * walk the bucket chain upon removal. The removal operation is thus - * considerably slower if the hash table is not correctly sized. - * - * Will automatically shrink the table if permitted when residency drops - * below 30% - * - * Returns zero on success, -ENOENT if the entry could not be found. - */ -static inline int rhltable_remove( - struct rhltable *hlt, struct rhlist_head *list, - const struct rhashtable_params params) -{ - return __rhashtable_remove_fast(&hlt->ht, &list->rhead, params, true); -} - -/* Internal function, please use rhashtable_replace_fast() instead */ -static inline int __rhashtable_replace_fast( - struct rhashtable *ht, struct bucket_table *tbl, - struct rhash_head *obj_old, struct rhash_head *obj_new, - const struct rhashtable_params params) -{ - struct rhash_lock_head __rcu **bkt; - struct rhash_head __rcu **pprev; - struct rhash_head *he; - unsigned long flags; - unsigned int hash; - int err = -ENOENT; - - /* Minimally, the old and new objects must have same hash - * (which should mean identifiers are the same). - */ - hash = rht_head_hashfn(ht, tbl, obj_old, params); - if (hash != rht_head_hashfn(ht, tbl, obj_new, params)) - return -EINVAL; - - bkt = rht_bucket_var(tbl, hash); - if (!bkt) - return -ENOENT; - - pprev = NULL; - flags = rht_lock(tbl, bkt); - - rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { - if (he != obj_old) { - pprev = &he->next; - continue; - } - - rcu_assign_pointer(obj_new->next, obj_old->next); - if (pprev) { - rcu_assign_pointer(*pprev, obj_new); - rht_unlock(tbl, bkt, flags); - } else { - rht_assign_unlock(tbl, bkt, obj_new, flags); - } - err = 0; - goto unlocked; - } - - rht_unlock(tbl, bkt, flags); - -unlocked: - return err; -} - -/** - * rhashtable_replace_fast - replace an object in hash table - * @ht: hash table - * @obj_old: pointer to hash head inside object being replaced - * @obj_new: pointer to hash head inside object which is new - * @params: hash table parameters - * - * Replacing an object doesn't affect the number of elements in the hash table - * or bucket, so we don't need to worry about shrinking or expanding the - * table here. - * - * Returns zero on success, -ENOENT if the entry could not be found, - * -EINVAL if hash is not the same for the old and new objects. - */ -static inline int rhashtable_replace_fast( - struct rhashtable *ht, struct rhash_head *obj_old, - struct rhash_head *obj_new, - const struct rhashtable_params params) -{ - struct bucket_table *tbl; - int err; - - rcu_read_lock(); - - tbl = rht_dereference_rcu(ht->tbl, ht); - - /* Because we have already taken (and released) the bucket - * lock in old_tbl, if we find that future_tbl is not yet - * visible then that guarantees the entry to still be in - * the old tbl if it exists. 
- */ - while ((err = __rhashtable_replace_fast(ht, tbl, obj_old, - obj_new, params)) && - (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) - ; - - rcu_read_unlock(); - - return err; -} - -/** - * rhltable_walk_enter - Initialise an iterator - * @hlt: Table to walk over - * @iter: Hash table Iterator - * - * This function prepares a hash table walk. - * - * Note that if you restart a walk after rhashtable_walk_stop you - * may see the same object twice. Also, you may miss objects if - * there are removals in between rhashtable_walk_stop and the next - * call to rhashtable_walk_start. - * - * For a completely stable walk you should construct your own data - * structure outside the hash table. - * - * This function may be called from any process context, including - * non-preemptable context, but cannot be called from softirq or - * hardirq context. - * - * You must call rhashtable_walk_exit after this function returns. - */ -static inline void rhltable_walk_enter(struct rhltable *hlt, - struct rhashtable_iter *iter) -{ - rhashtable_walk_enter(&hlt->ht, iter); -} - -/** - * rhltable_free_and_destroy - free elements and destroy hash list table - * @hlt: the hash list table to destroy - * @free_fn: callback to release resources of element - * @arg: pointer passed to free_fn - * - * See documentation for rhashtable_free_and_destroy. - */ -static inline void rhltable_free_and_destroy(struct rhltable *hlt, - void (*free_fn)(void *ptr, - void *arg), - void *arg) -{ - rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); -} - -static inline void rhltable_destroy(struct rhltable *hlt) -{ - rhltable_free_and_destroy(hlt, NULL, NULL); -} - -#endif /* _LINUX_RHASHTABLE_H */ diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index f2e0ae6a..90ccff74 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -1042,8 +1042,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__cutoffs_for_unknown_client_rpc) homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), &self->homa); - peer = homa_peer_find(self->homa.peers, self->server_ip, - &self->hsk.inet); + peer = homa_peer_find(&self->homa, self->server_ip, &self->hsk.inet); ASSERT_FALSE(IS_ERR(peer)); EXPECT_EQ(400, peer->cutoff_version); EXPECT_EQ(9, peer->unsched_cutoffs[1]); @@ -1818,7 +1817,7 @@ TEST_F(homa_incoming, homa_cutoffs__cant_find_peer) mock_kmalloc_errors = 1; homa_cutoffs_pkt(skb, &self->hsk); EXPECT_EQ(1, homa_metrics_per_cpu()->peer_kmalloc_errors); - peer = homa_peer_find(self->homa.peers, self->server_ip, + peer = homa_peer_find(&self->homa, self->server_ip, &self->hsk.inet); ASSERT_FALSE(IS_ERR(peer)); EXPECT_EQ(0, peer->cutoff_version); @@ -1895,8 +1894,8 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_not_incoming) } TEST_F(homa_incoming, homa_need_ack_pkt__rpc_doesnt_exist) { - struct homa_peer *peer = homa_peer_find(self->homa.peers, - self->server_ip, &self->hsk.inet); + struct homa_peer *peer = homa_peer_find(&self->homa, self->server_ip, + &self->hsk.inet); struct homa_need_ack_hdr h = {.common = { .sport = htons(self->server_port), .dport = htons(self->hsk.port), diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 51131304..92fa180f 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -92,8 +92,9 @@ FIXTURE_SETUP(homa_outgoing) self->server_addr.in6.sin6_family = AF_INET; self->server_addr.in6.sin6_addr = self->server_ip[0]; self->server_addr.in6.sin6_port = htons(self->server_port); - self->peer = homa_peer_find(self->homa.peers, - 
&self->server_addr.in6.sin6_addr, &self->hsk.inet); + self->peer = homa_peer_find(&self->homa, + &self->server_addr.in6.sin6_addr, + &self->hsk.inet); unit_log_clear(); } FIXTURE_TEARDOWN(homa_outgoing) diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index 267cce29..0c7cd789 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -13,33 +13,9 @@ struct in6_addr ip1111[1]; struct in6_addr ip2222[1]; struct in6_addr ip3333[1]; -#ifndef __STRIP__ /* See strip.py */ -static int hook_new_peer_count; -static struct homa_peertab *hook_peertab; -static struct homa_sock *hook_hsk; - -static void kmalloc_hook(char *id) -{ - int i; - - if (strcmp(id, "kmalloc") != 0) - return; - for (i = 0; i < hook_new_peer_count; i++) { - char addr_string[20]; - struct in6_addr addr; - - snprintf(addr_string, sizeof(addr_string), "10.0.0.%d", i); - addr = unit_get_in_addr(addr_string); - homa_peer_put(homa_peer_find(hook_peertab, &addr, - &hook_hsk->inet)); - } -} -#endif /* See strip.py */ - FIXTURE(homa_peer) { struct homa homa; struct homa_sock hsk; - struct homa_peertab peertab; struct in6_addr client_ip[1]; struct in6_addr server_ip[1]; int server_port; @@ -49,7 +25,6 @@ FIXTURE_SETUP(homa_peer) homa_init(&self->homa, &mock_net); mock_set_homa(&self->homa); mock_sock_init(&self->hsk, &self->homa, 0); - homa_peertab_init(&self->peertab); self->client_ip[0] = unit_get_in_addr("196.168.0.1"); self->server_ip[0] = unit_get_in_addr("1.2.3.4"); ip1111[0] = unit_get_in_addr("1::1:1:1"); @@ -59,7 +34,6 @@ FIXTURE_SETUP(homa_peer) } FIXTURE_TEARDOWN(homa_peer) { - homa_peertab_destroy(&self->peertab); homa_destroy(&self->homa); unit_teardown(); } @@ -73,203 +47,208 @@ static void peer_spinlock_hook(char *id) } #endif /* See strip.py */ -TEST_F(homa_peer, homa_peer_find__basics) -{ - struct homa_peer *peer, *peer2; - - peer = homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); - ASSERT_NE(NULL, peer); - EXPECT_EQ_IP(*ip1111, peer->addr); -#ifndef __STRIP__ /* See strip.py */ - EXPECT_EQ(INT_MAX, peer->unsched_cutoffs[HOMA_MAX_PRIORITIES-2]); - EXPECT_EQ(0, peer->cutoff_version); -#endif /* See strip.py */ - - peer2 = homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); - EXPECT_EQ(peer, peer2); - EXPECT_EQ(2, atomic_read(&peer->refs)); - - peer2 = homa_peer_find(&self->peertab, ip2222, &self->hsk.inet); - EXPECT_NE(peer, peer2); - EXPECT_EQ(1, atomic_read(&peer2->refs)); - -#ifndef __STRIP__ /* See strip.py */ - EXPECT_EQ(2, homa_metrics_per_cpu()->peer_new_entries); -#endif /* See strip.py */ - homa_peer_put(peer); - homa_peer_put(peer); - homa_peer_put(peer2); -} - static struct _test_data_homa_peer *test_data; static struct homa_peer *conflicting_peer; -static int peer_lock_hook_invocations; -static void peer_lock_hook(char *id) +static int peer_race_hook_invocations; +static void peer_race_hook(char *id) { - if (strcmp(id, "spin_lock") != 0) + if (strcmp(id, "kmalloc") != 0) return; - if (peer_lock_hook_invocations > 0) + if (peer_race_hook_invocations > 0) return; - peer_lock_hook_invocations++; - /* Creates a peer with the same address as the one being created - * by the main test function below. + peer_race_hook_invocations++; + + /* Create a peer with the same address as the one being created + * by the current test. 
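+	 * Because the hook fires while the test's own call to
+	 * homa_peer_find is still allocating its peer, the subsequent
+	 * insert detects the conflict and returns this entry instead.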
*/ - conflicting_peer = homa_peer_find(&test_data->peertab, ip3333, - &test_data->hsk.inet); + conflicting_peer = homa_peer_find(&test_data->homa, ip3333, + &test_data->hsk.inet); homa_peer_put(conflicting_peer); } -TEST_F(homa_peer, homa_peertab_init__vmalloc_failed) +TEST_F(homa_peer, homa_peertab_init__success) { struct homa_peertab table; - mock_vmalloc_errors = 1; - EXPECT_EQ(ENOMEM, -homa_peertab_init(&table)); + EXPECT_EQ(0, -homa_peertab_init(&table)); + EXPECT_EQ(1, table.live); - /* Make sure destroy is safe after failed init. */ homa_peertab_destroy(&table); } -#ifndef __STRIP__ /* See strip.py */ -TEST_F(homa_peer, homa_peertab_get_peers__not_init) +TEST_F(homa_peer, homa_peertab_free_fn__ref_count_zero) { - struct homa_peertab peertab; - int num_peers = 45; + struct homa_peer *peer; + struct dst_entry *dst; + + peer = homa_peer_alloc(&self->homa, ip3333, &self->hsk.inet); + dst = peer->dst; + dst_hold(dst); + EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); + homa_peer_put(peer); - memset(&peertab, 0, sizeof(peertab)); - EXPECT_EQ(NULL, homa_peertab_get_peers(&peertab, &num_peers)); - EXPECT_EQ(0, num_peers); + homa_peertab_free_fn(peer, NULL); + EXPECT_EQ(1, atomic_read(&dst->__rcuref.refcnt)); + dst_release(dst); } -TEST_F(homa_peer, homa_peertab_get_peers__table_empty) +TEST_F(homa_peer, homa_peertab_free_fn__bad_reference_count) { - int num_peers = 45; + struct homa_peer *peer; + struct dst_entry *dst; - EXPECT_EQ(NULL, homa_peertab_get_peers(&self->peertab, &num_peers)); - EXPECT_EQ(0, num_peers); + peer = homa_peer_alloc(&self->homa, ip3333, &self->hsk.inet); + dst = peer->dst; + dst_hold(dst); + EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); + + mock_peertab_free_fn_no_complain = 1; + homa_peertab_free_fn(peer, NULL); + EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); + dst_release(dst); + homa_peer_put(peer); + homa_peer_free(peer); } -TEST_F(homa_peer, homa_peertab_get_peers__kmalloc_fails) -{ - int num_peers = 45; - mock_kmalloc_errors = 1; - homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); - EXPECT_EQ(NULL, homa_peertab_get_peers(&self->peertab, &num_peers)); - EXPECT_EQ(0, num_peers); +TEST_F(homa_peer, homa_peertab__destroy) { + /* First call: peertab live. */ + EXPECT_EQ(1, self->homa.peers->live); + homa_peertab_destroy(self->homa.peers); + EXPECT_EQ(0, self->homa.peers->live); + + /* Second call: peertab no longer live. 
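+	 * This verifies that homa_peertab_destroy can safely be
+	 * invoked more than once.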
*/ + homa_peertab_destroy(self->homa.peers); + EXPECT_EQ(0, self->homa.peers->live); } -TEST_F(homa_peer, homa_peertab_get_peers__one_peer) + +TEST_F(homa_peer, homa_peer_alloc__success) { - struct homa_peer **peers; struct homa_peer *peer; - int num_peers = 45; - - peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); - peers = homa_peertab_get_peers(&self->peertab, &num_peers); - ASSERT_NE(NULL, peers); - EXPECT_EQ(1, num_peers); - EXPECT_EQ(peer, peers[0]); - kfree(peers); - homa_peer_put(peer); -} -TEST_F(homa_peer, homa_peertab_get_peers__multiple_peers) -{ - struct homa_peer *peer1, *peer2, *peer3; - struct homa_peer **peers; - int num_peers = 45; - - peer1 = homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); - peer2 = homa_peer_find(&self->peertab, ip2222, &self->hsk.inet); - peer3 = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); - peers = homa_peertab_get_peers(&self->peertab, &num_peers); - ASSERT_NE(NULL, peers); - EXPECT_EQ(3, num_peers); - EXPECT_TRUE((peers[0] == peer1) || (peers[1] == peer1) - || (peers[2] == peer1)); - EXPECT_TRUE((peers[0] == peer2) || (peers[1] == peer2) - || (peers[2] == peer2)); - EXPECT_TRUE((peers[0] == peer3) || (peers[1] == peer3) - || (peers[2] == peer3)); - kfree(peers); - homa_peer_put(peer1); - homa_peer_put(peer2); - homa_peer_put(peer3); + + peer = homa_peer_alloc(&self->homa, ip1111, &self->hsk.inet); + ASSERT_FALSE(IS_ERR(peer)); + EXPECT_EQ_IP(*ip1111, peer->addr); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(INT_MAX, peer->unsched_cutoffs[HOMA_MAX_PRIORITIES-2]); + EXPECT_EQ(0, peer->cutoff_version); + EXPECT_EQ(1, homa_metrics_per_cpu()->peer_new_entries); +#endif /* See strip.py */ + EXPECT_EQ(1, atomic_read(&peer->dst->__rcuref.refcnt)); + homa_peer_free(peer); } -TEST_F(homa_peer, homa_peertab_get_peers__a_few_new_peers_created_concurrently) +TEST_F(homa_peer, homa_peer_alloc__kmalloc_error) { - struct homa_peer **peers; - int num_peers = 45; - - homa_peer_put(homa_peer_find(&self->peertab, ip1111, &self->hsk.inet)); - homa_peer_put(homa_peer_find(&self->peertab, ip2222, &self->hsk.inet)); - unit_hook_register(kmalloc_hook); - hook_hsk = &self->hsk; - hook_peertab = &self->peertab; - hook_new_peer_count = 3; - peers = homa_peertab_get_peers(&self->peertab, &num_peers); - ASSERT_NE(NULL, peers); - EXPECT_EQ(5, num_peers); - kfree(peers); + struct homa_peer *peer; + + mock_kmalloc_errors = 1; + peer = homa_peer_alloc(&self->homa, ip3333, &self->hsk.inet); + EXPECT_EQ(ENOMEM, -PTR_ERR(peer)); + +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(1, homa_metrics_per_cpu()->peer_kmalloc_errors); +#endif /* See strip.py */ } -TEST_F(homa_peer, homa_peertab_get_peers__many_new_peers_created_concurrently) +TEST_F(homa_peer, homa_peer_alloc__route_error) { - struct homa_peer **peers; - int num_peers = 45; - - homa_peer_put(homa_peer_find(&self->peertab, ip1111, &self->hsk.inet)); - homa_peer_put(homa_peer_find(&self->peertab, ip2222, &self->hsk.inet)); - unit_hook_register(kmalloc_hook); - hook_hsk = &self->hsk; - hook_peertab = &self->peertab; - hook_new_peer_count = 20; - peers = homa_peertab_get_peers(&self->peertab, &num_peers); - ASSERT_NE(NULL, peers); - EXPECT_EQ(12, num_peers); - kfree(peers); -} + struct homa_peer *peer; + + mock_route_errors = 1; + peer = homa_peer_alloc(&self->homa, ip3333, &self->hsk.inet); + EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(peer)); + +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(1, homa_metrics_per_cpu()->peer_route_errors); #endif /* See strip.py */ +} -TEST_F(homa_peer, 
homa_peer_find__conflicting_creates) +TEST_F(homa_peer, homa_peer_free) { struct homa_peer *peer; + struct dst_entry *dst; + + peer = homa_peer_alloc(&self->homa, ip1111, &self->hsk.inet); + ASSERT_FALSE(IS_ERR(peer)); + dst = peer->dst; + dst_hold(dst); + ASSERT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); - test_data = self; - peer_lock_hook_invocations = 0; - unit_hook_register(peer_lock_hook); - peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); - EXPECT_NE(NULL, conflicting_peer); - EXPECT_EQ(conflicting_peer, peer); homa_peer_put(peer); + homa_peer_free(peer); + ASSERT_EQ(1, atomic_read(&dst->__rcuref.refcnt)); + dst_release(dst); } -TEST_F(homa_peer, homa_peer_find__kmalloc_error) + +TEST_F(homa_peer, homa_peer_find__basics) { - struct homa_peer *peer; + struct homa_peer *peer, *peer2; - mock_kmalloc_errors = 1; - peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); - EXPECT_EQ(ENOMEM, -PTR_ERR(peer)); + /* First call: create new peer. */ + peer = homa_peer_find(&self->homa, ip1111, &self->hsk.inet); + ASSERT_FALSE(IS_ERR(peer)); + EXPECT_EQ_IP(*ip1111, peer->addr); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_EQ(INT_MAX, peer->unsched_cutoffs[HOMA_MAX_PRIORITIES-2]); + EXPECT_EQ(0, peer->cutoff_version); +#endif /* See strip.py */ + + /* Second call: lookup existing peer. */ + peer2 = homa_peer_find(&self->homa, ip1111, &self->hsk.inet); + EXPECT_EQ(peer, peer2); + EXPECT_EQ(2, atomic_read(&peer->refs)); + + /* Third call: lookup new peer. */ + peer2 = homa_peer_find(&self->homa, ip2222, &self->hsk.inet); + EXPECT_NE(peer, peer2); + ASSERT_FALSE(IS_ERR(peer2)); + EXPECT_EQ(1, atomic_read(&peer2->refs)); #ifndef __STRIP__ /* See strip.py */ - EXPECT_EQ(1, homa_metrics_per_cpu()->peer_kmalloc_errors); + EXPECT_EQ(2, homa_metrics_per_cpu()->peer_new_entries); #endif /* See strip.py */ + homa_peer_put(peer); + homa_peer_put(peer); + homa_peer_put(peer2); } -TEST_F(homa_peer, homa_peer_find__route_error) +TEST_F(homa_peer, homa_peer_find__error_in_homa_peer_alloc) { struct homa_peer *peer; mock_route_errors = 1; - peer = homa_peer_find(&self->peertab, ip3333, &self->hsk.inet); + peer = homa_peer_find(&self->homa, ip3333, &self->hsk.inet); EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(peer)); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->peer_route_errors); #endif /* See strip.py */ } +TEST_F(homa_peer, homa_peer_find__conflicting_creates) +{ + struct homa_peer *peer; + + test_data = self; + peer_race_hook_invocations = 0; + unit_hook_register(peer_race_hook); + peer = homa_peer_find(&self->homa, ip3333, &self->hsk.inet); + EXPECT_FALSE(IS_ERR(conflicting_peer)); + EXPECT_EQ(conflicting_peer, peer); + EXPECT_EQ(1, atomic_read(&peer->refs)); + homa_peer_put(peer); +} +TEST_F(homa_peer, homa_peer_find__insert_error) +{ + struct homa_peer *peer; + + mock_rht_insert_errors = 1; + peer = homa_peer_find(&self->homa, ip3333, &self->hsk.inet); + EXPECT_TRUE(IS_ERR(peer)); + EXPECT_EQ(EINVAL, -PTR_ERR(peer)); +} TEST_F(homa_peer, homa_dst_refresh__basics) { struct dst_entry *old_dst; struct homa_peer *peer; - peer = homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); + peer = homa_peer_find(&self->homa, ip1111, &self->hsk.inet); ASSERT_NE(NULL, peer); EXPECT_EQ_IP(*ip1111, peer->addr); @@ -283,7 +262,7 @@ TEST_F(homa_peer, homa_dst_refresh__routing_error) struct dst_entry *old_dst; struct homa_peer *peer; - peer = homa_peer_find(&self->peertab, ip1111, &self->hsk.inet); + peer = homa_peer_find(&self->homa, ip1111, &self->hsk.inet); ASSERT_NE(NULL, 
peer); EXPECT_EQ_IP(*ip1111, peer->addr); @@ -319,8 +298,9 @@ TEST_F(homa_peer, homa_peer_get_dst_ipv4) homa_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, &self->homa, 0); - struct homa_peer *peer = homa_peer_find(&self->peertab, - &self->client_ip[0], &self->hsk.inet); + struct homa_peer *peer = homa_peer_find(&self->homa, + &self->client_ip[0], + &self->hsk.inet); ASSERT_NE(NULL, peer); dst = homa_peer_get_dst(peer, &self->hsk.inet); @@ -341,8 +321,8 @@ TEST_F(homa_peer, homa_peer_get_dst_ipv6) homa_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, &self->homa, 0); - struct homa_peer *peer = homa_peer_find(&self->peertab, &ip1111[0], - &self->hsk.inet); + struct homa_peer *peer = homa_peer_find(&self->homa, &ip1111[0], + &self->hsk.inet); ASSERT_NE(NULL, peer); dst = homa_peer_get_dst(peer, &self->hsk.inet); @@ -359,8 +339,8 @@ TEST_F(homa_peer, homa_peer_get_dst_ipv6) #ifndef __STRIP__ /* See strip.py */ TEST_F(homa_peer, homa_peer_lock_slow) { - struct homa_peer *peer = homa_peer_find(&self->peertab, ip3333, - &self->hsk.inet); + struct homa_peer *peer = homa_peer_find(&self->homa, ip3333, + &self->hsk.inet); ASSERT_NE(NULL, peer); mock_clock = 10000; @@ -429,8 +409,8 @@ TEST_F(homa_peer, homa_peer_add_ack) TEST_F(homa_peer, homa_peer_get_acks) { - struct homa_peer *peer = homa_peer_find(&self->peertab, ip3333, - &self->hsk.inet); + struct homa_peer *peer = homa_peer_find(&self->homa, ip3333, + &self->hsk.inet); struct homa_ack acks[2]; ASSERT_NE(NULL, peer); diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index 99b9ef94..a9809cce 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -106,7 +106,7 @@ TEST_F(homa_utils, homa_init__homa_skb_init_failure) struct homa homa2; memset(&homa2, 0, sizeof(homa2)); - mock_kmalloc_errors = 0x10; + mock_kmalloc_errors = 0x20; EXPECT_EQ(ENOMEM, -homa_init(&homa2, &mock_net)); EXPECT_SUBSTR("Couldn't initialize skb management (errno 12)", mock_printk_output); From 00ee1d2cc6dba6bceb72e211034ad485159da19c Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 13 May 2025 11:25:43 -0700 Subject: [PATCH 313/625] Improve ttsum.py * When -c is specified, compute relative events separately for each core. * When replacing numbers, only replace numbers that are whole words. --- util/ttsum.py | 74 ++++++++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/util/ttsum.py b/util/ttsum.py index 87d05180..10a0a9ed 100755 --- a/util/ttsum.py +++ b/util/ttsum.py @@ -53,9 +53,17 @@ # Core number -> time of most recent event on that core. -1 means no # events seen for that core yet. - corePrev = defaultdict(lambda : None) +# Core number -> most recent time the "starting event" occurred on +# that core. +startTimes = defaultdict(lambda : None) + +# Core number -> dictionary mapping from event string to the number +# of times that event has occurred on the given core since the starting +# event. +eventCounts = defaultdict(lambda: defaultdict(lambda: 0)) + def scan(f, startingEvent): """ Scan the log file given by 'f' (handle for an open file) and collect @@ -65,29 +73,26 @@ def scan(f, startingEvent): other events, relative to the most recent occurrence of the starting event. 
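+    When the --cores option is specified, relative times are computed
+    separately for each core.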
""" - foundStart = False - startTime = 0.0 lastTime = None for line in f: - match = re.match('(^|.* )([0-9.]+) us \(\+ *([0-9.]+) us\) ' - '(\[C([0-9]+)\].+)', line) + match = re.match(r'(^|.* )([0-9.]+) us \(\+ *([0-9.]+) us\) ' + r'\[C([0-9]+)\] (.+)', line) if not match: continue - thisEventTime = float(match.group(2))*1000.0 - core = int(match.group(5)) - if options.useCores: - prevTime = corePrev[core] - else: - prevTime = lastTime + thisEventTime = float(match.group(2)) + core = int(match.group(4)) + thisEvent = match.group(5) + if not options.useCores: + core = 0 + prevTime = corePrev[core] if prevTime == None: thisEventInterval = 0 else: thisEventInterval = thisEventTime - prevTime - thisEvent = match.group(4) rawEvent = thisEvent if options.noNumbers: - thisEvent = re.sub('0x[0-9a-f]+', '?', thisEvent) - thisEvent = re.sub('[0-9]+', '?', thisEvent) + thisEvent = re.sub(r'\b0x[0-9a-f]+\b', '?', thisEvent) + thisEvent = re.sub(r'\b[0-9.]+\b', '?', thisEvent) if (lastTime != None) and (thisEventTime < lastTime): print('Time went backwards at the following line:\n%s' % (line)) lastTime = thisEventTime @@ -101,11 +106,11 @@ def scan(f, startingEvent): if startingEvent in rawEvent: # Reset variables to indicate that we are starting a new # sequence of events from the starting event. - startTime = thisEventTime - foundStart = True - eventCount = {} + startTimes[core] = thisEventTime + eventCounts[core] = defaultdict(lambda: 0) - if not foundStart: + startTime = startTimes[core] + if startTime == None: continue # If we get here, it means that we have found an event that @@ -113,13 +118,13 @@ def scan(f, startingEvent): # the starting event. First, see how many times this event has # occurred since the last occurrence of the starting event. relativeTime = thisEventTime - startTime - # print('%.1f %.1f %s' % (relativeTime, thisEventInterval, thisEvent)) - if thisEvent in eventCount: - count = eventCount[thisEvent] + 1 - else: - count = 1 - eventCount[thisEvent] = count - # print("Count for '%s': %d" % (thisEvent, count)) + # print('%9.3f: %.1f %.1f %s' % (thisEventTime, relativeTime, + # thisEventInterval, thisEvent)) + count = eventCounts[core][thisEvent] + 1 + eventCounts[core][thisEvent] = count + + # print("%9.3f: count for '%s': %d" % (thisEventTime, thisEvent, + # count)) if not thisEvent in relativeEvents: relativeEvents[thisEvent] = [] occurrences = relativeEvents[thisEvent] @@ -141,9 +146,10 @@ def scan(f, startingEvent): 'max, etc. 
for cumulative time, not delta)') parser.add_option('-c', '--cores', action='store_true', default=False, dest='useCores', - help='compute elapsed time for each event relative to the previous ' - 'event on the same core (default: compute relative to the previous ' - 'event on any core)') + help='treat events on each core independently: compute elapsed time ' + 'for each event relative to the previous event on the same core, and ' + 'if -f is specified, compute relative times separately on each core ' + '(default: consider all events on all cores as a single stream)') parser.add_option('-f', '--from', type='string', dest='startEvent', help='measure times for other events relative to FROM; FROM contains a ' 'substring of an event') @@ -227,14 +233,14 @@ def scan(f, startingEvent): medianInterval = intervals[len(intervals)//2] if options.altFormat: message = '%-*s %6.0f %6.0f %6.0f %6.0f %6.0f %7d' % ( - nameLength, eventName, medianTime, times[0], times[-1], - sum(times)/len(times), intervals[len(intervals)//2], - len(times)) + nameLength, eventName, medianTime*1e03, times[0]*1e03, + times[-1]*1e03, sum(times)*1e03/len(times), + intervals[len(intervals)//2]*1e03, len(times)) else: message = '%-*s %6.0f %6.0f %6.0f %6.0f %6.0f %7d' % ( - nameLength, eventName, medianTime, medianInterval, - intervals[0], intervals[-1], sum(intervals)/len(intervals), - len(intervals)) + nameLength, eventName, medianTime*1e03, medianInterval*1e03, + intervals[0]*1e03, intervals[-1]*1e03, + sum(intervals)/len(intervals)*1e03, len(intervals)) outputInfo.append([medianTime, message]) outputInfo.sort(key=lambda item: item[0]) From f9f9af9ffcd9395b48fce85e246a022ac041accd Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 13 May 2025 11:33:53 -0700 Subject: [PATCH 314/625] Make ttgrep.py support arbitrary Python regular expressions --- homa_peer_old.c | 442 ++++++++++++++++++++++++++++++++++++++++++++++++ util/ttgrep.py | 19 ++- 2 files changed, 452 insertions(+), 9 deletions(-) create mode 100644 homa_peer_old.c diff --git a/homa_peer_old.c b/homa_peer_old.c new file mode 100644 index 00000000..8a24e577 --- /dev/null +++ b/homa_peer_old.c @@ -0,0 +1,442 @@ +// SPDX-License-Identifier: BSD-2-Clause + +/* This file provides functions related to homa_peer and homa_peertab + * objects. + */ + +#include "homa_impl.h" +#include "homa_peer.h" +#include "homa_rpc.h" + +/** + * homa_peertab_init() - Constructor for homa_peertabs. + * @peertab: The object to initialize; previous contents are discarded. + * + * Return: 0 in the normal case, or a negative errno if there was a problem. + */ +int homa_peertab_init(struct homa_peertab *peertab) +{ + /* Note: when we return, the object must be initialized so it's + * safe to call homa_peertab_destroy, even if this function returns + * an error. + */ + int i; + + spin_lock_init(&peertab->write_lock); + peertab->buckets = vmalloc(HOMA_PEERTAB_BUCKETS * + sizeof(*peertab->buckets)); + if (!peertab->buckets) + return -ENOMEM; + for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) + INIT_HLIST_HEAD(&peertab->buckets[i]); + return 0; +} + +/** + * homa_peertab_destroy() - Destructor for homa_peertabs. After this + * function returns, it is unsafe to use any results from previous calls + * to homa_peer_find, since all existing homa_peer objects will have been + * destroyed. + * @peertab: The table to destroy. 
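+ *
+ * A minimal usage sketch (illustrative only, not taken verbatim from
+ * any caller):
+ *
+ *	struct homa_peertab peertab;
+ *
+ *	if (homa_peertab_init(&peertab) != 0) {
+ *		homa_peertab_destroy(&peertab);	// Safe after failed init.
+ *		return -ENOMEM;
+ *	}
+ *	...
+ *	homa_peertab_destroy(&peertab);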
+ */ +void homa_peertab_destroy(struct homa_peertab *peertab) +{ + struct hlist_node *next; + struct homa_peer *peer; + int i; + + if (!peertab->buckets) + return; + + spin_lock_bh(&peertab->write_lock); + for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) { + hlist_for_each_entry_safe(peer, next, &peertab->buckets[i], + peertab_links) { + if (atomic_read(&peer->refs) != 0) +#ifdef __UNIT_TEST__ + FAIL(" %s found peer %s with reference count %d", + __func__, + homa_print_ipv6_addr(&peer->addr), + atomic_read(&peer->refs)); + +#else /* __UNIT_TEST__ */ + pr_err("%s found peer with reference count %d", + __func__, atomic_read(&peer->refs)); +#endif + dst_release(peer->dst); + kfree(peer); + } + } + vfree(peertab->buckets); + spin_unlock_bh(&peertab->write_lock); +} + +#ifndef __UPSTREAM__ /* See strip.py */ +/** + * homa_peertab_get_peers() - Return information about all of the peers + * currently known + * @peertab: The table to search for peers. + * @num_peers: Modified to hold the number of peers returned. + * Return: kmalloced array holding pointers to all known peers. The + * caller must free this. If there is an error, or if there + * are no peers, NULL is returned. Note: if a large number + * of new peers are created while this function executes, + * then the results may not be complete. + */ +struct homa_peer **homa_peertab_get_peers(struct homa_peertab *peertab, + int *num_peers) +{ + int i, slots, next_slot; + struct homa_peer **result; + struct homa_peer *peer; + + /* Note: RCU must be used in the iterators below to ensure safety + * with concurrent insertions. Technically, rcu_read_lock and + * rcu_read_unlock shouldn't be necessary because we don't have to + * worry about concurrent deletions. But without them, some sanity + * checkers will complain. + */ + rcu_read_lock(); + + /* Figure out how large an array to allocate. */ + slots = 0; + next_slot = 0; + result = NULL; + if (peertab->buckets) { + for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) { + hlist_for_each_entry_rcu(peer, &peertab->buckets[i], + peertab_links) + slots++; + } + } + if (slots == 0) + goto done; + + /* Allocate extra space in case new peers got created while we + * were counting. + */ + slots += 10; + result = kmalloc_array(slots, sizeof(peer), GFP_ATOMIC); + if (!result) + goto done; + for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) { + hlist_for_each_entry_rcu(peer, &peertab->buckets[i], + peertab_links) { + result[next_slot] = peer; + next_slot++; + + /* We might not have allocated enough extra space. */ + if (next_slot >= slots) + goto done; + } + } +done: + rcu_read_unlock(); + *num_peers = next_slot; + return result; +} +#endif /* See strip.py */ + +/** + * homa_peer_find() - Returns the peer associated with a given host; creates + * a new homa_peer if one doesn't already exist. + * @peertab: Peer table in which to perform lookup. + * @addr: Address of the desired host: IPv4 addresses are represented + * as IPv4-mapped IPv6 addresses. + * @inet: Socket that will be used for sending packets. + * + * Return: The peer associated with @addr, or a negative errno if an + * error occurred. On a successful return the reference count + * will be incremented for the returned peer. The caller must + * eventually call homa_peer_put to release the reference. 
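+ *
+ * A typical calling pattern (sketch only; variable names and error
+ * handling are illustrative):
+ *
+ *	peer = homa_peer_find(peertab, &addr, &hsk->inet);
+ *	if (IS_ERR(peer))
+ *		return PTR_ERR(peer);
+ *	...use peer...
+ *	homa_peer_put(peer);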
+ */ +struct homa_peer *homa_peer_find(struct homa_peertab *peertab, + const struct in6_addr *addr, + struct inet_sock *inet) +{ + struct homa_peer *peer; + struct dst_entry *dst; + u64 start = homa_clock(); + + // Should use siphash or jhash here: + tt_record("homa_peer_find starting"); + u32 bucket = hash_32((__force u32)addr->in6_u.u6_addr32[0], + HOMA_PEERTAB_BUCKET_BITS); + + bucket ^= hash_32((__force u32)addr->in6_u.u6_addr32[1], + HOMA_PEERTAB_BUCKET_BITS); + bucket ^= hash_32((__force u32)addr->in6_u.u6_addr32[2], + HOMA_PEERTAB_BUCKET_BITS); + bucket ^= hash_32((__force u32)addr->in6_u.u6_addr32[3], + HOMA_PEERTAB_BUCKET_BITS); + + /* Use RCU operators to ensure safety even if a concurrent call is + * adding a new entry. The calls to rcu_read_lock and rcu_read_unlock + * shouldn't actually be needed, since we don't need to protect + * against concurrent deletion. + */ + rcu_read_lock(); + hlist_for_each_entry_rcu(peer, &peertab->buckets[bucket], + peertab_links) { + if (ipv6_addr_equal(&peer->addr, addr)) { + tt_record("homa_peer_find before homa_peer_hold"); + homa_peer_hold(peer); + tt_record("homa_peer_find after homa_peer_hold"); + rcu_read_unlock(); + tt_record1("homa_peer_find took %d cycles to find existing peer", + homa_clock() - start); + return peer; + } + INC_METRIC(peer_hash_links, 1); + } + rcu_read_unlock(); + + /* No existing entry; create a new one. + * + * Note: after we acquire the lock, we have to check again to + * make sure the entry still doesn't exist (it might have been + * created by a concurrent invocation of this function). + */ + spin_lock_bh(&peertab->write_lock); + hlist_for_each_entry(peer, &peertab->buckets[bucket], + peertab_links) { + if (ipv6_addr_equal(&peer->addr, addr)) { + homa_peer_hold(peer); + goto done; + } + } + peer = kmalloc(sizeof(*peer), GFP_ATOMIC | __GFP_ZERO); + if (!peer) { + peer = (struct homa_peer *)ERR_PTR(-ENOMEM); + INC_METRIC(peer_kmalloc_errors, 1); + goto done; + } + atomic_set(&peer->refs, 1); + peer->addr = *addr; + dst = homa_peer_get_dst(peer, inet); + if (IS_ERR(dst)) { + kfree(peer); + peer = (struct homa_peer *)PTR_ERR(dst); + INC_METRIC(peer_route_errors, 1); + goto done; + } + peer->dst = dst; +#ifndef __STRIP__ /* See strip.py */ + peer->unsched_cutoffs[HOMA_MAX_PRIORITIES - 1] = 0; + peer->unsched_cutoffs[HOMA_MAX_PRIORITIES - 2] = INT_MAX; + INIT_LIST_HEAD(&peer->grantable_rpcs); + INIT_LIST_HEAD(&peer->grantable_links); +#endif /* See strip.py */ + smp_wmb(); + hlist_add_head_rcu(&peer->peertab_links, &peertab->buckets[bucket]); + peer->current_ticks = -1; + spin_lock_init(&peer->ack_lock); + INC_METRIC(peer_new_entries, 1); + +done: + spin_unlock_bh(&peertab->write_lock); + return peer; +} + +/** + * homa_dst_refresh() - This method is called when the dst for a peer is + * obsolete; it releases that dst and creates a new one. + * @peertab: Table containing the peer. + * @peer: Peer whose dst is obsolete. + * @hsk: Socket that will be used to transmit data to the peer. + */ +void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, + struct homa_sock *hsk) +{ + struct dst_entry *dst; + + dst = homa_peer_get_dst(peer, &hsk->inet); + if (IS_ERR(dst)) { +#ifndef __STRIP__ /* See strip.py */ + /* Retain the existing dst if we can't create a new one. 
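+		 * The old route may still deliver packets, and a
+		 * later call can retry the refresh.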
*/ + if (hsk->homa->verbose) + pr_notice("%s couldn't recreate dst: error %ld", + __func__, PTR_ERR(dst)); + INC_METRIC(peer_route_errors, 1); +#endif /* See strip.py */ + return; + } + dst_release(peer->dst); + peer->dst = dst; +} + +#ifndef __STRIP__ /* See strip.py */ +/** + * homa_unsched_priority() - Returns the priority level to use for + * unscheduled packets of a message. + * @homa: Overall data about the Homa protocol implementation. + * @peer: The destination of the message. + * @length: Number of bytes in the message. + * + * Return: A priority level. + */ +int homa_unsched_priority(struct homa *homa, struct homa_peer *peer, + int length) +{ + int i; + + for (i = homa->num_priorities - 1; ; i--) { + if (peer->unsched_cutoffs[i] >= length) + return i; + } + /* Can't ever get here */ +} +#endif /* See strip.py */ + +/** + * homa_peer_get_dst() - Find an appropriate dst structure (either IPv4 + * or IPv6) for a peer. + * @peer: The peer for which a dst is needed. Note: this peer's flow + * struct will be overwritten. + * @inet: Socket that will be used for sending packets. + * Return: The dst structure (or an ERR_PTR). + */ +struct dst_entry *homa_peer_get_dst(struct homa_peer *peer, + struct inet_sock *inet) +{ + memset(&peer->flow, 0, sizeof(peer->flow)); + if (inet->sk.sk_family == AF_INET) { + struct rtable *rt; + + flowi4_init_output(&peer->flow.u.ip4, inet->sk.sk_bound_dev_if, + inet->sk.sk_mark, inet->tos, + RT_SCOPE_UNIVERSE, inet->sk.sk_protocol, 0, + peer->addr.in6_u.u6_addr32[3], + inet->inet_saddr, 0, 0, inet->sk.sk_uid); + security_sk_classify_flow(&inet->sk, &peer->flow.u.__fl_common); + rt = ip_route_output_flow(sock_net(&inet->sk), + &peer->flow.u.ip4, &inet->sk); + if (IS_ERR(rt)) + return (struct dst_entry *)(PTR_ERR(rt)); + return &rt->dst; + } + peer->flow.u.ip6.flowi6_oif = inet->sk.sk_bound_dev_if; + peer->flow.u.ip6.flowi6_iif = LOOPBACK_IFINDEX; + peer->flow.u.ip6.flowi6_mark = inet->sk.sk_mark; + peer->flow.u.ip6.flowi6_scope = RT_SCOPE_UNIVERSE; + peer->flow.u.ip6.flowi6_proto = inet->sk.sk_protocol; + peer->flow.u.ip6.flowi6_flags = 0; + peer->flow.u.ip6.flowi6_secid = 0; + peer->flow.u.ip6.flowi6_tun_key.tun_id = 0; + peer->flow.u.ip6.flowi6_uid = inet->sk.sk_uid; + peer->flow.u.ip6.daddr = peer->addr; + peer->flow.u.ip6.saddr = inet->pinet6->saddr; + peer->flow.u.ip6.fl6_dport = 0; + peer->flow.u.ip6.fl6_sport = 0; + peer->flow.u.ip6.mp_hash = 0; + peer->flow.u.ip6.__fl_common.flowic_tos = inet->tos; + peer->flow.u.ip6.flowlabel = ip6_make_flowinfo(inet->tos, 0); + security_sk_classify_flow(&inet->sk, &peer->flow.u.__fl_common); + return ip6_dst_lookup_flow(sock_net(&inet->sk), &inet->sk, + &peer->flow.u.ip6, NULL); +} + +#ifndef __STRIP__ /* See strip.py */ +/** + * homa_peer_set_cutoffs() - Set the cutoffs for unscheduled priorities in + * a peer object. This is a convenience function used primarily by unit tests. + * @peer: Homa_peer object whose cutoffs should be set. + * @c0: Largest message size that will use priority 0. + * @c1: Largest message size that will use priority 1. + * @c2: Largest message size that will use priority 2. + * @c3: Largest message size that will use priority 3. + * @c4: Largest message size that will use priority 4. + * @c5: Largest message size that will use priority 5. + * @c6: Largest message size that will use priority 6. + * @c7: Largest message size that will use priority 7. 
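+ *
+ * For example, the following (illustrative values only) gives priority 7
+ * to messages of at most 100 bytes and priority 6 to everything larger:
+ *
+ *	homa_peer_set_cutoffs(peer, 0, 0, 0, 0, 0, 0, INT_MAX, 100);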
+ */ +void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, int c2, + int c3, int c4, int c5, int c6, int c7) +{ + peer->unsched_cutoffs[0] = c0; + peer->unsched_cutoffs[1] = c1; + peer->unsched_cutoffs[2] = c2; + peer->unsched_cutoffs[3] = c3; + peer->unsched_cutoffs[4] = c4; + peer->unsched_cutoffs[5] = c5; + peer->unsched_cutoffs[6] = c6; + peer->unsched_cutoffs[7] = c7; +} + +/** + * homa_peer_lock_slow() - This function implements the slow path for + * acquiring a peer's @ack_lock. It is invoked when the lock isn't + * immediately available. It waits for the lock, but also records statistics + * about the waiting time. + * @peer: Peer to lock. + */ +void homa_peer_lock_slow(struct homa_peer *peer) + __acquires(&peer->ack_lock) +{ + u64 start = homa_clock(); + + tt_record("beginning wait for peer lock"); + spin_lock_bh(&peer->ack_lock); + tt_record("ending wait for peer lock"); + INC_METRIC(peer_ack_lock_misses, 1); + INC_METRIC(peer_ack_lock_miss_cycles, homa_clock() - start); +} +#endif /* See strip.py */ + +/** + * homa_peer_add_ack() - Add a given RPC to the list of unacked + * RPCs for its server. Once this method has been invoked, it's safe + * to delete the RPC, since it will eventually be acked to the server. + * @rpc: Client RPC that has now completed. + */ +void homa_peer_add_ack(struct homa_rpc *rpc) +{ + struct homa_peer *peer = rpc->peer; + struct homa_ack_hdr ack; + + homa_peer_lock(peer); + if (peer->num_acks < HOMA_MAX_ACKS_PER_PKT) { + peer->acks[peer->num_acks].client_id = cpu_to_be64(rpc->id); + peer->acks[peer->num_acks].server_port = htons(rpc->dport); + peer->num_acks++; + homa_peer_unlock(peer); + return; + } + + /* The peer has filled up; send an ACK message to empty it. The + * RPC in the message header will also be considered ACKed. + */ + INC_METRIC(ack_overflows, 1); + memcpy(ack.acks, peer->acks, sizeof(peer->acks)); + ack.num_acks = htons(peer->num_acks); + peer->num_acks = 0; + homa_peer_unlock(peer); + homa_xmit_control(ACK, &ack, sizeof(ack), rpc); +} + +/** + * homa_peer_get_acks() - Copy acks out of a peer, and remove them from the + * peer. + * @peer: Peer to check for possible unacked RPCs. + * @count: Maximum number of acks to return. + * @dst: The acks are copied to this location. + * + * Return: The number of acks extracted from the peer (<= count). + */ +int homa_peer_get_acks(struct homa_peer *peer, int count, struct homa_ack *dst) +{ + /* Don't waste time acquiring the lock if there are no ids available. */ + if (peer->num_acks == 0) + return 0; + + homa_peer_lock(peer); + + if (count > peer->num_acks) + count = peer->num_acks; + memcpy(dst, &peer->acks[peer->num_acks - count], + count * sizeof(peer->acks[0])); + peer->num_acks -= count; + + homa_peer_unlock(peer); + return count; +} diff --git a/util/ttgrep.py b/util/ttgrep.py index cf53c907..10a63461 100755 --- a/util/ttgrep.py +++ b/util/ttgrep.py @@ -4,11 +4,11 @@ # SPDX-License-Identifier: BSD-1-Clause """ -Scan the time trace data in a log file; find all records containing -a given string, and output only those records. If the --rebase argument -is present, times are offset so the first event is at time 0. If the file -is omitted, standard input is used. -Usage: ttgrep.py [--rebase] string [file] +Scan the time trace data in a log file; find all records whose events +match a given Python regular expression, and output only those records. +If the --rebase argument is present, times are offset so the first event +is at time 0. If the file is omitted, standard input is used. 
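+For example, the regex "grant|resend" selects every record whose event
+text mentions either grants or resends (pattern purely illustrative).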
+Usage: ttgrep.py [--rebase] regex [file]
 """
 
 from __future__ import division, print_function
@@ -22,15 +22,16 @@
 
 rebase = False
 
-def scan(f, string):
+def scan(f, pattern):
     """
     Scan the log file given by 'f' (handle for an open file) and output
-    all-time trace records containing string.
+    all time trace records that match pattern.
     """
     global rebase
 
     startTime = 0.0
     prevTime = 0.0
     writes = 0
+    compiled = re.compile(pattern)
 
     for line in f:
         match = re.match(' *([-0-9.]+) us \(\+ *([0-9.]+) us\) (.*)', line)
@@ -39,7 +40,7 @@ def scan(f, string):
         time = float(match.group(1))
         interval = float(match.group(2))
         event = match.group(3)
-        if (string not in event) and ("Freez" not in event):
+        if (not compiled.search(event)) and ("Freez" not in event):
             continue
         if startTime == 0.0:
             startTime = time
@@ -60,7 +61,7 @@ def scan(f, string):
 
 if len(sys.argv) == 3:
     f = open(sys.argv[2])
 elif len(sys.argv) != 2:
-    print("Usage: %s [--rebase] string [logFile]" % (sys.argv[0]))
+    print("Usage: %s [--rebase] regex [logFile]" % (sys.argv[0]))
     sys.exit(1)
 
 scan(f, sys.argv[1])
\ No newline at end of file

From c4680f25c80209b5311d308c2a290502be2cb9a0 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Wed, 14 May 2025 10:51:05 -0700
Subject: [PATCH 315/625] Add is_homa_pkt function

---
 homa_impl.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/homa_impl.h b/homa_impl.h
index 9d56736f..47def78f 100644
--- a/homa_impl.h
+++ b/homa_impl.h
@@ -650,6 +650,24 @@ static inline struct in6_addr skb_canonical_ipv6_saddr(struct sk_buff *skb)
 	return mapped;
 }
 
+/**
+ * is_homa_pkt() - Return true if @skb is a Homa packet, false otherwise.
+ * @skb:     Packet buffer to check.
+ * Return:   see above.
+ */
+static inline bool is_homa_pkt(struct sk_buff *skb)
+{
+	struct iphdr *iph = ip_hdr(skb);
+
+#ifndef __STRIP__ /* See strip.py */
+	return ((iph->protocol == IPPROTO_HOMA) ||
+		((iph->protocol == IPPROTO_TCP) &&
+		 (tcp_hdr(skb)->urg_ptr == htons(HOMA_TCP_URGENT))));
+#else /* See strip.py */
+	return (iph->protocol == IPPROTO_HOMA);
+#endif /* See strip.py */
+}
+
 /**
  * homa_make_header_avl() - Invokes pskb_may_pull to make sure that all the
  * Homa header information for a packet is in the linear part of the skb

From 3aa75d478284c3f59926989b4b0ee4a532a6ef69 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Wed, 14 May 2025 10:59:28 -0700
Subject: [PATCH 316/625] Abort homa_rpc_alloc_server if there is no buffer
 pool

---
 homa_rpc.c           |  3 +++
 test/unit_homa_rpc.c | 15 ++++++++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/homa_rpc.c b/homa_rpc.c
index c524c48d..06ccb998 100644
--- a/homa_rpc.c
+++ b/homa_rpc.c
@@ -122,6 +122,9 @@ struct homa_rpc *homa_rpc_alloc_server(struct homa_sock *hsk,
 	struct homa_rpc *srpc = NULL;
 	int err;
 
+	if (!hsk->buffer_pool)
+		return ERR_PTR(-ENOMEM);
+
 	/* Lock the bucket, and make sure no-one else has already created
 	 * the desired RPC.
*/ diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 362ee176..13ccd189 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -167,6 +167,19 @@ TEST_F(homa_rpc, homa_rpc_alloc_server__normal) EXPECT_EQ(1, created); homa_rpc_end(srpc); } +TEST_F(homa_rpc, homa_rpc_alloc_server__no_buffer_pool) +{ + struct homa_rpc *srpc; + int created; + + homa_pool_free(self->hsk.buffer_pool); + self->hsk.buffer_pool = NULL; + srpc = homa_rpc_alloc_server(&self->hsk, self->client_ip, &self->data, + &created); + EXPECT_TRUE(IS_ERR(srpc)); + EXPECT_EQ(ENOMEM, -PTR_ERR(srpc)); + EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); +} TEST_F(homa_rpc, homa_rpc_alloc_server__already_exists) { struct homa_rpc *srpc1, *srpc2, *srpc3; @@ -243,7 +256,7 @@ TEST_F(homa_rpc, homa_rpc_alloc_server__allocate_buffers) EXPECT_EQ(3, srpc->msgin.num_bpages); homa_rpc_end(srpc); } -TEST_F(homa_rpc, homa_rpc_alloc_server__no_buffer_pool) +TEST_F(homa_rpc, homa_rpc_alloc_server__cant_allocate_buffers) { struct homa_rpc *srpc; int created; From 417fb2db99e3eb65c991fbf52eb110c1fc2ebee6 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 14 May 2025 16:49:22 -0700 Subject: [PATCH 317/625] Move struct homa_peertab to struct homa_shared There is now a single table of peers, shared across all homas. --- homa_devel.c | 2 +- homa_impl.h | 17 ++--- homa_peer.c | 145 ++++++++++++++++++++++++++--------------- homa_peer.h | 51 ++------------- homa_skb.c | 1 + homa_utils.c | 73 +++++++++++---------- test/mock.c | 19 ++++-- test/mock.h | 5 +- test/unit_homa_peer.c | 121 ++++++++++++++++++++++------------ test/unit_homa_skb.c | 8 +-- test/unit_homa_utils.c | 60 ++++++++++++----- test/utils.c | 26 ++++++++ test/utils.h | 63 +++++++++--------- 13 files changed, 354 insertions(+), 237 deletions(-) diff --git a/homa_devel.c b/homa_devel.c index 66fb4ade..32bb4614 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -398,7 +398,7 @@ void homa_freeze_peers(struct homa *homa) IF_NO_STRIP(freeze.common.urgent = htons(HOMA_TCP_URGENT)); freeze.common.sender_id = 0; - rhashtable_walk_enter(&homa->peers->ht, &iter); + rhashtable_walk_enter(&homa->shared->peers->ht, &iter); rhashtable_walk_start(&iter); while (true) { peer = rhashtable_walk_next(&iter); diff --git a/homa_impl.h b/homa_impl.h index 47def78f..03204873 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -139,19 +139,13 @@ struct homa { * @prev_default_port: The most recent port number assigned from * the range of default ports. */ - __u16 prev_default_port ____cacheline_aligned_in_smp; + __u16 prev_default_port; /** * @port_map: Information about all open sockets. Dynamically * allocated; must be kfreed. */ - struct homa_socktab *port_map ____cacheline_aligned_in_smp; - - /** - * @peers: Info about all the other hosts we have communicated with. - * Dynamically allocated; must be kfreed. - */ - struct homa_peertab *peers; + struct homa_socktab *port_map; #ifndef __STRIP__ /* See strip.py */ /** @@ -527,6 +521,13 @@ struct homa_shared { * through their shared_links fields. Managed with RCU. */ struct list_head homas; + + /** + * @peers: Info about all the other hosts we have communicated with; + * includes peers from all struct homas. Dynamically allocated; must + * be kfreed. 
+ */ + struct homa_peertab *peers; }; /** diff --git a/homa_peer.c b/homa_peer.c index 5964cfd0..ed41b4e3 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -8,6 +8,14 @@ #include "homa_peer.h" #include "homa_rpc.h" +#ifdef __UNIT_TEST__ +#undef rhashtable_init +#define rhashtable_init mock_rht_init + +#undef rhashtable_lookup_get_insert_fast +#define rhashtable_lookup_get_insert_fast mock_rht_lookup_get_insert_fast +#endif /* __UNIT_TEST__ */ + const struct rhashtable_params ht_params = { .key_len = sizeof(struct homa_peer_key), .key_offset = offsetof(struct homa_peer, ht_key), @@ -18,22 +26,57 @@ const struct rhashtable_params ht_params = { }; /** - * homa_peertab_init() - Constructor for homa_peertabs. - * @peertab: The object to initialize; previous contents are discarded. + * homa_peertab_alloc() - Allocate and initialize a homa_peertab. * - * Return: 0 in the normal case, or a negative errno if there was a problem. + * Return: A pointer to the new homa_peertab, or ERR_PTR(-errno) if there + * was a problem. */ -int homa_peertab_init(struct homa_peertab *peertab) +struct homa_peertab *homa_peertab_alloc(void) { - /* Note: when we return, the object must be initialized so it's - * safe to call homa_peertab_destroy, even if this function returns - * an error. - */ - int status; + struct homa_peertab *peertab; + int err; - status = rhashtable_init(&peertab->ht, &ht_params); - peertab->live = (status == 0); - return status; + peertab = kmalloc(sizeof(*peertab), GFP_KERNEL); + if (!peertab) { + pr_err("%s couldn't create peers: kmalloc failure", __func__); + return ERR_PTR(-ENOMEM); + } + + err = rhashtable_init(&peertab->ht, &ht_params); + if (err) { + kfree(peertab); + return ERR_PTR(err); + } + return peertab; +} + +/** + * homa_peertab_free_homa() - Garbage collect all of the peer information + * associated with a particular struct homa. + * @homa: Object whose peers should be freed. + */ +void homa_peertab_free_homa(struct homa *homa) +{ + struct homa_peertab *peertab = homa->shared->peers; + struct rhashtable_iter iter; + struct homa_peer *peer; + + rhashtable_walk_enter(&peertab->ht, &iter); + rhashtable_walk_start(&iter); + while (1) { + peer = rhashtable_walk_next(&iter); + if (!peer) + break; + if (IS_ERR(peer)) + continue; + if (peer->ht_key.homa != homa) + continue; + rhashtable_remove_fast(&peertab->ht, &peer->ht_linkage, + ht_params); + homa_peer_free(peer); + } + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); } /** @@ -47,37 +90,21 @@ void homa_peertab_free_fn(void *object, void *dummy) { struct homa_peer *peer = object; -#ifdef __UNIT_TEST__ - if (atomic_read(&peer->refs) != 0) { - if (!mock_peertab_free_fn_no_complain) - FAIL(" %s found peer %s with reference count %d", - __func__, - homa_print_ipv6_addr(&peer->addr), - atomic_read(&peer->refs)); - } -#else /* __UNIT_TEST__ */ - if (atomic_read(&peer->refs) != 0) - pr_err("%s found peer with reference count %d", - __func__, atomic_read(&peer->refs)); -#endif - else - homa_peer_free(peer); + homa_peer_free(peer); } /** - * homa_peertab_destroy() - Destructor for homa_peertabs. After this + * homa_peertab_free() - Destructor for homa_peertabs. After this * function returns, it is unsafe to use any results from previous calls * to homa_peer_find, since all existing homa_peer objects will have been * destroyed. * @peertab: The table to destroy. 
*/ -void homa_peertab_destroy(struct homa_peertab *peertab) +void homa_peertab_free(struct homa_peertab *peertab) { - if (peertab->live) { - rhashtable_free_and_destroy(&peertab->ht, homa_peertab_free_fn, - NULL); - peertab->live = false; - } + rhashtable_free_and_destroy(&peertab->ht, homa_peertab_free_fn, + NULL); + kfree(peertab); } /** @@ -134,7 +161,24 @@ struct homa_peer *homa_peer_alloc(struct homa *homa, void homa_peer_free(struct homa_peer *peer) { dst_release(peer->dst); - kfree(peer); + + if (atomic_read(&peer->refs) == 0) + kfree(peer); + else { +#ifdef __UNIT_TEST__ + if (!mock_peer_free_no_fail) + FAIL(" %s found peer %s with reference count %d", + __func__, homa_print_ipv6_addr(&peer->addr), + atomic_read(&peer->refs)); + else + UNIT_LOG("; ", "peer %s has reference count %d", + homa_print_ipv6_addr(&peer->addr), + atomic_read(&peer->refs)); +#else /* __UNIT_TEST__ */ + WARN(1, "%s found peer with reference count %d", + __func__, atomic_read(&peer->refs)); +#endif /* __UNIT_TEST__ */ + } } /** @@ -154,19 +198,17 @@ struct homa_peer *homa_peer_find(struct homa *homa, const struct in6_addr *addr, struct inet_sock *inet) { + struct homa_peertab *peertab = homa->shared->peers; struct homa_peer *peer, *other; struct homa_peer_key key; - u64 start = homa_clock(); key.addr = *addr; key.homa = homa; rcu_read_lock(); - peer = rhashtable_lookup(&homa->peers->ht, &key, ht_params); + peer = rhashtable_lookup(&peertab->ht, &key, ht_params); if (peer) { homa_peer_hold(peer); rcu_read_unlock(); - tt_record1("homa_peer_find took %d cycles to find existing peer", - homa_clock() - start); return peer; } @@ -176,24 +218,21 @@ struct homa_peer *homa_peer_find(struct homa *homa, rcu_read_unlock(); return peer; } -#ifdef __UNIT_TEST__ - other = mock_rht_lookup_get_insert_fast(&homa->peers->ht, - &peer->ht_linkage, ht_params); -#else /* __UNIT_TEST__ */ - other = rhashtable_lookup_get_insert_fast(&homa->peers->ht, + other = rhashtable_lookup_get_insert_fast(&peertab->ht, &peer->ht_linkage, ht_params); -#endif /* __UNIT_TEST__ */ if (IS_ERR(other)) { + /* Couldn't insert; return the error info. */ + homa_peer_put(peer); homa_peer_free(peer); - rcu_read_unlock(); - return other; - } - if (other) { - /* Someone else already created the desired peer. */ - homa_peer_hold(other); - rcu_read_unlock(); + peer = other; + } else if (other) { + /* Someone else already created the desired peer; use that + * one instead of ours. + */ + homa_peer_put(peer); homa_peer_free(peer); - return other; + homa_peer_hold(other); + peer = other; } rcu_read_unlock(); return peer; diff --git a/homa_peer.h b/homa_peer.h index dd21afdf..26b8c6f0 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -15,51 +15,12 @@ struct homa_rpc; /** - * struct homa_dead_dst - Used to retain dst_entries that are no longer - * needed, until it is safe to delete them (I'm not confident that the RCU - * mechanism will be safe for these: the reference count could get incremented - * after it's on the RCU list?). - */ -struct homa_dead_dst { - /** @dst: Entry that is no longer used by a struct homa_peer. */ - struct dst_entry *dst; - - /** @gc_time: homa_clock() time when it is safe to free @dst. */ - u64 gc_time; - - /** @dst_links: Used to link together entries in peertab->dead_dsts. */ - struct list_head dst_links; -}; - -/** - * define HOMA_PEERTAB_BUCKET_BITS - Number of bits in the bucket index for a - * homa_peertab. Should be large enough to hold an entry for every server - * in a datacenter without long hash chains. 
- */ -#define HOMA_PEERTAB_BUCKET_BITS 16 - -/** define HOME_PEERTAB_BUCKETS - Number of buckets in a homa_peertab. */ -#define HOMA_PEERTAB_BUCKETS BIT(HOMA_PEERTAB_BUCKET_BITS) - -/** - * struct homa_peertab - A hash table that maps from IPv6 addresses - * to homa_peer objects. IPv4 entries are encapsulated as IPv6 addresses. - * Entries are gradually added to this table, but they are never removed - * except when the entire table is deleted. We can't safely delete because - * results returned by homa_peer_find may be retained indefinitely. - * - * This table is managed exclusively by homa_peertab.c, using RCU to - * permit efficient lookups. + * struct homa_peertab - Stores homa_peer objects, indexed by IPv6 + * address. */ struct homa_peertab { /** @ht: Hash table that stores all struct peers. */ struct rhashtable ht; - - /** - * @live: True means ht has been successfully initialized and - * not yet destructed. - */ - bool live; }; /** @@ -219,9 +180,11 @@ struct homa_peer { void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, struct homa_sock *hsk); -void homa_peertab_destroy(struct homa_peertab *peertab); +struct homa_peertab + *homa_peertab_alloc(void); +void homa_peertab_free(struct homa_peertab *peertab); +void homa_peertab_free_homa(struct homa *homa); void homa_peertab_free_fn(void *object, void *dummy); -int homa_peertab_init(struct homa_peertab *peertab); void homa_peer_add_ack(struct homa_rpc *rpc); struct homa_peer *homa_peer_alloc(struct homa *homa, const struct in6_addr *addr, @@ -286,7 +249,7 @@ static inline struct dst_entry *homa_get_dst(struct homa_peer *peer, struct homa_sock *hsk) { if (unlikely(peer->dst->obsolete > 0)) - homa_dst_refresh(hsk->homa->peers, peer, hsk); + homa_dst_refresh(hsk->homa->shared->peers, peer, hsk); dst_hold(peer->dst); return peer->dst; } diff --git a/homa_skb.c b/homa_skb.c index e22850b8..0817577a 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -270,6 +270,7 @@ bool homa_skb_page_alloc(struct homa *homa, struct homa_skb_core *skb_core) /* Step 2: can we retrieve a page from the pool for this NUMA node? */ pool = skb_core->pool; if (pool->avail) { + UNIT_HOOK("skb_page_alloc_race"); spin_lock_bh(&homa->page_pool_mutex); /* Must recheck: could have changed before locked. */ diff --git a/homa_utils.c b/homa_utils.c index 78b92f68..e8fd44cd 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -25,17 +25,24 @@ struct homa_shared *homa_shared; /** * homa_shared_alloc() - Allocate and initialize a new homa_shared * object. - * Return: the new homa_shared object, or NULL if memory allocation failed. + * Return: the new homa_shared object, or ERR_PTR on failure. 
 */
 struct homa_shared *homa_shared_alloc(void)
 {
 	struct homa_shared *shared;
+	int err;

 	shared = kmalloc(sizeof(*homa_shared), GFP_KERNEL);
 	if (!shared)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 	spin_lock_init(&shared->lock);
 	INIT_LIST_HEAD(&shared->homas);
+	shared->peers = homa_peertab_alloc();
+	if (IS_ERR(shared->peers)) {
+		err = PTR_ERR(shared->peers);
+		kfree(shared);
+		return ERR_PTR(err);
+	}
 	return shared;
 }

@@ -44,6 +51,7 @@ struct homa_shared *homa_shared_alloc(void)
  */
 void homa_shared_free(struct homa_shared *shared)
 {
+	homa_peertab_free(shared->peers);
 	kfree(shared);
 	if (shared == homa_shared)
 		homa_shared = NULL;
@@ -72,8 +80,12 @@ int homa_init(struct homa *homa, struct net *net)

 	if (!homa_shared) {
 		homa_shared = homa_shared_alloc();
-		if (!homa_shared)
-			return -ENOMEM;
+		if (IS_ERR(homa_shared)) {
+			int status = PTR_ERR(homa_shared);
+
+			homa_shared = NULL;
+			return status;
+		}
 	}
 	homa->shared = homa_shared;
 	spin_lock_bh(&homa_shared->lock);
@@ -103,17 +115,6 @@ int homa_init(struct homa *homa, struct net *net)
 		return -ENOMEM;
 	}
 	homa_socktab_init(homa->port_map);
-	homa->peers = kmalloc(sizeof(*homa->peers), GFP_KERNEL);
-	if (!homa->peers) {
-		pr_err("%s couldn't create peers: kmalloc failure", __func__);
-		return -ENOMEM;
-	}
-	err = homa_peertab_init(homa->peers);
-	if (err) {
-		pr_err("%s couldn't initialize peer table (errno %d)\n",
-		       __func__, -err);
-		return err;
-	}
 #ifndef __STRIP__ /* See strip.py */
 	err = homa_skb_init(homa);
 	if (err) {
@@ -171,30 +172,23 @@ int homa_init(struct homa *homa, struct net *net)

 /**
  * homa_destroy() - Destructor for homa objects.
- * @homa:    Object to destroy.
+ * @homa:    Object to destroy. It is safe to call this function even
+ *           if @homa has already been destroyed.
  */
 void homa_destroy(struct homa *homa)
 {
+	struct homa_shared *shared;
+
+	if (!homa_shared)
+		/* Already destroyed. */
+		return;
+
 #ifdef __UNIT_TEST__
 #include "utils.h"
 	unit_homa_destroy(homa);
 #endif /* __UNIT_TEST__ */

-	if (homa->shared) {
-		struct homa_shared *shared = homa->shared;
-
-		spin_lock_bh(&shared->lock);
-		__list_del_entry(&homa->shared_links);
-		if (list_empty(&homa->shared->homas)) {
-			spin_unlock_bh(&shared->lock);
-			homa_shared_free(homa->shared);
-		} else {
-			spin_unlock_bh(&shared->lock);
-		}
-		homa->shared = NULL;
-	}
-
-	/* The order of the following statements matters! */
+	/* The order of the following cleanups matters!
*/ if (homa->port_map) { homa_socktab_destroy(homa->port_map); kfree(homa->port_map); @@ -210,14 +204,21 @@ void homa_destroy(struct homa *homa) homa_pacer_free(homa->pacer); homa->pacer = NULL; } - if (homa->peers) { - homa_peertab_destroy(homa->peers); - kfree(homa->peers); - homa->peers = NULL; - } #ifndef __STRIP__ /* See strip.py */ homa_skb_cleanup(homa); #endif /* See strip.py */ + homa_peertab_free_homa(homa); + + shared = homa->shared; + spin_lock_bh(&shared->lock); + __list_del_entry(&homa->shared_links); + if (list_empty(&homa->shared->homas)) { + spin_unlock_bh(&shared->lock); + homa_shared_free(homa->shared); + } else { + spin_unlock_bh(&shared->lock); + } + homa->shared = NULL; } #ifndef __STRIP__ /* See strip.py */ diff --git a/test/mock.c b/test/mock.c index 858aad01..c499b844 100644 --- a/test/mock.c +++ b/test/mock.c @@ -11,7 +11,6 @@ #include "homa_skb.h" #endif /* See strip.py */ #include "ccutils.h" -#include "mock.h" #include "utils.h" #include @@ -48,6 +47,7 @@ int mock_kthread_create_errors; int mock_prepare_to_wait_errors; int mock_register_protosw_errors; int mock_register_sysctl_errors; +int mock_rht_init_errors; int mock_rht_insert_errors; int mock_route_errors; int mock_spin_lock_held; @@ -231,10 +231,10 @@ static struct socket mock_socket; static struct homa *mock_homa; struct net mock_net; -/* Nonzero means don't generate an error message in homa_peertabe_free_fn - * if the reference count isn't zero. +/* Nonzero means don't generate a unit test failure when freeing peers + * if the reference count isn't zero (log a message instead). */ -int mock_peertab_free_fn_no_complain; +int mock_peer_free_no_fail; struct dst_ops mock_dst_ops = {.mtu = mock_get_mtu}; struct netdev_queue mock_net_queue = {.state = 0}; @@ -1259,6 +1259,14 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) {} +int mock_rht_init(struct rhashtable *ht, + const struct rhashtable_params *params) +{ + if (mock_check_error(&mock_rht_init_errors)) + return -EINVAL; + return rhashtable_init(ht, params); +} + void *mock_rht_lookup_get_insert_fast(struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) @@ -2062,6 +2070,7 @@ void mock_teardown(void) mock_prepare_to_wait_errors = 0; mock_register_protosw_errors = 0; mock_register_sysctl_errors = 0; + mock_rht_init_errors = 0; mock_rht_insert_errors = 0; mock_wait_intr_irq_errors = 0; mock_copy_to_user_dont_copy = 0; @@ -2088,7 +2097,7 @@ void mock_teardown(void) mock_min_default_port = 0x8000; mock_homa = NULL; homa_net_id = 0; - mock_peertab_free_fn_no_complain = 0; + mock_peer_free_no_fail = 0; mock_net_device.gso_max_size = 0; mock_net_device.gso_max_segs = 1000; memset(inet_offloads, 0, sizeof(inet_offloads)); diff --git a/test/mock.h b/test/mock.h index 9f73eddc..f3e137c0 100644 --- a/test/mock.h +++ b/test/mock.h @@ -144,9 +144,10 @@ extern struct net_device mock_net_device; extern int mock_numa_mask; extern int mock_page_nid_mask; -extern int mock_peertab_free_fn_no_complain; +extern int mock_peer_free_no_fail; extern int mock_prepare_to_wait_status; extern char mock_printk_output[]; +extern int mock_rht_init_errors; extern int mock_rht_insert_errors; extern int mock_route_errors; extern int mock_signal_pending; @@ -191,6 +192,8 @@ struct ctl_table_header * mock_register_net_sysctl(struct net *net, const char *path, struct ctl_table *table); +int mock_rht_init(struct rhashtable *ht, + const struct rhashtable_params *params); void *mock_rht_lookup_get_insert_fast(struct 
rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params); diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index 0c7cd789..1d6525b5 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -66,32 +66,56 @@ static void peer_race_hook(char *id) homa_peer_put(conflicting_peer); } -TEST_F(homa_peer, homa_peertab_init__success) +TEST_F(homa_peer, homa_peertab_alloc__success) { - struct homa_peertab table; + struct homa_peertab *peertab; - EXPECT_EQ(0, -homa_peertab_init(&table)); - EXPECT_EQ(1, table.live); + peertab = homa_peertab_alloc(); + EXPECT_FALSE(IS_ERR(peertab)); - homa_peertab_destroy(&table); + homa_peertab_free(peertab); +} +TEST_F(homa_peer, homa_peertab_alloc__cant_alloc_peertab) +{ + struct homa_peertab *peertab; + + mock_kmalloc_errors = 1; + peertab = homa_peertab_alloc(); + EXPECT_TRUE(IS_ERR(peertab)); + EXPECT_EQ(ENOMEM, -PTR_ERR(peertab)); +} +TEST_F(homa_peer, homa_peertab_alloc__rhashtable_init_fails) +{ + struct homa_peertab *peertab; + + mock_rht_init_errors = 1; + peertab = homa_peertab_alloc(); + EXPECT_TRUE(IS_ERR(peertab)); + EXPECT_EQ(EINVAL, -PTR_ERR(peertab)); } -TEST_F(homa_peer, homa_peertab_free_fn__ref_count_zero) +TEST_F(homa_peer, homa_peertab_free_homa) { + /* Create peers from two different "homa"s, make sure only + * those from one get freed. */ struct homa_peer *peer; - struct dst_entry *dst; + struct homa homa2; - peer = homa_peer_alloc(&self->homa, ip3333, &self->hsk.inet); - dst = peer->dst; - dst_hold(dst); - EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); + homa_init(&homa2, &mock_net); + peer = homa_peer_find(&self->homa, ip1111, &self->hsk.inet); + homa_peer_put(peer); + peer = homa_peer_find(&self->homa, ip2222, &self->hsk.inet); + homa_peer_put(peer); + peer = homa_peer_find(&homa2, ip3333, &self->hsk.inet); homa_peer_put(peer); + EXPECT_EQ(3, unit_count_peers(&self->homa)); - homa_peertab_free_fn(peer, NULL); - EXPECT_EQ(1, atomic_read(&dst->__rcuref.refcnt)); - dst_release(dst); + homa_peertab_free_homa(&self->homa); + EXPECT_EQ(1, unit_count_peers(&self->homa)); + homa_destroy(&homa2); } -TEST_F(homa_peer, homa_peertab_free_fn__bad_reference_count) + +TEST_F(homa_peer, homa_peertab_free_fn) { struct homa_peer *peer; struct dst_entry *dst; @@ -100,24 +124,27 @@ TEST_F(homa_peer, homa_peertab_free_fn__bad_reference_count) dst = peer->dst; dst_hold(dst); EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); + homa_peer_put(peer); - mock_peertab_free_fn_no_complain = 1; homa_peertab_free_fn(peer, NULL); - EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); + EXPECT_EQ(1, atomic_read(&dst->__rcuref.refcnt)); dst_release(dst); - homa_peer_put(peer); - homa_peer_free(peer); } -TEST_F(homa_peer, homa_peertab__destroy) { - /* First call: peertab live. */ - EXPECT_EQ(1, self->homa.peers->live); - homa_peertab_destroy(self->homa.peers); - EXPECT_EQ(0, self->homa.peers->live); +TEST_F(homa_peer, homa_peertab_free) { + struct homa_peer *peer; - /* Second call: peertab no longer live. 
*/ - homa_peertab_destroy(self->homa.peers); - EXPECT_EQ(0, self->homa.peers->live); + peer = homa_peer_find(&self->homa, ip1111, &self->hsk.inet); + homa_peer_put(peer); + peer = homa_peer_find(&self->homa, ip2222, &self->hsk.inet); + mock_peer_free_no_fail = 1; + + unit_log_clear(); + homa_peertab_free(self->homa.shared->peers); + EXPECT_STREQ("peer [2::2:2:2] has reference count 1", unit_log_get()); + + kfree(peer); + self->homa.shared->peers = homa_peertab_alloc(); } TEST_F(homa_peer, homa_peer_alloc__success) @@ -133,6 +160,7 @@ TEST_F(homa_peer, homa_peer_alloc__success) EXPECT_EQ(1, homa_metrics_per_cpu()->peer_new_entries); #endif /* See strip.py */ EXPECT_EQ(1, atomic_read(&peer->dst->__rcuref.refcnt)); + homa_peer_put(peer); homa_peer_free(peer); } TEST_F(homa_peer, homa_peer_alloc__kmalloc_error) @@ -160,7 +188,7 @@ TEST_F(homa_peer, homa_peer_alloc__route_error) #endif /* See strip.py */ } -TEST_F(homa_peer, homa_peer_free) +TEST_F(homa_peer, homa_peer_free__normal) { struct homa_peer *peer; struct dst_entry *dst; @@ -176,6 +204,19 @@ TEST_F(homa_peer, homa_peer_free) ASSERT_EQ(1, atomic_read(&dst->__rcuref.refcnt)); dst_release(dst); } +TEST_F(homa_peer, homa_peer_free__nonzero_ref_count) +{ + struct homa_peer *peer; + + peer = homa_peer_alloc(&self->homa, ip2222, &self->hsk.inet); + ASSERT_FALSE(IS_ERR(peer)); + mock_peer_free_no_fail = 1; + + unit_log_clear(); + homa_peer_free(peer); + EXPECT_STREQ("peer [2::2:2:2] has reference count 1", unit_log_get()); + kfree(peer); +} TEST_F(homa_peer, homa_peer_find__basics) { @@ -220,7 +261,16 @@ TEST_F(homa_peer, homa_peer_find__error_in_homa_peer_alloc) EXPECT_EQ(1, homa_metrics_per_cpu()->peer_route_errors); #endif /* See strip.py */ } -TEST_F(homa_peer, homa_peer_find__conflicting_creates) +TEST_F(homa_peer, homa_peer_find__insert_error) +{ + struct homa_peer *peer; + + mock_rht_insert_errors = 1; + peer = homa_peer_find(&self->homa, ip3333, &self->hsk.inet); + EXPECT_TRUE(IS_ERR(peer)); + EXPECT_EQ(EINVAL, -PTR_ERR(peer)); +} +TEST_F(homa_peer, homa_peer_find__conflicting_create) { struct homa_peer *peer; @@ -233,15 +283,6 @@ TEST_F(homa_peer, homa_peer_find__conflicting_creates) EXPECT_EQ(1, atomic_read(&peer->refs)); homa_peer_put(peer); } -TEST_F(homa_peer, homa_peer_find__insert_error) -{ - struct homa_peer *peer; - - mock_rht_insert_errors = 1; - peer = homa_peer_find(&self->homa, ip3333, &self->hsk.inet); - EXPECT_TRUE(IS_ERR(peer)); - EXPECT_EQ(EINVAL, -PTR_ERR(peer)); -} TEST_F(homa_peer, homa_dst_refresh__basics) { @@ -253,7 +294,7 @@ TEST_F(homa_peer, homa_dst_refresh__basics) EXPECT_EQ_IP(*ip1111, peer->addr); old_dst = peer->dst; - homa_dst_refresh(self->homa.peers, peer, &self->hsk); + homa_dst_refresh(self->homa.shared->peers, peer, &self->hsk); EXPECT_NE(old_dst, peer->dst); homa_peer_put(peer); } @@ -268,7 +309,7 @@ TEST_F(homa_peer, homa_dst_refresh__routing_error) old_dst = peer->dst; mock_route_errors = 1; - homa_dst_refresh(self->homa.peers, peer, &self->hsk); + homa_dst_refresh(self->homa.shared->peers, peer, &self->hsk); EXPECT_EQ(old_dst, peer->dst); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->peer_route_errors); diff --git a/test/unit_homa_skb.c b/test/unit_homa_skb.c index fe26f084..8280ba44 100644 --- a/test/unit_homa_skb.c +++ b/test/unit_homa_skb.c @@ -63,10 +63,10 @@ static void add_to_pool(struct homa *homa, int num_pages, int core) static struct homa_page_pool *hook_pool; -/* Used to remove a page from hook_pool when a lock is acquired. 
*/ -static void spinlock_hook(char *id) +/* Used to remove a page from hook_pool in a race. */ +static void page_alloc_race_hook(char *id) { - if (strcmp(id, "spin_lock") != 0) + if (strcmp(id, "skb_page_alloc_race") != 0) return; if ((hook_pool == NULL) || (hook_pool->avail == 0)) return; @@ -338,7 +338,7 @@ TEST_F(homa_skb, homa_skb_page_alloc__pool_page_taken_while_locking) EXPECT_EQ(1, skb_core->pool->avail); EXPECT_EQ(0, skb_core->num_stashed_pages); hook_pool = skb_core->pool; - unit_hook_register(spinlock_hook); + unit_hook_register(page_alloc_race_hook); mock_alloc_page_errors = 3; EXPECT_FALSE(homa_skb_page_alloc(&self->homa, skb_core)); diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index a9809cce..45bdd2a6 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -53,12 +53,25 @@ static void set_cutoffs(struct homa *homa, int c0, int c1, int c2, } #endif /* See strip.py */ -TEST_F(homa_utils, homa_shared_init__kmalloc_failure) +TEST_F(homa_utils, homa_shared_alloc__kmalloc_failure) { + struct homa_shared *shared; + mock_kmalloc_errors = 1; - EXPECT_EQ(NULL, homa_shared_alloc()); + shared = homa_shared_alloc(); + EXPECT_TRUE(IS_ERR(shared)); + EXPECT_EQ(ENOMEM, -PTR_ERR(shared)); +} +TEST_F(homa_utils, homa_shared_alloc__peertab_alloc_failure) +{ + struct homa_shared *shared; + + mock_kmalloc_errors = 2; + shared = homa_shared_alloc(); + EXPECT_TRUE(IS_ERR(shared)); + EXPECT_EQ(ENOMEM, -PTR_ERR(shared)); } -TEST_F(homa_utils, homa_shared_init__success) +TEST_F(homa_utils, homa_shared_alloc__success) { struct homa_shared *shared; @@ -79,34 +92,32 @@ TEST_F(homa_utils, homa_shared_free__clear_global_variable) homa_shared = saved; } -TEST_F(homa_utils, homa_init__kmalloc_failure_for_port_map) +TEST_F(homa_utils, homa_init__error_from_homa_shared_alloc) { + struct homa_shared *saved_shared = homa_shared; struct homa homa2; - memset(&homa2, 0, sizeof(homa2)); + homa_shared = NULL; mock_kmalloc_errors = 1; EXPECT_EQ(ENOMEM, -homa_init(&homa2, &mock_net)); - EXPECT_EQ(NULL, homa2.port_map); - homa_destroy(&homa2); + EXPECT_EQ(0, atomic64_read(&homa2.next_outgoing_id)); + homa_shared = saved_shared; } -#ifndef __STRIP__ /* See strip.py */ -TEST_F(homa_utils, homa_init__kmalloc_failure_for_peers) +TEST_F(homa_utils, homa_init__kmalloc_failure_for_port_map) { struct homa homa2; - memset(&homa2, 0, sizeof(homa2)); - mock_kmalloc_errors = 8; + mock_kmalloc_errors = 1; EXPECT_EQ(ENOMEM, -homa_init(&homa2, &mock_net)); - EXPECT_NE(NULL, homa2.port_map); - EXPECT_EQ(NULL, homa2.peers); + EXPECT_EQ(NULL, homa2.port_map); homa_destroy(&homa2); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_utils, homa_init__homa_skb_init_failure) { struct homa homa2; - memset(&homa2, 0, sizeof(homa2)); - mock_kmalloc_errors = 0x20; + mock_kmalloc_errors = 0x8; EXPECT_EQ(ENOMEM, -homa_init(&homa2, &mock_net)); EXPECT_SUBSTR("Couldn't initialize skb management (errno 12)", mock_printk_output); @@ -114,6 +125,25 @@ TEST_F(homa_utils, homa_init__homa_skb_init_failure) } #endif /* See strip.py */ +TEST_F(homa_utils, homa_destroy__basics) +{ + struct homa homa2; + + homa_init(&homa2, &mock_net); + homa_destroy(&homa2); +} +TEST_F(homa_utils, homa_destroy__unlink_and_free_shared) +{ + struct homa homa2; + + homa_init(&homa2, &mock_net); + EXPECT_NE(NULL, homa_shared); + homa_destroy(&homa2); + EXPECT_NE(NULL, homa_shared); + homa_destroy(&self->homa); + EXPECT_EQ(NULL, homa_shared); +} + #ifndef __STRIP__ /* See strip.py */ TEST_F(homa_utils, homa_print_ipv4_addr) { diff --git a/test/utils.c 
b/test/utils.c
index edce3af9..b2bb4792 100644
--- a/test/utils.c
+++ b/test/utils.c
@@ -473,4 +473,30 @@ char *unit_ack_string(struct homa_ack *ack)
 void unit_homa_destroy(struct homa *homa)
 {
 	/* Currently nothing to check. */
+}
+
+/**
+ * unit_count_peers() - Return a count of the number of peers in the
+ * homa_peertab for @homa (could also include peers from other homas).
+ * @homa:   Used to locate homa_peertab to count.
+ */
+int unit_count_peers(struct homa *homa)
+{
+	struct rhashtable_iter iter;
+	struct homa_peer *peer;
+	int count = 0;
+
+	rhashtable_walk_enter(&homa->shared->peers->ht, &iter);
+	rhashtable_walk_start(&iter);
+	while (1) {
+		peer = rhashtable_walk_next(&iter);
+		if (!peer)
+			break;
+		if (IS_ERR(peer))
+			continue;
+		count++;
+	}
+	rhashtable_walk_stop(&iter);
+	rhashtable_walk_exit(&iter);
+	return count;
 }
\ No newline at end of file
diff --git a/test/utils.h b/test/utils.h
index ac9b0d42..60f091f2 100644
--- a/test/utils.h
+++ b/test/utils.h
@@ -30,39 +30,42 @@ enum unit_rpc_state {
 	UNIT_IN_SERVICE = 24,
 };

-extern char *unit_ack_string(struct homa_ack *ack);
-extern struct homa_rpc
-	*unit_client_rpc(struct homa_sock *hsk,
-	enum unit_rpc_state state, struct in6_addr *client_ip,
-	struct in6_addr *server_ip, int server_port, int id,
-	int req_length, int resp_length);
-extern struct in6_addr
-	unit_get_in_addr(char *s);
-extern void unit_homa_destroy(struct homa *homa);
-extern struct iov_iter
-	*unit_iov_iter(void *buffer, size_t length);
-extern int unit_list_length(struct list_head *head);
-extern void unit_log_active_ids(struct homa_sock *hsk);
-extern void unit_log_filled_skbs(struct sk_buff *skb, int verbose);
-extern void unit_log_frag_list(struct sk_buff *skb, int verbose);
+char *unit_ack_string(struct homa_ack *ack);
+struct homa_rpc
+	*unit_client_rpc(struct homa_sock *hsk,
+			 enum unit_rpc_state state, struct in6_addr *client_ip,
+			 struct in6_addr *server_ip, int server_port, int id,
+			 int req_length, int resp_length);
+int unit_count_peers(struct homa *homa);
+struct in6_addr
+	unit_get_in_addr(char *s);
+void unit_homa_destroy(struct homa *homa);
+struct iov_iter
+	*unit_iov_iter(void *buffer, size_t length);
+int unit_list_length(struct list_head *head);
+void unit_log_active_ids(struct homa_sock *hsk);
+void unit_log_filled_skbs(struct sk_buff *skb, int verbose);
+void unit_log_frag_list(struct sk_buff *skb, int verbose);
 #ifndef __STRIP__ /* See strip.py */
-extern void unit_log_grantables(struct homa *homa);
+void unit_log_grantables(struct homa *homa);
 #endif /* See strip.py */
-extern void unit_log_hashed_rpcs(struct homa_sock *hsk);
-extern void unit_log_message_out_packets(
-	struct homa_message_out *message, int verbose);
-extern const char *unit_print_gaps(struct homa_rpc *rpc);
-extern struct homa_rpc
-	*unit_server_rpc(struct homa_sock *hsk,
-	enum unit_rpc_state state, struct in6_addr *server_ip,
-	struct in6_addr *client_ip, int client_port, int id,
-	int req_length, int resp_length);
-extern void unit_log_skb_list(struct sk_buff_head *packets,
-	int verbose);
-extern void unit_log_throttled(struct homa *homa);
-extern void unit_teardown(void);
+void unit_log_hashed_rpcs(struct homa_sock *hsk);
+void unit_log_message_out_packets(struct homa_message_out *message,
+				  int verbose);
+const char *unit_print_gaps(struct homa_rpc *rpc);
+struct homa_rpc
+	*unit_server_rpc(struct homa_sock *hsk,
+			 enum unit_rpc_state state,
+			 struct in6_addr *server_ip,
+			 struct in6_addr *client_ip,
+			 int client_port, int id, int req_length,
+			 int resp_length);
+void unit_log_skb_list(struct sk_buff_head *packets,
+		       int verbose);
+void unit_log_throttled(struct homa *homa);
+void unit_teardown(void);

 /* Kludge to avoid including arpa/inet.h, which causes definition
  * conflicts with kernel header files.
  */
-extern int inet_pton(int af, const char *src, void *dst);
+int inet_pton(int af, const char *src, void *dst);

From 1e931efceb2bcb190ab0b77e712e8666c5df66c4 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Thu, 15 May 2025 09:04:46 -0700
Subject: [PATCH 318/625] Add missing synchronization for rhashtable updates
 in homa_peer

---
 homa_peer.c | 3 +++
 homa_peer.h | 6 ++++++
 2 files changed, 9 insertions(+)

diff --git a/homa_peer.c b/homa_peer.c
index ed41b4e3..8b9100f5 100644
--- a/homa_peer.c
+++ b/homa_peer.c
@@ -40,6 +40,7 @@ struct homa_peertab *homa_peertab_alloc(void)
 		return ERR_PTR(-ENOMEM);
 	}

+	spin_lock_init(&peertab->lock);
 	err = rhashtable_init(&peertab->ht, &ht_params);
 	if (err) {
 		kfree(peertab);
@@ -218,8 +219,10 @@ struct homa_peer *homa_peer_find(struct homa *homa,
 		rcu_read_unlock();
 		return peer;
 	}
+	spin_lock_bh(&peertab->lock);
 	other = rhashtable_lookup_get_insert_fast(&peertab->ht,
 						  &peer->ht_linkage, ht_params);
+	spin_unlock_bh(&peertab->lock);
 	if (IS_ERR(other)) {
 		/* Couldn't insert; return the error info. */
 		homa_peer_put(peer);
diff --git a/homa_peer.h b/homa_peer.h
index 26b8c6f0..0bf3f9cf 100644
--- a/homa_peer.h
+++ b/homa_peer.h
@@ -19,6 +19,12 @@ struct homa_rpc;
  * address.
  */
 struct homa_peertab {
+	/**
+	 * @lock: Used to synchronize updates to @ht as well as other
+	 * operations on this object.
+	 */
+	spinlock_t lock;
+
 	/** @ht: Hash table that stores all struct peers. */
 	struct rhashtable ht;
 };

From 361d34812ea7d08371460ab1023f09191780215a Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Thu, 15 May 2025 20:38:59 -0700
Subject: [PATCH 319/625] Introduce homa_net objects

This reverts commits such as f0733410 that created a separate struct
homa for each net namespace. The homa_shared structure, introduced in
a recent commit, has been deleted. Instead, there is now a single
global struct homa, with a homa_net object that contains
information/functionality that is different for each net namespace.

Right now there is no functionality in homa_net; future commits will
need to move a few things from homa to homa_net, such as port number
allocation.
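
As a rough sketch of how the pernet plumbing is expected to fit
together (the two wrapper functions below are illustrative only, not
the code in this patch; the patch's actual hooks are homa_net_start
and homa_net_exit):

    static int example_net_init(struct net *net)
    {
            /* Because homa_net_ops.size is sizeof(struct homa_net),
             * the kernel allocates and zeroes one homa_net per
             * namespace; net_generic() returns that per-namespace
             * area.
             */
            struct homa_net *hnet = net_generic(net, homa_net_id);

            return homa_net_init(hnet, net, global_homa);
    }

    static void example_net_exit(struct net *net)
    {
            homa_net_destroy(net_generic(net, homa_net_id));
    }

With this split, namespace-specific objects (such as peers, whose
hash keys now include a homa_net pointer) can be garbage collected
per namespace, while the expensive state in the single struct homa
is allocated only once.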
---
 homa_devel.c              |  20 +++--
 homa_devel.h              |   4 +-
 homa_grant.c              |  14 ++--
 homa_grant.h              |   2 +-
 homa_impl.h               |  95 +++++++----------------
 homa_incoming.c           |   4 +-
 homa_outgoing.c           |   2 +-
 homa_pacer.c              |  13 ++--
 homa_pacer.h              |   2 +-
 homa_peer.c               |  74 +++++++++---------
 homa_peer.h               |  18 ++---
 homa_plumbing.c           | 159 ++++++++++++++++++++------------------
 homa_rpc.c                |   5 +-
 homa_sock.c               |  52 ++++++++-----
 homa_sock.h               |  63 ++++++++-------
 homa_timer.c              |   4 +-
 homa_utils.c              | 114 +++++++++------------------
 test/mock.c               |  76 ++++++++++++------
 test/mock.h               |   6 +-
 test/unit_homa_grant.c    |  17 ++--
 test/unit_homa_incoming.c |  25 +++---
 test/unit_homa_interest.c |   7 +-
 test/unit_homa_metrics.c  |   3 +-
 test/unit_homa_offload.c  |   7 +-
 test/unit_homa_outgoing.c |  22 +++---
 test/unit_homa_pacer.c    |  19 ++---
 test/unit_homa_peer.c     |  92 +++++++++++-----------
 test/unit_homa_plumbing.c |  40 +++++++---
 test/unit_homa_pool.c     |   7 +-
 test/unit_homa_rpc.c      |  15 ++--
 test/unit_homa_skb.c      |   3 +-
 test/unit_homa_sock.c     |  64 ++++++++-------
 test/unit_homa_timer.c    |   7 +-
 test/unit_homa_utils.c    | 101 +++++++++++------------
 test/utils.c              |   2 +-
 35 files changed, 574 insertions(+), 584 deletions(-)

diff --git a/homa_devel.c b/homa_devel.c
index 32bb4614..cf2d73ea 100644
--- a/homa_devel.c
+++ b/homa_devel.c
@@ -370,21 +370,25 @@ char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len)
 }

 /**
- * homa_freeze_peers() - Send FREEZE packets to all known peers.
- * @homa:    Provides info about peers.
+ * homa_freeze_peers() - Send FREEZE packets to all known peers in the
+ * root network namespace.
  */
-void homa_freeze_peers(struct homa *homa)
+void homa_freeze_peers(void)
 {
 	struct homa_socktab_scan scan;
 	struct homa_freeze_hdr freeze;
 	struct rhashtable_iter iter;
 	struct homa_peer *peer;
 	struct homa_sock *hsk;
+	struct homa_net *hnet;
 	int err;

-	/* Find a socket to use (any will do). */
+	/* Find a socket to use (any socket for the namespace will do). */
+	hnet = homa_net_from_net(&init_net);
 	rcu_read_lock();
-	hsk = homa_socktab_start_scan(homa->port_map, &scan);
+	hsk = homa_socktab_start_scan(hnet->homa->port_map, &scan);
+	while (hsk && hsk->hnet != hnet)
+		hsk = homa_socktab_next(&scan);
 	homa_socktab_end_scan(&scan);
 	if (!hsk) {
 		tt_record("homa_freeze_peers couldn't find a socket");
@@ -398,7 +402,7 @@ void homa_freeze_peers(struct homa *homa)
 	IF_NO_STRIP(freeze.common.urgent = htons(HOMA_TCP_URGENT));
 	freeze.common.sender_id = 0;

-	rhashtable_walk_enter(&homa->shared->peers->ht, &iter);
+	rhashtable_walk_enter(&hnet->homa->peers->ht, &iter);
 	rhashtable_walk_start(&iter);
 	while (true) {
 		peer = rhashtable_walk_next(&iter);
@@ -410,7 +414,7 @@ void homa_freeze_peers(struct homa *homa)
 		 * that's OK.
 		 */
 			continue;
-		if (peer->ht_key.homa != homa)
+		if (peer->ht_key.hnet != hnet)
 			continue;
 		tt_record1("Sending freeze to 0x%x", tt_addr(peer->addr));
 		err = __homa_xmit_control(&freeze, sizeof(freeze), peer, hsk);
@@ -553,7 +557,7 @@ void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, char *format)
 		tt_record2(format, rpc->id, tt_addr(rpc->peer->addr));
 		tt_freeze();
//		homa_xmit_control(FREEZE, &freeze, sizeof(freeze), rpc);
-		homa_freeze_peers(rpc->hsk->homa);
+		homa_freeze_peers();
 	}
 }
 #endif /* See strip.py */
diff --git a/homa_devel.h b/homa_devel.h
index 9b2e6d76..cdc33ec9 100644
--- a/homa_devel.h
+++ b/homa_devel.h
@@ -26,7 +26,7 @@
 #define KERNEL_VERSION(...)
100 #endif /* __STRIP__ */ -struct homa; +struct homa_net; struct homa_rpc; /** @@ -83,7 +83,7 @@ void homa_check_addr(void *p); void homa_check_list(struct list_head *list, int max_length); void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, char *format); -void homa_freeze_peers(struct homa *homa); +void homa_freeze_peers(void); char *homa_print_ipv4_addr(__be32 addr); char *homa_print_ipv6_addr(const struct in6_addr *addr); char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len); diff --git a/homa_grant.c b/homa_grant.c index 7972dcde..1b22b596 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -79,17 +79,18 @@ static struct ctl_table grant_ctl_table[] = { /** * homa_grant_alloc() - Allocate and initialize a new grant object, which * will hold grant management information for @homa. - * @net: Network namespace that @homa is associated with. * Return: A pointer to the new struct grant, or a negative errno. */ -struct homa_grant *homa_grant_alloc(struct net *net) +struct homa_grant *homa_grant_alloc(void) { struct homa_grant *grant; int err; grant = kmalloc(sizeof(*grant), GFP_KERNEL | __GFP_ZERO); - if (!grant) + if (!grant) { + pr_err("%s couldn't allocate grant structure\n", __func__); return ERR_PTR(-ENOMEM); + } grant->max_incoming = 400000; spin_lock_init(&grant->lock); INIT_LIST_HEAD(&grant->grantable_peers); @@ -101,7 +102,7 @@ struct homa_grant *homa_grant_alloc(struct net *net) grant->fifo_fraction = 50; #ifndef __STRIP__ /* See strip.py */ - grant->sysctl_header = register_net_sysctl(net, "net/homa", + grant->sysctl_header = register_net_sysctl(&init_net, "net/homa", grant_ctl_table); if (!grant->sysctl_header) { err = -ENOMEM; @@ -1021,11 +1022,12 @@ void homa_grant_update_sysctl_deps(struct homa_grant *grant) int homa_grant_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct homa_grant *grant = - homa_from_net(current->nsproxy->net_ns)->grant; + struct homa_grant *grant; struct ctl_table table_copy; int result; + grant = homa_net_from_net(current->nsproxy->net_ns)->homa->grant; + /* Generate a new ctl_table that refers to a field in the * net-specific struct homa. */ diff --git a/homa_grant.h b/homa_grant.h index d645f568..61ae5edd 100644 --- a/homa_grant.h +++ b/homa_grant.h @@ -223,7 +223,7 @@ struct homa_grant_candidates { }; struct homa_grant - *homa_grant_alloc(struct net *net); + *homa_grant_alloc(void); void homa_grant_cand_add(struct homa_grant_candidates *cand, struct homa_rpc *rpc); void homa_grant_cand_check(struct homa_grant_candidates *cand, diff --git a/homa_impl.h b/homa_impl.h index 03204873..d8752cc2 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -103,17 +103,10 @@ union sockaddr_in_union { }; /** - * struct homa - Stores overall information about the implementation of - * Homa for a particular network namespace (there is a logcially separate - * implementation of Homa for each namespace). + * struct homa - Stores overall information about the Homa transport, which + * is shared across all Homa sockets and all network namespaces. */ struct homa { - /** @shared: information shared across all struct homas. */ - struct homa_shared *shared; - - /** shared_links: used to link this struct into shared->homas. */ - struct list_head shared_links; - /** * @next_outgoing_id: Id to use for next outgoing RPC request. * This is always even: it's used only to generate client-side ids. 
@@ -135,6 +128,12 @@ struct homa { */ struct homa_pacer *pacer; + /** + * @peers: Info about all the other hosts we have communicated with; + * includes peers from all network namespaces. + */ + struct homa_peertab *peers; + /** * @prev_default_port: The most recent port number assigned from * the range of default ports. @@ -466,23 +465,6 @@ struct homa { */ int next_id; -#ifndef __STRIP__ /* See strip.py */ - /** - * @sysctl_header: Used to remove sysctl values when this structure - * is destroyed. - */ - struct ctl_table_header *sysctl_header; -#endif /* See strip.py */ - - /** - * @timer_kthread: Thread that runs timer code to detect lost - * packets and crashed peers. - */ - struct task_struct *timer_kthread; - - /** @hrtimer: Used to wakeup @timer_kthread at regular intervals. */ - struct hrtimer hrtimer; - /** * @destroyed: True means that this structure is being destroyed * so everyone should clean up. @@ -506,28 +488,15 @@ struct homa { }; /** - * struct homa_shared - Contains "global" information that is shared - * across all instances of struct homa. + * struct homa_net - Contains Homa information that is specific to a + * particular network namespace. */ -struct homa_shared { - /** - * @lock: used when exclusive access is needed, such as when - * updating @homas. - */ - spinlock_t lock; - - /** - * @homas: contains all of the existing struct homas, linked - * through their shared_links fields. Managed with RCU. - */ - struct list_head homas; +struct homa_net { + /** @net: Network namespace corresponding to this structure. */ + struct net *net; - /** - * @peers: Info about all the other hosts we have communicated with; - * includes peers from all struct homas. Dynamically allocated; must - * be kfreed. - */ - struct homa_peertab *peers; + /** @homa: Global Homa information. 
*/ + struct homa *homa; }; /** @@ -724,7 +693,7 @@ int homa_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int homa_hash(struct sock *sk); enum hrtimer_restart homa_hrtimer(struct hrtimer *timer); -int homa_init(struct homa *homa, struct net *net); +int homa_init(struct homa *homa); int homa_ioctl(struct sock *sk, int cmd, int *karg); int homa_load(void); int homa_message_out_fill(struct homa_rpc *rpc, @@ -732,8 +701,11 @@ int homa_message_out_fill(struct homa_rpc *rpc, void homa_message_out_init(struct homa_rpc *rpc, int length); void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, struct homa_rpc *rpc); -int homa_net_init(struct net *net); +void homa_net_destroy(struct homa_net *hnet); void homa_net_exit(struct net *net); +int homa_net_init(struct homa_net *hnet, struct net *net, + struct homa *homa); +int homa_net_start(struct net *net); __poll_t homa_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, @@ -744,9 +716,8 @@ void homa_rpc_handoff(struct homa_rpc *rpc); int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); int homa_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); -struct homa_shared *homa_shared_alloc(void); -void homa_shared_free(struct homa_shared *shared); int homa_shutdown(struct socket *sock, int how); +int homa_socket(struct sock *sk); int homa_softirq(struct sk_buff *skb); void homa_spin(int ns); void homa_timer(struct homa *homa); @@ -794,25 +765,14 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc); #endif /* See strip.py */ /** - * homa_from_net() - Return the struct homa associated with a particular + * homa_net_from_net() - Return the struct homa_net associated with a particular * struct net. - * @net: Get the struct homa for this net namespace. - * Return: see above. - */ -static inline struct homa *homa_from_net(struct net *net) -{ - return (struct homa *)net_generic(net, homa_net_id); -} - -/** - * homa_from_sock() - Return the struct homa associated with a particular - * struct sock. - * @sock: Get the struct homa for this socket. + * @net: Get the Homa data for this net namespace. * Return: see above. 
*/ -static inline struct homa *homa_from_sock(struct sock *sock) +static inline struct homa_net *homa_net_from_net(struct net *net) { - return (struct homa *)net_generic(sock_net(sock), homa_net_id); + return (struct homa_net *)net_generic(net, homa_net_id); } /** @@ -823,7 +783,10 @@ static inline struct homa *homa_from_sock(struct sock *sock) */ static inline struct homa *homa_from_skb(struct sk_buff *skb) { - return (struct homa *)net_generic(dev_net(skb->dev), homa_net_id); + struct homa_net *hnet; + + hnet = net_generic(dev_net(skb->dev), homa_net_id); + return hnet->homa; } /** diff --git a/homa_incoming.c b/homa_incoming.c index 3565d123..29f09b5c 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -891,7 +891,7 @@ void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk) struct homa_peer *peer; int i; - peer = homa_peer_find(hsk->homa, &saddr, &hsk->inet); + peer = homa_peer_find(hsk, &saddr); if (!IS_ERR(peer)) { peer->unsched_cutoffs[0] = INT_MAX; for (i = 1; i < HOMA_MAX_PRIORITIES; i++) @@ -939,7 +939,7 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, #endif /* See strip.py */ goto done; } else { - peer = homa_peer_find(hsk->homa, &saddr, &hsk->inet); + peer = homa_peer_find(hsk, &saddr); if (IS_ERR(peer)) goto done; } diff --git a/homa_outgoing.c b/homa_outgoing.c index 1ba7bc97..167d1f1d 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -571,7 +571,7 @@ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk) unknown.common.urgent = htons(HOMA_TCP_URGENT); #endif /* See strip.py */ unknown.common.sender_id = cpu_to_be64(homa_local_id(h->sender_id)); - peer = homa_peer_find(hsk->homa, &saddr, &hsk->inet); + peer = homa_peer_find(hsk, &saddr); if (!IS_ERR(peer)) __homa_xmit_control(&unknown, sizeof(unknown), peer, hsk); homa_peer_put(peer); diff --git a/homa_pacer.c b/homa_pacer.c index 8f5e997c..ed2ca941 100644 --- a/homa_pacer.c +++ b/homa_pacer.c @@ -50,17 +50,18 @@ static struct ctl_table pacer_ctl_table[] = { * homa_pacer_alloc() - Allocate and initialize a new pacer object, which * will hold pacer-related information for @homa. * @homa: Homa transport that the pacer will be associated with. - * @net: Network namespace that @homa is associated with. * Return: A pointer to the new struct pacer, or a negative errno. 
 */
-struct homa_pacer *homa_pacer_alloc(struct homa *homa, struct net *net)
+struct homa_pacer *homa_pacer_alloc(struct homa *homa)
 {
 	struct homa_pacer *pacer;
 	int err;

 	pacer = kmalloc(sizeof(*pacer), GFP_KERNEL | __GFP_ZERO);
-	if (!pacer)
+	if (!pacer) {
+		pr_err("%s couldn't allocate homa_pacer struct\n", __func__);
 		return ERR_PTR(-ENOMEM);
+	}
 	pacer->homa = homa;
 	spin_lock_init(&pacer->mutex);
 	pacer->fifo_count = 1000;
@@ -82,7 +83,7 @@ struct homa_pacer *homa_pacer_alloc(struct homa *homa, struct net *net)
 	atomic64_set(&pacer->link_idle_time, homa_clock());

 #ifndef __STRIP__ /* See strip.py */
-	pacer->sysctl_header = register_net_sysctl(net, "net/homa",
+	pacer->sysctl_header = register_net_sysctl(&init_net, "net/homa",
 						   pacer_ctl_table);
 	if (!pacer->sysctl_header) {
 		err = -ENOMEM;
@@ -429,10 +430,12 @@ void homa_pacer_update_sysctl_deps(struct homa_pacer *pacer)
 int homa_pacer_dointvec(const struct ctl_table *table, int write,
 			void *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct homa_pacer *pacer = homa_from_net(current->nsproxy->net_ns)->pacer;
+	struct homa_pacer *pacer;
 	struct ctl_table table_copy;
 	int result;

+	pacer = homa_net_from_net(current->nsproxy->net_ns)->homa->pacer;
+
 	/* Generate a new ctl_table that refers to a field in the
 	 * net-specific struct homa.
 	 */
diff --git a/homa_pacer.h b/homa_pacer.h
index 21641bc3..8611908a 100644
--- a/homa_pacer.h
+++ b/homa_pacer.h
@@ -150,7 +150,7 @@ struct homa_pacer {
 	atomic64_t link_idle_time ____cacheline_aligned_in_smp;
 };

-struct homa_pacer *homa_pacer_alloc(struct homa *homa, struct net *net);
+struct homa_pacer *homa_pacer_alloc(struct homa *homa);
 int homa_pacer_check_nic_q(struct homa_pacer *pacer, struct sk_buff *skb,
 			   bool force);
 int homa_pacer_dointvec(const struct ctl_table *table, int write,
diff --git a/homa_peer.c b/homa_peer.c
index 8b9100f5..56adafe2 100644
--- a/homa_peer.c
+++ b/homa_peer.c
@@ -54,10 +54,10 @@ struct homa_peertab *homa_peertab_alloc(void)
- * homa_peertab_free_homa() - Garbage collect all of the peer information
- * associated with a particular struct homa.
- * @homa: Object whose peers should be freed.
+ * homa_peertab_free_net() - Garbage collect all of the peer information
+ * associated with a particular network namespace.
+ * @hnet: Namespace whose peers should be freed.
  */
-void homa_peertab_free_homa(struct homa *homa)
+void homa_peertab_free_net(struct homa_net *hnet)
 {
-	struct homa_peertab *peertab = homa->shared->peers;
+	struct homa_peertab *peertab = hnet->homa->peers;
 	struct rhashtable_iter iter;
 	struct homa_peer *peer;
@@ -69,7 +69,7 @@ void homa_peertab_free_net(struct homa_net *hnet)
 			break;
 		if (IS_ERR(peer))
 			continue;
-		if (peer->ht_key.homa != homa)
+		if (peer->ht_key.hnet != hnet)
 			continue;
 		rhashtable_remove_fast(&peertab->ht, &peer->ht_linkage,
 				       ht_params);
 		homa_peer_free(peer);
@@ -110,17 +110,15 @@ void homa_peertab_free(struct homa_peertab *peertab)

 /**
  * homa_peer_alloc() - Allocate and initialize a new homa_peer object.
- * @homa:    Homa context in which the peer will be used.
+ * @hsk:     Socket for which the peer will be used.
  * @addr:    Address of the desired host: IPv4 addresses are represented
 *           as IPv4-mapped IPv6 addresses.
- * @inet:    Socket that will be used for sending packets.
 * Return:   The peer associated with @addr, or a negative errno if an
 *           error occurred. On a successful return the reference count
 *           will be incremented for the returned peer.
*/ -struct homa_peer *homa_peer_alloc(struct homa *homa, - const struct in6_addr *addr, - struct inet_sock *inet) +struct homa_peer *homa_peer_alloc(struct homa_sock *hsk, + const struct in6_addr *addr) { struct homa_peer *peer; struct dst_entry *dst; @@ -131,10 +129,10 @@ struct homa_peer *homa_peer_alloc(struct homa *homa, return (struct homa_peer *)ERR_PTR(-ENOMEM); } peer->ht_key.addr = *addr; - peer->ht_key.homa = homa; + peer->ht_key.hnet = hsk->hnet; atomic_set(&peer->refs, 1); peer->addr = *addr; - dst = homa_peer_get_dst(peer, inet); + dst = homa_peer_get_dst(peer, hsk); if (IS_ERR(dst)) { INC_METRIC(peer_route_errors, 1); kfree(peer); @@ -185,26 +183,24 @@ void homa_peer_free(struct homa_peer *peer) /** * homa_peer_find() - Returns the peer associated with a given host; creates * a new homa_peer if one doesn't already exist. - * @homa: Homa context in which the peer will be used. + * @hsk: Socket where the peer will be used. * @addr: Address of the desired host: IPv4 addresses are represented * as IPv4-mapped IPv6 addresses. - * @inet: Socket that will be used for sending packets. * * Return: The peer associated with @addr, or a negative errno if an * error occurred. On a successful return the reference count * will be incremented for the returned peer. The caller must * eventually call homa_peer_put to release the reference. */ -struct homa_peer *homa_peer_find(struct homa *homa, - const struct in6_addr *addr, - struct inet_sock *inet) +struct homa_peer *homa_peer_find(struct homa_sock *hsk, + const struct in6_addr *addr) { - struct homa_peertab *peertab = homa->shared->peers; + struct homa_peertab *peertab = hsk->homa->peers; struct homa_peer *peer, *other; struct homa_peer_key key; key.addr = *addr; - key.homa = homa; + key.hnet = hsk->hnet; rcu_read_lock(); peer = rhashtable_lookup(&peertab->ht, &key, ht_params); if (peer) { @@ -214,7 +210,7 @@ struct homa_peer *homa_peer_find(struct homa *homa, } /* No existing entry, so we have to create a new one. */ - peer = homa_peer_alloc(homa, addr, inet); + peer = homa_peer_alloc(hsk, addr); if (IS_ERR(peer)) { rcu_read_unlock(); return peer; @@ -253,7 +249,7 @@ void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, { struct dst_entry *dst; - dst = homa_peer_get_dst(peer, &hsk->inet); + dst = homa_peer_get_dst(peer, hsk); if (IS_ERR(dst)) { #ifndef __STRIP__ /* See strip.py */ /* Retain the existing dst if we can't create a new one. */ @@ -296,46 +292,48 @@ int homa_unsched_priority(struct homa *homa, struct homa_peer *peer, * or IPv6) for a peer. * @peer: The peer for which a dst is needed. Note: this peer's flow * struct will be overwritten. - * @inet: Socket that will be used for sending packets. + * @hsk: Socket that will be used for sending packets. * Return: The dst structure (or an ERR_PTR); a reference has been taken. 
 */
 struct dst_entry *homa_peer_get_dst(struct homa_peer *peer,
-				    struct inet_sock *inet)
+				    struct homa_sock *hsk)
 {
 	memset(&peer->flow, 0, sizeof(peer->flow));
-	if (inet->sk.sk_family == AF_INET) {
+	if (hsk->sock.sk_family == AF_INET) {
 		struct rtable *rt;

-		flowi4_init_output(&peer->flow.u.ip4, inet->sk.sk_bound_dev_if,
-				   inet->sk.sk_mark, inet->tos,
-				   RT_SCOPE_UNIVERSE, inet->sk.sk_protocol, 0,
+		flowi4_init_output(&peer->flow.u.ip4, hsk->sock.sk_bound_dev_if,
+				   hsk->sock.sk_mark, hsk->inet.tos,
+				   RT_SCOPE_UNIVERSE, hsk->sock.sk_protocol, 0,
 				   peer->addr.in6_u.u6_addr32[3],
-				   inet->inet_saddr, 0, 0, inet->sk.sk_uid);
-		security_sk_classify_flow(&inet->sk, &peer->flow.u.__fl_common);
-		rt = ip_route_output_flow(sock_net(&inet->sk),
-					  &peer->flow.u.ip4, &inet->sk);
+				   hsk->inet.inet_saddr, 0, 0,
+				   hsk->sock.sk_uid);
+		security_sk_classify_flow(&hsk->sock,
+					  &peer->flow.u.__fl_common);
+		rt = ip_route_output_flow(sock_net(&hsk->sock),
+					  &peer->flow.u.ip4, &hsk->sock);
 		if (IS_ERR(rt))
 			return (struct dst_entry *)(PTR_ERR(rt));
 		return &rt->dst;
 	}
-	peer->flow.u.ip6.flowi6_oif = inet->sk.sk_bound_dev_if;
+	peer->flow.u.ip6.flowi6_oif = hsk->sock.sk_bound_dev_if;
 	peer->flow.u.ip6.flowi6_iif = LOOPBACK_IFINDEX;
-	peer->flow.u.ip6.flowi6_mark = inet->sk.sk_mark;
+	peer->flow.u.ip6.flowi6_mark = hsk->sock.sk_mark;
 	peer->flow.u.ip6.flowi6_scope = RT_SCOPE_UNIVERSE;
-	peer->flow.u.ip6.flowi6_proto = inet->sk.sk_protocol;
+	peer->flow.u.ip6.flowi6_proto = hsk->sock.sk_protocol;
 	peer->flow.u.ip6.flowi6_flags = 0;
 	peer->flow.u.ip6.flowi6_secid = 0;
 	peer->flow.u.ip6.flowi6_tun_key.tun_id = 0;
-	peer->flow.u.ip6.flowi6_uid = inet->sk.sk_uid;
+	peer->flow.u.ip6.flowi6_uid = hsk->sock.sk_uid;
 	peer->flow.u.ip6.daddr = peer->addr;
-	peer->flow.u.ip6.saddr = inet->pinet6->saddr;
+	peer->flow.u.ip6.saddr = hsk->inet.pinet6->saddr;
 	peer->flow.u.ip6.fl6_dport = 0;
 	peer->flow.u.ip6.fl6_sport = 0;
 	peer->flow.u.ip6.mp_hash = 0;
-	peer->flow.u.ip6.__fl_common.flowic_tos = inet->tos;
-	peer->flow.u.ip6.flowlabel = ip6_make_flowinfo(inet->tos, 0);
-	security_sk_classify_flow(&inet->sk, &peer->flow.u.__fl_common);
-	return ip6_dst_lookup_flow(sock_net(&inet->sk), &inet->sk,
+	peer->flow.u.ip6.__fl_common.flowic_tos = hsk->inet.tos;
+	peer->flow.u.ip6.flowlabel = ip6_make_flowinfo(hsk->inet.tos, 0);
+	security_sk_classify_flow(&hsk->sock, &peer->flow.u.__fl_common);
+	return ip6_dst_lookup_flow(sock_net(&hsk->sock), &hsk->sock,
 				   &peer->flow.u.ip6, NULL);
 }

diff --git a/homa_peer.h b/homa_peer.h
index 0bf3f9cf..15586b16 100644
--- a/homa_peer.h
+++ b/homa_peer.h
@@ -39,8 +39,8 @@ struct homa_peer_key {
 	 */
 	struct in6_addr addr;

-	/** @homa: The context in which the peer will be used. */
-	struct homa *homa;
+	/** @hnet: The network namespace in which this peer is valid.
*/ + struct homa_net *hnet; }; /** @@ -189,20 +189,18 @@ void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peertab *homa_peertab_alloc(void); void homa_peertab_free(struct homa_peertab *peertab); -void homa_peertab_free_homa(struct homa *homa); +void homa_peertab_free_net(struct homa_net *hnet); void homa_peertab_free_fn(void *object, void *dummy); void homa_peer_add_ack(struct homa_rpc *rpc); struct homa_peer - *homa_peer_alloc(struct homa *homa, const struct in6_addr *addr, - struct inet_sock *inet); + *homa_peer_alloc(struct homa_sock *hsk, const struct in6_addr *addr); struct homa_peer - *homa_peer_find(struct homa *homa, const struct in6_addr *addr, - struct inet_sock *inet); + *homa_peer_find(struct homa_sock *hsk, const struct in6_addr *addr); void homa_peer_free(struct homa_peer *peer); int homa_peer_get_acks(struct homa_peer *peer, int count, struct homa_ack *dst); struct dst_entry - *homa_peer_get_dst(struct homa_peer *peer, struct inet_sock *inet); + *homa_peer_get_dst(struct homa_peer *peer, struct homa_sock *hsk); #ifndef __STRIP__ /* See strip.py */ void homa_peer_lock_slow(struct homa_peer *peer); void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, @@ -255,7 +253,7 @@ static inline struct dst_entry *homa_get_dst(struct homa_peer *peer, struct homa_sock *hsk) { if (unlikely(peer->dst->obsolete > 0)) - homa_dst_refresh(hsk->homa->shared->peers, peer, hsk); + homa_dst_refresh(hsk->homa->peers, peer, hsk); dst_hold(peer->dst); return peer->dst; } @@ -322,7 +320,7 @@ static inline int homa_peer_compare(struct rhashtable_compare_arg *arg, const struct homa_peer_key *key = arg->key; return !ipv6_addr_equal(&key->addr, &peer->ht_key.addr) && - peer->ht_key.homa == key->homa; + peer->ht_key.hnet == key->hnet; } #endif /* _HOMA_PEER_H */ diff --git a/homa_plumbing.c b/homa_plumbing.c index 31b3b1ed..d83d6260 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -20,12 +20,27 @@ unsigned int homa_net_id; * pernet subsystem. */ static struct pernet_operations homa_net_ops = { - .init = homa_net_init, + .init = homa_net_start, .exit = homa_net_exit, .id = &homa_net_id, - .size = sizeof(struct homa) + .size = sizeof(struct homa_net) }; +/* Global data for Homa. Never reference homa_data directly. Always use + * the global_homa variable instead (or, even better, a homa pointer + * stored in a struct or passed via a parameter); this allows overriding + * during unit tests. + */ +static struct homa homa_data; + +/* This variable contains the address of the statically-allocated struct homa + * used throughout Homa. This variable should almost never be used directly: + * it should be passed as a parameter to functions that need it. This + * variable is used only by a few functions called from Linux where there + * is no struct homa* available. + */ +struct homa *global_homa = &homa_data; + /* This structure defines functions that handle various operations on * Homa sockets. These functions are relatively generic: they are called * to implement top-level system calls. Many of these operations can @@ -416,14 +431,28 @@ static __u16 header_lengths[] = { }; #endif /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ +/* Used to remove sysctl values when the module is unloaded. */ +static struct ctl_table_header *homa_ctl_header; +#endif /* See strip.py */ + +/* Thread that runs timer code to detect lost packets and crashed peers. 
+ */
+static struct task_struct *timer_kthread;
 static DECLARE_COMPLETION(timer_thread_done);
 
+/* Used to wake up timer_kthread at regular intervals. */
+static struct hrtimer hrtimer;
+
+/* Nonzero indicates that the timer thread should exit. */
+static int timer_thread_exit;
+
 /**
  * homa_load() - invoked when this module is loaded into the Linux kernel
  * Return: 0 on success, otherwise a negative errno.
  */
 int __init homa_load(void)
 {
+	struct homa *homa = global_homa;
 	int status;
 
 	pr_err("Homa module loading\n");
@@ -476,12 +505,23 @@ int __init homa_load(void)
 			status);
 		goto add_protocol_v6_err;
 	}
+	status = homa_init(homa);
+	if (status)
+		goto homa_init_err;
 #ifndef __STRIP__ /* See strip.py */
 	status = homa_metrics_init();
 	if (status != 0)
 		goto metrics_err;
+	homa_ctl_header = register_net_sysctl(&init_net, "net/homa",
+					      homa_ctl_table);
+	if (!homa_ctl_header) {
+		pr_err("couldn't register Homa sysctl parameters\n");
+		status = -ENOMEM;
+		goto sysctl_err;
+	}
+
 	status = homa_offload_init();
 	if (status != 0) {
 		pr_err("Homa couldn't init offloads\n");
@@ -495,23 +535,38 @@ int __init homa_load(void)
 			status);
 		goto net_err;
 	}
+	timer_kthread = kthread_run(homa_timer_main, homa, "homa_timer");
+	if (IS_ERR(timer_kthread)) {
+		status = PTR_ERR(timer_kthread);
+		pr_err("couldn't create Homa timer thread: error %d\n",
+		       status);
+		timer_kthread = NULL;
+		goto timer_err;
+	}
 #ifndef __STRIP__ /* See strip.py */
 	homa_gro_hook_tcp();
 #endif /* See strip.py */
 #ifndef __UPSTREAM__ /* See strip.py */
 	tt_init("timetrace");
+	tt_set_temp(homa->temp);
 #endif /* See strip.py */
 	return 0;
 
+timer_err:
+	unregister_pernet_subsys(&homa_net_ops);
 net_err:
 #ifndef __STRIP__ /* See strip.py */
 	homa_offload_end();
 offload_err:
+	unregister_net_sysctl_table(homa_ctl_header);
+sysctl_err:
 	homa_metrics_end();
 metrics_err:
 #endif /* See strip.py */
+	homa_destroy(homa);
+homa_init_err:
 	inet6_del_protocol(&homav6_protocol, IPPROTO_HOMA);
 add_protocol_v6_err:
 	inet_del_protocol(&homa_protocol, IPPROTO_HOMA);
@@ -531,19 +586,27 @@ int __init homa_load(void)
  */
 void __exit homa_unload(void)
 {
-	pr_notice("Homa module unloading\n");
+	struct homa *homa = global_homa;
 
-	unregister_pernet_subsys(&homa_net_ops);
+	pr_notice("Homa module unloading\n");
 #ifndef __UPSTREAM__ /* See strip.py */
 	tt_destroy();
 #endif /* See strip.py */
 #ifndef __STRIP__ /* See strip.py */
 	homa_gro_unhook_tcp();
+	if (timer_kthread) {
+		timer_thread_exit = 1;
+		wake_up_process(timer_kthread);
+		wait_for_completion(&timer_thread_done);
+	}
+	unregister_pernet_subsys(&homa_net_ops);
 	if (homa_offload_end() != 0)
 		pr_err("Homa couldn't stop offloads\n");
+	unregister_net_sysctl_table(homa_ctl_header);
 	homa_metrics_end();
 #endif /* See strip.py */
+	homa_destroy(homa);
 	inet_del_protocol(&homa_protocol, IPPROTO_HOMA);
 	inet_unregister_protosw(&homa_protosw);
 	inet6_del_protocol(&homav6_protocol, IPPROTO_HOMA);
@@ -556,75 +619,25 @@ module_init(homa_load);
 module_exit(homa_unload);
 
 /**
- * homa_net_init() - Initialize a new struct homa as a per-net subsystem.
+ * homa_net_start() - Initialize Homa for a new network namespace.
  * @net: The net that Homa will be associated with.
  * Return: 0 on success, otherwise a negative errno.
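(Aside: the new timer_err/sysctl_err/homa_init_err labels above follow the kernel's usual unwind idiom: each failure jumps to the label that tears down everything initialized so far, in reverse order of setup. A generic skeleton of the pattern; setup_a/setup_b/teardown_a are placeholders, not Homa functions:

	int err;

	err = setup_a();
	if (err)
		goto out;
	err = setup_b();
	if (err)
		goto undo_a;	/* b failed: roll back a only */
	return 0;

undo_a:
	teardown_a();
out:
	return err;

End of aside.)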
*/ -int homa_net_init(struct net *net) +int homa_net_start(struct net *net) { - struct homa *homa = homa_from_net(net); - int status; - pr_notice("Homa attaching to net namespace\n"); - - status = homa_init(homa, net); - if (status) - goto homa_init_err; -#ifndef __STRIP__ /* See strip.py */ - - homa->sysctl_header = register_net_sysctl(net, "net/homa", - homa_ctl_table); - if (!homa->sysctl_header) { - pr_err("couldn't register Homa sysctl parameters\n"); - status = -ENOMEM; - goto sysctl_err; - } -#endif /* See strip.py */ - - homa->timer_kthread = kthread_run(homa_timer_main, homa, "homa_timer"); - if (IS_ERR(homa->timer_kthread)) { - status = PTR_ERR(homa->timer_kthread); - pr_err("couldn't create homa timer thread: error %d\n", - status); - homa->timer_kthread = NULL; - goto timer_err; - } - -#ifndef __UPSTREAM__ /* See strip.py */ - tt_set_temp(homa->temp); -#endif /* See strip.py */ - return 0; - -timer_err: -#ifndef __STRIP__ /* See strip.py */ - unregister_net_sysctl_table(homa->sysctl_header); -sysctl_err: -#endif /* See strip.py */ - homa_destroy(homa); -homa_init_err: - return status; + return homa_net_init(homa_net_from_net(net), net, global_homa); } /** - * homa_net_exit() - Remove Homa from a net. + * homa_net_exit() - Perform Homa cleanup needed when a network namespace + * is destroyed. * @net: The net from which Homa should be removed. */ void homa_net_exit(struct net *net) { - struct homa *homa = homa_from_net(net); - pr_notice("Homa detaching from net namespace\n"); - - homa->destroyed = true; - if (homa->timer_kthread) - wake_up_process(homa->timer_kthread); - wait_for_completion(&timer_thread_done); - -#ifndef __STRIP__ /* See strip.py */ - if (homa->sysctl_header) - unregister_net_sysctl_table(homa->sysctl_header); -#endif /* See strip.py */ - homa_destroy(homa); + homa_net_destroy(homa_net_from_net(net)); } /** @@ -772,10 +785,9 @@ int homa_ioctl(struct sock *sk, int cmd, int *karg) int homa_socket(struct sock *sk) { struct homa_sock *hsk = homa_sk(sk); - struct homa *homa = homa_from_sock(sk); int result; - result = homa_sock_init(hsk, homa); + result = homa_sock_init(hsk); if (result != 0) homa_sock_destroy(hsk); return result; @@ -1537,7 +1549,7 @@ __poll_t homa_poll(struct file *file, struct socket *sock, int homa_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct homa *homa = homa_from_net(current->nsproxy->net_ns); + struct homa *homa = homa_net_from_net(current->nsproxy->net_ns)->homa; struct ctl_table table_copy; int result; @@ -1590,7 +1602,7 @@ int homa_dointvec(const struct ctl_table *table, int write, } else if (homa->sysctl_action == 7) { homa_rpc_log_active_tt(homa, 0); tt_record("Freezing cluster because of action 7"); - homa_freeze_peers(homa); + homa_freeze_peers(); tt_record("Finished freezing cluster"); tt_freeze(); } else if (homa->sysctl_action == 8) { @@ -1691,10 +1703,7 @@ int homa_sysctl_softirq_cores(const struct ctl_table *table, int write, */ enum hrtimer_restart homa_hrtimer(struct hrtimer *timer) { - struct homa *homa; - - homa = container_of(timer, struct homa, hrtimer); - wake_up_process(homa->timer_kthread); + wake_up_process(timer_kthread); return HRTIMER_NORESTART; } @@ -1711,27 +1720,27 @@ int homa_timer_main(void *transport) u64 nsec; #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 15, 0) - hrtimer_init(&homa->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - homa->hrtimer.function = &homa_hrtimer; + hrtimer_init(&hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer.function = 
&homa_hrtimer; #else - hrtimer_setup(&homa->hrtimer, homa_hrtimer, CLOCK_MONOTONIC, + hrtimer_setup(&hrtimer, homa_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); #endif nsec = 1000000; /* 1 ms */ tick_interval = ns_to_ktime(nsec); while (1) { set_current_state(TASK_UNINTERRUPTIBLE); - if (!homa->destroyed) { - hrtimer_start(&homa->hrtimer, tick_interval, + if (!timer_thread_exit) { + hrtimer_start(&hrtimer, tick_interval, HRTIMER_MODE_REL); schedule(); } __set_current_state(TASK_RUNNING); - if (homa->destroyed) + if (timer_thread_exit) break; homa_timer(homa); } - hrtimer_cancel(&homa->hrtimer); + hrtimer_cancel(&hrtimer); kthread_complete_and_exit(&timer_thread_done, 0); return 0; } diff --git a/homa_rpc.c b/homa_rpc.c index 06ccb998..3b681b2e 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -46,8 +46,7 @@ struct homa_rpc *homa_rpc_alloc_client(struct homa_sock *hsk, bucket = homa_client_rpc_bucket(hsk, crpc->id); crpc->bucket = bucket; crpc->state = RPC_OUTGOING; - crpc->peer = homa_peer_find(hsk->homa, &dest_addr_as_ipv6, - &hsk->inet); + crpc->peer = homa_peer_find(hsk, &dest_addr_as_ipv6); if (IS_ERR(crpc->peer)) { tt_record("error in homa_peer_find"); err = PTR_ERR(crpc->peer); @@ -151,7 +150,7 @@ struct homa_rpc *homa_rpc_alloc_server(struct homa_sock *hsk, srpc->hsk = hsk; srpc->bucket = bucket; srpc->state = RPC_INCOMING; - srpc->peer = homa_peer_find(hsk->homa, source, &hsk->inet); + srpc->peer = homa_peer_find(hsk, source); if (IS_ERR(srpc->peer)) { err = PTR_ERR(srpc->peer); srpc->peer = NULL; diff --git a/homa_sock.c b/homa_sock.c index 74c4ef40..0a83be88 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -126,23 +126,34 @@ void homa_socktab_end_scan(struct homa_socktab_scan *scan) /** * homa_sock_init() - Constructor for homa_sock objects. This function * initializes only the parts of the socket that are owned by Homa. - * @hsk: Object to initialize. - * @homa: Homa implementation that will manage the socket. + * @hsk: Object to initialize. The Homa-specific parts must have been + * initialized to zeroes by the caller. * * Return: 0 for success, otherwise a negative errno. */ -int homa_sock_init(struct homa_sock *hsk, struct homa *homa) +int homa_sock_init(struct homa_sock *hsk) { - struct homa_socktab *socktab = homa->port_map; struct homa_pool *buffer_pool; + struct homa_socktab *socktab; struct homa_sock *other; + struct homa_net *hnet; + struct homa *homa; int starting_port; int result = 0; int i; + hnet = (struct homa_net *)net_generic(sock_net(&hsk->sock), + homa_net_id); + homa = hnet->homa; + socktab = homa->port_map; + /* Initialize fields outside the Homa part. */ hsk->sock.sk_sndbuf = homa->wmem_max; sock_set_flag(&hsk->inet.sk, SOCK_RCU_FREE); +#ifndef __STRIP__ /* See strip.py */ + if (homa->hijack_tcp) + hsk->sock.sk_protocol = IPPROTO_TCP; +#endif /* See strip.py */ /* Do things requiring memory allocation before locking the socket, * so that GFP_ATOMIC is not needed. @@ -152,14 +163,15 @@ int homa_sock_init(struct homa_sock *hsk, struct homa *homa) return PTR_ERR(buffer_pool); /* Initialize Homa-specific fields. */ - spin_lock_bh(&socktab->write_lock); - spin_lock_init(&hsk->lock); - atomic_set(&hsk->protect_count, 0); hsk->homa = homa; - hsk->ip_header_length = (hsk->inet.sk.sk_family == AF_INET) - ? sizeof(struct iphdr) : sizeof(struct ipv6hdr); - hsk->is_server = false; - hsk->shutdown = false; + hsk->hnet = hnet; + hsk->buffer_pool = buffer_pool; + + /* Pick a default port. 
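(Aside: the timer thread is stopped with a flag/wakeup/completion handshake rather than kthread_stop(). Condensed from the hunks above, with homa_unload() on one side of the handshake and the end of homa_timer_main() on the other:

	/* Unloading side (homa_unload): */
	timer_thread_exit = 1;
	wake_up_process(timer_kthread);
	wait_for_completion(&timer_thread_done);

	/* Thread side, after leaving the loop in homa_timer_main(): */
	hrtimer_cancel(&hrtimer);
	kthread_complete_and_exit(&timer_thread_done, 0);

End of aside.)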
Must keep the socktab locked from now
+	 * until the new socket is added to the socktab, to ensure that
+	 * no other socket chooses the same port.
+	 */
+	spin_lock_bh(&socktab->write_lock);
 	starting_port = homa->prev_default_port;
 	while (1) {
 		homa->prev_default_port++;
@@ -179,8 +191,13 @@ int homa_sock_init(struct homa_sock *hsk, struct homa *homa)
 	hsk->port = homa->prev_default_port;
 	hsk->inet.inet_num = hsk->port;
 	hsk->inet.inet_sport = htons(hsk->port);
-	hlist_add_head_rcu(&hsk->socktab_links,
-			   &socktab->buckets[homa_port_hash(hsk->port)]);
+
+	hsk->is_server = false;
+	hsk->shutdown = false;
+	hsk->ip_header_length = (hsk->inet.sk.sk_family == AF_INET) ?
+				sizeof(struct iphdr) : sizeof(struct ipv6hdr);
+	spin_lock_init(&hsk->lock);
+	atomic_set(&hsk->protect_count, 0);
 	INIT_LIST_HEAD(&hsk->active_rpcs);
 	INIT_LIST_HEAD(&hsk->dead_rpcs);
 	hsk->dead_skbs = 0;
@@ -201,11 +218,10 @@ int homa_sock_init(struct homa_sock *hsk, struct homa *homa)
 		bucket->id = i + 1000000;
 		INIT_HLIST_HEAD(&bucket->rpcs);
 	}
-	hsk->buffer_pool = buffer_pool;
-#ifndef __STRIP__ /* See strip.py */
-	if (homa->hijack_tcp)
-		hsk->sock.sk_protocol = IPPROTO_TCP;
-#endif /* See strip.py */
+
+	/* Link the socket into the port map. */
+	hlist_add_head_rcu(&hsk->socktab_links,
+			   &socktab->buckets[homa_port_hash(hsk->port)]);
 	spin_unlock_bh(&socktab->write_lock);
 
 	return result;
diff --git a/homa_sock.h b/homa_sock.h
index 4f78ba94..8d54e390 100644
--- a/homa_sock.h
+++ b/homa_sock.h
@@ -122,26 +122,28 @@ struct homa_sock {
 	};
 
 	/**
-	 * @lock: Must be held when modifying fields such as interests
-	 * and lists of RPCs. This lock is used in place of sk->sk_lock
-	 * because it's used differently (it's always used as a simple
-	 * spin lock). See sync.txt for more on Homa's synchronization
-	 * strategy.
+	 * @homa: Overall state about the Homa implementation. NULL
+	 * means this socket was never initialized or has been deleted.
 	 */
-	spinlock_t lock;
+	struct homa *homa;
 
 	/**
-	 * @protect_count: counts the number of calls to homa_protect_rpcs
-	 * for which there have not yet been calls to homa_unprotect_rpcs.
-	 * See sync.txt for more info.
+	 * @hnet: Overall state specific to the network namespace for
+	 * this socket.
 	 */
-	atomic_t protect_count;
+	struct homa_net *hnet;
 
 	/**
-	 * @homa: Overall state about the Homa implementation. NULL
-	 * means this socket was never initialized or has been deleted.
+	 * @buffer_pool: used to allocate buffer space for incoming messages.
+	 * Storage is dynamically allocated.
 	 */
-	struct homa *homa;
+	struct homa_pool *buffer_pool;
+
+	/**
+	 * @port: Port number: identifies this socket uniquely among all
+	 * those on this node.
+	 */
+	__u16 port;
 
 	/**
 	 * @is_server: True means that this socket can act as both client
@@ -156,12 +158,6 @@ struct homa_sock {
 	 */
 	bool shutdown;
 
-	/**
-	 * @port: Port number: identifies this socket uniquely among all
-	 * those on this node.
-	 */
-	__u16 port;
-
 	/**
 	 * @ip_header_length: Length of IP headers for this socket (depends
 	 * on IPv4 vs. IPv6).
@@ -171,6 +167,26 @@ struct homa_sock {
 	/** @socktab_links: Links this socket into a homa_socktab bucket. */
 	struct hlist_node socktab_links;
 
+	/* Information above is (almost) never modified; start a new
+	 * cache line below for info that is modified frequently.
+	 */
+
+	/**
+	 * @lock: Must be held when modifying fields such as interests
+	 * and lists of RPCs. This lock is used in place of sk->sk_lock
+	 * because it's used differently (it's always used as a simple
+	 * spin lock).
See sync.txt for more on Homa's synchronization + * strategy. + */ + spinlock_t lock ____cacheline_aligned_in_smp; + + /** + * @protect_count: counts the number of calls to homa_protect_rpcs + * for which there have not yet been calls to homa_unprotect_rpcs. + * See sync.txt for more info. + */ + atomic_t protect_count; + /** * @active_rpcs: List of all existing RPCs related to this socket, * including both client and server RPCs. This list isn't strictly @@ -223,12 +239,6 @@ struct homa_sock { * the socket lock. */ struct homa_rpc_bucket server_rpc_buckets[HOMA_SERVER_RPC_BUCKETS]; - - /** - * @buffer_pool: used to allocate buffer space for incoming messages. - * Storage is dynamically allocated. - */ - struct homa_pool *buffer_pool; }; /** @@ -251,11 +261,10 @@ int homa_sock_bind(struct homa_socktab *socktab, struct homa_sock *hsk, __u16 port); void homa_sock_destroy(struct homa_sock *hsk); struct homa_sock *homa_sock_find(struct homa_socktab *socktab, __u16 port); -int homa_sock_init(struct homa_sock *hsk, struct homa *homa); +int homa_sock_init(struct homa_sock *hsk); void homa_sock_shutdown(struct homa_sock *hsk); void homa_sock_unlink(struct homa_sock *hsk); int homa_sock_wait_wmem(struct homa_sock *hsk, int nonblocking); -int homa_socket(struct sock *sk); void homa_socktab_destroy(struct homa_socktab *socktab); void homa_socktab_end_scan(struct homa_socktab_scan *scan); void homa_socktab_init(struct homa_socktab *socktab); diff --git a/homa_timer.c b/homa_timer.c index 1159b7b2..71f25c56 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -99,7 +99,7 @@ void homa_timer_check_rpc(struct homa_rpc *rpc) #if 0 homa_rpc_log_active_tt(homa, 0); tt_record1("Freezing because of RPC abort (id %d)", rpc->id); - homa_freeze_peers(homa); + homa_freeze_peers(); tt_freeze(); #endif if (homa->verbose) @@ -215,7 +215,7 @@ void homa_timer(struct homa *homa) pr_err("%s found no grants going out\n", __func__); homa_rpc_log_active_tt(homa, 0); tt_record("freezing because no grants are going out"); - homa_freeze_peers(homa); + homa_freeze_peers(); tt_freeze(); } } else { diff --git a/homa_utils.c b/homa_utils.c index e8fd44cd..43a4e51c 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -17,56 +17,15 @@ #include "homa_stub.h" #endif /* See strip.py */ -/* Pointer to the singleton homa_shared object, of NULL if there are - * currently no struct homa objects in existence. - */ -struct homa_shared *homa_shared; - -/** - * homa_shared_alloc() - Allocate and initialize a new homa_shared - * object. - * Return: the new homa_shared object, or ERR_PTR on failure. - */ -struct homa_shared *homa_shared_alloc(void) -{ - struct homa_shared *shared; - int err; - - shared = kmalloc(sizeof(*homa_shared), GFP_KERNEL); - if (!shared) - return ERR_PTR(-ENOMEM); - spin_lock_init(&shared->lock); - INIT_LIST_HEAD(&shared->homas); - shared->peers = homa_peertab_alloc(); - if (IS_ERR(shared->peers)) { - err = PTR_ERR(shared->peers); - kfree(shared); - return ERR_PTR(err); - } - return shared; -} - -/** - * homa_shared_free() - Clean up and free a homa_shared object. - */ -void homa_shared_free(struct homa_shared *shared) -{ - homa_peertab_free(shared->peers); - kfree(shared); - if (shared == homa_shared) - homa_shared = NULL; -} - /** * homa_init() - Constructor for homa objects. * @homa: Object to initialize. - * @net: Network namespace that @homa is associated with. * * Return: 0 on success, or a negative errno if there was an error. Even * if an error occurs, it is safe (and necessary) to call * homa_destroy at some point. 
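(Aside: the reshuffled homa_sock layout above puts read-mostly fields first and starts the frequently written fields on a fresh cache line; ____cacheline_aligned_in_smp is what creates the boundary. In miniature, on an illustrative struct that is not Homa's:

struct example {
	int config;	/* read-mostly: sharing a cache line is fine */

	/* Hot fields below get their own cache line, so writers do not
	 * invalidate the read-mostly line above (false sharing).
	 */
	spinlock_t lock ____cacheline_aligned_in_smp;
	int counter;	/* modified frequently, under lock */
};

End of aside.)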
 */
-int homa_init(struct homa *homa, struct net *net)
+int homa_init(struct homa *homa)
 {
 	int err;
 #ifndef __STRIP__ /* See strip.py */
@@ -78,36 +37,27 @@ int homa_init(struct homa *homa, struct net *net)
 	memset(homa, 0, sizeof(*homa));
 
-	if (!homa_shared) {
-		homa_shared = homa_shared_alloc();
-		if (IS_ERR(homa_shared)) {
-			int status = PTR_ERR(homa_shared);
-
-			homa_shared = NULL;
-			return status;
-		}
-	}
-	homa->shared = homa_shared;
-	spin_lock_bh(&homa_shared->lock);
-	list_add_tail(&homa->shared_links, &homa_shared->homas);
-	spin_unlock_bh(&homa_shared->lock);
-
 	atomic64_set(&homa->next_outgoing_id, 2);
 #ifndef __STRIP__ /* See strip.py */
-	homa->grant = homa_grant_alloc(net);
+	homa->grant = homa_grant_alloc();
 	if (IS_ERR(homa->grant)) {
 		err = PTR_ERR(homa->grant);
 		homa->grant = NULL;
 		return err;
 	}
 #endif /* See strip.py */
-	homa->pacer = homa_pacer_alloc(homa, net);
+	homa->pacer = homa_pacer_alloc(homa);
 	if (IS_ERR(homa->pacer)) {
 		err = PTR_ERR(homa->pacer);
 		homa->pacer = NULL;
 		return err;
 	}
-	homa->prev_default_port = HOMA_MIN_DEFAULT_PORT - 1;
+	homa->peers = homa_peertab_alloc();
+	if (IS_ERR(homa->peers)) {
+		err = PTR_ERR(homa->peers);
+		homa->peers = NULL;
+		return err;
+	}
 	homa->port_map = kmalloc(sizeof(*homa->port_map), GFP_KERNEL);
 	if (!homa->port_map) {
 		pr_err("%s couldn't create port_map: kmalloc failure",
@@ -177,12 +127,6 @@ int homa_init(struct homa *homa, struct net *net)
  */
 void homa_destroy(struct homa *homa)
 {
-	struct homa_shared *shared;
-
-	if (!homa_shared)
-		/* Already destroyed. */
-		return;
-
 #ifdef __UNIT_TEST__
 #include "utils.h"
 	unit_homa_destroy(homa);
@@ -204,21 +148,39 @@ void homa_destroy(struct homa *homa)
 		homa_pacer_free(homa->pacer);
 		homa->pacer = NULL;
 	}
+	if (homa->peers) {
+		homa_peertab_free(homa->peers);
+		homa->peers = NULL;
+	}
 #ifndef __STRIP__ /* See strip.py */
+	homa_skb_cleanup(homa);
 #endif /* See strip.py */
-	homa_peertab_free_homa(homa);
+}
 
-	shared = homa->shared;
-	spin_lock_bh(&shared->lock);
-	__list_del_entry(&homa->shared_links);
-	if (list_empty(&homa->shared->homas)) {
-		spin_unlock_bh(&shared->lock);
-		homa_shared_free(homa->shared);
-	} else {
-		spin_unlock_bh(&shared->lock);
-	}
-	homa->shared = NULL;
+/**
+ * homa_net_init() - Initialize a new struct homa_net as a per-net subsystem.
+ * @hnet:  Struct to initialize.
+ * @net:   The network namespace the struct will be associated with.
+ * @homa:  The main Homa data structure to use for the net.
+ * Return: 0 on success, otherwise a negative errno.
+ */
+int homa_net_init(struct homa_net *hnet, struct net *net, struct homa *homa)
+{
+	memset(hnet, 0, sizeof(*hnet));
+	hnet->net = net;
+	hnet->homa = homa;
+	return 0;
+}
+
+/**
+ * homa_net_destroy() - Release any resources associated with a homa_net.
+ * @hnet:  Object to destroy; must not be used again after this function
+ *         returns.
+ */
+void homa_net_destroy(struct homa_net *hnet)
+{
+	homa_peertab_free_net(hnet);
 }
 
 #ifndef __STRIP__ /* See strip.py */
diff --git a/test/mock.c b/test/mock.c
index c499b844..9aab5db0 100644
--- a/test/mock.c
+++ b/test/mock.c
@@ -225,11 +225,10 @@ __u16 mock_min_default_port = 0x8000;
 /* Used as sk_socket for all sockets created by mock_sock_init. */
 static struct socket mock_socket;
 
-/* Will be used as the struct homa for functions such as homa_from_net
- * and homa_from_sock.
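(Aside: homa_net_init()/homa_net_destroy() above are deliberately tiny: a homa_net is just a (net, homa) binding whose only owned resource is its slice of the peer table. One helper this patch uses but does not define in these hunks is homa_net_from_net(); presumably it is a thin wrapper over net_generic(), mirroring the lookup that homa_sock_init() performs inline. A sketch, not the actual definition:

static inline struct homa_net *homa_net_from_net(struct net *net)
{
	/* homa_net_id was assigned by register_pernet_subsys(). */
	return (struct homa_net *)net_generic(net, homa_net_id);
}

End of aside.)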
- */
-static struct homa *mock_homa;
-struct net mock_net;
+#define MOCK_MAX_NETS 10
+static struct net mock_nets[MOCK_MAX_NETS];
+static struct homa_net mock_hnets[MOCK_MAX_NETS];
+static int mock_num_hnets;
 
 /* Nonzero means don't generate a unit test failure when freeing peers
  * if the reference count isn't zero (log a message instead).
@@ -242,7 +241,7 @@ struct net_device mock_net_device = {
 	.gso_max_segs = 1000,
 	.gso_max_size = 0,
 	._tx = &mock_net_queue,
-	.nd_net = {.net = &mock_net}
+	.nd_net = {.net = &mock_nets[0]}
 };
 const struct net_offload *inet_offloads[MAX_INET_PROTOS];
 const struct net_offload *inet6_offloads[MAX_INET_PROTOS];
@@ -1079,6 +1078,12 @@ int _printk(const char *format, ...)
 		available -= 2;
 	}
 	va_start(ap, format);
+
+	/* Skip initial characters of format that are used to
+	 * indicate priority.
+	 */
+	if (format[0] == 1)
+		format += 2;
 	vsnprintf(mock_printk_output + len, available, format, ap);
 	va_end(ap);
 
@@ -1545,6 +1550,26 @@ int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode,
 	return 0;
 }
 
+/**
+ * mock_alloc_hnet() - Allocate a new struct homa_net.
+ * @homa:   struct homa that the homa_net will be associated with.
+ * Return:  The new homa_net.
+ */
+struct homa_net *mock_alloc_hnet(struct homa *homa)
+{
+	struct homa_net *hnet;
+
+	if (mock_num_hnets >= MOCK_MAX_NETS) {
+		FAIL("Max number of network namespaces (%d) exceeded",
+		     MOCK_MAX_NETS);
+		return &mock_hnets[0];
+	}
+	hnet = &mock_hnets[mock_num_hnets];
+	homa_net_init(hnet, &mock_nets[mock_num_hnets], homa);
+	mock_num_hnets++;
+	return hnet;
+}
+
 /**
  * mock_alloc_pages() - Called instead of alloc_pages when Homa is compiled
  * for unit testing.
@@ -1662,8 +1687,16 @@ void mock_get_page(struct page *page)
 
 void *mock_net_generic(const struct net *net, unsigned int id)
 {
-	if (id == homa_net_id)
-		return mock_homa;
+	struct homa_net *hnet;
+	int i;
+
+	if (id != homa_net_id)
+		return NULL;
+	for (i = 0; i < MOCK_MAX_NETS; i++) {
+		hnet = &mock_hnets[i];
+		if (hnet->net == net)
+			return hnet;
+	}
 	return NULL;
 }
 
@@ -1790,16 +1823,6 @@ void mock_rpc_put(struct homa_rpc *rpc)
 	atomic_dec(&rpc->refs);
 }
 
-/**
- * mock_set_homa() - Arrange for a particular struct homa to be used in
- * tests (e.g., it will be discovered by homa_from_net etc.).
- */
-void mock_set_homa(struct homa *homa)
-{
-	mock_homa = homa;
-	homa_net_id = 167;
-}
-
 /**
  * mock_set_clock_vals() - Specify one or more clock values to be returned
  * by the next calls to homa_clock(). The list of arguments must be
@@ -1983,24 +2006,27 @@ void mock_sock_put(struct sock *sk)
 /**
  * mock_sock_init() - Constructor for sockets; initializes the Homa-specific
  * part, and mocks out the non-Homa-specific parts.
- * @hsk:       Storage area to be initialized.\
- * @homa:      Overall information about the Homa protocol.
+ * @hsk:       Storage area to be initialized.
+ * @hnet:      Network namespace for the socket.
  * @port:      Port number to use for the socket, or 0 to
  *             use default.
  * Return: 0 for success, otherwise a negative errno.
  */
-int mock_sock_init(struct homa_sock *hsk, struct homa *homa, int port)
+int mock_sock_init(struct homa_sock *hsk, struct homa_net *hnet, int port)
 {
-	int saved_port = homa->prev_default_port;
 	static struct ipv6_pinfo hsk_pinfo;
 	struct sock *sk = &hsk->sock;
+	struct homa *homa;
+	int saved_port;
 	int err = 0;
 
+	homa = hnet->homa;
+	saved_port = homa->prev_default_port;
 	memset(hsk, 0, sizeof(*hsk));
 	sk->sk_data_ready = mock_data_ready;
 	sk->sk_family = mock_ipv6 ?
AF_INET6 : AF_INET; sk->sk_socket = &mock_socket; - sk->sk_net.net = &mock_net; + sk->sk_net.net = hnet->net; memset(&mock_socket, 0, sizeof(mock_socket)); refcount_set(&sk->sk_wmem_alloc, 1); init_waitqueue_head(&mock_socket.wq.wait); @@ -2008,7 +2034,7 @@ int mock_sock_init(struct homa_sock *hsk, struct homa *homa, int port) sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; if (port != 0 && port >= mock_min_default_port) homa->prev_default_port = port - 1; - err = homa_sock_init(hsk, homa); + err = homa_sock_init(hsk); hsk->is_server = true; if (port != 0) homa->prev_default_port = saved_port; @@ -2095,8 +2121,8 @@ void mock_teardown(void) mock_page_nid_mask = 0; mock_printk_output[0] = 0; mock_min_default_port = 0x8000; - mock_homa = NULL; homa_net_id = 0; + mock_num_hnets = 0; mock_peer_free_no_fail = 0; mock_net_device.gso_max_size = 0; mock_net_device.gso_max_segs = 1000; diff --git a/test/mock.h b/test/mock.h index f3e137c0..73ce1ddd 100644 --- a/test/mock.h +++ b/test/mock.h @@ -139,7 +139,6 @@ extern int mock_max_grants; extern int mock_max_skb_frags; extern __u16 mock_min_default_port; extern int mock_mtu; -extern struct net mock_net; extern struct net_device mock_net_device; extern int mock_numa_mask; @@ -166,6 +165,8 @@ extern struct task_struct *current_task; struct page * mock_alloc_pages(gfp_t gfp, unsigned order); +struct homa_net + *mock_alloc_hnet(struct homa *homa); int mock_check_error(int *errorMask); void mock_clear_xmit_prios(void); unsigned int mock_compound_order(struct page *page); @@ -201,7 +202,6 @@ void mock_rpc_hold(struct homa_rpc *rpc); void mock_rpc_put(struct homa_rpc *rpc); void mock_set_clock_vals(u64 t, ...); void mock_set_core(int num); -void mock_set_homa(struct homa *homa); void mock_set_ipv6(struct homa_sock *hsk); void mock_spin_lock(spinlock_t *lock); void mock_spin_unlock(spinlock_t *lock); @@ -212,7 +212,7 @@ int mock_skb_count(void); void mock_sock_destroy(struct homa_sock *hsk, struct homa_socktab *socktab); void mock_sock_hold(struct sock *sk); -int mock_sock_init(struct homa_sock *hsk, struct homa *homa, +int mock_sock_init(struct homa_sock *hsk, struct homa_net *hnet, int port); void mock_sock_put(struct sock *sk); void mock_teardown(void); diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index fd9ff277..f06e89df 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -46,6 +46,7 @@ FIXTURE(homa_grant) { u64 server_id; union sockaddr_in_union server_addr; struct homa homa; + struct homa_net *hnet; struct homa_sock hsk; struct homa_data_hdr data; int incoming_delta; @@ -67,8 +68,8 @@ FIXTURE_SETUP(homa_grant) self->server_port = 99; self->client_id = 1234; self->server_id = 1235; - homa_init(&self->homa, &mock_net); - mock_set_homa(&self->homa); + homa_init(&self->homa); + self->hnet = mock_alloc_hnet(&self->homa); self->homa.num_priorities = 1; self->homa.poll_cycles = 0; self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; @@ -77,7 +78,7 @@ FIXTURE_SETUP(homa_grant) self->homa.grant->window = 10000; self->homa.grant->max_incoming = 50000; self->homa.grant->max_rpcs_per_peer = 10; - mock_sock_init(&self->hsk, &self->homa, 0); + mock_sock_init(&self->hsk, self->hnet, 0); self->server_addr.in6.sin6_family = self->hsk.inet.sk.sk_family; self->server_addr.in6.sin6_addr = self->server_ip[0]; self->server_addr.in6.sin6_port = htons(self->server_port); @@ -137,7 +138,7 @@ TEST_F(homa_grant, homa_grant_alloc__success) { struct homa_grant *grant; - grant = homa_grant_alloc(&mock_net); + grant = homa_grant_alloc(); EXPECT_EQ(50, 
grant->fifo_fraction); homa_grant_free(grant); } @@ -146,7 +147,7 @@ TEST_F(homa_grant, homa_grant_alloc__cant_allocate_memory) struct homa_grant *grant; mock_kmalloc_errors = 1; - grant = homa_grant_alloc(&mock_net); + grant = homa_grant_alloc(); EXPECT_TRUE(IS_ERR(grant)); EXPECT_EQ(ENOMEM, -PTR_ERR(grant)); } @@ -155,7 +156,7 @@ TEST_F(homa_grant, homa_grant_alloc__cant_register_sysctls) struct homa_grant *grant; mock_register_sysctl_errors = 1; - grant = homa_grant_alloc(&mock_net); + grant = homa_grant_alloc(); EXPECT_TRUE(IS_ERR(grant)); EXPECT_EQ(ENOMEM, -PTR_ERR(grant)); } @@ -164,7 +165,7 @@ TEST_F(homa_grant, homa_grant_free__basics) { struct homa_grant *grant; - grant = homa_grant_alloc(&mock_net); + grant = homa_grant_alloc(); homa_grant_free(grant); EXPECT_STREQ("unregister_net_sysctl_table", unit_log_get()); } @@ -172,7 +173,7 @@ TEST_F(homa_grant, homa_grant_free__sysctls_not_registered) { struct homa_grant *grant; - grant = homa_grant_alloc(&mock_net); + grant = homa_grant_alloc(); grant->sysctl_header = NULL; homa_grant_free(grant); EXPECT_STREQ("", unit_log_get()); diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 90ccff74..206577a5 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -71,6 +71,7 @@ FIXTURE(homa_incoming) { u64 server_id; union sockaddr_in_union server_addr; struct homa homa; + struct homa_net *hnet; struct homa_sock hsk; struct homa_sock hsk2; struct homa_data_hdr data; @@ -88,8 +89,8 @@ FIXTURE_SETUP(homa_incoming) self->server_port = 99; self->client_id = 1234; self->server_id = 1235; - homa_init(&self->homa, &mock_net); - mock_set_homa(&self->homa); + homa_init(&self->homa); + self->hnet = mock_alloc_hnet(&self->homa); #ifndef __STRIP__ /* See strip.py */ self->homa.num_priorities = 1; self->homa.poll_cycles = 0; @@ -100,8 +101,8 @@ FIXTURE_SETUP(homa_incoming) self->homa.unsched_bytes = 10000; self->homa.grant->window = 10000; #endif /* See strip.py */ - mock_sock_init(&self->hsk, &self->homa, 0); - mock_sock_init(&self->hsk2, &self->homa, self->server_port); + mock_sock_init(&self->hsk, self->hnet, 0); + mock_sock_init(&self->hsk2, self->hnet, self->server_port); self->server_addr.in6.sin6_family = self->hsk.inet.sk.sk_family; self->server_addr.in6.sin6_addr = self->server_ip[0]; self->server_addr.in6.sin6_port = htons(self->server_port); @@ -866,7 +867,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_ipv4) // Make sure the test uses IPv4. mock_ipv6 = false; homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, 0); + mock_sock_init(&self->hsk, self->hnet, 0); skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); unit_log_clear(); @@ -883,7 +884,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_ipv6) // Make sure the test uses IPv6. mock_ipv6 = true; homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, 0); + mock_sock_init(&self->hsk, self->hnet, 0); skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); unit_log_clear(); @@ -900,7 +901,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__server_not_enabled) // Make sure the test uses IPv4. mock_ipv6 = false; homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, 0); + mock_sock_init(&self->hsk, self->hnet, 0); self->hsk.is_server = false; skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); @@ -918,7 +919,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_free_many_packets) // Make sure the test uses IPv6. 
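(Aside: the fixture conversions in this file, and in the test files that follow, all reduce to the same three-step setup. Condensed for reference:

	homa_init(&self->homa);                    /* one global homa */
	self->hnet = mock_alloc_hnet(&self->homa); /* fake namespace */
	mock_sock_init(&self->hsk, self->hnet, 0); /* socket in that ns */

End of aside.)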
mock_ipv6 = true; homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, 0); + mock_sock_init(&self->hsk, self->hnet, 0); skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); skb2 = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); @@ -1042,7 +1043,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__cutoffs_for_unknown_client_rpc) homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), &self->homa); - peer = homa_peer_find(&self->homa, self->server_ip, &self->hsk.inet); + peer = homa_peer_find(&self->hsk, self->server_ip); ASSERT_FALSE(IS_ERR(peer)); EXPECT_EQ(400, peer->cutoff_version); EXPECT_EQ(9, peer->unsched_cutoffs[1]); @@ -1817,8 +1818,7 @@ TEST_F(homa_incoming, homa_cutoffs__cant_find_peer) mock_kmalloc_errors = 1; homa_cutoffs_pkt(skb, &self->hsk); EXPECT_EQ(1, homa_metrics_per_cpu()->peer_kmalloc_errors); - peer = homa_peer_find(&self->homa, self->server_ip, - &self->hsk.inet); + peer = homa_peer_find(&self->hsk, self->server_ip); ASSERT_FALSE(IS_ERR(peer)); EXPECT_EQ(0, peer->cutoff_version); homa_peer_put(peer); @@ -1894,8 +1894,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_not_incoming) } TEST_F(homa_incoming, homa_need_ack_pkt__rpc_doesnt_exist) { - struct homa_peer *peer = homa_peer_find(&self->homa, self->server_ip, - &self->hsk.inet); + struct homa_peer *peer = homa_peer_find(&self->hsk, self->server_ip); struct homa_need_ack_hdr h = {.common = { .sport = htons(self->server_port), .dport = htons(self->hsk.port), diff --git a/test/unit_homa_interest.c b/test/unit_homa_interest.c index 74afbd44..9fb49e0c 100644 --- a/test/unit_homa_interest.c +++ b/test/unit_homa_interest.c @@ -43,6 +43,7 @@ static void notify_hook(char *id) FIXTURE(homa_interest) { struct homa homa; + struct homa_net *hnet; struct homa_sock hsk; struct in6_addr client_ip; int client_port; @@ -54,9 +55,9 @@ FIXTURE(homa_interest) { }; FIXTURE_SETUP(homa_interest) { - homa_init(&self->homa, &mock_net); - mock_set_homa(&self->homa); - mock_sock_init(&self->hsk, &self->homa, 0); + homa_init(&self->homa); + self->hnet = mock_alloc_hnet(&self->homa); + mock_sock_init(&self->hsk, self->hnet, 0); self->client_ip = unit_get_in_addr("196.168.0.1"); self->client_port = 40000; self->server_ip = unit_get_in_addr("1.2.3.4"); diff --git a/test/unit_homa_metrics.c b/test/unit_homa_metrics.c index 5ee79d55..86f9a304 100644 --- a/test/unit_homa_metrics.c +++ b/test/unit_homa_metrics.c @@ -12,8 +12,7 @@ FIXTURE(homa_metrics) { }; FIXTURE_SETUP(homa_metrics) { - homa_init(&self->homa, &mock_net); - mock_set_homa(&self->homa); + homa_init(&self->homa); } FIXTURE_TEARDOWN(homa_metrics) { diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index 8997f530..ffa0677f 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -27,6 +27,7 @@ static struct sk_buff *unit_tcp6_gro_receive(struct list_head *held_list, FIXTURE(homa_offload) { struct homa homa; + struct homa_net *hnet; struct homa_sock hsk; struct in6_addr ip; struct homa_data_hdr header; @@ -40,11 +41,11 @@ FIXTURE_SETUP(homa_offload) { int i; - homa_init(&self->homa, &mock_net); - mock_set_homa(&self->homa); + homa_init(&self->homa); + self->hnet = mock_alloc_hnet(&self->homa); self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; self->homa.unsched_bytes = 10000; - mock_sock_init(&self->hsk, &self->homa, 99); + mock_sock_init(&self->hsk, self->hnet, 99); self->ip = unit_get_in_addr("196.168.0.1"); memset(&self->header, 0, sizeof(self->header)); self->header.common = (struct homa_common_hdr){ 
diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 92fa180f..a244129a 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -66,6 +66,7 @@ FIXTURE(homa_outgoing) { u64 client_id; u64 server_id; struct homa homa; + struct homa_net *hnet; struct homa_sock hsk; union sockaddr_in_union server_addr; struct homa_peer *peer; @@ -78,8 +79,8 @@ FIXTURE_SETUP(homa_outgoing) self->server_port = 99; self->client_id = 1234; self->server_id = 1235; - homa_init(&self->homa, &mock_net); - mock_set_homa(&self->homa); + homa_init(&self->homa); + self->hnet = mock_alloc_hnet(&self->homa); mock_clock = 10000; self->homa.pacer->cycles_per_mbyte = 1000000; self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; @@ -88,13 +89,12 @@ FIXTURE_SETUP(homa_outgoing) self->homa.grant->window = 10000; self->homa.pacer->fifo_fraction = 0; #endif /* See strip.py */ - mock_sock_init(&self->hsk, &self->homa, self->client_port); + mock_sock_init(&self->hsk, self->hnet, self->client_port); self->server_addr.in6.sin6_family = AF_INET; self->server_addr.in6.sin6_addr = self->server_ip[0]; self->server_addr.in6.sin6_port = htons(self->server_port); - self->peer = homa_peer_find(&self->homa, - &self->server_addr.in6.sin6_addr, - &self->hsk.inet); + self->peer = homa_peer_find(&self->hsk, + &self->server_addr.in6.sin6_addr); unit_log_clear(); } FIXTURE_TEARDOWN(homa_outgoing) @@ -293,7 +293,7 @@ TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__multiple_segments_tcp_hijacking) char buffer[1000]; self->homa.hijack_tcp = 1; - mock_sock_init(&hsk, &self->homa, self->client_port+1); + mock_sock_init(&hsk, self->hnet, self->client_port+1); crpc = homa_rpc_alloc_client(&hsk, &self->server_addr); homa_rpc_unlock(crpc); homa_message_out_init(crpc, 10000); @@ -716,7 +716,7 @@ TEST_F(homa_outgoing, __homa_xmit_control__ipv4_error) // Make sure the test uses IPv4. mock_ipv6 = false; homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, self->client_port); + mock_sock_init(&self->hsk, self->hnet, self->client_port); srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, 1111, 10000, 10000); @@ -740,7 +740,7 @@ TEST_F(homa_outgoing, __homa_xmit_control__ipv6_error) // Make sure the test uses IPv6. mock_ipv6 = true; homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, self->client_port); + mock_sock_init(&self->hsk, self->hnet, self->client_port); srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, 1111, 10000, 10000); @@ -968,7 +968,7 @@ TEST_F(homa_outgoing, __homa_xmit_data__ipv4_transmit_error) // Make sure the test uses IPv4. mock_ipv6 = false; homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, self->client_port); + mock_sock_init(&self->hsk, self->hnet, self->client_port); crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, @@ -986,7 +986,7 @@ TEST_F(homa_outgoing, __homa_xmit_data__ipv6_transmit_error) // Make sure the test uses IPv6. 
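(Aside: the repeated edits in these tests all preserve one idiom: when a test needs a specific address family, it tears down the fixture socket and rebuilds it after setting mock_ipv6, because the family is latched when the socket is created:

	mock_ipv6 = false;	/* or true, to exercise the IPv6 path */
	homa_sock_destroy(&self->hsk);
	mock_sock_init(&self->hsk, self->hnet, 0);

End of aside.)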
mock_ipv6 = true; homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, self->client_port); + mock_sock_init(&self->hsk, self->hnet, self->client_port); crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, diff --git a/test/unit_homa_pacer.c b/test/unit_homa_pacer.c index e9cbb466..986273dd 100644 --- a/test/unit_homa_pacer.c +++ b/test/unit_homa_pacer.c @@ -52,6 +52,7 @@ FIXTURE(homa_pacer) { u64 client_id; u64 server_id; struct homa homa; + struct homa_net *hnet; struct homa_sock hsk; }; FIXTURE_SETUP(homa_pacer) @@ -62,14 +63,14 @@ FIXTURE_SETUP(homa_pacer) self->server_port = 99; self->client_id = 1234; self->server_id = 1235; - homa_init(&self->homa, &mock_net); - mock_set_homa(&self->homa); + homa_init(&self->homa); + self->hnet = mock_alloc_hnet(&self->homa); self->homa.pacer->cycles_per_mbyte = 1000000; self->homa.pacer->throttle_min_bytes = 0; #ifndef __STRIP__ /* See strip.py */ self->homa.pacer->fifo_fraction = 0; #endif /* See strip.py */ - mock_sock_init(&self->hsk, &self->homa, self->client_port); + mock_sock_init(&self->hsk, self->hnet, self->client_port); } FIXTURE_TEARDOWN(homa_pacer) { @@ -81,7 +82,7 @@ TEST_F(homa_pacer, homa_pacer_new__success) { struct homa_pacer *pacer; - pacer = homa_pacer_alloc(&self->homa, &mock_net); + pacer = homa_pacer_alloc(&self->homa); EXPECT_FALSE(IS_ERR(pacer)); EXPECT_EQ(&self->homa, pacer->homa); homa_pacer_free(pacer); @@ -91,7 +92,7 @@ TEST_F(homa_pacer, homa_pacer_new__cant_allocate_memory) struct homa_pacer *pacer; mock_kmalloc_errors = 1; - pacer = homa_pacer_alloc(&self->homa, &mock_net); + pacer = homa_pacer_alloc(&self->homa); EXPECT_TRUE(IS_ERR(pacer)); EXPECT_EQ(ENOMEM, -PTR_ERR(pacer)); } @@ -100,7 +101,7 @@ TEST_F(homa_pacer, homa_pacer_new__cant_create_pacer_thread) struct homa_pacer *pacer; mock_kthread_create_errors = 1; - pacer = homa_pacer_alloc(&self->homa, &mock_net); + pacer = homa_pacer_alloc(&self->homa); EXPECT_TRUE(IS_ERR(pacer)); EXPECT_EQ(EACCES, -PTR_ERR(pacer)); } @@ -110,7 +111,7 @@ TEST_F(homa_pacer, homa_pacer_new__cant_register_sysctls) struct homa_pacer *pacer; mock_register_sysctl_errors = 1; - pacer = homa_pacer_alloc(&self->homa, &mock_net); + pacer = homa_pacer_alloc(&self->homa); EXPECT_TRUE(IS_ERR(pacer)); EXPECT_EQ(ENOMEM, -PTR_ERR(pacer)); } @@ -120,7 +121,7 @@ TEST_F(homa_pacer, homa_pacer_free__basics) { struct homa_pacer *pacer; - pacer = homa_pacer_alloc(&self->homa, &mock_net); + pacer = homa_pacer_alloc(&self->homa); EXPECT_FALSE(IS_ERR(pacer)); unit_log_clear(); homa_pacer_free(pacer); @@ -136,7 +137,7 @@ TEST_F(homa_pacer, homa_pacer_free__no_thread) { struct homa_pacer *pacer; - pacer = homa_pacer_alloc(&self->homa, &mock_net); + pacer = homa_pacer_alloc(&self->homa); EXPECT_FALSE(IS_ERR(pacer)); pacer->kthread = NULL; unit_log_clear(); diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index 1d6525b5..081aecdf 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -15,6 +15,7 @@ struct in6_addr ip3333[1]; FIXTURE(homa_peer) { struct homa homa; + struct homa_net *hnet; struct homa_sock hsk; struct in6_addr client_ip[1]; struct in6_addr server_ip[1]; @@ -22,9 +23,9 @@ FIXTURE(homa_peer) { }; FIXTURE_SETUP(homa_peer) { - homa_init(&self->homa, &mock_net); - mock_set_homa(&self->homa); - mock_sock_init(&self->hsk, &self->homa, 0); + homa_init(&self->homa); + self->hnet = mock_alloc_hnet(&self->homa); + mock_sock_init(&self->hsk, self->hnet, 0); self->client_ip[0] = 
unit_get_in_addr("196.168.0.1"); self->server_ip[0] = unit_get_in_addr("1.2.3.4"); ip1111[0] = unit_get_in_addr("1::1:1:1"); @@ -61,8 +62,7 @@ static void peer_race_hook(char *id) /* Create a peer with the same address as the one being created * by the current test. */ - conflicting_peer = homa_peer_find(&test_data->homa, ip3333, - &test_data->hsk.inet); + conflicting_peer = homa_peer_find(&test_data->hsk, ip3333); homa_peer_put(conflicting_peer); } @@ -94,25 +94,27 @@ TEST_F(homa_peer, homa_peertab_alloc__rhashtable_init_fails) EXPECT_EQ(EINVAL, -PTR_ERR(peertab)); } -TEST_F(homa_peer, homa_peertab_free_homa) +TEST_F(homa_peer, homa_peertab_free_net) { - /* Create peers from two different "homa"s, make sure only + /* Create peers from two different netns's, make sure only * those from one get freed. */ struct homa_peer *peer; - struct homa homa2; + struct homa_sock hsk2; + struct homa_net *hnet2; - homa_init(&homa2, &mock_net); - peer = homa_peer_find(&self->homa, ip1111, &self->hsk.inet); + hnet2 = mock_alloc_hnet(&self->homa); + mock_sock_init(&hsk2, hnet2, 44); + + peer = homa_peer_find(&self->hsk, ip1111); homa_peer_put(peer); - peer = homa_peer_find(&self->homa, ip2222, &self->hsk.inet); + peer = homa_peer_find(&self->hsk, ip2222); homa_peer_put(peer); - peer = homa_peer_find(&homa2, ip3333, &self->hsk.inet); + peer = homa_peer_find(&hsk2, ip3333); homa_peer_put(peer); EXPECT_EQ(3, unit_count_peers(&self->homa)); - homa_peertab_free_homa(&self->homa); + homa_peertab_free_net(self->hnet); EXPECT_EQ(1, unit_count_peers(&self->homa)); - homa_destroy(&homa2); } TEST_F(homa_peer, homa_peertab_free_fn) @@ -120,7 +122,7 @@ TEST_F(homa_peer, homa_peertab_free_fn) struct homa_peer *peer; struct dst_entry *dst; - peer = homa_peer_alloc(&self->homa, ip3333, &self->hsk.inet); + peer = homa_peer_alloc(&self->hsk, ip3333); dst = peer->dst; dst_hold(dst); EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); @@ -134,24 +136,24 @@ TEST_F(homa_peer, homa_peertab_free_fn) TEST_F(homa_peer, homa_peertab_free) { struct homa_peer *peer; - peer = homa_peer_find(&self->homa, ip1111, &self->hsk.inet); + peer = homa_peer_find(&self->hsk, ip1111); homa_peer_put(peer); - peer = homa_peer_find(&self->homa, ip2222, &self->hsk.inet); + peer = homa_peer_find(&self->hsk, ip2222); mock_peer_free_no_fail = 1; unit_log_clear(); - homa_peertab_free(self->homa.shared->peers); + homa_peertab_free(self->homa.peers); EXPECT_STREQ("peer [2::2:2:2] has reference count 1", unit_log_get()); kfree(peer); - self->homa.shared->peers = homa_peertab_alloc(); + self->homa.peers = homa_peertab_alloc(); } TEST_F(homa_peer, homa_peer_alloc__success) { struct homa_peer *peer; - peer = homa_peer_alloc(&self->homa, ip1111, &self->hsk.inet); + peer = homa_peer_alloc(&self->hsk, ip1111); ASSERT_FALSE(IS_ERR(peer)); EXPECT_EQ_IP(*ip1111, peer->addr); #ifndef __STRIP__ /* See strip.py */ @@ -168,7 +170,7 @@ TEST_F(homa_peer, homa_peer_alloc__kmalloc_error) struct homa_peer *peer; mock_kmalloc_errors = 1; - peer = homa_peer_alloc(&self->homa, ip3333, &self->hsk.inet); + peer = homa_peer_alloc(&self->hsk, ip3333); EXPECT_EQ(ENOMEM, -PTR_ERR(peer)); #ifndef __STRIP__ /* See strip.py */ @@ -180,7 +182,7 @@ TEST_F(homa_peer, homa_peer_alloc__route_error) struct homa_peer *peer; mock_route_errors = 1; - peer = homa_peer_alloc(&self->homa, ip3333, &self->hsk.inet); + peer = homa_peer_alloc(&self->hsk, ip3333); EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(peer)); #ifndef __STRIP__ /* See strip.py */ @@ -193,7 +195,7 @@ TEST_F(homa_peer, homa_peer_free__normal) 
struct homa_peer *peer; struct dst_entry *dst; - peer = homa_peer_alloc(&self->homa, ip1111, &self->hsk.inet); + peer = homa_peer_alloc(&self->hsk, ip1111); ASSERT_FALSE(IS_ERR(peer)); dst = peer->dst; dst_hold(dst); @@ -208,7 +210,7 @@ TEST_F(homa_peer, homa_peer_free__nonzero_ref_count) { struct homa_peer *peer; - peer = homa_peer_alloc(&self->homa, ip2222, &self->hsk.inet); + peer = homa_peer_alloc(&self->hsk, ip2222); ASSERT_FALSE(IS_ERR(peer)); mock_peer_free_no_fail = 1; @@ -223,7 +225,7 @@ TEST_F(homa_peer, homa_peer_find__basics) struct homa_peer *peer, *peer2; /* First call: create new peer. */ - peer = homa_peer_find(&self->homa, ip1111, &self->hsk.inet); + peer = homa_peer_find(&self->hsk, ip1111); ASSERT_FALSE(IS_ERR(peer)); EXPECT_EQ_IP(*ip1111, peer->addr); #ifndef __STRIP__ /* See strip.py */ @@ -232,12 +234,12 @@ TEST_F(homa_peer, homa_peer_find__basics) #endif /* See strip.py */ /* Second call: lookup existing peer. */ - peer2 = homa_peer_find(&self->homa, ip1111, &self->hsk.inet); + peer2 = homa_peer_find(&self->hsk, ip1111); EXPECT_EQ(peer, peer2); EXPECT_EQ(2, atomic_read(&peer->refs)); /* Third call: lookup new peer. */ - peer2 = homa_peer_find(&self->homa, ip2222, &self->hsk.inet); + peer2 = homa_peer_find(&self->hsk, ip2222); EXPECT_NE(peer, peer2); ASSERT_FALSE(IS_ERR(peer2)); EXPECT_EQ(1, atomic_read(&peer2->refs)); @@ -254,7 +256,7 @@ TEST_F(homa_peer, homa_peer_find__error_in_homa_peer_alloc) struct homa_peer *peer; mock_route_errors = 1; - peer = homa_peer_find(&self->homa, ip3333, &self->hsk.inet); + peer = homa_peer_find(&self->hsk, ip3333); EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(peer)); #ifndef __STRIP__ /* See strip.py */ @@ -266,7 +268,7 @@ TEST_F(homa_peer, homa_peer_find__insert_error) struct homa_peer *peer; mock_rht_insert_errors = 1; - peer = homa_peer_find(&self->homa, ip3333, &self->hsk.inet); + peer = homa_peer_find(&self->hsk, ip3333); EXPECT_TRUE(IS_ERR(peer)); EXPECT_EQ(EINVAL, -PTR_ERR(peer)); } @@ -277,7 +279,7 @@ TEST_F(homa_peer, homa_peer_find__conflicting_create) test_data = self; peer_race_hook_invocations = 0; unit_hook_register(peer_race_hook); - peer = homa_peer_find(&self->homa, ip3333, &self->hsk.inet); + peer = homa_peer_find(&self->hsk, ip3333); EXPECT_FALSE(IS_ERR(conflicting_peer)); EXPECT_EQ(conflicting_peer, peer); EXPECT_EQ(1, atomic_read(&peer->refs)); @@ -289,12 +291,12 @@ TEST_F(homa_peer, homa_dst_refresh__basics) struct dst_entry *old_dst; struct homa_peer *peer; - peer = homa_peer_find(&self->homa, ip1111, &self->hsk.inet); + peer = homa_peer_find(&self->hsk, ip1111); ASSERT_NE(NULL, peer); EXPECT_EQ_IP(*ip1111, peer->addr); old_dst = peer->dst; - homa_dst_refresh(self->homa.shared->peers, peer, &self->hsk); + homa_dst_refresh(self->homa.peers, peer, &self->hsk); EXPECT_NE(old_dst, peer->dst); homa_peer_put(peer); } @@ -303,13 +305,13 @@ TEST_F(homa_peer, homa_dst_refresh__routing_error) struct dst_entry *old_dst; struct homa_peer *peer; - peer = homa_peer_find(&self->homa, ip1111, &self->hsk.inet); + peer = homa_peer_find(&self->hsk, ip1111); ASSERT_NE(NULL, peer); EXPECT_EQ_IP(*ip1111, peer->addr); old_dst = peer->dst; mock_route_errors = 1; - homa_dst_refresh(self->homa.shared->peers, peer, &self->hsk); + homa_dst_refresh(self->homa.peers, peer, &self->hsk); EXPECT_EQ(old_dst, peer->dst); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->peer_route_errors); @@ -337,14 +339,13 @@ TEST_F(homa_peer, homa_peer_get_dst_ipv4) // Make sure the test uses IPv4. 
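(Aside: the assertions in homa_peer_find__basics above capture the peer table's sharing contract in miniature: repeated lookups of the same address through the same namespace return one shared, reference-counted peer:

	peer = homa_peer_find(&self->hsk, ip1111);   /* ref == 1 */
	peer2 = homa_peer_find(&self->hsk, ip1111);  /* same peer, ref == 2 */
	EXPECT_EQ(peer, peer2);
	EXPECT_EQ(2, atomic_read(&peer->refs));
	homa_peer_put(peer);
	homa_peer_put(peer2);

End of aside.)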
mock_ipv6 = false; homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, 0); + mock_sock_init(&self->hsk, self->hnet, 0); - struct homa_peer *peer = homa_peer_find(&self->homa, - &self->client_ip[0], - &self->hsk.inet); + struct homa_peer *peer = homa_peer_find(&self->hsk, + &self->client_ip[0]); ASSERT_NE(NULL, peer); - dst = homa_peer_get_dst(peer, &self->hsk.inet); + dst = homa_peer_get_dst(peer, &self->hsk); ASSERT_NE(NULL, dst); dst_release(dst); EXPECT_STREQ("196.168.0.1", @@ -360,13 +361,12 @@ TEST_F(homa_peer, homa_peer_get_dst_ipv6) // Make sure the test uses IPv6. mock_ipv6 = true; homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, 0); + mock_sock_init(&self->hsk, self->hnet, 0); - struct homa_peer *peer = homa_peer_find(&self->homa, &ip1111[0], - &self->hsk.inet); + struct homa_peer *peer = homa_peer_find(&self->hsk, &ip1111[0]); ASSERT_NE(NULL, peer); - dst = homa_peer_get_dst(peer, &self->hsk.inet); + dst = homa_peer_get_dst(peer, &self->hsk); ASSERT_NE(NULL, dst); dst_release(dst); addr = ntohl(peer->flow.u.ip4.daddr); @@ -380,8 +380,7 @@ TEST_F(homa_peer, homa_peer_get_dst_ipv6) #ifndef __STRIP__ /* See strip.py */ TEST_F(homa_peer, homa_peer_lock_slow) { - struct homa_peer *peer = homa_peer_find(&self->homa, ip3333, - &self->hsk.inet); + struct homa_peer *peer = homa_peer_find(&self->hsk, ip3333); ASSERT_NE(NULL, peer); mock_clock = 10000; @@ -450,8 +449,7 @@ TEST_F(homa_peer, homa_peer_add_ack) TEST_F(homa_peer, homa_peer_get_acks) { - struct homa_peer *peer = homa_peer_find(&self->homa, ip3333, - &self->hsk.inet); + struct homa_peer *peer = homa_peer_find(&self->hsk, ip3333); struct homa_ack acks[2]; ASSERT_NE(NULL, peer); diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index c7a6a2e0..494ac772 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -29,6 +29,7 @@ FIXTURE(homa_plumbing) { u64 client_id; u64 server_id; struct homa homa; + struct homa_net *hnet; struct homa_sock hsk; union sockaddr_in_union client_addr; union sockaddr_in_union server_addr; @@ -55,11 +56,11 @@ FIXTURE_SETUP(homa_plumbing) self->client_addr.in6.sin6_port = htons(self->client_port); self->server_addr.in6.sin6_addr = self->server_ip[0]; self->server_addr.in6.sin6_port = htons(self->server_port); - homa_init(&self->homa, &mock_net); + homa_init(&self->homa); if (self->homa.wmem_max == 0) printf("homa_plumbing fixture found wmem_max 0\n"); - mock_set_homa(&self->homa); - mock_sock_init(&self->hsk, &self->homa, 0); + self->hnet = mock_alloc_hnet(&self->homa); + mock_sock_init(&self->hsk, self->hnet, 0); self->client_addr.in6.sin6_family = self->hsk.inet.sk.sk_family; self->server_addr.in6.sin6_family = self->hsk.inet.sk.sk_family; if (self->hsk.inet.sk.sk_family == AF_INET) { @@ -126,6 +127,21 @@ TEST_F(homa_plumbing, homa_load__error_in_inet6_register_protosw) homa_unload(); } +TEST_F(homa_plumbing, homa_net_exit__free_peers) +{ + struct in6_addr addr1 = unit_get_in_addr("1.2.3.4"); + struct in6_addr addr2 = unit_get_in_addr("1.2.3.5"); + struct in6_addr addr3 = unit_get_in_addr("1.2.3.6"); + + homa_peer_put(homa_peer_find(&self->hsk, &addr1)); + homa_peer_put(homa_peer_find(&self->hsk, &addr2)); + homa_peer_put(homa_peer_find(&self->hsk, &addr3)); + + EXPECT_EQ(3, unit_count_peers(&self->homa)); + homa_net_exit(self->hsk.hnet->net); + EXPECT_EQ(0, unit_count_peers(&self->homa)); +} + TEST_F(homa_plumbing, homa_bind__version_mismatch) { struct sockaddr addr = {}; @@ -135,7 +151,7 @@ TEST_F(homa_plumbing, 
homa_bind__version_mismatch) // Make sure the test uses IPv4. mock_ipv6 = false; homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, 0); + mock_sock_init(&self->hsk, self->hnet, 0); addr.sa_family = AF_INET6; sock.sk = &self->hsk.inet.sk; result = homa_bind(&sock, &addr, sizeof(addr)); @@ -150,7 +166,7 @@ TEST_F(homa_plumbing, homa_bind__ipv6_address_too_short) // Make sure the test uses IPv6. mock_ipv6 = true; homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, 0); + mock_sock_init(&self->hsk, self->hnet, 0); addr.in6.sin6_family = AF_INET6; sock.sk = &self->hsk.inet.sk; @@ -166,7 +182,7 @@ TEST_F(homa_plumbing, homa_bind__ipv6_ok) // Make sure the test uses IPv6. mock_ipv6 = true; homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, 0); + mock_sock_init(&self->hsk, self->hnet, 0); self->hsk.is_server = false; addr.in6.sin6_family = AF_INET6; @@ -186,7 +202,7 @@ TEST_F(homa_plumbing, homa_bind__ipv4_address_too_short) // Make sure the test uses IPv4. mock_ipv6 = false; homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, 0); + mock_sock_init(&self->hsk, self->hnet, 0); addr.in4.sin_family = AF_INET; sock.sk = &self->hsk.inet.sk; @@ -202,7 +218,7 @@ TEST_F(homa_plumbing, homa_bind__ipv4_ok) // Make sure the test uses IPv4. mock_ipv6 = false; homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, 0); + mock_sock_init(&self->hsk, self->hnet, 0); self->hsk.is_server = false; addr.in4.sin_family = AF_INET; @@ -264,7 +280,7 @@ TEST_F(homa_plumbing, homa_socket__success) struct homa_sock hsk; memset(&hsk, 0, sizeof(hsk)); - hsk.sock.sk_net.net = &mock_net; + hsk.sock.sk_net.net = self->hnet->net; refcount_set(&hsk.sock.sk_wmem_alloc, 1); EXPECT_EQ(0, homa_socket(&hsk.sock)); homa_sock_destroy(&hsk); @@ -274,7 +290,7 @@ TEST_F(homa_plumbing, homa_socket__homa_sock_init_failure) struct homa_sock hsk; memset(&hsk, 0, sizeof(hsk)); - hsk.sock.sk_net.net = &mock_net; + hsk.sock.sk_net.net = self->hnet->net; refcount_set(&hsk.sock.sk_wmem_alloc, 1); mock_kmalloc_errors = 1; EXPECT_EQ(ENOMEM, -homa_socket(&hsk.sock)); @@ -755,7 +771,7 @@ TEST_F(homa_plumbing, homa_recvmsg__normal_completion_ipv4) // Make sure the test uses IPv4. mock_ipv6 = false; homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, 0); + mock_sock_init(&self->hsk, self->hnet, 0); EXPECT_EQ(0, -homa_pool_get_pages(self->hsk.buffer_pool, 2, pages, 0)); crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, @@ -786,7 +802,7 @@ TEST_F(homa_plumbing, homa_recvmsg__normal_completion_ipv6) // Make sure the test uses IPv6. 
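(Aside: the new homa_net_exit__free_peers test earlier in this file pins down the ownership rule these changes introduce: peers belong to the namespace they were created in, so namespace exit reclaims them even though the peer table itself lives in the global homa. Its core, condensed to a single peer:

	homa_peer_put(homa_peer_find(&self->hsk, &addr1));
	EXPECT_EQ(1, unit_count_peers(&self->homa));
	homa_net_exit(self->hsk.hnet->net);	/* tears down this hnet */
	EXPECT_EQ(0, unit_count_peers(&self->homa));

End of aside.)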
mock_ipv6 = true; homa_sock_destroy(&self->hsk); - mock_sock_init(&self->hsk, &self->homa, 0); + mock_sock_init(&self->hsk, self->hnet, 0); server_ip6 = unit_get_in_addr("1::3:5:7"); crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index 02d2a695..67e93b20 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -13,19 +13,20 @@ static struct homa_pool *cur_pool; FIXTURE(homa_pool) { struct homa homa; + struct homa_net *hnet; struct homa_sock hsk; struct in6_addr client_ip; struct in6_addr server_ip; }; FIXTURE_SETUP(homa_pool) { - homa_init(&self->homa, &mock_net); - mock_set_homa(&self->homa); + homa_init(&self->homa); + self->hnet = mock_alloc_hnet(&self->homa); #ifndef __STRIP__ /* See strip.py */ self->homa.unsched_bytes = 10000; self->homa.grant->window = 10000; #endif /* See strip.py */ - mock_sock_init(&self->hsk, &self->homa, 0); + mock_sock_init(&self->hsk, self->hnet, 0); self->client_ip = unit_get_in_addr("196.168.0.1"); self->server_ip = unit_get_in_addr("1.2.3.4"); cur_pool = self->hsk.buffer_pool; diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 13ccd189..a421168d 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -50,6 +50,7 @@ FIXTURE(homa_rpc) { u64 client_id; u64 server_id; struct homa homa; + struct homa_net *hnet; struct homa_sock hsk; union sockaddr_in_union server_addr; struct homa_data_hdr data; @@ -68,13 +69,13 @@ FIXTURE_SETUP(homa_rpc) self->server_addr.in6.sin6_family = AF_INET; self->server_addr.in6.sin6_addr = *self->server_ip; self->server_addr.in6.sin6_port = htons(self->server_port); - homa_init(&self->homa, &mock_net); - mock_set_homa(&self->homa); + homa_init(&self->homa); + self->hnet = mock_alloc_hnet(&self->homa); #ifndef __STRIP__ /* See strip.py */ self->homa.unsched_bytes = 10000; self->homa.grant->window = 10000; #endif /* See strip.py */ - mock_sock_init(&self->hsk, &self->homa, 0); + mock_sock_init(&self->hsk, self->hnet, 0); memset(&self->data, 0, sizeof(self->data)); self->data.common = (struct homa_common_hdr){ .sport = htons(self->client_port), @@ -352,7 +353,7 @@ TEST_F(homa_rpc, homa_rpc_acked__basics) struct homa_sock hsk; struct homa_ack ack = {}; - mock_sock_init(&hsk, &self->homa, self->server_port); + mock_sock_init(&hsk, self->hnet, self->server_port); srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, self->server_id, 100, 3000); @@ -370,7 +371,7 @@ TEST_F(homa_rpc, homa_rpc_acked__lookup_socket) struct homa_rpc *srpc; struct homa_sock hsk; - mock_sock_init(&hsk, &self->homa, self->server_port); + mock_sock_init(&hsk, self->hnet, self->server_port); srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, self->server_id, 100, 3000); @@ -388,7 +389,7 @@ TEST_F(homa_rpc, homa_rpc_acked__no_such_socket) struct homa_rpc *srpc; struct homa_sock hsk; - mock_sock_init(&hsk, &self->homa, self->server_port); + mock_sock_init(&hsk, self->hnet, self->server_port); srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, self->server_id, 100, 3000); @@ -406,7 +407,7 @@ TEST_F(homa_rpc, homa_rpc_acked__no_such_rpc) struct homa_rpc *srpc; struct homa_sock hsk; - mock_sock_init(&hsk, &self->homa, self->server_port); + mock_sock_init(&hsk, self->hnet, self->server_port); srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->client_port, self->server_id, 100, 3000); diff --git 
a/test/unit_homa_skb.c b/test/unit_homa_skb.c index 8280ba44..55702ddc 100644 --- a/test/unit_homa_skb.c +++ b/test/unit_homa_skb.c @@ -80,8 +80,7 @@ FIXTURE(homa_skb) { }; FIXTURE_SETUP(homa_skb) { - homa_init(&self->homa, &mock_net); - mock_set_homa(&self->homa); + homa_init(&self->homa); self->skb = alloc_skb_fclone(200, GFP_KERNEL); if (!self->skb) FAIL("unit_homa_skb setup couldn't allocate skb"); diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c index 5a7d8cdf..12e8a12a 100644 --- a/test/unit_homa_sock.c +++ b/test/unit_homa_sock.c @@ -29,6 +29,7 @@ static void schedule_hook(char *id) FIXTURE(homa_sock) { struct homa homa; + struct homa_net *hnet; struct homa_sock hsk; struct in6_addr client_ip[1]; int client_port; @@ -38,9 +39,9 @@ FIXTURE(homa_sock) { }; FIXTURE_SETUP(homa_sock) { - homa_init(&self->homa, &mock_net); - mock_set_homa(&self->homa); - mock_sock_init(&self->hsk, &self->homa, 0); + homa_init(&self->homa); + self->hnet = mock_alloc_hnet(&self->homa); + mock_sock_init(&self->hsk, self->hnet, 0); self->client_ip[0] = unit_get_in_addr("196.168.0.1"); self->client_port = 40000; self->server_ip[0] = unit_get_in_addr("1.2.3.4"); @@ -66,9 +67,8 @@ TEST_F(homa_sock, homa_socktab_start_scan) struct homa_socktab_scan scan; homa_destroy(&self->homa); - homa_init(&self->homa, &mock_net); - mock_set_homa(&self->homa); - mock_sock_init(&self->hsk, &self->homa, HOMA_MIN_DEFAULT_PORT+100); + homa_init(&self->homa); + mock_sock_init(&self->hsk, self->hnet, HOMA_MIN_DEFAULT_PORT+100); EXPECT_EQ(&self->hsk, homa_socktab_start_scan(self->homa.port_map, &scan)); EXPECT_EQ(100, scan.current_bucket); @@ -83,12 +83,11 @@ TEST_F(homa_sock, homa_socktab_next) int first_port = 34000; homa_destroy(&self->homa); - homa_init(&self->homa, &mock_net); - mock_set_homa(&self->homa); - mock_sock_init(&hsk1, &self->homa, first_port); - mock_sock_init(&hsk2, &self->homa, first_port+HOMA_SOCKTAB_BUCKETS); - mock_sock_init(&hsk3, &self->homa, first_port+2*HOMA_SOCKTAB_BUCKETS); - mock_sock_init(&hsk4, &self->homa, first_port+5); + homa_init(&self->homa); + mock_sock_init(&hsk1, self->hnet, first_port); + mock_sock_init(&hsk2, self->hnet, first_port+HOMA_SOCKTAB_BUCKETS); + mock_sock_init(&hsk3, self->hnet, first_port+2*HOMA_SOCKTAB_BUCKETS); + mock_sock_init(&hsk4, self->hnet, first_port+5); hsk = homa_socktab_start_scan(self->homa.port_map, &scan); EXPECT_EQ(first_port+2*HOMA_SOCKTAB_BUCKETS, hsk->port); EXPECT_EQ(1, mock_sock_holds); @@ -116,9 +115,8 @@ TEST_F(homa_sock, homa_socktab_end_scan) struct homa_socktab_scan scan1, scan2, scan3; homa_destroy(&self->homa); - homa_init(&self->homa, &mock_net); - mock_set_homa(&self->homa); - mock_sock_init(&self->hsk, &self->homa, HOMA_MIN_DEFAULT_PORT+100); + homa_init(&self->homa); + mock_sock_init(&self->hsk, self->hnet, HOMA_MIN_DEFAULT_PORT+100); homa_socktab_start_scan(self->homa.port_map, &scan1); homa_socktab_start_scan(self->homa.port_map, &scan2); homa_socktab_start_scan(self->homa.port_map, &scan3); @@ -138,7 +136,7 @@ TEST_F(homa_sock, homa_sock_init__cant_allocate_buffer_pool) struct homa_sock sock; mock_kmalloc_errors = 1; - EXPECT_EQ(ENOMEM, -homa_sock_init(&sock, &self->homa)); + EXPECT_EQ(ENOMEM, -homa_sock_init(&sock)); homa_sock_destroy(&sock); } TEST_F(homa_sock, homa_sock_init__skip_port_in_use) @@ -146,8 +144,8 @@ TEST_F(homa_sock, homa_sock_init__skip_port_in_use) struct homa_sock hsk2, hsk3; self->homa.prev_default_port = 0xfffe; - mock_sock_init(&hsk2, &self->homa, 0); - mock_sock_init(&hsk3, &self->homa, 0); + mock_sock_init(&hsk2, 
self->hnet, 0); + mock_sock_init(&hsk3, self->hnet, 0); EXPECT_EQ(65535, hsk2.port); EXPECT_EQ(32769, hsk3.port); homa_sock_destroy(&hsk2); @@ -158,9 +156,9 @@ TEST_F(homa_sock, homa_sock_init__all_ports_in_use) struct homa_sock hsk2, hsk3, hsk4; mock_min_default_port = -2; - EXPECT_EQ(0, -mock_sock_init(&hsk2, &self->homa, 0)); - EXPECT_EQ(0, -mock_sock_init(&hsk3, &self->homa, 0)); - EXPECT_EQ(EADDRNOTAVAIL, -mock_sock_init(&hsk4, &self->homa, 0)); + EXPECT_EQ(0, -mock_sock_init(&hsk2, self->hnet, 0)); + EXPECT_EQ(0, -mock_sock_init(&hsk3, self->hnet, 0)); + EXPECT_EQ(EADDRNOTAVAIL, -mock_sock_init(&hsk4, self->hnet, 0)); EXPECT_EQ(65534, hsk2.port); EXPECT_EQ(65535, hsk3.port); EXPECT_EQ(1, hsk4.shutdown); @@ -173,9 +171,9 @@ TEST_F(homa_sock, homa_sock_init__ip_header_length) struct homa_sock hsk_v4, hsk_v6; mock_ipv6 = false; - mock_sock_init(&hsk_v4, &self->homa, 0); + mock_sock_init(&hsk_v4, self->hnet, 0); mock_ipv6 = true; - mock_sock_init(&hsk_v6, &self->homa, 0); + mock_sock_init(&hsk_v6, self->hnet, 0); EXPECT_EQ(sizeof(struct iphdr), hsk_v4.ip_header_length); EXPECT_EQ(sizeof(struct ipv6hdr), hsk_v6.ip_header_length); homa_sock_destroy(&hsk_v4); @@ -187,9 +185,9 @@ TEST_F(homa_sock, homa_sock_init__hijack_tcp) struct homa_sock hijack, no_hijack; self->homa.hijack_tcp = 0; - mock_sock_init(&no_hijack, &self->homa, 0); + mock_sock_init(&no_hijack, self->hnet, 0); self->homa.hijack_tcp = 1; - mock_sock_init(&hijack, &self->homa, 0); + mock_sock_init(&hijack, self->hnet, 0); EXPECT_EQ(0, no_hijack.sock.sk_protocol); EXPECT_EQ(IPPROTO_TCP, hijack.sock.sk_protocol); homa_sock_destroy(&hijack); @@ -202,10 +200,10 @@ TEST_F(homa_sock, homa_sock_unlink__remove_from_map) struct homa_sock hsk2, hsk3; int client2, client3; - mock_sock_init(&hsk2, &self->homa, 0); + mock_sock_init(&hsk2, self->hnet, 0); EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk2, 100)); client2 = hsk2.port; - mock_sock_init(&hsk3, &self->homa, 0); + mock_sock_init(&hsk3, self->hnet, 0); client3 = hsk3.port; EXPECT_EQ(&hsk2, homa_sock_find(self->homa.port_map, client2)); @@ -230,7 +228,7 @@ TEST_F(homa_sock, homa_sock_shutdown__unlink_socket) struct homa_sock hsk; int client; - mock_sock_init(&hsk, &self->homa, 0); + mock_sock_init(&hsk, self->hnet, 0); EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk, 100)); client = hsk.port; EXPECT_EQ(&hsk, homa_sock_find(self->homa.port_map, client)); @@ -287,7 +285,7 @@ TEST_F(homa_sock, homa_sock_bind) { struct homa_sock hsk2; - mock_sock_init(&hsk2, &self->homa, 0); + mock_sock_init(&hsk2, self->hnet, 0); EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk2, 100)); EXPECT_EQ(0, -homa_sock_bind(self->homa.port_map, &self->hsk, 0)); @@ -323,7 +321,7 @@ TEST_F(homa_sock, homa_sock_find__basics) { struct homa_sock hsk2; - mock_sock_init(&hsk2, &self->homa, 0); + mock_sock_init(&hsk2, self->hnet, 0); EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk2, 100)); EXPECT_EQ(&self->hsk, homa_sock_find(self->homa.port_map, self->hsk.port)); @@ -341,13 +339,13 @@ TEST_F(homa_sock, homa_sock_find__long_hash_chain) struct homa_sock hsk2, hsk3, hsk4; EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &self->hsk, 13)); - mock_sock_init(&hsk2, &self->homa, 0); + mock_sock_init(&hsk2, self->hnet, 0); EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk2, 2*HOMA_SOCKTAB_BUCKETS + 13)); - mock_sock_init(&hsk3, &self->homa, 0); + mock_sock_init(&hsk3, self->hnet, 0); EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk3, 3*HOMA_SOCKTAB_BUCKETS + 13)); - mock_sock_init(&hsk4, 
&self->homa, 0); + mock_sock_init(&hsk4, self->hnet, 0); EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk4, 5*HOMA_SOCKTAB_BUCKETS + 13)); diff --git a/test/unit_homa_timer.c b/test/unit_homa_timer.c index b127a9ad..5f4d92d4 100644 --- a/test/unit_homa_timer.c +++ b/test/unit_homa_timer.c @@ -19,6 +19,7 @@ FIXTURE(homa_timer) { u64 server_id; union sockaddr_in_union server_addr; struct homa homa; + struct homa_net *hnet; struct homa_sock hsk; }; FIXTURE_SETUP(homa_timer) @@ -32,8 +33,8 @@ FIXTURE_SETUP(homa_timer) self->server_addr.in6.sin6_family = AF_INET; self->server_addr.in6.sin6_addr = *self->server_ip; self->server_addr.in6.sin6_port = htons(self->server_port); - homa_init(&self->homa, &mock_net); - mock_set_homa(&self->homa); + homa_init(&self->homa); + self->hnet = mock_alloc_hnet(&self->homa); self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; self->homa.resend_ticks = 2; self->homa.timer_ticks = 100; @@ -41,7 +42,7 @@ FIXTURE_SETUP(homa_timer) self->homa.unsched_bytes = 10000; self->homa.grant->window = 10000; #endif /* See strip.py */ - mock_sock_init(&self->hsk, &self->homa, 0); + mock_sock_init(&self->hsk, self->hnet, 0); unit_log_clear(); } FIXTURE_TEARDOWN(homa_timer) diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index 45bdd2a6..c7446501 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: BSD-2-Clause #include "homa_impl.h" +#include "homa_peer.h" +#include "homa_sock.h" #define KSELFTEST_NOT_MAIN 1 #include "kselftest_harness.h" #include "ccutils.h" @@ -12,11 +14,14 @@ FIXTURE(homa_utils) { struct homa homa; + struct homa_net *hnet; + struct homa_sock hsk; }; FIXTURE_SETUP(homa_utils) { - homa_init(&self->homa, &mock_net); - mock_set_homa(&self->homa); + homa_init(&self->homa); + self->hnet = mock_alloc_hnet(&self->homa); + mock_sock_init(&self->hsk, self->hnet, 0); unit_log_clear(); } FIXTURE_TEARDOWN(homa_utils) @@ -53,62 +58,53 @@ static void set_cutoffs(struct homa *homa, int c0, int c1, int c2, } #endif /* See strip.py */ -TEST_F(homa_utils, homa_shared_alloc__kmalloc_failure) +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_utils, homa_init__grant_alloc_failure) { - struct homa_shared *shared; + struct homa homa2; mock_kmalloc_errors = 1; - shared = homa_shared_alloc(); - EXPECT_TRUE(IS_ERR(shared)); - EXPECT_EQ(ENOMEM, -PTR_ERR(shared)); + unit_log_clear(); + EXPECT_EQ(ENOMEM, -homa_init(&homa2)); + EXPECT_SUBSTR("homa_grant_alloc couldn't allocate grant structure", + mock_printk_output); + EXPECT_EQ(NULL, homa2.grant); + homa_destroy(&homa2); } -TEST_F(homa_utils, homa_shared_alloc__peertab_alloc_failure) +#endif /* See strip.py */ +TEST_F(homa_utils, homa_init__pacer_alloc_failure) { - struct homa_shared *shared; + struct homa homa2; mock_kmalloc_errors = 2; - shared = homa_shared_alloc(); - EXPECT_TRUE(IS_ERR(shared)); - EXPECT_EQ(ENOMEM, -PTR_ERR(shared)); -} -TEST_F(homa_utils, homa_shared_alloc__success) -{ - struct homa_shared *shared; - - shared = homa_shared_alloc(); - EXPECT_NE(NULL, shared); - EXPECT_EQ(1, list_empty(&shared->homas)); - homa_shared_free(shared); -} - -TEST_F(homa_utils, homa_shared_free__clear_global_variable) -{ - struct homa_shared *saved; - - saved = homa_shared; - homa_shared = homa_shared_alloc(); - homa_shared_free(homa_shared); - EXPECT_EQ(NULL, homa_shared); - homa_shared = saved; + unit_log_clear(); + EXPECT_EQ(ENOMEM, -homa_init(&homa2)); + EXPECT_SUBSTR("homa_pacer_alloc couldn't allocate homa_pacer struct", + mock_printk_output); + EXPECT_EQ(NULL, 
homa2.pacer); + homa_destroy(&homa2); } - -TEST_F(homa_utils, homa_init__error_from_homa_shared_alloc) +TEST_F(homa_utils, homa_init__peertab_alloc_failure) { - struct homa_shared *saved_shared = homa_shared; struct homa homa2; - homa_shared = NULL; - mock_kmalloc_errors = 1; - EXPECT_EQ(ENOMEM, -homa_init(&homa2, &mock_net)); - EXPECT_EQ(0, atomic64_read(&homa2.next_outgoing_id)); - homa_shared = saved_shared; + mock_kmalloc_errors = 4; + unit_log_clear(); + EXPECT_EQ(ENOMEM, -homa_init(&homa2)); + EXPECT_SUBSTR("homa_peertab_alloc couldn't create peers: kmalloc failure", + mock_printk_output); + EXPECT_EQ(NULL, homa2.peers); + homa_destroy(&homa2); } -TEST_F(homa_utils, homa_init__kmalloc_failure_for_port_map) +TEST_F(homa_utils, homa_init__cant_allocate_port_map) { struct homa homa2; - mock_kmalloc_errors = 1; - EXPECT_EQ(ENOMEM, -homa_init(&homa2, &mock_net)); + mock_kmalloc_errors = 0x10; + unit_log_clear(); + EXPECT_EQ(ENOMEM, -homa_init(&homa2)); + EXPECT_SUBSTR("homa_init couldn't create port_map: kmalloc failure", + mock_printk_output); EXPECT_EQ(NULL, homa2.port_map); homa_destroy(&homa2); } @@ -117,32 +113,21 @@ TEST_F(homa_utils, homa_init__homa_skb_init_failure) { struct homa homa2; - mock_kmalloc_errors = 0x8; - EXPECT_EQ(ENOMEM, -homa_init(&homa2, &mock_net)); + mock_kmalloc_errors = 0x20; + EXPECT_EQ(ENOMEM, -homa_init(&homa2)); EXPECT_SUBSTR("Couldn't initialize skb management (errno 12)", mock_printk_output); homa_destroy(&homa2); } #endif /* See strip.py */ -TEST_F(homa_utils, homa_destroy__basics) +TEST_F(homa_utils, homa_destroy) { struct homa homa2; - homa_init(&homa2, &mock_net); + homa_init(&homa2); homa_destroy(&homa2); } -TEST_F(homa_utils, homa_destroy__unlink_and_free_shared) -{ - struct homa homa2; - - homa_init(&homa2, &mock_net); - EXPECT_NE(NULL, homa_shared); - homa_destroy(&homa2); - EXPECT_NE(NULL, homa_shared); - homa_destroy(&self->homa); - EXPECT_EQ(NULL, homa_shared); -} #ifndef __STRIP__ /* See strip.py */ TEST_F(homa_utils, homa_print_ipv4_addr) diff --git a/test/utils.c b/test/utils.c index b2bb4792..c3d366c9 100644 --- a/test/utils.c +++ b/test/utils.c @@ -486,7 +486,7 @@ int unit_count_peers(struct homa *homa) struct homa_peer *peer; int count = 0; - rhashtable_walk_enter(&homa->shared->peers->ht, &iter); + rhashtable_walk_enter(&homa->peers->ht, &iter); rhashtable_walk_start(&iter); while (1) { peer = rhashtable_walk_next(&iter); From 444eb86a71a8b665e672a97a2a95194d2a607d59 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 16 May 2025 17:32:35 -0700 Subject: [PATCH 320/625] Make socket port numbers homa_net-specific --- homa_devel.c | 2 +- homa_impl.h | 32 +++++++--- homa_incoming.c | 7 +- homa_plumbing.c | 6 +- homa_rpc.c | 10 +-- homa_sock.c | 51 ++++++++------- homa_sock.h | 35 +++++----- homa_timer.c | 2 +- homa_utils.c | 18 +++--- test/mock.c | 10 ++- test/unit_homa_incoming.c | 125 ++++++++++++++---------------------- test/unit_homa_outgoing.c | 2 +- test/unit_homa_plumbing.c | 2 +- test/unit_homa_sock.c | 131 +++++++++++++++++++++++--------------- test/unit_homa_utils.c | 45 ++++++++++++- test/utils.c | 10 ++- 16 files changed, 274 insertions(+), 214 deletions(-) diff --git a/homa_devel.c b/homa_devel.c index cf2d73ea..93ac9c1f 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -386,7 +386,7 @@ void homa_freeze_peers() /* Find a socket to use (any socket for the namespace will do). 
 */
 	hnet = homa_net_from_net(&init_net);
 	rcu_read_lock();
-	hsk = homa_socktab_start_scan(hnet->homa->port_map, &scan);
+	hsk = homa_socktab_start_scan(hnet->homa->socktab, &scan);
 	while (hsk && hsk->hnet != hnet)
 		hsk = homa_socktab_next(&scan);
 	homa_socktab_end_scan(&scan);
diff --git a/homa_impl.h b/homa_impl.h
index d8752cc2..15bbd029 100644
--- a/homa_impl.h
+++ b/homa_impl.h
@@ -135,16 +135,10 @@ struct homa {
 	struct homa_peertab *peers;
 
 	/**
-	 * @prev_default_port: The most recent port number assigned from
-	 * the range of default ports.
-	 */
-	__u16 prev_default_port;
-
-	/**
-	 * @port_map: Information about all open sockets. Dynamically
+	 * @socktab: Information about all open sockets. Dynamically
 	 * allocated; must be kfreed.
 	 */
-	struct homa_socktab *port_map;
+	struct homa_socktab *socktab;
 
 #ifndef __STRIP__ /* See strip.py */
 	/**
@@ -497,6 +491,12 @@ struct homa_net {
 	/** @homa: Global Homa information. */
 	struct homa *homa;
+
+	/**
+	 * @prev_default_port: The most recent port number assigned from
+	 * the range of default ports.
+	 */
+	__u16 prev_default_port;
 };
 
 /**
@@ -680,7 +680,7 @@ void homa_close(struct sock *sock, long timeout);
 int homa_copy_to_user(struct homa_rpc *rpc);
 void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc);
 void homa_destroy(struct homa *homa);
-void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa);
+void homa_dispatch_pkts(struct sk_buff *skb);
 int homa_err_handler_v4(struct sk_buff *skb, u32 info);
 int homa_err_handler_v6(struct sk_buff *skb,
 			struct inet6_skb_parm *opt, u8 type, u8 code,
@@ -789,6 +789,20 @@ static inline struct homa *homa_from_skb(struct sk_buff *skb)
 	return hnet->homa;
 }
 
+/**
+ * homa_net_from_skb() - Return the struct homa_net associated with a
+ * particular sk_buff.
+ * @skb: Get the struct homa_net for this packet buffer.
+ * Return: see above.
+ */
+static inline struct homa_net *homa_net_from_skb(struct sk_buff *skb)
+{
+	struct homa_net *hnet;
+
+	hnet = net_generic(dev_net(skb->dev), homa_net_id);
+	return hnet;
+}
+
 /**
  * homa_clock() - Return a fine-grain clock value that is monotonic and
  * consistent across cores.
diff --git a/homa_incoming.c b/homa_incoming.c
index 29f09b5c..8c286611 100644
--- a/homa_incoming.c
+++ b/homa_incoming.c
@@ -387,9 +387,8 @@ int homa_copy_to_user(struct homa_rpc *rpc)
  * homa_dispatch_pkts() - Top-level function that processes a batch of packets,
  * all related to the same RPC.
  * @skb: First packet in the batch, linked through skb->next.
- * @homa: Overall information about the Homa transport.
*/ -void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) +void homa_dispatch_pkts(struct sk_buff *skb) { #ifdef __UNIT_TEST__ #define MAX_ACKS 2 @@ -410,11 +409,13 @@ void homa_dispatch_pkts(struct sk_buff *skb, struct homa *homa) struct homa_ack acks[MAX_ACKS]; struct homa_rpc *rpc = NULL; struct homa_sock *hsk; + struct homa_net *hnet; struct sk_buff *next; int num_acks = 0; /* Find the appropriate socket.*/ - hsk = homa_sock_find(homa->port_map, dport); + hnet = homa_net_from_skb(skb); + hsk = homa_sock_find(hnet, dport); if (!hsk || (!homa_is_client(id) && !hsk->is_server)) { if (skb_is_ipv6(skb)) icmp6_send(skb, ICMPV6_DEST_UNREACH, diff --git a/homa_plumbing.c b/homa_plumbing.c index d83d6260..cc5f6069 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -667,7 +667,7 @@ int homa_bind(struct socket *sock, struct sockaddr *addr, int addr_len) return -EINVAL; port = ntohs(addr_in->in6.sin6_port); } - return homa_sock_bind(hsk->homa->port_map, hsk, port); + return homa_sock_bind(hsk->hnet, hsk, port); } /** @@ -1358,7 +1358,7 @@ int homa_softirq(struct sk_buff *skb) h->type); *prev_link = skb->next; skb->next = NULL; - homa_dispatch_pkts(skb, homa); + homa_dispatch_pkts(skb); } else { prev_link = &skb->next; } @@ -1409,7 +1409,7 @@ int homa_softirq(struct sk_buff *skb) UNIT_LOG("", " %d", ntohl(h3->seg.offset)); } #endif /* __UNIT_TEST__ */ - homa_dispatch_pkts(packets, homa); + homa_dispatch_pkts(packets); packets = other_pkts; } diff --git a/homa_rpc.c b/homa_rpc.c index 3b681b2e..f893078d 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -232,7 +232,7 @@ void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, /* Without RCU, sockets other than hsk can be deleted * out from under us. */ - hsk2 = homa_sock_find(hsk->homa->port_map, server_port); + hsk2 = homa_sock_find(hsk->hnet, server_port); if (!hsk2) return; } @@ -364,7 +364,7 @@ void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, struct homa_rpc *rpc; struct homa_sock *hsk; - for (hsk = homa_socktab_start_scan(homa->port_map, &scan); hsk; + for (hsk = homa_socktab_start_scan(homa->socktab, &scan); hsk; hsk = homa_socktab_next(&scan)) { /* Skip the (expensive) lock acquisition if there's no * work to do. 
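All of the scan-site changes in this file update the same homa_socktab iteration idiom. For reference, a minimal sketch of that idiom (function and type names taken from this patch; the reference-counting comments are inferred from the socktab unit tests rather than stated in these hunks):

	struct homa_socktab_scan scan;
	struct homa_sock *hsk;

	rcu_read_lock();
	for (hsk = homa_socktab_start_scan(homa->socktab, &scan); hsk;
	     hsk = homa_socktab_next(&scan)) {
		/* Each socket is returned with a reference held;
		 * homa_socktab_next appears to release the previous
		 * socket's reference before taking the next one.
		 */
	}
	homa_socktab_end_scan(&scan);	/* Drops any reference still held. */
	rcu_read_unlock();
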
@@ -711,7 +711,7 @@ void homa_rpc_log_active(struct homa *homa, uint64_t id) pr_notice("Logging active Homa RPCs:\n"); rcu_read_lock(); - for (hsk = homa_socktab_start_scan(homa->port_map, &scan); + for (hsk = homa_socktab_start_scan(homa->socktab, &scan); hsk; hsk = homa_socktab_next(&scan)) { if (list_empty(&hsk->active_rpcs) || hsk->shutdown) continue; @@ -794,7 +794,7 @@ void homa_rpc_log_active_tt(struct homa *homa, int freeze_count) tt_record("Logging Homa RPCs:"); rcu_read_lock(); - for (hsk = homa_socktab_start_scan(homa->port_map, &scan); + for (hsk = homa_socktab_start_scan(homa->socktab, &scan); hsk; hsk = homa_socktab_next(&scan)) { if (list_empty(&hsk->active_rpcs) || hsk->shutdown) continue; @@ -851,7 +851,7 @@ int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors) atomic_read(&homa->grant->total_incoming)); *link_errors = 0; rcu_read_lock(); - for (hsk = homa_socktab_start_scan(homa->port_map, &scan); + for (hsk = homa_socktab_start_scan(homa->socktab, &scan); hsk; hsk = homa_socktab_next(&scan)) { if (list_empty(&hsk->active_rpcs) || hsk->shutdown) continue; diff --git a/homa_sock.c b/homa_sock.c index 0a83be88..b46422ee 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -25,16 +25,20 @@ void homa_socktab_init(struct homa_socktab *socktab) } /** - * homa_socktab_destroy() - Destructor for homa_socktabs. + * homa_socktab_destroy() - Destructor for homa_socktabs: deletes all + * existing sockets. * @socktab: The object to destroy. + * @hnet: If non-NULL, only sockets for this namespace are deleted. */ -void homa_socktab_destroy(struct homa_socktab *socktab) +void homa_socktab_destroy(struct homa_socktab *socktab, struct homa_net *hnet) { struct homa_socktab_scan scan; struct homa_sock *hsk; for (hsk = homa_socktab_start_scan(socktab, &scan); hsk; hsk = homa_socktab_next(&scan)) { + if (hnet && hnet != hsk->hnet) + continue; homa_sock_destroy(hsk); } homa_socktab_end_scan(&scan); @@ -145,7 +149,7 @@ int homa_sock_init(struct homa_sock *hsk) hnet = (struct homa_net *)net_generic(sock_net(&hsk->sock), homa_net_id); homa = hnet->homa; - socktab = homa->port_map; + socktab = homa->socktab; /* Initialize fields outside the Homa part. */ hsk->sock.sk_sndbuf = homa->wmem_max; @@ -172,23 +176,23 @@ int homa_sock_init(struct homa_sock *hsk) * no other socket chooses the same port. */ spin_lock_bh(&socktab->write_lock); - starting_port = homa->prev_default_port; + starting_port = hnet->prev_default_port; while (1) { - homa->prev_default_port++; - if (homa->prev_default_port < HOMA_MIN_DEFAULT_PORT) - homa->prev_default_port = HOMA_MIN_DEFAULT_PORT; - other = homa_sock_find(socktab, homa->prev_default_port); + hnet->prev_default_port++; + if (hnet->prev_default_port < HOMA_MIN_DEFAULT_PORT) + hnet->prev_default_port = HOMA_MIN_DEFAULT_PORT; + other = homa_sock_find(hnet, hnet->prev_default_port); if (!other) break; sock_put(&other->sock); - if (homa->prev_default_port == starting_port) { + if (hnet->prev_default_port == starting_port) { spin_unlock_bh(&socktab->write_lock); hsk->shutdown = true; result = -EADDRNOTAVAIL; goto error; } } - hsk->port = homa->prev_default_port; + hsk->port = hnet->prev_default_port; hsk->inet.inet_num = hsk->port; hsk->inet.inet_sport = htons(hsk->port); @@ -218,10 +222,9 @@ int homa_sock_init(struct homa_sock *hsk) bucket->id = i + 1000000; INIT_HLIST_HEAD(&bucket->rpcs); } - - /* Link the socket into the port map. 
*/ hlist_add_head_rcu(&hsk->socktab_links, - &socktab->buckets[homa_port_hash(hsk->port)]); + &socktab->buckets[homa_socktab_bucket(hnet, + hsk->port)]); spin_unlock_bh(&socktab->write_lock); return result; @@ -238,7 +241,7 @@ int homa_sock_init(struct homa_sock *hsk) */ void homa_sock_unlink(struct homa_sock *hsk) { - struct homa_socktab *socktab = hsk->homa->port_map; + struct homa_socktab *socktab = hsk->homa->socktab; spin_lock_bh(&socktab->write_lock); hlist_del_rcu(&hsk->socktab_links); @@ -347,7 +350,7 @@ void homa_sock_destroy(struct homa_sock *hsk) /** * homa_sock_bind() - Associates a server port with a socket; if there * was a previous server port assignment for @hsk, it is abandoned. - * @socktab: Hash table in which the binding will be recorded. + * @hnet: Network namespace with which port is associated. * @hsk: Homa socket. * @port: Desired server port for @hsk. If 0, then this call * becomes a no-op: the socket will continue to use @@ -355,9 +358,10 @@ void homa_sock_destroy(struct homa_sock *hsk) * * Return: 0 for success, otherwise a negative errno. */ -int homa_sock_bind(struct homa_socktab *socktab, struct homa_sock *hsk, +int homa_sock_bind(struct homa_net *hnet, struct homa_sock *hsk, __u16 port) { + struct homa_socktab *socktab = hnet->homa->socktab; struct homa_sock *owner; int result = 0; @@ -372,7 +376,7 @@ int homa_sock_bind(struct homa_socktab *socktab, struct homa_sock *hsk, goto done; } - owner = homa_sock_find(socktab, port); + owner = homa_sock_find(hnet, port); if (owner) { sock_put(&owner->sock); if (owner != hsk) @@ -384,7 +388,7 @@ int homa_sock_bind(struct homa_socktab *socktab, struct homa_sock *hsk, hsk->inet.inet_num = port; hsk->inet.inet_sport = htons(hsk->port); hlist_add_head_rcu(&hsk->socktab_links, - &socktab->buckets[homa_port_hash(port)]); + &socktab->buckets[homa_socktab_bucket(hnet, port)]); hsk->is_server = true; done: spin_unlock_bh(&socktab->write_lock); @@ -394,21 +398,22 @@ int homa_sock_bind(struct homa_socktab *socktab, struct homa_sock *hsk, /** * homa_sock_find() - Returns the socket associated with a given port. - * @socktab: Hash table in which to perform lookup. + * @hnet: Network namespace where the socket will be used. * @port: The port of interest. * Return: The socket that owns @port, or NULL if none. If non-NULL * then this method has taken a reference on the socket and * the caller must call sock_put to release it. */ -struct homa_sock *homa_sock_find(struct homa_socktab *socktab, __u16 port) +struct homa_sock *homa_sock_find(struct homa_net *hnet, __u16 port) { - struct homa_sock *hsk; + int bucket = homa_socktab_bucket(hnet, port); struct homa_sock *result = NULL; + struct homa_sock *hsk; rcu_read_lock(); - hlist_for_each_entry_rcu(hsk, &socktab->buckets[homa_port_hash(port)], + hlist_for_each_entry_rcu(hsk, &hnet->homa->socktab->buckets[bucket], socktab_links) { - if (hsk->port == port) { + if (hsk->port == port && hsk->hnet == hnet) { result = hsk; sock_hold(&hsk->sock); break; diff --git a/homa_sock.h b/homa_sock.h index 8d54e390..748b0eb6 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -15,9 +15,11 @@ void homa_sock_lock_slow(struct homa_sock *hsk); /** * define HOMA_SOCKTAB_BUCKETS - Number of hash buckets in a homa_socktab. - * Must be a power of 2. + * Must be a power of 2. Note: can't use BIT here because the result needs + * to be signed. 
*/ -#define HOMA_SOCKTAB_BUCKETS 1024 +#define HOMA_SOCKTAB_BUCKET_BITS 10 +#define HOMA_SOCKTAB_BUCKETS (1 << HOMA_SOCKTAB_BUCKET_BITS) /** * struct homa_socktab - A hash table that maps from port numbers (either @@ -257,15 +259,16 @@ struct homa_v6_sock { void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, u64 id); #endif /* See strip.py */ -int homa_sock_bind(struct homa_socktab *socktab, - struct homa_sock *hsk, __u16 port); +int homa_sock_bind(struct homa_net *hnet, struct homa_sock *hsk, + __u16 port); void homa_sock_destroy(struct homa_sock *hsk); -struct homa_sock *homa_sock_find(struct homa_socktab *socktab, __u16 port); +struct homa_sock *homa_sock_find(struct homa_net *hnet, __u16 port); int homa_sock_init(struct homa_sock *hsk); void homa_sock_shutdown(struct homa_sock *hsk); void homa_sock_unlink(struct homa_sock *hsk); int homa_sock_wait_wmem(struct homa_sock *hsk, int nonblocking); -void homa_socktab_destroy(struct homa_socktab *socktab); +void homa_socktab_destroy(struct homa_socktab *socktab, + struct homa_net *hnet); void homa_socktab_end_scan(struct homa_socktab_scan *scan); void homa_socktab_init(struct homa_socktab *socktab); struct homa_sock *homa_socktab_next(struct homa_socktab_scan *scan); @@ -307,19 +310,21 @@ static inline void homa_sock_unlock(struct homa_sock *hsk) } /** - * homa_port_hash() - Hash function for port numbers. - * @port: Port number being looked up. + * homa_socktab_bucket() - Compute the bucket number in a homa_socktab + * that will contain a particular socket. + * @hnet: Network namespace of the desired socket. + * @port: Port number of the socket. * - * Return: The index of the bucket in which this port will be found (if - * it exists. + * Return: The index of the bucket in which a socket matching @hnet and + * @port will be found (if it exists). */ -static inline int homa_port_hash(__u16 port) +static inline int homa_socktab_bucket(struct homa_net *hnet, __u16 port) { - /* We can use a really simple hash function here because client - * port numbers are allocated sequentially and server port numbers - * are unpredictable. - */ +#ifdef __UNIT_TEST__ return port & (HOMA_SOCKTAB_BUCKETS - 1); +#else /* __UNIT_TEST__ */ + return hash_32((uintptr_t)hnet ^ port, HOMA_SOCKTAB_BUCKET_BITS); +#endif /* __UNIT_TEST__ */ } /** diff --git a/homa_timer.c b/homa_timer.c index 71f25c56..eb0d82d1 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -225,7 +225,7 @@ void homa_timer(struct homa *homa) #endif /* See strip.py */ /* Scan all existing RPCs in all sockets. 
*/ - for (hsk = homa_socktab_start_scan(homa->port_map, &scan); + for (hsk = homa_socktab_start_scan(homa->socktab, &scan); hsk; hsk = homa_socktab_next(&scan)) { while (hsk->dead_skbs >= homa->dead_buffs_limit) { /* If we get here, it means that homa_wait_for_message diff --git a/homa_utils.c b/homa_utils.c index 43a4e51c..540873fb 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -58,13 +58,13 @@ int homa_init(struct homa *homa) homa->peers = NULL; return err; } - homa->port_map = kmalloc(sizeof(*homa->port_map), GFP_KERNEL); - if (!homa->port_map) { - pr_err("%s couldn't create port_map: kmalloc failure", + homa->socktab = kmalloc(sizeof(*homa->socktab), GFP_KERNEL); + if (!homa->socktab) { + pr_err("%s couldn't create socktab: kmalloc failure", __func__); return -ENOMEM; } - homa_socktab_init(homa->port_map); + homa_socktab_init(homa->socktab); #ifndef __STRIP__ /* See strip.py */ err = homa_skb_init(homa); if (err) { @@ -133,10 +133,10 @@ void homa_destroy(struct homa *homa) #endif /* __UNIT_TEST__ */ /* The order of the following cleanups matters! */ - if (homa->port_map) { - homa_socktab_destroy(homa->port_map); - kfree(homa->port_map); - homa->port_map = NULL; + if (homa->socktab) { + homa_socktab_destroy(homa->socktab, NULL); + kfree(homa->socktab); + homa->socktab = NULL; } #ifndef __STRIP__ /* See strip.py */ if (homa->grant) { @@ -170,6 +170,7 @@ int homa_net_init(struct homa_net *hnet, struct net *net, struct homa *homa) memset(hnet, 0, sizeof(*hnet)); hnet->net = net; hnet->homa = homa; + hnet->prev_default_port = HOMA_MIN_DEFAULT_PORT - 1; return 0; } @@ -180,6 +181,7 @@ int homa_net_init(struct homa_net *hnet, struct net *net, struct homa *homa) */ void homa_net_destroy(struct homa_net *hnet) { + homa_socktab_destroy(hnet->homa->socktab, hnet); homa_peertab_free_net(hnet); } diff --git a/test/mock.c b/test/mock.c index 9aab5db0..bf327a50 100644 --- a/test/mock.c +++ b/test/mock.c @@ -2016,12 +2016,10 @@ int mock_sock_init(struct homa_sock *hsk, struct homa_net *hnet, int port) { static struct ipv6_pinfo hsk_pinfo; struct sock *sk = &hsk->sock; - struct homa *homa; int saved_port; int err = 0; - homa = hnet->homa; - saved_port = homa->prev_default_port; + saved_port = hnet->prev_default_port; memset(hsk, 0, sizeof(*hsk)); sk->sk_data_ready = mock_data_ready; sk->sk_family = mock_ipv6 ? 
AF_INET6 : AF_INET; @@ -2033,15 +2031,15 @@ int mock_sock_init(struct homa_sock *hsk, struct homa_net *hnet, int port) rcu_assign_pointer(sk->sk_wq, &mock_socket.wq); sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; if (port != 0 && port >= mock_min_default_port) - homa->prev_default_port = port - 1; + hnet->prev_default_port = port - 1; err = homa_sock_init(hsk); hsk->is_server = true; if (port != 0) - homa->prev_default_port = saved_port; + hnet->prev_default_port = saved_port; if (err != 0) return err; if (port != 0 && port < mock_min_default_port) - homa_sock_bind(homa->port_map, hsk, port); + homa_sock_bind(hnet, hsk, port); hsk->inet.pinet6 = &hsk_pinfo; mock_mtu = UNIT_TEST_DATA_PER_PACKET + hsk->ip_header_length + sizeof(struct homa_data_hdr); diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 206577a5..d32395b4 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -871,7 +871,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_ipv4) skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); unit_log_clear(); - homa_dispatch_pkts(skb, &self->homa); + homa_dispatch_pkts(skb); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); EXPECT_STREQ("icmp_send type 3, code 3", unit_log_get()); } @@ -888,7 +888,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_ipv6) skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); unit_log_clear(); - homa_dispatch_pkts(skb, &self->homa); + homa_dispatch_pkts(skb); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); EXPECT_STREQ("icmp6_send type 1, code 4", unit_log_get()); } @@ -906,7 +906,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__server_not_enabled) skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); unit_log_clear(); - homa_dispatch_pkts(skb, &self->homa); + homa_dispatch_pkts(skb); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); EXPECT_STREQ("icmp_send type 3, code 3", unit_log_get()); } @@ -927,14 +927,14 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_free_many_packets) skb->next = skb2; skb2->next = skb3; unit_log_clear(); - homa_dispatch_pkts(skb, &self->homa); + homa_dispatch_pkts(skb); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); EXPECT_STREQ("icmp6_send type 1, code 4", unit_log_get()); } TEST_F(homa_incoming, homa_dispatch_pkts__new_server_rpc) { homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, - 1400, 0), &self->homa); + 1400, 0)); EXPECT_EQ(1, unit_list_length(&self->hsk2.active_rpcs)); EXPECT_EQ(1, mock_skb_count()); } @@ -942,7 +942,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__cant_create_server_rpc) { mock_kmalloc_errors = 1; homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, - 1400, 0), &self->homa); + 1400, 0)); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); EXPECT_EQ(0, mock_skb_count()); #ifndef __STRIP__ /* See strip.py */ @@ -960,7 +960,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__existing_server_rpc) self->data.seg.offset = htonl(1400); self->data.common.sender_id = cpu_to_be64(self->client_id); homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, - 1400, 0), &self->homa); + 1400, 0)); EXPECT_EQ(7200, srpc->msgin.bytes_remaining); } TEST_F(homa_incoming, homa_dispatch_pkts__non_data_packet_for_existing_server_rpc) @@ -983,8 +983,8 @@ TEST_F(homa_incoming, homa_dispatch_pkts__non_data_packet_for_existing_server_rp ASSERT_NE(NULL, srpc); unit_log_clear(); - homa_dispatch_pkts(mock_skb_alloc(self->client_ip, 
&resend.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &resend.common, + 0, 0)); EXPECT_STREQ("xmit BUSY", unit_log_get()); } #ifndef __STRIP__ /* See strip.py */ @@ -1013,8 +1013,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_client_rpc) .type = RPC_UNKNOWN}}; mock_xmit_log_verbose = 1; - homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); EXPECT_EQ(1, homa_metrics_per_cpu()->unknown_rpcs); } TEST_F(homa_incoming, homa_dispatch_pkts__unknown_server_rpc) @@ -1025,8 +1024,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_server_rpc) .type = GRANT}}; mock_xmit_log_verbose = 1; - homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); EXPECT_EQ(0, homa_metrics_per_cpu()->unknown_rpcs); } TEST_F(homa_incoming, homa_dispatch_pkts__cutoffs_for_unknown_client_rpc) @@ -1041,8 +1039,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__cutoffs_for_unknown_client_rpc) .cutoff_version = 400}; struct homa_peer *peer; - homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); peer = homa_peer_find(&self->hsk, self->server_ip); ASSERT_FALSE(IS_ERR(peer)); EXPECT_EQ(400, peer->cutoff_version); @@ -1063,8 +1060,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__resend_for_unknown_server_rpc) .offset = 0, .length = 2000}; #endif /* See strip.py */ - homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); EXPECT_STREQ("xmit RPC_UNKNOWN", unit_log_get()); } #ifndef __STRIP__ /* See strip.py */ @@ -1086,8 +1082,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__reset_counters) unit_log_clear(); crpc->silent_ticks = 5; crpc->peer->outstanding_resends = 2; - homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); EXPECT_EQ(0, crpc->silent_ticks); EXPECT_EQ(0, crpc->peer->outstanding_resends); @@ -1095,8 +1090,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__reset_counters) h.common.type = CUTOFFS; crpc->silent_ticks = 5; crpc->peer->outstanding_resends = 2; - homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); EXPECT_EQ(5, crpc->silent_ticks); EXPECT_EQ(0, crpc->peer->outstanding_resends); } @@ -1123,7 +1117,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__multiple_ack_packets) skb2->next = skb3; unit_log_clear(); - homa_dispatch_pkts(skb, &self->homa); + homa_dispatch_pkts(skb); EXPECT_SUBSTR("ack 1239", unit_log_get()); } TEST_F(homa_incoming, homa_dispatch_pkts__unknown_type) @@ -1141,7 +1135,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_type) struct homa_common_hdr h = {.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = 99}; - homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h, 0, 0), &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h, 0, 0)); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->unknown_packet_types); #endif /* See strip.py */ @@ -1159,7 +1153,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__handle_ack) self->data.common.sender_id = 
cpu_to_be64(self->client_id+10); unit_log_clear(); homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, - 1400, 0), &self->homa); + 1400, 0)); EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc)); EXPECT_SUBSTR("ack 1235", unit_log_get()); } @@ -1179,7 +1173,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__too_many_acks) skb3 = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); skb->next = skb2; skb2->next = skb3; - homa_dispatch_pkts(skb, &self->homa); + homa_dispatch_pkts(skb); EXPECT_STREQ("sk->sk_data_ready invoked; ack 1237; ack 1235", unit_log_get()); } @@ -1190,7 +1184,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__invoke_homa_grant_check_rpc) self->data.incoming = htonl(1000); self->data.message_length = htonl(20000); homa_dispatch_pkts(mock_skb_new(self->server_ip, &self->data.common, - 0, 0), &self->homa); + 0, 0)); unit_log_clear(); unit_log_grantables(&self->homa); EXPECT_SUBSTR("id 1235", unit_log_get()); @@ -1220,7 +1214,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__forced_reap) /* First packet: below the threshold for reaps. */ self->data.common.dport = htons(self->hsk.port); homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, - 1400, 0), &self->homa); + 1400, 0)); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(31, self->hsk.dead_skbs); #else /* See strip.py */ @@ -1234,7 +1228,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__forced_reap) self->homa.dead_buffs_limit = 15; self->homa.reap_limit = 10; homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, - 1400, 0), &self->homa); + 1400, 0)); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(21, self->hsk.dead_skbs); #else /* See strip.py */ @@ -1414,7 +1408,7 @@ TEST_F(homa_incoming, homa_data_pkt__send_cutoffs) self->data.message_length = htonl(5000); mock_xmit_log_verbose = 1; homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, - 1400, 0), &self->homa); + 1400, 0)); EXPECT_SUBSTR("cutoffs 19 18 17 16 15 14 13 12, version 2", unit_log_get()); @@ -1425,7 +1419,7 @@ TEST_F(homa_incoming, homa_data_pkt__send_cutoffs) self->homa.cutoff_version = 3; self->data.seg.offset = 1400; homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, - 1400, 0), &self->homa); + 1400, 0)); EXPECT_STREQ("", unit_log_get()); } TEST_F(homa_incoming, homa_data_pkt__cutoffs_up_to_date) @@ -1433,7 +1427,7 @@ TEST_F(homa_incoming, homa_data_pkt__cutoffs_up_to_date) self->homa.cutoff_version = 123; self->data.cutoff_version = htons(123); homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, - 1400, 0), &self->homa); + 1400, 0)); EXPECT_STREQ("sk->sk_data_ready invoked", unit_log_get()); } @@ -1456,16 +1450,14 @@ TEST_F(homa_incoming, homa_grant_pkt__basics) homa_rpc_unlock(srpc); unit_log_clear(); - homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); EXPECT_EQ(11000, srpc->msgout.granted); EXPECT_STREQ("xmit DATA 1400@10000", unit_log_get()); /* Don't let grant offset go backwards. 
*/ h.offset = htonl(10000); unit_log_clear(); - homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); EXPECT_EQ(11000, srpc->msgout.granted); EXPECT_STREQ("", unit_log_get()); @@ -1473,8 +1465,7 @@ TEST_F(homa_incoming, homa_grant_pkt__basics) h.offset = htonl(20000); srpc->state = RPC_INCOMING; unit_log_clear(); - homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); EXPECT_EQ(11000, srpc->msgout.granted); EXPECT_STREQ("", unit_log_get()); @@ -1502,8 +1493,7 @@ TEST_F(homa_incoming, homa_grant_pkt__reset) EXPECT_EQ(10000, srpc->msgout.granted); EXPECT_EQ(10000, srpc->msgout.next_xmit_offset); - homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); EXPECT_EQ(10000, srpc->msgout.granted); EXPECT_EQ(10000, srpc->msgout.next_xmit_offset); EXPECT_STREQ("xmit DATA retrans 1400@0; " @@ -1529,8 +1519,7 @@ TEST_F(homa_incoming, homa_grant_pkt__grant_past_end_of_message) ASSERT_NE(NULL, crpc); unit_log_clear(); - homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); EXPECT_EQ(20000, crpc->msgout.granted); } #endif /* See strip.py */ @@ -1544,8 +1533,7 @@ TEST_F(homa_incoming, homa_resend_pkt__unknown_rpc) .offset = htonl(100), .length = htonl(200)}; - homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); EXPECT_STREQ("xmit RPC_UNKNOWN", unit_log_get()); } TEST_F(homa_incoming, homa_resend_pkt__rpc_in_service_server_sends_busy) @@ -1563,8 +1551,7 @@ TEST_F(homa_incoming, homa_resend_pkt__rpc_in_service_server_sends_busy) ASSERT_NE(NULL, srpc); unit_log_clear(); - homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); EXPECT_STREQ("xmit BUSY", unit_log_get()); } TEST_F(homa_incoming, homa_resend_pkt__rpc_incoming_server_sends_busy) @@ -1588,8 +1575,7 @@ TEST_F(homa_incoming, homa_resend_pkt__rpc_incoming_server_sends_busy) #endif /* See strip.py */ unit_log_clear(); - homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); // The server might send a GRANT right after BUSY so just check substr EXPECT_SUBSTR("xmit BUSY", unit_log_get()); } @@ -1611,8 +1597,7 @@ TEST_F(homa_incoming, homa_resend_pkt__client_not_outgoing) ASSERT_NE(NULL, crpc); unit_log_clear(); - homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); EXPECT_STREQ("xmit DATA retrans 1400@0", unit_log_get()); } #ifndef __STRIP__ /* See strip.py */ @@ -1631,8 +1616,7 @@ TEST_F(homa_incoming, homa_resend_pkt__send_busy_instead_of_data) ASSERT_NE(NULL, crpc); unit_log_clear(); - homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); EXPECT_SUBSTR("xmit BUSY", unit_log_get()); } #endif /* See strip.py */ @@ -1660,8 +1644,7 @@ TEST_F(homa_incoming, homa_resend_pkt__client_send_data) unit_log_clear(); mock_clear_xmit_prios(); - 
homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); EXPECT_SUBSTR("xmit DATA retrans 1400@0", unit_log_get()); #ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("3", mock_xmit_prios); @@ -1691,8 +1674,7 @@ TEST_F(homa_incoming, homa_resend_pkt__server_send_data) unit_log_clear(); mock_clear_xmit_prios(); - homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); EXPECT_STREQ("xmit DATA retrans 1400@0; " "xmit DATA retrans 1400@1400", unit_log_get()); #ifndef __STRIP__ /* See strip.py */ @@ -1717,8 +1699,7 @@ TEST_F(homa_incoming, homa_unknown_pkt__client_resend_all) unit_log_clear(); mock_xmit_log_verbose = 1; - homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); #ifndef __STRIP__ /* See strip.py */ EXPECT_SUBSTR("xmit DATA from 0.0.0.0:32768, dport 99, id 1234, message_length 2000, offset 0, data_length 1400, incoming 2000, RETRANSMIT; " "xmit DATA from 0.0.0.0:32768, dport 99, id 1234, message_length 2000, offset 1400, data_length 600, incoming 2000, RETRANSMIT", @@ -1750,8 +1731,7 @@ TEST_F(homa_incoming, homa_unknown_pkt__client_resend_part) unit_log_clear(); mock_xmit_log_verbose = 1; - homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); #ifndef __STRIP__ /* See strip.py */ EXPECT_SUBSTR("xmit DATA from 0.0.0.0:32768, dport 99, id 1234, message_length 2000, offset 0, data_length 1400, incoming 1400, RETRANSMIT", unit_log_get()); @@ -1774,8 +1754,7 @@ TEST_F(homa_incoming, homa_unknown_pkt__free_server_rpc) ASSERT_NE(NULL, srpc); unit_log_clear(); - homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc)); } @@ -1797,8 +1776,7 @@ TEST_F(homa_incoming, homa_cutoffs_pkt_basics) EXPECT_EQ(10000, crpc->msgout.granted); unit_log_clear(); - homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); EXPECT_EQ(400, crpc->peer->cutoff_version); EXPECT_EQ(9, crpc->peer->unsched_cutoffs[1]); EXPECT_EQ(3, crpc->peer->unsched_cutoffs[7]); @@ -1839,8 +1817,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_fully_received) ASSERT_NE(NULL, crpc); unit_log_clear(); mock_xmit_log_verbose = 1; - homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); EXPECT_STREQ("xmit ACK from 0.0.0.0:32768, dport 99, id 1234, acks", unit_log_get()); #ifndef __STRIP__ /* See strip.py */ @@ -1862,8 +1839,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_not_fully_received) ASSERT_NE(NULL, crpc); unit_log_clear(); mock_xmit_log_verbose = 1; - homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); EXPECT_STREQ("", unit_log_get()); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->packets_received[ @@ -1884,8 +1860,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_not_incoming) ASSERT_NE(NULL, crpc); unit_log_clear(); 
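 	/* The RPC still exists but is not in RPC_INCOMING state, so the
 	 * test expects the NEED_ACK dispatched below to be ignored (no
 	 * packet transmitted, hence the empty-log check).
 	 */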
mock_xmit_log_verbose = 1; - homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); EXPECT_STREQ("", unit_log_get()); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->packets_received[ @@ -1905,8 +1880,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_doesnt_exist) peer->acks[0].client_id = cpu_to_be64(self->client_id+2); peer->num_acks = 1; mock_xmit_log_verbose = 1; - homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); EXPECT_STREQ("xmit ACK from 0.0.0.0:32768, dport 99, id 1234, acks [sp 99, id 1236]", unit_log_get()); homa_peer_put(peer); @@ -1928,8 +1902,7 @@ TEST_F(homa_incoming, homa_ack_pkt__target_rpc_exists_no_extras) EXPECT_EQ(1, unit_list_length(&self->hsk2.active_rpcs)); unit_log_clear(); mock_xmit_log_verbose = 1; - homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); EXPECT_EQ(0, unit_list_length(&self->hsk2.active_rpcs)); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->packets_received[ACK - DATA]); @@ -1963,8 +1936,7 @@ TEST_F(homa_incoming, homa_ack_pkt__target_rpc_exists_plus_extras) .client_id = cpu_to_be64(self->server_id+1)}; h.acks[1] = (struct homa_ack) {.server_port = htons(self->server_port), .client_id = cpu_to_be64(self->server_id+3)}; - homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); EXPECT_EQ(0, unit_list_length(&self->hsk2.active_rpcs)); EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc1)); EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc2)); @@ -1994,8 +1966,7 @@ TEST_F(homa_incoming, homa_ack_pkt__target_rpc_doesnt_exist) .client_id = cpu_to_be64(self->server_id+5)}; h.acks[1] = (struct homa_ack) {.server_port = htons(self->server_port), .client_id = cpu_to_be64(self->server_id+1)}; - homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0), - &self->homa); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); EXPECT_EQ(1, unit_list_length(&self->hsk2.active_rpcs)); EXPECT_STREQ("OUTGOING", homa_symbol_for_state(srpc1)); EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc2)); diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index a244129a..cd7a1610 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -641,7 +641,7 @@ TEST_F(homa_outgoing, homa_xmit_control__server_request) struct homa_busy_hdr h; struct homa_rpc *srpc; - homa_sock_bind(self->homa.port_map, &self->hsk, self->server_port); + homa_sock_bind(self->hnet, &self->hsk, self->server_port); srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, self->server_id, 10000, 10000); diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 494ac772..96a3f0be 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -69,7 +69,7 @@ FIXTURE_SETUP(homa_plumbing) self->server_addr.in4.sin_addr.s_addr = ipv6_to_ipv4(self->server_addr.in6.sin6_addr); } - homa_sock_bind(self->homa.port_map, &self->hsk, self->server_port); + homa_sock_bind(self->hnet, &self->hsk, self->server_port); memset(&self->data, 0, sizeof(self->data)); self->data = (struct homa_data_hdr){.common = { .sport = htons(self->client_port), diff 
--git a/test/unit_homa_sock.c b/test/unit_homa_sock.c index 12e8a12a..d1c43fb4 100644 --- a/test/unit_homa_sock.c +++ b/test/unit_homa_sock.c @@ -55,11 +55,26 @@ FIXTURE_TEARDOWN(homa_sock) unit_teardown(); } -TEST_F(homa_sock, homa_port_hash) +TEST_F(homa_sock, homa_socktab_destroy) { - EXPECT_EQ(1023, homa_port_hash(0xffff)); - EXPECT_EQ(18, homa_port_hash(0x6012)); - EXPECT_EQ(99, homa_port_hash(99)); + struct homa_sock hsk1, hsk2, hsk3; + struct homa_net *hnet; + + hnet = mock_alloc_hnet(&self->homa); + mock_sock_init(&hsk1, hnet, 100); + mock_sock_init(&hsk2, hnet, 101); + mock_sock_init(&hsk3, self->hnet, 100); + EXPECT_EQ(0, hsk1.shutdown); + EXPECT_EQ(0, hsk2.shutdown); + EXPECT_EQ(0, hsk3.shutdown); + + homa_socktab_destroy(self->homa.socktab, hnet); + EXPECT_EQ(1, hsk1.shutdown); + EXPECT_EQ(1, hsk2.shutdown); + EXPECT_EQ(0, hsk3.shutdown); + + homa_socktab_destroy(self->homa.socktab, NULL); + EXPECT_EQ(1, hsk3.shutdown); } TEST_F(homa_sock, homa_socktab_start_scan) @@ -69,7 +84,7 @@ TEST_F(homa_sock, homa_socktab_start_scan) homa_destroy(&self->homa); homa_init(&self->homa); mock_sock_init(&self->hsk, self->hnet, HOMA_MIN_DEFAULT_PORT+100); - EXPECT_EQ(&self->hsk, homa_socktab_start_scan(self->homa.port_map, + EXPECT_EQ(&self->hsk, homa_socktab_start_scan(self->homa.socktab, &scan)); EXPECT_EQ(100, scan.current_bucket); EXPECT_EQ(1, mock_sock_holds); @@ -88,7 +103,7 @@ TEST_F(homa_sock, homa_socktab_next) mock_sock_init(&hsk2, self->hnet, first_port+HOMA_SOCKTAB_BUCKETS); mock_sock_init(&hsk3, self->hnet, first_port+2*HOMA_SOCKTAB_BUCKETS); mock_sock_init(&hsk4, self->hnet, first_port+5); - hsk = homa_socktab_start_scan(self->homa.port_map, &scan); + hsk = homa_socktab_start_scan(self->homa.socktab, &scan); EXPECT_EQ(first_port+2*HOMA_SOCKTAB_BUCKETS, hsk->port); EXPECT_EQ(1, mock_sock_holds); hsk = homa_socktab_next(&scan); @@ -117,9 +132,9 @@ TEST_F(homa_sock, homa_socktab_end_scan) homa_destroy(&self->homa); homa_init(&self->homa); mock_sock_init(&self->hsk, self->hnet, HOMA_MIN_DEFAULT_PORT+100); - homa_socktab_start_scan(self->homa.port_map, &scan1); - homa_socktab_start_scan(self->homa.port_map, &scan2); - homa_socktab_start_scan(self->homa.port_map, &scan3); + homa_socktab_start_scan(self->homa.socktab, &scan1); + homa_socktab_start_scan(self->homa.socktab, &scan2); + homa_socktab_start_scan(self->homa.socktab, &scan3); EXPECT_EQ(3, mock_sock_holds); homa_socktab_next(&scan2); EXPECT_EQ(2, mock_sock_holds); @@ -143,7 +158,7 @@ TEST_F(homa_sock, homa_sock_init__skip_port_in_use) { struct homa_sock hsk2, hsk3; - self->homa.prev_default_port = 0xfffe; + self->hnet->prev_default_port = 0xfffe; mock_sock_init(&hsk2, self->hnet, 0); mock_sock_init(&hsk3, self->hnet, 0); EXPECT_EQ(65535, hsk2.port); @@ -201,26 +216,26 @@ TEST_F(homa_sock, homa_sock_unlink__remove_from_map) int client2, client3; mock_sock_init(&hsk2, self->hnet, 0); - EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk2, 100)); + EXPECT_EQ(0, homa_sock_bind(self->hnet, &hsk2, 100)); client2 = hsk2.port; mock_sock_init(&hsk3, self->hnet, 0); client3 = hsk3.port; - EXPECT_EQ(&hsk2, homa_sock_find(self->homa.port_map, client2)); - EXPECT_EQ(&hsk3, homa_sock_find(self->homa.port_map, client3)); + EXPECT_EQ(&hsk2, homa_sock_find(self->hnet, client2)); + EXPECT_EQ(&hsk3, homa_sock_find(self->hnet, client3)); sock_put(&hsk2.sock); sock_put(&hsk3.sock); homa_sock_shutdown(&hsk2); - EXPECT_EQ(NULL, homa_sock_find(self->homa.port_map, client2)); - EXPECT_EQ(&hsk3, homa_sock_find(self->homa.port_map, client3)); + 
EXPECT_EQ(NULL, homa_sock_find(self->hnet, client2)); + EXPECT_EQ(&hsk3, homa_sock_find(self->hnet, client3)); sock_put(&hsk3.sock); homa_sock_shutdown(&hsk3); - EXPECT_EQ(NULL, homa_sock_find(self->homa.port_map, client2)); - EXPECT_EQ(NULL, homa_sock_find(self->homa.port_map, client3)); + EXPECT_EQ(NULL, homa_sock_find(self->hnet, client2)); + EXPECT_EQ(NULL, homa_sock_find(self->hnet, client3)); } TEST_F(homa_sock, homa_sock_shutdown__unlink_socket) @@ -229,13 +244,13 @@ TEST_F(homa_sock, homa_sock_shutdown__unlink_socket) int client; mock_sock_init(&hsk, self->hnet, 0); - EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk, 100)); + EXPECT_EQ(0, homa_sock_bind(self->hnet, &hsk, 100)); client = hsk.port; - EXPECT_EQ(&hsk, homa_sock_find(self->homa.port_map, client)); + EXPECT_EQ(&hsk, homa_sock_find(self->hnet, client)); sock_put(&hsk.sock); homa_sock_shutdown(&hsk); - EXPECT_EQ(NULL, homa_sock_find(self->homa.port_map, client)); + EXPECT_EQ(NULL, homa_sock_find(self->hnet, client)); } TEST_F(homa_sock, homa_sock_shutdown__already_shutdown) { @@ -286,35 +301,30 @@ TEST_F(homa_sock, homa_sock_bind) struct homa_sock hsk2; mock_sock_init(&hsk2, self->hnet, 0); - EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk2, 100)); + EXPECT_EQ(0, homa_sock_bind(self->hnet, &hsk2, 100)); - EXPECT_EQ(0, -homa_sock_bind(self->homa.port_map, &self->hsk, 0)); + EXPECT_EQ(0, -homa_sock_bind(self->hnet, &self->hsk, 0)); EXPECT_EQ(HOMA_MIN_DEFAULT_PORT, self->hsk.port); - EXPECT_EQ(EINVAL, -homa_sock_bind(self->homa.port_map, &self->hsk, + EXPECT_EQ(EINVAL, -homa_sock_bind(self->hnet, &self->hsk, HOMA_MIN_DEFAULT_PORT + 100)); - EXPECT_EQ(EADDRINUSE, -homa_sock_bind(self->homa.port_map, &self->hsk, - 100)); - EXPECT_EQ(0, -homa_sock_bind(self->homa.port_map, &hsk2, - 100)); + EXPECT_EQ(EADDRINUSE, -homa_sock_bind(self->hnet, &self->hsk, 100)); + EXPECT_EQ(0, -homa_sock_bind(self->hnet, &hsk2, 100)); - EXPECT_EQ(0, -homa_sock_bind(self->homa.port_map, &self->hsk, - 110)); + EXPECT_EQ(0, -homa_sock_bind(self->hnet, &self->hsk, 110)); - EXPECT_EQ(&self->hsk, homa_sock_find(self->homa.port_map, 110)); + EXPECT_EQ(&self->hsk, homa_sock_find(self->hnet, 110)); sock_put(&self->hsk.sock); - EXPECT_EQ(0, -homa_sock_bind(self->homa.port_map, &self->hsk, - 120)); - EXPECT_EQ(NULL, homa_sock_find(self->homa.port_map, 110)); - EXPECT_EQ(&self->hsk, homa_sock_find(self->homa.port_map, 120)); + EXPECT_EQ(0, -homa_sock_bind(self->hnet, &self->hsk, 120)); + EXPECT_EQ(NULL, homa_sock_find(self->hnet, 110)); + EXPECT_EQ(&self->hsk, homa_sock_find(self->hnet, 120)); sock_put(&self->hsk.sock); homa_sock_destroy(&hsk2); } TEST_F(homa_sock, homa_sock_bind__socket_shutdown) { homa_sock_shutdown(&self->hsk); - EXPECT_EQ(ESHUTDOWN, -homa_sock_bind(self->homa.port_map, &self->hsk, - 100)); + EXPECT_EQ(ESHUTDOWN, -homa_sock_bind(self->hnet, &self->hsk, 100)); } TEST_F(homa_sock, homa_sock_find__basics) @@ -322,15 +332,32 @@ TEST_F(homa_sock, homa_sock_find__basics) struct homa_sock hsk2; mock_sock_init(&hsk2, self->hnet, 0); - EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk2, 100)); - EXPECT_EQ(&self->hsk, homa_sock_find(self->homa.port_map, - self->hsk.port)); + EXPECT_EQ(0, homa_sock_bind(self->hnet, &hsk2, 100)); + EXPECT_EQ(&self->hsk, homa_sock_find(self->hnet, self->hsk.port)); sock_put(&self->hsk.sock); - EXPECT_EQ(&hsk2, homa_sock_find(self->homa.port_map, - hsk2.port)); + EXPECT_EQ(&hsk2, homa_sock_find(self->hnet, hsk2.port)); + sock_put(&hsk2.sock); + EXPECT_EQ(NULL, homa_sock_find(self->hnet, hsk2.port + 1)); + 
homa_sock_destroy(&hsk2); +} +TEST_F(homa_sock, homa_sock_find__same_port_in_different_hnets) +{ + struct homa_sock hsk1, hsk2; + struct homa_sock *hsk; + struct homa_net *hnet; + + hnet = mock_alloc_hnet(&self->homa); + mock_sock_init(&hsk1, self->hnet, 100); + mock_sock_init(&hsk2, hnet, 100); + + hsk = homa_sock_find(self->hnet, 100); + EXPECT_EQ(&hsk1, hsk); + hsk = homa_sock_find(hnet, 100); + EXPECT_EQ(&hsk2, hsk); + + sock_put(&hsk1.sock); sock_put(&hsk2.sock); - EXPECT_EQ(NULL, homa_sock_find(self->homa.port_map, - hsk2.port + 1)); + homa_sock_destroy(&hsk1); homa_sock_destroy(&hsk2); } @@ -338,27 +365,25 @@ TEST_F(homa_sock, homa_sock_find__long_hash_chain) { struct homa_sock hsk2, hsk3, hsk4; - EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &self->hsk, 13)); + EXPECT_EQ(0, homa_sock_bind(self->hnet, &self->hsk, 13)); mock_sock_init(&hsk2, self->hnet, 0); - EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk2, + EXPECT_EQ(0, homa_sock_bind(self->hnet, &hsk2, 2*HOMA_SOCKTAB_BUCKETS + 13)); mock_sock_init(&hsk3, self->hnet, 0); - EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk3, + EXPECT_EQ(0, homa_sock_bind(self->hnet, &hsk3, 3*HOMA_SOCKTAB_BUCKETS + 13)); mock_sock_init(&hsk4, self->hnet, 0); - EXPECT_EQ(0, homa_sock_bind(self->homa.port_map, &hsk4, + EXPECT_EQ(0, homa_sock_bind(self->hnet, &hsk4, 5*HOMA_SOCKTAB_BUCKETS + 13)); - EXPECT_EQ(&self->hsk, homa_sock_find(self->homa.port_map, - 13)); + EXPECT_EQ(&self->hsk, homa_sock_find(self->hnet, 13)); sock_put(&self->hsk.sock); - EXPECT_EQ(&hsk2, homa_sock_find(self->homa.port_map, - 2*HOMA_SOCKTAB_BUCKETS + 13)); + EXPECT_EQ(&hsk2, homa_sock_find(self->hnet, 2*HOMA_SOCKTAB_BUCKETS + 13)); sock_put(&hsk2.sock); - EXPECT_EQ(&hsk3, homa_sock_find(self->homa.port_map, + EXPECT_EQ(&hsk3, homa_sock_find(self->hnet, 3*HOMA_SOCKTAB_BUCKETS + 13)); sock_put(&hsk3.sock); - EXPECT_EQ(&hsk4, homa_sock_find(self->homa.port_map, + EXPECT_EQ(&hsk4, homa_sock_find(self->hnet, 5*HOMA_SOCKTAB_BUCKETS + 13)); sock_put(&hsk4.sock); diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index c7446501..c924bd41 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -103,9 +103,9 @@ TEST_F(homa_utils, homa_init__cant_allocate_port_map) mock_kmalloc_errors = 0x10; unit_log_clear(); EXPECT_EQ(ENOMEM, -homa_init(&homa2)); - EXPECT_SUBSTR("homa_init couldn't create port_map: kmalloc failure", + EXPECT_SUBSTR("homa_init couldn't create socktab: kmalloc failure", mock_printk_output); - EXPECT_EQ(NULL, homa2.port_map); + EXPECT_EQ(NULL, homa2.socktab); homa_destroy(&homa2); } #ifndef __STRIP__ /* See strip.py */ @@ -129,6 +129,47 @@ TEST_F(homa_utils, homa_destroy) homa_destroy(&homa2); } +TEST_F(homa_utils, homa_net_destroy__delete_sockets) +{ + struct homa_sock hsk1, hsk2, hsk3; + struct homa_net *hnet; + + hnet = mock_alloc_hnet(&self->homa); + mock_sock_init(&hsk1, hnet, 100); + mock_sock_init(&hsk2, hnet, 101); + mock_sock_init(&hsk3, self->hnet, 100); + + homa_net_destroy(hnet); + EXPECT_EQ(1, hsk1.shutdown); + EXPECT_EQ(1, hsk2.shutdown); + EXPECT_EQ(0, hsk3.shutdown); + + homa_sock_destroy(&hsk3); +} +TEST_F(homa_utils, homa_net_destroy__delete_peers) +{ + struct homa_peer *peer; + struct homa_net *hnet; + struct homa_sock hsk2; + struct in6_addr addr; + + hnet = mock_alloc_hnet(&self->homa); + mock_sock_init(&hsk2, hnet, 44); + + addr = unit_get_in_addr("1.2.3.4"); + peer = homa_peer_find(&hsk2, &addr); + homa_peer_put(peer); + peer = homa_peer_find(&self->hsk, &addr); + homa_peer_put(peer); + addr = 
unit_get_in_addr("1.2.3.5"); + peer = homa_peer_find(&hsk2, &addr); + homa_peer_put(peer); + EXPECT_EQ(3, unit_count_peers(&self->homa)); + + homa_net_destroy(hnet); + EXPECT_EQ(1, unit_count_peers(&self->homa)); +} + #ifndef __STRIP__ /* See strip.py */ TEST_F(homa_utils, homa_print_ipv4_addr) { diff --git a/test/utils.c b/test/utils.c index c3d366c9..44151913 100644 --- a/test/utils.c +++ b/test/utils.c @@ -75,8 +75,7 @@ struct homa_rpc *unit_client_rpc(struct homa_sock *hsk, this_size = (resp_length > UNIT_TEST_DATA_PER_PACKET) ? UNIT_TEST_DATA_PER_PACKET : resp_length; - homa_dispatch_pkts(mock_skb_alloc(server_ip, &h.common, this_size, 0), - hsk->homa); + homa_dispatch_pkts(mock_skb_alloc(server_ip, &h.common, this_size, 0)); if (state == UNIT_RCVD_ONE_PKT) return crpc; for (bytes_received = UNIT_TEST_DATA_PER_PACKET; @@ -87,7 +86,7 @@ struct homa_rpc *unit_client_rpc(struct homa_sock *hsk, this_size = UNIT_TEST_DATA_PER_PACKET; h.seg.offset = htonl(bytes_received); homa_dispatch_pkts(mock_skb_alloc(server_ip, &h.common, - this_size, 0), hsk->homa); + this_size, 0)); } if (state == UNIT_RCVD_MSG) return crpc; @@ -387,8 +386,7 @@ struct homa_rpc *unit_server_rpc(struct homa_sock *hsk, homa_rpc_unlock(srpc); homa_dispatch_pkts(mock_skb_alloc(client_ip, &h.common, (req_length > UNIT_TEST_DATA_PER_PACKET) - ? UNIT_TEST_DATA_PER_PACKET : req_length, 0), - hsk->homa); + ? UNIT_TEST_DATA_PER_PACKET : req_length, 0)); if (state == UNIT_RCVD_ONE_PKT) return srpc; for (bytes_received = UNIT_TEST_DATA_PER_PACKET; @@ -400,7 +398,7 @@ struct homa_rpc *unit_server_rpc(struct homa_sock *hsk, this_size = UNIT_TEST_DATA_PER_PACKET; h.seg.offset = htonl(bytes_received); homa_dispatch_pkts(mock_skb_alloc(client_ip, &h.common, - this_size, 0), hsk->homa); + this_size, 0)); } if (state == UNIT_RCVD_MSG) return srpc; From 001fce120fdf1870989de5125053b90049037bfa Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 20 May 2025 16:05:09 -0700 Subject: [PATCH 321/625] Fix race in timetrace deletion Timetrace buffers could get deleted while other threads (such as timer) were still calling tt_record. 
--- homa_plumbing.c | 6 +++--- timetrace.c | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index cc5f6069..f7b6ecb7 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -590,9 +590,6 @@ void __exit homa_unload(void) pr_notice("Homa module unloading\n"); -#ifndef __UPSTREAM__ /* See strip.py */ - tt_destroy(); -#endif /* See strip.py */ #ifndef __STRIP__ /* See strip.py */ homa_gro_unhook_tcp(); if (timer_kthread) { @@ -613,6 +610,9 @@ void __exit homa_unload(void) inet6_unregister_protosw(&homav6_protosw); proto_unregister(&homa_prot); proto_unregister(&homav6_prot); +#ifndef __UPSTREAM__ /* See strip.py */ + tt_destroy(); +#endif /* See strip.py */ } module_init(homa_load); diff --git a/timetrace.c b/timetrace.c index 793234da..9916f5b1 100644 --- a/timetrace.c +++ b/timetrace.c @@ -141,11 +141,6 @@ int tt_init(char *proc_file) tt_dir_entry = NULL; } - mutex_init(&tt_mutex); - atomic_set(&tt_freeze_count, 0); - atomic_set(&tt_frozen, 0); - init = true; - #ifdef TT_KERNEL for (i = 0; i < nr_cpu_ids; i++) tt_linux_buffers[i] = tt_buffers[i]; @@ -160,6 +155,11 @@ int tt_init(char *proc_file) memset(tt_debug_int64, 0, sizeof(tt_debug_int64)); #endif + mutex_init(&tt_mutex); + atomic_set(&tt_frozen, 0); + atomic_set(&tt_freeze_count, 0); + init = true; + return 0; error: @@ -191,6 +191,7 @@ void tt_destroy(void) { int i; + tt_freeze_count.counter = 1; mutex_lock(&tt_mutex); if (init) { init = false; @@ -201,7 +202,6 @@ void tt_destroy(void) kfree(tt_buffers[i]); tt_buffers[i] = NULL; } - tt_freeze_count.counter = 1; #ifdef TT_KERNEL tt_linux_record = ltt_record_nop; From c72382668b480d014d5e5b24bdb3aa1a5cf16416 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 20 May 2025 14:12:57 -0700 Subject: [PATCH 322/625] Add limits on peer usage There is now a mechanism to free homa_peer structs if too many of them accumulate. --- homa_devel.c | 2 +- homa_impl.h | 7 +- homa_metrics.c | 4 +- homa_metrics.h | 4 +- homa_peer.c | 396 +++++++++++++++++++++++++++++++-- homa_peer.h | 137 +++++++++++- homa_peer_old.c | 2 +- homa_timer.c | 1 + homa_utils.c | 14 +- man/homa.7 | 35 +++ test/mock.c | 58 +++-- test/mock.h | 3 + test/unit_homa_peer.c | 484 ++++++++++++++++++++++++++++++++++++++++- test/unit_homa_utils.c | 4 +- test/utils.c | 17 +- test/utils.h | 1 + 16 files changed, 1099 insertions(+), 70 deletions(-) diff --git a/homa_devel.c b/homa_devel.c index 93ac9c1f..75132653 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -402,7 +402,7 @@ void homa_freeze_peers() IF_NO_STRIP(freeze.common.urgent = htons(HOMA_TCP_URGENT)); freeze.common.sender_id = 0; - rhashtable_walk_enter(&hnet->homa->peers->ht, &iter); + rhashtable_walk_enter(&hnet->homa->peertab->ht, &iter); rhashtable_walk_start(&iter); while (true) { peer = rhashtable_walk_next(&iter); diff --git a/homa_impl.h b/homa_impl.h index 15bbd029..3571bee9 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -132,7 +132,7 @@ struct homa { * @peers: Info about all the other hosts we have communicated with; * includes peers from all network namespaces. */ - struct homa_peertab *peers; + struct homa_peertab *peertab; /** * @socktab: Information about all open sockets. Dynamically @@ -497,6 +497,11 @@ struct homa_net { * the range of default ports. */ __u16 prev_default_port; + + /* @num_peers: The total number of struct homa_peers that exist + * for this namespace. Managed by homa_peer.c under the peertab lock. 
+ */ + int num_peers; }; /** diff --git a/homa_metrics.c b/homa_metrics.c index 84569578..9e65ee77 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -261,8 +261,8 @@ char *homa_metrics_print(void) m->throttled_cycles); M("resent_packets %15llu DATA packets sent in response to RESENDs\n", m->resent_packets); - M("peer_new_entries %15llu New entries created in peer table\n", - m->peer_new_entries); + M("peer_allocs %15llu New entries created in peer table\n", + m->peer_allocs); M("peer_kmalloc_errors %15llu kmalloc failures creating peer table entries\n", m->peer_kmalloc_errors); M("peer_route_errors %15llu Routing failures creating peer table entries\n", diff --git a/homa_metrics.h b/homa_metrics.h index 228ec5c2..33f79666 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -305,11 +305,11 @@ struct homa_metrics { u64 resent_packets; /** - * @peer_new_entries: total # of new entries created in Homa's + * @peer_allocs: total # of new entries created in Homa's * peer table (this value doesn't increment if the desired peer is * found in the entry in its hash chain). */ - u64 peer_new_entries; + u64 peer_allocs; /** * @peer_kmalloc_errors: total number of times homa_peer_find diff --git a/homa_peer.c b/homa_peer.c index 56adafe2..1feb94b2 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -14,6 +14,9 @@ #undef rhashtable_lookup_get_insert_fast #define rhashtable_lookup_get_insert_fast mock_rht_lookup_get_insert_fast + +#undef rhashtable_walk_next +#define rhashtable_walk_next mock_rht_walk_next #endif /* __UNIT_TEST__ */ const struct rhashtable_params ht_params = { @@ -25,6 +28,44 @@ const struct rhashtable_params ht_params = { .obj_cmpfn = homa_peer_compare }; +#ifndef __STRIP__ /* See strip.py */ +/* Used to enable sysctl access to peertab-specific configuration parameters. + * The @data fields are actually offsets within a struct homa_peertab; these + * are converted to pointers into a struct peertab later. + */ +#define OFFSET(field) ((void *)offsetof(struct homa_peertab, field)) +static struct ctl_table peer_ctl_table[] = { + { + .procname = "peer_gc_threshold", + .data = OFFSET(gc_threshold), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_peer_dointvec + }, + { + .procname = "peer_idle_secs_min", + .data = OFFSET(idle_secs_min), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_peer_dointvec + }, + { + .procname = "peer_idle_secs_max", + .data = OFFSET(idle_secs_max), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_peer_dointvec + }, + { + .procname = "peer_net_max", + .data = OFFSET(net_max), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_peer_dointvec + }, +}; +#endif /* See strip.py */ + /** * homa_peertab_alloc() - Allocate and initialize a homa_peertab. 
* @@ -36,9 +77,9 @@ struct homa_peertab *homa_peertab_alloc(void) struct homa_peertab *peertab; int err; - peertab = kmalloc(sizeof(*peertab), GFP_KERNEL); + peertab = kmalloc(sizeof(*peertab), GFP_KERNEL | __GFP_ZERO); if (!peertab) { - pr_err("%s couldn't create peers: kmalloc failure", __func__); + pr_err("%s couldn't create peertab: kmalloc failure", __func__); return ERR_PTR(-ENOMEM); } @@ -47,20 +88,47 @@ struct homa_peertab *homa_peertab_alloc(void) kfree(peertab); return ERR_PTR(err); } + peertab->ht_valid = true; + rhashtable_walk_enter(&peertab->ht, &peertab->ht_iter); + INIT_LIST_HEAD(&peertab->dead_peers); + peertab->gc_threshold = 5000; + peertab->net_max = 10000; + peertab->idle_secs_min = 10; + peertab->idle_secs_max = 120; + +#ifndef __STRIP__ /* See strip.py */ + peertab->sysctl_header = register_net_sysctl(&init_net, "net/homa", + peer_ctl_table); + if (!peertab->sysctl_header) { + err = -ENOMEM; + pr_err("couldn't register sysctl parameters for Homa peertab\n"); + goto error; + } +#endif /* See strip.py */ + homa_peer_update_sysctl_deps(peertab); return peertab; + +error: + homa_peertab_free(peertab); + return ERR_PTR(err); } /** - * homa_peertab_free_homa() - Garbage collect all of the peer information - * associated with a particular struct homa. - * @homa: Object whose peers should be freed. + * homa_peertab_free_net() - Garbage collect all of the peer information + * associated with a particular network namespace. + * @hnet: Network namespace whose peers should be freed. There must not + * be any active sockets or RPCs for this namespace. */ void homa_peertab_free_net(struct homa_net *hnet) { - struct homa_peertab *peertab = hnet->homa->peers; + struct homa_peertab *peertab = hnet->homa->peertab; struct rhashtable_iter iter; struct homa_peer *peer; + spin_lock_bh(&peertab->lock); + peertab->gc_stop_count++; + spin_unlock_bh(&peertab->lock); + rhashtable_walk_enter(&peertab->ht, &iter); rhashtable_walk_start(&iter); while (1) { @@ -71,12 +139,21 @@ void homa_peertab_free_net(struct homa_net *hnet) continue; if (peer->ht_key.hnet != hnet) continue; - rhashtable_remove_fast(&peertab->ht, &peer->ht_linkage, - ht_params); - homa_peer_free(peer); + if (rhashtable_remove_fast(&peertab->ht, &peer->ht_linkage, + ht_params) == 0) { + homa_peer_free(peer); + hnet->num_peers--; + peertab->num_peers--; + } } rhashtable_walk_stop(&iter); rhashtable_walk_exit(&iter); + WARN(hnet->num_peers != 0, "%s ended up with hnet->num_peers %d", + __func__, hnet->num_peers); + + spin_lock_bh(&peertab->lock); + peertab->gc_stop_count--; + spin_unlock_bh(&peertab->lock); } /** @@ -102,12 +179,241 @@ void homa_peertab_free_fn(void *object, void *dummy) */ void homa_peertab_free(struct homa_peertab *peertab) { - spin_lock_init(&peertab->lock); - rhashtable_free_and_destroy(&peertab->ht, homa_peertab_free_fn, - NULL); + spin_lock_bh(&peertab->lock); + peertab->gc_stop_count++; + spin_unlock_bh(&peertab->lock); + + if (peertab->ht_valid) { + rhashtable_walk_exit(&peertab->ht_iter); + rhashtable_free_and_destroy(&peertab->ht, homa_peertab_free_fn, + NULL); + } + while (!list_empty(&peertab->dead_peers)) + homa_peer_free_dead(peertab); +#ifndef __STRIP__ /* See strip.py */ + if (peertab->sysctl_header) { + unregister_net_sysctl_table(peertab->sysctl_header); + peertab->sysctl_header = NULL; + } +#endif /* See strip.py */ kfree(peertab); } +/** + * homa_peer_rcu_callback() - This function is invoked as the callback + * for an invocation of call_rcu. 
It clears the peertab's
+ * call_rcu_pending flag, indicating that the RCU grace period has elapsed.
+ * @head:      Contains information used to locate the peertab.
+ */
+void homa_peer_rcu_callback(struct rcu_head *head)
+{
+	struct homa_peertab *peertab;
+
+	peertab = container_of(head, struct homa_peertab, rcu_head);
+	atomic_set(&peertab->call_rcu_pending, 0);
+}
+
+/**
+ * homa_peer_free_dead() - Release peers on peertab->dead_peers
+ * if possible.
+ * @peertab:   Check the dead peers here.
+ */
+void homa_peer_free_dead(struct homa_peertab *peertab)
+	__must_hold(&peertab->lock)
+{
+	struct homa_peer *peer, *tmp;
+
+	/* A dead peer can be freed only if:
+	 * (a) there are no call_rcu calls pending (if there are, it's
+	 *     possible that a new reference might get created for the
+	 *     peer)
+	 * (b) the peer's reference count is zero.
+	 */
+	if (atomic_read(&peertab->call_rcu_pending))
+		return;
+	list_for_each_entry_safe(peer, tmp, &peertab->dead_peers, dead_links) {
+		if (atomic_read(&peer->refs) == 0) {
+			tt_record1("homa_peer_free_dead freeing homa_peer 0x%x",
+				   tt_addr(peer->addr));
+			list_del_init(&peer->dead_links);
+			homa_peer_free(peer);
+		}
+	}
+}
+
+/**
+ * homa_peer_wait_dead() - Don't return until all of the dead peers have
+ * been freed.
+ * @peertab:   Overall information about peers, which includes a dead list.
+ */
+void homa_peer_wait_dead(struct homa_peertab *peertab)
+{
+	while (1) {
+		spin_lock_bh(&peertab->lock);
+		homa_peer_free_dead(peertab);
+		if (list_empty(&peertab->dead_peers)) {
+			spin_unlock_bh(&peertab->lock);
+			return;
+		}
+		spin_unlock_bh(&peertab->lock);
+	}
+}
+
+/**
+ * homa_peer_prefer_evict() - Given two peers, determine which one is
+ * a better candidate for eviction.
+ * @peertab:   Overall information used to manage peers.
+ * @peer1:     First peer.
+ * @peer2:     Second peer.
+ * Return:     True if @peer1 is a better candidate for eviction than @peer2.
+ */
+int homa_peer_prefer_evict(struct homa_peertab *peertab,
+			   struct homa_peer *peer1,
+			   struct homa_peer *peer2)
+{
+	/* Prefer a peer whose homa_net is over its limit; if both are either
+	 * over or under, then prefer the peer with the shortest idle time.
+	 */
+	if (peer1->ht_key.hnet->num_peers > peertab->net_max) {
+		if (peer2->ht_key.hnet->num_peers <= peertab->net_max)
+			return true;
+		else
+			return peer1->access_jiffies < peer2->access_jiffies;
+	}
+	if (peer2->ht_key.hnet->num_peers > peertab->net_max)
+		return false;
+	else
+		return peer1->access_jiffies < peer2->access_jiffies;
+}
+
+/**
+ * homa_peer_pick_victims() - Select a few peers that can be freed.
+ * @peertab:     Choose peers that are stored here.
+ * @victims:     Return addresses of victims here.
+ * @max_victims: Limit on how many victims to choose (and size of @victims
+ *               array).
+ * Return:       The number of peers stored in @victims; may be zero.
+ */
+int homa_peer_pick_victims(struct homa_peertab *peertab,
+			   struct homa_peer *victims[], int max_victims)
+{
+	struct homa_peer *peer;
+	int num_victims = 0;
+	int to_scan;
+	int i, idle;
+
+	/* Scan 2 peers for every potential victim and keep the "best"
+	 * peers for removal.
+	 */
+	rhashtable_walk_start(&peertab->ht_iter);
+	for (to_scan = 2 * max_victims; to_scan > 0; to_scan--) {
+		peer = rhashtable_walk_next(&peertab->ht_iter);
+		if (!peer) {
+			/* Reached the end of the table; restart at
+			 * the beginning.
+			 */
+			rhashtable_walk_stop(&peertab->ht_iter);
+			rhashtable_walk_exit(&peertab->ht_iter);
+			rhashtable_walk_enter(&peertab->ht, &peertab->ht_iter);
+			rhashtable_walk_start(&peertab->ht_iter);
+			peer = rhashtable_walk_next(&peertab->ht_iter);
+			if (!peer)
+				break;
+		}
+		if (IS_ERR(peer)) {
+			/* rhashtable decided to restart the search at the
+			 * beginning.
+			 */
+			peer = rhashtable_walk_next(&peertab->ht_iter);
+			if (!peer || IS_ERR(peer))
+				break;
+		}
+
+		/* Has this peer been idle long enough to be a candidate for
+		 * eviction?
+		 */
+		idle = jiffies - peer->access_jiffies;
+		if (idle < peertab->idle_jiffies_min)
+			continue;
+		if (idle < peertab->idle_jiffies_max &&
+		    peer->ht_key.hnet->num_peers <= peertab->net_max)
+			continue;
+
+		/* Sort the candidate into the existing list of victims. */
+		for (i = 0; i < num_victims; i++) {
+			if (peer == victims[i]) {
+				/* This can happen if there aren't very many
+				 * peers and we wrapped around in the hash
+				 * table.
+				 */
+				peer = NULL;
+				break;
+			}
+			if (homa_peer_prefer_evict(peertab, peer, victims[i])) {
+				struct homa_peer *tmp;
+
+				tmp = victims[i];
+				victims[i] = peer;
+				peer = tmp;
+			}
+		}
+
+		if (num_victims < max_victims && peer) {
+			victims[num_victims] = peer;
+			num_victims++;
+		}
+	}
+	rhashtable_walk_stop(&peertab->ht_iter);
+	return num_victims;
+}
+
+/**
+ * homa_peer_gc() - This function is invoked by Homa at regular intervals;
+ * its job is to ensure that the number of peers stays within limits.
+ * If the number grows too large, it selectively deletes peers to get
+ * back under the limit.
+ * @peertab:   Structure whose peers should be considered for garbage
+ *             collection.
+ */
+void homa_peer_gc(struct homa_peertab *peertab)
+{
+#define EVICT_BATCH_SIZE 5
+	struct homa_peer *victims[EVICT_BATCH_SIZE];
+	int num_victims;
+	int i;
+
+	spin_lock_bh(&peertab->lock);
+	if (peertab->gc_stop_count != 0)
+		goto done;
+	if (!list_empty(&peertab->dead_peers))
+		homa_peer_free_dead(peertab);
+	if (atomic_read(&peertab->call_rcu_pending) ||
+	    peertab->num_peers < peertab->gc_threshold)
+		goto done;
+	num_victims = homa_peer_pick_victims(peertab, victims,
+					     EVICT_BATCH_SIZE);
+	if (num_victims == 0)
+		goto done;
+
+	for (i = 0; i < num_victims; i++) {
+		struct homa_peer *peer = victims[i];
+
+		if (rhashtable_remove_fast(&peertab->ht, &peer->ht_linkage,
+					   ht_params) == 0) {
+			list_add_tail(&peer->dead_links, &peertab->dead_peers);
+			peertab->num_peers--;
+			peer->ht_key.hnet->num_peers--;
+			tt_record1("homa_peer_gc removed homa_peer 0x%x",
+				   tt_addr(peer->addr));
+		}
+	}
+	atomic_set(&peertab->call_rcu_pending, 1);
+	call_rcu(&peertab->rcu_head, homa_peer_rcu_callback);
done:
+	spin_unlock_bh(&peertab->lock);
+}
+
 /**
  * homa_peer_alloc() - Allocate and initialize a new homa_peer object.
  * @hsk:       Socket for which the peer will be used.
@@ -130,7 +436,9 @@ struct homa_peer *homa_peer_alloc(struct homa_sock *hsk,
 	}
 	peer->ht_key.addr = *addr;
 	peer->ht_key.hnet = hsk->hnet;
+	INIT_LIST_HEAD(&peer->dead_links);
 	atomic_set(&peer->refs, 1);
+	peer->access_jiffies = jiffies;
 	peer->addr = *addr;
 	dst = homa_peer_get_dst(peer, hsk);
 	if (IS_ERR(dst)) {
@@ -147,7 +455,9 @@ struct homa_peer *homa_peer_alloc(struct homa_sock *hsk,
 #endif /* See strip.py */
 	peer->current_ticks = -1;
 	spin_lock_init(&peer->ack_lock);
-	INC_METRIC(peer_new_entries, 1);
+	INC_METRIC(peer_allocs, 1);
+	tt_record1("Allocated new homa_peer for node 0x%x",
+		   tt_addr(peer->addr));
 	return peer;
 }
 
@@ -167,8 +477,8 @@ void homa_peer_free(struct homa_peer *peer)
 #ifdef __UNIT_TEST__
 	if (!mock_peer_free_no_fail)
 		FAIL(" %s found peer %s with reference count %d",
-				__func__, homa_print_ipv6_addr(&peer->addr),
-				atomic_read(&peer->refs));
+		     __func__, homa_print_ipv6_addr(&peer->addr),
+		     atomic_read(&peer->refs));
 	else
 		UNIT_LOG("; ", "peer %s has reference count %d",
 			 homa_print_ipv6_addr(&peer->addr),
@@ -195,7 +505,7 @@ struct homa_peer *homa_peer_find(struct homa_sock *hsk,
 				 const struct in6_addr *addr)
 {
-	struct homa_peertab *peertab = hsk->homa->peers;
+	struct homa_peertab *peertab = hsk->homa->peertab;
 	struct homa_peer *peer, *other;
 	struct homa_peer_key key;
 
@@ -205,6 +515,7 @@ struct homa_peer *homa_peer_find(struct homa_sock *hsk,
 	peer = rhashtable_lookup(&peertab->ht, &key, ht_params);
 	if (peer) {
 		homa_peer_hold(peer);
+		peer->access_jiffies = jiffies;
 		rcu_read_unlock();
 		return peer;
 	}
@@ -218,7 +529,6 @@ struct homa_peer *homa_peer_find(struct homa_sock *hsk,
 	spin_lock_bh(&peertab->lock);
 	other = rhashtable_lookup_get_insert_fast(&peertab->ht,
 						  &peer->ht_linkage, ht_params);
-	spin_unlock_bh(&peertab->lock);
 	if (IS_ERR(other)) {
 		/* Couldn't insert; return the error info. */
 		homa_peer_put(peer);
@@ -230,9 +540,14 @@ struct homa_peer *homa_peer_find(struct homa_sock *hsk,
 		 */
 		homa_peer_put(peer);
 		homa_peer_free(peer);
-		homa_peer_hold(other);
 		peer = other;
+		homa_peer_hold(peer);
+		peer->access_jiffies = jiffies;
+	} else {
+		peertab->num_peers++;
+		key.hnet->num_peers++;
 	}
+	spin_unlock_bh(&peertab->lock);
 	rcu_read_unlock();
 	return peer;
 }
@@ -441,3 +756,48 @@ int homa_peer_get_acks(struct homa_peer *peer, int count, struct homa_ack *dst)
 	homa_peer_unlock(peer);
 	return count;
 }
+
+#ifndef __STRIP__ /* See strip.py */
+/**
+ * homa_peer_update_sysctl_deps() - Update any peertab fields that depend
+ * on values set by sysctl. This function is invoked anytime a peer sysctl
+ * value is updated.
+ * @peertab:   Struct to update.
+ */
+void homa_peer_update_sysctl_deps(struct homa_peertab *peertab)
+{
+	peertab->idle_jiffies_min = peertab->idle_secs_min * HZ;
+	peertab->idle_jiffies_max = peertab->idle_secs_max * HZ;
+}
+
+/**
+ * homa_peer_dointvec() - This function is a wrapper around proc_dointvec. It
+ * is invoked to read and write peer-related sysctl values.
+ * @table:     sysctl table describing value to be read or written.
+ * @write:     Nonzero means value is being written, 0 means read.
+ * @buffer:    Address in user space of the input/output data.
+ * @lenp:      Pointer to the size of @buffer; updated to hold the number
+ *             of bytes actually transferred.
+ * @ppos:      Pointer to the current offset within the sysctl file; updated
+ *             to reflect the data read or written.
+ *
+ * Return: 0 for success, nonzero for error.
+ */
+int homa_peer_dointvec(const struct ctl_table *table, int write,
+		       void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct homa_peertab *peertab;
+	struct ctl_table table_copy;
+	int result;
+
+	peertab = homa_net_from_net(current->nsproxy->net_ns)->homa->peertab;
+
+	/* Generate a new ctl_table that refers to the corresponding field
+	 * in the peertab of the current namespace's struct homa.
+	 */
+	table_copy = *table;
+	table_copy.data = ((char *)peertab) + (uintptr_t)table_copy.data;
+
+	result = proc_dointvec(&table_copy, write, buffer, lenp, ppos);
+	homa_peer_update_sysctl_deps(peertab);
+	return result;
+}
+#endif /* See strip.py */
diff --git a/homa_peer.h b/homa_peer.h
index 15586b16..7ebb67df 100644
--- a/homa_peer.h
+++ b/homa_peer.h
@@ -27,6 +27,91 @@ struct homa_peertab {
 
 	/** @ht: Hash table that stores all struct peers. */
 	struct rhashtable ht;
+
+	/** @ht_iter: Used to scan ht to find peers to garbage collect. */
+	struct rhashtable_iter ht_iter;
+
+	/** @num_peers: Total number of peers currently in @ht. */
+	int num_peers;
+
+	/**
+	 * @ht_valid: True means ht and ht_iter have been initialized and must
+	 * eventually be destroyed.
+	 */
+	bool ht_valid;
+
+	/**
+	 * @dead_peers: List of peers that have been removed from ht
+	 * but can't yet be freed (because they have nonzero reference
+	 * counts or an rcu sync point hasn't been reached).
+	 */
+	struct list_head dead_peers;
+
+	/** @rcu_head: Holds state of a pending call_rcu invocation. */
+	struct rcu_head rcu_head;
+
+	/**
+	 * @call_rcu_pending: Nonzero means that call_rcu has been
+	 * invoked but it has not invoked the callback function; until the
+	 * callback has been invoked we can't free peers on dead_peers or
+	 * invoke call_rcu again (which means we can't add more peers to
+	 * dead_peers).
+	 */
+	atomic_t call_rcu_pending;
+
+	/**
+	 * @gc_stop_count: Nonzero means that peer garbage collection
+	 * should not be performed (conflicting state changes are underway).
+	 */
+	int gc_stop_count;
+
+	/**
+	 * @gc_threshold: If @num_peers is less than this, don't bother
+	 * doing any peer garbage collection. Set externally via sysctl.
+	 */
+	int gc_threshold;
+
+	/**
+	 * @net_max: If the number of peers for a homa_net exceeds this number,
+	 * work aggressively to reclaim peers for that homa_net. Set
+	 * externally via sysctl.
+	 */
+	int net_max;
+
+	/**
+	 * @idle_secs_min: A peer will not be considered for garbage collection
+	 * under any circumstances if it has been idle less than this many
+	 * seconds. Set externally via sysctl.
+	 */
+	int idle_secs_min;
+
+	/**
+	 * @idle_jiffies_min: Same as idle_secs_min except in units
+	 * of jiffies.
+	 */
+	unsigned long idle_jiffies_min;
+
+	/**
+	 * @idle_secs_max: A peer that has been idle for less than
+	 * this many seconds will not be considered for garbage collection
+	 * unless its homa_net has more than @net_max peers. Set
+	 * externally via sysctl.
+	 */
+	int idle_secs_max;
+
+	/**
+	 * @idle_jiffies_max: Same as idle_secs_max except in units
+	 * of jiffies.
+	 */
+	unsigned long idle_jiffies_max;
+
+#ifndef __STRIP__ /* See strip.py */
+	/**
+	 * @sysctl_header: Used to remove sysctl values when this structure
+	 * is destroyed.
+	 */
+	struct ctl_table_header *sysctl_header;
+#endif /* See strip.py */
 };
 
 /**
@@ -39,7 +124,7 @@ struct homa_peer_key {
 	 */
 	struct in6_addr addr;
 
-	/** @homa: The network namespace in which this peer is valid. */
+	/** @hnet: The network namespace in which this peer is valid. */
 	struct homa_net *hnet;
 };
 
@@ -48,20 +133,30 @@ struct homa_peer_key {
  * have communicated with (either as client or server).
  */
 struct homa_peer {
-	/** @key: The hash table key for this peer in peertab->ht. */
+	/** @ht_key: The hash table key for this peer in peertab->ht. */
 	struct homa_peer_key ht_key;
 
 	/**
-	 * @ht: Used by rashtable implement to link this peer into peertab->ht.
+	 * @ht_linkage: Used by the rhashtable implementation to link this
+	 * peer into peertab->ht.
 	 */
 	struct rhash_head ht_linkage;
 
+	/** @dead_links: Used to link this peer into peertab->dead_peers. */
+	struct list_head dead_links;
+
 	/**
 	 * @refs: Number of unmatched calls to homa_peer_hold; it's not safe
 	 * to free this object until the reference count is zero.
 	 */
 	atomic_t refs ____cacheline_aligned_in_smp;
 
+	/**
+	 * @access_jiffies: Time in jiffies of most recent access to this
+	 * peer.
+	 */
+	unsigned long access_jiffies;
+
 	/**
 	 * @addr: IPv6 address for the machine (IPv4 addresses are stored
 	 * as IPv4-mapped IPv6 addresses).
@@ -194,17 +289,29 @@ void homa_peertab_free_fn(void *object, void *dummy);
 void homa_peer_add_ack(struct homa_rpc *rpc);
 struct homa_peer *homa_peer_alloc(struct homa_sock *hsk,
 				  const struct in6_addr *addr);
+int homa_peer_dointvec(const struct ctl_table *table, int write,
+		       void *buffer, size_t *lenp, loff_t *ppos);
 struct homa_peer *homa_peer_find(struct homa_sock *hsk,
 				 const struct in6_addr *addr);
 void homa_peer_free(struct homa_peer *peer);
+void homa_peer_free_dead(struct homa_peertab *peertab);
+void homa_peer_gc(struct homa_peertab *peertab);
 int homa_peer_get_acks(struct homa_peer *peer, int count,
 		       struct homa_ack *dst);
 struct dst_entry *homa_peer_get_dst(struct homa_peer *peer,
 				    struct homa_sock *hsk);
 #ifndef __STRIP__ /* See strip.py */
 void homa_peer_lock_slow(struct homa_peer *peer);
+int homa_peer_pick_victims(struct homa_peertab *peertab,
+			   struct homa_peer *victims[], int max_victims);
+int homa_peer_prefer_evict(struct homa_peertab *peertab,
+			   struct homa_peer *peer1,
+			   struct homa_peer *peer2);
+void homa_peer_rcu_callback(struct rcu_head *head);
 void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, int c2,
 			   int c3, int c4, int c5, int c6, int c7);
+void homa_peer_update_sysctl_deps(struct homa_peertab *peertab);
+void homa_peer_wait_dead(struct homa_peertab *peertab);
 #endif /* See strip.py */
 
 #ifndef __STRIP__ /* See strip.py */
@@ -253,7 +360,7 @@ static inline struct dst_entry *homa_get_dst(struct homa_peer *peer,
 					     struct homa_sock *hsk)
 {
 	if (unlikely(peer->dst->obsolete > 0))
-		homa_dst_refresh(hsk->homa->peers, peer, hsk);
+		homa_dst_refresh(hsk->homa->peertab, peer, hsk);
 	dst_hold(peer->dst);
 	return peer->dst;
 }
@@ -270,7 +377,7 @@ static inline void homa_peer_hold(struct homa_peer *peer)
 
 /**
  * homa_peer_put() - Release a reference on a peer (cancels the effect of
- * a previous call to homa_peer_put). If the reference count becomes zero
+ * a previous call to homa_peer_hold). If the reference count becomes zero
  * then the peer may be deleted at any time.
  * @peer:      Object to release.
 */
@@ -279,6 +386,14 @@ static inline void homa_peer_put(struct homa_peer *peer)
 	atomic_dec(&peer->refs);
 }
 
+/**
+ * homa_peer_hash() - Hash function used for @peertab->ht.
+ * @data:    Pointer to key for which a hash is desired. Must actually
+ *           be a struct homa_peer_key.
+ * @dummy:   Not used.
+ * @seed:    Seed for the hash.
+ * Return:   A 32-bit hash value for the given key.
+ */
 static inline u32 homa_peer_hash(const void *data, u32 dummy, u32 seed)
 {
 	/* This is MurmurHash3, used instead of the jhash default because it
@@ -291,9 +406,9 @@ static inline u32 homa_peer_hash(const void *data, u32 dummy, u32 seed)
 	const u32 *key = data;
 	u32 h = seed;
 
-
 	for (size_t i = 0; i < len; i++) {
 		u32 k = key[i];
+
 		k *= c1;
 		k = (k << 15) | (k >> (32 - 15));
 		k *= c2;
@@ -313,14 +428,20 @@ static inline u32 homa_peer_hash(const void *data, u32 dummy, u32 seed)
 	return h;
 }
 
+/**
+ * homa_peer_compare() - Comparison function for entries in @peertab->ht.
+ * @arg:    Contains one of the keys to compare.
+ * @obj:    homa_peer object containing the other key to compare.
+ * Return:  0 means the keys match, 1 means mismatch.
+ */
 static inline int homa_peer_compare(struct rhashtable_compare_arg *arg,
 				    const void *obj)
 {
 	const struct homa_peer *peer = obj;
 	const struct homa_peer_key *key = arg->key;
 
-	return !ipv6_addr_equal(&key->addr, &peer->ht_key.addr) &&
-	       peer->ht_key.hnet == key->hnet;
+	return !(ipv6_addr_equal(&key->addr, &peer->ht_key.addr) &&
+		 peer->ht_key.hnet == key->hnet);
 }
 
 #endif /* _HOMA_PEER_H */
diff --git a/homa_peer_old.c b/homa_peer_old.c
index 8a24e577..70141874 100644
--- a/homa_peer_old.c
+++ b/homa_peer_old.c
@@ -231,7 +231,7 @@ struct homa_peer *homa_peer_find(struct homa_peertab *peertab,
 	hlist_add_head_rcu(&peer->peertab_links, &peertab->buckets[bucket]);
 	peer->current_ticks = -1;
 	spin_lock_init(&peer->ack_lock);
-	INC_METRIC(peer_new_entries, 1);
+	INC_METRIC(peer_allocs, 1);
 
 done:
 	spin_unlock_bh(&peertab->write_lock);
diff --git a/homa_timer.c b/homa_timer.c
index eb0d82d1..cc84eb12 100644
--- a/homa_timer.c
+++ b/homa_timer.c
@@ -288,6 +288,7 @@ void homa_timer(struct homa *homa)
 			   atomic_read(&homa->grant->total_incoming));
 #endif /* See strip.py */
 	homa_skb_release_pages(homa);
+	homa_peer_gc(homa->peertab);
 #ifndef __STRIP__ /* See strip.py */
 	end = homa_clock();
 	INC_METRIC(timer_cycles, end - start);
diff --git a/homa_utils.c b/homa_utils.c
index 540873fb..d6825170 100644
--- a/homa_utils.c
+++ b/homa_utils.c
@@ -52,10 +52,10 @@ int homa_init(struct homa *homa)
 		homa->pacer = NULL;
 		return err;
 	}
-	homa->peers = homa_peertab_alloc();
-	if (IS_ERR(homa->peers)) {
-		err = PTR_ERR(homa->peers);
-		homa->peers = NULL;
+	homa->peertab = homa_peertab_alloc();
+	if (IS_ERR(homa->peertab)) {
+		err = PTR_ERR(homa->peertab);
+		homa->peertab = NULL;
 		return err;
 	}
 	homa->socktab = kmalloc(sizeof(*homa->socktab), GFP_KERNEL);
@@ -148,9 +148,9 @@ void homa_destroy(struct homa *homa)
 		homa_pacer_free(homa->pacer);
 		homa->pacer = NULL;
 	}
-	if (homa->peers) {
-		homa_peertab_free(homa->peers);
-		homa->peers = NULL;
+	if (homa->peertab) {
+		homa_peertab_free(homa->peertab);
+		homa->peertab = NULL;
 	}
 
 #ifndef __STRIP__ /* See strip.py */
diff --git a/man/homa.7 b/man/homa.7
index 31a2340d..0064c02d 100644
--- a/man/homa.7
+++ b/man/homa.7
@@ -539,6 +539,41 @@ however, under very extreme loads a small value does provide benefit
 for the largest messages, when used with
 .I grant_fifo_fraction.
 .TP
+.IR peer_gc_threshold
+.PD 0
+.TP
+.IR peer_idle_secs_min
+.TP
+.IR peer_idle_secs_max
+.TP
+.IR peer_net_max
+.IP
+These options control garbage collection of peer objects. Homa maintains
+long-lived state for each peer machine that it has communicated with; peer
+objects are kept separately for each network namespace.
+These options are used to limit memory utilization from peer objects.
+If the total number of peer objects across all namespaces is less than
+.IR peer_gc_threshold
+then no peer garbage collection occurs. If the number of peer objects is
+at least
+.IR peer_gc_threshold
+then Homa will free peers that have not been referenced in the last
+.IR peer_idle_secs_max
+seconds in order to reduce the total number of peer objects below
+.IR peer_gc_threshold .
+In addition, if a given network namespace has more than
+.IR peer_net_max
+peers allocated, then peers in that namespace are candidates for
+freeing if they have not been referenced in the last
+.IR peer_idle_secs_min
+seconds. When choosing among candidates to free, Homa uses a semi-random
+approach that
+(a) prefers to evict peers from namespaces above the
+.IR peer_net_max
+threshold over those from underloaded namespaces
+and (b) prefers to evict peers whose most recent usage is farthest in the past.
+.PD
+.TP
 .IR poll_usecs
 When a thread waits for an incoming message, Homa first busy-waits
 for a short amount of time before putting the thread to sleep. If a message
 arrives
diff --git a/test/mock.c b/test/mock.c
index bf327a50..10f1c953 100644
--- a/test/mock.c
+++ b/test/mock.c
@@ -219,6 +219,10 @@ int mock_page_nid_mask;
 
 /* Used to collect printk output. */
 char mock_printk_output [5000];
 
+/* Used as the return values from rhashtable_walk_next calls. */
+void **mock_rht_walk_results;
+int mock_rht_num_walk_results;
+
 /* Used instead of HOMA_MIN_DEFAULT_PORT by homa_skb.c. */
 __u16 mock_min_default_port = 0x8000;
 
@@ -328,7 +332,7 @@ void BUG_func(void)
 
 void call_rcu(struct rcu_head *head, void free_func(struct rcu_head *head))
 {
-	free_func(head);
+	unit_log_printf("; ", "call_rcu invoked");
 }
 
 bool cancel_work_sync(struct work_struct *work)
@@ -844,6 +848,7 @@ void kfree(const void *block)
 {
 	if (block == NULL)
 		return;
+	UNIT_HOOK("kfree");
 	if (!kmallocs_in_use || unit_hash_get(kmallocs_in_use, block) == NULL) {
 		FAIL(" %s on unknown block %p", __func__, block);
 		return;
@@ -1264,23 +1269,6 @@ void remove_wait_queue(struct wait_queue_head *wq_head,
 		       struct wait_queue_entry *wq_entry)
 {}
 
-int mock_rht_init(struct rhashtable *ht,
-		  const struct rhashtable_params *params)
-{
-	if (mock_check_error(&mock_rht_init_errors))
-		return -EINVAL;
-	return rhashtable_init(ht, params);
-}
-
-void *mock_rht_lookup_get_insert_fast(struct rhashtable *ht,
-				      struct rhash_head *obj,
-				      const struct rhashtable_params params)
-{
-	if (mock_check_error(&mock_rht_insert_errors))
-		return ERR_PTR(-EINVAL);
-	return rhashtable_lookup_get_insert_fast(ht, obj, params);
-}
-
 void schedule(void)
 {
 	UNIT_HOOK("schedule");
@@ -1809,6 +1797,37 @@ struct ctl_table_header *mock_register_net_sysctl(struct net *net,
 	return (struct ctl_table_header *)11111;
 }
 
+int mock_rht_init(struct rhashtable *ht,
+		  const struct rhashtable_params *params)
+{
+	if (mock_check_error(&mock_rht_init_errors))
+		return -EINVAL;
+	return rhashtable_init(ht, params);
+}
+
+void *mock_rht_lookup_get_insert_fast(struct rhashtable *ht,
+				      struct rhash_head *obj,
+				      const struct rhashtable_params params)
+{
+	if (mock_check_error(&mock_rht_insert_errors))
+		return ERR_PTR(-EINVAL);
+	return rhashtable_lookup_get_insert_fast(ht, obj, params);
+}
+
+void *mock_rht_walk_next(struct rhashtable_iter *iter)
+{
+	void *result;
+
+	if (!mock_rht_walk_results)
+		return rhashtable_walk_next(iter);
+	if (mock_rht_num_walk_results == 0)
+		return NULL;
+	result = *mock_rht_walk_results;
+	mock_rht_walk_results++;
+	mock_rht_num_walk_results--;
+	return result;
+}
+
 void mock_rpc_hold(struct
homa_rpc *rpc) { mock_rpc_holds++; @@ -2118,6 +2137,8 @@ void mock_teardown(void) mock_compound_order_mask = 0; mock_page_nid_mask = 0; mock_printk_output[0] = 0; + mock_rht_walk_results = NULL; + mock_rht_num_walk_results = 0; mock_min_default_port = 0x8000; homa_net_id = 0; mock_num_hnets = 0; @@ -2129,6 +2150,7 @@ void mock_teardown(void) memset(inet6_offloads, 0, sizeof(inet6_offloads)); inet6_offloads[IPPROTO_TCP] = (struct net_offload __rcu *) &tcp_v6_offload; + jiffies = 1100; count = unit_hash_size(skbs_in_use); if (count > 0) diff --git a/test/mock.h b/test/mock.h index 73ce1ddd..5abf8568 100644 --- a/test/mock.h +++ b/test/mock.h @@ -148,6 +148,8 @@ extern int mock_prepare_to_wait_status; extern char mock_printk_output[]; extern int mock_rht_init_errors; extern int mock_rht_insert_errors; +extern void **mock_rht_walk_results; +extern int mock_rht_num_walk_results; extern int mock_route_errors; extern int mock_signal_pending; extern int mock_sock_holds; @@ -198,6 +200,7 @@ int mock_rht_init(struct rhashtable *ht, void *mock_rht_lookup_get_insert_fast(struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params); +void *mock_rht_walk_next(struct rhashtable_iter *iter); void mock_rpc_hold(struct homa_rpc *rpc); void mock_rpc_put(struct homa_rpc *rpc); void mock_set_clock_vals(u64 t, ...); diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index 081aecdf..c0582e3a 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -12,6 +12,9 @@ struct in6_addr ip1111[1]; struct in6_addr ip2222[1]; struct in6_addr ip3333[1]; +struct in6_addr ip4444[1]; +struct in6_addr ip5555[1]; +struct in6_addr ip6666[1]; FIXTURE(homa_peer) { struct homa homa; @@ -31,6 +34,9 @@ FIXTURE_SETUP(homa_peer) ip1111[0] = unit_get_in_addr("1::1:1:1"); ip2222[0] = unit_get_in_addr("2::2:2:2"); ip3333[0] = unit_get_in_addr("3::3:3:3"); + ip4444[0] = unit_get_in_addr("4::4:4:4"); + ip5555[0] = unit_get_in_addr("5::5:5:5"); + ip6666[0] = unit_get_in_addr("6::6:6:6"); self->server_port = 99; } FIXTURE_TEARDOWN(homa_peer) @@ -64,6 +70,26 @@ static void peer_race_hook(char *id) */ conflicting_peer = homa_peer_find(&test_data->hsk, ip3333); homa_peer_put(conflicting_peer); + jiffies += 10; +} + +static struct homa_peertab *hook_peertab; +static void stop_gc_hook(char *id) +{ + if (strcmp(id, "kfree") != 0) + return; + unit_log_printf("; ", "gc_stop_count %d", hook_peertab->gc_stop_count); +} + +static int hook_free_count; +static void complete_rcu_hook(char *id) { + if (strcmp(id, "unlock") != 0) + return; + if (hook_free_count == 0) + return; + hook_free_count--; + if (hook_free_count == 0) + homa_peer_rcu_callback(&hook_peertab->rcu_head); } TEST_F(homa_peer, homa_peertab_alloc__success) @@ -93,8 +119,19 @@ TEST_F(homa_peer, homa_peertab_alloc__rhashtable_init_fails) EXPECT_TRUE(IS_ERR(peertab)); EXPECT_EQ(EINVAL, -PTR_ERR(peertab)); } +TEST_F(homa_peer, homa_peertab_alloc__cant_register_sysctl) +{ + struct homa_peertab *peertab; + + mock_register_sysctl_errors = 1; + peertab = homa_peertab_alloc(); + EXPECT_TRUE(IS_ERR(peertab)); + EXPECT_EQ(ENOMEM, -PTR_ERR(peertab)); + EXPECT_SUBSTR("couldn't register sysctl parameters for Homa peertab", + mock_printk_output); +} -TEST_F(homa_peer, homa_peertab_free_net) +TEST_F(homa_peer, homa_peertab_free_net__basics) { /* Create peers from two different netns's, make sure only * those from one get freed. 
*/ @@ -112,9 +149,29 @@ TEST_F(homa_peer, homa_peertab_free_net) peer = homa_peer_find(&hsk2, ip3333); homa_peer_put(peer); EXPECT_EQ(3, unit_count_peers(&self->homa)); + EXPECT_EQ(3, self->homa.peertab->num_peers); + EXPECT_EQ(2, self->hnet->num_peers); homa_peertab_free_net(self->hnet); EXPECT_EQ(1, unit_count_peers(&self->homa)); + EXPECT_EQ(1, self->homa.peertab->num_peers); +} +TEST_F(homa_peer, homa_peertab_free_net__set_gc_stop_count) +{ + struct homa_peer *peer; + + peer = homa_peer_find(&self->hsk, ip1111); + homa_peer_put(peer); + + unit_hook_register(stop_gc_hook); + hook_peertab = self->homa.peertab; + unit_log_clear(); + self->homa.peertab->gc_stop_count = 3; + + homa_peertab_free_net(self->hnet); + EXPECT_EQ(0, unit_count_peers(&self->homa)); + EXPECT_STREQ("gc_stop_count 4", unit_log_get()); + EXPECT_EQ(3, self->homa.peertab->gc_stop_count); } TEST_F(homa_peer, homa_peertab_free_fn) @@ -133,7 +190,7 @@ TEST_F(homa_peer, homa_peertab_free_fn) dst_release(dst); } -TEST_F(homa_peer, homa_peertab_free) { +TEST_F(homa_peer, homa_peertab_free__basics) { struct homa_peer *peer; peer = homa_peer_find(&self->hsk, ip1111); @@ -142,24 +199,410 @@ TEST_F(homa_peer, homa_peertab_free) { mock_peer_free_no_fail = 1; unit_log_clear(); - homa_peertab_free(self->homa.peers); - EXPECT_STREQ("peer [2::2:2:2] has reference count 1", unit_log_get()); + homa_peertab_free(self->homa.peertab); + EXPECT_STREQ("peer [2::2:2:2] has reference count 1; " + "unregister_net_sysctl_table", unit_log_get()); kfree(peer); - self->homa.peers = homa_peertab_alloc(); + self->homa.peertab = homa_peertab_alloc(); +} +TEST_F(homa_peer, homa_peertab_free__free_dead_peers) { + struct homa_peertab *peertab = self->homa.peertab; + struct homa_peer *peer; + + jiffies = 100; + peer = homa_peer_find(&self->hsk, ip1111); + homa_peer_put(peer); + peer = homa_peer_find(&self->hsk, ip2222); + homa_peer_put(peer); + + jiffies = peertab->idle_jiffies_max + 1000; + peertab->num_peers = peertab->gc_threshold + 100; + homa_peer_gc(peertab); + EXPECT_EQ(2, unit_list_length(&peertab->dead_peers)); + + homa_peer_rcu_callback(&peertab->rcu_head); + homa_peertab_free(self->homa.peertab); + + /* Can't check explicitly for problems (peertab is gone now), but + * end-of-test checks will complain if the peers weren't freed. + */ + self->homa.peertab = homa_peertab_alloc(); +} + +TEST_F(homa_peer, homa_peer_rcu_callback) { + atomic_set(&self->homa.peertab->call_rcu_pending, 4); + homa_peer_rcu_callback(&self->homa.peertab->rcu_head); + EXPECT_EQ(0, atomic_read(&self->homa.peertab->call_rcu_pending)); +} + +TEST_F(homa_peer, homa_peer_free_dead) { + struct homa_peertab *peertab = self->homa.peertab; + struct homa_peer *peer1, *peer2; + + peer1 = homa_peer_alloc(&self->hsk, ip1111); + peer2 = homa_peer_alloc(&self->hsk, ip2222); + + list_add_tail(&peer1->dead_links, &peertab->dead_peers); + list_add_tail(&peer2->dead_links, &peertab->dead_peers); + unit_log_clear(); + unit_log_dead_peers(&self->homa); + EXPECT_STREQ("[1::1:1:1]; [2::2:2:2]", unit_log_get()); + + /* First call: RCU pending. */ + atomic_set(&peertab->call_rcu_pending, 1); + homa_peer_free_dead(peertab); + unit_log_clear(); + unit_log_dead_peers(&self->homa); + EXPECT_STREQ("[1::1:1:1]; [2::2:2:2]", unit_log_get()); + + /* Second call: peers have nonzero reference counts. 
*/ + atomic_set(&peertab->call_rcu_pending, 0); + homa_peer_free_dead(peertab); + unit_log_clear(); + unit_log_dead_peers(&self->homa); + EXPECT_STREQ("[1::1:1:1]; [2::2:2:2]", unit_log_get()); + + /* Third call: all reference counts zero. */ + homa_peer_put(peer1); + homa_peer_put(peer2); + homa_peer_free_dead(peertab); + unit_log_clear(); + unit_log_dead_peers(&self->homa); + EXPECT_STREQ("", unit_log_get()); +} + +TEST_F(homa_peer, homa_peer_wait_dead) { + struct homa_peertab *peertab = self->homa.peertab; + struct homa_peer *peer; + + peer = homa_peer_alloc(&self->hsk, ip1111); + homa_peer_put(peer); + list_add_tail(&peer->dead_links, &peertab->dead_peers); + unit_log_clear(); + unit_log_dead_peers(&self->homa); + EXPECT_STREQ("[1::1:1:1]", unit_log_get()); + atomic_set(&peertab->call_rcu_pending, 1); + + unit_hook_register(complete_rcu_hook); + hook_peertab = self->homa.peertab; + hook_free_count = 5; + + homa_peer_wait_dead(peertab); + unit_log_clear(); + unit_log_dead_peers(&self->homa); + EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(0, hook_free_count); +} + +TEST_F(homa_peer, homa_peertab_prefer_evict) +{ + struct homa_peertab *peertab = self->homa.peertab; + struct homa_peer *peer1, *peer2, *peer3, *peer4; + struct homa_net *hnet2; + struct homa_sock hsk2; + + hnet2 = mock_alloc_hnet(&self->homa); + mock_sock_init(&hsk2, hnet2, 44); + + peer1 = homa_peer_find(&self->hsk, ip1111); + homa_peer_put(peer1); + peer1->access_jiffies = 100; + + peer2 = homa_peer_find(&self->hsk, ip2222); + homa_peer_put(peer2); + peer2->access_jiffies = 1000; + + peer3 = homa_peer_find(&hsk2, ip3333); + homa_peer_put(peer3); + peer3->access_jiffies = 500; + + peer4 = homa_peer_find(&hsk2, ip1111); + homa_peer_put(peer4); + peer4->access_jiffies = 300; + hnet2->num_peers = peertab->net_max + 1; + + EXPECT_EQ(1, homa_peer_prefer_evict(peertab, peer3, peer1)); + EXPECT_EQ(0, homa_peer_prefer_evict(peertab, peer3, peer4)); + EXPECT_EQ(0, homa_peer_prefer_evict(peertab, peer1, peer4)); + EXPECT_EQ(1, homa_peer_prefer_evict(peertab, peer1, peer2)); + + homa_sock_destroy(&hsk2); + homa_peertab_free_net(hnet2); +} + +TEST_F(homa_peer, homa_peertab_pick_victims__hash_table_wraparound) +{ + struct homa_peertab *peertab = self->homa.peertab; + struct homa_peer *peers[3], *victims[5]; + + jiffies = 50; + peers[0] = homa_peer_find(&self->hsk, ip1111); + homa_peer_put(peers[0]); + + peers[1] = NULL; + + peers[2] = homa_peer_find(&self->hsk, ip2222); + homa_peer_put(peers[2]); + + mock_rht_walk_results = (void **)peers; + mock_rht_num_walk_results = 3; + jiffies = peertab->idle_jiffies_max + 100; + + EXPECT_EQ(2, homa_peer_pick_victims(peertab, victims, 5)); + EXPECT_EQ(peers[0], victims[0]); + EXPECT_EQ(peers[2], victims[1]); +} +TEST_F(homa_peer, homa_peertab_pick_victims__EAGAIN_from_rht_walk) +{ + struct homa_peertab *peertab = self->homa.peertab; + struct homa_peer *peers[5], *victims[5]; + + jiffies = 50; + peers[0] = ERR_PTR(-EAGAIN); + + peers[1] = homa_peer_find(&self->hsk, ip1111); + homa_peer_put(peers[1]); + + peers[2] = ERR_PTR(-EAGAIN); + + peers[3] = ERR_PTR(-EAGAIN); + + peers[4] = homa_peer_find(&self->hsk, ip2222); + homa_peer_put(peers[4]); + + mock_rht_walk_results = (void **)peers; + mock_rht_num_walk_results = 5; + jiffies = peertab->idle_jiffies_max + 100; + + EXPECT_EQ(1, homa_peer_pick_victims(peertab, victims, 5)); + EXPECT_EQ(peers[1], victims[0]); +} +TEST_F(homa_peer, homa_peertab_pick_victims__filter_idle_jiffies_min) +{ + struct homa_peertab *peertab = self->homa.peertab; + struct 
homa_peer *peers[2], *victims[5]; + + jiffies = 100; + peers[1] = homa_peer_find(&self->hsk, ip1111); + homa_peer_put(peers[1]); + + jiffies = 200; + peers[0] = homa_peer_find(&self->hsk, ip2222); + homa_peer_put(peers[0]); + + mock_rht_walk_results = (void **)peers; + mock_rht_num_walk_results = 2; + jiffies = peertab->idle_jiffies_min + 150; + self->hnet->num_peers = peertab->net_max + 1000; + + EXPECT_EQ(1, homa_peer_pick_victims(peertab, victims, 5)); + EXPECT_EQ(peers[1], victims[0]); +} +TEST_F(homa_peer, homa_peertab_pick_victims__filter_idle_jiffies_max) +{ + struct homa_peertab *peertab = self->homa.peertab; + struct homa_peer *peers[3], *victims[5]; + struct homa_net *hnet2; + struct homa_sock hsk2; + + hnet2 = mock_alloc_hnet(&self->homa); + mock_sock_init(&hsk2, hnet2, 44); + hnet2->num_peers = peertab->net_max + 1; + + /* First peer: net below limit, idle < max. */ + jiffies = 150; + peers[0] = homa_peer_find(&self->hsk, ip1111); + homa_peer_put(peers[0]); + + /* Second peer: net above limit, idle > max. */ + jiffies = 50; + peers[1] = homa_peer_find(&hsk2, ip2222); + homa_peer_put(peers[1]); + + /* Third peer: net below limit, idle > max. */ + jiffies = 50; + peers[2] = homa_peer_find(&self->hsk, ip3333); + homa_peer_put(peers[2]); + + mock_rht_walk_results = (void **)peers; + mock_rht_num_walk_results = 3; + jiffies = peertab->idle_jiffies_max + 100; + + EXPECT_EQ(2, homa_peer_pick_victims(peertab, victims, 5)); + EXPECT_EQ(peers[1], victims[0]); + EXPECT_EQ(peers[2], victims[1]); +} +TEST_F(homa_peer, homa_peertab_pick_victims__duplicate_peer) +{ + struct homa_peertab *peertab = self->homa.peertab; + struct homa_peer *peers[3], *victims[3]; + + jiffies = 300; + peers[0] = homa_peer_find(&self->hsk, ip1111); + homa_peer_put(peers[0]); + + peers[1] = peers[0]; + peers[2] = peers[0]; + + mock_rht_walk_results = (void **)peers; + mock_rht_num_walk_results = 3; + jiffies = peertab->idle_jiffies_max + 1000; + + EXPECT_EQ(1, homa_peer_pick_victims(peertab, victims, 3)); + EXPECT_EQ(peers[0], victims[0]); +} +TEST_F(homa_peer, homa_peertab_pick_victims__select_best_candidates) +{ + struct homa_peertab *peertab = self->homa.peertab; + struct homa_peer *peers[6], *victims[3]; + + jiffies = 300; + peers[0] = homa_peer_find(&self->hsk, ip1111); + homa_peer_put(peers[0]); + + jiffies = 400; + peers[1] = homa_peer_find(&self->hsk, ip2222); + homa_peer_put(peers[1]); + + jiffies = 500; + peers[2] = homa_peer_find(&self->hsk, ip3333); + homa_peer_put(peers[2]); + + jiffies = 200; + peers[3] = homa_peer_find(&self->hsk, ip4444); + homa_peer_put(peers[3]); + + jiffies = 350; + peers[4] = homa_peer_find(&self->hsk, ip5555); + homa_peer_put(peers[4]); + + jiffies = 600; + peers[5] = homa_peer_find(&self->hsk, ip6666); + homa_peer_put(peers[5]); + + mock_rht_walk_results = (void **)peers; + mock_rht_num_walk_results = 6; + jiffies = peertab->idle_jiffies_max + 1000; + + EXPECT_EQ(3, homa_peer_pick_victims(peertab, victims, 3)); + EXPECT_EQ(peers[3], victims[0]); + EXPECT_EQ(peers[0], victims[1]); + EXPECT_EQ(peers[4], victims[2]); +} + +TEST_F(homa_peer, homa_peer_gc__basics) +{ + struct homa_peertab *peertab = self->homa.peertab; + struct homa_peer *peer; + + jiffies = 300; + peer = homa_peer_find(&self->hsk, ip1111); + homa_peer_put(peer); + EXPECT_EQ(1, self->hnet->num_peers); + + jiffies = peertab->idle_jiffies_max + 1000; + peertab->num_peers = peertab->gc_threshold; + + unit_log_clear(); + homa_peer_gc(peertab); + unit_log_dead_peers(&self->homa); + EXPECT_STREQ("call_rcu invoked; 
[1::1:1:1]", unit_log_get()); + EXPECT_EQ(1, atomic_read(&peertab->call_rcu_pending)); + EXPECT_EQ(0, self->hnet->num_peers); + EXPECT_EQ(peertab->gc_threshold - 1, peertab->num_peers); + + homa_peer_rcu_callback(&peertab->rcu_head); + unit_log_clear(); + homa_peer_gc(peertab); + unit_log_dead_peers(&self->homa); + EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(0, atomic_read(&peertab->call_rcu_pending)); +} +TEST_F(homa_peer, homa_peer_gc__gc_stop_count) +{ + struct homa_peertab *peertab = self->homa.peertab; + struct homa_peer *peer; + + jiffies = 300; + peer = homa_peer_find(&self->hsk, ip1111); + homa_peer_put(peer); + + jiffies = peertab->idle_jiffies_max + 1000; + peertab->num_peers = peertab->gc_threshold; + peertab->gc_stop_count = 1; + + unit_log_clear(); + homa_peer_gc(peertab); + unit_log_dead_peers(&self->homa); + EXPECT_STREQ("", unit_log_get()); +} +TEST_F(homa_peer, homa_peer_gc__call_rcu_pending) +{ + struct homa_peertab *peertab = self->homa.peertab; + struct homa_peer *peer; + + jiffies = 300; + peer = homa_peer_find(&self->hsk, ip1111); + homa_peer_put(peer); + + jiffies = peertab->idle_jiffies_max + 1000; + peertab->num_peers = peertab->gc_threshold; + atomic_set(&peertab->call_rcu_pending, 1); + + unit_log_clear(); + homa_peer_gc(peertab); + unit_log_dead_peers(&self->homa); + EXPECT_STREQ("", unit_log_get()); +} +TEST_F(homa_peer, homa_peer_gc__peers_below_gc_threshold) +{ + struct homa_peertab *peertab = self->homa.peertab; + struct homa_peer *peer; + + jiffies = 300; + peer = homa_peer_find(&self->hsk, ip1111); + homa_peer_put(peer); + + jiffies = peertab->idle_jiffies_max + 1000; + peertab->num_peers = peertab->gc_threshold - 1; + + unit_log_clear(); + homa_peer_gc(peertab); + unit_log_dead_peers(&self->homa); + EXPECT_STREQ("", unit_log_get()); +} +TEST_F(homa_peer, homa_peer_gc__no_suitable_candidates) +{ + struct homa_peertab *peertab = self->homa.peertab; + struct homa_peer *peer; + + jiffies = 100; + peer = homa_peer_find(&self->hsk, ip1111); + homa_peer_put(peer); + + jiffies = peertab->idle_jiffies_min; + peertab->num_peers = peertab->gc_threshold - 1; + + unit_log_clear(); + homa_peer_gc(peertab); + unit_log_dead_peers(&self->homa); + EXPECT_STREQ("", unit_log_get()); } TEST_F(homa_peer, homa_peer_alloc__success) { struct homa_peer *peer; + jiffies = 999; peer = homa_peer_alloc(&self->hsk, ip1111); ASSERT_FALSE(IS_ERR(peer)); EXPECT_EQ_IP(*ip1111, peer->addr); + EXPECT_EQ(999, peer->access_jiffies); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(INT_MAX, peer->unsched_cutoffs[HOMA_MAX_PRIORITIES-2]); EXPECT_EQ(0, peer->cutoff_version); - EXPECT_EQ(1, homa_metrics_per_cpu()->peer_new_entries); + EXPECT_EQ(1, homa_metrics_per_cpu()->peer_allocs); #endif /* See strip.py */ EXPECT_EQ(1, atomic_read(&peer->dst->__rcuref.refcnt)); homa_peer_put(peer); @@ -225,27 +668,35 @@ TEST_F(homa_peer, homa_peer_find__basics) struct homa_peer *peer, *peer2; /* First call: create new peer. */ + jiffies = 456; peer = homa_peer_find(&self->hsk, ip1111); ASSERT_FALSE(IS_ERR(peer)); EXPECT_EQ_IP(*ip1111, peer->addr); + EXPECT_EQ(456, peer->access_jiffies); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(INT_MAX, peer->unsched_cutoffs[HOMA_MAX_PRIORITIES-2]); EXPECT_EQ(0, peer->cutoff_version); #endif /* See strip.py */ + EXPECT_EQ(1, self->homa.peertab->num_peers); + EXPECT_EQ(1, self->hnet->num_peers); /* Second call: lookup existing peer. 
*/ peer2 = homa_peer_find(&self->hsk, ip1111); EXPECT_EQ(peer, peer2); EXPECT_EQ(2, atomic_read(&peer->refs)); + EXPECT_EQ(1, self->homa.peertab->num_peers); + EXPECT_EQ(1, self->hnet->num_peers); /* Third call: lookup new peer. */ peer2 = homa_peer_find(&self->hsk, ip2222); EXPECT_NE(peer, peer2); ASSERT_FALSE(IS_ERR(peer2)); EXPECT_EQ(1, atomic_read(&peer2->refs)); + EXPECT_EQ(2, self->homa.peertab->num_peers); + EXPECT_EQ(2, self->hnet->num_peers); #ifndef __STRIP__ /* See strip.py */ - EXPECT_EQ(2, homa_metrics_per_cpu()->peer_new_entries); + EXPECT_EQ(2, homa_metrics_per_cpu()->peer_allocs); #endif /* See strip.py */ homa_peer_put(peer); homa_peer_put(peer); @@ -279,11 +730,15 @@ TEST_F(homa_peer, homa_peer_find__conflicting_create) test_data = self; peer_race_hook_invocations = 0; unit_hook_register(peer_race_hook); + jiffies = 100; peer = homa_peer_find(&self->hsk, ip3333); EXPECT_FALSE(IS_ERR(conflicting_peer)); EXPECT_EQ(conflicting_peer, peer); EXPECT_EQ(1, atomic_read(&peer->refs)); + EXPECT_EQ(110, peer->access_jiffies); homa_peer_put(peer); + EXPECT_EQ(1, self->homa.peertab->num_peers); + EXPECT_EQ(1, self->hnet->num_peers); } TEST_F(homa_peer, homa_dst_refresh__basics) @@ -296,7 +751,7 @@ TEST_F(homa_peer, homa_dst_refresh__basics) EXPECT_EQ_IP(*ip1111, peer->addr); old_dst = peer->dst; - homa_dst_refresh(self->homa.peers, peer, &self->hsk); + homa_dst_refresh(self->homa.peertab, peer, &self->hsk); EXPECT_NE(old_dst, peer->dst); homa_peer_put(peer); } @@ -311,7 +766,7 @@ TEST_F(homa_peer, homa_dst_refresh__routing_error) old_dst = peer->dst; mock_route_errors = 1; - homa_dst_refresh(self->homa.peers, peer, &self->hsk); + homa_dst_refresh(self->homa.peertab, peer, &self->hsk); EXPECT_EQ(old_dst, peer->dst); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->peer_route_errors); @@ -482,3 +937,14 @@ TEST_F(homa_peer, homa_peer_get_acks) unit_ack_string(&acks[0])); homa_peer_put(peer); } + +TEST_F(homa_peer, homa_peer_update_sysctl_deps) +{ + struct homa_peertab *peertab = self->homa.peertab; + + peertab->idle_secs_min = 10; + peertab->idle_secs_max = 100; + homa_peer_update_sysctl_deps(peertab); + EXPECT_EQ(10*HZ, peertab->idle_jiffies_min); + EXPECT_EQ(100*HZ, peertab->idle_jiffies_max); +} diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index c924bd41..58859607 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -91,9 +91,9 @@ TEST_F(homa_utils, homa_init__peertab_alloc_failure) mock_kmalloc_errors = 4; unit_log_clear(); EXPECT_EQ(ENOMEM, -homa_init(&homa2)); - EXPECT_SUBSTR("homa_peertab_alloc couldn't create peers: kmalloc failure", + EXPECT_SUBSTR("homa_peertab_alloc couldn't create peertab: kmalloc failure", mock_printk_output); - EXPECT_EQ(NULL, homa2.peers); + EXPECT_EQ(NULL, homa2.peertab); homa_destroy(&homa2); } TEST_F(homa_utils, homa_init__cant_allocate_port_map) diff --git a/test/utils.c b/test/utils.c index 44151913..27f8a14d 100644 --- a/test/utils.c +++ b/test/utils.c @@ -318,6 +318,21 @@ void unit_log_throttled(struct homa *homa) } } +/** + * unit_log_dead_peers() - Append to the test log the addresses of all + * peers in peertab->dead_peers for @homa. + * @homa: Homa's overall state. + */ +void unit_log_dead_peers(struct homa *homa) +{ + struct homa_peer *peer; + + list_for_each_entry(peer, &homa->peertab->dead_peers, dead_links) { + unit_log_printf("; ", "%s", + homa_print_ipv6_addr(&peer->ht_key.addr)); + } +} + /** * unit_print_gaps() - Returns a static string describing the gaps in an RPC. 
* @rpc: Log the gaps in this RPC. @@ -484,7 +499,7 @@ int unit_count_peers(struct homa *homa) struct homa_peer *peer; int count = 0; - rhashtable_walk_enter(&homa->peers->ht, &iter); + rhashtable_walk_enter(&homa->peertab->ht, &iter); rhashtable_walk_start(&iter); while (1) { peer = rhashtable_walk_next(&iter); diff --git a/test/utils.h b/test/utils.h index 60f091f2..036fc49b 100644 --- a/test/utils.h +++ b/test/utils.h @@ -44,6 +44,7 @@ struct iov_iter *unit_iov_iter(void *buffer, size_t length); int unit_list_length(struct list_head *head); void unit_log_active_ids(struct homa_sock *hsk); +void unit_log_dead_peers(struct homa *homa); void unit_log_filled_skbs(struct sk_buff *skb, int verbose); void unit_log_frag_list(struct sk_buff *skb, int verbose); #ifndef __STRIP__ /* See strip.py */ From 17c2386cb3aa4c86c7ff7f0541ecb6e94fc323da Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 20 May 2025 17:43:38 -0700 Subject: [PATCH 323/625] Renamed several functions in homa_peer * homa_peer_put -> homa_peer_release * homa_peer_find -> homa_peer_get * homa_peertab_* -> homa_peer_* --- homa_incoming.c | 8 +- homa_metrics.h | 4 +- homa_outgoing.c | 4 +- homa_peer.c | 32 +-- homa_peer.h | 20 +- homa_peer_old.c | 442 -------------------------------------- homa_rpc.c | 12 +- homa_utils.c | 6 +- test/unit_homa_incoming.c | 12 +- test/unit_homa_outgoing.c | 4 +- test/unit_homa_peer.c | 248 ++++++++++----------- test/unit_homa_plumbing.c | 6 +- test/unit_homa_utils.c | 14 +- 13 files changed, 185 insertions(+), 627 deletions(-) delete mode 100644 homa_peer_old.c diff --git a/homa_incoming.c b/homa_incoming.c index 8c286611..e56675d3 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -892,13 +892,13 @@ void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk) struct homa_peer *peer; int i; - peer = homa_peer_find(hsk, &saddr); + peer = homa_peer_get(hsk, &saddr); if (!IS_ERR(peer)) { peer->unsched_cutoffs[0] = INT_MAX; for (i = 1; i < HOMA_MAX_PRIORITIES; i++) peer->unsched_cutoffs[i] = ntohl(h->unsched_cutoffs[i]); peer->cutoff_version = h->cutoff_version; - homa_peer_put(peer); + homa_peer_release(peer); } kfree_skb(skb); } @@ -940,7 +940,7 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, #endif /* See strip.py */ goto done; } else { - peer = homa_peer_find(hsk, &saddr); + peer = homa_peer_get(hsk, &saddr); if (IS_ERR(peer)) goto done; } @@ -963,7 +963,7 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, __homa_xmit_control(&ack, sizeof(ack), peer, hsk); tt_record3("Responded to NEED_ACK for id %d, peer %0x%x with %d other acks", id, tt_addr(saddr), ntohs(ack.num_acks)); - homa_peer_put(peer); + homa_peer_release(peer); done: kfree_skb(skb); diff --git a/homa_metrics.h b/homa_metrics.h index 33f79666..059840d7 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -312,14 +312,14 @@ struct homa_metrics { u64 peer_allocs; /** - * @peer_kmalloc_errors: total number of times homa_peer_find + * @peer_kmalloc_errors: total number of times homa_peer_get * returned an error because it couldn't allocate memory for a new * peer. */ u64 peer_kmalloc_errors; /** - * @peer_route_errors: total number of times homa_peer_find + * @peer_route_errors: total number of times homa_peer_get * returned an error because it couldn't create a route to the peer. 
*/ u64 peer_route_errors; diff --git a/homa_outgoing.c b/homa_outgoing.c index 167d1f1d..752633d4 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -571,10 +571,10 @@ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk) unknown.common.urgent = htons(HOMA_TCP_URGENT); #endif /* See strip.py */ unknown.common.sender_id = cpu_to_be64(homa_local_id(h->sender_id)); - peer = homa_peer_find(hsk, &saddr); + peer = homa_peer_get(hsk, &saddr); if (!IS_ERR(peer)) __homa_xmit_control(&unknown, sizeof(unknown), peer, hsk); - homa_peer_put(peer); + homa_peer_release(peer); } /** diff --git a/homa_peer.c b/homa_peer.c index 1feb94b2..7c660c56 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -67,12 +67,12 @@ static struct ctl_table peer_ctl_table[] = { #endif /* See strip.py */ /** - * homa_peertab_alloc() - Allocate and initialize a homa_peertab. + * homa_peer_alloc_peertab() - Allocate and initialize a homa_peertab. * * Return: A pointer to the new homa_peertab, or ERR_PTR(-errno) if there * was a problem. */ -struct homa_peertab *homa_peertab_alloc(void) +struct homa_peertab *homa_peer_alloc_peertab(void) { struct homa_peertab *peertab; int err; @@ -109,17 +109,17 @@ struct homa_peertab *homa_peertab_alloc(void) return peertab; error: - homa_peertab_free(peertab); + homa_peer_free_peertab(peertab); return ERR_PTR(err); } /** - * homa_peertab_free_net() - Garbage collect all of the peer information + * homa_peer_free_net() - Garbage collect all of the peer information * associated with a particular network namespace. * @hnet: Network namespace whose peers should be freed. There must not * be any active sockets or RPCs for this namespace. */ -void homa_peertab_free_net(struct homa_net *hnet) +void homa_peer_free_net(struct homa_net *hnet) { struct homa_peertab *peertab = hnet->homa->peertab; struct rhashtable_iter iter; @@ -157,13 +157,13 @@ void homa_peertab_free_net(struct homa_net *hnet) } /** - * homa_peertab_free_fn() - This function is invoked for each entry in + * homa_peer_free_fn() - This function is invoked for each entry in * the peer hash table by the rhashtable code when the table is being * deleted. It frees its argument. * @object: struct homa_peer to free. * @dummy: Not used. */ -void homa_peertab_free_fn(void *object, void *dummy) +void homa_peer_free_fn(void *object, void *dummy) { struct homa_peer *peer = object; @@ -171,13 +171,13 @@ void homa_peertab_free_fn(void *object, void *dummy) } /** - * homa_peertab_free() - Destructor for homa_peertabs. After this + * homa_peer_free_peertab() - Destructor for homa_peertabs. After this * function returns, it is unsafe to use any results from previous calls - * to homa_peer_find, since all existing homa_peer objects will have been + * to homa_peer_get, since all existing homa_peer objects will have been * destroyed. * @peertab: The table to destroy. 
*/ -void homa_peertab_free(struct homa_peertab *peertab) +void homa_peer_free_peertab(struct homa_peertab *peertab) { spin_lock_bh(&peertab->lock); peertab->gc_stop_count++; @@ -185,7 +185,7 @@ void homa_peertab_free(struct homa_peertab *peertab) if (peertab->ht_valid) { rhashtable_walk_exit(&peertab->ht_iter); - rhashtable_free_and_destroy(&peertab->ht, homa_peertab_free_fn, + rhashtable_free_and_destroy(&peertab->ht, homa_peer_free_fn, NULL); } while (!list_empty(&peertab->dead_peers)) @@ -491,7 +491,7 @@ void homa_peer_free(struct homa_peer *peer) } /** - * homa_peer_find() - Returns the peer associated with a given host; creates + * homa_peer_get() - Returns the peer associated with a given host; creates * a new homa_peer if one doesn't already exist. * @hsk: Socket where the peer will be used. * @addr: Address of the desired host: IPv4 addresses are represented @@ -500,9 +500,9 @@ void homa_peer_free(struct homa_peer *peer) * Return: The peer associated with @addr, or a negative errno if an * error occurred. On a successful return the reference count * will be incremented for the returned peer. The caller must - * eventually call homa_peer_put to release the reference. + * eventually call homa_peer_release to release the reference. */ -struct homa_peer *homa_peer_find(struct homa_sock *hsk, +struct homa_peer *homa_peer_get(struct homa_sock *hsk, const struct in6_addr *addr) { struct homa_peertab *peertab = hsk->homa->peertab; @@ -531,14 +531,14 @@ struct homa_peer *homa_peer_find(struct homa_sock *hsk, &peer->ht_linkage, ht_params); if (IS_ERR(other)) { /* Couldn't insert; return the error info. */ - homa_peer_put(peer); + homa_peer_release(peer); homa_peer_free(peer); peer = other; } else if (other) { /* Someone else already created the desired peer; use that * one instead of ours. */ - homa_peer_put(peer); + homa_peer_release(peer); homa_peer_free(peer); peer = other; homa_peer_hold(peer); diff --git a/homa_peer.h b/homa_peer.h index 7ebb67df..f153822b 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -281,21 +281,21 @@ struct homa_peer { void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, struct homa_sock *hsk); -struct homa_peertab - *homa_peertab_alloc(void); -void homa_peertab_free(struct homa_peertab *peertab); -void homa_peertab_free_net(struct homa_net *hnet); -void homa_peertab_free_fn(void *object, void *dummy); void homa_peer_add_ack(struct homa_rpc *rpc); struct homa_peer *homa_peer_alloc(struct homa_sock *hsk, const struct in6_addr *addr); +struct homa_peertab + *homa_peer_alloc_peertab(void); int homa_peer_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -struct homa_peer - *homa_peer_find(struct homa_sock *hsk, const struct in6_addr *addr); void homa_peer_free(struct homa_peer *peer); void homa_peer_free_dead(struct homa_peertab *peertab); +void homa_peer_free_fn(void *object, void *dummy); +void homa_peer_free_net(struct homa_net *hnet); +void homa_peer_free_peertab(struct homa_peertab *peertab); void homa_peer_gc(struct homa_peertab *peertab); +struct homa_peer + *homa_peer_get(struct homa_sock *hsk, const struct in6_addr *addr); int homa_peer_get_acks(struct homa_peer *peer, int count, struct homa_ack *dst); struct dst_entry @@ -367,7 +367,7 @@ static inline struct dst_entry *homa_get_dst(struct homa_peer *peer, /** * homa_peer_hold() - Increment the reference count on an RPC, which will - * prevent it from being freed until homa_peer_put() is called. 
+ * prevent it from being freed until homa_peer_release() is called. * @peer: Object on which to take a reference. */ static inline void homa_peer_hold(struct homa_peer *peer) @@ -376,12 +376,12 @@ static inline void homa_peer_hold(struct homa_peer *peer) } /** - * homa_peer_put() - Release a reference on a peer (cancels the effect of + * homa_peer_release() - Release a reference on a peer (cancels the effect of * a previous call to homa_peer_hold). If the reference count becomes zero * then the peer may be deleted at any time. * @peer: Object to release. */ -static inline void homa_peer_put(struct homa_peer *peer) +static inline void homa_peer_release(struct homa_peer *peer) { atomic_dec(&peer->refs); } diff --git a/homa_peer_old.c b/homa_peer_old.c deleted file mode 100644 index 70141874..00000000 --- a/homa_peer_old.c +++ /dev/null @@ -1,442 +0,0 @@ -// SPDX-License-Identifier: BSD-2-Clause - -/* This file provides functions related to homa_peer and homa_peertab - * objects. - */ - -#include "homa_impl.h" -#include "homa_peer.h" -#include "homa_rpc.h" - -/** - * homa_peertab_init() - Constructor for homa_peertabs. - * @peertab: The object to initialize; previous contents are discarded. - * - * Return: 0 in the normal case, or a negative errno if there was a problem. - */ -int homa_peertab_init(struct homa_peertab *peertab) -{ - /* Note: when we return, the object must be initialized so it's - * safe to call homa_peertab_destroy, even if this function returns - * an error. - */ - int i; - - spin_lock_init(&peertab->write_lock); - peertab->buckets = vmalloc(HOMA_PEERTAB_BUCKETS * - sizeof(*peertab->buckets)); - if (!peertab->buckets) - return -ENOMEM; - for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) - INIT_HLIST_HEAD(&peertab->buckets[i]); - return 0; -} - -/** - * homa_peertab_destroy() - Destructor for homa_peertabs. After this - * function returns, it is unsafe to use any results from previous calls - * to homa_peer_find, since all existing homa_peer objects will have been - * destroyed. - * @peertab: The table to destroy. - */ -void homa_peertab_destroy(struct homa_peertab *peertab) -{ - struct hlist_node *next; - struct homa_peer *peer; - int i; - - if (!peertab->buckets) - return; - - spin_lock_bh(&peertab->write_lock); - for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) { - hlist_for_each_entry_safe(peer, next, &peertab->buckets[i], - peertab_links) { - if (atomic_read(&peer->refs) != 0) -#ifdef __UNIT_TEST__ - FAIL(" %s found peer %s with reference count %d", - __func__, - homa_print_ipv6_addr(&peer->addr), - atomic_read(&peer->refs)); - -#else /* __UNIT_TEST__ */ - pr_err("%s found peer with reference count %d", - __func__, atomic_read(&peer->refs)); -#endif - dst_release(peer->dst); - kfree(peer); - } - } - vfree(peertab->buckets); - spin_unlock_bh(&peertab->write_lock); -} - -#ifndef __UPSTREAM__ /* See strip.py */ -/** - * homa_peertab_get_peers() - Return information about all of the peers - * currently known - * @peertab: The table to search for peers. - * @num_peers: Modified to hold the number of peers returned. - * Return: kmalloced array holding pointers to all known peers. The - * caller must free this. If there is an error, or if there - * are no peers, NULL is returned. Note: if a large number - * of new peers are created while this function executes, - * then the results may not be complete. 
- */ -struct homa_peer **homa_peertab_get_peers(struct homa_peertab *peertab, - int *num_peers) -{ - int i, slots, next_slot; - struct homa_peer **result; - struct homa_peer *peer; - - /* Note: RCU must be used in the iterators below to ensure safety - * with concurrent insertions. Technically, rcu_read_lock and - * rcu_read_unlock shouldn't be necessary because we don't have to - * worry about concurrent deletions. But without them, some sanity - * checkers will complain. - */ - rcu_read_lock(); - - /* Figure out how large an array to allocate. */ - slots = 0; - next_slot = 0; - result = NULL; - if (peertab->buckets) { - for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) { - hlist_for_each_entry_rcu(peer, &peertab->buckets[i], - peertab_links) - slots++; - } - } - if (slots == 0) - goto done; - - /* Allocate extra space in case new peers got created while we - * were counting. - */ - slots += 10; - result = kmalloc_array(slots, sizeof(peer), GFP_ATOMIC); - if (!result) - goto done; - for (i = 0; i < HOMA_PEERTAB_BUCKETS; i++) { - hlist_for_each_entry_rcu(peer, &peertab->buckets[i], - peertab_links) { - result[next_slot] = peer; - next_slot++; - - /* We might not have allocated enough extra space. */ - if (next_slot >= slots) - goto done; - } - } -done: - rcu_read_unlock(); - *num_peers = next_slot; - return result; -} -#endif /* See strip.py */ - -/** - * homa_peer_find() - Returns the peer associated with a given host; creates - * a new homa_peer if one doesn't already exist. - * @peertab: Peer table in which to perform lookup. - * @addr: Address of the desired host: IPv4 addresses are represented - * as IPv4-mapped IPv6 addresses. - * @inet: Socket that will be used for sending packets. - * - * Return: The peer associated with @addr, or a negative errno if an - * error occurred. On a successful return the reference count - * will be incremented for the returned peer. The caller must - * eventually call homa_peer_put to release the reference. - */ -struct homa_peer *homa_peer_find(struct homa_peertab *peertab, - const struct in6_addr *addr, - struct inet_sock *inet) -{ - struct homa_peer *peer; - struct dst_entry *dst; - u64 start = homa_clock(); - - // Should use siphash or jhash here: - tt_record("homa_peer_find starting"); - u32 bucket = hash_32((__force u32)addr->in6_u.u6_addr32[0], - HOMA_PEERTAB_BUCKET_BITS); - - bucket ^= hash_32((__force u32)addr->in6_u.u6_addr32[1], - HOMA_PEERTAB_BUCKET_BITS); - bucket ^= hash_32((__force u32)addr->in6_u.u6_addr32[2], - HOMA_PEERTAB_BUCKET_BITS); - bucket ^= hash_32((__force u32)addr->in6_u.u6_addr32[3], - HOMA_PEERTAB_BUCKET_BITS); - - /* Use RCU operators to ensure safety even if a concurrent call is - * adding a new entry. The calls to rcu_read_lock and rcu_read_unlock - * shouldn't actually be needed, since we don't need to protect - * against concurrent deletion. - */ - rcu_read_lock(); - hlist_for_each_entry_rcu(peer, &peertab->buckets[bucket], - peertab_links) { - if (ipv6_addr_equal(&peer->addr, addr)) { - tt_record("homa_peer_find before homa_peer_hold"); - homa_peer_hold(peer); - tt_record("homa_peer_find after homa_peer_hold"); - rcu_read_unlock(); - tt_record1("homa_peer_find took %d cycles to find existing peer", - homa_clock() - start); - return peer; - } - INC_METRIC(peer_hash_links, 1); - } - rcu_read_unlock(); - - /* No existing entry; create a new one. 
- * - * Note: after we acquire the lock, we have to check again to - * make sure the entry still doesn't exist (it might have been - * created by a concurrent invocation of this function). - */ - spin_lock_bh(&peertab->write_lock); - hlist_for_each_entry(peer, &peertab->buckets[bucket], - peertab_links) { - if (ipv6_addr_equal(&peer->addr, addr)) { - homa_peer_hold(peer); - goto done; - } - } - peer = kmalloc(sizeof(*peer), GFP_ATOMIC | __GFP_ZERO); - if (!peer) { - peer = (struct homa_peer *)ERR_PTR(-ENOMEM); - INC_METRIC(peer_kmalloc_errors, 1); - goto done; - } - atomic_set(&peer->refs, 1); - peer->addr = *addr; - dst = homa_peer_get_dst(peer, inet); - if (IS_ERR(dst)) { - kfree(peer); - peer = (struct homa_peer *)PTR_ERR(dst); - INC_METRIC(peer_route_errors, 1); - goto done; - } - peer->dst = dst; -#ifndef __STRIP__ /* See strip.py */ - peer->unsched_cutoffs[HOMA_MAX_PRIORITIES - 1] = 0; - peer->unsched_cutoffs[HOMA_MAX_PRIORITIES - 2] = INT_MAX; - INIT_LIST_HEAD(&peer->grantable_rpcs); - INIT_LIST_HEAD(&peer->grantable_links); -#endif /* See strip.py */ - smp_wmb(); - hlist_add_head_rcu(&peer->peertab_links, &peertab->buckets[bucket]); - peer->current_ticks = -1; - spin_lock_init(&peer->ack_lock); - INC_METRIC(peer_allocs, 1); - -done: - spin_unlock_bh(&peertab->write_lock); - return peer; -} - -/** - * homa_dst_refresh() - This method is called when the dst for a peer is - * obsolete; it releases that dst and creates a new one. - * @peertab: Table containing the peer. - * @peer: Peer whose dst is obsolete. - * @hsk: Socket that will be used to transmit data to the peer. - */ -void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, - struct homa_sock *hsk) -{ - struct dst_entry *dst; - - dst = homa_peer_get_dst(peer, &hsk->inet); - if (IS_ERR(dst)) { -#ifndef __STRIP__ /* See strip.py */ - /* Retain the existing dst if we can't create a new one. */ - if (hsk->homa->verbose) - pr_notice("%s couldn't recreate dst: error %ld", - __func__, PTR_ERR(dst)); - INC_METRIC(peer_route_errors, 1); -#endif /* See strip.py */ - return; - } - dst_release(peer->dst); - peer->dst = dst; -} - -#ifndef __STRIP__ /* See strip.py */ -/** - * homa_unsched_priority() - Returns the priority level to use for - * unscheduled packets of a message. - * @homa: Overall data about the Homa protocol implementation. - * @peer: The destination of the message. - * @length: Number of bytes in the message. - * - * Return: A priority level. - */ -int homa_unsched_priority(struct homa *homa, struct homa_peer *peer, - int length) -{ - int i; - - for (i = homa->num_priorities - 1; ; i--) { - if (peer->unsched_cutoffs[i] >= length) - return i; - } - /* Can't ever get here */ -} -#endif /* See strip.py */ - -/** - * homa_peer_get_dst() - Find an appropriate dst structure (either IPv4 - * or IPv6) for a peer. - * @peer: The peer for which a dst is needed. Note: this peer's flow - * struct will be overwritten. - * @inet: Socket that will be used for sending packets. - * Return: The dst structure (or an ERR_PTR). 
- */ -struct dst_entry *homa_peer_get_dst(struct homa_peer *peer, - struct inet_sock *inet) -{ - memset(&peer->flow, 0, sizeof(peer->flow)); - if (inet->sk.sk_family == AF_INET) { - struct rtable *rt; - - flowi4_init_output(&peer->flow.u.ip4, inet->sk.sk_bound_dev_if, - inet->sk.sk_mark, inet->tos, - RT_SCOPE_UNIVERSE, inet->sk.sk_protocol, 0, - peer->addr.in6_u.u6_addr32[3], - inet->inet_saddr, 0, 0, inet->sk.sk_uid); - security_sk_classify_flow(&inet->sk, &peer->flow.u.__fl_common); - rt = ip_route_output_flow(sock_net(&inet->sk), - &peer->flow.u.ip4, &inet->sk); - if (IS_ERR(rt)) - return (struct dst_entry *)(PTR_ERR(rt)); - return &rt->dst; - } - peer->flow.u.ip6.flowi6_oif = inet->sk.sk_bound_dev_if; - peer->flow.u.ip6.flowi6_iif = LOOPBACK_IFINDEX; - peer->flow.u.ip6.flowi6_mark = inet->sk.sk_mark; - peer->flow.u.ip6.flowi6_scope = RT_SCOPE_UNIVERSE; - peer->flow.u.ip6.flowi6_proto = inet->sk.sk_protocol; - peer->flow.u.ip6.flowi6_flags = 0; - peer->flow.u.ip6.flowi6_secid = 0; - peer->flow.u.ip6.flowi6_tun_key.tun_id = 0; - peer->flow.u.ip6.flowi6_uid = inet->sk.sk_uid; - peer->flow.u.ip6.daddr = peer->addr; - peer->flow.u.ip6.saddr = inet->pinet6->saddr; - peer->flow.u.ip6.fl6_dport = 0; - peer->flow.u.ip6.fl6_sport = 0; - peer->flow.u.ip6.mp_hash = 0; - peer->flow.u.ip6.__fl_common.flowic_tos = inet->tos; - peer->flow.u.ip6.flowlabel = ip6_make_flowinfo(inet->tos, 0); - security_sk_classify_flow(&inet->sk, &peer->flow.u.__fl_common); - return ip6_dst_lookup_flow(sock_net(&inet->sk), &inet->sk, - &peer->flow.u.ip6, NULL); -} - -#ifndef __STRIP__ /* See strip.py */ -/** - * homa_peer_set_cutoffs() - Set the cutoffs for unscheduled priorities in - * a peer object. This is a convenience function used primarily by unit tests. - * @peer: Homa_peer object whose cutoffs should be set. - * @c0: Largest message size that will use priority 0. - * @c1: Largest message size that will use priority 1. - * @c2: Largest message size that will use priority 2. - * @c3: Largest message size that will use priority 3. - * @c4: Largest message size that will use priority 4. - * @c5: Largest message size that will use priority 5. - * @c6: Largest message size that will use priority 6. - * @c7: Largest message size that will use priority 7. - */ -void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, int c2, - int c3, int c4, int c5, int c6, int c7) -{ - peer->unsched_cutoffs[0] = c0; - peer->unsched_cutoffs[1] = c1; - peer->unsched_cutoffs[2] = c2; - peer->unsched_cutoffs[3] = c3; - peer->unsched_cutoffs[4] = c4; - peer->unsched_cutoffs[5] = c5; - peer->unsched_cutoffs[6] = c6; - peer->unsched_cutoffs[7] = c7; -} - -/** - * homa_peer_lock_slow() - This function implements the slow path for - * acquiring a peer's @ack_lock. It is invoked when the lock isn't - * immediately available. It waits for the lock, but also records statistics - * about the waiting time. - * @peer: Peer to lock. - */ -void homa_peer_lock_slow(struct homa_peer *peer) - __acquires(&peer->ack_lock) -{ - u64 start = homa_clock(); - - tt_record("beginning wait for peer lock"); - spin_lock_bh(&peer->ack_lock); - tt_record("ending wait for peer lock"); - INC_METRIC(peer_ack_lock_misses, 1); - INC_METRIC(peer_ack_lock_miss_cycles, homa_clock() - start); -} -#endif /* See strip.py */ - -/** - * homa_peer_add_ack() - Add a given RPC to the list of unacked - * RPCs for its server. Once this method has been invoked, it's safe - * to delete the RPC, since it will eventually be acked to the server. 
- * @rpc: Client RPC that has now completed. - */ -void homa_peer_add_ack(struct homa_rpc *rpc) -{ - struct homa_peer *peer = rpc->peer; - struct homa_ack_hdr ack; - - homa_peer_lock(peer); - if (peer->num_acks < HOMA_MAX_ACKS_PER_PKT) { - peer->acks[peer->num_acks].client_id = cpu_to_be64(rpc->id); - peer->acks[peer->num_acks].server_port = htons(rpc->dport); - peer->num_acks++; - homa_peer_unlock(peer); - return; - } - - /* The peer has filled up; send an ACK message to empty it. The - * RPC in the message header will also be considered ACKed. - */ - INC_METRIC(ack_overflows, 1); - memcpy(ack.acks, peer->acks, sizeof(peer->acks)); - ack.num_acks = htons(peer->num_acks); - peer->num_acks = 0; - homa_peer_unlock(peer); - homa_xmit_control(ACK, &ack, sizeof(ack), rpc); -} - -/** - * homa_peer_get_acks() - Copy acks out of a peer, and remove them from the - * peer. - * @peer: Peer to check for possible unacked RPCs. - * @count: Maximum number of acks to return. - * @dst: The acks are copied to this location. - * - * Return: The number of acks extracted from the peer (<= count). - */ -int homa_peer_get_acks(struct homa_peer *peer, int count, struct homa_ack *dst) -{ - /* Don't waste time acquiring the lock if there are no ids available. */ - if (peer->num_acks == 0) - return 0; - - homa_peer_lock(peer); - - if (count > peer->num_acks) - count = peer->num_acks; - memcpy(dst, &peer->acks[peer->num_acks - count], - count * sizeof(peer->acks[0])); - peer->num_acks -= count; - - homa_peer_unlock(peer); - return count; -} diff --git a/homa_rpc.c b/homa_rpc.c index f893078d..affad4cc 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -46,9 +46,9 @@ struct homa_rpc *homa_rpc_alloc_client(struct homa_sock *hsk, bucket = homa_client_rpc_bucket(hsk, crpc->id); crpc->bucket = bucket; crpc->state = RPC_OUTGOING; - crpc->peer = homa_peer_find(hsk, &dest_addr_as_ipv6); + crpc->peer = homa_peer_get(hsk, &dest_addr_as_ipv6); if (IS_ERR(crpc->peer)) { - tt_record("error in homa_peer_find"); + tt_record("error in homa_peer_get"); err = PTR_ERR(crpc->peer); crpc->peer = NULL; goto error; @@ -90,7 +90,7 @@ struct homa_rpc *homa_rpc_alloc_client(struct homa_sock *hsk, error: if (crpc->peer) - homa_peer_put(crpc->peer); + homa_peer_release(crpc->peer); kfree(crpc); return ERR_PTR(err); } @@ -150,7 +150,7 @@ struct homa_rpc *homa_rpc_alloc_server(struct homa_sock *hsk, srpc->hsk = hsk; srpc->bucket = bucket; srpc->state = RPC_INCOMING; - srpc->peer = homa_peer_find(hsk, source); + srpc->peer = homa_peer_get(hsk, source); if (IS_ERR(srpc->peer)) { err = PTR_ERR(srpc->peer); srpc->peer = NULL; @@ -204,7 +204,7 @@ struct homa_rpc *homa_rpc_alloc_server(struct homa_sock *hsk, error: homa_bucket_unlock(bucket, id); if (srpc && srpc->peer) - homa_peer_put(srpc->peer); + homa_peer_release(srpc->peer); kfree(srpc); return ERR_PTR(err); } @@ -547,7 +547,7 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) } } if (rpc->peer) { - homa_peer_put(rpc->peer); + homa_peer_release(rpc->peer); rpc->peer = NULL; } tt_record2("homa_rpc_reap finished reaping id %d, socket %d", diff --git a/homa_utils.c b/homa_utils.c index d6825170..5c70a9fc 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -52,7 +52,7 @@ int homa_init(struct homa *homa) homa->pacer = NULL; return err; } - homa->peertab = homa_peertab_alloc(); + homa->peertab = homa_peer_alloc_peertab(); if (IS_ERR(homa->peertab)) { err = PTR_ERR(homa->peertab); homa->peertab = NULL; @@ -149,7 +149,7 @@ void homa_destroy(struct homa *homa) homa->pacer = NULL; } if (homa->peertab) { - 
homa_peertab_free(homa->peertab); + homa_peer_free_peertab(homa->peertab); homa->peertab = NULL; } #ifndef __STRIP__ /* See strip.py */ @@ -182,7 +182,7 @@ int homa_net_init(struct homa_net *hnet, struct net *net, struct homa *homa) void homa_net_destroy(struct homa_net *hnet) { homa_socktab_destroy(hnet->homa->socktab, hnet); - homa_peertab_free_net(hnet); + homa_peer_free_net(hnet); } #ifndef __STRIP__ /* See strip.py */ diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index d32395b4..f0db68a2 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -1040,12 +1040,12 @@ TEST_F(homa_incoming, homa_dispatch_pkts__cutoffs_for_unknown_client_rpc) struct homa_peer *peer; homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); - peer = homa_peer_find(&self->hsk, self->server_ip); + peer = homa_peer_get(&self->hsk, self->server_ip); ASSERT_FALSE(IS_ERR(peer)); EXPECT_EQ(400, peer->cutoff_version); EXPECT_EQ(9, peer->unsched_cutoffs[1]); EXPECT_EQ(3, peer->unsched_cutoffs[7]); - homa_peer_put(peer); + homa_peer_release(peer); } #endif /* See strip.py */ TEST_F(homa_incoming, homa_dispatch_pkts__resend_for_unknown_server_rpc) @@ -1796,10 +1796,10 @@ TEST_F(homa_incoming, homa_cutoffs__cant_find_peer) mock_kmalloc_errors = 1; homa_cutoffs_pkt(skb, &self->hsk); EXPECT_EQ(1, homa_metrics_per_cpu()->peer_kmalloc_errors); - peer = homa_peer_find(&self->hsk, self->server_ip); + peer = homa_peer_get(&self->hsk, self->server_ip); ASSERT_FALSE(IS_ERR(peer)); EXPECT_EQ(0, peer->cutoff_version); - homa_peer_put(peer); + homa_peer_release(peer); } #endif /* See strip.py */ @@ -1869,7 +1869,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_not_incoming) } TEST_F(homa_incoming, homa_need_ack_pkt__rpc_doesnt_exist) { - struct homa_peer *peer = homa_peer_find(&self->hsk, self->server_ip); + struct homa_peer *peer = homa_peer_get(&self->hsk, self->server_ip); struct homa_need_ack_hdr h = {.common = { .sport = htons(self->server_port), .dport = htons(self->hsk.port), @@ -1883,7 +1883,7 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_doesnt_exist) homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); EXPECT_STREQ("xmit ACK from 0.0.0.0:32768, dport 99, id 1234, acks [sp 99, id 1236]", unit_log_get()); - homa_peer_put(peer); + homa_peer_release(peer); } TEST_F(homa_incoming, homa_ack_pkt__target_rpc_exists_no_extras) diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index cd7a1610..a35f4471 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -93,13 +93,13 @@ FIXTURE_SETUP(homa_outgoing) self->server_addr.in6.sin6_family = AF_INET; self->server_addr.in6.sin6_addr = self->server_ip[0]; self->server_addr.in6.sin6_port = htons(self->server_port); - self->peer = homa_peer_find(&self->hsk, + self->peer = homa_peer_get(&self->hsk, &self->server_addr.in6.sin6_addr); unit_log_clear(); } FIXTURE_TEARDOWN(homa_outgoing) { - homa_peer_put(self->peer); + homa_peer_release(self->peer); homa_destroy(&self->homa); unit_teardown(); } diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index c0582e3a..fe9bd3e1 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -68,8 +68,8 @@ static void peer_race_hook(char *id) /* Create a peer with the same address as the one being created * by the current test. 
*/ - conflicting_peer = homa_peer_find(&test_data->hsk, ip3333); - homa_peer_put(conflicting_peer); + conflicting_peer = homa_peer_get(&test_data->hsk, ip3333); + homa_peer_release(conflicting_peer); jiffies += 10; } @@ -92,46 +92,46 @@ static void complete_rcu_hook(char *id) { homa_peer_rcu_callback(&hook_peertab->rcu_head); } -TEST_F(homa_peer, homa_peertab_alloc__success) +TEST_F(homa_peer, homa_peer_alloc_peertab__success) { struct homa_peertab *peertab; - peertab = homa_peertab_alloc(); + peertab = homa_peer_alloc_peertab(); EXPECT_FALSE(IS_ERR(peertab)); - homa_peertab_free(peertab); + homa_peer_free_peertab(peertab); } -TEST_F(homa_peer, homa_peertab_alloc__cant_alloc_peertab) +TEST_F(homa_peer, homa_peer_alloc_peertab__cant_alloc_peertab) { struct homa_peertab *peertab; mock_kmalloc_errors = 1; - peertab = homa_peertab_alloc(); + peertab = homa_peer_alloc_peertab(); EXPECT_TRUE(IS_ERR(peertab)); EXPECT_EQ(ENOMEM, -PTR_ERR(peertab)); } -TEST_F(homa_peer, homa_peertab_alloc__rhashtable_init_fails) +TEST_F(homa_peer, homa_peer_alloc_peertab__rhashtable_init_fails) { struct homa_peertab *peertab; mock_rht_init_errors = 1; - peertab = homa_peertab_alloc(); + peertab = homa_peer_alloc_peertab(); EXPECT_TRUE(IS_ERR(peertab)); EXPECT_EQ(EINVAL, -PTR_ERR(peertab)); } -TEST_F(homa_peer, homa_peertab_alloc__cant_register_sysctl) +TEST_F(homa_peer, homa_peer_alloc_peertab__cant_register_sysctl) { struct homa_peertab *peertab; mock_register_sysctl_errors = 1; - peertab = homa_peertab_alloc(); + peertab = homa_peer_alloc_peertab(); EXPECT_TRUE(IS_ERR(peertab)); EXPECT_EQ(ENOMEM, -PTR_ERR(peertab)); EXPECT_SUBSTR("couldn't register sysctl parameters for Homa peertab", mock_printk_output); } -TEST_F(homa_peer, homa_peertab_free_net__basics) +TEST_F(homa_peer, homa_peer_free_net__basics) { /* Create peers from two different netns's, make sure only * those from one get freed. 
*/ @@ -142,39 +142,39 @@ TEST_F(homa_peer, homa_peertab_free_net__basics) hnet2 = mock_alloc_hnet(&self->homa); mock_sock_init(&hsk2, hnet2, 44); - peer = homa_peer_find(&self->hsk, ip1111); - homa_peer_put(peer); - peer = homa_peer_find(&self->hsk, ip2222); - homa_peer_put(peer); - peer = homa_peer_find(&hsk2, ip3333); - homa_peer_put(peer); + peer = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peer); + peer = homa_peer_get(&self->hsk, ip2222); + homa_peer_release(peer); + peer = homa_peer_get(&hsk2, ip3333); + homa_peer_release(peer); EXPECT_EQ(3, unit_count_peers(&self->homa)); EXPECT_EQ(3, self->homa.peertab->num_peers); EXPECT_EQ(2, self->hnet->num_peers); - homa_peertab_free_net(self->hnet); + homa_peer_free_net(self->hnet); EXPECT_EQ(1, unit_count_peers(&self->homa)); EXPECT_EQ(1, self->homa.peertab->num_peers); } -TEST_F(homa_peer, homa_peertab_free_net__set_gc_stop_count) +TEST_F(homa_peer, homa_peer_free_net__set_gc_stop_count) { struct homa_peer *peer; - peer = homa_peer_find(&self->hsk, ip1111); - homa_peer_put(peer); + peer = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peer); unit_hook_register(stop_gc_hook); hook_peertab = self->homa.peertab; unit_log_clear(); self->homa.peertab->gc_stop_count = 3; - homa_peertab_free_net(self->hnet); + homa_peer_free_net(self->hnet); EXPECT_EQ(0, unit_count_peers(&self->homa)); EXPECT_STREQ("gc_stop_count 4", unit_log_get()); EXPECT_EQ(3, self->homa.peertab->gc_stop_count); } -TEST_F(homa_peer, homa_peertab_free_fn) +TEST_F(homa_peer, homa_peer_free_fn) { struct homa_peer *peer; struct dst_entry *dst; @@ -183,38 +183,38 @@ TEST_F(homa_peer, homa_peertab_free_fn) dst = peer->dst; dst_hold(dst); EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); - homa_peer_put(peer); + homa_peer_release(peer); - homa_peertab_free_fn(peer, NULL); + homa_peer_free_fn(peer, NULL); EXPECT_EQ(1, atomic_read(&dst->__rcuref.refcnt)); dst_release(dst); } -TEST_F(homa_peer, homa_peertab_free__basics) { +TEST_F(homa_peer, homa_peer_free_peertab__basics) { struct homa_peer *peer; - peer = homa_peer_find(&self->hsk, ip1111); - homa_peer_put(peer); - peer = homa_peer_find(&self->hsk, ip2222); + peer = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peer); + peer = homa_peer_get(&self->hsk, ip2222); mock_peer_free_no_fail = 1; unit_log_clear(); - homa_peertab_free(self->homa.peertab); + homa_peer_free_peertab(self->homa.peertab); EXPECT_STREQ("peer [2::2:2:2] has reference count 1; " "unregister_net_sysctl_table", unit_log_get()); kfree(peer); - self->homa.peertab = homa_peertab_alloc(); + self->homa.peertab = homa_peer_alloc_peertab(); } -TEST_F(homa_peer, homa_peertab_free__free_dead_peers) { +TEST_F(homa_peer, homa_peer_free_peertab__free_dead_peers) { struct homa_peertab *peertab = self->homa.peertab; struct homa_peer *peer; jiffies = 100; - peer = homa_peer_find(&self->hsk, ip1111); - homa_peer_put(peer); - peer = homa_peer_find(&self->hsk, ip2222); - homa_peer_put(peer); + peer = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peer); + peer = homa_peer_get(&self->hsk, ip2222); + homa_peer_release(peer); jiffies = peertab->idle_jiffies_max + 1000; peertab->num_peers = peertab->gc_threshold + 100; @@ -222,12 +222,12 @@ TEST_F(homa_peer, homa_peertab_free__free_dead_peers) { EXPECT_EQ(2, unit_list_length(&peertab->dead_peers)); homa_peer_rcu_callback(&peertab->rcu_head); - homa_peertab_free(self->homa.peertab); + homa_peer_free_peertab(self->homa.peertab); /* Can't check explicitly for problems (peertab is gone now), but * end-of-test 
checks will complain if the peers weren't freed. */ - self->homa.peertab = homa_peertab_alloc(); + self->homa.peertab = homa_peer_alloc_peertab(); } TEST_F(homa_peer, homa_peer_rcu_callback) { @@ -264,8 +264,8 @@ TEST_F(homa_peer, homa_peer_free_dead) { EXPECT_STREQ("[1::1:1:1]; [2::2:2:2]", unit_log_get()); /* Third call: all reference counts zero. */ - homa_peer_put(peer1); - homa_peer_put(peer2); + homa_peer_release(peer1); + homa_peer_release(peer2); homa_peer_free_dead(peertab); unit_log_clear(); unit_log_dead_peers(&self->homa); @@ -277,7 +277,7 @@ TEST_F(homa_peer, homa_peer_wait_dead) { struct homa_peer *peer; peer = homa_peer_alloc(&self->hsk, ip1111); - homa_peer_put(peer); + homa_peer_release(peer); list_add_tail(&peer->dead_links, &peertab->dead_peers); unit_log_clear(); unit_log_dead_peers(&self->homa); @@ -295,7 +295,7 @@ TEST_F(homa_peer, homa_peer_wait_dead) { EXPECT_EQ(0, hook_free_count); } -TEST_F(homa_peer, homa_peertab_prefer_evict) +TEST_F(homa_peer, homa_peer_prefer_evict) { struct homa_peertab *peertab = self->homa.peertab; struct homa_peer *peer1, *peer2, *peer3, *peer4; @@ -305,20 +305,20 @@ TEST_F(homa_peer, homa_peertab_prefer_evict) hnet2 = mock_alloc_hnet(&self->homa); mock_sock_init(&hsk2, hnet2, 44); - peer1 = homa_peer_find(&self->hsk, ip1111); - homa_peer_put(peer1); + peer1 = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peer1); peer1->access_jiffies = 100; - peer2 = homa_peer_find(&self->hsk, ip2222); - homa_peer_put(peer2); + peer2 = homa_peer_get(&self->hsk, ip2222); + homa_peer_release(peer2); peer2->access_jiffies = 1000; - peer3 = homa_peer_find(&hsk2, ip3333); - homa_peer_put(peer3); + peer3 = homa_peer_get(&hsk2, ip3333); + homa_peer_release(peer3); peer3->access_jiffies = 500; - peer4 = homa_peer_find(&hsk2, ip1111); - homa_peer_put(peer4); + peer4 = homa_peer_get(&hsk2, ip1111); + homa_peer_release(peer4); peer4->access_jiffies = 300; hnet2->num_peers = peertab->net_max + 1; @@ -328,22 +328,22 @@ TEST_F(homa_peer, homa_peertab_prefer_evict) EXPECT_EQ(1, homa_peer_prefer_evict(peertab, peer1, peer2)); homa_sock_destroy(&hsk2); - homa_peertab_free_net(hnet2); + homa_peer_free_net(hnet2); } -TEST_F(homa_peer, homa_peertab_pick_victims__hash_table_wraparound) +TEST_F(homa_peer, homa_peer_pick_victims__hash_table_wraparound) { struct homa_peertab *peertab = self->homa.peertab; struct homa_peer *peers[3], *victims[5]; jiffies = 50; - peers[0] = homa_peer_find(&self->hsk, ip1111); - homa_peer_put(peers[0]); + peers[0] = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peers[0]); peers[1] = NULL; - peers[2] = homa_peer_find(&self->hsk, ip2222); - homa_peer_put(peers[2]); + peers[2] = homa_peer_get(&self->hsk, ip2222); + homa_peer_release(peers[2]); mock_rht_walk_results = (void **)peers; mock_rht_num_walk_results = 3; @@ -353,7 +353,7 @@ TEST_F(homa_peer, homa_peertab_pick_victims__hash_table_wraparound) EXPECT_EQ(peers[0], victims[0]); EXPECT_EQ(peers[2], victims[1]); } -TEST_F(homa_peer, homa_peertab_pick_victims__EAGAIN_from_rht_walk) +TEST_F(homa_peer, homa_peer_pick_victims__EAGAIN_from_rht_walk) { struct homa_peertab *peertab = self->homa.peertab; struct homa_peer *peers[5], *victims[5]; @@ -361,15 +361,15 @@ TEST_F(homa_peer, homa_peertab_pick_victims__EAGAIN_from_rht_walk) jiffies = 50; peers[0] = ERR_PTR(-EAGAIN); - peers[1] = homa_peer_find(&self->hsk, ip1111); - homa_peer_put(peers[1]); + peers[1] = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peers[1]); peers[2] = ERR_PTR(-EAGAIN); peers[3] = ERR_PTR(-EAGAIN); - 
peers[4] = homa_peer_find(&self->hsk, ip2222); - homa_peer_put(peers[4]); + peers[4] = homa_peer_get(&self->hsk, ip2222); + homa_peer_release(peers[4]); mock_rht_walk_results = (void **)peers; mock_rht_num_walk_results = 5; @@ -378,18 +378,18 @@ TEST_F(homa_peer, homa_peertab_pick_victims__EAGAIN_from_rht_walk) EXPECT_EQ(1, homa_peer_pick_victims(peertab, victims, 5)); EXPECT_EQ(peers[1], victims[0]); } -TEST_F(homa_peer, homa_peertab_pick_victims__filter_idle_jiffies_min) +TEST_F(homa_peer, homa_peer_pick_victims__filter_idle_jiffies_min) { struct homa_peertab *peertab = self->homa.peertab; struct homa_peer *peers[2], *victims[5]; jiffies = 100; - peers[1] = homa_peer_find(&self->hsk, ip1111); - homa_peer_put(peers[1]); + peers[1] = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peers[1]); jiffies = 200; - peers[0] = homa_peer_find(&self->hsk, ip2222); - homa_peer_put(peers[0]); + peers[0] = homa_peer_get(&self->hsk, ip2222); + homa_peer_release(peers[0]); mock_rht_walk_results = (void **)peers; mock_rht_num_walk_results = 2; @@ -399,7 +399,7 @@ TEST_F(homa_peer, homa_peertab_pick_victims__filter_idle_jiffies_min) EXPECT_EQ(1, homa_peer_pick_victims(peertab, victims, 5)); EXPECT_EQ(peers[1], victims[0]); } -TEST_F(homa_peer, homa_peertab_pick_victims__filter_idle_jiffies_max) +TEST_F(homa_peer, homa_peer_pick_victims__filter_idle_jiffies_max) { struct homa_peertab *peertab = self->homa.peertab; struct homa_peer *peers[3], *victims[5]; @@ -412,18 +412,18 @@ TEST_F(homa_peer, homa_peertab_pick_victims__filter_idle_jiffies_max) /* First peer: net below limit, idle < max. */ jiffies = 150; - peers[0] = homa_peer_find(&self->hsk, ip1111); - homa_peer_put(peers[0]); + peers[0] = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peers[0]); /* Second peer: net above limit, idle > max. */ jiffies = 50; - peers[1] = homa_peer_find(&hsk2, ip2222); - homa_peer_put(peers[1]); + peers[1] = homa_peer_get(&hsk2, ip2222); + homa_peer_release(peers[1]); /* Third peer: net below limit, idle > max. 
*/ jiffies = 50; - peers[2] = homa_peer_find(&self->hsk, ip3333); - homa_peer_put(peers[2]); + peers[2] = homa_peer_get(&self->hsk, ip3333); + homa_peer_release(peers[2]); mock_rht_walk_results = (void **)peers; mock_rht_num_walk_results = 3; @@ -433,14 +433,14 @@ TEST_F(homa_peer, homa_peertab_pick_victims__filter_idle_jiffies_max) EXPECT_EQ(peers[1], victims[0]); EXPECT_EQ(peers[2], victims[1]); } -TEST_F(homa_peer, homa_peertab_pick_victims__duplicate_peer) +TEST_F(homa_peer, homa_peer_pick_victims__duplicate_peer) { struct homa_peertab *peertab = self->homa.peertab; struct homa_peer *peers[3], *victims[3]; jiffies = 300; - peers[0] = homa_peer_find(&self->hsk, ip1111); - homa_peer_put(peers[0]); + peers[0] = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peers[0]); peers[1] = peers[0]; peers[2] = peers[0]; @@ -452,34 +452,34 @@ TEST_F(homa_peer, homa_peertab_pick_victims__duplicate_peer) EXPECT_EQ(1, homa_peer_pick_victims(peertab, victims, 3)); EXPECT_EQ(peers[0], victims[0]); } -TEST_F(homa_peer, homa_peertab_pick_victims__select_best_candidates) +TEST_F(homa_peer, homa_peer_pick_victims__select_best_candidates) { struct homa_peertab *peertab = self->homa.peertab; struct homa_peer *peers[6], *victims[3]; jiffies = 300; - peers[0] = homa_peer_find(&self->hsk, ip1111); - homa_peer_put(peers[0]); + peers[0] = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peers[0]); jiffies = 400; - peers[1] = homa_peer_find(&self->hsk, ip2222); - homa_peer_put(peers[1]); + peers[1] = homa_peer_get(&self->hsk, ip2222); + homa_peer_release(peers[1]); jiffies = 500; - peers[2] = homa_peer_find(&self->hsk, ip3333); - homa_peer_put(peers[2]); + peers[2] = homa_peer_get(&self->hsk, ip3333); + homa_peer_release(peers[2]); jiffies = 200; - peers[3] = homa_peer_find(&self->hsk, ip4444); - homa_peer_put(peers[3]); + peers[3] = homa_peer_get(&self->hsk, ip4444); + homa_peer_release(peers[3]); jiffies = 350; - peers[4] = homa_peer_find(&self->hsk, ip5555); - homa_peer_put(peers[4]); + peers[4] = homa_peer_get(&self->hsk, ip5555); + homa_peer_release(peers[4]); jiffies = 600; - peers[5] = homa_peer_find(&self->hsk, ip6666); - homa_peer_put(peers[5]); + peers[5] = homa_peer_get(&self->hsk, ip6666); + homa_peer_release(peers[5]); mock_rht_walk_results = (void **)peers; mock_rht_num_walk_results = 6; @@ -497,8 +497,8 @@ TEST_F(homa_peer, homa_peer_gc__basics) struct homa_peer *peer; jiffies = 300; - peer = homa_peer_find(&self->hsk, ip1111); - homa_peer_put(peer); + peer = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peer); EXPECT_EQ(1, self->hnet->num_peers); jiffies = peertab->idle_jiffies_max + 1000; @@ -525,8 +525,8 @@ TEST_F(homa_peer, homa_peer_gc__gc_stop_count) struct homa_peer *peer; jiffies = 300; - peer = homa_peer_find(&self->hsk, ip1111); - homa_peer_put(peer); + peer = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peer); jiffies = peertab->idle_jiffies_max + 1000; peertab->num_peers = peertab->gc_threshold; @@ -543,8 +543,8 @@ TEST_F(homa_peer, homa_peer_gc__call_rcu_pending) struct homa_peer *peer; jiffies = 300; - peer = homa_peer_find(&self->hsk, ip1111); - homa_peer_put(peer); + peer = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peer); jiffies = peertab->idle_jiffies_max + 1000; peertab->num_peers = peertab->gc_threshold; @@ -561,8 +561,8 @@ TEST_F(homa_peer, homa_peer_gc__peers_below_gc_threshold) struct homa_peer *peer; jiffies = 300; - peer = homa_peer_find(&self->hsk, ip1111); - homa_peer_put(peer); + peer = homa_peer_get(&self->hsk, ip1111); + 
homa_peer_release(peer); jiffies = peertab->idle_jiffies_max + 1000; peertab->num_peers = peertab->gc_threshold - 1; @@ -578,8 +578,8 @@ TEST_F(homa_peer, homa_peer_gc__no_suitable_candidates) struct homa_peer *peer; jiffies = 100; - peer = homa_peer_find(&self->hsk, ip1111); - homa_peer_put(peer); + peer = homa_peer_get(&self->hsk, ip1111); + homa_peer_release(peer); jiffies = peertab->idle_jiffies_min; peertab->num_peers = peertab->gc_threshold - 1; @@ -605,7 +605,7 @@ TEST_F(homa_peer, homa_peer_alloc__success) EXPECT_EQ(1, homa_metrics_per_cpu()->peer_allocs); #endif /* See strip.py */ EXPECT_EQ(1, atomic_read(&peer->dst->__rcuref.refcnt)); - homa_peer_put(peer); + homa_peer_release(peer); homa_peer_free(peer); } TEST_F(homa_peer, homa_peer_alloc__kmalloc_error) @@ -644,7 +644,7 @@ TEST_F(homa_peer, homa_peer_free__normal) dst_hold(dst); ASSERT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); - homa_peer_put(peer); + homa_peer_release(peer); homa_peer_free(peer); ASSERT_EQ(1, atomic_read(&dst->__rcuref.refcnt)); dst_release(dst); @@ -669,7 +669,7 @@ TEST_F(homa_peer, homa_peer_find__basics) /* First call: create new peer. */ jiffies = 456; - peer = homa_peer_find(&self->hsk, ip1111); + peer = homa_peer_get(&self->hsk, ip1111); ASSERT_FALSE(IS_ERR(peer)); EXPECT_EQ_IP(*ip1111, peer->addr); EXPECT_EQ(456, peer->access_jiffies); @@ -681,14 +681,14 @@ TEST_F(homa_peer, homa_peer_find__basics) EXPECT_EQ(1, self->hnet->num_peers); /* Second call: lookup existing peer. */ - peer2 = homa_peer_find(&self->hsk, ip1111); + peer2 = homa_peer_get(&self->hsk, ip1111); EXPECT_EQ(peer, peer2); EXPECT_EQ(2, atomic_read(&peer->refs)); EXPECT_EQ(1, self->homa.peertab->num_peers); EXPECT_EQ(1, self->hnet->num_peers); /* Third call: lookup new peer. */ - peer2 = homa_peer_find(&self->hsk, ip2222); + peer2 = homa_peer_get(&self->hsk, ip2222); EXPECT_NE(peer, peer2); ASSERT_FALSE(IS_ERR(peer2)); EXPECT_EQ(1, atomic_read(&peer2->refs)); @@ -698,16 +698,16 @@ TEST_F(homa_peer, homa_peer_find__basics) #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(2, homa_metrics_per_cpu()->peer_allocs); #endif /* See strip.py */ - homa_peer_put(peer); - homa_peer_put(peer); - homa_peer_put(peer2); + homa_peer_release(peer); + homa_peer_release(peer); + homa_peer_release(peer2); } TEST_F(homa_peer, homa_peer_find__error_in_homa_peer_alloc) { struct homa_peer *peer; mock_route_errors = 1; - peer = homa_peer_find(&self->hsk, ip3333); + peer = homa_peer_get(&self->hsk, ip3333); EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(peer)); #ifndef __STRIP__ /* See strip.py */ @@ -719,7 +719,7 @@ TEST_F(homa_peer, homa_peer_find__insert_error) struct homa_peer *peer; mock_rht_insert_errors = 1; - peer = homa_peer_find(&self->hsk, ip3333); + peer = homa_peer_get(&self->hsk, ip3333); EXPECT_TRUE(IS_ERR(peer)); EXPECT_EQ(EINVAL, -PTR_ERR(peer)); } @@ -731,12 +731,12 @@ TEST_F(homa_peer, homa_peer_find__conflicting_create) peer_race_hook_invocations = 0; unit_hook_register(peer_race_hook); jiffies = 100; - peer = homa_peer_find(&self->hsk, ip3333); + peer = homa_peer_get(&self->hsk, ip3333); EXPECT_FALSE(IS_ERR(conflicting_peer)); EXPECT_EQ(conflicting_peer, peer); EXPECT_EQ(1, atomic_read(&peer->refs)); EXPECT_EQ(110, peer->access_jiffies); - homa_peer_put(peer); + homa_peer_release(peer); EXPECT_EQ(1, self->homa.peertab->num_peers); EXPECT_EQ(1, self->hnet->num_peers); } @@ -746,21 +746,21 @@ TEST_F(homa_peer, homa_dst_refresh__basics) struct dst_entry *old_dst; struct homa_peer *peer; - peer = homa_peer_find(&self->hsk, ip1111); + peer = 
homa_peer_get(&self->hsk, ip1111); ASSERT_NE(NULL, peer); EXPECT_EQ_IP(*ip1111, peer->addr); old_dst = peer->dst; homa_dst_refresh(self->homa.peertab, peer, &self->hsk); EXPECT_NE(old_dst, peer->dst); - homa_peer_put(peer); + homa_peer_release(peer); } TEST_F(homa_peer, homa_dst_refresh__routing_error) { struct dst_entry *old_dst; struct homa_peer *peer; - peer = homa_peer_find(&self->hsk, ip1111); + peer = homa_peer_get(&self->hsk, ip1111); ASSERT_NE(NULL, peer); EXPECT_EQ_IP(*ip1111, peer->addr); @@ -771,7 +771,7 @@ TEST_F(homa_peer, homa_dst_refresh__routing_error) #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->peer_route_errors); #endif /* See strip.py */ - homa_peer_put(peer); + homa_peer_release(peer); } #ifndef __STRIP__ /* See strip.py */ @@ -796,7 +796,7 @@ TEST_F(homa_peer, homa_peer_get_dst_ipv4) homa_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, self->hnet, 0); - struct homa_peer *peer = homa_peer_find(&self->hsk, + struct homa_peer *peer = homa_peer_get(&self->hsk, &self->client_ip[0]); ASSERT_NE(NULL, peer); @@ -805,7 +805,7 @@ TEST_F(homa_peer, homa_peer_get_dst_ipv4) dst_release(dst); EXPECT_STREQ("196.168.0.1", homa_print_ipv4_addr(peer->flow.u.ip4.daddr)); - homa_peer_put(peer); + homa_peer_release(peer); } TEST_F(homa_peer, homa_peer_get_dst_ipv6) { @@ -818,7 +818,7 @@ TEST_F(homa_peer, homa_peer_get_dst_ipv6) homa_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, self->hnet, 0); - struct homa_peer *peer = homa_peer_find(&self->hsk, &ip1111[0]); + struct homa_peer *peer = homa_peer_get(&self->hsk, &ip1111[0]); ASSERT_NE(NULL, peer); dst = homa_peer_get_dst(peer, &self->hsk); @@ -829,13 +829,13 @@ TEST_F(homa_peer, homa_peer_get_dst_ipv6) (addr >> 16) & 0xff, (addr >> 8) & 0xff, addr & 0xff); EXPECT_STREQ("[1::1:1:1]", homa_print_ipv6_addr(&peer->flow.u.ip6.daddr)); - homa_peer_put(peer); + homa_peer_release(peer); } #ifndef __STRIP__ /* See strip.py */ TEST_F(homa_peer, homa_peer_lock_slow) { - struct homa_peer *peer = homa_peer_find(&self->hsk, ip3333); + struct homa_peer *peer = homa_peer_get(&self->hsk, ip3333); ASSERT_NE(NULL, peer); mock_clock = 10000; @@ -850,7 +850,7 @@ TEST_F(homa_peer, homa_peer_lock_slow) EXPECT_EQ(1, homa_metrics_per_cpu()->peer_ack_lock_misses); EXPECT_EQ(1000, homa_metrics_per_cpu()->peer_ack_lock_miss_cycles); homa_peer_unlock(peer); - homa_peer_put(peer); + homa_peer_release(peer); } #endif /* See strip.py */ @@ -904,7 +904,7 @@ TEST_F(homa_peer, homa_peer_add_ack) TEST_F(homa_peer, homa_peer_get_acks) { - struct homa_peer *peer = homa_peer_find(&self->hsk, ip3333); + struct homa_peer *peer = homa_peer_get(&self->hsk, ip3333); struct homa_ack acks[2]; ASSERT_NE(NULL, peer); @@ -935,7 +935,7 @@ TEST_F(homa_peer, homa_peer_get_acks) EXPECT_EQ(1, homa_peer_get_acks(peer, 2, acks)); EXPECT_STREQ("server_port 5000, client_id 100", unit_ack_string(&acks[0])); - homa_peer_put(peer); + homa_peer_release(peer); } TEST_F(homa_peer, homa_peer_update_sysctl_deps) diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 96a3f0be..d5907157 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -133,9 +133,9 @@ TEST_F(homa_plumbing, homa_net_exit__free_peers) struct in6_addr addr2 = unit_get_in_addr("1.2.3.5"); struct in6_addr addr3 = unit_get_in_addr("1.2.3.6"); - homa_peer_put(homa_peer_find(&self->hsk, &addr1)); - homa_peer_put(homa_peer_find(&self->hsk, &addr2)); - homa_peer_put(homa_peer_find(&self->hsk, &addr3)); + homa_peer_release(homa_peer_get(&self->hsk, &addr1)); + 
homa_peer_release(homa_peer_get(&self->hsk, &addr2));
+	homa_peer_release(homa_peer_get(&self->hsk, &addr3));
 
 	EXPECT_EQ(3, unit_count_peers(&self->homa));
 	homa_net_exit(self->hsk.hnet->net);
diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c
index 58859607..0c7968b7 100644
--- a/test/unit_homa_utils.c
+++ b/test/unit_homa_utils.c
@@ -91,7 +91,7 @@ TEST_F(homa_utils, homa_init__peertab_alloc_failure)
 	mock_kmalloc_errors = 4;
 	unit_log_clear();
 	EXPECT_EQ(ENOMEM, -homa_init(&homa2));
-	EXPECT_SUBSTR("homa_peertab_alloc couldn't create peertab: kmalloc failure",
+	EXPECT_SUBSTR("homa_peer_alloc_peertab couldn't create peertab: kmalloc failure",
 		      mock_printk_output);
 	EXPECT_EQ(NULL, homa2.peertab);
 	homa_destroy(&homa2);
@@ -157,13 +157,13 @@ TEST_F(homa_utils, homa_net_destroy__delete_peers)
 	mock_sock_init(&hsk2, hnet, 44);
 
 	addr = unit_get_in_addr("1.2.3.4");
-	peer = homa_peer_find(&hsk2, &addr);
-	homa_peer_put(peer);
-	peer = homa_peer_find(&self->hsk, &addr);
-	homa_peer_put(peer);
+	peer = homa_peer_get(&hsk2, &addr);
+	homa_peer_release(peer);
+	peer = homa_peer_get(&self->hsk, &addr);
+	homa_peer_release(peer);
 	addr = unit_get_in_addr("1.2.3.5");
-	peer = homa_peer_find(&hsk2, &addr);
-	homa_peer_put(peer);
+	peer = homa_peer_get(&hsk2, &addr);
+	homa_peer_release(peer);
 
 	EXPECT_EQ(3, unit_count_peers(&self->homa));
 	homa_net_destroy(hnet);
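
The rename above is more than cosmetic: homa_peer_get() now advertises that it
returns a counted reference (or an ERR_PTR), and homa_peer_release() is the
required counterpart, as the updated doc comments in homa_peer.c and
homa_peer.h spell out. A minimal sketch of the calling pattern follows; the
helper example_touch_peer() is hypothetical, and only homa_peer_get(),
homa_peer_release(), and the IS_ERR() convention are taken from the patch:

/* Hypothetical caller showing the get/release pairing that the new
 * names are meant to convey. homa_peer_get() returns a peer whose
 * reference count has already been incremented, so every successful
 * call must eventually be matched by homa_peer_release().
 */
static void example_touch_peer(struct homa_sock *hsk,
			       const struct in6_addr *addr)
{
	struct homa_peer *peer;

	peer = homa_peer_get(hsk, addr);
	if (IS_ERR(peer))
		return;			/* No reference was taken. */

	/* ... use the peer while holding the reference ... */

	homa_peer_release(peer);	/* Peer may be freed after this. */
}
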
 
 	if (test_bit(SOCK_NOSPACE, &hsk->sock.sk_socket->flags)) {
 		/* There are tasks waiting for tx memory, so reap
diff --git a/homa_timer.c b/homa_timer.c
index cc84eb12..0648f1bf 100644
--- a/homa_timer.c
+++ b/homa_timer.c
@@ -228,9 +228,9 @@ void homa_timer(struct homa *homa)
 	for (hsk = homa_socktab_start_scan(homa->socktab, &scan); hsk;
 	     hsk = homa_socktab_next(&scan)) {
 		while (hsk->dead_skbs >= homa->dead_buffs_limit) {
-			/* If we get here, it means that homa_wait_for_message
-			 * isn't keeping up with RPC reaping, so we'll help
-			 * out. See reap.txt for more info.
+			/* If we get here, it means that Homa isn't keeping
+			 * up with RPC reaping, so we'll help out. See
+			 * reap.txt for more info.
 			 */
 #ifndef __STRIP__ /* See strip.py */
 			u64 rpc_start = homa_clock();

From 624088e2d92df2ed9674acf8b1be391b9cbc8011 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Wed, 21 May 2025 16:38:59 -0700
Subject: [PATCH 325/625] Eliminate unused metric: forced_reaps

---
 homa_metrics.c  | 2 --
 homa_metrics.h  | 6 ------
 util/metrics.py | 2 +-
 3 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/homa_metrics.c b/homa_metrics.c
index 9e65ee77..2c6f0bd6 100644
--- a/homa_metrics.c
+++ b/homa_metrics.c
@@ -335,8 +335,6 @@ char *homa_metrics_print(void)
 	  m->reaper_calls);
 	M("reaper_dead_skbs      %15llu  Sum of hsk->dead_skbs across all reaper calls\n",
 	  m->reaper_dead_skbs);
-	M("forced_reaps          %15llu  Reaps forced by accumulation of dead RPCs\n",
-	  m->forced_reaps);
 	M("throttle_list_adds    %15llu  Calls to homa_add_to_throttled\n",
 	  m->throttle_list_adds);
 	M("throttle_list_checks  %15llu  List elements checked in homa_add_to_throttled\n",
diff --git a/homa_metrics.h b/homa_metrics.h
index b4e40b1f..678f0c1e 100644
--- a/homa_metrics.h
+++ b/homa_metrics.h
@@ -532,12 +532,6 @@ struct homa_metrics {
 	 */
 	u64 reaper_dead_skbs;
 
-	/**
-	 * @forced_reaps: total number of times that homa_wait_shared/private
-	 * invoked the reaper because dead_skbs was too high.
-	 */
-	u64 forced_reaps;
-
 	/**
 	 * @throttle_list_adds: total number of calls to homa_add_to_throttled.
 	 */
diff --git a/util/metrics.py b/util/metrics.py
index 734e8370..e87f997a 100755
--- a/util/metrics.py
+++ b/util/metrics.py
@@ -443,7 +443,7 @@ def scale_number(number):
             "data_xmit_errors", "server_cant_create_rpcs",
             "server_cant_create_rpcs", "short_packets",
             "rpc_timeouts", "server_rpc_discards",
-            "server_rpcs_unknown", "forced_reaps", "buffer_alloc_failures",
+            "server_rpcs_unknown", "buffer_alloc_failures",
             "dropped_data_no_bufs", "linux_pkt_alloc_bytes"]:
         if deltas[symbol] == 0:
             continue

From 7eb3220209b1909d5b2bed695467ed4b6a0f6a4a Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Wed, 21 May 2025 16:35:19 -0700
Subject: [PATCH 326/625] Eliminate reap.txt

Move its contents into the code as a comment in homa_rpc_reap.

---
 homa_incoming.c |  3 ++-
 homa_rpc.c      | 62 ++++++++++++++++++++++++++++++++++++++++++-------
 homa_timer.c    |  3 ++-
 reap.txt        | 50 ---------------------------------------
 4 files changed, 58 insertions(+), 60 deletions(-)
 delete mode 100644 reap.txt

diff --git a/homa_incoming.c b/homa_incoming.c
index 6b25f248..567c6bed 100644
--- a/homa_incoming.c
+++ b/homa_incoming.c
@@ -595,7 +595,8 @@ void homa_dispatch_pkts(struct sk_buff *skb)
 			if (hsk->dead_skbs >= 2 * hsk->homa->dead_buffs_limit) {
 				/* We get here if other approaches are not keeping up with
-				 * reaping dead RPCs. See reap.txt for details.
+				 * reaping dead RPCs. See "RPC Reaping Strategy" in
+				 * homa_rpc_reap code for details.
 				 */
 #ifndef __STRIP__ /* See strip.py */
 				u64 start = homa_clock();
diff --git a/homa_rpc.c b/homa_rpc.c
index affad4cc..5a81d01b 100644
--- a/homa_rpc.c
+++ b/homa_rpc.c
@@ -391,24 +391,70 @@ void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr,
 
 /**
  * homa_rpc_reap() - Invoked to release resources associated with dead
- * RPCs for a given socket. For a large RPC, it can take a long time to
- * free all of its packet buffers, so we try to perform this work
- * off the critical path where it won't delay applications. Each call to
- * this function normally does a small chunk of work (unless reap_all is
- * true). See the file reap.txt for more information.
+ * RPCs for a given socket.
  * @hsk:      Homa socket that may contain dead RPCs. Must not be locked by the
  *            caller; this function will lock and release.
  * @reap_all: False means do a small chunk of work; there may still be
- *            unreaped RPCs on return. True means reap all dead rpcs for
+ *            unreaped RPCs on return. True means reap all dead RPCs for
  *            hsk. Will busy-wait if reaping has been disabled for some RPCs.
  *
  * Return: A return value of 0 means that we ran out of work to do; calling
  *         again will do no work (there could be unreaped RPCs, but if so,
- *         reaping has been disabled for them). A value greater than
- *         zero means there is still more reaping work to be done.
+ *         they cannot currently be reaped). A value greater than zero means
+ *         there is still more reaping work to be done.
  */
 int homa_rpc_reap(struct homa_sock *hsk, bool reap_all)
 {
+	/* RPC Reaping Strategy:
+	 *
+	 * (Note: there are references to this comment elsewhere in the
+	 * Homa code)
+	 *
+	 * Most of the cost of reaping comes from freeing sk_buffs; this can be
+	 * quite expensive for RPCs with long messages.
+	 *
+	 * The natural time to reap is when homa_rpc_end is invoked to
+	 * terminate an RPC, but this doesn't work for two reasons. First,
+	 * there may be outstanding references to the RPC; it cannot be reaped
+	 * until all of those references have been released. Second, reaping
+	 * is potentially expensive and RPC termination could occur in
+	 * homa_softirq when there are short messages waiting to be processed.
+	 * Taking time to reap a long RPC could result in significant delays
+	 * for subsequent short RPCs.
+	 *
+	 * Thus Homa doesn't reap immediately in homa_rpc_end. Instead, dead
+	 * RPCs are queued up and reaping occurs in this function, which is
+	 * invoked later when it is less likely to impact latency. The
+	 * challenge is to do this so that (a) we don't allow large numbers of
+	 * dead RPCs to accumulate and (b) we minimize the impact of reaping
+	 * on latency.
+	 *
+	 * The primary place where homa_rpc_reap is invoked is when threads
+	 * are waiting for incoming messages. The thread has nothing else to
+	 * do (it may even be polling for input), so reaping can be performed
+	 * with no latency impact on the application. However, if a machine
+	 * is overloaded then it may never wait, so this mechanism isn't always
+	 * sufficient.
+	 *
+	 * Homa now reaps in two other places, if reaping while waiting for
+	 * messages isn't adequate:
+	 * 1. If too many dead skbs accumulate, then homa_timer will call
+	 *    homa_rpc_reap.
+	 * 2. If this timer thread cannot keep up with all the reaping to be
+	 *    done then as a last resort homa_dispatch_pkts will reap in small
+	 *    increments (a few sk_buffs or RPCs) for every incoming batch
+	 *    of packets. This is undesirable because it will impact Homa's
+	 *    performance.
+	 *
+	 * During the introduction of homa_pools for managing input
+	 * buffers, freeing of packets for incoming messages was moved to
+	 * homa_copy_to_user under the assumption that this code wouldn't be
+	 * on the critical path. However, there is evidence that with
+	 * fast networks (e.g. 100 Gbps) copying to user space is the
+	 * bottleneck for incoming messages, and packet freeing takes about
+	 * 20-25% of the total time in homa_copy_to_user. So, it may eventually
+	 * be desirable to move packet freeing out of homa_copy_to_user.
+	 */
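To make the waiting-thread mechanism described in the comment above concrete, here is a minimal sketch (not code from this patch) of how a receive path might spend idle time on reaping; example_msg_ready() and example_block() are hypothetical placeholders for Homa's real wait machinery:

	for (;;) {
		if (example_msg_ready(hsk))	/* hypothetical readiness test */
			break;
		/* Reap one bounded chunk; a return value > 0 means more
		 * dead RPCs remain, so keep polling and reaping.
		 */
		if (homa_rpc_reap(hsk, false) > 0)
			continue;
		example_block(hsk);	/* nothing left to reap: really sleep */
	}

A nonzero return keeps the thread productively busy while it waits; a zero return means further calls would be wasted work, so the thread can simply block.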
 #ifdef __UNIT_TEST__
 #define BATCH_MAX 3
 #else /* __UNIT_TEST__ */
diff --git a/homa_timer.c b/homa_timer.c
index 0648f1bf..dc1a0c9e 100644
--- a/homa_timer.c
+++ b/homa_timer.c
@@ -230,7 +230,8 @@ void homa_timer(struct homa *homa)
 		while (hsk->dead_skbs >= homa->dead_buffs_limit) {
 			/* If we get here, it means that Homa isn't keeping
 			 * up with RPC reaping, so we'll help out. See
-			 * reap.txt for more info.
+			 * "RPC Reaping Strategy" in homa_rpc_reap code for
+			 * details.
 			 */
 #ifndef __STRIP__ /* See strip.py */
 			u64 rpc_start = homa_clock();
diff --git a/reap.txt b/reap.txt
deleted file mode 100644
index a5956039..00000000
--- a/reap.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-This file discusses issues related to freeing resources for completed RPCs
-("reaping").
-
-* Most of the cost of reaping comes from freeing skbuffs; this can be
-  quite expensive for RPCs with long messages.
-
-* The natural time to reap is when homa_rpc_end is invoked to mark an
-  RPC completed, but this can result in severe performance hiccups. However,
-  this can happen in homa_softirq at a time when there are short messages
-  waiting to be processed. Freeing a long RPC could result in significant
-  delay for a subsequent short RPC.
-
-* Thus Homa doesn't reap immediately in homa_rpc_end. Instead, dead RPCs
-  are queued up and reaping occurs later, at a more convenient time where
-  it is less likely to impact latency. The challenge is to figure out how to
-  do this so that (a) we keep up with dead RPCs and (b) we minimize
-  the impact of reaping on latency.
-
-* The ideal time to reap is when threads are waiting for incoming messages
-  in homa_wait_for_message. The thread has nothing else to do, so reaping
-  can be performed with no latency impact on the application. However,
-  if a machine is overloaded then it may never wait, so this mechanism
-  isn't always sufficient.
-
-* Homa now reaps in two other places, if homa_wait_for_message can't
-  keep up:
-  * If dead_buffs_limit dead skbs accumulate, then homa_timer will
-    reap to get down to that limit. However, it seems possible that
-    there may be cases where a single thread cannot keep up with all
-    the reaping to be done.
-  * If homa_timer can't keep up, then as a last resort, homa_dispatch_pkts
-    will reap a few buffers for every incoming data packet. This is undesirable
-    because it will impact Homa's performance.
-
-* During the conversion to the new input buffering scheme, freeing of packets
-  for incoming messages was moved to homa_copy_to_user, under the assumption
-  that this code wouldn't be on the critical path. However, right now the
-  packet freeing is taking 20-25% of the total time in that function, and
-  with faster networks it's quite possible that this code will indeed be on
-  the critical path. So, it may eventually be necessary to remove
-  packet freeing from homa_copy_to_user.
-
-* Here are some approaches that have been tried and eventually abandoned:
-  * Occasionally when data packets arrive, reap if too much dead info has
-    accumulated. This will cause a latency impact. The amount to reap is
-    chosen dynamically (by homa_timer) to be as small as possible while
-    gradually working through the backlog. Unfortunately, the formula for
-    computing how much to reap was fragile and resulted in situations where
-    the backlog of dead RPCs grew without bound. This approach was abandoned
-    in October 2021.

From df6de24e28db3bb2313b45610335b01547fcf313 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Wed, 21 May 2025 17:33:44 -0700
Subject: [PATCH 327/625] Remove sync.txt

Move its contents into the code as comments, primarily in homa_impl.h

---
 homa_impl.h  | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++
 homa_pacer.c |   9 +++--
 homa_pool.c  |   5 ++-
 homa_rpc.h   |  12 ++----
 homa_sock.c  |   3 +-
 homa_sock.h  |  28 +++++++++++---
 sync.txt     |  73 -------------------------------------
 7 files changed, 137 insertions(+), 94 deletions(-)
 delete mode 100644 sync.txt

diff --git a/homa_impl.h b/homa_impl.h
index 3571bee9..4df2de59 100644
--- a/homa_impl.h
+++ b/homa_impl.h
@@ -900,4 +900,105 @@ static inline u64 homa_usecs_to_cycles(u64 usecs)
 #endif /* __UNIT_TEST__ */
 }
 
+/* Homa Locking Strategy:
+ *
+ * (Note: this documentation is referenced in several other places in the
+ * Homa code)
+ *
+ * In the Linux TCP/IP stack the primary locking mechanism is a sleep-lock
+ * per socket. However, per-socket locks aren't adequate for Homa, because
+ * sockets are "larger" in Homa. In TCP, a socket corresponds to a single
+ * connection between two peers; an application can have hundreds or
+ * thousands of sockets open at once, so per-socket locks leave lots of
+ * opportunities for concurrency. With Homa, a single socket can be used for
+ * communicating with any number of peers, so there will typically be just
+ * one socket per thread. As a result, a single Homa socket must support many
+ * concurrent RPCs efficiently, and a per-socket lock would create a bottleneck
+ * (Homa tried this approach initially).
+ *
+ * Thus, the primary locks used in Homa are spinlocks at RPC granularity. This
+ * allows operations on different RPCs for the same socket to proceed
+ * concurrently. Homa also has socket locks (which are spinlocks different
+ * from the official socket sleep-locks) but these are used much less
+ * frequently than RPC locks.
+ *
+ * Lock Ordering:
+ *
+ * There are several other locks in Homa besides RPC locks, all of which
+ * are spinlocks. When multiple locks are held, they must be acquired in a
+ * consistent order in order to prevent deadlock. Here are the rules for Homa:
+ * 1. Except for RPC and socket locks, all locks should be considered
+ *    "leaf" locks: don't accquire other locks while holding them.
+ * 2. The lock order is:
+ *    * RPC lock
+ *    * Socket lock
+ *    * Other lock
+ * 3. It is not safe to wait on an RPC lock while holding any other lock.
+ * 4. It is safe to wait on a socket lock while holding an RPC lock, but
+ *    not while holding any other lock.
+ *
+ * It may seem surprising that RPC locks are acquired *before* socket locks,
+ * but this is essential for high performance. Homa has been designed so that
+ * many common operations (such as processing input packets) can be performed
+ * while holding only an RPC lock; this allows operations on different RPCs
+ * to proceed in parallel. Only a few operations, such as handing off an
+ * incoming message to a waiting thread, require the socket lock. If socket
+ * locks had to be acquired first, any operation that might eventually need
+ * the socket lock would have to acquire it before the RPC lock, which would
+ * severely restrict concurrency.
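As a concrete illustration of the ordering rules above, here is a hedged sketch (not part of this patch) of an operation that holds an RPC lock and then needs the socket. It assumes socket lock helpers of the obvious shape (homa_sock_lock()/homa_sock_unlock(), alongside the homa_rpc_lock()/homa_rpc_unlock() that appear in this series), and the -ESHUTDOWN value is purely illustrative:

	homa_rpc_lock(rpc);		/* RPC lock always comes first. */
	homa_sock_lock(hsk);		/* Socket lock second, per the rules. */
	if (hsk->shutdown) {
		/* Mandatory check; see "Socket Shutdown" just below. */
		homa_sock_unlock(hsk);
		homa_rpc_unlock(rpc);
		return -ESHUTDOWN;
	}
	/* ... update per-socket state for this RPC ... */
	homa_sock_unlock(hsk);
	homa_rpc_unlock(rpc);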
+ *
+ * Socket Shutdown:
+ *
+ * It is possible for socket shutdown to begin while operations are underway
+ * that hold RPC locks but not the socket lock. For example, a new RPC
+ * creation might be underway when a socket is shut down. The RPC creation
+ * will eventually acquire the socket lock and add the new RPC to those
+ * for the socket; it would be very bad if this were to happen after
+ * homa_sock_shutdown thinks it has deleted all RPCs for the socket.
+ * In general, any operation that acquires a socket lock must check
+ * hsk->shutdown after acquiring the lock and abort if hsk->shutdown is set.
+ *
+ * Spinlock Implications:
+ *
+ * Homa uses spinlocks exclusively; this is needed because locks typically
+ * need to be acquired at atomic level, such as in SoftIRQ code.
+ *
+ * Operations that can block, such as memory allocation and copying data
+ * to/from user space, are not permitted while holding spinlocks (spinlocks
+ * disable interrupts, so the holder must not block). This results in awkward
+ * code in several places to move restricted operations outside locked
+ * regions. Such code typically looks like this:
+ * - Acquire a reference on an object such as an RPC, in order to prevent
+ *   the object from being deleted.
+ * - Release the object's lock.
+ * - Perform the restricted operation.
+ * - Re-acquire the lock.
+ * - Release the reference.
+ * It is possible that the object may have been modified by some other party
+ * while it was unlocked, so additional checks may be needed after reacquiring
+ * the lock. As one example, an RPC may have been terminated, in which case
+ * any operation in progress on that RPC should be aborted after reacquiring
+ * the lock.
+ *
+ * Lists of RPCs:
+ *
+ * There are a few places where Homa needs to process all of the RPCs
+ * associated with a socket, such as the timer. Such code must first lock
+ * the socket (to protect access to the link pointers) then lock
+ * individual RPCs on the list. However, this violates the rules for locking
+ * order. It isn't safe to unlock the socket before locking the individual RPCs,
+ * because RPCs could be deleted and their memory recycled between the unlock
+ * of the socket lock and the lock of the RPC; this could result in corruption.
+ * Homa uses two different approaches to handle this situation:
+ * 1. Use ``homa_protect_rpcs`` to prevent RPC reaping for a socket. RPCs can
+ *    still be terminated, but their memory won't go away until
+ *    homa_unprotect_rpcs is invoked. This allows the socket lock to be
+ *    released before acquiring RPC locks; after acquiring each RPC lock,
+ *    the RPC must be checked to see if it has been terminated; if so, skip it.
+ * 2. Use ``spin_trylock_bh`` to acquire the RPC lock while still holding the
+ *    socket lock. If this fails, then release the socket lock and retry
+ *    both the socket lock and the RPC lock. Of course, the state of both
+ *    socket and RPC could change before the locks are finally acquired.
+ */
+
 #endif /* _HOMA_IMPL_H */
diff --git a/homa_pacer.c b/homa_pacer.c
index ed2ca941..465d236a 100644
--- a/homa_pacer.c
+++ b/homa_pacer.c
@@ -258,10 +258,11 @@ void homa_pacer_xmit(struct homa_pacer *pacer)
 		/* Lock the first throttled RPC.
This may not be possible * because we have to hold throttle_lock while locking * the RPC; that means we can't wait for the RPC lock because - * of lock ordering constraints (see sync.txt). Thus, if - * the RPC lock isn't available, do nothing. Holding the - * throttle lock while locking the RPC is important because - * it keeps the RPC from being deleted before it can be locked. + * of lock ordering constraints (see "Homa Locking Strategy" in + * homa_impl.h). Thus, if the RPC lock isn't available, do + * nothing. Holding the throttle lock while locking the RPC + * is important because it keeps the RPC from being deleted + * before it can be locked. */ homa_pacer_throttle_lock(pacer); pacer->fifo_count -= pacer->fifo_fraction; diff --git a/homa_pool.c b/homa_pool.c index 3143fd12..901d30cc 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -506,8 +506,9 @@ void homa_pool_check_waiting(struct homa_pool *pool) if (!homa_rpc_try_lock(rpc)) { /* Can't just spin on the RPC lock because we're * holding the socket lock and the lock order is - * rpc->socket (see sync.txt). Instead, release the - * socket lock and try the entire operation again. + * rpc-then-socket (see "Homa Locking Strategy" in + * homa_impl.h). Instead, release the socket lock + * and try the entire operation again. */ homa_sock_unlock(pool->hsk); UNIT_LOG("; ", "rpc lock unavailable in %s", __func__); diff --git a/homa_rpc.h b/homa_rpc.h index ca1f80b3..fb7369d3 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -447,13 +447,7 @@ int homa_validate_incoming(struct homa *homa, int verbose, /** * homa_rpc_lock() - Acquire the lock for an RPC. - * @rpc: RPC to lock. Note: this function is only safe under - * limited conditions (in most cases homa_bucket_lock should be - * used). The caller must ensure that the RPC cannot be reaped - * before the lock is acquired, such as by taking a reference on - * the rpc with homa_rpc_hold or calling homa_protect_rpcs. - * Don't use this function unless you are very sure what you are - * doing! See sync.txt for more info on locking. + * @rpc: RPC to lock. */ static inline void homa_rpc_lock(struct homa_rpc *rpc) __acquires(rpc_bucket_lock) @@ -489,7 +483,9 @@ static inline void homa_rpc_unlock(struct homa_rpc *rpc) * homa_protect_rpcs() - Ensures that no RPCs will be reaped for a given * socket until homa_sock_unprotect is called. Typically used by functions * that want to scan the active RPCs for a socket without holding the socket - * lock. Multiple calls to this function may be in effect at once. + * lock. Multiple calls to this function may be in effect at once. See + * "Homa Locking Strategy" in homa_impl.h for more info on why this function + * is needed. * @hsk: Socket whose RPCs should be protected. Must not be locked * by the caller; will be locked here. * diff --git a/homa_sock.c b/homa_sock.c index b46422ee..b6ebf5dc 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -281,7 +281,8 @@ void homa_sock_shutdown(struct homa_sock *hsk) * there will be no concurrent activities on individual RPCs. * 5. Don't delete the buffer pool until after all of the RPCs * have been reaped. - * See sync.txt for additional information about locking. + * See "Homa Locking Strategy" in homa_impl.h for additional information + * about locking. 
 */
	hsk->shutdown = true;
	homa_sock_unlink(hsk);
diff --git a/homa_sock.h b/homa_sock.h
index 748b0eb6..073d0a9b 100644
--- a/homa_sock.h
+++ b/homa_sock.h
@@ -74,9 +74,26 @@ struct homa_rpc_bucket {
 	/**
 	 * @lock: serves as a lock both for this bucket (e.g., when
 	 * adding and removing RPCs) and also for all of the RPCs in
-	 * the bucket. Must be held whenever manipulating an RPC in
-	 * this bucket. This dual purpose permits clean and safe
-	 * deletion and garbage collection of RPCs.
+	 * the bucket. Must be held whenever looking up an RPC in
+	 * this bucket or manipulating an RPC in the bucket. This approach
+	 * has the following properties:
+	 * 1. An RPC can be looked up and locked (a common operation) with
+	 *    a single lock acquisition.
+	 * 2. Looking up and locking are atomic: there is no window of
+	 *    vulnerability where someone else could delete an RPC after
+	 *    it has been looked up and before it has been locked.
+	 * 3. The lookup mechanism does not use RCU. This is important because
+	 *    RPCs are created rapidly and typically live only a few tens of
+	 *    microseconds. As of May 2025 RCU introduces a lag of about
+	 *    25 ms before objects can be deleted; for RPCs this would result
+	 *    in hundreds or thousands of RPCs accumulating before RCU allows
+	 *    them to be deleted.
+	 * This approach has the disadvantage that RPCs within a bucket share
+	 * locks and thus may not be able to work concurrently, but there are
+	 * enough buckets in the table to make such collisions rare.
+	 *
+	 * See "Homa Locking Strategy" in homa_impl.h for more info about
+	 * locking.
 	 */
 	spinlock_t lock __context__(rpc_bucket_lock, 1, 1);
 
@@ -177,15 +194,14 @@ struct homa_sock {
 	/**
 	 * @lock: Must be held when modifying fields such as interests
 	 * and lists of RPCs. This lock is used in place of sk->sk_lock
 	 * because it's used differently (it's always used as a simple
-	 * spin lock).  See sync.txt for more on Homa's synchronization
-	 * strategy.
+	 * spin lock).  See "Homa Locking Strategy" in homa_impl.h
+	 * for more on Homa's synchronization strategy.
 	 */
 	spinlock_t lock ____cacheline_aligned_in_smp;
 
 	/**
 	 * @protect_count: counts the number of calls to homa_protect_rpcs
 	 * for which there have not yet been calls to homa_unprotect_rpcs.
-	 * See sync.txt for more info.
 	 */
 	atomic_t protect_count;
 
diff --git a/sync.txt b/sync.txt
deleted file mode 100644
index 1f063f3d..00000000
--- a/sync.txt
+++ /dev/null
@@ -1,73 +0,0 @@
-This file describes the synchronization strategy used for Homa.
-
-* In the Linux TCP/IP stack, the primary locking mechanism is a lock
-  per socket. However, per-socket locks aren't adequate for Homa, because
-  sockets are "larger" in Homa. In TCP, a socket corresponds to a single
-  connection between the source and destination; an application can have
-  hundreds or thousands of sockets open at once, so per-socket locks leave
-  lots of opportunities for concurrency. With Homa, a single socket can be
-  used for communicating with any number of peers, so there will typically
-  be no more than one socket per thread. As a result, a single Homa socket
-  must support many concurrent RPCs efficiently, and a per-socket lock would
-  create a bottleneck (Homa tried this approach initially).
-
-* Thus, the primary lock used in Homa is a per-RPC spinlock. This allows operations
-  on different RPCs to proceed concurrently. RPC locks are actually stored in
-  the hash table buckets used to look them up. This is important because it
-  makes looking up RPCs and locking them atomic.
Without this approach it - is possible that an RPC could get deleted after it was looked up but before - it was locked. - -* Certain operations are not permitted while holding spinlocks, such as memory - allocation and copying data to/from user space (spinlocks disable - interrupts, so the holder must not block). RPC locks are spinlocks, - and that results in awkward code in several places to move prohibited - operations outside the locked regions. In particular, there is extra - complexity to make sure that RPCs are not garbage-collected while these - operations are occurring without a lock. - -* There are several other locks in Homa besides RPC locks. When multiple - locks are held, they must always be acquired in a consistent order, in - order to prevent deadlock. Overall rules: - * RPC locks are "top level" - * Other locks may be acquired while holding an RPC lock - * It is not safe to wait on an RPC lock while holding any other locks. - * It is safe to acquire port_map.write_lock while holding a socket lock - * Other than these rules, all locks should be considered "leaf" locks: - don't accquire other locks while holding them. - -* Homa's approach means that socket shutdown and deletion can potentially - begin while operations are underway that hold RPC locks but not the socket - lock. For example, a new RPC creation might be underway when a socket - is shut down, which could attempt to add the new RPC after homa_sock_shutdown - thinks it has deleted all RPCs. Handling this requires careful checking - of hsk->shutdown. For example, during new RPC creation the socket lock - must be acquired to add the new RPC to those for the socket; after acquiring - the lock, it must check hsk->shutdown and abort the RPC creation if the - socket has been shutdown. - -* There are a few places where Homa needs to process RPCs on lists - associated with a socket, such as the timer. Such code must first lock - the socket (to synchronize access to the link pointers) then lock - individual RPCs on the list. However, this violates the rules for locking - order. It isn't safe to unlock the socket before locking the RPC, because - the RPC could be deleted and its memory recycled between the unlock of the - socket lock and the lock of the RPC; this could result in corruption. Homa - uses a few different ways to handle this situation: - * Use homa_protect_rpcs to prevent RPC reaping for a socket. RPCs can still - be deleted, but their memory won't go away until homa_unprotect_rpcs is - invoked. This allows the socket lock to be released before acquiring - the RPC lock; after acquiring the RPC lock check to see if it has been - deleted; if so, skip it. Note: the Linux RCU mechanism could have been - used to achieve the same effect, but it results in *very* long delays - before final reclamation (tens of ms), even without contention, which - means that a large number of dead RPCs could accumulate. - * Use spin_trylock_bh to acquire the RPC lock, while still holding the - socket lock. If this fails, then release the socket lock, then retry - both the socket lock and the RPC lock. - -* There are also a few places where Homa is doing something related to an - RPC (such as copying message data to user space) and needs the RPC to stay - around, but it isn't holding the RPC lock. In this situations, Homa sets - a bit in rpc->flags and homa_rpc_reap will not reap RPCs with any of these - flags set. 
\ No newline at end of file From c2259e01a2b7e25ee6fcb8c6be590966c6fb09e8 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 21 May 2025 21:28:47 -0700 Subject: [PATCH 328/625] Remove unnecessary _Static_assert in homa_pool.h --- homa_pool.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/homa_pool.h b/homa_pool.h index 237cad1d..708183e2 100644 --- a/homa_pool.h +++ b/homa_pool.h @@ -39,13 +39,6 @@ struct homa_bpage { u64 expiration; } ____cacheline_aligned_in_smp; -#ifndef __STRIP__ /* See strip.py */ -#ifndef CONFIG_LOCKDEP -_Static_assert(sizeof(struct homa_bpage) == L1_CACHE_BYTES, - "homa_bpage overflowed a cache line"); -#endif -#endif /* See strip.py */ - /** * struct homa_pool_core - Holds core-specific data for a homa_pool (a bpage * out of which that core is allocating small chunks). From 2eff9fa385dc9c8b08f1bac4863435f7eb96812e Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 22 May 2025 09:13:59 -0700 Subject: [PATCH 329/625] Eliminate use of _Static_assert Instead, use BUILD_BUG_ON statements in homa_load (can't use BUILD_BUG_ON in header files). --- homa.h | 19 ------------------- homa_plumbing.c | 39 +++++++++++++++++++++++++++++++++++++++ homa_utils.c | 7 +------ homa_wire.h | 25 +------------------------ 4 files changed, 41 insertions(+), 49 deletions(-) diff --git a/homa.h b/homa.h index 2863d2c3..c33d1163 100644 --- a/homa.h +++ b/homa.h @@ -74,13 +74,6 @@ struct homa_sendmsg_args { __u32 reserved; }; -#if !defined(__cplusplus) -_Static_assert(sizeof(struct homa_sendmsg_args) >= 24, - "homa_sendmsg_args shrunk"); -_Static_assert(sizeof(struct homa_sendmsg_args) <= 24, - "homa_sendmsg_args grew"); -#endif - /* Flag bits for homa_sendmsg_args.flags (see man page for documentation): */ #define HOMA_SENDMSG_PRIVATE 0x01 @@ -125,13 +118,6 @@ struct homa_recvmsg_args { __u32 bpage_offsets[HOMA_MAX_BPAGES]; }; -#if !defined(__cplusplus) -_Static_assert(sizeof(struct homa_recvmsg_args) >= 88, - "homa_recvmsg_args shrunk"); -_Static_assert(sizeof(struct homa_recvmsg_args) <= 88, - "homa_recvmsg_args grew"); -#endif - #ifndef __STRIP__ /* See strip.py */ /** * struct homa_abort_args - Structure that passes arguments and results @@ -153,11 +139,6 @@ struct homa_abort_args { /** @_pad2: Reserved. */ __u64 _pad2[2]; }; - -#if !defined(__cplusplus) -_Static_assert(sizeof(struct homa_abort_args) >= 32, "homa_abort_args shrunk"); -_Static_assert(sizeof(struct homa_abort_args) <= 32, "homa_abort_args grew"); -#endif #endif /* See strip.py */ /** define SO_HOMA_RCVBUF: setsockopt option for specifying buffer region. */ diff --git a/homa_plumbing.c b/homa_plumbing.c index a5551029..ee915ced 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -455,6 +455,45 @@ int __init homa_load(void) struct homa *homa = global_homa; int status; + /* Compile-time validations that no packet header is longer + * than HOMA_MAX_HEADER. 
+ */ + BUILD_BUG_ON(sizeof(struct homa_data_hdr) > HOMA_MAX_HEADER); +#ifndef __STRIP__ /* See strip.py */ + BUILD_BUG_ON(sizeof(struct homa_grant_hdr) > HOMA_MAX_HEADER); +#endif /* See strip.py */ + BUILD_BUG_ON(sizeof(struct homa_resend_hdr) > HOMA_MAX_HEADER); + BUILD_BUG_ON(sizeof(struct homa_rpc_unknown_hdr) > HOMA_MAX_HEADER); + BUILD_BUG_ON(sizeof(struct homa_busy_hdr) > HOMA_MAX_HEADER); +#ifndef __STRIP__ /* See strip.py */ + BUILD_BUG_ON(sizeof(struct homa_cutoffs_hdr) > HOMA_MAX_HEADER); +#endif /* See strip.py */ +#ifndef __UPSTREAM__ /* See strip.py */ + BUILD_BUG_ON(sizeof(struct homa_freeze_hdr) > HOMA_MAX_HEADER); +#endif /* See strip.py */ + BUILD_BUG_ON(sizeof(struct homa_need_ack_hdr) > HOMA_MAX_HEADER); + BUILD_BUG_ON(sizeof(struct homa_ack_hdr) > HOMA_MAX_HEADER); + + /* Extra constraints on data packets: + * - Ensure minimum header length so Homa doesn't have to worry about + * padding data packets. + * - Make sure data packet headers are a multiple of 4 bytes (needed + * for TCP/TSO compatibility). + */ + BUILD_BUG_ON(sizeof(struct homa_data_hdr) < HOMA_MIN_PKT_LENGTH); + BUILD_BUG_ON((sizeof(struct homa_data_hdr) - + sizeof(struct homa_seg_hdr)) & 0x3); + + /* Homa requires at least 8 priority levels. */ + BUILD_BUG_ON(HOMA_MAX_PRIORITIES < 8); + + /* Detect size changes in uAPI structs. */ + BUILD_BUG_ON(sizeof(struct homa_sendmsg_args) != 24); + BUILD_BUG_ON(sizeof(struct homa_recvmsg_args) != 88); +#ifndef __STRIP__ /* See strip.py */ + BUILD_BUG_ON(sizeof(struct homa_abort_args) != 32); +#endif /* See strip.py */ + pr_err("Homa module loading\n"); #ifndef __STRIP__ /* See strip.py */ pr_notice("Homa structure sizes: homa_data_hdr %lu, homa_seg_hdr %lu, ack %lu, peer %lu, ip_hdr %lu flowi %lu ipv6_hdr %lu, flowi6 %lu tcp_sock %lu homa_rpc %lu sk_buff %lu rcvmsg_control %lu union sockaddr_in_union %lu HOMA_MAX_BPAGES %u NR_CPUS %u nr_cpu_ids %u, MAX_NUMNODES %d\n", diff --git a/homa_utils.c b/homa_utils.c index 5c70a9fc..55db0ea5 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -28,12 +28,7 @@ int homa_init(struct homa *homa) { int err; -#ifndef __STRIP__ /* See strip.py */ - int i; - - _Static_assert(HOMA_MAX_PRIORITIES >= 8, - "Homa requires at least 8 priority levels"); -#endif /* See strip.py */ + IF_NO_STRIP(int i); memset(homa, 0, sizeof(*homa)); diff --git a/homa_wire.h b/homa_wire.h index 43e6d9c6..c15c8245 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -350,13 +350,6 @@ struct homa_data_hdr { /** @seg: First of possibly many segments. 
*/ struct homa_seg_hdr seg; } __packed; -_Static_assert(sizeof(struct homa_data_hdr) <= HOMA_MAX_HEADER, - "homa_data_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); -_Static_assert(sizeof(struct homa_data_hdr) >= HOMA_MIN_PKT_LENGTH, - "homa_data_hdr too small: Homa doesn't currently have code to pad data packets"); -_Static_assert(((sizeof(struct homa_data_hdr) - sizeof(struct homa_seg_hdr)) & - 0x3) == 0, - " homa_data_hdr length not a multiple of 4 bytes (required for TCP/TSO compatibility"); /** * homa_data_len() - Returns the total number of bytes in a DATA packet @@ -403,8 +396,6 @@ struct homa_grant_hdr { */ __u8 resend_all; } __packed; -_Static_assert(sizeof(struct homa_grant_hdr) <= HOMA_MAX_HEADER, - "homa_grant_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); #endif /* See strip.py */ /** @@ -444,8 +435,6 @@ struct homa_resend_hdr { __u8 priority; #endif /* See strip.py */ } __packed; -_Static_assert(sizeof(struct homa_resend_hdr) <= HOMA_MAX_HEADER, - "homa_resend_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** * struct homa_rpc_unknown_hdr - Wire format for RPC_UNKNOWN packets. @@ -460,8 +449,6 @@ struct homa_rpc_unknown_hdr { /** @common: Fields common to all packet types. */ struct homa_common_hdr common; } __packed; -_Static_assert(sizeof(struct homa_rpc_unknown_hdr) <= HOMA_MAX_HEADER, - "homa_rpc_unknown_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** * struct homa_busy_hdr - Wire format for BUSY packets. @@ -473,8 +460,6 @@ struct homa_busy_hdr { /** @common: Fields common to all packet types. */ struct homa_common_hdr common; } __packed; -_Static_assert(sizeof(struct homa_busy_hdr) <= HOMA_MAX_HEADER, - "homa_busy_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); #ifndef __STRIP__ /* See strip.py */ /** @@ -501,9 +486,7 @@ struct homa_cutoffs_hdr { */ __be16 cutoff_version; } __packed; -_Static_assert(sizeof(struct homa_cutoffs_hdr) <= HOMA_MAX_HEADER, - "homa_cutoffs_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); - #endif /* See strip.py */ +#endif /* See strip.py */ #ifndef __UPSTREAM__ /* See strip.py */ /** @@ -516,8 +499,6 @@ struct homa_freeze_hdr { /** @common: Fields common to all packet types. */ struct homa_common_hdr common; } __packed; -_Static_assert(sizeof(struct homa_freeze_hdr) <= HOMA_MAX_HEADER, - "homa_freeze_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); #endif /* See strip.py */ /** @@ -530,8 +511,6 @@ struct homa_need_ack_hdr { /** @common: Fields common to all packet types. */ struct homa_common_hdr common; } __packed; -_Static_assert(sizeof(struct homa_need_ack_hdr) <= HOMA_MAX_HEADER, - "homa_need_ack_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** * struct homa_ack_hdr - Wire format for ACK packets. @@ -551,8 +530,6 @@ struct homa_ack_hdr { /** @acks: Info about RPCs that are no longer active. */ struct homa_ack acks[HOMA_MAX_ACKS_PER_PKT]; } __packed; -_Static_assert(sizeof(struct homa_ack_hdr) <= HOMA_MAX_HEADER, - "homa_ack_hdr too large for HOMA_MAX_HEADER; must adjust HOMA_MAX_HEADER"); /** * homa_local_id(): given an RPC identifier from an input packet (which From 0299e612278756762db86d23119ae4a6a51dbe44 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 22 May 2025 10:37:26 -0700 Subject: [PATCH 330/625] Invoke dst->check in homa_get_dst Must do this in addition to checking dst->obsolete. Also added new metric: peer_dst_refreshes. 
--- homa_metrics.c | 2 ++ homa_metrics.h | 6 ++++++ homa_peer.c | 1 + homa_peer.h | 3 ++- test/mock.c | 13 ++++++++++++- test/mock.h | 3 +++ test/unit_homa_peer.c | 31 +++++++++++++++++++++++++++++-- 7 files changed, 55 insertions(+), 4 deletions(-) diff --git a/homa_metrics.c b/homa_metrics.c index 2c6f0bd6..00732843 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -267,6 +267,8 @@ char *homa_metrics_print(void) m->peer_kmalloc_errors); M("peer_route_errors %15llu Routing failures creating peer table entries\n", m->peer_route_errors); + M("peer_dst_refreshes %15llu Obsolete dsts had to be regenerated\n", + m->peer_dst_refreshes); M("control_xmit_errors %15llu Errors sending control packets\n", m->control_xmit_errors); M("data_xmit_errors %15llu Errors sending data packets\n", diff --git a/homa_metrics.h b/homa_metrics.h index 678f0c1e..02626b62 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -324,6 +324,12 @@ struct homa_metrics { */ u64 peer_route_errors; + /** + * @peer_dst_refresh: total number of times that homa_dst_refresh + * was called to update an obsolete dst for a peer. + */ + u64 peer_dst_refreshes; + /** * @control_xmit_errors: total number of times ip_queue_xmit * failed when transmitting a control packet. diff --git a/homa_peer.c b/homa_peer.c index 7c660c56..493940e8 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -564,6 +564,7 @@ void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, { struct dst_entry *dst; + INC_METRIC(peer_dst_refreshes, 1); dst = homa_peer_get_dst(peer, hsk); if (IS_ERR(dst)) { #ifndef __STRIP__ /* See strip.py */ diff --git a/homa_peer.h b/homa_peer.h index f153822b..29d49e9d 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -359,7 +359,8 @@ static inline void homa_peer_unlock(struct homa_peer *peer) static inline struct dst_entry *homa_get_dst(struct homa_peer *peer, struct homa_sock *hsk) { - if (unlikely(peer->dst->obsolete > 0)) + if (unlikely(peer->dst->obsolete && + !peer->dst->ops->check(peer->dst, 0))) homa_dst_refresh(hsk->homa->peertab, peer, hsk); dst_hold(peer->dst); return peer->dst; diff --git a/test/mock.c b/test/mock.c index 10f1c953..6801b269 100644 --- a/test/mock.c +++ b/test/mock.c @@ -38,6 +38,7 @@ int mock_copy_data_errors; int mock_copy_to_iter_errors; int mock_copy_to_user_errors; int mock_cpu_idle; +int mock_dst_check_errors; int mock_import_ubuf_errors; int mock_import_iovec_errors; int mock_ip6_xmit_errors; @@ -239,7 +240,9 @@ static int mock_num_hnets; */ int mock_peer_free_no_fail; -struct dst_ops mock_dst_ops = {.mtu = mock_get_mtu}; +struct dst_ops mock_dst_ops = { + .mtu = mock_get_mtu, + .check = mock_dst_check}; struct netdev_queue mock_net_queue = {.state = 0}; struct net_device mock_net_device = { .gso_max_segs = 1000, @@ -1639,6 +1642,13 @@ void mock_data_ready(struct sock *sk) unit_log_printf("; ", "sk->sk_data_ready invoked"); } +struct dst_entry *mock_dst_check(struct dst_entry *dst, __u32 cookie) +{ + if (mock_check_error(&mock_dst_check_errors)) + return NULL; + return dst; +} + /** * mock_get_clock() - Replacement for homa_clock; allows time to be * controlled by unit tests. 
@@ -2104,6 +2114,7 @@ void mock_teardown(void) mock_num_clock_vals = 0; mock_tt_cycles = 0; mock_ipv6 = mock_ipv6_default; + mock_dst_check_errors = 0; mock_import_ubuf_errors = 0; mock_import_iovec_errors = 0; mock_ip6_xmit_errors = 0; diff --git a/test/mock.h b/test/mock.h index 5abf8568..9f508b33 100644 --- a/test/mock.h +++ b/test/mock.h @@ -120,6 +120,7 @@ extern int mock_copy_data_errors; extern int mock_copy_to_user_dont_copy; extern int mock_copy_to_user_errors; extern int mock_cpu_idle; +extern int mock_dst_check_errors; extern int mock_import_iovec_errors; extern int mock_import_ubuf_errors; extern int mock_ip6_xmit_errors; @@ -174,6 +175,8 @@ void mock_clear_xmit_prios(void); unsigned int mock_compound_order(struct page *page); int mock_cpu_to_node(int core); void mock_data_ready(struct sock *sk); +struct dst_entry + *mock_dst_check(struct dst_entry *, __u32 cookie); cycles_t mock_get_cycles(void); unsigned int mock_get_mtu(const struct dst_entry *dst); diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index fe9bd3e1..dc21e8a5 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -787,7 +787,7 @@ TEST_F(homa_peer, homa_unsched_priority) } #endif /* See strip.py */ -TEST_F(homa_peer, homa_peer_get_dst_ipv4) +TEST_F(homa_peer, homa_peer_get_dst__ipv4) { struct dst_entry *dst; @@ -807,7 +807,7 @@ TEST_F(homa_peer, homa_peer_get_dst_ipv4) homa_print_ipv4_addr(peer->flow.u.ip4.daddr)); homa_peer_release(peer); } -TEST_F(homa_peer, homa_peer_get_dst_ipv6) +TEST_F(homa_peer, homa_peer_get_dst__ipv6) { struct dst_entry *dst; char buffer[30]; @@ -948,3 +948,30 @@ TEST_F(homa_peer, homa_peer_update_sysctl_deps) EXPECT_EQ(10*HZ, peertab->idle_jiffies_min); EXPECT_EQ(100*HZ, peertab->idle_jiffies_max); } + +/* Functions in homa_peer.h: */ + +TEST_F(homa_peer, homa_get_dst__normal) +{ + struct homa_peer *peer = homa_peer_get(&self->hsk, &ip1111[0]); + struct dst_entry *dst; + + dst = homa_get_dst(peer, &self->hsk); + EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); + EXPECT_EQ(0, homa_metrics_per_cpu()->peer_dst_refreshes); + dst_release(dst); + homa_peer_release(peer); +} +TEST_F(homa_peer, homa_get_dst__must_refresh) +{ + struct homa_peer *peer = homa_peer_get(&self->hsk, &ip1111[0]); + struct dst_entry *dst; + + peer->dst->obsolete = 1; + mock_dst_check_errors = 1; + dst = homa_get_dst(peer, &self->hsk); + EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); + EXPECT_EQ(1, homa_metrics_per_cpu()->peer_dst_refreshes); + dst_release(dst); + homa_peer_release(peer); +} From b7b6a30192d37478d1a079b15613e882319b9a68 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 23 May 2025 09:12:13 -0700 Subject: [PATCH 331/625] Fix issues with __STRIP__ caused by recent commits --- homa_impl.h | 4 +--- homa_interest.c | 2 +- homa_peer.c | 4 +++- homa_peer.h | 8 ++++---- homa_plumbing.c | 4 +++- homa_stub.h | 2 +- test/mock.c | 9 +++++++++ test/unit_homa_peer.c | 10 ++++++++-- test/unit_homa_pool.c | 4 ++++ test/unit_homa_rpc.c | 2 +- test/unit_homa_utils.c | 12 ++++++++++++ 11 files changed, 47 insertions(+), 14 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 4df2de59..297fc69c 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -639,7 +639,7 @@ static inline bool is_homa_pkt(struct sk_buff *skb) ((iph->protocol == IPPROTO_TCP) && (tcp_hdr(skb)->urg_ptr == htons(HOMA_TCP_URGENT)))); #else /* See strip.py */ - return ((iph->protocol == IPPROTO_HOMA); + return iph->protocol == IPPROTO_HOMA; #endif /* See strip.py */ } @@ -660,7 +660,6 @@ static inline bool 
homa_make_header_avl(struct sk_buff *skb) return pskb_may_pull(skb, pull_length); } -#ifndef __STRIP__ /* See strip.py */ #ifdef __UNIT_TEST__ void unit_log_printf(const char *separator, const char *format, ...) __printf(2, 3); @@ -671,7 +670,6 @@ void unit_hook(char *id); #define UNIT_LOG(...) #define UNIT_HOOK(...) #endif /* __UNIT_TEST__ */ -#endif /* See strip.py */ extern unsigned int homa_net_id; extern struct homa_shared *homa_shared; diff --git a/homa_interest.c b/homa_interest.c index caf2a3a1..69fb00b8 100644 --- a/homa_interest.c +++ b/homa_interest.c @@ -48,7 +48,7 @@ int homa_interest_init_private(struct homa_interest *interest, interest->rpc = rpc; atomic_set(&interest->ready, 0); - interest->core = raw_smp_processor_id(); + IF_NO_STRIP(interest->core = raw_smp_processor_id()); interest->blocked = 0; init_waitqueue_head(&interest->wait_queue); interest->hsk = rpc->hsk; diff --git a/homa_peer.c b/homa_peer.c index 493940e8..7c55572d 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -108,9 +108,11 @@ struct homa_peertab *homa_peer_alloc_peertab(void) homa_peer_update_sysctl_deps(peertab); return peertab; +#ifndef __STRIP__ /* See strip.py */ error: homa_peer_free_peertab(peertab); return ERR_PTR(err); +#endif /* See strip.py */ } /** @@ -758,7 +760,6 @@ int homa_peer_get_acks(struct homa_peer *peer, int count, struct homa_ack *dst) return count; } -#ifndef __STRIP__ /* See strip.py */ /** * homa_peer_update_sysctl_deps() - Update any peertab fields that depend * on values set by sysctl. This function is invoked anytime a peer sysctl @@ -771,6 +772,7 @@ void homa_peer_update_sysctl_deps(struct homa_peertab *peertab) peertab->idle_jiffies_max = peertab->idle_secs_max * HZ; } +#ifndef __STRIP__ /* See strip.py */ /** * homa_peer_dointvec() - This function is a wrapper around proc_dointvec. It * is invoked to read and write peer-related sysctl values. diff --git a/homa_peer.h b/homa_peer.h index 29d49e9d..31f3a8a2 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -300,18 +300,18 @@ int homa_peer_get_acks(struct homa_peer *peer, int count, struct homa_ack *dst); struct dst_entry *homa_peer_get_dst(struct homa_peer *peer, struct homa_sock *hsk); -#ifndef __STRIP__ /* See strip.py */ -void homa_peer_lock_slow(struct homa_peer *peer); int homa_peer_pick_victims(struct homa_peertab *peertab, struct homa_peer *victims[], int max_victims); int homa_peer_prefer_evict(struct homa_peertab *peertab, struct homa_peer *peer1, struct homa_peer *peer2); void homa_peer_rcu_callback(struct rcu_head *head); +void homa_peer_wait_dead(struct homa_peertab *peertab); +void homa_peer_update_sysctl_deps(struct homa_peertab *peertab); +#ifndef __STRIP__ /* See strip.py */ +void homa_peer_lock_slow(struct homa_peer *peer); void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7); -void homa_peer_update_sysctl_deps(struct homa_peertab *peertab); -void homa_peer_wait_dead(struct homa_peertab *peertab); #endif /* See strip.py */ #ifndef __STRIP__ /* See strip.py */ diff --git a/homa_plumbing.c b/homa_plumbing.c index ee915ced..8f158381 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -484,8 +484,10 @@ int __init homa_load(void) BUILD_BUG_ON((sizeof(struct homa_data_hdr) - sizeof(struct homa_seg_hdr)) & 0x3); +#ifndef __STRIP__ /* See strip.py */ /* Homa requires at least 8 priority levels. */ BUILD_BUG_ON(HOMA_MAX_PRIORITIES < 8); +#endif /* See strip.py */ /* Detect size changes in uAPI structs. 
*/ BUILD_BUG_ON(sizeof(struct homa_sendmsg_args) != 24); @@ -636,12 +638,12 @@ void __exit homa_unload(void) wake_up_process(timer_kthread); wait_for_completion(&timer_thread_done); } - unregister_pernet_subsys(&homa_net_ops); if (homa_offload_end() != 0) pr_err("Homa couldn't stop offloads\n"); unregister_net_sysctl_table(homa_ctl_header); homa_metrics_end(); #endif /* See strip.py */ + unregister_pernet_subsys(&homa_net_ops); homa_destroy(homa); inet_del_protocol(&homa_protocol, IPPROTO_HOMA); inet_unregister_protosw(&homa_protosw); diff --git a/homa_stub.h b/homa_stub.h index 8dee617c..aefe816d 100644 --- a/homa_stub.h +++ b/homa_stub.h @@ -79,7 +79,7 @@ static inline struct sk_buff *homa_skb_alloc_tx(int length) skb = alloc_skb(HOMA_SKB_EXTRA + sizeof(struct homa_skb_info) + length, GFP_ATOMIC); if (likely(skb)) { - skb_reserve(skb, HOMA_SKB_EXTRA + HOMA_IPV6_HEADER_LENGTH); + skb_reserve(skb, HOMA_SKB_EXTRA); skb_reset_transport_header(skb); } return skb; diff --git a/test/mock.c b/test/mock.c index 6801b269..5e1af8c3 100644 --- a/test/mock.c +++ b/test/mock.c @@ -496,6 +496,15 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, clock_base.get_time = &hrtimer_get_time; } +void hrtimer_setup(struct hrtimer *timer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t clock_id, enum hrtimer_mode mode) +{ + timer->base = &clock_base; + clock_base.get_time = &hrtimer_get_time; + timer->function = function; +} + void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 range_ns, const enum hrtimer_mode mode) {} diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index dc21e8a5..97fef956 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -119,6 +119,7 @@ TEST_F(homa_peer, homa_peer_alloc_peertab__rhashtable_init_fails) EXPECT_TRUE(IS_ERR(peertab)); EXPECT_EQ(EINVAL, -PTR_ERR(peertab)); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_peer, homa_peer_alloc_peertab__cant_register_sysctl) { struct homa_peertab *peertab; @@ -130,6 +131,7 @@ TEST_F(homa_peer, homa_peer_alloc_peertab__cant_register_sysctl) EXPECT_SUBSTR("couldn't register sysctl parameters for Homa peertab", mock_printk_output); } +#endif /* See strip.py */ TEST_F(homa_peer, homa_peer_free_net__basics) { @@ -200,8 +202,12 @@ TEST_F(homa_peer, homa_peer_free_peertab__basics) { unit_log_clear(); homa_peer_free_peertab(self->homa.peertab); +#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("peer [2::2:2:2] has reference count 1; " "unregister_net_sysctl_table", unit_log_get()); +#else /* See strip.py */ + EXPECT_STREQ("peer [2::2:2:2] has reference count 1", unit_log_get()); +#endif /* See strip.py */ kfree(peer); self->homa.peertab = homa_peer_alloc_peertab(); @@ -958,7 +964,7 @@ TEST_F(homa_peer, homa_get_dst__normal) dst = homa_get_dst(peer, &self->hsk); EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); - EXPECT_EQ(0, homa_metrics_per_cpu()->peer_dst_refreshes); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->peer_dst_refreshes)); dst_release(dst); homa_peer_release(peer); } @@ -971,7 +977,7 @@ TEST_F(homa_peer, homa_get_dst__must_refresh) mock_dst_check_errors = 1; dst = homa_get_dst(peer, &self->hsk); EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); - EXPECT_EQ(1, homa_metrics_per_cpu()->peer_dst_refreshes); + IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->peer_dst_refreshes)); dst_release(dst); homa_peer_release(peer); } diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index 67e93b20..045057a2 100644 --- a/test/unit_homa_pool.c +++ 
b/test/unit_homa_pool.c @@ -60,6 +60,7 @@ static void steal_bpages_hook(char *id) atomic_set(&cur_pool->descriptors[3].refs, 1); } } +#ifndef __STRIP__ /* See strip.py */ static void change_owner_hook(char *id) { if (strcmp(id, "spin_lock") != 0) @@ -69,6 +70,7 @@ static void change_owner_hook(char *id) cur_pool->descriptors[cur_pool->cores[smp_processor_id()] .page_hint].owner = -1; } +#endif /* See strip.py */ TEST_F(homa_pool, set_bpages_needed) { @@ -339,6 +341,7 @@ TEST_F(homa_pool, homa_pool_alloc_msg__no_partial_page) EXPECT_EQ(HOMA_BPAGE_SIZE, crpc->msgin.bpage_offsets[1]); EXPECT_EQ(0, atomic_read(&pool->free_bpages)); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_pool, homa_pool_alloc_msg__owned_page_locked_and_page_stolen) { struct homa_pool *pool = self->hsk.buffer_pool; @@ -367,6 +370,7 @@ TEST_F(homa_pool, homa_pool_alloc_msg__owned_page_locked_and_page_stolen) EXPECT_EQ(1, pool->descriptors[3].owner); EXPECT_EQ(38, atomic_read(&pool->free_bpages)); } +#endif /* See strip.py */ TEST_F(homa_pool, homa_pool_alloc_msg__page_wrap_around) { struct homa_pool *pool = self->hsk.buffer_pool; diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index a421168d..9b90203c 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -109,7 +109,7 @@ static const char *dead_rpcs(struct homa_sock *hsk) struct homa_rpc *rpc; list_for_each_entry_rcu(rpc, &hsk->dead_rpcs, dead_links) - UNIT_LOG(" ", "%llu", rpc->id); + unit_log_printf(" ", "%llu", rpc->id); return unit_log_get(); } diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index 0c7968b7..ebd42808 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -76,7 +76,11 @@ TEST_F(homa_utils, homa_init__pacer_alloc_failure) { struct homa homa2; +#ifndef __STRIP__ /* See strip.py */ mock_kmalloc_errors = 2; +#else /* See strip.py */ + mock_kmalloc_errors = 1; +#endif/* See strip.py */ unit_log_clear(); EXPECT_EQ(ENOMEM, -homa_init(&homa2)); EXPECT_SUBSTR("homa_pacer_alloc couldn't allocate homa_pacer struct", @@ -88,7 +92,11 @@ TEST_F(homa_utils, homa_init__peertab_alloc_failure) { struct homa homa2; +#ifndef __STRIP__ /* See strip.py */ mock_kmalloc_errors = 4; +#else /* See strip.py */ + mock_kmalloc_errors = 2; +#endif/* See strip.py */ unit_log_clear(); EXPECT_EQ(ENOMEM, -homa_init(&homa2)); EXPECT_SUBSTR("homa_peer_alloc_peertab couldn't create peertab: kmalloc failure", @@ -100,7 +108,11 @@ TEST_F(homa_utils, homa_init__cant_allocate_port_map) { struct homa homa2; +#ifndef __STRIP__ /* See strip.py */ mock_kmalloc_errors = 0x10; +#else /* See strip.py */ + mock_kmalloc_errors = 8; +#endif/* See strip.py */ unit_log_clear(); EXPECT_EQ(ENOMEM, -homa_init(&homa2)); EXPECT_SUBSTR("homa_init couldn't create socktab: kmalloc failure", From 82c89b32837b842e87e76fb4427b29011b2a6a02 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 23 May 2025 09:49:17 -0700 Subject: [PATCH 332/625] Refactor hom_skb so that BIT can be used for HOMA_SOCKTAB_BUCKETS --- homa_sock.c | 5 +++-- homa_sock.h | 7 +++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/homa_sock.c b/homa_sock.c index b6ebf5dc..cd0beb7b 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -96,8 +96,9 @@ struct homa_sock *homa_socktab_next(struct homa_socktab_scan *scan) if (next) goto success; } - while (scan->current_bucket < HOMA_SOCKTAB_BUCKETS - 1) { - scan->current_bucket++; + for (scan->current_bucket++; + scan->current_bucket < HOMA_SOCKTAB_BUCKETS; + scan->current_bucket++) { bucket = &scan->socktab->buckets[scan->current_bucket]; 
next = rcu_dereference(hlist_first_rcu(bucket)); if (next) diff --git a/homa_sock.h b/homa_sock.h index 073d0a9b..671e22ee 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -15,11 +15,10 @@ void homa_sock_lock_slow(struct homa_sock *hsk); /** * define HOMA_SOCKTAB_BUCKETS - Number of hash buckets in a homa_socktab. - * Must be a power of 2. Note: can't use BIT here because the result needs - * to be signed. + * Must be a power of 2. */ #define HOMA_SOCKTAB_BUCKET_BITS 10 -#define HOMA_SOCKTAB_BUCKETS (1 << HOMA_SOCKTAB_BUCKET_BITS) +#define HOMA_SOCKTAB_BUCKETS BIT(HOMA_SOCKTAB_BUCKET_BITS) /** * struct homa_socktab - A hash table that maps from port numbers (either @@ -61,7 +60,7 @@ struct homa_socktab_scan { /** * @current_bucket: The index of the bucket in socktab->buckets - * currently being scanned. + * currently being scanned (-1 if @hsk == NULL). */ int current_bucket; }; From c384a4ea7e94258b1c295c403ecb2d83d959ddfb Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 23 May 2025 14:33:44 -0700 Subject: [PATCH 333/625] Cleanup issues from checkpatch.pl, kernel-doc, and sparse --- Makefile | 38 ++++++++++++++++++++++---------------- homa_devel.c | 4 ++-- homa_impl.h | 11 ++++++----- homa_incoming.c | 5 +++-- homa_metrics.h | 2 +- homa_outgoing.c | 6 +++--- homa_peer.c | 2 +- homa_peer.h | 2 +- homa_plumbing.c | 4 ++-- homa_rpc.c | 18 +++++++++--------- homa_rpc.h | 2 +- homa_sock.c | 4 ++-- homa_sock.h | 7 ++----- util/strip.py | 2 ++ 14 files changed, 57 insertions(+), 50 deletions(-) diff --git a/Makefile b/Makefile index 05075d31..795b6cfa 100644 --- a/Makefile +++ b/Makefile @@ -14,11 +14,6 @@ HOMA_OBJS := homa_devel.o \ homa_utils.o \ timetrace.o -ifneq ($(KERNELRELEASE),) - -obj-m += homa.o -homa-y = $(HOMA_OBJS) - ifneq ($(__STRIP__),) MY_CFLAGS += -D__STRIP__ else @@ -28,10 +23,8 @@ HOMA_OBJS += homa_grant.o \ homa_skb.o endif -MY_CFLAGS += -g -ccflags-y += $(MY_CFLAGS) - -else +CHECK_SRCS := $(patsubst %.o,%.c,$(filter-out homa_devel.o timetrace.o, $(HOMA_OBJS))) +CHECK_SRCS += $(filter-out homa_receiver.h homa_devel.h, $(wildcard *.h)) ifneq ($(KERNEL_SRC),) # alternatively to variable KDIR accept variable KERNEL_SRC as used in @@ -42,17 +35,31 @@ endif LINUX_VERSION ?= $(shell uname -r) KDIR ?= /lib/modules/$(LINUX_VERSION)/build +LINUX_SRC_DIR ?= ../net-next + +ifneq ($(KERNELRELEASE),) + +obj-m += homa.o +homa-y = $(HOMA_OBJS) + +MY_CFLAGS += -g +ccflags-y += $(MY_CFLAGS) + +else + all: $(MAKE) -C $(KDIR) M=$(shell pwd) modules install: $(MAKE) -C $(KDIR) M=$(shell pwd) modules_install -check: - ../homaLinux/scripts/kernel-doc -none *.c +kdoc: + $(LINUX_SRC_DIR)/scripts/kernel-doc -none $(CHECK_SRCS) + +checkpatch: + $(LINUX_SRC_DIR)/scripts/checkpatch.pl --file --strict $(CHECK_SRCS) # Copy stripped source files to a Linux source tree -LINUX_SRC_DIR ?= ../net-next HOMA_TARGET ?= $(LINUX_SRC_DIR)/net/homa CP_HDRS := homa_impl.h \ homa_interest.h \ @@ -63,10 +70,9 @@ CP_HDRS := homa_impl.h \ homa_sock.h \ homa_stub.h \ homa_wire.h -CP_SRCS := $(patsubst %.o,%.c,$(filter-out homa_devel.o timetrace.o, $(HOMA_OBJS))) -CP_EXTRAS := reap.txt \ - sync.txt \ - Kconfig \ +CP_SRCS := $(patsubst %.o,%.c,$(filter-out homa_devel.o homa_grant.o \ + homa_metrics.o homa_offload.o homa_skb.o timetrace.o, $(HOMA_OBJS))) +CP_EXTRAS := Kconfig \ Makefile \ strip_decl.py CP_TARGETS := $(patsubst %,$(HOMA_TARGET)/%,$(CP_HDRS) $(CP_SRCS) $(CP_EXTRAS)) diff --git a/homa_devel.c b/homa_devel.c index 75132653..e7527d50 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -373,7 +373,7 @@ char 
*homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len) * homa_freeze_peers() - Send FREEZE packets to all known peers in the * root network namespace. */ -void homa_freeze_peers() +void homa_freeze_peers(void) { struct homa_socktab_scan scan; struct homa_freeze_hdr freeze; @@ -420,7 +420,7 @@ void homa_freeze_peers() err = __homa_xmit_control(&freeze, sizeof(freeze), peer, hsk); if (err != 0) tt_record2("homa_freeze_peers got error %d in xmit to 0x%x\n", - err, tt_addr(peer->addr)); + err, tt_addr(peer->addr)); } rhashtable_walk_stop(&iter); rhashtable_walk_exit(&iter); diff --git a/homa_impl.h b/homa_impl.h index 297fc69c..47e09348 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -129,7 +129,7 @@ struct homa { struct homa_pacer *pacer; /** - * @peers: Info about all the other hosts we have communicated with; + * @peertab: Info about all the other hosts we have communicated with; * includes peers from all network namespaces. */ struct homa_peertab *peertab; @@ -449,7 +449,7 @@ struct homa { /** * @bpage_lease_cycles: same as bpage_lease_usecs except in * homa_clock() units. - * */ + */ int bpage_lease_cycles; /** @@ -498,7 +498,8 @@ struct homa_net { */ __u16 prev_default_port; - /* @num_peers: The total number of struct homa_peers that exist + /** + * @num_peers: The total number of struct homa_peers that exist * for this namespace. Managed by homa_peer.c under the peertab lock. */ int num_peers; @@ -876,7 +877,7 @@ static inline u64 homa_ns_to_cycles(u64 ns) } /** - * homa_usec_to_cycles() - Convert from units of microseconds to units of + * homa_usecs_to_cycles() - Convert from units of microseconds to units of * homa_clock(). * @usecs: A time measurement in microseconds * Return: The time in homa_clock() units corresponding to @usecs. @@ -926,7 +927,7 @@ static inline u64 homa_usecs_to_cycles(u64 usecs) * are spinlocks. When multiple locks are held, they must be acquired in a * consistent order in order to prevent deadlock. Here are the rules for Homa: * 1. Except for RPC and socket locks, all locks should be considered - * "leaf" locks: don't accquire other locks while holding them. + * "leaf" locks: don't acquire other locks while holding them. * 2. The lock order is: * * RPC lock * * Socket lock diff --git a/homa_incoming.c b/homa_incoming.c index 567c6bed..9e56f0cd 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -146,7 +146,7 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) if (start > rpc->msgin.recv_end) { /* Packet creates a new gap. */ if (!homa_gap_alloc(&rpc->msgin.gaps, - rpc->msgin.recv_end, start)) { + rpc->msgin.recv_end, start)) { pr_err("Homa couldn't allocate gap: insufficient memory\n"); tt_record2("Couldn't allocate gap for id %d (start %d): no memory", rpc->id, start); @@ -476,7 +476,8 @@ void homa_dispatch_pkts(struct sk_buff *skb) * already exist. */ rpc = homa_rpc_alloc_server(hsk, &saddr, - h, &created); + h, + &created); if (IS_ERR(rpc)) { pr_warn("homa_pkt_dispatch couldn't create server rpc: error %lu", -PTR_ERR(rpc)); diff --git a/homa_metrics.h b/homa_metrics.h index 02626b62..b1416d20 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -325,7 +325,7 @@ struct homa_metrics { u64 peer_route_errors; /** - * @peer_dst_refresh: total number of times that homa_dst_refresh + * @peer_dst_refreshes: total number of times that homa_dst_refresh * was called to update an obsolete dst for a peer. 
*/ u64 peer_dst_refreshes; diff --git a/homa_outgoing.c b/homa_outgoing.c index 752633d4..ebdfd6aa 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -125,8 +125,8 @@ int homa_fill_data_interleaved(struct homa_rpc *rpc, struct sk_buff *skb, * Return: A pointer to the new packet, or a negative errno. */ struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc, - struct iov_iter *iter, int offset, - int length, int max_seg_data) + struct iov_iter *iter, int offset, + int length, int max_seg_data) __must_hold(rpc_bucket_lock) { struct homa_skb_info *homa_info; @@ -356,7 +356,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) if (skb_data_bytes > bytes_left) skb_data_bytes = bytes_left; skb = homa_tx_data_pkt_alloc(rpc, iter, offset, skb_data_bytes, - max_seg_data); + max_seg_data); if (IS_ERR(skb)) { err = PTR_ERR(skb); homa_rpc_lock(rpc); diff --git a/homa_peer.c b/homa_peer.c index 7c55572d..43e2bb60 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -505,7 +505,7 @@ void homa_peer_free(struct homa_peer *peer) * eventually call homa_peer_release to release the reference. */ struct homa_peer *homa_peer_get(struct homa_sock *hsk, - const struct in6_addr *addr) + const struct in6_addr *addr) { struct homa_peertab *peertab = hsk->homa->peertab; struct homa_peer *peer, *other; diff --git a/homa_peer.h b/homa_peer.h index 31f3a8a2..5ac76899 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -442,7 +442,7 @@ static inline int homa_peer_compare(struct rhashtable_compare_arg *arg, const struct homa_peer_key *key = arg->key; return !(ipv6_addr_equal(&key->addr, &peer->ht_key.addr) && - peer->ht_key.hnet == key->hnet); + peer->ht_key.hnet == key->hnet); } #endif /* _HOMA_PEER_H */ diff --git a/homa_plumbing.c b/homa_plumbing.c index 8f158381..14c0282d 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1312,7 +1312,7 @@ int homa_softirq(struct sk_buff *skb) { struct sk_buff *packets, *other_pkts, *next; struct sk_buff **prev_link, **other_link; - struct homa *homa = homa_from_skb(skb); + IF_NO_STRIP(struct homa *homa = homa_from_skb(skb)); struct homa_common_hdr *h; int header_offset; #ifndef __STRIP__ /* See strip.py */ @@ -1378,7 +1378,7 @@ int homa_softirq(struct sk_buff *skb) */ if (unlikely(h->type == FREEZE)) { if (!atomic_read(&tt_frozen)) { - homa_rpc_log_active_tt(homa, 0); + homa_rpc_log_active_tt(homa_from_skb(skb), 0); tt_record4("Freezing because of request on port %d from 0x%x:%d, id %d", ntohs(h->dport), tt_addr(skb_canonical_ipv6_saddr(skb)), diff --git a/homa_rpc.c b/homa_rpc.c index 5a81d01b..df2c9d9e 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -28,7 +28,7 @@ * caller must eventually unlock it. */ struct homa_rpc *homa_rpc_alloc_client(struct homa_sock *hsk, - const union sockaddr_in_union *dest) + const union sockaddr_in_union *dest) __acquires(rpc_bucket_lock) { struct in6_addr dest_addr_as_ipv6 = canonical_ipv6_addr(dest); @@ -112,8 +112,8 @@ struct homa_rpc *homa_rpc_alloc_client(struct homa_sock *hsk, * to h, then it is returned instead of creating a new RPC. */ struct homa_rpc *homa_rpc_alloc_server(struct homa_sock *hsk, - const struct in6_addr *source, - struct homa_data_hdr *h, int *created) + const struct in6_addr *source, + struct homa_data_hdr *h, int *created) __acquires(rpc_bucket_lock) { u64 id = homa_local_id(h->common.sender_id); @@ -438,7 +438,7 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) * * Homa now reaps in two other places, if reaping while waiting for * messages isn't adequate: - * 1. 
If too may dead skbs accumulate, then homa_timer will call + * 1. If too may dead skbs accumulate, then homa_timer will call * homa_rpc_reap. * 2. If this timer thread cannot keep up with all the reaping to be * done then as a last resort homa_dispatch_pkts will reap in small @@ -447,14 +447,14 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) * performance. * * During the introduction of homa_pools for managing input - * buffers, freeing of packets for incoming messages was moved to - * homa_copy_to_user under the assumption that this code wouldn't be + * buffers, freeing of packets for incoming messages was moved to + * homa_copy_to_user under the assumption that this code wouldn't be * on the critical path. However, there is evidence that with * fast networks (e.g. 100 Gbps) copying to user space is the * bottleneck for incoming messages, and packet freeing takes about - * 20-25% of the total time in homa_copy_to_user. So, it may eventually - * be desirable to remove packet freeing out of homa_copy_to_user. - */ + * 20-25% of the total time in homa_copy_to_user. So, it may eventually + * be desirable to remove packet freeing out of homa_copy_to_user. + */ #ifdef __UNIT_TEST__ #define BATCH_MAX 3 #else /* __UNIT_TEST__ */ diff --git a/homa_rpc.h b/homa_rpc.h index fb7369d3..92b54705 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -414,7 +414,7 @@ struct homa_rpc { }; void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, - int port, int error); + int port, int error); void homa_abort_sock_rpcs(struct homa_sock *hsk, int error); void homa_rpc_abort(struct homa_rpc *crpc, int error); struct homa_rpc diff --git a/homa_sock.c b/homa_sock.c index cd0beb7b..7a029e47 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -225,7 +225,7 @@ int homa_sock_init(struct homa_sock *hsk) } hlist_add_head_rcu(&hsk->socktab_links, &socktab->buckets[homa_socktab_bucket(hnet, - hsk->port)]); + hsk->port)]); spin_unlock_bh(&socktab->write_lock); return result; @@ -463,7 +463,7 @@ void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, u64 id) id, bucket->id); spin_lock_bh(&bucket->lock); tt_record2("ending wait for bucket lock, id %d, (bucket %d)", - id, bucket->id); + id, bucket->id); if (homa_is_client(id)) { INC_METRIC(client_lock_misses, 1); INC_METRIC(client_lock_miss_cycles, homa_clock() - start); diff --git a/homa_sock.h b/homa_sock.h index 671e22ee..60011add 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -13,10 +13,7 @@ struct homa_pool; void homa_sock_lock_slow(struct homa_sock *hsk); #endif /* See strip.py */ -/** - * define HOMA_SOCKTAB_BUCKETS - Number of hash buckets in a homa_socktab. - * Must be a power of 2. - */ +/* Number of hash buckets in a homa_socktab. Must be a power of 2. */ #define HOMA_SOCKTAB_BUCKET_BITS 10 #define HOMA_SOCKTAB_BUCKETS BIT(HOMA_SOCKTAB_BUCKET_BITS) @@ -146,7 +143,7 @@ struct homa_sock { struct homa *homa; /** - * @homa_net: Overall state specific to the network namespace for + * @hnet: Overall state specific to the network namespace for * this socket. 
*/ struct homa_net *hnet; diff --git a/util/strip.py b/util/strip.py index 7cde6b54..c57a14cc 100755 --- a/util/strip.py +++ b/util/strip.py @@ -366,6 +366,8 @@ def scan(file): # Remove braces for blocks that now have only a single statement if pline == '}' or pline.startswith('} else'): + if slines[-1].strip() == '': + slines.pop() if check_braces: check_braces = False if open_index != None: From fb068b0afacf3edb68f565a29e4535453b6fa61c Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 23 May 2025 16:16:06 -0700 Subject: [PATCH 334/625] Remove homa_shared from homa_impl.h --- homa_impl.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 47e09348..bad2bd6e 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -71,7 +71,6 @@ struct homa; struct homa_peer; struct homa_rpc; struct homa_sock; -struct homa_shared; #ifndef __STRIP__ /* See strip.py */ #include "timetrace.h" @@ -673,7 +672,6 @@ void unit_hook(char *id); #endif /* __UNIT_TEST__ */ extern unsigned int homa_net_id; -extern struct homa_shared *homa_shared; void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, struct homa_rpc *rpc); From 7f397a9317c28c349b045a232599c8e35e6c420b Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 23 May 2025 17:38:15 -0700 Subject: [PATCH 335/625] Update strip_decl.py data and make small improvements --- util/strip_decl.py | 48 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/util/strip_decl.py b/util/strip_decl.py index f6c3c8b5..733ae282 100755 --- a/util/strip_decl.py +++ b/util/strip_decl.py @@ -29,20 +29,31 @@ # "all" includes all symbols. symbols = [ ['none'], + ['peer', + 'int homa_xmit_control(' + ], + ['pacer', + 'void homa_xmit_data(' + ], + ['rpc', + 'int homa_message_in_init(', + 'void homa_rpc_handoff(', + ], ['outgoing', 'int homa_fill_data_interleaved(', 'int homa_message_out_fill(', 'void homa_message_out_init(', + 'void homa_resend_data(', 'struct sk_buff *homa_tx_data_pkt_alloc(', - 'int homa_xmit_control(', 'int __homa_xmit_control(', - 'void homa_xmit_data(', 'void __homa_xmit_data(', 'void homa_xmit_unknown(' ], ['utils', 'void homa_destroy(', 'int homa_init(', + 'void homa_net_destroy(', + 'int homa_net_init(', 'void homa_spin(' ], ['incoming', @@ -53,11 +64,8 @@ 'void homa_dispatch_pkts(', 'struct homa_gap *homa_gap_alloc(', 'void homa_gap_retry(', - 'int homa_message_in_init(', 'void homa_need_ack_pkt(', - 'void homa_resend_data(', 'void homa_resend_pkt(', - 'void homa_rpc_handoff(', 'void homa_rpc_unknown_pkt(', 'int homa_wait_private(', 'struct homa_rpc *homa_wait_shared(' @@ -77,13 +85,14 @@ 'enum hrtimer_restart homa_hrtimer(', 'int homa_ioctl(', 'int homa_load(', - 'int homa_net_init(', 'void homa_net_exit(', + 'int homa_net_start(', '__poll_t homa_poll(', 'int homa_recvmsg(', 'int homa_sendmsg(', 'int homa_setsockopt(', 'int homa_shutdown(', + 'int homa_socket(', 'int homa_softirq(', 'void homa_unhash(', 'void homa_unload(' @@ -91,6 +100,14 @@ ['all'] ] +# A list of all of the line prefixes that have not yet been encountered +# in the source file. Used to print error messages at the end for any +# that don't appear anywhere in the file. 
+unseen = []
+
+for patch in symbols:
+    for prefix in patch[1:]:
+        unseen.append(prefix)
 
 if len(sys.argv) != 4:
     print('Usage: strip_decl.py src dst patch')
@@ -99,6 +116,14 @@
 src = open(sys.argv[1])
 dst = open(sys.argv[2], 'w')
 patch_name = sys.argv[3]
+found_patch = False
+for patch in symbols:
+    if patch[0] == patch_name:
+        found_patch = True
+        break
+if not found_patch:
+    print('Unknown patch name "%s"' % (patch_name), file=sys.stderr)
+    exit(1)
 
 skipping_to_semi = False
 prev_line_empty = False
 for line in src:
@@ -107,6 +132,11 @@ for line in src:
         skipping_to_semi = False
         continue
 
+    for prefix in unseen:
+        if line.startswith(prefix):
+            unseen.remove(prefix)
+            break;
+
     found_patch = False
     omit = False
     for patch in symbols:
@@ -131,5 +161,11 @@
     print(line, file=dst, end='')
     prev_line_empty = False
 
+if unseen:
+    print('The following prefixes did not appear in %s:' % (sys.argv[1]),
+          file=sys.stderr)
+    for prefix in unseen:
+        print(prefix, file=sys.stderr)
+
 dst.close()
 src.close()
\ No newline at end of file

From 002eb05cf090dcd36bb57cc3892f53f20084aeb1 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 30 May 2025 14:11:10 -0700
Subject: [PATCH 336/625] Fix bug in homa_sock_shutdown

Must reinitialize interest-links after unlinking; otherwise
homa_interest->unlink_shared will follow dangling pointers. This was
causing stack corruption and kernel crashes.
---
 homa_sock.c           | 2 +-
 test/unit_homa_sock.c | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/homa_sock.c b/homa_sock.c
index 7a029e47..92e0a31d 100644
--- a/homa_sock.c
+++ b/homa_sock.c
@@ -301,7 +301,7 @@ void homa_sock_shutdown(struct homa_sock *hsk)
 	while (!list_empty(&hsk->interests)) {
 		interest = list_first_entry(&hsk->interests,
 					    struct homa_interest, links);
-		__list_del_entry(&interest->links);
+		list_del_init(&interest->links);
 		atomic_set_release(&interest->ready, 1);
 		wake_up(&interest->wait_queue);
 	}
diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c
index d1c43fb4..ab781ce8 100644
--- a/test/unit_homa_sock.c
+++ b/test/unit_homa_sock.c
@@ -293,6 +293,7 @@ TEST_F(homa_sock, homa_sock_shutdown__wakeup_interests)
 	EXPECT_EQ(1, atomic_read(&interest2.ready));
 	EXPECT_EQ(NULL, interest1.rpc);
 	EXPECT_EQ(NULL, interest2.rpc);
+	EXPECT_TRUE(list_empty(&interest1.links));
 	EXPECT_STREQ("wake_up; wake_up", unit_log_get());
 }
 
From 4141253901040a9fa25500566b13c3bd4d42787c Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 30 May 2025 14:17:14 -0700
Subject: [PATCH 337/625] Add missing initialization in homa_peer_alloc_peertab

---
 homa_peer.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/homa_peer.c b/homa_peer.c
index 43e2bb60..d1676220 100644
--- a/homa_peer.c
+++ b/homa_peer.c
@@ -83,6 +83,7 @@ struct homa_peertab *homa_peer_alloc_peertab(void)
 		return ERR_PTR(-ENOMEM);
 	}
 
+	spin_lock_init(&peertab->lock);
 	err = rhashtable_init(&peertab->ht, &ht_params);
 	if (err) {
 		kfree(peertab);
From db60ceb00fac2887deab246b69f049ad20307dad Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 30 May 2025 14:22:01 -0700
Subject: [PATCH 338/625] Avoid using port field from socket after socket has been destroyed.

Also, remove SOCKET_CLOSE freeze type (haven't used it in ages).
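
For reference, the fixed ordering is sketched below (illustrative
only, not part of the patch; it matches the patched homa_close() in
the diff that follows, with the strip.py conditionals omitted):

    void homa_close(struct sock *sk, long timeout)
    {
            struct homa_sock *hsk = homa_sk(sk);
            int port = hsk->port;    /* Copy out while hsk is still valid. */

            homa_sock_destroy(hsk);
            sk_common_release(sk);   /* hsk may be freed after this point. */
            tt_record1("closed socket, port %d", port);
    }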
--- homa_devel.h | 5 ++--- homa_plumbing.c | 9 ++++----- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/homa_devel.h b/homa_devel.h index cdc33ec9..46639ba4 100644 --- a/homa_devel.h +++ b/homa_devel.h @@ -37,9 +37,8 @@ enum homa_freeze_type { RESTART_RPC = 1, PEER_TIMEOUT = 2, SLOW_RPC = 3, - SOCKET_CLOSE = 4, - PACKET_LOST = 5, - NEED_ACK_MISSING_DATA = 6, + PACKET_LOST = 4, + NEED_ACK_MISSING_DATA = 5, }; /** diff --git a/homa_plumbing.c b/homa_plumbing.c index 14c0282d..667a6313 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -719,14 +719,13 @@ int homa_bind(struct socket *sock, struct sockaddr *addr, int addr_len) void homa_close(struct sock *sk, long timeout) { struct homa_sock *hsk = homa_sk(sk); +#ifndef __UPSTREAM__ /* See strip.py */ + int port = hsk->port; +#endif/* See strip.py */ homa_sock_destroy(hsk); sk_common_release(sk); - tt_record1("closed socket, port %d", hsk->port); -#ifndef __STRIP__ /* See strip.py */ - if (hsk->homa->freeze_type == SOCKET_CLOSE) - tt_freeze(); -#endif /* See strip.py */ + tt_record1("closed socket, port %d", port); } /** From f08fbae2bf80497230111e2461e8d1970ff4e2d7 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 30 May 2025 14:29:08 -0700 Subject: [PATCH 339/625] Remove temporary instrumentation from timetrace.c --- timetrace.c | 42 ------------------------------------------ 1 file changed, 42 deletions(-) diff --git a/timetrace.c b/timetrace.c index 9916f5b1..43162a94 100644 --- a/timetrace.c +++ b/timetrace.c @@ -96,10 +96,6 @@ bool tt_test_no_khz; unsigned int cpu_khz = 1000000; #endif -#define MAX_IDS 10 -#define MAX_CORES 50 -static atomic_t id_counts[MAX_CORES][MAX_IDS]; - /** * tt_init(): Enable time tracing, create /proc file for reading traces. * @proc_file: Name of a file in /proc; this file can be read to extract @@ -836,25 +832,9 @@ void tt_get_messages(char *buffer, size_t length) */ void tt_dbg1(char *msg, ...) { - int id, core; - int problems = 0; - if (atomic_read(&tt_frozen)) return; tt_freeze(); - - for (core = 0; core < MAX_CORES; core++) { - for (id = 0; id < MAX_IDS; id++) { - int value = atomic_read(&id_counts[core][id]); - if (value != 0) { - pr_err("Core %d has count %d for id %d\n", - core, value, id); - problems++; - } - } - } - pr_err("tt_dbg1 found %d nonzero counters (running on core %d)\n", - problems, raw_smp_processor_id()); pr_err("Dumping timetrace\n"); tt_printk(); pr_err("Finished dumping timetrace\n"); @@ -867,17 +847,6 @@ void tt_dbg1(char *msg, ...) */ void tt_dbg2(char *msg, ...) { - va_list ap; - int core; - int id; - - va_start(ap, msg); - id = va_arg(ap, int); - core = va_arg(ap, int); - atomic_add(1, &id_counts[core][id]); - tt_record4("tt_dbg2 incremented counter %d for core %d to %d in pid %d", - id, core, atomic_read(&id_counts[core][id]), current->pid); - va_end(ap); } /** @@ -887,17 +856,6 @@ void tt_dbg2(char *msg, ...) */ void tt_dbg3(char *msg, ...) 
{ - va_list ap; - int core; - int id; - - va_start(ap, msg); - id = va_arg(ap, int); - core = va_arg(ap, int); - atomic_sub(1, &id_counts[core][id]); - tt_record4("tt_dbg3 decremented counter %d for core %d to %d in pid %d", - id, core, atomic_read(&id_counts[core][id]), current->pid); - va_end(ap); } /** From a374bcc7c5fbc18001a80d3bf34b6422bc2ff0ee Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 30 May 2025 15:09:17 -0700 Subject: [PATCH 340/625] Update notes.txt --- notes.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/notes.txt b/notes.txt index 9080cfe0..7809b9b0 100755 --- a/notes.txt +++ b/notes.txt @@ -1,6 +1,13 @@ Notes for Homa implementation in Linux: --------------------------------------- +* Refactor resend mechanism: + * New method that handles resends: + * Request all gaps + * Reaquest entire message if nothing received yet + * On resender, if entire message resent, reset msgout state (e.g., granted) + * Issue resends when handling NEED_ACKs. + * Notes for the next design of grants: * Update tthoma.py: e.g., no grant_recalc records * grant_check_slow_path too high From e9fb87481e18138cc9d836544678755fc9e0359a Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 30 May 2025 15:24:00 -0700 Subject: [PATCH 341/625] Don't reset silent_ticks on NEED_ACK packets Otherwise, can get in a strange state where the server sent a response and is waiting for an ACK, so it sends NEED_ACKs. But all the response packets got lost, so the client can't ack, and the NEED_ACKs reset silent ticks so the client doesn't issue RESENDs. --- homa_incoming.c | 3 +-- test/unit_homa_incoming.c | 27 +++++++++++++++++++++++++-- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index 9e56f0cd..e798cfae 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -519,8 +519,7 @@ void homa_dispatch_pkts(struct sk_buff *skb) #ifndef __STRIP__ /* See strip.py */ h->common.type == GRANT || #endif /* See strip.py */ - h->common.type == BUSY || - h->common.type == NEED_ACK) + h->common.type == BUSY) rpc->silent_ticks = 0; rpc->peer->outstanding_resends = 0; } diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index f0db68a2..b11d97ad 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -1076,9 +1076,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__reset_counters) .offset = htonl(12600), .priority = 3, .resend_all = 0}; ASSERT_NE(NULL, crpc); -#ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(10000, crpc->msgout.granted); -#endif /* See strip.py */ unit_log_clear(); crpc->silent_ticks = 5; crpc->peer->outstanding_resends = 2; @@ -1095,6 +1093,31 @@ TEST_F(homa_incoming, homa_dispatch_pkts__reset_counters) EXPECT_EQ(0, crpc->peer->outstanding_resends); } #endif /* See strip.py */ +TEST_F(homa_incoming, homa_dispatch_pkts__dont_reset_silent_ticks_on_NEED_ACK) +{ + /* Note: if NEED_ACKs cause silent_ticks to get reset, can get in + * a strange state where the server sent a response and is waiting + * for an ACK, so it sends NEED_ACKs. But all the response packets + * got lost, so the client can't ack, and the NEED_ACKs reset + * silent ticks so the client doesn't issue RESENDs. 
+ */ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 20000, 1600); + struct homa_need_ack_hdr h = {.common = { + .sport = htons(self->server_port), + .dport = htons(self->hsk.port), + .sender_id = cpu_to_be64(self->server_id), + .type = NEED_ACK}}; + + ASSERT_NE(NULL, crpc); + unit_log_clear(); + crpc->silent_ticks = 2; + crpc->peer->outstanding_resends = 3; + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); + EXPECT_EQ(2, crpc->silent_ticks); + EXPECT_EQ(0, crpc->peer->outstanding_resends); +} TEST_F(homa_incoming, homa_dispatch_pkts__multiple_ack_packets) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_OUTGOING, From 45a8cae644ebc1fb6d05f2483bf7f2686292081c Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 30 May 2025 15:48:09 -0700 Subject: [PATCH 342/625] Simplify calculation of incoming in outgoing packets In particular, don't increase incoming to match granted: the bytes that were previously granted might have gotten lost. --- homa_outgoing.c | 9 +-------- test/unit_homa_outgoing.c | 10 +--------- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/homa_outgoing.c b/homa_outgoing.c index ebdfd6aa..34d70ea9 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -829,14 +829,7 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end) h->common.sequence = htonl(offset); h->seg.offset = htonl(offset); h->retransmit = 1; -#ifndef __STRIP__ /* See strip.py */ - if ((offset + seg_length) <= rpc->msgout.granted) - h->incoming = htonl(rpc->msgout.granted); - else if ((offset + seg_length) > rpc->msgout.length) - h->incoming = htonl(rpc->msgout.length); - else - h->incoming = htonl(offset + seg_length); -#endif /* See strip.py */ + IF_NO_STRIP(h->incoming = htonl(end)); err = homa_skb_append_from_skb(rpc->hsk->homa, new_skb, skb, seg_offset, seg_length); diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index a35f4471..72d24e63 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -1098,15 +1098,7 @@ TEST_F(homa_outgoing, homa_resend_data__set_incoming) mock_xmit_log_verbose = 1; EXPECT_EQ(10000, crpc->msgout.granted); homa_resend_data(crpc, 8400, 8800, 2); - EXPECT_SUBSTR("incoming 10000", unit_log_get()); - - unit_log_clear(); - homa_resend_data(crpc, 12900, 13000, 2); - EXPECT_SUBSTR("incoming 14200", unit_log_get()); - - unit_log_clear(); - homa_resend_data(crpc, 15700, 16500, 2); - EXPECT_SUBSTR("incoming 16000", unit_log_get()); + EXPECT_SUBSTR("incoming 8800", unit_log_get()); } TEST_F(homa_outgoing, homa_resend_data__error_copying_data) { From dec9e9bd37ff80c84328f46f3d36a725f8030343 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 30 May 2025 16:03:59 -0700 Subject: [PATCH 343/625] Add rank in rxsnapshot output from tthoma.py Also a few other minor fixes. 
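
For context, the rank parsed by the new 'rpc_incoming3' pattern is
emitted on the kernel side by homa_rpc_log_tt(), with a record of the
form sketched below (rank is -1 when the RPC is not in the grantable
list; tthoma.py leaves the Rank column blank in that case):

    tt_record4("RPC id %d: length %d, remaining %d, rank %d",
               rpc->id, rpc->msgin.length,
               rpc->msgin.bytes_remaining, rank);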
--- util/tthoma.py | 99 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 67 insertions(+), 32 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index f60b8bd7..199cb0d5 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -94,6 +94,8 @@ # granted: # of bytes granted for the incoming message # sent: # of bytes that have been sent for the outgoing message # as of the end of the trace +# rank: RPC's rank in list of grantable RPCs (0 -> highest +# priority) or -1 if not in grantable list class RpcDict(dict): def __missing__(self, id): new_rpc = {'node': Dispatcher.cur_trace['node'], @@ -1429,17 +1431,6 @@ def __softirq_resend(self, trace, time, core, match, interests): 'length ([0-9]+), prio ([0-9]+)' }) - def __bpages_alloced(self, trace, time, core, match, interests): - id = int(match.group(1)) - bpages = int(match.group(2)) - for interest in interests: - interest.tt_bpages_alloced(trace, time, core, id, bpages) - - patterns.append({ - 'name': 'bpages_alloced', - 'regexp': 'RPC id ([0-9]+) has ([0-9]+) bpages allocated' - }) - def __rpc_end(self, trace, time, core, match, interests): id = int(match.group(1)) for interest in interests: @@ -1496,15 +1487,38 @@ def __rpc_incoming2(self, trace, time, core, match, interests): id = int(match.group(1)) incoming = int(match.group(2)) granted = int(match.group(3)) - prio = int(match.group(4)) for interest in interests: - interest.tt_rpc_incoming2(trace, time, core, id, incoming, - granted, prio) + interest.tt_rpc_incoming2(trace, time, core, id, incoming, granted) patterns.append({ 'name': 'rpc_incoming2', - 'regexp': 'RPC id ([0-9]+) has incoming ([0-9]+), granted ([0-9]+), ' - 'prio ([0-9]+)' + 'regexp': 'RPC id ([0-9]+) has incoming ([-0-9]+), granted ([0-9]+)' + }) + + def __rpc_incoming3(self, trace, time, core, match, interests): + id = int(match.group(1)) + length = int(match.group(2)) + remaining = int(match.group(3)) + rank = int(match.group(4)) + for interest in interests: + interest.tt_rpc_incoming3(trace, time, core, id, length, + remaining, rank) + + patterns.append({ + 'name': 'rpc_incoming3', + 'regexp': 'RPC id ([0-9]+): length ([0-9]+), remaining ([0-9]+), ' + 'rank ([-0-9]+)' + }) + + def __bpages_alloced(self, trace, time, core, match, interests): + id = int(match.group(1)) + bpages = int(match.group(2)) + for interest in interests: + interest.tt_bpages_alloced(trace, time, core, id, bpages) + + patterns.append({ + 'name': 'bpages_alloced', + 'regexp': 'RPC id ([0-9]+) has ([0-9]+) bpages allocated' }) def __rpc_outgoing(self, trace, time, core, match, interests): @@ -1754,7 +1768,9 @@ def print_list(node, events, num_bytes, extra): max_rpcs_core = core # print('core_peers for %s: %s' % (node, core_peers)) extra = ' %7.2f (C%02d) %4.3f (C%02d) %4.3f (C%02d)' % ( - max_gbps, max_core, max_rpcs/total_rpcs, max_rpcs_core, + max_gbps, max_core, + max_rpcs/total_rpcs if total_rpcs != 0 else 0, + max_rpcs_core, max_pending/total_pending if total_pending != 0 else 0, max_pending_core) print_list(node, events, total_bytes, extra) @@ -5864,18 +5880,22 @@ def tt_rpc_end(self, trace, t, core, id): rpcs[id]['end'] = t def tt_rpc_incoming(self, trace, t, core, id, peer, received, length): - global rpcs, max_unsched + global rpcs rpc = rpcs[id] rpc['peer'] = peer rpc['in_length'] = length rpc['remaining'] = length - received - def tt_rpc_incoming2(self, trace, t, core, id, incoming, granted, prio): - global rpcs, max_unsched + def tt_rpc_incoming2(self, trace, t, core, id, incoming, granted): + global rpcs 
rpcs[id]['granted'] = granted + def tt_rpc_incoming3(self, trace, t, core, id, length, remaining, rank): + global rpcs + rpcs[id]['rank'] = rank + def tt_rpc_outgoing(self, trace, t, core, id, peer, sent, length): - global rpcs, max_unsched + global rpcs rpc = rpcs[id] rpc['peer'] = peer rpc['out_length'] = length @@ -6544,6 +6564,12 @@ def check_live(tx_id, node, t, receive): # Deduce missing fields in RPCs where possible for id, live_rpc in live_rpcs.items(): next_stage = 0 + if id^1 in rpcs: + rx_rpc = rpcs[id^1] + else: + rx_rpc = {} + if 'remaining' in rx_rpc and live_rpc['pre_softirq'] == 0: + live_rpc['pre_softirq'] = rx_rpc['in_length'] - rx_rpc['remaining'] for type in ['copied', 'softirq', 'gro', 'xmit']: pre_field = 'pre_' + type post_field = 'post_' + type @@ -6563,10 +6589,10 @@ def check_live(tx_id, node, t, receive): next_stage = 0 unsched = 0 - if id^1 in rpcs: - rx_rpc = rpcs[id^1] - if 'unsched' in rx_rpc: - unsched = rx_rpc['unsched'] + if 'unsched' in rx_rpc: + unsched = rx_rpc['unsched'] + if 'granted' in rx_rpc and live_rpc['pre_grant_xmit'] == 0: + live_rpc['pre_grant_xmit'] = rx_rpc['granted'] for type in ['softirq', 'gro', 'xmit']: pre_field = 'pre_grant_' + type post_field = 'post_grant_' + type @@ -6682,17 +6708,26 @@ def output(self): print('Copied: Offset just after last data byte that has been ' 'copied to user space') print('Incoming: Gxmit - SoftIrq') + print('Rank: Rank among RPCs receiving grants. Smaller means ' + 'higher priority,') + print(' blank means not grantable') print('Lost: Packets that appear to have been dropped in the network') print(' Id Peer Length GXmit GGro GSoft ', end='') - print(' Xmit Gro SoftIrq Copied Incoming Lost') - print('-------------------------------------------', end='') - print('---------------------------------------------') + print(' Xmit Gro SoftIrq Copied Incoming Rank Lost') + print('------------------------------------------------------', end='') + print('--------------------------------------------------') for id in sorted_ids: rx_rpc = rpcs[id^1] live_rpc = live_rpcs[id] - incoming = (live_rpc['pre_grant_xmit'] - live_rpc['pre_softirq'] - if live_rpc['pre_grant_xmit'] > 0 else 0) + incoming = live_rpc['pre_grant_xmit'] - live_rpc['pre_softirq'] + if incoming <= 0: + incoming = '' + rank = '' + if 'rank' in rx_rpc: + rank = rx_rpc['rank'] + if rank < 0: + rank = '' print('%10d %-10s %7s %7s %7s %7s ' % (id^1, rpcs[id]['node'] if id in rpcs else "", rx_rpc['in_length'] if rx_rpc['in_length'] != None else "", @@ -6702,9 +6737,9 @@ def output(self): if live_rpc['pre_grant_gro'] > 0 else "", str(live_rpc['pre_grant_softirq']) if live_rpc['pre_grant_softirq'] > 0 else ""), end='') - print('%7d %7d %7d %7d %7d %4d' % (live_rpc['pre_xmit'], + print('%7d %7d %7d %7d %7s %4s %4d' % (live_rpc['pre_xmit'], live_rpc['pre_gro'], live_rpc['pre_softirq'], - live_rpc['pre_copied'], incoming, live_rpc['lost'])) + live_rpc['pre_copied'], incoming, rank, live_rpc['lost'])) print('\nFields in the tables below:') print('Id: Packet\'s RPC identifier on the receiver side') From 66448c53bf5b6f7479004675437c060b3546beab Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 30 May 2025 16:10:15 -0700 Subject: [PATCH 344/625] Remove obsolete info from README.md --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 24c11303..31bb7ade 100644 --- a/README.md +++ b/README.md @@ -19,8 +19,6 @@ This repo contains an implementation of the Homa transport protocol as a Linux k - The incast optimization from 
Section 3.6 of the SIGCOMM paper has not been implemented yet. If you would like to test Homa under large incasts, let me know and I will implement this feature. - - Socket buffer memory management needs more work. Large numbers of large - messages (hundreds of MB?) may cause buffer exhaustion and deadlock. - Please contact me if you have any problems using this repo; I'm happy to provide advice and support. From d498d40ce57aeaa79e3bc42f0223999ea5c025f9 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 3 Jun 2025 15:17:29 -0700 Subject: [PATCH 345/625] Move debugging functions from homa_rpc.c to homa_devel.c --- homa_devel.c | 250 +++++++++++++++++++++++++++++++++++++++++++++++++++ homa_devel.h | 7 ++ homa_impl.h | 2 - homa_rpc.c | 241 ------------------------------------------------- homa_rpc.h | 10 --- 5 files changed, 257 insertions(+), 253 deletions(-) diff --git a/homa_devel.c b/homa_devel.c index e7527d50..e11cc581 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -6,6 +6,7 @@ #include "homa_impl.h" #include "homa_devel.h" +#include "homa_grant.h" #include "homa_peer.h" #include "homa_rpc.h" #ifndef __STRIP__ /* See strip.py */ @@ -614,3 +615,252 @@ void homa_check_list(struct list_head *list, int max_length) prev = p; } } + +/** + * homa_rpc_log() - Log info about a particular RPC; this is functionality + * pulled out of homa_rpc_log_active because its indentation got too deep. + * @rpc: RPC for which key info should be written to the system log. + */ +void homa_rpc_log(struct homa_rpc *rpc) +{ + char *type = homa_is_client(rpc->id) ? "Client" : "Server"; + char *peer = homa_print_ipv6_addr(&rpc->peer->addr); + + if (rpc->state == RPC_INCOMING) + pr_notice("%s RPC INCOMING, id %llu, peer %s:%d, %d/%d bytes received, incoming %d\n", + type, rpc->id, peer, rpc->dport, + rpc->msgin.length - rpc->msgin.bytes_remaining, +#ifndef __STRIP__ + rpc->msgin.length, rpc->msgin.granted); +#else + rpc->msgin.length, 0); +#endif /* __STRIP__ */ + else if (rpc->state == RPC_OUTGOING) { + pr_notice("%s RPC OUTGOING, id %llu, peer %s:%d, out length %d, left %d, granted %d, in left %d, resend_ticks %u, silent_ticks %d\n", + type, rpc->id, peer, rpc->dport, rpc->msgout.length, + rpc->msgout.length - rpc->msgout.next_xmit_offset, +#ifndef __STRIP__ + rpc->msgout.granted, rpc->msgin.bytes_remaining, +#else + 0, rpc->msgin.bytes_remaining, +#endif /* __STRIP__ */ + rpc->resend_timer_ticks, rpc->silent_ticks); + } else { + pr_notice("%s RPC %s, id %llu, peer %s:%d, incoming length %d, outgoing length %d\n", + type, homa_symbol_for_state(rpc), rpc->id, peer, + rpc->dport, rpc->msgin.length, rpc->msgout.length); + } +} + +/** + * homa_rpc_log_active() - Print information to the system log about all + * active RPCs. Intended primarily for debugging. + * @homa: Overall data about the Homa protocol implementation. + * @id: An RPC id: if nonzero, then only RPCs with this id will be + * logged. 
+ */ +void homa_rpc_log_active(struct homa *homa, uint64_t id) +{ + struct homa_socktab_scan scan; + struct homa_sock *hsk; + struct homa_rpc *rpc; + int count = 0; + + pr_notice("Logging active Homa RPCs:\n"); + rcu_read_lock(); + for (hsk = homa_socktab_start_scan(homa->socktab, &scan); + hsk; hsk = homa_socktab_next(&scan)) { + if (list_empty(&hsk->active_rpcs) || hsk->shutdown) + continue; + + if (!homa_protect_rpcs(hsk)) + continue; + list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { + count++; + if (id != 0 && id != rpc->id) + continue; + homa_rpc_log(rpc); + } + homa_unprotect_rpcs(hsk); + } + homa_socktab_end_scan(&scan); + rcu_read_unlock(); + pr_notice("Finished logging active Homa RPCs: %d active RPCs\n", count); +} + +/** + * homa_rpc_log_tt() - Log info about a particular RPC using timetraces. + * @rpc: RPC for which key info should be written to the system log. + */ +void homa_rpc_log_tt(struct homa_rpc *rpc) +{ + if (rpc->state == RPC_INCOMING) { + int received = rpc->msgin.length + - rpc->msgin.bytes_remaining; + int rank; + + tt_record4("Incoming RPC id %d, peer 0x%x, %d/%d bytes received", + rpc->id, tt_addr(rpc->peer->addr), + received, rpc->msgin.length); +#ifndef __STRIP__ + tt_record3("RPC id %d has incoming %d, granted %d", rpc->id, + rpc->msgin.granted - received, rpc->msgin.granted); + rank = rpc->msgin.rank; +#else /* __STRIP__ */ + rank = -1; +#endif /* __STRIP__ */ + tt_record4("RPC id %d: length %d, remaining %d, rank %d", + rpc->id, rpc->msgin.length, + rpc->msgin.bytes_remaining, rank); + if (rpc->msgin.num_bpages == 0) { + tt_record1("RPC id %d is blocked waiting for buffers", + rpc->id); + } else { + struct sk_buff *skb = skb_peek(&rpc->msgin.packets); + + if (!skb) { + tt_record2("RPC id %d has %d bpages allocated, no uncopied bytes", + rpc->id, rpc->msgin.num_bpages); + } else { + struct homa_data_hdr *h; + + h = (struct homa_data_hdr *) skb->data; + tt_record3("RPC id %d has %d bpages allocated, first uncopied offset %d", + rpc->id, rpc->msgin.num_bpages, + ntohl(h->seg.offset)); + } + } + } else if (rpc->state == RPC_OUTGOING) { + tt_record4("Outgoing RPC id %d, peer 0x%x, %d/%d bytes sent", + rpc->id, tt_addr(rpc->peer->addr), + rpc->msgout.next_xmit_offset, + rpc->msgout.length); +#ifndef __STRIP__ + if (rpc->msgout.granted > rpc->msgout.next_xmit_offset) + tt_record3("RPC id %d has %d unsent grants (granted %d)", + rpc->id, rpc->msgout.granted - + rpc->msgout.next_xmit_offset, + rpc->msgout.granted); +#endif /* __STRIP__ */ + } else { + tt_record2("RPC id %d is in state %d", rpc->id, rpc->state); + } +} + +/** + * homa_rpc_log_active_tt() - Log information about all active RPCs using + * timetraces. + * @homa: Overall data about the Homa protocol implementation. 
+ * @freeze_count: If nonzero, FREEZE requests will be sent for this many + * incoming RPCs with outstanding grants + */ +void homa_rpc_log_active_tt(struct homa *homa, int freeze_count) +{ + struct homa_socktab_scan scan; + struct homa_sock *hsk; + struct homa_rpc *rpc; + int count = 0; + + tt_record("Logging Homa RPCs:"); + rcu_read_lock(); + for (hsk = homa_socktab_start_scan(homa->socktab, &scan); + hsk; hsk = homa_socktab_next(&scan)) { + if (list_empty(&hsk->active_rpcs) || hsk->shutdown) + continue; + + if (!homa_protect_rpcs(hsk)) + continue; + list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { + struct homa_freeze_hdr freeze; + + count++; + homa_rpc_log_tt(rpc); + if (freeze_count == 0) + continue; + if (rpc->state != RPC_INCOMING) + continue; +#ifndef __STRIP__ + if (rpc->msgin.granted <= (rpc->msgin.length + - rpc->msgin.bytes_remaining)) + continue; +#endif /* __STRIP__ */ + freeze_count--; + pr_notice("Emitting FREEZE in %s\n", __func__); + homa_xmit_control(FREEZE, &freeze, sizeof(freeze), rpc); + } + homa_unprotect_rpcs(hsk); + } + homa_socktab_end_scan(&scan); + rcu_read_unlock(); + tt_record1("Finished logging (%d active Homa RPCs)", count); +} + +/** + * homa_validate_incoming() - Scan all of the active RPCs to compute what + * homa_total_incoming should be, and see if it actually matches. + * @homa: Overall data about the Homa protocol implementation. + * @verbose: Print incoming info for each individual RPC. + * @link_errors: Set to 1 if one or more grantable RPCs don't seem to + * be linked into the grantable lists. + * Return: The difference between the actual value of homa->total_incoming + * and the expected value computed from the individual RPCs (positive + * means homa->total_incoming is higher than expected). 
+ */ +int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors) +{ + struct homa_socktab_scan scan; + int total_incoming = 0; + struct homa_sock *hsk; + struct homa_rpc *rpc; + int actual; + + tt_record1("homa_validate_incoming starting, total_incoming %d", + atomic_read(&homa->grant->total_incoming)); + *link_errors = 0; + rcu_read_lock(); + for (hsk = homa_socktab_start_scan(homa->socktab, &scan); + hsk; hsk = homa_socktab_next(&scan)) { + if (list_empty(&hsk->active_rpcs) || hsk->shutdown) + continue; + + if (!homa_protect_rpcs(hsk)) + continue; + list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { + int incoming; + + if (rpc->state != RPC_INCOMING) + continue; + incoming = rpc->msgin.granted - + (rpc->msgin.length + - rpc->msgin.bytes_remaining); + if (incoming < 0) + incoming = 0; + if (rpc->msgin.rec_incoming == 0) + continue; + total_incoming += rpc->msgin.rec_incoming; + if (verbose) + tt_record3("homa_validate_incoming: RPC id %d, incoming %d, rec_incoming %d", + rpc->id, incoming, + rpc->msgin.rec_incoming); + if (rpc->msgin.granted >= rpc->msgin.length) + continue; + if (list_empty(&rpc->grantable_links)) { + tt_record1("homa_validate_incoming: RPC id %d not linked in grantable list", + rpc->id); + *link_errors = 1; + } + if (list_empty(&rpc->grantable_links)) { + tt_record1("homa_validate_incoming: RPC id %d peer not linked in grantable list", + rpc->id); + *link_errors = 1; + } + } + homa_unprotect_rpcs(hsk); + } + homa_socktab_end_scan(&scan); + rcu_read_unlock(); + actual = atomic_read(&homa->grant->total_incoming); + tt_record3("homa_validate_incoming diff %d (expected %d, got %d)", + actual - total_incoming, total_incoming, actual); + return actual - total_incoming; +} \ No newline at end of file diff --git a/homa_devel.h b/homa_devel.h index 46639ba4..64bbd02f 100644 --- a/homa_devel.h +++ b/homa_devel.h @@ -26,6 +26,7 @@ #define KERNEL_VERSION(...) 100 #endif /* __STRIP__ */ +struct homa; struct homa_net; struct homa_rpc; @@ -88,9 +89,15 @@ char *homa_print_ipv6_addr(const struct in6_addr *addr); char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len); char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len); +void homa_rpc_log(struct homa_rpc *rpc); +void homa_rpc_log_active(struct homa *homa, uint64_t id); +void homa_rpc_log_tt(struct homa_rpc *rpc); +void homa_rpc_log_active_tt(struct homa *homa, int freeze_count); int homa_snprintf(char *buffer, int size, int used, const char *format, ...) 
__printf(4, 5); char *homa_symbol_for_type(uint8_t type); char *homa_symbol_for_state(struct homa_rpc *rpc); +int homa_validate_incoming(struct homa *homa, int verbose, + int *link_errors); #endif /* _HOMA_DEVEL_H */ diff --git a/homa_impl.h b/homa_impl.h index bad2bd6e..b84bb7cc 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -756,8 +756,6 @@ int homa_sysctl_softirq_cores(const struct ctl_table *table, loff_t *ppos); int homa_unsched_priority(struct homa *homa, struct homa_peer *peer, int length); -int homa_validate_incoming(struct homa *homa, int verbose, - int *link_errors); void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority); #else /* See strip.py */ diff --git a/homa_rpc.c b/homa_rpc.c index df2c9d9e..47eab485 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -703,244 +703,3 @@ struct homa_rpc *homa_rpc_find_server(struct homa_sock *hsk, homa_bucket_unlock(bucket, id); return NULL; } - -#ifndef __UPSTREAM__ /* See strip.py */ -/** - * homa_rpc_log() - Log info about a particular RPC; this is functionality - * pulled out of homa_rpc_log_active because its indentation got too deep. - * @rpc: RPC for which key info should be written to the system log. - */ -void homa_rpc_log(struct homa_rpc *rpc) -{ - char *type = homa_is_client(rpc->id) ? "Client" : "Server"; - char *peer = homa_print_ipv6_addr(&rpc->peer->addr); - - if (rpc->state == RPC_INCOMING) - pr_notice("%s RPC INCOMING, id %llu, peer %s:%d, %d/%d bytes received, incoming %d\n", - type, rpc->id, peer, rpc->dport, - rpc->msgin.length - rpc->msgin.bytes_remaining, -#ifndef __STRIP__ - rpc->msgin.length, rpc->msgin.granted); -#else - rpc->msgin.length, 0); -#endif /* __STRIP__ */ - else if (rpc->state == RPC_OUTGOING) { - pr_notice("%s RPC OUTGOING, id %llu, peer %s:%d, out length %d, left %d, granted %d, in left %d, resend_ticks %u, silent_ticks %d\n", - type, rpc->id, peer, rpc->dport, rpc->msgout.length, - rpc->msgout.length - rpc->msgout.next_xmit_offset, -#ifndef __STRIP__ - rpc->msgout.granted, rpc->msgin.bytes_remaining, -#else - 0, rpc->msgin.bytes_remaining, -#endif /* __STRIP__ */ - rpc->resend_timer_ticks, rpc->silent_ticks); - } else { - pr_notice("%s RPC %s, id %llu, peer %s:%d, incoming length %d, outgoing length %d\n", - type, homa_symbol_for_state(rpc), rpc->id, peer, - rpc->dport, rpc->msgin.length, rpc->msgout.length); - } -} - -/** - * homa_rpc_log_active() - Print information to the system log about all - * active RPCs. Intended primarily for debugging. - * @homa: Overall data about the Homa protocol implementation. - * @id: An RPC id: if nonzero, then only RPCs with this id will be - * logged. - */ -void homa_rpc_log_active(struct homa *homa, uint64_t id) -{ - struct homa_socktab_scan scan; - struct homa_sock *hsk; - struct homa_rpc *rpc; - int count = 0; - - pr_notice("Logging active Homa RPCs:\n"); - rcu_read_lock(); - for (hsk = homa_socktab_start_scan(homa->socktab, &scan); - hsk; hsk = homa_socktab_next(&scan)) { - if (list_empty(&hsk->active_rpcs) || hsk->shutdown) - continue; - - if (!homa_protect_rpcs(hsk)) - continue; - list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { - count++; - if (id != 0 && id != rpc->id) - continue; - homa_rpc_log(rpc); - } - homa_unprotect_rpcs(hsk); - } - homa_socktab_end_scan(&scan); - rcu_read_unlock(); - pr_notice("Finished logging active Homa RPCs: %d active RPCs\n", count); -} - -/** - * homa_rpc_log_tt() - Log info about a particular RPC using timetraces. - * @rpc: RPC for which key info should be written to the system log. 
- */ -void homa_rpc_log_tt(struct homa_rpc *rpc) -{ - if (rpc->state == RPC_INCOMING) { - int received = rpc->msgin.length - - rpc->msgin.bytes_remaining; - int rank; - - tt_record4("Incoming RPC id %d, peer 0x%x, %d/%d bytes received", - rpc->id, tt_addr(rpc->peer->addr), - received, rpc->msgin.length); -#ifndef __STRIP__ - tt_record3("RPC id %d has incoming %d, granted %d", rpc->id, - rpc->msgin.granted - received, rpc->msgin.granted); - rank = rpc->msgin.rank; -#else /* __STRIP__ */ - rank = -1; -#endif /* __STRIP__ */ - tt_record4("RPC id %d: length %d, remaining %d, rank %d", - rpc->id, rpc->msgin.length, - rpc->msgin.bytes_remaining, rank); - if (rpc->msgin.num_bpages == 0) - tt_record1("RPC id %d is blocked waiting for buffers", - rpc->id); - else - tt_record2("RPC id %d has %d bpages allocated", - rpc->id, rpc->msgin.num_bpages); - } else if (rpc->state == RPC_OUTGOING) { - tt_record4("Outgoing RPC id %d, peer 0x%x, %d/%d bytes sent", - rpc->id, tt_addr(rpc->peer->addr), - rpc->msgout.next_xmit_offset, - rpc->msgout.length); -#ifndef __STRIP__ - if (rpc->msgout.granted > rpc->msgout.next_xmit_offset) - tt_record3("RPC id %d has %d unsent grants (granted %d)", - rpc->id, rpc->msgout.granted - - rpc->msgout.next_xmit_offset, - rpc->msgout.granted); -#endif /* __STRIP__ */ - } else { - tt_record2("RPC id %d is in state %d", rpc->id, rpc->state); - } -} - -/** - * homa_rpc_log_active_tt() - Log information about all active RPCs using - * timetraces. - * @homa: Overall data about the Homa protocol implementation. - * @freeze_count: If nonzero, FREEZE requests will be sent for this many - * incoming RPCs with outstanding grants - */ -void homa_rpc_log_active_tt(struct homa *homa, int freeze_count) -{ - struct homa_socktab_scan scan; - struct homa_sock *hsk; - struct homa_rpc *rpc; - int count = 0; - - tt_record("Logging Homa RPCs:"); - rcu_read_lock(); - for (hsk = homa_socktab_start_scan(homa->socktab, &scan); - hsk; hsk = homa_socktab_next(&scan)) { - if (list_empty(&hsk->active_rpcs) || hsk->shutdown) - continue; - - if (!homa_protect_rpcs(hsk)) - continue; - list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { - struct homa_freeze_hdr freeze; - - count++; - homa_rpc_log_tt(rpc); - if (freeze_count == 0) - continue; - if (rpc->state != RPC_INCOMING) - continue; -#ifndef __STRIP__ - if (rpc->msgin.granted <= (rpc->msgin.length - - rpc->msgin.bytes_remaining)) - continue; -#endif /* __STRIP__ */ - freeze_count--; - pr_notice("Emitting FREEZE in %s\n", __func__); - homa_xmit_control(FREEZE, &freeze, sizeof(freeze), rpc); - } - homa_unprotect_rpcs(hsk); - } - homa_socktab_end_scan(&scan); - rcu_read_unlock(); - tt_record1("Finished logging (%d active Homa RPCs)", count); -} -#endif /* See strip.py */ - -#ifndef __STRIP__ /* See strip.py */ -/** - * homa_validate_incoming() - Scan all of the active RPCs to compute what - * homa_total_incoming should be, and see if it actually matches. - * @homa: Overall data about the Homa protocol implementation. - * @verbose: Print incoming info for each individual RPC. - * @link_errors: Set to 1 if one or more grantable RPCs don't seem to - * be linked into the grantable lists. - * Return: The difference between the actual value of homa->total_incoming - * and the expected value computed from the individual RPCs (positive - * means homa->total_incoming is higher than expected). 
- */
-int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors)
-{
-	struct homa_socktab_scan scan;
-	int total_incoming = 0;
-	struct homa_sock *hsk;
-	struct homa_rpc *rpc;
-	int actual;
-
-	tt_record1("homa_validate_incoming starting, total_incoming %d",
-		   atomic_read(&homa->grant->total_incoming));
-	*link_errors = 0;
-	rcu_read_lock();
-	for (hsk = homa_socktab_start_scan(homa->socktab, &scan);
-	     hsk; hsk = homa_socktab_next(&scan)) {
-		if (list_empty(&hsk->active_rpcs) || hsk->shutdown)
-			continue;
-
-		if (!homa_protect_rpcs(hsk))
-			continue;
-		list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) {
-			int incoming;
-
-			if (rpc->state != RPC_INCOMING)
-				continue;
-			incoming = rpc->msgin.granted -
-				   (rpc->msgin.length
-				   - rpc->msgin.bytes_remaining);
-			if (incoming < 0)
-				incoming = 0;
-			if (rpc->msgin.rec_incoming == 0)
-				continue;
-			total_incoming += rpc->msgin.rec_incoming;
-			if (verbose)
-				tt_record3("homa_validate_incoming: RPC id %d, incoming %d, rec_incoming %d",
-					   rpc->id, incoming,
-					   rpc->msgin.rec_incoming);
-			if (rpc->msgin.granted >= rpc->msgin.length)
-				continue;
-			if (list_empty(&rpc->grantable_links)) {
-				tt_record1("homa_validate_incoming: RPC id %d not linked in grantable list",
-					   rpc->id);
-				*link_errors = 1;
-			}
-			if (list_empty(&rpc->grantable_links)) {
-				tt_record1("homa_validate_incoming: RPC id %d peer not linked in grantable list",
-					   rpc->id);
-				*link_errors = 1;
-			}
-		}
-		homa_unprotect_rpcs(hsk);
-	}
-	homa_socktab_end_scan(&scan);
-	rcu_read_unlock();
-	actual = atomic_read(&homa->grant->total_incoming);
-	tt_record3("homa_validate_incoming diff %d (expected %d, got %d)",
-		   actual - total_incoming, total_incoming, actual);
-	return actual - total_incoming;
-}
-#endif /* See strip.py */
diff --git a/homa_rpc.h b/homa_rpc.h
index 92b54705..d6756db3 100644
--- a/homa_rpc.h
+++ b/homa_rpc.h
@@ -433,17 +433,7 @@ struct homa_rpc
 void homa_rpc_acked(struct homa_sock *hsk,
 		    const struct in6_addr *saddr, struct homa_ack *ack);
 void homa_rpc_end(struct homa_rpc *rpc);
-#ifndef __UPSTREAM__ /* See strip.py */
-void homa_rpc_log(struct homa_rpc *rpc);
-void homa_rpc_log_active(struct homa *homa, uint64_t id);
-void homa_rpc_log_active_tt(struct homa *homa, int freeze_count);
-void homa_rpc_log_tt(struct homa_rpc *rpc);
-#endif /* See strip.py */
 int homa_rpc_reap(struct homa_sock *hsk, bool reap_all);
-#ifndef __UPSTREAM__ /* See strip.py */
-int homa_validate_incoming(struct homa *homa, int verbose,
-			   int *link_errors);
-#endif /* See strip.py */
 
 /**
  * homa_rpc_lock() - Acquire the lock for an RPC.

From a87e7f47e46980e3578dc5fe9c6b632d80a150f2 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Tue, 3 Jun 2025 15:38:49 -0700
Subject: [PATCH 346/625] Fix bug in homa_grant_check_rpc

Previous code didn't always handle hit_total_incoming, which could lead
to a complete system stall with no grants sent out. Also improved
comments a bit.
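
The essence of the fix is sketched below (taken from the patched
homa_grant_check_rpc() in the diff that follows, with metrics and
timetracing omitted): the incoming_hit_limit flag must be consumed
even when this RPC itself needs no more grants, so that RPCs stalled
on total_incoming are granted via the slow path.

    limit = atomic_xchg(&grant->incoming_hit_limit, false);
    if (rpc->msgin.rank < 0 && !limit) {
            /* Very fast path: just update incoming; no grant lock. */
            homa_grant_update_incoming(rpc, grant);
            return;
    }
    /* Otherwise fall through to the slow path, which can issue
     * grants to stalled RPCs now that headroom is available.
     */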
--- homa_grant.c | 57 ++++++++++++++++++++++++------------------ test/unit_homa_grant.c | 19 +++++++++++++- 2 files changed, 51 insertions(+), 25 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 1b22b596..05c9d40a 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -663,31 +663,20 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) u64 now; int i; - if (rpc->msgin.length < 0 || rpc->msgin.num_bpages <= 0 || - rpc->state == RPC_DEAD) - return; - if (rpc->msgin.rank < 0) { - homa_grant_update_incoming(rpc, grant); - return; - } - - tt_record4("homa_grant_check_rpc starting for id %d, granted %d, recv_end %d, length %d", - rpc->id, rpc->msgin.granted, rpc->msgin.recv_end, - rpc->msgin.length); - INC_METRIC(grant_check_calls, 1); - - /* This function handles 4 different things: - * 1. It generates new grant packets for @rpc if appropriate. This + /* This function has 5 different tasks: + * 1. It updates variables tracking incoming data. + * 2. It generates new grant packets for @rpc if appropriate. This * is the common case. - * 2. If total_incoming had been exhausted, but headroom is now + * 3. If total_incoming had been exhausted, but headroom is now * available, it sends grants to the highest priority RPC that * needs them, which may not be @rpc. - * 3. It occasionally sends grants to the oldest RPC as determined - * by the fifo_grant_fraction parameter. - * 4. It occasionally scans active_rpcs to restore proper priority + * 4. It occasionally sends grants to the oldest RPC as determined + * by the fifo_grant_fraction parameter. This is not currently + * implemented. + * 5. It occasionally scans active_rpcs to restore proper priority * order. More on this below. * - * Cases 2-4 require the global grant lock, but that lock is in + * Tasks 3-5 require the global grant lock, but that lock is in * danger of overload, particularly as network speeds increase. So, * this function handles case 1 without acquiring the grant lock. * Issuing a grant to @rpc may change its priority relative to other @@ -697,11 +686,31 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) * inversions that may have developed. The interval for these scans * is chosen so as not to create too much contention for the grant lock. */ - now = homa_clock(); + + if (rpc->msgin.length < 0 || rpc->msgin.num_bpages <= 0 || + rpc->state == RPC_DEAD) + return; + + tt_record4("homa_grant_check_rpc starting for id %d, granted %d, recv_end %d, length %d", + rpc->id, rpc->msgin.granted, rpc->msgin.recv_end, + rpc->msgin.length); + INC_METRIC(grant_check_calls, 1); + + /* If there are RPCs stalled because total_incoming is too high, + * can't take the shortcut below: need to take the slow path in case + * there are stalled RPCs that can now be granted. + */ limit = atomic_xchg(&grant->incoming_hit_limit, false); + if (rpc->msgin.rank < 0 && !limit) { + /* Very fast path (Task 1 only). */ + homa_grant_update_incoming(rpc, grant); + return; + } + + now = homa_clock(); recalc = now >= READ_ONCE(grant->next_recalc); if (!recalc && !limit) { - /* Fast path (Case 1). */ + /* Fast path (Tasks 1 and 2). */ send_grant = homa_grant_update_granted(rpc, grant); homa_grant_update_incoming(rpc, grant); if (!send_grant) @@ -728,12 +737,12 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) homa_grant_update_incoming(rpc, grant); homa_grant_lock(grant); if (recalc) { - /* Case 4. */ + /* Task 5. 
*/ grant->next_recalc = now + grant->recalc_cycles; homa_grant_fix_order(grant); } - /* Cases 3 and 4: search all active RPCs to find any that do + /* Tasks 3 and 5: search all active RPCs to find any that do * not have a full window of grants. Then release the grant lock * and send grants. */ diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index f06e89df..19403095 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -952,8 +952,25 @@ TEST_F(homa_grant, homa_grant_check_rpc__update_incoming_if_rpc_not_active) unit_log_clear(); homa_grant_check_rpc(rpc); EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_calls); + EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_calls); EXPECT_EQ(900, atomic_read(&self->homa.grant->total_incoming)); + EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_slow_path); +} +TEST_F(homa_grant, homa_grant_check_rpc__skip_shortcut_if_incoming_hit_limit) +{ + struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, + self->client_ip, self->server_ip, self->server_port, + 100, 1000, 2000); + + homa_message_in_init(rpc, 2000, 0); + EXPECT_EQ(0, rpc->msgin.rank); + rpc->msgin.rank = -1; + atomic_set(&self->homa.grant->incoming_hit_limit, 1); + homa_rpc_lock(rpc); + homa_grant_check_rpc(rpc); + homa_rpc_unlock(rpc); + EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_slow_path); + EXPECT_EQ(0, atomic_read(&self->homa.grant->incoming_hit_limit)); } TEST_F(homa_grant, homa_grant_check_rpc__fast_path) { From cd1e5ce00fa6b2a9c6a12be8e8ae29cd27760ac8 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 3 Jun 2025 16:14:36 -0700 Subject: [PATCH 347/625] Add homa_drop_packet function Also added new sysctl parameters accept_bits and drop_bits. --- homa_devel.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- homa_devel.h | 1 + homa_impl.h | 16 ++++++++++++++++ homa_plumbing.c | 14 ++++++++++++++ 4 files changed, 78 insertions(+), 1 deletion(-) diff --git a/homa_devel.c b/homa_devel.c index e11cc581..769c85e2 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -16,6 +16,17 @@ #endif /* See strip.py */ #include "homa_wire.h" +/* homa_drop_packet will accept this many more packets before it drops some. */ +static int accept_count; + +/* If accept_count <= 0, homa_drop_packet will drop this many packets + * before it starts accepting again. + */ +static int drop_count; + +/* Used for random-number generation. */ +static u32 seed; + /** * homa_print_ipv4_addr() - Convert an IPV4 address to the standard string * representation. @@ -863,4 +874,39 @@ int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors) tt_record3("homa_validate_incoming diff %d (expected %d, got %d)", actual - total_incoming, total_incoming, actual); return actual - total_incoming; -} \ No newline at end of file +} + +/** + * homa_drop_packet() - Invoked for each incoming packet to determine + * (stochastically) whether that packet should be dropped. Used during + * development to exercise retry code. + * to + * @homa: Overall information about the Homa transport + * Return: Nonzero means drop the packet, zero means process normally. + */ +int homa_drop_packet(struct homa *homa) +{ + /* This code is full of races, but they don't matter (better fast + * than precise). 
+	 */ + if (homa->accept_bits == 0) + return 0; + while (1) { + if (accept_count > 0) { + accept_count--; + return 0; + } + if (drop_count > 0) { + drop_count--; + return 1; + } + if (seed == 0) + seed = homa_clock(); + seed = seed * 1664525 + 1013904223; + accept_count = (seed >> 4) & ((1 << homa->accept_bits) - 1); + seed = seed * 1664525 + 1013904223; + drop_count = 1 + ((seed >> 4) & ((1 << homa->drop_bits) - 1)); + tt_record2("homa_drop_packet set accept_count %d, drop_count 0x%x", + accept_count, drop_count); + } +} diff --git a/homa_devel.h b/homa_devel.h index 64bbd02f..7bbd3ace 100644 --- a/homa_devel.h +++ b/homa_devel.h @@ -81,6 +81,7 @@ static inline void check_addr_valid(void *addr, char *info) void homa_check_addr(void *p); void homa_check_list(struct list_head *list, int max_length); +int homa_drop_packet(struct homa *homa); void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, char *format); void homa_freeze_peers(void); diff --git a/homa_impl.h b/homa_impl.h index b84bb7cc..c62ce65a 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -437,6 +437,22 @@ struct homa { * should be frozen. Set externally via sysctl. */ enum homa_freeze_type freeze_type; + + /** + * @accept_bits: determines how many consecutive packets will be + * accepted before the next bunch of packets is dropped (intervals + * between dropped packets are chosen uniformly from the + * range [0..1<<accept_bits - 1]). Set externally via sysctl. + */ + int accept_bits; + + /** + * @drop_bits: determines how many consecutive packets will be + * dropped once dropping begins (counts are chosen uniformly from + * the range [1..1<<drop_bits]). Set externally via sysctl. + */ + int drop_bits; From: John Ousterhout Date: Mon, 2 Jun 2025 22:07:10 -0700 Subject: [PATCH 348/625] Refactor resend mechanism * New method homa_request_retrans: consolidates all the code for sending RESEND packets. Code in homa_timer is now much simpler. * Add special value -1 for length in RESEND packets. * In homa_resend_pkt(), update granted if necessary; eliminate obsolete code for length == 0. --- homa_devel.c | 4 + homa_impl.h | 2 +- homa_incoming.c | 130 +++++++++++++++++--------- homa_offload.c | 5 + homa_timer.c | 51 +--------- homa_wire.h | 8 +- notes.txt | 7 -- test/unit_homa_incoming.c | 189 ++++++++++++++++++++++++++++---------- test/unit_homa_timer.c | 61 +----------- util/strip_decl.py | 2 +- util/tthoma.py | 11 ++- 11 files changed, 251 insertions(+), 219 deletions(-) diff --git a/homa_devel.c b/homa_devel.c index 769c85e2..e5853962 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -16,6 +16,7 @@ #endif /* See strip.py */ #include "homa_wire.h" +#ifndef __STRIP__ /* See strip.py */ /* homa_drop_packet will accept this many more packets before it drops some. */ static int accept_count; @@ -26,6 +27,7 @@ static int drop_count; /* Used for random-number generation. */ static u32 seed; +#endif /* See strip.py */ /** * homa_print_ipv4_addr() - Convert an IPV4 address to the standard string @@ -806,6 +808,7 @@ void homa_rpc_log_active_tt(struct homa *homa, int freeze_count) tt_record1("Finished logging (%d active Homa RPCs)", count); } +#ifndef __STRIP__ /* See strip.py */ /** * homa_validate_incoming() - Scan all of the active RPCs to compute what * homa_total_incoming should be, and see if it actually matches. 
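The accept/drop logic in homa_drop_packet is easier to see outside the kernel. The following stand-alone user-space sketch mirrors the same counter scheme; the values chosen for accept_bits and drop_bits, the fixed seed, and the plain-int globals are assumptions for illustration only (the kernel version seeds from homa_clock() and reads the sysctl-backed fields of struct homa):

        #include <stdio.h>
        #include <stdint.h>

        /* Illustrative stand-ins for the sysctl parameters (assumed values). */
        static int accept_bits = 4;
        static int drop_bits = 2;

        static int accept_count;
        static int drop_count;
        static uint32_t seed = 12345;   /* kernel code seeds from homa_clock() */

        /* Mirrors the decision loop in homa_drop_packet(): returns nonzero
         * if the packet should be dropped.
         */
        static int drop_packet(void)
        {
                while (1) {
                        if (accept_count > 0) {
                                accept_count--;
                                return 0;       /* accept */
                        }
                        if (drop_count > 0) {
                                drop_count--;
                                return 1;       /* drop */
                        }
                        /* Choose the lengths of the next accept and drop runs. */
                        seed = seed * 1664525 + 1013904223;
                        accept_count = (seed >> 4) & ((1 << accept_bits) - 1);
                        seed = seed * 1664525 + 1013904223;
                        drop_count = 1 + ((seed >> 4) & ((1 << drop_bits) - 1));
                }
        }

        int main(void)
        {
                for (int i = 0; i < 48; i++)
                        putchar(drop_packet() ? 'D' : '.');
                putchar('\n');
                return 0;
        }

The constants 1664525 and 1013904223 are the classic Numerical Recipes linear congruential generator, so each accept run is 0 to 2^accept_bits - 1 packets long and each drop run is 1 to 2^drop_bits packets long.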
@@ -910,3 +913,4 @@ int homa_drop_packet(struct homa *homa) accept_count, drop_count); } } +#endif /* See strip.py */ diff --git a/homa_impl.h b/homa_impl.h index c62ce65a..1a405115 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -706,7 +706,6 @@ int homa_err_handler_v6(struct sk_buff *skb, int homa_fill_data_interleaved(struct homa_rpc *rpc, struct sk_buff *skb, struct iov_iter *iter); struct homa_gap *homa_gap_alloc(struct list_head *next, int start, int end); -void homa_gap_retry(struct homa_rpc *rpc); int homa_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int homa_hash(struct sock *sk); @@ -728,6 +727,7 @@ __poll_t homa_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); +void homa_request_retrans(struct homa_rpc *rpc); void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, struct homa_sock *hsk); void homa_rpc_handoff(struct homa_rpc *rpc); diff --git a/homa_incoming.c b/homa_incoming.c index e798cfae..a1728fbf 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -93,26 +93,56 @@ struct homa_gap *homa_gap_alloc(struct list_head *next, int start, int end) } /** - * homa_gap_retry() - Send RESEND requests for all of the unreceived - * gaps in a message. - * @rpc: RPC to check; must be locked by caller. + * homa_request_retrans() - The function is invoked when it appears that + * data packets for a message have been lost. It issues RESEND requests + * as appropriate and may modify the state of the RPC. + * @rpc: RPC for which incoming data is delinquent; must be locked by + * caller. */ -void homa_gap_retry(struct homa_rpc *rpc) +void homa_request_retrans(struct homa_rpc *rpc) __must_hold(rpc_bucket_lock) { struct homa_resend_hdr resend; struct homa_gap *gap; + int offset, length; - list_for_each_entry(gap, &rpc->msgin.gaps, links) { - resend.offset = htonl(gap->start); - resend.length = htonl(gap->end - gap->start); #ifndef __STRIP__ /* See strip.py */ - resend.priority = rpc->hsk->homa->num_priorities - 1; + resend.priority = rpc->hsk->homa->num_priorities - 1; #endif /* See strip.py */ - tt_record3("homa_gap_retry sending RESEND for id %d, start %d, end %d", - rpc->id, gap->start, gap->end); - homa_xmit_control(RESEND, &resend, sizeof(resend), rpc); + + if (rpc->msgin.length >= 0) { + /* Issue RESENDS for any gaps in incoming data. */ + list_for_each_entry(gap, &rpc->msgin.gaps, links) { + resend.offset = htonl(gap->start); + resend.length = htonl(gap->end - gap->start); + tt_record4("Sending RESEND for id %d, peer 0x%x, offset %d, length %d", + rpc->id, tt_addr(rpc->peer->addr), + gap->start, gap->end - gap->start); + homa_xmit_control(RESEND, &resend, sizeof(resend), rpc); + } + + /* Issue a RESEND for any granted data after the last gap. */ + offset = rpc->msgin.recv_end; +#ifndef __STRIP__ /* See strip.py */ + length = rpc->msgin.granted - rpc->msgin.recv_end; +#else /* See strip.py */ + length = rpc->msgin.length - rpc->msgin.recv_end; +#endif /* See strip.py */ + if (length <= 0) + return; + } else { + /* No data has been received for the RPC. Ask the sender to + * resend everything it has sent so far. 
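+		 * (Signaled on the wire by a RESEND length of -1; see the + * updated comment for homa_resend_hdr in homa_wire.h below.)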
+		 */ + offset = 0; + length = -1; } + + resend.offset = htonl(offset); + resend.length = htonl(length); + tt_record4("Sending RESEND for id %d, peer 0x%x, offset %d, length %d", + rpc->id, tt_addr(rpc->peer->addr), offset, length); + homa_xmit_control(RESEND, &resend, sizeof(resend), rpc); } /** @@ -758,6 +788,9 @@ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, __must_hold(rpc_bucket_lock) { struct homa_resend_hdr *h = (struct homa_resend_hdr *)skb->data; + int offset = ntohl(h->offset); + int length = ntohl(h->length); + int end = offset + length; struct homa_busy_hdr busy; if (!rpc) { @@ -770,51 +803,65 @@ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, } #ifndef __STRIP__ /* See strip.py */ tt_record4("resend request for id %llu, offset %d, length %d, prio %d", - rpc->id, ntohl(h->offset), ntohl(h->length), h->priority); + rpc->id, offset, length, h->priority); #else /* See strip.py */ tt_record3("resend request for id %llu, offset %d, length %d", - rpc->id, ntohl(h->offset), ntohl(h->length)); + rpc->id, offset, length); #endif /* See strip.py */ if (!homa_is_client(rpc->id) && rpc->state != RPC_OUTGOING) { /* We are the server for this RPC and don't yet have a - * response packet, so just send BUSY. + * response message, so send BUSY to keep the client + * waiting. */ tt_record2("sending BUSY from resend, id %d, state %d", rpc->id, rpc->state); homa_xmit_control(BUSY, &busy, sizeof(busy), rpc); goto done; } + + /* First, retransmit bytes that were already sent once. */ + if (length == -1) + end = rpc->msgout.next_xmit_offset; + #ifndef __STRIP__ /* See strip.py */ - if (rpc->msgout.next_xmit_offset < rpc->msgout.granted) { - /* We have chosen not to transmit data from this message; - * send BUSY instead. + if (end > rpc->msgout.next_xmit_offset) + homa_resend_data(rpc, offset, rpc->msgout.next_xmit_offset, + h->priority); + else + homa_resend_data(rpc, offset, end, h->priority); + + if (end > rpc->msgout.granted) { + /* It appears that a grant packet was lost; assume that + * any data requested in the RESEND must have been + * granted previously. + */ + rpc->msgout.granted = end; + if (rpc->msgout.granted > rpc->msgout.length) + rpc->msgout.granted = rpc->msgout.length; + homa_xmit_data(rpc, false); + } +#else /* See strip.py */ + if (end > rpc->msgout.next_xmit_offset) + homa_resend_data(rpc, offset, rpc->msgout.next_xmit_offset); + else + homa_resend_data(rpc, offset, end); +#endif /* See strip.py */ + + if (offset >= rpc->msgout.next_xmit_offset) { + /* We have chosen not to transmit any of the requested data; + * send BUSY so the receiver knows we are alive. */ tt_record3("sending BUSY from resend, id %d, offset %d, granted %d", rpc->id, rpc->msgout.next_xmit_offset, +#ifndef __STRIP__ /* See strip.py */ rpc->msgout.granted); - homa_xmit_control(BUSY, &busy, sizeof(busy), rpc); - } else { - if (ntohl(h->length) == 0) - /* This RESEND is from a server just trying to determine - * whether the client still cares about the RPC; return - * BUSY so the server doesn't time us out. 
- */ - homa_xmit_control(BUSY, &busy, sizeof(busy), rpc); - homa_resend_data(rpc, ntohl(h->offset), - ntohl(h->offset) + ntohl(h->length)); + rpc->msgout.length); #endif /* See strip.py */ + homa_xmit_control(BUSY, &busy, sizeof(busy), rpc); + goto done; + } done: kfree_skb(skb); @@ -924,20 +971,15 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, tt_record1("Received NEED_ACK for id %d", id); - /* Return if it's not safe for the peer to purge its state + /* Don't ack if it's not safe for the peer to purge its state * for this RPC (the RPC still exists and we haven't received * the entire response), or if we can't find peer info. */ if (rpc && (rpc->state != RPC_INCOMING || -#ifndef __STRIP__ /* See strip.py */ rpc->msgin.bytes_remaining)) { tt_record3("NEED_ACK arrived for id %d before message received, state %d, remaining %d", rpc->id, rpc->state, rpc->msgin.bytes_remaining); - homa_freeze(rpc, NEED_ACK_MISSING_DATA, - "Freezing because NEED_ACK received before message complete, id %d, peer 0x%x"); -#else /* See strip.py */ - rpc->msgin.bytes_remaining)) { -#endif /* See strip.py */ + homa_request_retrans(rpc); goto done; } else { peer = homa_peer_get(hsk, &saddr); diff --git a/homa_offload.c b/homa_offload.c index ce1b0dc7..ee1aed04 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -303,6 +303,11 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, if (!homa_make_header_avl(skb)) tt_record("homa_gro_receive couldn't pull enough data from packet"); + // if (homa_drop_packet(homa)) { + // kfree_skb(skb); + // return ERR_PTR(-EINPROGRESS); + // } + h_new = (struct homa_data_hdr *)skb_transport_header(skb); offload_core = &per_cpu(homa_offload_core, smp_processor_id()); busy = (now - offload_core->last_gro) < homa->gro_busy_cycles; diff --git a/homa_timer.c b/homa_timer.c index dc1a0c9e..b1543145 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -28,7 +28,6 @@ void homa_timer_check_rpc(struct homa_rpc *rpc) __must_hold(&rpc->bucket->lock) { struct homa *homa = rpc->hsk->homa; - struct homa_resend_hdr resend; /* See if we need to request an ack for this RPC. */ if (!homa_is_client(rpc->id) && rpc->state == RPC_OUTGOING && @@ -112,54 +111,8 @@ void homa_timer_check_rpc(struct homa_rpc *rpc) return; } if (((rpc->silent_ticks - homa->resend_ticks) % homa->resend_interval) - != 0) - return; - - /* Issue a resend for the bytes just after the last ones received - * (gaps in the middle were already handled by homa_gap_retry above). - */ - if (rpc->msgin.length < 0) { - /* Haven't received any data for this message; request - * retransmission of just the first packet (the sender - * will send at least one full packet, regardless of - * the length below). 
- */ - resend.offset = htonl(0); - resend.length = htonl(100); - } else { - homa_gap_retry(rpc); - resend.offset = htonl(rpc->msgin.recv_end); -#ifndef __STRIP__ /* See strip.py */ - resend.length = htonl(rpc->msgin.granted - rpc->msgin.recv_end); -#else /* See strip.py */ - resend.length = htonl(rpc->msgin.length - rpc->msgin.recv_end); -#endif /* See strip.py */ - if (resend.length == 0) - return; - } -#ifndef __STRIP__ /* See strip.py */ - resend.priority = homa->num_priorities - 1; -#endif /* See strip.py */ - homa_xmit_control(RESEND, &resend, sizeof(resend), rpc); -#ifndef __UPSTREAM__ /* See strip.py */ - if (homa_is_client(rpc->id)) { - tt_record4("Sent RESEND for client RPC id %llu, server 0x%x:%d, offset %d", - rpc->id, tt_addr(rpc->peer->addr), - rpc->dport, rpc->msgin.recv_end); - /* Should be if (homa->verbose) */ - // pr_notice("Homa client RESEND to %s:%d for id %llu, offset %d\n", - // homa_print_ipv6_addr(&rpc->peer->addr), - // rpc->dport, rpc->id, rpc->msgin.recv_end); - } else { - tt_record4("Sent RESEND for server RPC id %llu, client 0x%x:%d offset %d", - rpc->id, tt_addr(rpc->peer->addr), rpc->dport, - rpc->msgin.recv_end); - /* Should be if (homa->verbose) */ - // pr_notice("Homa server RESEND to %s:%d for id %llu, offset %d\n", - // homa_print_ipv6_addr(&rpc->peer->addr), - // rpc->dport, rpc->id, rpc->msgin.recv_end); - } -#endif /* See strip.py */ + == 0) + homa_request_retrans(rpc); } /** diff --git a/homa_wire.h b/homa_wire.h index c15c8245..ca627a99 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -417,11 +417,9 @@ struct homa_resend_hdr { __be32 offset; /** - * @length: Number of bytes of data to retransmit; this could specify - * a range longer than the total message size. Zero is a special case - * used by servers; in this case, there is no need to actually resend - * anything; the purpose of this packet is to trigger an RPC_UNKNOWN - * response if the client no longer cares about this RPC. + * @length: Number of bytes of data to retransmit. -1 means no data + * has been received for the message, so everything sent previously + * should be retransmitted. */ __be32 length; diff --git a/notes.txt b/notes.txt index 7809b9b0..9080cfe0 100755 --- a/notes.txt +++ b/notes.txt @@ -1,13 +1,6 @@ Notes for Homa implementation in Linux: --------------------------------------- -* Refactor resend mechanism: - * New method that handles resends: - * Request all gaps - * Reaquest entire message if nothing received yet - * On resender, if entire message resent, reset msgout state (e.g., granted) - * Issue resends when handling NEED_ACKs. 
- * Notes for the next design of grants: * Update tthoma.py: e.g., no grant_recalc records * grant_check_slow_path too high diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index b11d97ad..3ece4ab6 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -200,7 +200,7 @@ TEST_F(homa_incoming, homa_message_in_init__update_metrics) } #endif /* See strip.py */ -TEST_F(homa_incoming, homa_gap_retry) +TEST_F(homa_incoming, homa_request_retrans__request_gaps) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, self->client_port, @@ -210,11 +210,12 @@ TEST_F(homa_incoming, homa_gap_retry) homa_gap_alloc(&srpc->msgin.gaps, 4000, 6000); homa_gap_alloc(&srpc->msgin.gaps, 7000, 8000); #ifndef __STRIP__ /* See strip.py */ + srpc->msgin.granted = srpc->msgin.recv_end; self->homa.num_priorities = 8; #endif /* See strip.py */ unit_log_clear(); - homa_gap_retry(srpc); + homa_request_retrans(srpc); #ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("xmit RESEND 1000-1999@7; " "xmit RESEND 4000-5999@7; " @@ -223,10 +224,55 @@ TEST_F(homa_incoming, homa_gap_retry) #else /* See strip.py */ EXPECT_STREQ("xmit RESEND 1000-1999; " "xmit RESEND 4000-5999; " - "xmit RESEND 7000-7999", + "xmit RESEND 7000-7999; " + "xmit RESEND 1400-9999", unit_log_get()); #endif /* See strip.py */ } +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_incoming, homa_request_retrans__no_granted_but_not_received_data) +{ + struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_RCVD_ONE_PKT, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 10000, 100); + + EXPECT_EQ(1400, srpc->msgin.recv_end); + unit_log_clear(); + + srpc->msgin.granted = 1400; + homa_request_retrans(srpc); + EXPECT_STREQ("", unit_log_get()); +} +TEST_F(homa_incoming, homa_request_retrans__granted_data_after_last_gap) +{ + struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_RCVD_ONE_PKT, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 10000, 100); + + EXPECT_EQ(1400, srpc->msgin.recv_end); + unit_log_clear(); + + srpc->msgin.granted = 3000; + homa_request_retrans(srpc); + EXPECT_STREQ("xmit RESEND 1400-2999@0", unit_log_get()); +} +#endif /* See strip.py */ +TEST_F(homa_incoming, homa_request_retrans__no_data_received_yet) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, 99, 1000, 10000); + + EXPECT_EQ(-1, crpc->msgin.length); + unit_log_clear(); + + homa_request_retrans(crpc); +#ifndef __STRIP__ /* See strip.py */ + EXPECT_STREQ("xmit RESEND 0--2@0", unit_log_get()); +#else /* See strip.py */ + EXPECT_STREQ("xmit RESEND 0--2", unit_log_get()); +#endif /* See strip.py */ +} TEST_F(homa_incoming, homa_add_packet__basics) { @@ -1623,86 +1669,131 @@ TEST_F(homa_incoming, homa_resend_pkt__client_not_outgoing) homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); EXPECT_STREQ("xmit DATA retrans 1400@0", unit_log_get()); } -#ifndef __STRIP__ /* See strip.py */ -TEST_F(homa_incoming, homa_resend_pkt__send_busy_instead_of_data) +TEST_F(homa_incoming, homa_resend_pkt__negative_length) +{ + /* Entire msgin has not been received yet. But we have received + * everything we have granted so far. 
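+	 * A RESEND with length -1 should therefore cause everything + * transmitted so far (next_xmit_offset below) to be resent.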
+	 */ + struct homa_resend_hdr h = {{.sport = htons(self->client_port), + .dport = htons(self->server_port), + .sender_id = cpu_to_be64(self->client_id), + .type = RESEND}, + .offset = htonl(0), + .length = htonl(-1)}; + struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_OUTGOING, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 2000, 20000); + + ASSERT_NE(NULL, srpc); + unit_log_clear(); + srpc->msgout.next_xmit_offset = 2000; + + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); + /* Both data packets sent so far should be retransmitted. */ + EXPECT_STREQ("xmit DATA retrans 1400@0; " + "xmit DATA retrans 1400@1400", unit_log_get()); +} +TEST_F(homa_incoming, homa_resend_pkt__clip_range_to_next_xmit_offset) { struct homa_resend_hdr h = {{.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = RESEND}, .offset = htonl(100), - .length = htonl(200)}; + .length = htonl(2000)}; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 2000, 100); + self->server_port, self->client_id, 5000, 100); ASSERT_NE(NULL, crpc); unit_log_clear(); + crpc->msgout.next_xmit_offset = 1400; + IF_NO_STRIP(crpc->msgout.granted = 5000); homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); - EXPECT_SUBSTR("xmit BUSY", unit_log_get()); + EXPECT_STREQ("xmit DATA retrans 1400@0", unit_log_get()); } -#endif /* See strip.py */ -TEST_F(homa_incoming, homa_resend_pkt__client_send_data) +TEST_F(homa_incoming, homa_resend_pkt__no_need_to_clip_range) { struct homa_resend_hdr h = {{.sport = htons(self->server_port), .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = RESEND}, .offset = htonl(100), + .length = htonl(300)}; + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 5000, 100); + + ASSERT_NE(NULL, crpc); + unit_log_clear(); + IF_NO_STRIP(crpc->msgout.granted = 2800); + crpc->msgout.next_xmit_offset = 2800; + + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); + EXPECT_STREQ("xmit DATA retrans 1400@0", unit_log_get()); +} #ifndef __STRIP__ /* See strip.py */ - .length = htonl(200), - .priority = 3}; -#else /* See strip.py */ - .length = htonl(200)}; -#endif /* See strip.py */ +TEST_F(homa_incoming, homa_resend_pkt__update_granted_and_xmit) +{ + struct homa_resend_hdr h = {{.sport = htons(self->server_port), + .dport = htons(self->hsk.port), + .sender_id = cpu_to_be64(self->server_id), + .type = RESEND}, + .offset = htonl(1400), + .length = htonl(2000)}; struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 2000, 100); + self->server_port, self->client_id, 5000, 100); ASSERT_NE(NULL, crpc); + crpc->msgout.granted = 1400; homa_rpc_lock(crpc); homa_xmit_data(crpc, false); homa_rpc_unlock(crpc); unit_log_clear(); - mock_clear_xmit_prios(); + EXPECT_EQ(1400, crpc->msgout.next_xmit_offset); homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); - EXPECT_SUBSTR("xmit DATA retrans 1400@0", unit_log_get()); -#ifndef __STRIP__ /* See strip.py */ - EXPECT_STREQ("3", mock_xmit_prios); -#endif /* See strip.py */ + EXPECT_EQ(3400, crpc->msgout.granted); + EXPECT_STREQ("xmit DATA 1400@1400; " + "xmit DATA 1400@2800", unit_log_get()); } -TEST_F(homa_incoming, 
homa_resend_pkt__server_send_data) +TEST_F(homa_incoming, homa_resend_pkt__clip_granted_to_message_length) { - struct homa_resend_hdr h = {{.sport = htons(self->client_port), + struct homa_resend_hdr h = {{.sport = htons(self->server_port), .dport = htons(self->hsk.port), - .sender_id = cpu_to_be64(self->client_id), + .sender_id = cpu_to_be64(self->server_id), .type = RESEND}, - .offset = htonl(100), -#ifndef __STRIP__ /* See strip.py */ - .length = htonl(2000), - .priority = 4}; -#else /* See strip.py */ - .length = htonl(2000)}; + .offset = htonl(1400), + .length = htonl(6000)}; + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 5000, 100); + + ASSERT_NE(NULL, crpc); + + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); + EXPECT_EQ(5000, crpc->msgout.granted); +} #endif /* See strip.py */ - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 20000); +TEST_F(homa_incoming, homa_resend_pkt__requested_data_hasnt_been_sent_yet) +{ + struct homa_resend_hdr h = {{.sport = htons(self->server_port), + .dport = htons(self->hsk.port), + .sender_id = cpu_to_be64(self->server_id), + .type = RESEND}, + .offset = htonl(100), + .length = htonl(200)}; + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 2000, 100); - ASSERT_NE(NULL, srpc); - homa_rpc_lock(srpc); - homa_xmit_data(srpc, false); - homa_rpc_unlock(srpc); + ASSERT_NE(NULL, crpc); unit_log_clear(); - mock_clear_xmit_prios(); - homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); - EXPECT_STREQ("xmit DATA retrans 1400@0; " - "xmit DATA retrans 1400@1400", unit_log_get()); -#ifndef __STRIP__ /* See strip.py */ - EXPECT_STREQ("4 4", mock_xmit_prios); -#endif /* See strip.py */ + homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); + EXPECT_SUBSTR("xmit BUSY", unit_log_get()); } TEST_F(homa_incoming, homa_unknown_pkt__client_resend_all) @@ -1861,12 +1952,13 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_response_not_fully_received) ASSERT_NE(NULL, crpc); unit_log_clear(); - mock_xmit_log_verbose = 1; homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); - EXPECT_STREQ("", unit_log_get()); #ifndef __STRIP__ /* See strip.py */ + EXPECT_STREQ("xmit RESEND 1400-2999@0", unit_log_get()); EXPECT_EQ(1, homa_metrics_per_cpu()->packets_received[ NEED_ACK - DATA]); +#else /* See strip.py */ + EXPECT_STREQ("xmit RESEND 1400-2999", unit_log_get()); #endif /* See strip.py */ } TEST_F(homa_incoming, homa_need_ack_pkt__rpc_not_incoming) @@ -1882,12 +1974,13 @@ TEST_F(homa_incoming, homa_need_ack_pkt__rpc_not_incoming) ASSERT_NE(NULL, crpc); unit_log_clear(); - mock_xmit_log_verbose = 1; homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); - EXPECT_STREQ("", unit_log_get()); #ifndef __STRIP__ /* See strip.py */ + EXPECT_STREQ("xmit RESEND 0--2@0", unit_log_get()); EXPECT_EQ(1, homa_metrics_per_cpu()->packets_received[ NEED_ACK - DATA]); +#else /* See strip.py */ + EXPECT_STREQ("xmit RESEND 0--2", unit_log_get()); #endif /* See strip.py */ } TEST_F(homa_incoming, homa_need_ack_pkt__rpc_doesnt_exist) diff --git a/test/unit_homa_timer.c b/test/unit_homa_timer.c index 5f4d92d4..10ebab85 100644 --- a/test/unit_homa_timer.c +++ b/test/unit_homa_timer.c @@ -174,7 +174,7 @@ TEST_F(homa_timer, 
homa_timer_check_rpc__timeout) #endif /* See strip.py */ EXPECT_EQ(ETIMEDOUT, -crpc->error); } -TEST_F(homa_timer, homa_timer_check_rpc__issue_resend) +TEST_F(homa_timer, homa_timer_check_rpc__request_retransmission) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, @@ -222,65 +222,6 @@ TEST_F(homa_timer, homa_timer_check_rpc__issue_resend) EXPECT_STREQ("xmit RESEND 1400-9999", unit_log_get()); #endif /* See strip.py */ } -TEST_F(homa_timer, homa_timer_check_rpc__request_first_bytes_of_message) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_OUTGOING, self->client_ip, self->server_ip, - self->server_port, self->client_id, 5000, 10000); - - ASSERT_NE(NULL, crpc); -#ifndef __STRIP__ /* See strip.py */ - crpc->msgout.granted = 5000; -#endif /* See strip.py */ - crpc->msgout.next_xmit_offset = 5000; - self->homa.resend_ticks = 3; - - /* First call: resend_ticks-1. */ - crpc->silent_ticks = 2; - unit_log_clear(); - homa_rpc_lock(crpc); - homa_timer_check_rpc(crpc); - EXPECT_STREQ("", unit_log_get()); - - /* Second call: resend_ticks. */ - crpc->silent_ticks = 3; - unit_log_clear(); - homa_timer_check_rpc(crpc); - homa_rpc_unlock(crpc); -#ifndef __STRIP__ /* See strip.py */ - EXPECT_STREQ("xmit RESEND 0-99@7", unit_log_get()); -#else /* See strip.py */ - EXPECT_STREQ("xmit RESEND 0-99", unit_log_get()); -#endif /* See strip.py */ -} -TEST_F(homa_timer, homa_timer_check_rpc__call_homa_gap_retry) -{ - struct homa_rpc *crpc = unit_client_rpc(&self->hsk, - UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, - self->server_port, self->client_id, 200, 20000); - - ASSERT_NE(NULL, crpc); - crpc->silent_ticks = 3; -#ifndef __STRIP__ /* See strip.py */ - crpc->msgin.granted = 10000; - crpc->msgin.recv_end = 10000; -#endif /* See strip.py */ - crpc->msgin.bytes_remaining = 15000; - homa_gap_alloc(&crpc->msgin.gaps, 7000, 8000); - self->homa.resend_ticks = 3; - self->homa.resend_interval = 2; - - unit_log_clear(); - homa_rpc_lock(crpc); - homa_timer_check_rpc(crpc); - homa_rpc_unlock(crpc); -#ifndef __STRIP__ /* See strip.py */ - EXPECT_STREQ("xmit RESEND 7000-7999@7", unit_log_get()); -#else /* See strip.py */ - EXPECT_STREQ("xmit RESEND 7000-7999; xmit RESEND 1400-19999", - unit_log_get()); -#endif /* See strip.py */ -} TEST_F(homa_timer, homa_timer__basics) { diff --git a/util/strip_decl.py b/util/strip_decl.py index 733ae282..e9361dfb 100755 --- a/util/strip_decl.py +++ b/util/strip_decl.py @@ -63,8 +63,8 @@ 'void homa_data_pkt(', 'void homa_dispatch_pkts(', 'struct homa_gap *homa_gap_alloc(', - 'void homa_gap_retry(', 'void homa_need_ack_pkt(', + 'void homa_request_retrans(', 'void homa_resend_pkt(', 'void homa_rpc_unknown_pkt(', 'int homa_wait_private(', diff --git a/util/tthoma.py b/util/tthoma.py index 199cb0d5..3d998eb6 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -1349,13 +1349,16 @@ def __poll_success(self, trace, time, core, match, interests): def __resend_tx(self, trace, time, core, match, interests): id = int(match.group(1)) - offset = int(match.group(2)) + peer = match.group(2) + offset = int(match.group(3)) + length = int(match.group(4)) for interest in interests: - interest.tt_resend_tx(trace, time, core, id, offset) + interest.tt_resend_tx(trace, time, core, id, peer, offset, length) patterns.append({ 'name': 'resend_tx', - 'regexp': 'Sent RESEND for client RPC id ([0-9]+), .* offset ([0-9]+)' + 'regexp': 'Sending RESEND for id ([0-9]+), peer ([^,]+), ' + 'offset ([0-9]+), length ([0-9]+)' }) def 
__resend_rx(self, trace, time, core, match, interests): @@ -5809,7 +5812,7 @@ def tt_resend_rx(self, trace, t, core, id, offset, length): global rpcs rpcs[id]['resend_rx'].append([t, offset, length]) - def tt_resend_tx(self, trace, t, core, id, offset): + def tt_resend_tx(self, trace, t, core, id, peer, offset, length): global rpcs rpcs[id]['resend_tx'].append([t, offset]) From 69d055da1d9a251c1093e5eee6cfa87689604429 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 4 Jun 2025 09:22:58 -0700 Subject: [PATCH 349/625] When installing Homa, warn if Meltdown mitigations are enabled --- cloudlab/bin/install_homa | 1 + 1 file changed, 1 insertion(+) diff --git a/cloudlab/bin/install_homa b/cloudlab/bin/install_homa index a4528386..c457a332 100755 --- a/cloudlab/bin/install_homa +++ b/cloudlab/bin/install_homa @@ -47,4 +47,5 @@ for ((i = $first ; i <= $last; i++)); do ssh -4 $node 'sudo sysctl .kernel.printk="5 4 1 7"' ssh -4 $node 'echo $PATH' ssh -4 $node 'config default' + ssh -4 $node 'if ! grep -q mitigations=off /proc/cmdline; then echo WARNING: Meltdown/Spectre mitigations have not been disabled!; fi' done \ No newline at end of file From db8a266cd7ced9ade5282e4b35571e940af2f7dc Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 4 Jun 2025 09:25:16 -0700 Subject: [PATCH 350/625] Fix bug in homa_metrics.c Was printing grant_check_calls in place of grant_check_slow_path --- homa_metrics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/homa_metrics.c b/homa_metrics.c index 00732843..1ca40ead 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -322,7 +322,7 @@ char *homa_metrics_print(void) M("grant_check_calls %15llu Number of calls to homa_grant_check_rpc\n", m->grant_check_calls); M("grant_check_slow_path %15llu Number of times homa_grant_check_rpc acquired grant lock\n", - m->grant_check_calls); + m->grant_check_slow_path); M("grant_priority_bumps %15llu Number of times an RPC moved up in the grant priority order\n", m->grant_priority_bumps); M("fifo_grants %15llu Grants issued using FIFO priority\n", From dbe5b593b90510a35d66cb37c6f3946bbbe868ab Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 4 Jun 2025 16:31:37 -0700 Subject: [PATCH 351/625] Remove superfluous tt_record from homa_offload.c --- homa_offload.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/homa_offload.c b/homa_offload.c index ee1aed04..d75cf263 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -319,9 +319,6 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, priority = ((struct iphdr *)skb_network_header(skb))->tos >> 5; saddr = ntohl(ip_hdr(skb)->saddr); } - tt_record4("homa_gro_receive transport %d, len %d, data_len %d, delta %d", - skb->network_header, skb->len, skb->data_len, - skb_transport_header(skb) - skb->data); if (h_new->common.type == DATA) { if (h_new->seg.offset == (__force __be32)-1) { From 412654fb849bf3dcdd7100e7ed8f9e0eb1cd5f50 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 4 Jun 2025 21:46:35 -0700 Subject: [PATCH 352/625] Fix bug in metrics.py Formula for time/packet for NAPI etc. was incorrect. 
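With the fix, cpu_time (measured in clock cycles) is first converted to seconds by dividing by cpu_khz*1000 and then scaled to us/packet. As an illustrative check with assumed numbers: at cpu_khz = 2,400,000 (a 2.4 GHz clock), 1.2e9 cycles spread over 1e6 packets is 1.2e9/2.4e9/1e6 seconds per packet, or 0.5 us/packet; the old formula reported cycles/packet divided by 1000 instead.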
--- util/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/metrics.py b/util/metrics.py index e87f997a..c35227e9 100755 --- a/util/metrics.py +++ b/util/metrics.py @@ -304,7 +304,7 @@ def scale_number(number): cores = cpu_time/time_delta if packets_received > 0: print("%s %6.2f %7.2f us/packet" % (print_name.ljust(22), - cores, (cpu_time/packets_received) / 1000)) + cores, (cpu_time/(cpu_khz*1000)/packets_received) * 1e6)) else: print("%s %6.2f" % (print_name.ljust(22), cores)) cpu_time = float(deltas["napi_cycles"]) From 35420abc3c84ca469eec89602d5081dc671e8919 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 5 Jun 2025 11:16:57 -0700 Subject: [PATCH 353/625] Tiny improvement in log message in timetrace.c --- timetrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/timetrace.c b/timetrace.c index 43162a94..227867e7 100644 --- a/timetrace.c +++ b/timetrace.c @@ -835,7 +835,7 @@ void tt_dbg1(char *msg, ...) if (atomic_read(&tt_frozen)) return; tt_freeze(); - pr_err("Dumping timetrace\n"); + pr_err("Dumping timetrace on core %d\n", raw_smp_processor_id()); tt_printk(); pr_err("Finished dumping timetrace\n"); } From 944e22df2895cb3ec24c6d6e8522e5c51f4fa967 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 5 Jun 2025 11:29:56 -0700 Subject: [PATCH 354/625] Fix tthoma.py to work with refactored grant mechanism --- homa_grant.c | 13 +++-- notes.txt | 6 --- util/tthoma.py | 131 ++++++++++++++++++++++++------------------------- 3 files changed, 75 insertions(+), 75 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 05c9d40a..943b2dfa 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -509,6 +509,7 @@ void homa_grant_remove_active(struct homa_rpc *rpc, rpc->msgin.rank = -1; rpc->peer->active_rpcs--; grant->num_active_rpcs--; + grant->active_rpcs[grant->num_active_rpcs] = NULL; /* Pull the highest-priority entry (if there is one) from * grantable_peers into active_rpcs. @@ -704,7 +705,7 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) if (rpc->msgin.rank < 0 && !limit) { /* Very fast path (Task 1 only). */ homa_grant_update_incoming(rpc, grant); - return; + goto done; } now = homa_clock(); @@ -714,7 +715,7 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) send_grant = homa_grant_update_granted(rpc, grant); homa_grant_update_incoming(rpc, grant); if (!send_grant) - return; + goto done; homa_grant_cand_init(&cand); if (rpc->msgin.granted >= rpc->msgin.length) @@ -730,11 +731,13 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) homa_grant_cand_check(&cand, grant); homa_rpc_lock(rpc); homa_rpc_put(rpc); - return; + goto done; } INC_METRIC(grant_check_slow_path, 1); homa_grant_update_incoming(rpc, grant); + tt_record1("homa_grant_check_rpc acquiring grant lock (id %d)", + rpc->id); homa_grant_lock(grant); if (recalc) { /* Task 5. 
*/ @@ -755,6 +758,8 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) homa_grant_cand_add(&cand, rpc2); } homa_grant_unlock(grant); + tt_record1("homa_grant_check_rpc released grant lock (id %d)", + rpc->id); if (!homa_grant_cand_empty(&cand)) { homa_rpc_hold(rpc); homa_rpc_unlock(rpc); @@ -762,6 +767,8 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) homa_rpc_lock(rpc); homa_rpc_put(rpc); } + done: + tt_record1("homa_grant_check_rpc finished with id %d", rpc->id); } /** diff --git a/notes.txt b/notes.txt index 9080cfe0..5de431b7 100755 --- a/notes.txt +++ b/notes.txt @@ -1,12 +1,6 @@ Notes for Homa implementation in Linux: --------------------------------------- -* Notes for the next design of grants: - * Update tthoma.py: e.g., no grant_recalc records - * grant_check_slow_path too high - * Too much time in NAPI? - * Too many BUSY packets (more than DATA) - * Failure modes: * homa_grant_add_rpc: list has a loop, or encounter a null list link * stack corruption under homa_recvmsg after socket shutdown. diff --git a/util/tthoma.py b/util/tthoma.py index 3d998eb6..44a31088 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -1444,15 +1444,6 @@ def __rpc_end(self, trace, time, core, match, interests): 'regexp': 'homa_rpc_end invoked for id ([0-9]+)' }) - def __grant_recalc_start(self, trace, time, core, match, interests): - for interest in interests: - interest.tt_grant_recalc_start(trace, time, core) - - patterns.append({ - 'name': 'grant_recalc_start', - 'regexp': 'homa_grant_recalc starting' - }) - def __grant_check_start(self, trace, time, core, match, interests): id = int(match.group(1)) for interest in interests: @@ -1473,6 +1464,26 @@ def __grant_check_done(self, trace, time, core, match, interests): 'regexp': 'homa_grant_check_rpc finished with id ([0-9]+)' }) + def __grant_check_lock(self, trace, time, core, match, interests): + id = int(match.group(1)) + for interest in interests: + interest.tt_grant_check_lock(trace, time, core, id) + + patterns.append({ + 'name': 'grant_check_lock', + 'regexp': 'homa_grant_check_rpc acquiring grant lock \(id ([0-9]+)\)' + }) + + def __grant_check_unlock(self, trace, time, core, match, interests): + id = int(match.group(1)) + for interest in interests: + interest.tt_grant_check_unlock(trace, time, core, id) + + patterns.append({ + 'name': 'grant_check_unlock', + 'regexp': 'homa_grant_check_rpc released grant lock \(id ([0-9]+)\)' + }) + def __rpc_incoming(self, trace, time, core, match, interests): id = int(match.group(1)) peer = match.group(2) @@ -3092,25 +3103,25 @@ class AnalyzeGrants: "grants_rx_" and "grants_tx_". These files contain information about all incoming/outgoing RPCs with outstanding/available grants in each time interval. In addition, statistics are generated about the time spent - in homa_grant_check_rpc and homa_grant_recalc. + in homa_grant_check_rpc. """ def __init__(self, dispatcher): dispatcher.interest('AnalyzeRpcs') dispatcher.interest('AnalyzeIntervals') - # Node name -> total time spent in homa_grant_check_rpc on that node, - # including time in homa_grant_recalc and time spent sending grants. + # Node name -> total time spent in homa_grant_check_rpc on that node. self.node_check_time = defaultdict(lambda : 0) - # Node name -> total time spent in homa_grant_recalc on that node, - # not including sending grants. - self.node_recalc_time = defaultdict(lambda : 0) + # Node name -> total time spent acquiring and holding the grant lock + # while executing homa_grant_check_rpc on that node. 
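+        # (The interval runs from the 'acquiring grant lock' trace record + # to the 'released grant lock' record, so it includes time spent + # waiting for the lock as well as holding it.)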
+ self.node_lock_time = defaultdict(lambda : 0) # Node name -> total time spent sending grants during homa_grant_check_rpc. self.node_grant_send_time = defaultdict(lambda : 0) - # Node name -> number of calls to homa_grant_recalc - self.node_recalcs = defaultdict(lambda : 0) + # Node name -> number of times the grant lock was acquired by + # homa_grant_check_rpc on that node. + self.node_locks = defaultdict(lambda : 0) # Node name -> number of calls to homa_grant_check_rpc self.node_checks = defaultdict(lambda : 0) @@ -3122,8 +3133,9 @@ def init_trace(self, trace): # Core -> start time of active call to homa_grant_check_rpc (if any) self.core_check_start = {} - # Core -> start time of active call to recalc start (if any) - self.core_recalc_start = {} + # Core -> time when homa_grant_check_rpc started acquiring the + # grant lock (if any) + self.core_lock_start = {} # Core -> time of first grant sent by current call to # homa_grant_check_rpc (only valid if homa_grant_check_rpc in progress) @@ -3133,13 +3145,16 @@ def tt_grant_check_start(self, trace, t, core, id): self.node_checks[trace['node']] += 1 self.core_check_start[core] = t - def tt_grant_recalc_start(self, trace, t, core): + def tt_grant_check_lock(self, trace, t, core, id): node = trace['node'] - self.node_recalcs[node] += 1 - self.core_recalc_start[core] = t - if core in self.core_first_grant_send: - self.node_grant_send_time[node] += t - self.core_first_grant_send[core] - del self.core_first_grant_send[core] + self.core_lock_start[core] = t + + def tt_grant_check_unlock(self, trace, t, core, id): + node = trace['node'] + if core in self.core_lock_start: + self.node_locks[node] += 1 + self.node_lock_time[node] += t - self.core_lock_start[core] + del self.core_lock_start[core] def tt_send_grant(self, trace, t, core, id, offset, priority, increment): if not core in self.core_check_start: @@ -3154,62 +3169,46 @@ def tt_grant_check_done(self, trace, t, core, id): grant = self.core_first_grant_send[core] self.node_grant_send_time[node] += (t - grant) del self.core_first_grant_send[core] - else: - grant = -1e20 - if core in self.core_recalc_start: - recalc_start = self.core_recalc_start[core] - end = t - if grant > recalc_start: - end = grant - self.node_recalc_time[node] += end - self.core_recalc_start[core] - del self.core_recalc_start[core] if core in self.core_check_start: self.node_check_time[node] += t - self.core_check_start[core] del self.core_check_start[core] def print_grant_check_stats(self): - print('\nStatistics about the functions homa_grant_check_rpc and ' - 'homa_grant_recalc:') + print('\nStatistics about the function homa_grant_check_rpc:') print('Node: Name of node') print('Checks: Rate of calling homa_grant_check_rpc (k/sec)') - print('CUsec: Average execution time in homa_grant_check_rpc, ' - 'not including') - print(' time in homa_grant_recalc or sending grants') - print('CCores: Average active cores in homa_grant_check_rpc, ' - 'not including') - print(' time in homa_grant_recalc or sending grants') - print('RFrac: Fraction of calls to homa_grant_check_rpc that ' - ' invoked homa_grant_recalc') - print('RUsec: Average execution time in homa_grant_recalc, ' - 'not including time') - print(' sending grants') - print('RCores: Average active cores in homa_grant_recalc, ' - 'not including time') - print(' sending grants') - print('GPer Average grants sent per call to homa_grant_check_rpc') - print('GUsec Average time to send a grant') - print('GCores Average cores actively sending grants from within ' + print('CUsec: 
Average execution time in homa_grant_check_rpc') + print('CCores: Average active cores in homa_grant_check_rpc') + print('LFrac: Fraction of calls to homa_grant_check_rpc that ' + 'acquired the grant lock') + print('LUsec: Average time spent acquiring/holding the grant ' + 'lock in homa_grant_check_rpc') + print('LCores: Average cores acquiring/holding the grant lock in ' + 'homa_grant_check_rpc') + print('GPer: Average grants sent per call to homa_grant_check_rpc') + print('GUsec: Average time to send a grant') + print('GCores: Average cores actively sending grants from within ' 'homa_grant_check_rpc') print('') - print('Node Checks CUsec CCores RFrac RUsec RCores ' + print('Node Checks CUsec CCores LFrac LUsec LCores ' 'GPer GUSec GCores') print('--------------------------------------------------' '-----------------') for node in get_sorted_nodes(): checks = self.node_checks[node] - recalcs = self.node_recalcs[node] + locks = self.node_locks[node] grants = self.node_grants_sent[node] - recalc_time = self.node_recalc_time[node] + lock_time = self.node_lock_time[node] grant_time = self.node_grant_send_time[node] - check_time = self.node_check_time[node] - recalc_time - grant_time + check_time = self.node_check_time[node] elapsed = traces[node]['elapsed_time'] print('%-10s %5.1f %5.2f %5.2f ' % (node, 1000*checks/elapsed, check_time/checks if checks else 0, check_time/elapsed), end='') - print('%5.2f %5.2f %5.2f ' % (recalcs/checks if checks else 0, - recalc_time/recalcs if recalcs else 0, - recalc_time/elapsed), end='') + print('%5.2f %5.2f %5.2f ' % (locks/checks if checks else 0, + lock_time/checks if checks else 0, + lock_time/elapsed), end='') print('%5.2f %5.2f %5.2f' % (grants/checks if checks else 0, grant_time/grants if grants else 0, grant_time/elapsed)) @@ -3399,7 +3398,7 @@ def rx_info(self, node, local_rpcs): if remaining == 1e20: result += '%12d ?? %6d' % (id, outstanding) else: - result += '%12d %6d %6d' % (id, remaining, outstanding) + result += '%12d %7d %6d' % (id, remaining, outstanding) return result def tx_info(self, node, local_rpcs): @@ -3430,7 +3429,7 @@ def tx_info(self, node, local_rpcs): if remaining == 1e20: result += '%12d ?? %6d' % (id, available) else: - result += '%12d %6d %6d' % (id, remaining, available) + result += '%12d %7d %6d' % (id, remaining, available) return result def analyze(self): @@ -3610,8 +3609,8 @@ def output(self): 'but data has\n') f.write('# not yet arrived\n') f.write('\n') - f.write(' Time Id1 Rem1 Grant1 ' 'Id2 Rem2 Grant2 Id3 Rem3 Grant3\n') + f.write(' Time Id1 Rem1 Grant1 ' 'Id2 Rem2 Grant2 Id3 Rem3 Grant3\n') for interval in intervals[name]: if not 'rx_grant_info' in interval: continue @@ -3634,8 +3633,8 @@ def output(self): 'but data has\n') f.write('# not yet been transmitted\n') f.write('\n') - f.write(' Time Id1 Rem1 Grant1 ' 'Id2 Rem2 Grant2 Id3 Rem3 Grant3\n') + f.write(' Time Id1 Rem1 Grant1 ' 'Id2 Rem2 Grant2 Id3 Rem3 Grant3\n') for interval in intervals[name]: if not 'tx_grant_info' in interval: continue From 0fa4c1b9c773b3bd51b5b6486b75c5316ad1b019 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 6 Jun 2025 13:32:38 -0700 Subject: [PATCH 355/625] Fix 2 races in homa_grant.c * The test in homa_grant_end_rpc for whether the RPC might still be in the grant data structures was unsafe: it could fail if the RPC was in the process of being moved from the grantable list to the active list. 
* rpc->msgin.rank was accessed in places where its value could have changed to -1, which would result in an erroneous priority level. --- homa_grant.c | 94 +++++++++++++++++++++++++----------------- homa_grant.h | 4 +- homa_rpc.h | 8 ++-- test/unit_homa_grant.c | 62 ++++++++++++++++++++-------- 4 files changed, 108 insertions(+), 60 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 943b2dfa..29f1a077 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -173,7 +173,7 @@ void homa_grant_end_rpc(struct homa_rpc *rpc) struct homa_grant *grant = rpc->hsk->homa->grant; struct homa_grant_candidates cand; - if (rpc->msgin.rank >= 0 || !list_empty(&rpc->grantable_links)) { + if (rpc->msgin.granted < rpc->msgin.length) { homa_grant_cand_init(&cand); homa_grant_unmanage_rpc(rpc, &cand); if (!homa_grant_cand_empty(&cand)) { @@ -421,8 +421,8 @@ void homa_grant_manage_rpc(struct homa_rpc *rpc) homa_grant_lock(grant); - INC_METRIC(grantable_rpcs_integral, grant->num_grantable_rpcs - * (time - grant->last_grantable_change)); + INC_METRIC(grantable_rpcs_integral, grant->num_grantable_rpcs * + (time - grant->last_grantable_change)); grant->last_grantable_change = time; grant->num_grantable_rpcs++; tt_record2("Incremented num_grantable_rpcs to %d, id %d", @@ -586,24 +586,30 @@ void homa_grant_update_incoming(struct homa_rpc *rpc, struct homa_grant *grant) } /** - * homa_grant_update_granted() - Compute a new grant offset for an RPC based - * on the state of that world as well as overall grant state. - * @rpc: RPC whose msgin.granted should be updated. Need not be locked. + * homa_grant_update_granted() - Compute a new grant offset for an RPC. + * @rpc: RPC whose msgin.granted should be updated. Must be locked by + * caller. * @grant: Information for managing grants. This function may set * incoming_hit_limit. - * Return: True means the offset was increased and a grant should be sent - * for the RPC. False means no grant should be sent. + * Return: >= 0 means the offset was increased and a grant should be + * sent for the RPC; the return value gives the priority to + * use in the grant. -1 means the grant offset was not changed + * and no grant should be sent. */ -bool homa_grant_update_granted(struct homa_rpc *rpc, struct homa_grant *grant) +int homa_grant_update_granted(struct homa_rpc *rpc, struct homa_grant *grant) + __must_hold(&rpc->bucket->lock) { - int received, new_grant_offset, incoming_delta, avl_incoming; + int received, new_grant_offset, incoming_delta, avl_incoming, rank; /* Don't increase the grant if the node has been slow to send * data already granted: no point in wasting grants on this * node. */ if (rpc->silent_ticks > 1) - return false; + return -1; + rank = READ_ONCE(rpc->msgin.rank); + if (rank < 0 || rpc->msgin.granted >= rpc->msgin.length) + return -1; received = rpc->msgin.length - rpc->msgin.bytes_remaining; new_grant_offset = received + grant->window; @@ -619,25 +625,34 @@ if (new_grant_offset <= rpc->msgin.granted) - return false; + return -1; rpc->msgin.granted = new_grant_offset; - return true; + + /* The reason we compute the priority here rather than, say, in + * homa_grant_send is that rpc->msgin.rank could change to -1 + * before homa_grant_send is invoked (it could change at any time, + * since we don't have homa->grant->lock; that's why READ_ONCE + * is used above). 
It's OK to still send a grant in that case, but + * we need to have a meaningful priority level for it. + */ + return homa_grant_priority(rpc->hsk->homa, rank); } /** * homa_grant_send() - Issue a GRANT packet for the current grant offset * of an incoming RPC. - * @rpc: RPC for which to issue GRANT. Should not be locked (to - * minimize lock contention, since sending a packet is slow), - * but caller must hold a reference to keep it from being reaped. - * The msgin.resend_all field will be cleared. + * @rpc: RPC for which to issue GRANT. Should not be locked (to + * minimize lock contention, since sending a packet is slow), + * but caller must hold a reference to keep it from being reaped. + * The msgin.resend_all field will be cleared. + * @priority: Priority level to use for the grant. */ -void homa_grant_send(struct homa_rpc *rpc) +void homa_grant_send(struct homa_rpc *rpc, int priority) { struct homa_grant_hdr grant; grant.offset = htonl(rpc->msgin.granted); - grant.priority = homa_grant_priority(rpc->hsk->homa, rpc->msgin.rank); + grant.priority = priority; grant.resend_all = rpc->msgin.resend_all; if (grant.resend_all) rpc->msgin.resend_all = 0; @@ -660,7 +675,7 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) { struct homa_grant *grant = rpc->hsk->homa->grant; struct homa_grant_candidates cand; - bool send_grant, limit, recalc; + bool limit, recalc; u64 now; int i; @@ -702,19 +717,16 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) * there are stalled RPCs that can now be granted. */ limit = atomic_xchg(&grant->incoming_hit_limit, false); - if (rpc->msgin.rank < 0 && !limit) { - /* Very fast path (Task 1 only). */ - homa_grant_update_incoming(rpc, grant); - goto done; - } now = homa_clock(); recalc = now >= READ_ONCE(grant->next_recalc); if (!recalc && !limit) { + int priority; + /* Fast path (Tasks 1 and 2). 
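+		 * This path executes under the RPC lock only; grant state is + * read and updated with atomics, so the global grant lock is + * not acquired here.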
*/ - send_grant = homa_grant_update_granted(rpc, grant); + priority = homa_grant_update_granted(rpc, grant); homa_grant_update_incoming(rpc, grant); - if (!send_grant) + if (priority < 0) goto done; homa_grant_cand_init(&cand); @@ -726,7 +738,7 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) */ homa_rpc_hold(rpc); homa_rpc_unlock(rpc); - homa_grant_send(rpc); + homa_grant_send(rpc, priority); if (!homa_grant_cand_empty(&cand)) homa_grant_cand_check(&cand, grant); homa_rpc_lock(rpc); @@ -959,22 +971,28 @@ void homa_grant_cand_check(struct homa_grant_candidates *cand, struct homa_grant *grant) { struct homa_rpc *rpc; + bool locked; + int priority; while (cand->removes < cand->inserts) { rpc = cand->rpcs[cand->removes & HOMA_CAND_MASK]; cand->removes++; homa_rpc_lock(rpc); - - if (rpc->state != RPC_DEAD && - homa_grant_update_granted(rpc, grant)) { - homa_grant_update_incoming(rpc, grant); - if (rpc->msgin.granted >= rpc->msgin.length) - homa_grant_unmanage_rpc(rpc, cand); - homa_rpc_unlock(rpc); - homa_grant_send(rpc); - } else { - homa_rpc_unlock(rpc); + locked = true; + + if (rpc->state != RPC_DEAD) { + priority = homa_grant_update_granted(rpc, grant); + if (priority >= 0) { + homa_grant_update_incoming(rpc, grant); + if (rpc->msgin.granted >= rpc->msgin.length) + homa_grant_unmanage_rpc(rpc, cand); + homa_rpc_unlock(rpc); + locked = false; + homa_grant_send(rpc, priority); + } } + if (locked) + homa_rpc_unlock(rpc); homa_rpc_put(rpc); } } diff --git a/homa_grant.h b/homa_grant.h index 61ae5edd..2be010ac 100644 --- a/homa_grant.h +++ b/homa_grant.h @@ -249,10 +249,10 @@ int homa_grant_priority(struct homa *homa, int rank); void homa_grant_remove_active(struct homa_rpc *rpc, struct homa_grant_candidates *cand); void homa_grant_remove_grantable(struct homa_rpc *rpc); -void homa_grant_send(struct homa_rpc *rpc); +void homa_grant_send(struct homa_rpc *rpc, int priority); void homa_grant_unmanage_rpc(struct homa_rpc *rpc, struct homa_grant_candidates *cand); -bool homa_grant_update_granted(struct homa_rpc *rpc, +int homa_grant_update_granted(struct homa_rpc *rpc, struct homa_grant *grant); void homa_grant_update_incoming(struct homa_rpc *rpc, struct homa_grant *grant); diff --git a/homa_rpc.h b/homa_rpc.h index d6756db3..79d6a351 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -166,7 +166,8 @@ struct homa_message_in { #ifndef __STRIP__ /* See strip.py */ /** * @rank: Position of this RPC in homa->grant->active_rpcs, or -1 - * if not in homa->grant->active_rpcs. Managed by homa_grant.c. + * if not in homa->grant->active_rpcs. Managed by homa_grant.c; + * unsafe to access unless holding homa->grant->lock. */ int rank; @@ -197,7 +198,7 @@ struct homa_message_in { u64 birth; /** @resend_all: if nonzero, set resend_all in the next grant packet. */ - __u8 resend_all; + u8 resend_all; #endif /* See strip.py */ }; @@ -363,7 +364,8 @@ struct homa_rpc { /** * @grantable_links: Used to link this RPC into peer->grantable_rpcs. * If this RPC isn't in peer->grantable_rpcs, this is an empty - * list pointing to itself. + * list pointing to itself. Must hold homa->grant->lock when + * accessing. */ struct list_head grantable_links; #endif /* See strip.py */ diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 19403095..d6ff2ae7 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -228,19 +228,27 @@ TEST_F(homa_grant, homa_grant_end_rpc__basics) unit_hook_register(grant_spinlock_hook); hook_spinlock_count = 0; - /* First call: RPC is managed. 
*/ homa_grant_end_rpc(rpc); EXPECT_EQ(-1, rpc->msgin.rank); EXPECT_EQ(1, hook_spinlock_count); EXPECT_EQ(-100, atomic_read(&self->homa.grant->total_incoming)); EXPECT_EQ(0, rpc->msgin.rec_incoming); +} +TEST_F(homa_grant, homa_grant_end_rpc__skip_cleanup_if_fully_granted) +{ + struct homa_rpc *rpc; + + rpc = test_rpc_init(self, 100, self->server_ip, 20000); + rpc->msgin.rec_incoming = 100; + rpc->msgin.granted = rpc->msgin.length; + EXPECT_EQ(0, rpc->msgin.rank); - /* Second call: RPC not managed, nothing to do. */ - hook_spinlock_count = 0; homa_grant_end_rpc(rpc); - EXPECT_EQ(0, hook_spinlock_count); + EXPECT_EQ(0, rpc->msgin.rank); + EXPECT_EQ(-100, atomic_read(&self->homa.grant->total_incoming)); + EXPECT_EQ(0, rpc->msgin.rec_incoming); } -TEST_F(homa_grant, homa_grant_end_rpc__call_cand_check) +TEST_F(homa_grant, homa_grant_end_rpc__activate_other_rpc) { struct homa_rpc *rpc1, *rpc2; @@ -852,7 +860,10 @@ TEST_F(homa_grant, homa_grant_update_granted__basics) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); - EXPECT_TRUE(homa_grant_update_granted(rpc, self->homa.grant)); + rpc->msgin.rank = 1; + self->homa.grant->num_active_rpcs = 4; + EXPECT_EQ(2, homa_grant_update_granted(rpc, self->homa.grant)); + self->homa.grant->num_active_rpcs = 0; EXPECT_EQ(10000, rpc->msgin.granted); EXPECT_EQ(0, atomic_read(&self->homa.grant->incoming_hit_limit)); } @@ -861,20 +872,37 @@ TEST_F(homa_grant, homa_grant_update_granted__rpc_idle) struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); rpc->silent_ticks = 2; - EXPECT_FALSE(homa_grant_update_granted(rpc, self->homa.grant)); + EXPECT_EQ(-1, homa_grant_update_granted(rpc, self->homa.grant)); + EXPECT_EQ(1000, rpc->msgin.granted); +} +TEST_F(homa_grant, homa_grant_update_granted__not_active) +{ + struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); + + rpc->msgin.rank = -1; + EXPECT_EQ(-1, homa_grant_update_granted(rpc, self->homa.grant)); EXPECT_EQ(1000, rpc->msgin.granted); } +TEST_F(homa_grant, homa_grant_update_granted__already_fully_granted) +{ + struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); + + rpc->msgin.rank = 2; + rpc->msgin.granted = rpc->msgin.length; + EXPECT_EQ(-1, homa_grant_update_granted(rpc, self->homa.grant)); +} TEST_F(homa_grant, homa_grant_update_granted__end_of_message) { struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); /* First call grants remaining bytes in message. */ rpc->msgin.bytes_remaining = 5000; - EXPECT_TRUE(homa_grant_update_granted(rpc, self->homa.grant)); + rpc->msgin.rank = 2; + EXPECT_EQ(0, homa_grant_update_granted(rpc, self->homa.grant)); EXPECT_EQ(20000, rpc->msgin.granted); /* Second call cannot grant anything additional. 
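	 * (msgin.granted now equals the 20000-byte message length, so * homa_grant_update_granted should return -1.)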
*/ - EXPECT_FALSE(homa_grant_update_granted(rpc, self->homa.grant)); + EXPECT_EQ(-1, homa_grant_update_granted(rpc, self->homa.grant)); } TEST_F(homa_grant, homa_grant_update_granted__insufficient_room_in_incoming) { @@ -883,7 +911,7 @@ TEST_F(homa_grant, homa_grant_update_granted__insufficient_room_in_incoming) rpc->msgin.bytes_remaining = 5000; rpc->msgin.rank = 5; atomic_set(&self->homa.grant->total_incoming, 48000); - EXPECT_TRUE(homa_grant_update_granted(rpc, self->homa.grant)); + EXPECT_EQ(0, homa_grant_update_granted(rpc, self->homa.grant)); EXPECT_EQ(17000, rpc->msgin.granted); } TEST_F(homa_grant, homa_grant_update_granted__incoming_overcommitted) @@ -891,7 +919,8 @@ TEST_F(homa_grant, homa_grant_update_granted__incoming_overcommitted) struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); atomic_set(&self->homa.grant->total_incoming, 51000); - EXPECT_FALSE(homa_grant_update_granted(rpc, self->homa.grant)); + rpc->msgin.rank = 2; + EXPECT_EQ(-1, homa_grant_update_granted(rpc, self->homa.grant)); EXPECT_EQ(1000, rpc->msgin.granted); EXPECT_EQ(1, atomic_read(&self->homa.grant->incoming_hit_limit)); } @@ -902,10 +931,9 @@ TEST_F(homa_grant, homa_grant_send__basics) mock_xmit_log_verbose = 1; rpc->msgin.granted = 2600; - rpc->msgin.rank = 2; unit_log_clear(); - homa_grant_send(rpc); - EXPECT_SUBSTR("id 100, offset 2600, grant_prio 0", unit_log_get()); + homa_grant_send(rpc, 3); + EXPECT_SUBSTR("id 100, offset 2600, grant_prio 3", unit_log_get()); } TEST_F(homa_grant, homa_grant_send__resend_all) { @@ -916,8 +944,8 @@ TEST_F(homa_grant, homa_grant_send__resend_all) rpc->msgin.rank = 0; rpc->msgin.resend_all = 1; unit_log_clear(); - homa_grant_send(rpc); - EXPECT_SUBSTR("id 100, offset 9999, grant_prio 0, resend_all", + homa_grant_send(rpc, 1); + EXPECT_SUBSTR("id 100, offset 9999, grant_prio 1, resend_all", unit_log_get()); EXPECT_EQ(0, rpc->msgin.resend_all); } @@ -1296,7 +1324,7 @@ TEST_F(homa_grant, homa_grant_cand_check__rpc_becomes_fully_granted) unit_log_clear(); homa_grant_cand_check(&cand, self->homa.grant); - EXPECT_STREQ("xmit GRANT 20000@1; xmit GRANT 10000@0", unit_log_get()); + EXPECT_STREQ("xmit GRANT 20000@0; xmit GRANT 10000@0", unit_log_get()); EXPECT_EQ(-1, rpc1->msgin.rank); EXPECT_EQ(0, rpc2->msgin.rank); EXPECT_EQ(2, cand.removes); From f9d2e45b65bcc8e6693a0b61bf33d6c219833b28 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 6 Jun 2025 15:38:47 -0700 Subject: [PATCH 356/625] Add check for socket shutdown in homa_rpc_handoff --- homa_incoming.c | 4 ++++ test/unit_homa_incoming.c | 14 ++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/homa_incoming.c b/homa_incoming.c index a1728fbf..6b7918bd 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -1263,6 +1263,10 @@ void homa_rpc_handoff(struct homa_rpc *rpc) * otherwise enqueue it. 
*/ homa_sock_lock(hsk); + if (hsk->shutdown) { + homa_sock_unlock(hsk); + return; + } if (!list_empty(&hsk->interests)) { #ifndef __STRIP__ /* See strip.py */ interest = homa_choose_interest(hsk); diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 3ece4ab6..10fc2d03 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -2577,6 +2577,20 @@ TEST_F(homa_incoming, homa_rpc_handoff__private_rpc) EXPECT_TRUE(list_empty(&self->hsk.ready_rpcs)); homa_interest_unlink_private(&interest); } +TEST_F(homa_incoming, homa_rpc_handoff__socket_shutdown) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 20000, 1600); + + ASSERT_NE(NULL, crpc); + unit_log_clear(); + + self->hsk.shutdown = 1; + homa_rpc_handoff(crpc); + self->hsk.shutdown = 0; + EXPECT_TRUE(list_empty(&self->hsk.ready_rpcs)); +} TEST_F(homa_incoming, homa_rpc_handoff__handoff_to_shared_interest) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, From 7eefa06a3e1911c053470cff3c75ca7269bc990e Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 6 Jun 2025 15:39:33 -0700 Subject: [PATCH 357/625] Use destroy function from struct proto properly Previously, Homa was releasing socket resources in homa_sock_shutdown. This could cause crashes if there was ongoing socket activity at the time homa_sock_shutdown was invoked. --- homa_plumbing.c | 12 +++++---- homa_sock.c | 52 +++++++++++++++++++++++---------------- homa_sock.h | 2 +- test/unit_homa_incoming.c | 10 ++++---- test/unit_homa_outgoing.c | 10 ++++---- test/unit_homa_peer.c | 6 ++--- test/unit_homa_plumbing.c | 18 +++++++------- test/unit_homa_rpc.c | 8 +++--- test/unit_homa_sock.c | 51 ++++++++++++++++++++------------------ test/unit_homa_utils.c | 2 +- test/utils.c | 11 +++++++++ test/utils.h | 7 +++--- 12 files changed, 108 insertions(+), 81 deletions(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index ea0ae412..fe0b392f 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -102,7 +102,7 @@ static struct proto homa_prot = { .connect = ip4_datagram_connect, .ioctl = homa_ioctl, .init = homa_socket, - .destroy = NULL, + .destroy = homa_sock_destroy, .setsockopt = homa_setsockopt, .getsockopt = homa_getsockopt, .sendmsg = homa_sendmsg, @@ -120,7 +120,7 @@ static struct proto homav6_prot = { .connect = ip6_datagram_connect, .ioctl = homa_ioctl, .init = homa_socket, - .destroy = NULL, + .destroy = homa_sock_destroy, .setsockopt = homa_setsockopt, .getsockopt = homa_getsockopt, .sendmsg = homa_sendmsg, @@ -737,7 +737,7 @@ void homa_close(struct sock *sk, long timeout) int port = hsk->port; #endif/* See strip.py */ - homa_sock_destroy(hsk); + homa_sock_shutdown(hsk); sk_common_release(sk); tt_record1("closed socket, port %d", port); } @@ -842,8 +842,10 @@ int homa_socket(struct sock *sk) int result; result = homa_sock_init(hsk); - if (result != 0) - homa_sock_destroy(hsk); + if (result != 0) { + homa_sock_shutdown(hsk); + homa_sock_destroy(&hsk->sock); + } return result; } diff --git a/homa_sock.c b/homa_sock.c index 92e0a31d..99c61fca 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -39,7 +39,13 @@ void homa_socktab_destroy(struct homa_socktab *socktab, struct homa_net *hnet) hsk = homa_socktab_next(&scan)) { if (hnet && hnet != hsk->hnet) continue; - homa_sock_destroy(hsk); + + /* In actual use there should be no sockets left when this + * function is invoked, so the code below will never be + * invoked. However, it is useful during unit tests. 
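Taken together, the pieces of this commit split socket teardown into
two phases. A sketch of the resulting lifecycle (an editorial summary,
using only names that appear in the surrounding diffs):

	/* Phase 1, from homa_close(): stop all use of the socket. */
	homa_sock_shutdown(hsk);	/* abort waiters, disable send/recv */
	sk_common_release(sk);		/* drop the caller's reference */

	/* Phase 2: the networking core invokes homa_prot.destroy, i.e.
	 * homa_sock_destroy(sk), once that is safe; only then are dead
	 * RPCs reaped and the buffer pool freed.
	 */

Deferring phase 2 is what fixes the crash described in the commit
message: ongoing socket activity may still hold references across
homa_sock_shutdown, but resources are no longer freed out from under it.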
+ */ + homa_sock_shutdown(hsk); + homa_sock_destroy(&hsk->sock); } homa_socktab_end_scan(&scan); } @@ -189,6 +195,7 @@ int homa_sock_init(struct homa_sock *hsk) if (hnet->prev_default_port == starting_port) { spin_unlock_bh(&socktab->write_lock); hsk->shutdown = true; + hsk->homa = NULL; result = -EADDRNOTAVAIL; goto error; } @@ -252,16 +259,15 @@ void homa_sock_unlink(struct homa_sock *hsk) /** * homa_sock_shutdown() - Disable a socket so that it can no longer * be used for either sending or receiving messages. Any system calls - * currently waiting to send or receive messages will be aborted. + * currently waiting to send or receive messages will be aborted. This + * function will terminate any existing use of the socket, but it does + * not free up socket resources: that happens in homa_sock_destroy. * @hsk: Socket to shut down. */ void homa_sock_shutdown(struct homa_sock *hsk) { struct homa_interest *interest; struct homa_rpc *rpc; -#ifndef __STRIP__ /* See strip.py */ - int i = 0; -#endif /* See strip.py */ tt_record1("Starting shutdown for socket %d", hsk->port); homa_sock_lock(hsk); @@ -306,20 +312,34 @@ void homa_sock_shutdown(struct homa_sock *hsk) wake_up(&interest->wait_queue); } homa_sock_unlock(hsk); + tt_record1("Finished shutdown for socket %d", hsk->port); +} -#ifndef __STRIP__ /* See strip.py */ +/** + * homa_sock_destroy() - Release all of the internal resources associated + * with a socket; is invoked at time when that is safe (i.e., all references + * on the socket have been dropped). + * @hsk: Socket to destroy. + */ +void homa_sock_destroy(struct sock *sk) +{ + struct homa_sock *hsk = homa_sk(sk); + IF_NO_STRIP(int i = 0); + + if (!hsk->homa) + return; + + tt_record1("Starting to destroy socket %d", hsk->port); while (!list_empty(&hsk->dead_rpcs)) { homa_rpc_reap(hsk, true); +#ifndef __STRIP__ /* See strip.py */ i++; if (i == 5) { tt_record("Freezing because reap seems hung"); tt_freeze(); } - } -#else /* See strip.py */ - while (!list_empty(&hsk->dead_rpcs)) - homa_rpc_reap(hsk, 1000); #endif /* See strip.py */ + } WARN_ON_ONCE(refcount_read(&hsk->sock.sk_wmem_alloc) != 1); #ifdef __UNIT_TEST__ @@ -336,17 +356,7 @@ void homa_sock_shutdown(struct homa_sock *hsk) homa_pool_free(hsk->buffer_pool); hsk->buffer_pool = NULL; } - tt_record1("Finished shutdown for socket %d", hsk->port); -} - -/** - * homa_sock_destroy() - Destructor for homa_sock objects. This function - * only cleans up the parts of the object that are owned by Homa. - * @hsk: Socket to destroy. 
- */ -void homa_sock_destroy(struct homa_sock *hsk) -{ - homa_sock_shutdown(hsk); + tt_record1("Finished destroying socket %d", hsk->port); } /** diff --git a/homa_sock.h b/homa_sock.h index 60011add..8d04ae04 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -273,7 +273,7 @@ void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, #endif /* See strip.py */ int homa_sock_bind(struct homa_net *hnet, struct homa_sock *hsk, __u16 port); -void homa_sock_destroy(struct homa_sock *hsk); +void homa_sock_destroy(struct sock *sk); struct homa_sock *homa_sock_find(struct homa_net *hnet, __u16 port); int homa_sock_init(struct homa_sock *hsk); void homa_sock_shutdown(struct homa_sock *hsk); diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 10fc2d03..46ab4297 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -34,7 +34,7 @@ static void wait_hook4(char *id) if (hook_count != 0) return; if (hook_shutdown_hsk) - homa_sock_shutdown(hook_shutdown_hsk); + unit_sock_destroy(hook_shutdown_hsk); else homa_rpc_handoff(hook_rpc); } @@ -912,7 +912,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_ipv4) // Make sure the test uses IPv4. mock_ipv6 = false; - homa_sock_destroy(&self->hsk); + unit_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, self->hnet, 0); skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); @@ -929,7 +929,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_ipv6) // Make sure the test uses IPv6. mock_ipv6 = true; - homa_sock_destroy(&self->hsk); + unit_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, self->hnet, 0); skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); @@ -946,7 +946,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__server_not_enabled) // Make sure the test uses IPv4. mock_ipv6 = false; - homa_sock_destroy(&self->hsk); + unit_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, self->hnet, 0); self->hsk.is_server = false; @@ -964,7 +964,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__unknown_socket_free_many_packets) // Make sure the test uses IPv6. mock_ipv6 = true; - homa_sock_destroy(&self->hsk); + unit_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, self->hnet, 0); skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 1400); diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 72d24e63..734ff55e 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -305,7 +305,7 @@ TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__multiple_segments_tcp_hijacking) EXPECT_STREQ("DATA from 0.0.0.0:40001, dport 99, id 2, message_length 10000, offset 10000, data_length 1500, incoming 10000, extra segs 1500@11500 1500@13000 500@14500", homa_print_packet(skb, buffer, sizeof(buffer))); kfree_skb(skb); - homa_sock_destroy(&hsk); + unit_sock_destroy(&hsk); } TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__error_copying_data_hijacking_path) { @@ -715,7 +715,7 @@ TEST_F(homa_outgoing, __homa_xmit_control__ipv4_error) // Make sure the test uses IPv4. mock_ipv6 = false; - homa_sock_destroy(&self->hsk); + unit_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, self->hnet, self->client_port); srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, @@ -739,7 +739,7 @@ TEST_F(homa_outgoing, __homa_xmit_control__ipv6_error) // Make sure the test uses IPv6. 
mock_ipv6 = true; - homa_sock_destroy(&self->hsk); + unit_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, self->hnet, self->client_port); srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, @@ -967,7 +967,7 @@ TEST_F(homa_outgoing, __homa_xmit_data__ipv4_transmit_error) // Make sure the test uses IPv4. mock_ipv6 = false; - homa_sock_destroy(&self->hsk); + unit_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, self->hnet, self->client_port); crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, @@ -985,7 +985,7 @@ TEST_F(homa_outgoing, __homa_xmit_data__ipv6_transmit_error) // Make sure the test uses IPv6. mock_ipv6 = true; - homa_sock_destroy(&self->hsk); + unit_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, self->hnet, self->client_port); crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index 97fef956..467c9fc5 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -333,7 +333,7 @@ TEST_F(homa_peer, homa_peer_prefer_evict) EXPECT_EQ(0, homa_peer_prefer_evict(peertab, peer1, peer4)); EXPECT_EQ(1, homa_peer_prefer_evict(peertab, peer1, peer2)); - homa_sock_destroy(&hsk2); + unit_sock_destroy(&hsk2); homa_peer_free_net(hnet2); } @@ -799,7 +799,7 @@ TEST_F(homa_peer, homa_peer_get_dst__ipv4) // Make sure the test uses IPv4. mock_ipv6 = false; - homa_sock_destroy(&self->hsk); + unit_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, self->hnet, 0); struct homa_peer *peer = homa_peer_get(&self->hsk, @@ -821,7 +821,7 @@ TEST_F(homa_peer, homa_peer_get_dst__ipv6) // Make sure the test uses IPv6. mock_ipv6 = true; - homa_sock_destroy(&self->hsk); + unit_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, self->hnet, 0); struct homa_peer *peer = homa_peer_get(&self->hsk, &ip1111[0]); diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index d5907157..97e58579 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -150,7 +150,7 @@ TEST_F(homa_plumbing, homa_bind__version_mismatch) // Make sure the test uses IPv4. mock_ipv6 = false; - homa_sock_destroy(&self->hsk); + unit_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, self->hnet, 0); addr.sa_family = AF_INET6; sock.sk = &self->hsk.inet.sk; @@ -165,7 +165,7 @@ TEST_F(homa_plumbing, homa_bind__ipv6_address_too_short) // Make sure the test uses IPv6. mock_ipv6 = true; - homa_sock_destroy(&self->hsk); + unit_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, self->hnet, 0); addr.in6.sin6_family = AF_INET6; @@ -181,7 +181,7 @@ TEST_F(homa_plumbing, homa_bind__ipv6_ok) // Make sure the test uses IPv6. mock_ipv6 = true; - homa_sock_destroy(&self->hsk); + unit_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, self->hnet, 0); self->hsk.is_server = false; @@ -201,7 +201,7 @@ TEST_F(homa_plumbing, homa_bind__ipv4_address_too_short) // Make sure the test uses IPv4. mock_ipv6 = false; - homa_sock_destroy(&self->hsk); + unit_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, self->hnet, 0); addr.in4.sin_family = AF_INET; @@ -217,7 +217,7 @@ TEST_F(homa_plumbing, homa_bind__ipv4_ok) // Make sure the test uses IPv4. 
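A note on the three-line preambles that recur throughout these test
diffs: a socket's IP version is baked in when mock_sock_init() runs
(it determines fields such as ip_header_length), so a test that needs
a specific version must rebuild the fixture socket after setting
mock_ipv6, now via the combined helper:

	mock_ipv6 = false;				/* or true, for IPv6 */
	unit_sock_destroy(&self->hsk);			/* shutdown + destroy */
	mock_sock_init(&self->hsk, self->hnet, 0);	/* fresh socket */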
mock_ipv6 = false; - homa_sock_destroy(&self->hsk); + unit_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, self->hnet, 0); self->hsk.is_server = false; @@ -283,7 +283,7 @@ TEST_F(homa_plumbing, homa_socket__success) hsk.sock.sk_net.net = self->hnet->net; refcount_set(&hsk.sock.sk_wmem_alloc, 1); EXPECT_EQ(0, homa_socket(&hsk.sock)); - homa_sock_destroy(&hsk); + unit_sock_destroy(&hsk); } TEST_F(homa_plumbing, homa_socket__homa_sock_init_failure) { @@ -770,7 +770,7 @@ TEST_F(homa_plumbing, homa_recvmsg__normal_completion_ipv4) // Make sure the test uses IPv4. mock_ipv6 = false; - homa_sock_destroy(&self->hsk); + unit_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, self->hnet, 0); EXPECT_EQ(0, -homa_pool_get_pages(self->hsk.buffer_pool, 2, pages, 0)); @@ -801,7 +801,7 @@ TEST_F(homa_plumbing, homa_recvmsg__normal_completion_ipv6) // Make sure the test uses IPv6. mock_ipv6 = true; - homa_sock_destroy(&self->hsk); + unit_sock_destroy(&self->hsk); mock_sock_init(&self->hsk, self->hnet, 0); server_ip6 = unit_get_in_addr("1::3:5:7"); @@ -1223,7 +1223,7 @@ TEST_F(homa_plumbing, homa_poll__socket_shutdown) { struct socket sock = {.sk = &self->hsk.sock}; - homa_sock_shutdown(&self->hsk); + unit_sock_destroy(&self->hsk); EXPECT_EQ(POLLIN | POLLOUT | POLLWRNORM, homa_poll(NULL, &sock, NULL)); } TEST_F(homa_plumbing, homa_poll__socket_readable) diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 9b90203c..3848efa0 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -363,7 +363,7 @@ TEST_F(homa_rpc, homa_rpc_acked__basics) homa_rpc_acked(&hsk, self->client_ip, &ack); EXPECT_EQ(0, unit_list_length(&hsk.active_rpcs)); EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc)); - homa_sock_destroy(&hsk); + unit_sock_destroy(&hsk); } TEST_F(homa_rpc, homa_rpc_acked__lookup_socket) { @@ -381,7 +381,7 @@ TEST_F(homa_rpc, homa_rpc_acked__lookup_socket) homa_rpc_acked(&self->hsk, self->client_ip, &ack); EXPECT_EQ(0, unit_list_length(&hsk.active_rpcs)); EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc)); - homa_sock_destroy(&hsk); + unit_sock_destroy(&hsk); } TEST_F(homa_rpc, homa_rpc_acked__no_such_socket) { @@ -399,7 +399,7 @@ TEST_F(homa_rpc, homa_rpc_acked__no_such_socket) homa_rpc_acked(&hsk, self->client_ip, &ack); EXPECT_EQ(1, unit_list_length(&hsk.active_rpcs)); EXPECT_STREQ("OUTGOING", homa_symbol_for_state(srpc)); - homa_sock_destroy(&hsk); + unit_sock_destroy(&hsk); } TEST_F(homa_rpc, homa_rpc_acked__no_such_rpc) { @@ -417,7 +417,7 @@ TEST_F(homa_rpc, homa_rpc_acked__no_such_rpc) homa_rpc_acked(&hsk, self->client_ip, &ack); EXPECT_EQ(1, unit_list_length(&hsk.active_rpcs)); EXPECT_STREQ("OUTGOING", homa_symbol_for_state(srpc)); - homa_sock_destroy(&hsk); + unit_sock_destroy(&hsk); } TEST_F(homa_rpc, homa_rpc_end__basics) diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c index ab781ce8..dded29c3 100644 --- a/test/unit_homa_sock.c +++ b/test/unit_homa_sock.c @@ -118,10 +118,10 @@ TEST_F(homa_sock, homa_socktab_next) hsk = homa_socktab_next(&scan); EXPECT_EQ(NULL, hsk); EXPECT_EQ(0, mock_sock_holds); - homa_sock_destroy(&hsk1); - homa_sock_destroy(&hsk2); - homa_sock_destroy(&hsk3); - homa_sock_destroy(&hsk4); + unit_sock_destroy(&hsk1); + unit_sock_destroy(&hsk2); + unit_sock_destroy(&hsk3); + unit_sock_destroy(&hsk4); homa_socktab_end_scan(&scan); } @@ -152,7 +152,7 @@ TEST_F(homa_sock, homa_sock_init__cant_allocate_buffer_pool) mock_kmalloc_errors = 1; EXPECT_EQ(ENOMEM, -homa_sock_init(&sock)); - homa_sock_destroy(&sock); + unit_sock_destroy(&sock); } TEST_F(homa_sock, 
homa_sock_init__skip_port_in_use) { @@ -163,8 +163,8 @@ TEST_F(homa_sock, homa_sock_init__skip_port_in_use) mock_sock_init(&hsk3, self->hnet, 0); EXPECT_EQ(65535, hsk2.port); EXPECT_EQ(32769, hsk3.port); - homa_sock_destroy(&hsk2); - homa_sock_destroy(&hsk3); + unit_sock_destroy(&hsk2); + unit_sock_destroy(&hsk3); } TEST_F(homa_sock, homa_sock_init__all_ports_in_use) { @@ -177,9 +177,9 @@ TEST_F(homa_sock, homa_sock_init__all_ports_in_use) EXPECT_EQ(65534, hsk2.port); EXPECT_EQ(65535, hsk3.port); EXPECT_EQ(1, hsk4.shutdown); - homa_sock_destroy(&hsk2); - homa_sock_destroy(&hsk3); - homa_sock_destroy(&hsk4); + unit_sock_destroy(&hsk2); + unit_sock_destroy(&hsk3); + unit_sock_destroy(&hsk4); } TEST_F(homa_sock, homa_sock_init__ip_header_length) { @@ -191,8 +191,8 @@ TEST_F(homa_sock, homa_sock_init__ip_header_length) mock_sock_init(&hsk_v6, self->hnet, 0); EXPECT_EQ(sizeof(struct iphdr), hsk_v4.ip_header_length); EXPECT_EQ(sizeof(struct ipv6hdr), hsk_v6.ip_header_length); - homa_sock_destroy(&hsk_v4); - homa_sock_destroy(&hsk_v6); + unit_sock_destroy(&hsk_v4); + unit_sock_destroy(&hsk_v6); } #ifndef __STRIP__ /* See strip.py */ TEST_F(homa_sock, homa_sock_init__hijack_tcp) @@ -205,8 +205,8 @@ TEST_F(homa_sock, homa_sock_init__hijack_tcp) mock_sock_init(&hijack, self->hnet, 0); EXPECT_EQ(0, no_hijack.sock.sk_protocol); EXPECT_EQ(IPPROTO_TCP, hijack.sock.sk_protocol); - homa_sock_destroy(&hijack); - homa_sock_destroy(&no_hijack); + unit_sock_destroy(&hijack); + unit_sock_destroy(&no_hijack); } #endif /* See strip.py */ @@ -226,13 +226,13 @@ TEST_F(homa_sock, homa_sock_unlink__remove_from_map) sock_put(&hsk2.sock); sock_put(&hsk3.sock); - homa_sock_shutdown(&hsk2); + unit_sock_destroy(&hsk2); EXPECT_EQ(NULL, homa_sock_find(self->hnet, client2)); EXPECT_EQ(&hsk3, homa_sock_find(self->hnet, client3)); sock_put(&hsk3.sock); - homa_sock_shutdown(&hsk3); + unit_sock_destroy(&hsk3); EXPECT_EQ(NULL, homa_sock_find(self->hnet, client2)); EXPECT_EQ(NULL, homa_sock_find(self->hnet, client3)); @@ -251,6 +251,7 @@ TEST_F(homa_sock, homa_sock_shutdown__unlink_socket) homa_sock_shutdown(&hsk); EXPECT_EQ(NULL, homa_sock_find(self->hnet, client)); + homa_sock_destroy(&hsk.sock); } TEST_F(homa_sock, homa_sock_shutdown__already_shutdown) { @@ -277,6 +278,7 @@ TEST_F(homa_sock, homa_sock_shutdown__delete_rpcs) homa_sock_shutdown(&self->hsk); EXPECT_TRUE(self->hsk.shutdown); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); + homa_sock_destroy(&self->hsk.sock); } TEST_F(homa_sock, homa_sock_shutdown__wakeup_interests) { @@ -295,6 +297,7 @@ TEST_F(homa_sock, homa_sock_shutdown__wakeup_interests) EXPECT_EQ(NULL, interest2.rpc); EXPECT_TRUE(list_empty(&interest1.links)); EXPECT_STREQ("wake_up; wake_up", unit_log_get()); + homa_sock_destroy(&self->hsk.sock); } TEST_F(homa_sock, homa_sock_bind) @@ -320,11 +323,11 @@ TEST_F(homa_sock, homa_sock_bind) EXPECT_EQ(NULL, homa_sock_find(self->hnet, 110)); EXPECT_EQ(&self->hsk, homa_sock_find(self->hnet, 120)); sock_put(&self->hsk.sock); - homa_sock_destroy(&hsk2); + unit_sock_destroy(&hsk2); } TEST_F(homa_sock, homa_sock_bind__socket_shutdown) { - homa_sock_shutdown(&self->hsk); + unit_sock_destroy(&self->hsk); EXPECT_EQ(ESHUTDOWN, -homa_sock_bind(self->hnet, &self->hsk, 100)); } @@ -339,7 +342,7 @@ TEST_F(homa_sock, homa_sock_find__basics) EXPECT_EQ(&hsk2, homa_sock_find(self->hnet, hsk2.port)); sock_put(&hsk2.sock); EXPECT_EQ(NULL, homa_sock_find(self->hnet, hsk2.port + 1)); - homa_sock_destroy(&hsk2); + unit_sock_destroy(&hsk2); } TEST_F(homa_sock, 
homa_sock_find__same_port_in_different_hnets) { @@ -358,8 +361,8 @@ TEST_F(homa_sock, homa_sock_find__same_port_in_different_hnets) sock_put(&hsk1.sock); sock_put(&hsk2.sock); - homa_sock_destroy(&hsk1); - homa_sock_destroy(&hsk2); + unit_sock_destroy(&hsk1); + unit_sock_destroy(&hsk2); } TEST_F(homa_sock, homa_sock_find__long_hash_chain) @@ -388,9 +391,9 @@ TEST_F(homa_sock, homa_sock_find__long_hash_chain) 5*HOMA_SOCKTAB_BUCKETS + 13)); sock_put(&hsk4.sock); - homa_sock_destroy(&hsk2); - homa_sock_destroy(&hsk3); - homa_sock_destroy(&hsk4); + unit_sock_destroy(&hsk2); + unit_sock_destroy(&hsk3); + unit_sock_destroy(&hsk4); } #ifndef __STRIP__ /* See strip.py */ diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index ebd42808..e1b63b5d 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -156,7 +156,7 @@ TEST_F(homa_utils, homa_net_destroy__delete_sockets) EXPECT_EQ(1, hsk2.shutdown); EXPECT_EQ(0, hsk3.shutdown); - homa_sock_destroy(&hsk3); + unit_sock_destroy(&hsk3); } TEST_F(homa_utils, homa_net_destroy__delete_peers) { diff --git a/test/utils.c b/test/utils.c index 27f8a14d..6abfbd29 100644 --- a/test/utils.c +++ b/test/utils.c @@ -488,6 +488,17 @@ void unit_homa_destroy(struct homa *homa) /* Currently nothing to check. */ } +/** + * unit_sock_destroy() - Invoked by unit tests to cleanup and destroy + * a socket. + * @hsk: Socket to destroy. + */ +void unit_sock_destroy(struct homa_sock *hsk) +{ + homa_sock_shutdown(hsk); + homa_sock_destroy(&hsk->sock); +} + /** * unit_log_peers() - Return a count of the number of peers in the * homa_peertab for @homa (could also include peers from other homas). diff --git a/test/utils.h b/test/utils.h index 036fc49b..05b3f437 100644 --- a/test/utils.h +++ b/test/utils.h @@ -53,6 +53,9 @@ void unit_log_grantables(struct homa *homa); void unit_log_hashed_rpcs(struct homa_sock *hsk); void unit_log_message_out_packets(struct homa_message_out *message, int verbose); +void unit_log_skb_list(struct sk_buff_head *packets, + int verbose); +void unit_log_throttled(struct homa *homa); const char *unit_print_gaps(struct homa_rpc *rpc); struct homa_rpc *unit_server_rpc(struct homa_sock *hsk, @@ -61,9 +64,7 @@ struct homa_rpc struct in6_addr *client_ip, int client_port, int id, int req_length, int resp_length); -void unit_log_skb_list(struct sk_buff_head *packets, - int verbose); -void unit_log_throttled(struct homa *homa); +void unit_sock_destroy(struct homa_sock *hsk); void unit_teardown(void); /* Kludge to avoid including arpa/inet.h, which causes definition From f8888972335c54faa7435f9c6a7f1d9c36a58a6b Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 6 Jun 2025 16:23:44 -0700 Subject: [PATCH 358/625] Modify tt_printk to print entries most-recent first (again) In some situations the kernel doesn't seem to survive long enough to print the entire trace, even when it is shortened; better to get the most recent entries rather than the oldest ones. --- timetrace.c | 65 ++++++++++++++++++++++++++++++------------------ timetrace.h | 2 +- util/ttsyslog.py | 6 ++++- 3 files changed, 47 insertions(+), 26 deletions(-) diff --git a/timetrace.c b/timetrace.c index 227867e7..973906af 100644 --- a/timetrace.c +++ b/timetrace.c @@ -288,7 +288,7 @@ void tt_record_buf(struct tt_buffer *buffer, u64 timestamp, /** * tt_find_oldest() - This function is invoked when printing out the - * Timetrace: it finds the oldest event to print from each trace. + * timetrace: it finds the oldest event to print from each trace. 
* This will be events[0] if we never completely filled the buffer, * otherwise events[nextIndex+1]. This means we don't print the entry at * nextIndex; this is convenient because it simplifies boundary checks @@ -298,8 +298,9 @@ void tt_record_buf(struct tt_buffer *buffer, u64 timestamp, * complete, since there may have been events that were discarded). * @pos: Array with NPOS elements; will be filled in with the oldest * index in the trace for each core. + * Return: Time of oldest log entry that should be printed. */ -void tt_find_oldest(int *pos) +u64 tt_find_oldest(int *pos) { struct tt_buffer *buffer; u64 start_time = 0; @@ -330,6 +331,7 @@ void tt_find_oldest(int *pos) pos[i] = (pos[i] + 1) & (tt_buffer_size - 1); } } + return start_time; } /** @@ -681,19 +683,26 @@ void tt_print_file(char *path) /** * tt_printk() - Print the contents of the timetrace to the system log. * Useful in situations where the system is too unstable to extract a - * timetrace by reading /proc/timetrace. + * timetrace by reading /proc/timetrace. Note: the timetrace is printed + * most recent entry first (in the hopes that if buffer overflows + * disrupt the output, at least the most recent entries will be complete). */ void tt_printk(void) { - /* Index of the next entry to return from each tt_buffer. - * This array is too large to allocate on the stack, and we don't - * want to allocate space dynamically (this function could be - * called at a point where the world is going to hell). So, - * allocate the array statically, and only allow one concurrent - * call to this function. + /* Index of the oldest entry to return from each tt_buffer. This + * array is too large to allocate on the stack, and we don't want to + * allocate space dynamically (this function could be called at a + * point where the world is going to hell). So, allocate the array + * statically and only allow one concurrent call to this function. */ - static int pos[NR_CPUS]; + static int oldest[NR_CPUS]; static atomic_t active; + + /* Index of the next entry to consider from each tt_buffer, or -1 if + * all entries have been processed. + */ + static int pos[NR_CPUS]; + u64 start_time; int i; if (atomic_xchg(&active, 1)) { @@ -703,37 +712,42 @@ void tt_printk(void) if (!init) return; atomic_inc(&tt_freeze_count); - tt_find_oldest(pos); + start_time = tt_find_oldest(oldest); + for (i = 0; i < nr_cpu_ids; i++) { + if (oldest[i] == tt_buffers[i]->next_index) + pos[i] = -1; + else + pos[i] = (tt_buffers[i]->next_index - 1) & + (tt_buffer_size - 1); + } /* Limit the number of entries logged per core (logging too many * seems to cause entries to be lost). */ for (i = 0; i < nr_cpu_ids; i++) { - struct tt_buffer *buffer = tt_buffers[i]; - - if (((buffer->next_index - pos[i]) & (TT_BUF_SIZE - 1)) > 200) - pos[i] = (buffer->next_index - 200) & (TT_BUF_SIZE - 1); + if (((pos[i] - oldest[i]) & (TT_BUF_SIZE - 1)) > 200) + oldest[i] = (pos[i] - 200) & (TT_BUF_SIZE - 1); } - pr_err("cpu_khz: %u\n", cpu_khz); + pr_err("cpu_khz: %u, start: %llu\n", cpu_khz, start_time); /* Each iteration of this loop printk's one event. */ while (true) { - u64 earliest_time = ~0; + u64 latest_time = 0; struct tt_event *event; int current_core = -1; char msg[200]; - int i; - /* Check all the traces to find the earliest available event. */ + /* Check all the traces to find the latest available event. 
*/ for (i = 0; i < nr_cpu_ids; i++) { struct tt_buffer *buffer = tt_buffers[i]; + if (pos[i] == -1) + continue; event = &buffer->events[pos[i]]; - if (pos[i] != buffer->next_index && - event->timestamp < earliest_time) { + if (event->timestamp >= latest_time) { current_core = i; - earliest_time = event->timestamp; + latest_time = event->timestamp; } } if (current_core < 0) { @@ -741,8 +755,11 @@ void tt_printk(void) break; } event = &(tt_buffers[current_core]->events[pos[current_core]]); - pos[current_core] = (pos[current_core] + 1) - & (tt_buffer_size - 1); + if (pos[current_core] == oldest[current_core]) + pos[current_core] = -1; + else + pos[current_core] = (pos[current_core] - 1) + & (tt_buffer_size - 1); snprintf(msg, sizeof(msg), event->format, event->arg0, event->arg1, event->arg2, event->arg3); diff --git a/timetrace.h b/timetrace.h index 0cef6744..3856b195 100644 --- a/timetrace.h +++ b/timetrace.h @@ -108,7 +108,7 @@ void tt_set_temp(int *temp); void tt_dbg1(char *msg, ...); void tt_dbg2(char *msg, ...); void tt_dbg3(char *msg, ...); -void tt_find_oldest(int *pos); +u64 tt_find_oldest(int *pos); void tt_get_messages(char *buffer, size_t length); void tt_print_file(char *path); void tt_printk(void); diff --git a/util/ttsyslog.py b/util/ttsyslog.py index 0b212dcb..47316676 100755 --- a/util/ttsyslog.py +++ b/util/ttsyslog.py @@ -36,18 +36,22 @@ if len(sys.argv) > 1: f = open(sys.argv[1]) +lines = [] + for line in f: line = line.rstrip() if line.endswith('^M'): line = line[:-2] - # Ignore everything up until the initial line containing the clock speed. if cpu_ghz == None: match = re.match('.*cpu_khz: ([0-9.]+)', line) if match: cpu_ghz = float(match.group(1))*1e-06 continue + lines.append(line) + +for line in reversed(lines): match = re.match('.* ([0-9.]+) (\[C..\] .+)', line) if not match: continue From 41a732575b7fe7b8f49390aa87e17e8a25a7a3f0 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Sun, 8 Jun 2025 20:27:11 -0700 Subject: [PATCH 359/625] Add important recent changes to README.md --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 31bb7ade..e7ccff55 100644 --- a/README.md +++ b/README.md @@ -122,11 +122,17 @@ This repo contains an implementation of the Homa transport protocol as a Linux k - Homa exports a collection of configuration parameters through the sysctl mechanism. For details, see the man page `homa.7`. -## Significant changesKs +## Significant changes - May 2025: `homa_api.c` has been removed, so the functions `homa_abort`, `homa_reply`, `homa_replyv`, `homa_send`, and `homa_sendv` no longer exist. +- May 2025: added support for network namespaces. +- May 2025: reworked support for peers to cap peer memory usage. - April 2025: upgraded to Linux 6.13.9. +- April 2025: major refactoring of grant management (more efficient, + remove complexity that was causing an unending stream of bugs). +- March 2025: added memory cap on memory for outgoing messages: send + requests can block if memory limit is reached. - March 2025: implemented private RPCs, resulting in API changes. HOMA_RECVMSG_REQUEST and HOMA_RECVMSG_RESPONSE flags no longer exist and struct homa_sendmsg_args now has a flags field with one defined From 1f8691278e78faa76cd9809ad9b432cf7fcce5d9 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Sun, 8 Jun 2025 20:27:40 -0700 Subject: [PATCH 360/625] Replace __u16 with u16, __u8 with u8, etc. 
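The double-underscore fixed-width types (__u8, __u16, ...) are intended
for headers shared with user space; kernel-internal code is expected to
use the plain u8/u16/u32/u64 forms, and checkpatch flags the __-prefixed
variants outside uapi headers. This commit converts Homa's internal
declarations accordingly, for example:

	u16 prev_default_port;	/* previously: __u16 prev_default_port; */

There is no functional change: the plain forms are typedefs for the
same underlying types.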
--- homa_impl.h | 2 +- homa_pacer.c | 2 +- homa_plumbing.c | 4 ++-- homa_rpc.c | 2 +- homa_rpc.h | 4 ++-- homa_sock.c | 4 ++-- homa_sock.h | 8 ++++---- homa_wire.h | 16 ++++++++-------- 8 files changed, 21 insertions(+), 21 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 1a405115..43434ad5 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -511,7 +511,7 @@ struct homa_net { * @prev_default_port: The most recent port number assigned from * the range of default ports. */ - __u16 prev_default_port; + u16 prev_default_port; /** * @num_peers: The total number of struct homa_peers that exist diff --git a/homa_pacer.c b/homa_pacer.c index 465d236a..dbcd1a93 100644 --- a/homa_pacer.c +++ b/homa_pacer.c @@ -411,7 +411,7 @@ void homa_pacer_update_sysctl_deps(struct homa_pacer *pacer) homa_ns_to_cycles(pacer->max_nic_queue_ns); /* Underestimate link bandwidth (overestimate time) by 1%. */ - tmp = 101 * 8000 * (__u64)homa_clock_khz(); + tmp = 101 * 8000 * (u64)homa_clock_khz(); do_div(tmp, pacer->link_mbps * 100); pacer->cycles_per_mbyte = tmp; } diff --git a/homa_plumbing.c b/homa_plumbing.c index fe0b392f..e16747df 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -420,7 +420,7 @@ static struct ctl_table homa_ctl_table[] = { /* Sizes of the headers for each Homa packet type, in bytes. */ #ifndef __STRIP__ /* See strip.py */ -static __u16 header_lengths[] = { +static u16 header_lengths[] = { sizeof(struct homa_data_hdr), sizeof(struct homa_grant_hdr), sizeof(struct homa_resend_hdr), @@ -432,7 +432,7 @@ static __u16 header_lengths[] = { sizeof(struct homa_ack_hdr) }; #else /* See strip.py */ -static __u16 header_lengths[] = { +static u16 header_lengths[] = { sizeof(struct homa_data_hdr), 0, sizeof(struct homa_resend_hdr), diff --git a/homa_rpc.c b/homa_rpc.c index 47eab485..26c40f67 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -222,7 +222,7 @@ struct homa_rpc *homa_rpc_alloc_server(struct homa_sock *hsk, void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, struct homa_ack *ack) { - __u16 server_port = ntohs(ack->server_port); + u16 server_port = ntohs(ack->server_port); u64 id = homa_local_id(ack->client_id); struct homa_sock *hsk2 = hsk; struct homa_rpc *rpc; diff --git a/homa_rpc.h b/homa_rpc.h index 79d6a351..1f95276f 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -84,7 +84,7 @@ struct homa_message_out { * @sched_priority: Priority level to use for future scheduled * packets. */ - __u8 sched_priority; + u8 sched_priority; #endif /* See strip.py */ /** @@ -288,7 +288,7 @@ struct homa_rpc { struct homa_peer *peer; /** @dport: Port number on @peer that will handle packets. */ - __u16 dport; + u16 dport; /** * @id: Unique identifier for the RPC among all those issued diff --git a/homa_sock.c b/homa_sock.c index 99c61fca..29e4c3e1 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -371,7 +371,7 @@ void homa_sock_destroy(struct sock *sk) * Return: 0 for success, otherwise a negative errno. */ int homa_sock_bind(struct homa_net *hnet, struct homa_sock *hsk, - __u16 port) + u16 port) { struct homa_socktab *socktab = hnet->homa->socktab; struct homa_sock *owner; @@ -416,7 +416,7 @@ int homa_sock_bind(struct homa_net *hnet, struct homa_sock *hsk, * then this method has taken a reference on the socket and * the caller must call sock_put to release it. 
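In sketch form, the contract described above (matching how the unit
tests earlier in this series call the function):

	struct homa_sock *hsk;

	hsk = homa_sock_find(hnet, port);
	if (hsk) {
		/* ... hsk cannot be freed while the reference is held ... */
		sock_put(&hsk->sock);
	}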
*/ -struct homa_sock *homa_sock_find(struct homa_net *hnet, __u16 port) +struct homa_sock *homa_sock_find(struct homa_net *hnet, u16 port) { int bucket = homa_socktab_bucket(hnet, port); struct homa_sock *result = NULL; diff --git a/homa_sock.h b/homa_sock.h index 8d04ae04..447b253c 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -158,7 +158,7 @@ struct homa_sock { * @port: Port number: identifies this socket uniquely among all * those on this node. */ - __u16 port; + u16 port; /** * @is_server: True means that this socket can act as both client @@ -272,9 +272,9 @@ void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, u64 id); #endif /* See strip.py */ int homa_sock_bind(struct homa_net *hnet, struct homa_sock *hsk, - __u16 port); + u16 port); void homa_sock_destroy(struct sock *sk); -struct homa_sock *homa_sock_find(struct homa_net *hnet, __u16 port); +struct homa_sock *homa_sock_find(struct homa_net *hnet, u16 port); int homa_sock_init(struct homa_sock *hsk); void homa_sock_shutdown(struct homa_sock *hsk); void homa_sock_unlink(struct homa_sock *hsk); @@ -330,7 +330,7 @@ static inline void homa_sock_unlock(struct homa_sock *hsk) * Return: The index of the bucket in which a socket matching @hnet and * @port will be found (if it exists). */ -static inline int homa_socktab_bucket(struct homa_net *hnet, __u16 port) +static inline int homa_socktab_bucket(struct homa_net *hnet, u16 port) { #ifdef __UNIT_TEST__ return port & (HOMA_SOCKTAB_BUCKETS - 1); diff --git a/homa_wire.h b/homa_wire.h index ca627a99..cce00b64 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -116,7 +116,7 @@ struct homa_common_hdr { * @type: Homa packet type (one of the values of the homa_packet_type * enum). Corresponds to the low-order byte of the ack in TCP. */ - __u8 type; + u8 type; /** * @doff: High order 4 bits holds the number of 4-byte chunks in a @@ -124,7 +124,7 @@ struct homa_common_hdr { * must be in the same position as the data offset in a TCP header. * Used by TSO to determine where the replicated header portion ends. */ - __u8 doff; + u8 doff; #ifndef __STRIP__ /* See strip.py */ /** @@ -134,11 +134,11 @@ struct homa_common_hdr { * which TCP would never use together; must not include URG or FIN * (TSO will turn off FIN for all but the last segment). */ - __u8 flags; + u8 flags; #define HOMA_TCP_FLAGS 6 #else /* See strip.py */ /** @reserved1: Not used (corresponds to TCP flags). */ - __u8 reserved1; + u8 reserved1; #endif /* See strip.py */ /** @@ -343,7 +343,7 @@ struct homa_data_hdr { * @retransmit: 1 means this packet was sent in response to a RESEND * (it has already been sent previously). */ - __u8 retransmit; + u8 retransmit; char pad[3]; @@ -387,14 +387,14 @@ struct homa_grant_hdr { * MESSAGE_FRAG packets for this message, until a GRANT is received * with higher offset. Larger numbers indicate higher priorities. */ - __u8 priority; + u8 priority; /** * @resend_all: Nonzero means that the sender should resend all previously * transmitted data, starting at the beginning of the message (assume * that no packets have been successfully received). */ - __u8 resend_all; + u8 resend_all; } __packed; #endif /* See strip.py */ @@ -430,7 +430,7 @@ struct homa_resend_hdr { * The sender should transmit all the requested data using this * priority. 
*/ - __u8 priority; + u8 priority; #endif /* See strip.py */ } __packed; From a8bd8b0633bd4beb94b73d7fdea99caee2b71dac Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 9 Jun 2025 21:14:55 -0700 Subject: [PATCH 361/625] Replace grant->incoming_hit_limit with grant->stalled_rank This allowed homa_grant_check_rpc to be reworked in a way that reduces grant lock acquisitions by about 2x at high network load. This commit also adds new metrics grant_check_recalcs and grant_check_others, and replaces the grant_check_slow_path metric with grant_check_locked. --- homa_grant.c | 196 ++++++++++++++++++++++------------------ homa_grant.h | 10 ++- homa_metrics.c | 8 +- homa_metrics.h | 19 +++- test/unit_homa_grant.c | 199 +++++++++++++++++++++++++++-------------- util/tthoma.py | 97 ++++++++++++++------ 6 files changed, 339 insertions(+), 190 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 29f1a077..93436973 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -91,6 +91,7 @@ struct homa_grant *homa_grant_alloc(void) pr_err("%s couldn't allocate grant structure\n", __func__); return ERR_PTR(-ENOMEM); } + atomic_set(&grant->stalled_rank, INT_MAX); grant->max_incoming = 400000; spin_lock_init(&grant->lock); INIT_LIST_HEAD(&grant->grantable_peers); @@ -600,6 +601,7 @@ int homa_grant_update_granted(struct homa_rpc *rpc, struct homa_grant *grant) __must_hold(&rpc->bucket->lock) { int received, new_grant_offset, incoming_delta, avl_incoming, rank; + int prev_stalled; /* Don't increase the grant if the node has been slow to send * data already granted: no point in wasting grants on this @@ -618,10 +620,14 @@ int homa_grant_update_granted(struct homa_rpc *rpc, struct homa_grant *grant) incoming_delta = new_grant_offset - received - rpc->msgin.rec_incoming; avl_incoming = grant->max_incoming - atomic_read(&grant->total_incoming); if (avl_incoming < incoming_delta) { - atomic_set(&grant->incoming_hit_limit, 1); - tt_record3("insufficient headroom for grant: needed %d, available %d, used %d", - incoming_delta, avl_incoming, - atomic_read(&grant->total_incoming)); + tt_record4("insufficient headroom for grant for RPC id %d " + "(rank %d): desired incoming %d, shortfall %d", + rpc->id, rank, new_grant_offset - received, + incoming_delta - avl_incoming); + prev_stalled = atomic_read(&grant->stalled_rank); + while (prev_stalled > rank) + prev_stalled = atomic_cmpxchg(&grant->stalled_rank, + prev_stalled, rank); new_grant_offset -= incoming_delta - avl_incoming; } if (new_grant_offset <= rpc->msgin.granted) @@ -674,34 +680,39 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) __must_hold(&rpc->bucket->lock) { struct homa_grant *grant = rpc->hsk->homa->grant; + int needy_rank, stalled_rank, rank; struct homa_grant_candidates cand; - bool limit, recalc; + int locked = 0; u64 now; int i; - /* This function has 5 different tasks: - * 1. It updates variables tracking incoming data. - * 2. It generates new grant packets for @rpc if appropriate. This - * is the common case. - * 3. If total_incoming had been exhausted, but headroom is now - * available, it sends grants to the highest priority RPC that - * needs them, which may not be @rpc. - * 4. It occasionally sends grants to the oldest RPC as determined - * by the fifo_grant_fraction parameter. This is not currently - * implemented. - * 5. It occasionally scans active_rpcs to restore proper priority - * order. More on this below. + /* The challenge for this function is to minimize use of the grant + * lock, since that is global. 
Early versions of Homa acquired the + * grant lock on every call to this function, but that resulted in + * too much contention for the grant lock (especially at network + * speeds of 100 Gbps or more). * - * Tasks 3-5 require the global grant lock, but that lock is in - * danger of overload, particularly as network speeds increase. So, - * this function handles case 1 without acquiring the grant lock. - * Issuing a grant to @rpc may change its priority relative to other - * RPCs in active_rpcs, but we don't check for that in the common - * case, since it would require the grant lock. Instead, this function - * occasionally scans all the RPCs in active_rpcs to fix any priority - * inversions that may have developed. The interval for these scans - * is chosen so as not to create too much contention for the grant lock. - */ + * This implementation is designed in the hopes that most calls can + * follow a fast path that does not require the grant lock: just + * update grant state for @rpc and possibly issue a new grant for + * @rpc, without considering other RPCs. + * + * However, there are some situations where other RPCs must be + * considered: + * 1. If there are higher-priority RPCs that are stalled (they would + * like to issue grants but could not because @total_incoming + * was exceeded), then they must get first shot at any headroom + * that has become available. + * 2. The priority order of RPCs could change, if data packets arrive + * for lower priority RPCs but not for higher priority ones. + * Rather than checking every time data arrives (which would + * require the grant lock), we recheck the priorities at regular + * time intervals. + * 3. Occasionally we need to send grants to the oldest message (FIFO + * priority) in order to prevent starvation. + * + * Each of these situations requires the grant lock. + **/ if (rpc->msgin.length < 0 || rpc->msgin.num_bpages <= 0 || rpc->state == RPC_DEAD) @@ -712,74 +723,81 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) rpc->msgin.length); INC_METRIC(grant_check_calls, 1); - /* If there are RPCs stalled because total_incoming is too high, - * can't take the shortcut below: need to take the slow path in case - * there are stalled RPCs that can now be granted. - */ - limit = atomic_xchg(&grant->incoming_hit_limit, false); - + needy_rank = INT_MAX; now = homa_clock(); - recalc = now >= READ_ONCE(grant->next_recalc); - if (!recalc && !limit) { + homa_grant_update_incoming(rpc, grant); + if (now >= READ_ONCE(grant->next_recalc)) { + /* Situation 2. */ + locked = 1; + tt_record1("homa_grant_check_rpc acquiring grant lock to fix order (id %d)", + rpc->id); + homa_grant_lock(grant); + grant->next_recalc = now + grant->recalc_cycles; + needy_rank = homa_grant_fix_order(grant); + homa_grant_unlock(grant); + tt_record1("homa_grant_check_rpc released grant lock (id %d)", + rpc->id); + INC_METRIC(grant_check_recalcs, 1); + } + + rank = READ_ONCE(rpc->msgin.rank); + stalled_rank = atomic_xchg(&grant->stalled_rank, INT_MAX); + if (stalled_rank < needy_rank) + needy_rank = stalled_rank; + + if (rank <= needy_rank) { int priority; - /* Fast path (Tasks 1 and 2). */ + /* Fast path. */ priority = homa_grant_update_granted(rpc, grant); homa_grant_update_incoming(rpc, grant); - if (priority < 0) - goto done; + if (priority >= 0) { + homa_grant_cand_init(&cand); + if (rpc->msgin.granted >= rpc->msgin.length) + homa_grant_unmanage_rpc(rpc, &cand); + + /* Sending a grant is slow, so release the RPC lock while + * sending the grant to reduce contention. 
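A note on the stalled_rank update in homa_grant_update_granted above:
atomic_cmpxchg returns the value previously in memory, so the retry loop

	prev_stalled = atomic_read(&grant->stalled_rank);
	while (prev_stalled > rank)
		prev_stalled = atomic_cmpxchg(&grant->stalled_rank,
					      prev_stalled, rank);

exits only once the value read back is <= rank: either this CPU's
exchange installed rank (the next compare then returns rank itself), or
a concurrent CPU stored a smaller value, i.e. an even higher-priority
stalled RPC. The net effect is a lock-free atomic minimum, which is why
the code here can consume it with a single atomic_xchg.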
+ */ + homa_rpc_hold(rpc); + homa_rpc_unlock(rpc); + homa_grant_send(rpc, priority); + if (!homa_grant_cand_empty(&cand)) + homa_grant_cand_check(&cand, grant); + homa_rpc_lock(rpc); + homa_rpc_put(rpc); + } + } + if (needy_rank < INT_MAX && + atomic_read(&grant->total_incoming) < grant->max_incoming) { + /* Situations 1 and 2. */ homa_grant_cand_init(&cand); - if (rpc->msgin.granted >= rpc->msgin.length) - homa_grant_unmanage_rpc(rpc, &cand); - - /* Sending a grant is slow, so release the RPC lock while - * sending the grant to reduce contention. - */ - homa_rpc_hold(rpc); - homa_rpc_unlock(rpc); - homa_grant_send(rpc, priority); - if (!homa_grant_cand_empty(&cand)) + locked = 1; + tt_record3("homa_grant_check_rpc acquiring grant lock, needy_rank %d, id %d, num_active %d", + needy_rank, rpc->id, grant->num_active_rpcs); + homa_grant_lock(grant); + for (i = needy_rank; i < grant->num_active_rpcs; i++) { + struct homa_rpc *rpc2 = grant->active_rpcs[i]; + + if (rpc2->msgin.rec_incoming < grant->window && + rpc2->state != RPC_DEAD) + homa_grant_cand_add(&cand, rpc2); + } + homa_grant_unlock(grant); + tt_record1("homa_grant_check_rpc released grant lock (id %d)", + rpc->id); + if (!homa_grant_cand_empty(&cand)) { + homa_rpc_hold(rpc); + homa_rpc_unlock(rpc); homa_grant_cand_check(&cand, grant); - homa_rpc_lock(rpc); - homa_rpc_put(rpc); - goto done; - } - - INC_METRIC(grant_check_slow_path, 1); - homa_grant_update_incoming(rpc, grant); - tt_record1("homa_grant_check_rpc acquiring grant lock (id %d)", - rpc->id); - homa_grant_lock(grant); - if (recalc) { - /* Task 5. */ - grant->next_recalc = now + grant->recalc_cycles; - homa_grant_fix_order(grant); + homa_rpc_lock(rpc); + homa_rpc_put(rpc); + } + INC_METRIC(grant_check_others, 1); } - /* Tasks 3 and 5: search all active RPCs to find any that do - * not have a full window of grants. Then release the grant lock - * and send grants. - */ - homa_grant_cand_init(&cand); - for (i = 0; i < grant->num_active_rpcs; i++) { - struct homa_rpc *rpc2 = grant->active_rpcs[i]; - - if (rpc2->msgin.rec_incoming < grant->window && - rpc2->state != RPC_DEAD) - homa_grant_cand_add(&cand, rpc2); - } - homa_grant_unlock(grant); - tt_record1("homa_grant_check_rpc released grant lock (id %d)", - rpc->id); - if (!homa_grant_cand_empty(&cand)) { - homa_rpc_hold(rpc); - homa_rpc_unlock(rpc); - homa_grant_cand_check(&cand, grant); - homa_rpc_lock(rpc); - homa_rpc_put(rpc); - } - done: + INC_METRIC(grant_check_locked, locked); tt_record1("homa_grant_check_rpc finished with id %d", rpc->id); } @@ -787,11 +805,14 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) * homa_grant_fix_order() - This function scans all of the RPCS in * @active_rpcs and repairs any priority inversions that may exist. * @grant: Overall grant management information. + * Return: The new rank of the highest-priority RPC whose rank improved, + * or INT_MAX if no RPCs were promoted. 
*/ -void homa_grant_fix_order(struct homa_grant *grant) +int homa_grant_fix_order(struct homa_grant *grant) __must_hold(&grant->lock) { struct homa_rpc *rpc, *other; + int result = INT_MAX; int i, j; for (i = 1; i < grant->num_active_rpcs; i++) { @@ -804,9 +825,12 @@ void homa_grant_fix_order(struct homa_grant *grant) other->msgin.rank = j + 1; grant->active_rpcs[j] = rpc; rpc->msgin.rank = j; + if (j < result) + result = j; INC_METRIC(grant_priority_bumps, 1); } } + return result; } /** diff --git a/homa_grant.h b/homa_grant.h index 2be010ac..6507ea64 100644 --- a/homa_grant.h +++ b/homa_grant.h @@ -31,10 +31,12 @@ struct homa_grant { atomic_t total_incoming; /** - * @incoming_hit_limit: True means that one or more RPCs could - * not be fully granted because @total_incoming exceeded @max_incoming. + * @stalled_rank: rank of the highest-priority RPC (i.e., lowest + * rank) whose incoming message could not be fully granted because + * @total_incoming exceeded @max_incoming. INT_MAX means there are + * no stalled RPCs. */ - atomic_t incoming_hit_limit; + atomic_t stalled_rank; /** * @max_incoming: Homa will try to ensure that the total number of @@ -233,7 +235,7 @@ int homa_grant_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); void homa_grant_end_rpc(struct homa_rpc *rpc); void homa_grant_find_oldest(struct homa *homa); -void homa_grant_fix_order(struct homa_grant *grant); +int homa_grant_fix_order(struct homa_grant *grant); void homa_grant_free(struct homa_grant *grant); void homa_grant_init_rpc(struct homa_rpc *rpc, int unsched); struct homa_rpc diff --git a/homa_metrics.c b/homa_metrics.c index 1ca40ead..484d5717 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -321,8 +321,12 @@ char *homa_metrics_print(void) m->grantable_rpcs_integral); M("grant_check_calls %15llu Number of calls to homa_grant_check_rpc\n", m->grant_check_calls); - M("grant_check_slow_path %15llu Number of times homa_grant_check_rpc acquired grant lock\n", - m->grant_check_slow_path); + M("grant_check_locked %15llu Number of calls to homa_grant_check_rpc that acquired grant lock\n", + m->grant_check_locked); + M("grant_check_others %15llu Number of times homa_grant_check_rpc checked non-caller RPCs for grants\n", + m->grant_check_others); + M("grant_check_recalcs %15llu Number of times homa_grant_check_rpc updated grant priority order\n", + m->grant_check_recalcs); M("grant_priority_bumps %15llu Number of times an RPC moved up in the grant priority order\n", m->grant_priority_bumps); M("fifo_grants %15llu Grants issued using FIFO priority\n", diff --git a/homa_metrics.h b/homa_metrics.h index b1416d20..54090d63 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -490,10 +490,23 @@ struct homa_metrics { u64 grant_check_calls; /** - * @grant_check_slow_path: cumulative number of times - * homa_grant_check_rpc acquired the grant lock. + * @grant_check_locked: cumulative number of times an invocation of + * homa_grant_check_rpc acquired the grant lock at least once. */ - u64 grant_check_slow_path; + u64 grant_check_locked; + + /** + * @grant_check_recalcs: cumulative number of times that + * homa_grant_check_rpc verified and/or adjusted the priority of + * active RPCs. + */ + u64 grant_check_recalcs; + + /** + * @grant_check_others: cumulative number of times homa_grant_check_rpc + * checked other RPCs besides the invoking one for potential grants. 
+ */ + u64 grant_check_others; /** * @grant_priority_bumps: cumulative number of times the grant priority diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index d6ff2ae7..12841769 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -865,7 +865,7 @@ TEST_F(homa_grant, homa_grant_update_granted__basics) EXPECT_EQ(2, homa_grant_update_granted(rpc, self->homa.grant)); self->homa.grant->num_active_rpcs = 0; EXPECT_EQ(10000, rpc->msgin.granted); - EXPECT_EQ(0, atomic_read(&self->homa.grant->incoming_hit_limit)); + EXPECT_EQ(INT_MAX, atomic_read(&self->homa.grant->stalled_rank)); } TEST_F(homa_grant, homa_grant_update_granted__rpc_idle) { @@ -919,10 +919,22 @@ TEST_F(homa_grant, homa_grant_update_granted__incoming_overcommitted) struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); atomic_set(&self->homa.grant->total_incoming, 51000); + atomic_set(&self->homa.grant->stalled_rank, 3); rpc->msgin.rank = 2; EXPECT_EQ(-1, homa_grant_update_granted(rpc, self->homa.grant)); EXPECT_EQ(1000, rpc->msgin.granted); - EXPECT_EQ(1, atomic_read(&self->homa.grant->incoming_hit_limit)); + EXPECT_EQ(2, atomic_read(&self->homa.grant->stalled_rank)); +} +TEST_F(homa_grant, homa_grant_update_granted__incoming_overcommitted_but_stalled_doesnt_change) +{ + struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); + + atomic_set(&self->homa.grant->total_incoming, 51000); + atomic_set(&self->homa.grant->stalled_rank, 3); + rpc->msgin.rank = 4; + EXPECT_EQ(-1, homa_grant_update_granted(rpc, self->homa.grant)); + EXPECT_EQ(1000, rpc->msgin.granted); + EXPECT_EQ(3, atomic_read(&self->homa.grant->stalled_rank)); } TEST_F(homa_grant, homa_grant_send__basics) @@ -966,7 +978,27 @@ TEST_F(homa_grant, homa_grant_check_rpc__msgin_not_initialized) EXPECT_EQ(0, atomic_read(&self->homa.grant->total_incoming)); EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_calls); } -TEST_F(homa_grant, homa_grant_check_rpc__update_incoming_if_rpc_not_active) +TEST_F(homa_grant, homa_grant_check_rpc__rpc_dead) +{ + struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, + self->client_ip, self->server_ip, self->server_port, + 100, 1000, 20000); + + homa_message_in_init(rpc, 20000, 0); + EXPECT_EQ(0, rpc->msgin.granted); + rpc->state = RPC_DEAD; + + unit_log_clear(); + homa_rpc_lock(rpc); + homa_grant_check_rpc(rpc); + homa_rpc_unlock(rpc); + EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_calls); + EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_locked); + EXPECT_EQ(0, rpc->msgin.granted); + rpc->state = RPC_INCOMING; +} +TEST_F(homa_grant, homa_grant_check_rpc__update_incoming_even_if_rpc_no_longer_active) { struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, @@ -982,23 +1014,39 @@ TEST_F(homa_grant, homa_grant_check_rpc__update_incoming_if_rpc_not_active) EXPECT_STREQ("", unit_log_get()); EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_calls); EXPECT_EQ(900, atomic_read(&self->homa.grant->total_incoming)); - EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_slow_path); + EXPECT_EQ(0, rpc->msgin.rec_incoming); + EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_locked); } -TEST_F(homa_grant, homa_grant_check_rpc__skip_shortcut_if_incoming_hit_limit) +TEST_F(homa_grant, homa_grant_check_rpc__fix_order) { - struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->server_port, - 100, 1000, 2000); + struct homa_rpc *rpc1, *rpc2, *rpc3; - 
homa_message_in_init(rpc, 2000, 0); - EXPECT_EQ(0, rpc->msgin.rank); - rpc->msgin.rank = -1; - atomic_set(&self->homa.grant->incoming_hit_limit, 1); - homa_rpc_lock(rpc); - homa_grant_check_rpc(rpc); - homa_rpc_unlock(rpc); - EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_slow_path); - EXPECT_EQ(0, atomic_read(&self->homa.grant->incoming_hit_limit)); + rpc1 = test_rpc_init(self, 100, self->server_ip, 20000); + rpc2 = test_rpc_init(self, 102, self->server_ip, 30000); + rpc3 = test_rpc_init(self, 104, self->server_ip, 40000); + EXPECT_EQ(2, rpc3->msgin.rank); + rpc3->msgin.granted = 25000; + rpc3->msgin.bytes_remaining = 15000; + atomic_set(&self->homa.grant->total_incoming, + self->homa.grant->max_incoming - 15000); + mock_clock = self->homa.grant->next_recalc; + + unit_log_clear(); + homa_rpc_lock(rpc2); + homa_grant_check_rpc(rpc2); + homa_rpc_unlock(rpc2); + EXPECT_STREQ("xmit GRANT 35000@2; xmit GRANT 5000@1", unit_log_get()); + EXPECT_EQ(5000, rpc1->msgin.granted); + EXPECT_EQ(0, rpc2->msgin.granted); + EXPECT_EQ(35000, rpc3->msgin.granted); + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("active[0]: id 104 ungranted 5000; " + "active[1]: id 100 ungranted 15000; " + "active[2]: id 102 ungranted 30000", unit_log_get()); + EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_locked); + EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_recalcs); + EXPECT_EQ(40000, self->homa.grant->next_recalc); } TEST_F(homa_grant, homa_grant_check_rpc__fast_path) { @@ -1015,6 +1063,9 @@ TEST_F(homa_grant, homa_grant_check_rpc__fast_path) homa_grant_check_rpc(rpc); EXPECT_STREQ("xmit GRANT 10000@0", unit_log_get()); EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_calls); + EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_locked); + EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_recalcs); + EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_others); EXPECT_EQ(10000, rpc->msgin.granted); /* Second call doesn't issue a grant (nothing has changed). 
*/ @@ -1023,7 +1074,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__fast_path) homa_rpc_unlock(rpc); EXPECT_STREQ("", unit_log_get()); EXPECT_EQ(2, homa_metrics_per_cpu()->grant_check_calls); - EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_slow_path); + EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_locked); EXPECT_EQ(10000, rpc->msgin.granted); } TEST_F(homa_grant, homa_grant_check_rpc__fast_path_grants_to_end_of_message) @@ -1040,39 +1091,31 @@ TEST_F(homa_grant, homa_grant_check_rpc__fast_path_grants_to_end_of_message) EXPECT_EQ(6000, rpc->msgin.granted); EXPECT_EQ(-1, rpc->msgin.rank); EXPECT_EQ(0, self->homa.grant->num_grantable_rpcs); - EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_slow_path); + EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_locked); } -TEST_F(homa_grant, homa_grant_check_rpc__fix_order) +TEST_F(homa_grant, homa_grant_check_rpc__fast_path_promote_other_message) { - struct homa_rpc *rpc1, *rpc2, *rpc3; + struct homa_rpc *rpc1, *rpc2; - rpc1 = test_rpc_init(self, 100, self->server_ip, 20000); - rpc2 = test_rpc_init(self, 102, self->server_ip, 30000); - rpc3 = test_rpc_init(self, 104, self->server_ip, 40000); - EXPECT_EQ(2, rpc3->msgin.rank); - rpc3->msgin.granted = 25000; - rpc3->msgin.bytes_remaining = 15000; - atomic_set(&self->homa.grant->total_incoming, - self->homa.grant->max_incoming - 15000); - mock_clock = self->homa.grant->next_recalc; + self->homa.grant->max_overcommit = 1; + rpc1 = test_rpc_init(self, 100, self->server_ip, 8000); + rpc2 = test_rpc_init(self, 102, self->server_ip, 25000); + EXPECT_EQ(0, rpc1->msgin.rank); + EXPECT_EQ(-1, rpc2->msgin.rank); unit_log_clear(); - homa_rpc_lock(rpc2); - homa_grant_check_rpc(rpc2); - homa_rpc_unlock(rpc2); - EXPECT_STREQ("xmit GRANT 35000@2; xmit GRANT 5000@1", unit_log_get()); - EXPECT_EQ(5000, rpc1->msgin.granted); - EXPECT_EQ(0, rpc2->msgin.granted); - EXPECT_EQ(35000, rpc3->msgin.granted); + homa_rpc_lock(rpc1); + homa_grant_check_rpc(rpc1); + homa_rpc_unlock(rpc1); + EXPECT_STREQ("xmit GRANT 8000@0; xmit GRANT 10000@0", unit_log_get()); + EXPECT_EQ(8000, rpc1->msgin.granted); + EXPECT_EQ(10000, rpc2->msgin.granted); unit_log_clear(); unit_log_grantables(&self->homa); - EXPECT_STREQ("active[0]: id 104 ungranted 5000; " - "active[1]: id 100 ungranted 15000; " - "active[2]: id 102 ungranted 30000", unit_log_get()); - EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_slow_path); - EXPECT_EQ(40000, self->homa.grant->next_recalc); + EXPECT_STREQ("active[0]: id 102 ungranted 15000", unit_log_get()); + EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_locked); } -TEST_F(homa_grant, homa_grant_check_rpc__incoming_hit_limit) +TEST_F(homa_grant, homa_grant_check_rpc__skip_fast_path_because_of_stalled_rank) { struct homa_rpc *rpc1, *rpc2, *rpc3; @@ -1081,24 +1124,38 @@ TEST_F(homa_grant, homa_grant_check_rpc__incoming_hit_limit) rpc3 = test_rpc_init(self, 104, self->server_ip, 40000); atomic_set(&self->homa.grant->total_incoming, self->homa.grant->max_incoming - 15000); - atomic_set(&self->homa.grant->incoming_hit_limit, 1); unit_log_clear(); - homa_rpc_lock(rpc1); - homa_grant_check_rpc(rpc1); - homa_rpc_unlock(rpc1); - EXPECT_STREQ("xmit GRANT 10000@2; xmit GRANT 5000@1", unit_log_get()); - EXPECT_EQ(10000, rpc1->msgin.granted); - EXPECT_EQ(5000, rpc2->msgin.granted); - EXPECT_EQ(0, rpc3->msgin.granted); + atomic_set(&self->homa.grant->stalled_rank, 1); + homa_rpc_lock(rpc3); + homa_grant_check_rpc(rpc3); + homa_rpc_unlock(rpc3); + EXPECT_STREQ("xmit GRANT 10000@1; xmit GRANT 5000@0", unit_log_get()); + 
EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_locked);
+	EXPECT_EQ(2, atomic_read(&self->homa.grant->stalled_rank));
+	EXPECT_EQ(0, rpc1->msgin.granted);
+	EXPECT_EQ(10000, rpc2->msgin.granted);
+	EXPECT_EQ(5000, rpc3->msgin.granted);
+	EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_others);
+}
+TEST_F(homa_grant, homa_grant_check_rpc__dont_check_needy_if_incoming_maxed)
+{
+	struct homa_rpc *rpc;
+
+	test_rpc_init(self, 100, self->server_ip, 20000);
+	test_rpc_init(self, 102, self->server_ip, 30000);
+	rpc = test_rpc_init(self, 104, self->server_ip, 40000);
+	atomic_set(&self->homa.grant->total_incoming,
+		   self->homa.grant->max_incoming);
+
 	unit_log_clear();
-	unit_log_grantables(&self->homa);
-	EXPECT_STREQ("active[0]: id 100 ungranted 10000; "
-		     "active[1]: id 102 ungranted 25000; "
-		     "active[2]: id 104 ungranted 40000", unit_log_get());
-	EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_slow_path);
-	EXPECT_EQ(20000, self->homa.grant->next_recalc);
-	EXPECT_EQ(1, atomic_read(&self->homa.grant->incoming_hit_limit));
+	atomic_set(&self->homa.grant->stalled_rank, 0);
+	homa_rpc_lock(rpc);
+	homa_grant_check_rpc(rpc);
+	homa_rpc_unlock(rpc);
+	EXPECT_STREQ("", unit_log_get());
+	EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_locked);
+	EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_others);
 }
 TEST_F(homa_grant, homa_grant_check_rpc__skip_rpc_with_too_much_incoming)
 {
@@ -1110,7 +1167,7 @@ TEST_F(homa_grant, homa_grant_check_rpc__skip_rpc_with_too_much_incoming)
 	rpc2->msgin.rec_incoming = 10000;
 	atomic_set(&self->homa.grant->total_incoming,
 		   self->homa.grant->max_incoming - 15000);
-	atomic_set(&self->homa.grant->incoming_hit_limit, 1);
+	atomic_set(&self->homa.grant->stalled_rank, 0);
 
 	homa_rpc_lock(rpc3);
 	homa_grant_check_rpc(rpc3);
@@ -1121,7 +1178,7 @@
-TEST_F(homa_grant, homa_grant_check_rpc__skip_dead_rpc)
+TEST_F(homa_grant, homa_grant_check_rpc__skip_dead_rpc_when_checking_needy)
 {
 	struct homa_rpc *rpc1, *rpc2, *rpc3;
 	int saved_state;
@@ -1133,7 +1190,7 @@
 	rpc2->state = RPC_DEAD;
 	atomic_set(&self->homa.grant->total_incoming,
 		   self->homa.grant->max_incoming - 15000);
-	atomic_set(&self->homa.grant->incoming_hit_limit, 1);
+	atomic_set(&self->homa.grant->stalled_rank, 0);
 
 	unit_log_clear();
 	homa_rpc_lock(rpc3);
@@ -1148,7 +1205,7 @@
 	EXPECT_STREQ("active[0]: id 100 ungranted 10000; "
		     "active[1]: id 102 ungranted 30000; "
		     "active[2]: id 104 ungranted 35000", unit_log_get());
-	EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_slow_path);
+	EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_locked);
 
 	rpc2->state = saved_state;
 }
@@ -1160,19 +1217,25 @@ TEST_F(homa_grant, homa_grant_fix_order)
 	test_rpc_init(self, 102, self->server_ip, 30000);
 	rpc3 = test_rpc_init(self, 104, self->server_ip, 40000);
 	rpc4 = test_rpc_init(self, 106, self->server_ip, 50000);
-	rpc3->msgin.granted = 25000;
-	rpc3->msgin.bytes_remaining = 15000;
+	rpc3->msgin.granted = 15000;
+	rpc3->msgin.bytes_remaining = 25000;
 	rpc4->msgin.granted = 26000;
 	rpc4->msgin.bytes_remaining = 24000;
 
-	homa_grant_fix_order(self->homa.grant);
+	EXPECT_EQ(1, homa_grant_fix_order(self->homa.grant));
 	unit_log_clear();
 	unit_log_grantables(&self->homa);
-	EXPECT_STREQ("active[0]: id 104 ungranted 15000; "
-		     "active[1]: id 100 ungranted 20000; "
-		     "active[2]: id 
106 ungranted 24000; " + EXPECT_STREQ("active[0]: id 100 ungranted 20000; " + "active[1]: id 106 ungranted 24000; " + "active[2]: id 104 ungranted 25000; " "active[3]: id 102 ungranted 30000", unit_log_get()); EXPECT_EQ(3, homa_metrics_per_cpu()->grant_priority_bumps); + + /* Second call: nothing changes. */ + EXPECT_EQ(INT_MAX, homa_grant_fix_order(self->homa.grant)); + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_EQ(3, homa_metrics_per_cpu()->grant_priority_bumps); } #if 0 diff --git a/util/tthoma.py b/util/tthoma.py index 44a31088..95f046ee 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -1464,14 +1464,27 @@ def __grant_check_done(self, trace, time, core, match, interests): 'regexp': 'homa_grant_check_rpc finished with id ([0-9]+)' }) - def __grant_check_lock(self, trace, time, core, match, interests): + def __grant_check_lock_recalc(self, trace, time, core, match, interests): id = int(match.group(1)) for interest in interests: - interest.tt_grant_check_lock(trace, time, core, id) + interest.tt_grant_check_lock_recalc(trace, time, core, id) patterns.append({ - 'name': 'grant_check_lock', - 'regexp': 'homa_grant_check_rpc acquiring grant lock \(id ([0-9]+)\)' + 'name': 'grant_check_lock_recalc', + 'regexp': 'homa_grant_check_rpc acquiring grant lock to fix order \(id ([0-9]+)\)' + }) + + def __grant_check_lock_needy(self, trace, time, core, match, interests): + rank = int(match.group(1)) + id = int(match.group(2)) + active = int(match.group(3)) + for interest in interests: + interest.tt_grant_check_lock_needy(trace, time, core, id, rank, active) + + patterns.append({ + 'name': 'grant_check_lock_needy', + 'regexp': 'homa_grant_check_rpc acquiring grant lock, needy_rank ' + '([0-9]+), id ([0-9]+), num_active ([0-9]+)' }) def __grant_check_unlock(self, trace, time, core, match, interests): @@ -3113,8 +3126,14 @@ def __init__(self, dispatcher): self.node_check_time = defaultdict(lambda : 0) # Node name -> total time spent acquiring and holding the grant lock - # while executing homa_grant_check_rpc on that node. - self.node_lock_time = defaultdict(lambda : 0) + # while validating/updating grant priorities in homa_grant_check_rpc + # on that node. + self.node_lock_recalc_time = defaultdict(lambda : 0) + + # Node name -> total time spent acquiring and holding the grant lock + # while checking RPCs other than the calling one in homa_grant_check_rpc + # on that node. + self.node_lock_needy_time = defaultdict(lambda : 0) # Node name -> total time spent sending grants during homa_grant_check_rpc. 
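         # (see tt_send_grant below)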
self.node_grant_send_time = defaultdict(lambda : 0)

@@ -3134,8 +3153,13 @@ def init_trace(self, trace):
         self.core_check_start = {}
 
         # Core -> time when homa_grant_check_rpc started acquiring the
-        # grant lock (if any)
-        self.core_lock_start = {}
+        # grant lock to validate priority order (if any)
+        self.core_lock_recalc_start = {}
+
+        # Core -> time when homa_grant_check_rpc started acquiring the
+        # grant lock because RPCs other than the invoking one needed to
+        # be checked for possible grants (if any)
+        self.core_lock_needy_start = {}
 
         # Core -> time of first grant sent by current call to
         # homa_grant_check_rpc (only valid if homa_grant_check_rpc in progress)
@@ -3145,16 +3169,26 @@ def tt_grant_check_start(self, trace, t, core, id):
         self.node_checks[trace['node']] += 1
         self.core_check_start[core] = t
 
-    def tt_grant_check_lock(self, trace, t, core, id):
+    def tt_grant_check_lock_recalc(self, trace, t, core, id):
+        node = trace['node']
+        self.core_lock_recalc_start[core] = t
+
+    def tt_grant_check_lock_needy(self, trace, t, core, id, rank, active):
         node = trace['node']
-        self.core_lock_start[core] = t
+        self.core_lock_needy_start[core] = t
 
     def tt_grant_check_unlock(self, trace, t, core, id):
         node = trace['node']
-        if core in self.core_lock_start:
+        if core in self.core_lock_recalc_start:
             self.node_locks[node] += 1
-            self.node_lock_time[node] += t - self.core_lock_start[core]
-            del self.core_lock_start[core]
+            self.node_lock_recalc_time[node] += (t -
+                    self.core_lock_recalc_start[core])
+            del self.core_lock_recalc_start[core]
+        elif core in self.core_lock_needy_start:
+            self.node_locks[node] += 1
+            self.node_lock_needy_time[node] += (t -
+                    self.core_lock_needy_start[core])
+            del self.core_lock_needy_start[core]
 
     def tt_send_grant(self, trace, t, core, id, offset, priority, increment):
         if not core in self.core_check_start:
@@ -3179,36 +3213,45 @@ def print_grant_check_stats(self):
         print('Checks: Rate of calling homa_grant_check_rpc (k/sec)')
         print('CUsec: Average execution time in homa_grant_check_rpc')
         print('CCores: Average active cores in homa_grant_check_rpc')
-        print('LFrac: Fraction of calls to homa_grant_check_rpc that '
-                'acquired the grant lock')
-        print('LUsec: Average time spent acquiring/holding the grant '
-                'lock in homa_grant_check_rpc')
-        print('LCores: Average cores acquiring/hold the grant lock in '
-                'homa_grant_check_rpc')
+        print('LPer: Average # of acquisitions of the grant lock per call to ')
+        print('        homa_grant_check_rpc')
+        print('RUsec: Average time spent acquiring/holding the grant '
+                'lock for priority ')
+        print('        recalculations')
+        print('RCores: Average cores acquiring/holding the grant lock for '
+                'priority recalculation')
+        print('NUsec: Average time spent acquiring/holding the grant '
+                'lock while considering ')
+        print('        needy RPCs other than the calling one')
+        print('NCores: Average cores acquiring/holding the grant lock while '
+                'considering needy')
+        print('        RPCs other than the calling one')
         print('GPer: Average grants sent per call to homa_grant_check_rpc')
         print('GUsec: Average time to send a grant')
         print('GCores: Average cores actively sending grants from within '
                 'homa_grant_check_rpc')
         print('')
-        print('Node Checks CUsec CCores LFrac LUsec LCores '
-                'GPer GUSec GCores')
-        print('--------------------------------------------------'
-                '-----------------')
+        print('Node Checks CUsec CCores LPer RUsec RCores '
+                'NUsec NCores GPer GUsec GCores')
+        print('----------------------------------------------------'
+                '--------------------------------')
         for node in get_sorted_nodes():
             checks = self.node_checks[node]
             locks = self.node_locks[node]
             grants = self.node_grants_sent[node]
-            lock_time = self.node_lock_time[node]
+            recalc_time = self.node_lock_recalc_time[node]
+            needy_time = self.node_lock_needy_time[node]
             grant_time = self.node_grant_send_time[node]
             check_time = self.node_check_time[node]
             elapsed = traces[node]['elapsed_time']
             print('%-10s %5.1f %5.2f %5.2f ' % (node, 1000*checks/elapsed,
                     check_time/checks if checks else 0,
                     check_time/elapsed), end='')
-            print('%5.2f %5.2f %5.2f ' % (locks/checks if checks else 0,
-                    lock_time/checks if checks else 0,
-                    lock_time/elapsed), end='')
+            print('%5.2f %6.3f %5.2f %6.3f %5.2f ' % (locks/checks if checks else 0,
+                    recalc_time/checks if checks else 0, recalc_time/elapsed,
+                    needy_time/checks if checks else 0, needy_time/elapsed),
+                    end='')
             print('%5.2f %5.2f %5.2f' % (grants/checks if checks else 0,
                     grant_time/grants if grants else 0,
                     grant_time/elapsed))

From 0bb69534e0b5cbaec34878650d0fa8b395a0a9c6 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Tue, 10 Jun 2025 15:49:04 -0700
Subject: [PATCH 362/625] Change dist raw data from cdfs to weights

---
 util/dist.cc | 4393 +++++++++++++++++++++++++-------------------------
 util/dist.h  |   20 +
 2 files changed, 2212 insertions(+), 2201 deletions(-)

diff --git a/util/dist.cc b/util/dist.cc
index ed8271aa..eaa0f0ee 100644
--- a/util/dist.cc
+++ b/util/dist.cc
@@ -12,13 +12,6 @@
 
 #include "dist.h"
 
-/* Forward declarations for built-in CDFs. */
-extern dist_point_gen::cdf_point w1[];
-extern dist_point_gen::cdf_point w2[];
-extern dist_point_gen::cdf_point w3[];
-extern dist_point_gen::cdf_point w4[];
-extern dist_point_gen::cdf_point w5[];
-
 /**
 * dist_point_gen() - Constructor for the dist_point generator class. Sets the
 * distribution for the object, potentially merging buckets to reduce the
@@ -55,7 +48,7 @@ dist_point_gen::dist_point_gen(const char* dist, size_t max_length,
 		return;
 	}
 
-	cdf_point* points;
+	weight* points;
 	if (strcmp(dist, "w1") == 0) {
 		points = w1;
 	} else if (strcmp(dist, "w2") == 0) {
@@ -72,24 +65,36 @@
 		abort();
 	}
 
-	/* Reduce the set of points according to min_bucket_frac and
-	 * max_size_ratio.
+	/* Add up all the weights in the distribution. */
+	double total_weight = 0.0;
+	for (weight *p = points; p->length > 0; p++) {
+		total_weight += p->freq;
+	}
+
+	/* Convert from weights to cumulative fractions, and reduce the set
+	 * of points according to min_bucket_frac and max_size_ratio.
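+	 * (Hypothetical example: weights {100, 1.0}, {200, 1.0} and
+	 * {400, 2.0} sum to total_weight 4.0, so the cumulative fractions
+	 * become 0.25, 0.5 and 1.0 at lengths 100, 200 and 400.)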
*/ - for (cdf_point *p = points; ; p++) { + double cur_frac = 0.0; + dist_points.emplace_back(0, 0.0); + for (weight *p = points; ; p++) { + if (p->length == 0) { + dist_points.back().fraction = 1.0; + break; + } if (p->length >= max_length) { dist_points.emplace_back(max_length, 1.0); break; } - if (p->fraction >= 1.0) { + cur_frac += p->freq/total_weight; + if (cur_frac >= 1.0) { dist_points.emplace_back(p->length, 1.0); break; } - if (dist_points.empty() - || (p->fraction - dist_points.back().fraction - >= min_bucket_frac) - || (max_size_ratio*dist_points.back().length - < p[1].length)) { - dist_points.emplace_back(p->length, p->fraction); + if ((cur_frac - dist_points.back().fraction >= + min_bucket_frac) || + (max_size_ratio*dist_points.back().length < + p[1].length)) { + dist_points.emplace_back(p->length, cur_frac); } } @@ -210,2197 +215,2183 @@ std::vector dist_point_gen::cdf_fractions() const } /* - * The following arrays store CDFs for Workloads 1-5 from the Homa - * SIGCOMM paper. + * The following arrays store distributions for Workloads 1-5 from the Homa + * SIGCOMM paper. The first number is a message length and the second + * number indicates the relative frequency of messages of that length. + * The end of each array is indicated by an entry with 0 for both length + * and frequency. */ -dist_point_gen::cdf_point w1[] = { - {8, 0.31094}, - {9, 0.31931}, - {10, 0.32768}, - {11, 0.41757}, - {12, 0.41849}, - {13, 0.42175}, - {14, 0.44155}, - {15, 0.47}, - {16, 0.474821816501057}, - {17, 0.477306734916455}, - {18, 0.478598446159319}, - {19, 0.481284092023641}, - {20, 0.484111102839223}, - {21, 0.485579552767652}, - {22, 0.488630276714586}, - {23, 0.490214146264239}, - {24, 0.493502936247448}, - {25, 0.495209487027664}, - {26, 0.496958569155332}, - {27, 0.500587629388271}, - {28, 0.502469263537327}, - {29, 0.504396739229229}, - {30, 0.506370885197163}, - {31, 0.508392528537904}, - {32, 0.510462493543295}, - {33, 0.512581600464071}, - {34, 0.514750664204364}, - {35, 0.516970492945357}, - {36, 0.519241886696665}, - {37, 0.52156563577416}, - {38, 0.52394251920312}, - {39, 0.526373303045767}, - {40, 0.528858738652444}, - {41, 0.531399560835934}, - {43, 0.533996485968624}, - {44, 0.536650210002549}, - {45, 0.539361406412584}, - {47, 0.542130724063425}, - {48, 0.544958785001327}, - {50, 0.547846182171941}, - {51, 0.550793477066002}, - {53, 0.553801197295066}, - {54, 0.556869834099928}, - {56, 0.559999839794875}, - {57, 0.563191625151417}, - {59, 0.566445556725714}, - {61, 0.569761954134462}, - {63, 0.573141087284622}, - {65, 0.576583173562996}, - {67, 0.580088374992285}, - {69, 0.583656795360951}, - {71, 0.58728847733488}, - {73, 0.590983399559541}, - {75, 0.594741473762058}, - {77, 0.598562541863314}, - {79, 0.602446373110951}, - {82, 0.606392661244829}, - {84, 0.610401021707236}, - {87, 0.614470988910841}, - {89, 0.618602013578049}, - {92, 0.622793460166102}, - {95, 0.627044604392854}, - {97, 0.631354630878766}, - {100, 0.635722630921181}, - {103, 0.640147600417399}, - {106, 0.644628437953517}, - {110, 0.649163943076279}, - {113, 0.653752814765475}, - {116, 0.658393650124541}, - {120, 0.663084943307071}, - {123, 0.667825084696898}, - {127, 0.672612360359169}, - {131, 0.677444951779569}, - {135, 0.682320935908353}, - {139, 0.687238285525227}, - {143, 0.692194869940407}, - {147, 0.697188456046198}, - {151, 0.702216709732398}, - {156, 0.707277197677561}, - {161, 0.712367389526724}, - {165, 0.71748466046463}, - {170, 0.722626294191701}, - {175, 0.727789486308116}, - {181, 
0.73297134810924}, - {186, 0.738168910793445}, - {191, 0.743379130080985}, - {197, 0.748598891240071}, - {203, 0.753825014513709}, - {209, 0.759054260938132}, - {215, 0.764283338540864}, - {222, 0.769508908903652}, - {228, 0.774727594072556}, - {235, 0.779935983794678}, - {242, 0.785130643058093}, - {249, 0.790308119908749}, - {257, 0.795464953515364}, - {264, 0.800597682450724}, - {272, 0.805702853155265}, - {280, 0.810777028546545}, - {289, 0.815816796736054}, - {297, 0.820818779812929}, - {306, 0.825779642652527}, - {315, 0.830696101706428}, - {324, 0.835564933729451}, - {334, 0.840382984398504}, - {344, 0.845147176777796}, - {354, 0.84985451958491}, - {365, 0.854502115212638}, - {376, 0.859087167462276}, - {387, 0.863606988945202}, - {398, 0.86805900811114}, - {410, 0.872440775863436}, - {422, 0.876749971723942}, - {435, 0.880984409512793}, - {448, 0.885142042511289}, - {461, 0.889220968079412}, - {475, 0.893219431703032}, - {489, 0.897135830449671}, - {503, 0.900968715815669}, - {518, 0.904716795951764}, - {534, 0.908378937258383}, - {549, 0.911954165346268}, - {566, 0.915441665362468}, - {582, 0.91884078168609}, - {600, 0.922151017002482}, - {618, 0.925372030768756}, - {636, 0.928503637087566}, - {655, 0.931545802009937}, - {674, 0.934498640291545}, - {694, 0.937362411630225}, - {715, 0.940137516415517}, - {736, 0.942824491023827}, - {758, 0.945424002695147}, - {780, 0.947936844029309}, - {804, 0.950363927141405}, - {827, 0.952706277517256}, - {852, 0.954965027610696}, - {877, 0.957141410224929}, - {903, 0.959236751720322}, - {930, 0.961252465090754}, - {958, 0.963190042950019}, - {986, 0.965051050468866}, - {1015, 0.966837118301992}, - {1046, 0.968549935542785}, - {1077, 0.97019124274183}, - {1108, 0.971762825023174}, - {1141, 0.973266505330149}, - {1175, 0.974704137830182}, - {1210, 0.976077601505512}, - {1246, 0.977388793954181}, - {1283, 0.978639625422937}, - {1321, 0.97983201309107}, - {1360, 0.980967875621433}, - {1401, 0.982049127992267}, - {1442, 0.983077676620779}, - {1485, 0.984055414786898}, - {1529, 0.984984218363116}, - {1574, 0.985865941853998}, - {1621, 0.986702414746682}, - {1669, 0.987495438171598}, - {1719, 0.988246781870691}, - {1770, 0.988958181468629}, - {1822, 0.989631336040849}, - {1876, 0.990267905970848}, - {1932, 0.990869511087789}, - {1989, 0.991437729074415}, - {2048, 0.991974094134257}, - {2109, 0.992480095906333}, - {2172, 0.992957178614888}, - {2236, 0.993406740441211}, - {2303, 0.993830133104215}, - {2371, 0.994228661636218}, - {2441, 0.994603584340256}, - {2514, 0.994956112915281}, - {2588, 0.995287412735657}, - {2665, 0.995598603271606}, - {2744, 0.995890758637502}, - {2826, 0.996164908255274}, - {2909, 0.996422037620571}, - {2996, 0.996663089159829}, - {3085, 0.996888963166859}, - {3176, 0.997100518808131}, - {3270, 0.997298575186473}, - {3367, 0.997483912453533}, - {3467, 0.997657272961877}, - {3570, 0.997819362448284}, - {3676, 0.997970851240328}, - {3785, 0.998112375478974}, - {3897, 0.998244538350506}, - {4013, 0.99836791132166}, - {4132, 0.998483035372409}, - {4255, 0.998590422221387}, - {4381, 0.99869055553946}, - {4511, 0.998783892147428}, - {4645, 0.99887086319436}, - {4782, 0.998951875313439}, - {4924, 0.999027311752696}, - {5070, 0.999097533478328}, - {5221, 0.999162880248718}, - {5376, 0.999223671657584}, - {5535, 0.999280208145}, - {5699, 0.999332771975342}, - {5868, 0.999381628181432}, - {6043, 0.999427025474451}, - {6222, 0.999469197119345}, - {6406, 0.99950836177569}, - {6596, 0.999544724304136}, - {6792, 0.999578476538699}, - {6994, 
0.999609798025316}, - {7201, 0.999638856727198}, - {7415, 0.999665809697595}, - {7635, 0.999690803720715}, - {7861, 0.999713975921559}, - {8094, 0.999735454345555}, - {8334, 0.99975535850886}, - {8582, 0.999773799920301}, - {8836, 0.999790882575893}, - {9098, 0.999806703426944}, - {9368, 0.999821352822742}, - {9646, 0.99983491492882}, - {9932, 0.999847468121828}, - {10227, 0.999859085361988}, - {10530, 0.999869834544144}, - {10843, 0.999879778828365}, - {11164, 0.999888976951074}, - {11496, 0.999897483517634}, - {11837, 0.999905349277299}, - {12188, 0.999912621381435}, - {12549, 0.999919343625857}, - {12922, 0.999925556678125}, - {13305, 0.999931298290599}, - {13700, 0.999936603500036}, - {14106, 0.99994150481446}, - {14524, 0.999946032388038}, - {14955, 0.999950214184624}, - {15399, 0.999954076130654}, - {15856, 0.999957642257982}, - {16326, 0.999960934837285}, - {16810, 0.999963974502583}, - {17309, 0.999966780367418}, - {17822, 0.999969370133206}, - {18351, 0.999971760190245}, - {18896, 0.999973965711837}, - {19456, 0.99997600074196}, - {20033, 0.999977878276901}, - {20627, 0.999979610341234}, - {21239, 0.999981208058518}, - {21869, 0.999982681717046}, - {22518, 0.999984040830983}, - {23186, 0.999985294197189}, - {23874, 0.999986449948021}, - {24582, 0.999987515600377}, - {25311, 0.999988498101245}, - {26062, 0.999989403869978}, - {26835, 0.999990238837541}, - {27631, 0.999991008482917}, - {28451, 0.999991717866879}, - {29295, 0.999992371663312}, - {30164, 0.999992974188243}, - {31059, 0.99999352942676}, - {31980, 0.999994041057951}, - {32929, 0.999994512478005}, - {33905, 0.999994946821621}, - {34911, 0.999995346981819}, - {35947, 0.9999957156283}, - {37013, 0.999996055224422}, - {38111, 0.99999636804293}, - {39242, 0.999996656180495}, - {40406, 0.999996921571177}, - {41604, 0.999997165998873}, - {42838, 0.999997391108826}, - {44109, 0.999997598418274}, - {45418, 0.999997789326289}, - {46765, 0.999997965122879}, - {48152, 0.999998126997391}, - {49580, 0.999998276046281}, - {51051, 0.999998413280298}, - {52565, 0.999998539631112}, - {54125, 0.99999865595744}, - {55730, 0.999998763050707}, - {57383, 0.999998861640266}, - {59086, 0.999998952398228}, - {60838, 0.99999903594391}, - {62643, 0.999999112847952}, - {64501, 0.99999918363611}, - {66415, 0.999999248792755}, - {68385, 0.99999930876411}, - {70413, 0.999999363961231}, - {72502, 0.999999414762756}, - {74653, 0.999999461517448}, - {76867, 0.999999504546531}, - {79147, 0.999999544145854}, - {81495, 0.999999580587881}, - {83912, 0.999999614123529}, - {86401, 0.999999644983861}, - {88964, 0.999999673381646}, - {91603, 0.999999699512802}, - {94321, 0.999999723557724}, - {97119, 0.999999745682501}, - {100000, 1.0}, +dist_point_gen::weight dist_point_gen::w1[] = { + {8, 0.310940000000}, + {9, 0.008370000000}, + {10, 0.008370000000}, + {11, 0.089890000000}, + {12, 0.000920000000}, + {13, 0.003260000000}, + {14, 0.019800000000}, + {15, 0.028450000000}, + {16, 0.004821816501}, + {17, 0.002484918415}, + {18, 0.001291711243}, + {19, 0.002685645864}, + {20, 0.002827010816}, + {21, 0.001468449928}, + {22, 0.003050723947}, + {23, 0.001583869550}, + {24, 0.003288789983}, + {25, 0.001706550780}, + {26, 0.001749082128}, + {27, 0.003629060233}, + {28, 0.001881634149}, + {29, 0.001927475692}, + {30, 0.001974145968}, + {31, 0.002021643341}, + {32, 0.002069965005}, + {33, 0.002119106921}, + {34, 0.002169063740}, + {35, 0.002219828741}, + {36, 0.002271393751}, + {37, 0.002323749077}, + {38, 0.002376883429}, + {39, 0.002430783843}, + {40, 0.002485435607}, + 
{41, 0.002540822183}, + {43, 0.002596925133}, + {44, 0.002653724034}, + {45, 0.002711196410}, + {47, 0.002769317651}, + {48, 0.002828060938}, + {50, 0.002887397171}, + {51, 0.002947294894}, + {53, 0.003007720229}, + {54, 0.003068636805}, + {56, 0.003130005695}, + {57, 0.003191785357}, + {59, 0.003253931574}, + {61, 0.003316397409}, + {63, 0.003379133150}, + {65, 0.003442086278}, + {67, 0.003505201429}, + {69, 0.003568420369}, + {71, 0.003631681974}, + {73, 0.003694922225}, + {75, 0.003758074203}, + {77, 0.003821068101}, + {79, 0.003883831248}, + {82, 0.003946288134}, + {84, 0.004008360462}, + {87, 0.004069967204}, + {89, 0.004131024667}, + {92, 0.004191446588}, + {95, 0.004251144227}, + {97, 0.004310026486}, + {100, 0.004368000042}, + {103, 0.004424969496}, + {106, 0.004480837536}, + {110, 0.004535505123}, + {113, 0.004588871689}, + {116, 0.004640835359}, + {120, 0.004691293183}, + {123, 0.004740141390}, + {127, 0.004787275662}, + {131, 0.004832591420}, + {135, 0.004875984129}, + {139, 0.004917349617}, + {143, 0.004956584415}, + {147, 0.004993586106}, + {151, 0.005028253686}, + {156, 0.005060487945}, + {161, 0.005090191849}, + {165, 0.005117270938}, + {170, 0.005141633727}, + {175, 0.005163192116}, + {181, 0.005181861801}, + {186, 0.005197562684}, + {191, 0.005210219288}, + {197, 0.005219761159}, + {203, 0.005226123274}, + {209, 0.005229246424}, + {215, 0.005229077603}, + {222, 0.005225570363}, + {228, 0.005218685169}, + {235, 0.005208389722}, + {242, 0.005194659263}, + {249, 0.005177476851}, + {257, 0.005156833607}, + {264, 0.005132728935}, + {272, 0.005105170705}, + {280, 0.005074175391}, + {289, 0.005039768190}, + {297, 0.005001983077}, + {306, 0.004960862840}, + {315, 0.004916459054}, + {324, 0.004868832023}, + {334, 0.004818050669}, + {344, 0.004764192379}, + {354, 0.004707342807}, + {365, 0.004647595628}, + {376, 0.004585052250}, + {387, 0.004519821483}, + {398, 0.004452019166}, + {410, 0.004381767752}, + {422, 0.004309195861}, + {435, 0.004234437789}, + {448, 0.004157632998}, + {461, 0.004078925568}, + {475, 0.003998463624}, + {489, 0.003916398747}, + {503, 0.003832885366}, + {518, 0.003748080136}, + {534, 0.003662141307}, + {549, 0.003575228088}, + {566, 0.003487500016}, + {582, 0.003399116324}, + {600, 0.003310235316}, + {618, 0.003221013766}, + {636, 0.003131606319}, + {655, 0.003042164922}, + {674, 0.002952838282}, + {694, 0.002863771339}, + {715, 0.002775104785}, + {736, 0.002686974608}, + {758, 0.002599511671}, + {780, 0.002512841334}, + {804, 0.002427083112}, + {827, 0.002342350376}, + {852, 0.002258750093}, + {877, 0.002176382614}, + {903, 0.002095341495}, + {930, 0.002015713370}, + {958, 0.001937577859}, + {986, 0.001861007519}, + {1015, 0.001786067833}, + {1046, 0.001712817241}, + {1077, 0.001641307199}, + {1108, 0.001571582281}, + {1141, 0.001503680307}, + {1175, 0.001437632500}, + {1210, 0.001373463675}, + {1246, 0.001311192449}, + {1283, 0.001250831469}, + {1321, 0.001192387668}, + {1360, 0.001135862530}, + {1401, 0.001081252371}, + {1442, 0.001028548629}, + {1485, 0.000977738166}, + {1529, 0.000928803576}, + {1574, 0.000881723491}, + {1621, 0.000836472893}, + {1669, 0.000793023425}, + {1719, 0.000751343699}, + {1770, 0.000711399598}, + {1822, 0.000673154572}, + {1876, 0.000636569930}, + {1932, 0.000601605117}, + {1989, 0.000568217987}, + {2048, 0.000536365060}, + {2109, 0.000506001772}, + {2172, 0.000477082709}, + {2236, 0.000449561826}, + {2303, 0.000423392663}, + {2371, 0.000398528532}, + {2441, 0.000374922704}, + {2514, 0.000352528575}, + {2588, 0.000331299820}, + 
{2665, 0.000311190536}, + {2744, 0.000292155366}, + {2826, 0.000274149618}, + {2909, 0.000257129365}, + {2996, 0.000241051539}, + {3085, 0.000225874007}, + {3176, 0.000211555641}, + {3270, 0.000198056378}, + {3367, 0.000185337267}, + {3467, 0.000173360508}, + {3570, 0.000162089486}, + {3676, 0.000151488792}, + {3785, 0.000141524239}, + {3897, 0.000132162872}, + {4013, 0.000123372971}, + {4132, 0.000115124051}, + {4255, 0.000107386849}, + {4381, 0.000100133318}, + {4511, 0.000093336608}, + {4645, 0.000086971047}, + {4782, 0.000081012119}, + {4924, 0.000075436439}, + {5070, 0.000070221726}, + {5221, 0.000065346770}, + {5376, 0.000060791409}, + {5535, 0.000056536487}, + {5699, 0.000052563830}, + {5868, 0.000048856206}, + {6043, 0.000045397293}, + {6222, 0.000042171645}, + {6406, 0.000039164656}, + {6596, 0.000036362528}, + {6792, 0.000033752235}, + {6994, 0.000031321487}, + {7201, 0.000029058702}, + {7415, 0.000026952970}, + {7635, 0.000024994023}, + {7861, 0.000023172201}, + {8094, 0.000021478424}, + {8334, 0.000019904163}, + {8582, 0.000018441411}, + {8836, 0.000017082656}, + {9098, 0.000015820851}, + {9368, 0.000014649396}, + {9646, 0.000013562106}, + {9932, 0.000012553193}, + {10227, 0.000011617240}, + {10530, 0.000010749182}, + {10843, 0.000009944284}, + {11164, 0.000009198123}, + {11496, 0.000008506567}, + {11837, 0.000007865760}, + {12188, 0.000007272104}, + {12549, 0.000006722244}, + {12922, 0.000006213052}, + {13305, 0.000005741612}, + {13700, 0.000005305209}, + {14106, 0.000004901314}, + {14524, 0.000004527574}, + {14955, 0.000004181797}, + {15399, 0.000003861946}, + {15856, 0.000003566127}, + {16326, 0.000003292579}, + {16810, 0.000003039665}, + {17309, 0.000002805865}, + {17822, 0.000002589766}, + {18351, 0.000002390057}, + {18896, 0.000002205522}, + {19456, 0.000002035030}, + {20033, 0.000001877535}, + {20627, 0.000001732064}, + {21239, 0.000001597717}, + {21869, 0.000001473659}, + {22518, 0.000001359114}, + {23186, 0.000001253366}, + {23874, 0.000001155751}, + {24582, 0.000001065652}, + {25311, 0.000000982501}, + {26062, 0.000000905769}, + {26835, 0.000000834968}, + {27631, 0.000000769645}, + {28451, 0.000000709384}, + {29295, 0.000000653796}, + {30164, 0.000000602525}, + {31059, 0.000000555239}, + {31980, 0.000000511631}, + {32929, 0.000000471420}, + {33905, 0.000000434344}, + {34911, 0.000000400160}, + {35947, 0.000000368646}, + {37013, 0.000000339596}, + {38111, 0.000000312819}, + {39242, 0.000000288138}, + {40406, 0.000000265391}, + {41604, 0.000000244428}, + {42838, 0.000000225110}, + {44109, 0.000000207309}, + {45418, 0.000000190908}, + {46765, 0.000000175797}, + {48152, 0.000000161875}, + {49580, 0.000000149049}, + {51051, 0.000000137234}, + {52565, 0.000000126351}, + {54125, 0.000000116326}, + {55730, 0.000000107093}, + {57383, 0.000000098590}, + {59086, 0.000000090758}, + {60838, 0.000000083546}, + {62643, 0.000000076904}, + {64501, 0.000000070788}, + {66415, 0.000000065157}, + {68385, 0.000000059971}, + {70413, 0.000000055197}, + {72502, 0.000000050802}, + {74653, 0.000000046755}, + {76867, 0.000000043029}, + {79147, 0.000000039599}, + {81495, 0.000000036442}, + {83912, 0.000000033536}, + {86401, 0.000000030860}, + {88964, 0.000000028398}, + {91603, 0.000000026131}, + {94321, 0.000000024045}, + {97119, 0.000000022125}, + {100000, 0.000000254317}, + {0, 0}, }; -dist_point_gen::cdf_point w2[] = { - {8, 0.170107627909139}, - {32, 0.189008475454599}, - {34, 0.202751220734426}, - {36, 0.216493966014254}, - {38, 0.230236711294082}, - {40, 0.24397945657391}, - {43, 
0.257722201853738}, - {46, 0.271464947133566}, - {49, 0.285207692413394}, - {53, 0.298950437693222}, - {58, 0.31269318297305}, - {64, 0.326435928252878}, - {67, 0.327962652787335}, - {71, 0.329489377321793}, - {75, 0.331016101856251}, - {80, 0.332542826390709}, - {85, 0.334069550925166}, - {91, 0.335596275459624}, - {98, 0.337122999994082}, - {107, 0.338649724528539}, - {116, 0.340176449062997}, - {128, 0.341703173597455}, - {135, 0.353916810301949}, - {142, 0.366130447006443}, - {151, 0.378344083710937}, - {160, 0.390557720415431}, - {171, 0.402771357119925}, - {183, 0.414984993824419}, - {197, 0.427198630528913}, - {213, 0.439412267233407}, - {233, 0.451625903937901}, - {256, 0.463839540642395}, - {269, 0.508495648577945}, - {284, 0.553151756513495}, - {301, 0.597807864449045}, - {320, 0.642463972384595}, - {341, 0.687120080320145}, - {366, 0.731776188255695}, - {394, 0.776432296191245}, - {427, 0.821088404126795}, - {465, 0.865744512062345}, - {512, 0.910400619997895}, - {539, 0.915171571649521}, - {569, 0.919942523301147}, - {602, 0.924713474952773}, - {640, 0.929484426604399}, - {683, 0.934255378256025}, - {731, 0.939026329907651}, - {788, 0.943797281559276}, - {853, 0.948568233210902}, - {931, 0.953339184862528}, - {1024, 0.958110136514154}, - {1078, 0.960400194140515}, - {1138, 0.962690251766876}, - {1205, 0.964980309393238}, - {1280, 0.967270367019599}, - {1365, 0.96956042464596}, - {1463, 0.971850482272321}, - {1575, 0.974140539898682}, - {1707, 0.976430597525043}, - {1862, 0.978720655151404}, - {2048, 0.981010712777766}, - {2156, 0.982441997677838}, - {2276, 0.983873282577911}, - {2409, 0.985304567477984}, - {2560, 0.986735852378057}, - {2731, 0.98816713727813}, - {2926, 0.989598422178203}, - {3151, 0.991029707078276}, - {3413, 0.992460991978349}, - {3724, 0.993892276878422}, - {4096, 0.995323561778495}, - {4312, 0.995633673903787}, - {4551, 0.99594378602908}, - {4819, 0.996253898154372}, - {5120, 0.996564010279664}, - {5461, 0.996874122404957}, - {5851, 0.997184234530249}, - {6302, 0.997494346655541}, - {6827, 0.997804458780834}, - {7447, 0.998114570906126}, - {8192, 0.998424683031419}, - {8623, 0.998472392503279}, - {9102, 0.998520101975139}, - {9638, 0.998567811446999}, - {10240, 0.998615520918859}, - {10923, 0.998663230390719}, - {11703, 0.998710939862579}, - {12603, 0.99875864933444}, - {13653, 0.9988063588063}, - {14895, 0.99885406827816}, - {16384, 0.99890177775002}, - {17246, 0.99892563248595}, - {18204, 0.99894948722188}, - {19275, 0.99897334195781}, - {20480, 0.99899719669374}, - {21845, 0.99902105142967}, - {23406, 0.9990449061656}, - {25206, 0.999068760901531}, - {27307, 0.999092615637461}, - {29789, 0.999116470373391}, - {32768, 0.999140325109321}, - {34493, 0.999161198026518}, - {36409, 0.999182070943715}, - {38551, 0.999202943860912}, - {40960, 0.99922381677811}, - {43691, 0.999244689695307}, - {46811, 0.999265562612504}, - {50412, 0.999286435529701}, - {54613, 0.999307308446898}, - {59578, 0.999328181364096}, - {65536, 0.999349054281293}, - {68985, 0.999401236574286}, - {72818, 0.999453418867279}, - {77101, 0.999505601160272}, - {81920, 0.999557783453265}, - {87381, 0.999609965746258}, - {93623, 0.999662148039251}, - {100825, 0.999714330332244}, - {109227, 0.999766512625237}, - {119156, 0.99981869491823}, - {131072, 0.999870877211223}, - {137971, 0.999881313669821}, - {145636, 0.99989175012842}, - {154202, 0.999902186587019}, - {163840, 0.999912623045617}, - {174763, 0.999923059504216}, - {187246, 0.999933495962815}, - {201649, 0.999943932421413}, - {218453, 
0.999954368880012}, - {238313, 0.99996480533861}, - {262144, 0.999975241797209}, - {275941, 0.999975987263521}, - {291271, 0.999976732729834}, - {308405, 0.999977478196146}, - {327680, 0.999978223662459}, - {349525, 0.999978969128771}, - {374491, 0.999979714595084}, - {403298, 0.999980460061396}, - {436907, 0.999981205527708}, - {476625, 0.999981950994021}, - {524288, 0.999982696460333}, - {573085, 0.9999844268143}, - {631896, 0.999986157168266}, - {704160, 0.999987887522233}, - {795085, 0.9999896178762}, - {912974, 0.999991348230167}, - {1071908, 0.999993078584133}, - {1297841, 0.9999948089381}, - {1644453, 0.999996539292067}, - {2243665, 0.999998269646033}, - {3529904, 1}, +dist_point_gen::weight dist_point_gen::w2[] = { + {8, 0.170107627909}, + {32, 0.018900847545}, + {34, 0.013742745280}, + {36, 0.013742745280}, + {38, 0.013742745280}, + {40, 0.013742745280}, + {43, 0.013742745280}, + {46, 0.013742745280}, + {49, 0.013742745280}, + {53, 0.013742745280}, + {58, 0.013742745280}, + {64, 0.013742745280}, + {67, 0.001526724534}, + {71, 0.001526724534}, + {75, 0.001526724534}, + {80, 0.001526724534}, + {85, 0.001526724534}, + {91, 0.001526724534}, + {98, 0.001526724534}, + {107, 0.001526724534}, + {116, 0.001526724534}, + {128, 0.001526724534}, + {135, 0.012213636704}, + {142, 0.012213636704}, + {151, 0.012213636704}, + {160, 0.012213636704}, + {171, 0.012213636704}, + {183, 0.012213636704}, + {197, 0.012213636704}, + {213, 0.012213636704}, + {233, 0.012213636704}, + {256, 0.012213636704}, + {269, 0.044656107936}, + {284, 0.044656107936}, + {301, 0.044656107936}, + {320, 0.044656107936}, + {341, 0.044656107936}, + {366, 0.044656107936}, + {394, 0.044656107936}, + {427, 0.044656107936}, + {465, 0.044656107936}, + {512, 0.044656107936}, + {539, 0.004770951652}, + {569, 0.004770951652}, + {602, 0.004770951652}, + {640, 0.004770951652}, + {683, 0.004770951652}, + {731, 0.004770951652}, + {788, 0.004770951652}, + {853, 0.004770951652}, + {931, 0.004770951652}, + {1024, 0.004770951652}, + {1078, 0.002290057626}, + {1138, 0.002290057626}, + {1205, 0.002290057626}, + {1280, 0.002290057626}, + {1365, 0.002290057626}, + {1463, 0.002290057626}, + {1575, 0.002290057626}, + {1707, 0.002290057626}, + {1862, 0.002290057626}, + {2048, 0.002290057626}, + {2156, 0.001431284900}, + {2276, 0.001431284900}, + {2409, 0.001431284900}, + {2560, 0.001431284900}, + {2731, 0.001431284900}, + {2926, 0.001431284900}, + {3151, 0.001431284900}, + {3413, 0.001431284900}, + {3724, 0.001431284900}, + {4096, 0.001431284900}, + {4312, 0.000310112125}, + {4551, 0.000310112125}, + {4819, 0.000310112125}, + {5120, 0.000310112125}, + {5461, 0.000310112125}, + {5851, 0.000310112125}, + {6302, 0.000310112125}, + {6827, 0.000310112125}, + {7447, 0.000310112125}, + {8192, 0.000310112125}, + {8623, 0.000047709472}, + {9102, 0.000047709472}, + {9638, 0.000047709472}, + {10240, 0.000047709472}, + {10923, 0.000047709472}, + {11703, 0.000047709472}, + {12603, 0.000047709472}, + {13653, 0.000047709472}, + {14895, 0.000047709472}, + {16384, 0.000047709472}, + {17246, 0.000023854736}, + {18204, 0.000023854736}, + {19275, 0.000023854736}, + {20480, 0.000023854736}, + {21845, 0.000023854736}, + {23406, 0.000023854736}, + {25206, 0.000023854736}, + {27307, 0.000023854736}, + {29789, 0.000023854736}, + {32768, 0.000023854736}, + {34493, 0.000020872917}, + {36409, 0.000020872917}, + {38551, 0.000020872917}, + {40960, 0.000020872917}, + {43691, 0.000020872917}, + {46811, 0.000020872917}, + {50412, 0.000020872917}, + {54613, 0.000020872917}, + 
{59578, 0.000020872917}, + {65536, 0.000020872917}, + {68985, 0.000052182293}, + {72818, 0.000052182293}, + {77101, 0.000052182293}, + {81920, 0.000052182293}, + {87381, 0.000052182293}, + {93623, 0.000052182293}, + {100825, 0.000052182293}, + {109227, 0.000052182293}, + {119156, 0.000052182293}, + {131072, 0.000052182293}, + {137971, 0.000010436459}, + {145636, 0.000010436459}, + {154202, 0.000010436459}, + {163840, 0.000010436459}, + {174763, 0.000010436459}, + {187246, 0.000010436459}, + {201649, 0.000010436459}, + {218453, 0.000010436459}, + {238313, 0.000010436459}, + {262144, 0.000010436459}, + {275941, 0.000000745466}, + {291271, 0.000000745466}, + {308405, 0.000000745466}, + {327680, 0.000000745466}, + {349525, 0.000000745466}, + {374491, 0.000000745466}, + {403298, 0.000000745466}, + {436907, 0.000000745466}, + {476625, 0.000000745466}, + {524288, 0.000000745466}, + {573085, 0.000001730354}, + {631896, 0.000001730354}, + {704160, 0.000001730354}, + {795085, 0.000001730354}, + {912974, 0.000001730354}, + {1071908, 0.000001730354}, + {1297841, 0.000001730354}, + {1644453, 0.000001730354}, + {2243665, 0.000001730354}, + {3529904, 0.000001730354}, + {0, 0}, }; -dist_point_gen::cdf_point w3[] = { - {8, 0.0648826230027598}, - {32, 0.0973239345041398}, - {36, 0.108913294911834}, - {40, 0.120502655319528}, - {46, 0.132092015727223}, - {53, 0.143681376134917}, - {64, 0.155270736542611}, - {70, 0.18798780420397}, - {77, 0.220704871865328}, - {85, 0.253421939526687}, - {96, 0.286139007188045}, - {110, 0.318856074849404}, - {128, 0.351573142510763}, - {137, 0.369864095116246}, - {146, 0.388155047721729}, - {158, 0.406446000327212}, - {171, 0.424736952932695}, - {186, 0.443027905538178}, - {205, 0.461318858143661}, - {228, 0.479609810749145}, - {256, 0.497900763354628}, - {268, 0.523993534878443}, - {282, 0.550086306402258}, - {296, 0.576179077926073}, - {313, 0.602271849449888}, - {331, 0.628364620973703}, - {352, 0.654457392497518}, - {375, 0.680550164021333}, - {402, 0.706642935545148}, - {433, 0.732735707068963}, - {469, 0.758828478592778}, - {512, 0.784921250116593}, - {531, 0.790512558564584}, - {551, 0.796103867012575}, - {573, 0.801695175460566}, - {597, 0.807286483908557}, - {623, 0.812877792356548}, - {652, 0.818469100804539}, - {683, 0.82406040925253}, - {717, 0.829651717700521}, - {755, 0.835243026148512}, - {796, 0.840834334596503}, - {843, 0.846425643044494}, - {896, 0.852016951492485}, - {956, 0.857608259940476}, - {1024, 0.863199568388467}, - {1053, 0.865736365928318}, - {1084, 0.86827316346817}, - {1117, 0.870809961008021}, - {1152, 0.873346758547872}, - {1189, 0.875883556087723}, - {1229, 0.878420353627574}, - {1271, 0.880957151167425}, - {1317, 0.883493948707276}, - {1365, 0.886030746247128}, - {1418, 0.888567543786979}, - {1475, 0.89110434132683}, - {1536, 0.893641138866681}, - {1603, 0.896177936406532}, - {1676, 0.898714733946383}, - {1755, 0.901251531486234}, - {1843, 0.903788329026086}, - {1940, 0.906325126565937}, - {2048, 0.908861924105788}, - {2092, 0.909949120991973}, - {2137, 0.911036317878158}, - {2185, 0.912123514764344}, - {2234, 0.913210711650529}, - {2286, 0.914297908536714}, - {2341, 0.9153851054229}, - {2398, 0.916472302309085}, - {2458, 0.91755949919527}, - {2521, 0.918646696081455}, - {2587, 0.919733892967641}, - {2657, 0.920821089853826}, - {2731, 0.921908286740011}, - {2809, 0.922995483626197}, - {2891, 0.924082680512382}, - {2979, 0.925169877398567}, - {3072, 0.926257074284752}, - {3171, 0.927344271170938}, - {3277, 0.928431468057123}, - {3390, 
0.929518664943308}, - {3511, 0.930605861829494}, - {3641, 0.931693058715679}, - {3781, 0.932780255601864}, - {3932, 0.933867452488049}, - {4096, 0.934954649374235}, - {4163, 0.935901565991808}, - {4233, 0.936848482609381}, - {4304, 0.937795399226954}, - {4378, 0.938742315844527}, - {4455, 0.9396892324621}, - {4535, 0.940636149079674}, - {4617, 0.941583065697247}, - {4703, 0.94252998231482}, - {4792, 0.943476898932393}, - {4884, 0.944423815549966}, - {4979, 0.945370732167539}, - {5079, 0.946317648785112}, - {5183, 0.947264565402686}, - {5291, 0.948211482020259}, - {5403, 0.949158398637832}, - {5521, 0.950105315255405}, - {5643, 0.951052231872978}, - {5772, 0.951999148490551}, - {5906, 0.952946065108124}, - {6046, 0.953892981725698}, - {6194, 0.954839898343271}, - {6349, 0.955786814960844}, - {6512, 0.956733731578417}, - {6683, 0.95768064819599}, - {6864, 0.958627564813563}, - {7054, 0.959574481431136}, - {7256, 0.96052139804871}, - {7469, 0.961468314666283}, - {7696, 0.962415231283856}, - {7936, 0.963362147901429}, - {8192, 0.964309064519002}, - {8293, 0.964806259070139}, - {8397, 0.965303453621275}, - {8503, 0.965800648172412}, - {8612, 0.966297842723548}, - {8724, 0.966795037274685}, - {8839, 0.967292231825821}, - {8957, 0.967789426376958}, - {9078, 0.968286620928094}, - {9202, 0.968783815479231}, - {9330, 0.969281010030367}, - {9461, 0.969778204581504}, - {9596, 0.97027539913264}, - {9735, 0.970772593683777}, - {9879, 0.971269788234913}, - {10026, 0.97176698278605}, - {10178, 0.972264177337186}, - {10335, 0.972761371888322}, - {10496, 0.973258566439459}, - {10663, 0.973755760990595}, - {10835, 0.974252955541732}, - {11012, 0.974750150092868}, - {11196, 0.975247344644005}, - {11385, 0.975744539195141}, - {11582, 0.976241733746278}, - {11785, 0.976738928297414}, - {11995, 0.977236122848551}, - {12214, 0.977733317399687}, - {12440, 0.978230511950824}, - {12674, 0.97872770650196}, - {12918, 0.979224901053097}, - {13171, 0.979722095604233}, - {13435, 0.98021929015537}, - {13709, 0.980716484706506}, - {13995, 0.981213679257643}, - {14292, 0.981710873808779}, - {14603, 0.982208068359916}, - {14928, 0.982705262911052}, - {15267, 0.983202457462189}, - {15622, 0.983699652013325}, - {15994, 0.984196846564462}, - {16384, 0.984694041115598}, - {16540, 0.984801735350781}, - {16699, 0.984909429585963}, - {16861, 0.985017123821145}, - {17027, 0.985124818056328}, - {17195, 0.98523251229151}, - {17367, 0.985340206526693}, - {17542, 0.985447900761875}, - {17721, 0.985555594997057}, - {17904, 0.98566328923224}, - {18091, 0.985770983467422}, - {18281, 0.985878677702604}, - {18476, 0.985986371937787}, - {18674, 0.986094066172969}, - {18877, 0.986201760408151}, - {19085, 0.986309454643334}, - {19297, 0.986417148878516}, - {19514, 0.986524843113698}, - {19735, 0.986632537348881}, - {19962, 0.986740231584063}, - {20194, 0.986847925819246}, - {20432, 0.986955620054428}, - {20675, 0.98706331428961}, - {20924, 0.987171008524793}, - {21179, 0.987278702759975}, - {21441, 0.987386396995157}, - {21709, 0.98749409123034}, - {21984, 0.987601785465522}, - {22265, 0.987709479700704}, - {22555, 0.987817173935887}, - {22851, 0.987924868171069}, - {23156, 0.988032562406252}, - {23469, 0.988140256641434}, - {23790, 0.988247950876616}, - {24121, 0.988355645111799}, - {24461, 0.988463339346981}, - {24810, 0.988571033582163}, - {25170, 0.988678727817346}, - {25540, 0.988786422052528}, - {25921, 0.98889411628771}, - {26314, 0.989001810522893}, - {26719, 0.989109504758075}, - {27136, 0.989217198993258}, - {27567, 0.98932489322844}, 
- {28011, 0.989432587463622}, - {28471, 0.989540281698805}, - {28945, 0.989647975933987}, - {29436, 0.989755670169169}, - {29943, 0.989863364404352}, - {30468, 0.989971058639534}, - {31013, 0.990078752874716}, - {31576, 0.990186447109899}, - {32161, 0.990294141345081}, - {32768, 0.990401835580264}, - {33007, 0.990443196317542}, - {33250, 0.990484557054821}, - {33496, 0.990525917792099}, - {33746, 0.990567278529378}, - {34000, 0.990608639266656}, - {34257, 0.990650000003935}, - {34519, 0.990691360741213}, - {34784, 0.990732721478492}, - {35054, 0.99077408221577}, - {35328, 0.990815442953049}, - {35606, 0.990856803690327}, - {35889, 0.990898164427606}, - {36176, 0.990939525164885}, - {36468, 0.990980885902163}, - {36764, 0.991022246639442}, - {37065, 0.99106360737672}, - {37372, 0.991104968113999}, - {37683, 0.991146328851277}, - {38000, 0.991187689588556}, - {38322, 0.991229050325834}, - {38649, 0.991270411063113}, - {38983, 0.991311771800391}, - {39322, 0.99135313253767}, - {39667, 0.991394493274948}, - {40018, 0.991435854012227}, - {40375, 0.991477214749506}, - {40739, 0.991518575486784}, - {41109, 0.991559936224063}, - {41486, 0.991601296961341}, - {41870, 0.99164265769862}, - {42262, 0.991684018435898}, - {42660, 0.991725379173177}, - {43067, 0.991766739910455}, - {43481, 0.991808100647734}, - {43903, 0.991849461385012}, - {44333, 0.991890822122291}, - {44772, 0.99193218285957}, - {45220, 0.991973543596848}, - {45677, 0.992014904334127}, - {46143, 0.992056265071405}, - {46618, 0.992097625808684}, - {47104, 0.992138986545962}, - {47600, 0.992180347283241}, - {48106, 0.992221708020519}, - {48623, 0.992263068757798}, - {49152, 0.992304429495076}, - {49692, 0.992345790232355}, - {50244, 0.992387150969633}, - {50809, 0.992428511706912}, - {51386, 0.992469872444191}, - {51977, 0.992511233181469}, - {52581, 0.992552593918748}, - {53200, 0.992593954656026}, - {53833, 0.992635315393305}, - {54482, 0.992676676130583}, - {55146, 0.992718036867862}, - {55827, 0.99275939760514}, - {56525, 0.992800758342419}, - {57240, 0.992842119079697}, - {57974, 0.992883479816976}, - {58727, 0.992924840554255}, - {59500, 0.992966201291533}, - {60293, 0.993007562028812}, - {61108, 0.99304892276609}, - {61945, 0.993090283503369}, - {62805, 0.993131644240647}, - {63690, 0.993173004977926}, - {64600, 0.993214365715204}, - {65536, 0.993255726452483}, - {65902, 0.993298761415796}, - {66272, 0.993341796379109}, - {66647, 0.993384831342422}, - {67025, 0.993427866305735}, - {67408, 0.993470901269048}, - {67796, 0.993513936232361}, - {68188, 0.993556971195674}, - {68584, 0.993600006158987}, - {68985, 0.9936430411223}, - {69391, 0.993686076085613}, - {69802, 0.993729111048926}, - {70217, 0.993772146012239}, - {70638, 0.993815180975552}, - {71063, 0.993858215938865}, - {71494, 0.993901250902179}, - {71930, 0.993944285865492}, - {72371, 0.993987320828805}, - {72818, 0.994030355792118}, - {73270, 0.994073390755431}, - {73728, 0.994116425718744}, - {74192, 0.994159460682057}, - {74661, 0.99420249564537}, - {75137, 0.994245530608683}, - {75618, 0.994288565571996}, - {76106, 0.994331600535309}, - {76601, 0.994374635498622}, - {77101, 0.994417670461935}, - {77608, 0.994460705425248}, - {78122, 0.994503740388561}, - {78643, 0.994546775351874}, - {79171, 0.994589810315187}, - {79706, 0.9946328452785}, - {80248, 0.994675880241813}, - {80798, 0.994718915205126}, - {81355, 0.994761950168439}, - {81920, 0.994804985131752}, - {82493, 0.994848020095065}, - {83074, 0.994891055058378}, - {83663, 0.994934090021691}, - {84261, 
0.994977124985004}, - {84867, 0.995020159948317}, - {85482, 0.995063194911631}, - {86106, 0.995106229874944}, - {86739, 0.995149264838257}, - {87381, 0.99519229980157}, - {88033, 0.995235334764883}, - {88695, 0.995278369728196}, - {89367, 0.995321404691509}, - {90049, 0.995364439654822}, - {90742, 0.995407474618135}, - {91446, 0.995450509581448}, - {92160, 0.995493544544761}, - {92886, 0.995536579508074}, - {93623, 0.995579614471387}, - {94372, 0.9956226494347}, - {95133, 0.995665684398013}, - {95906, 0.995708719361326}, - {96692, 0.995751754324639}, - {97492, 0.995794789287952}, - {98304, 0.995837824251265}, - {99130, 0.995880859214578}, - {99970, 0.995923894177891}, - {100825, 0.995966929141204}, - {101694, 0.996009964104517}, - {102578, 0.996052999067831}, - {103478, 0.996096034031144}, - {104394, 0.996139068994457}, - {105326, 0.99618210395777}, - {106275, 0.996225138921083}, - {107241, 0.996268173884396}, - {108225, 0.996311208847709}, - {109227, 0.996354243811022}, - {110247, 0.996397278774335}, - {111288, 0.996440313737648}, - {112347, 0.996483348700961}, - {113428, 0.996526383664274}, - {114529, 0.996569418627587}, - {115652, 0.9966124535909}, - {116797, 0.996655488554213}, - {117965, 0.996698523517526}, - {119156, 0.996741558480839}, - {120372, 0.996784593444152}, - {121613, 0.996827628407465}, - {122880, 0.996870663370778}, - {124173, 0.996913698334091}, - {125494, 0.996956733297404}, - {126844, 0.996999768260717}, - {128223, 0.99704280322403}, - {129632, 0.997085838187344}, - {131072, 0.997128873150657}, - {131630, 0.997140095435483}, - {132192, 0.997151317720308}, - {132760, 0.997162540005134}, - {133332, 0.99717376228996}, - {133909, 0.997184984574786}, - {134491, 0.997196206859612}, - {135079, 0.997207429144438}, - {135671, 0.997218651429264}, - {136269, 0.99722987371409}, - {136872, 0.997241095998916}, - {137480, 0.997252318283742}, - {138094, 0.997263540568568}, - {138713, 0.997274762853394}, - {139338, 0.99728598513822}, - {139968, 0.997297207423046}, - {140605, 0.997308429707872}, - {141247, 0.997319651992698}, - {141894, 0.997330874277524}, - {142548, 0.997342096562349}, - {143208, 0.997353318847175}, - {143874, 0.997364541132001}, - {144547, 0.997375763416827}, - {145225, 0.997386985701653}, - {145910, 0.997398207986479}, - {146602, 0.997409430271305}, - {147300, 0.997420652556131}, - {148005, 0.997431874840957}, - {148716, 0.997443097125783}, - {149435, 0.997454319410609}, - {150160, 0.997465541695435}, - {150893, 0.997476763980261}, - {151632, 0.997487986265087}, - {152379, 0.997499208549913}, - {153134, 0.997510430834739}, - {153895, 0.997521653119564}, - {154665, 0.99753287540439}, - {155442, 0.997544097689216}, - {156227, 0.997555319974042}, - {157020, 0.997566542258868}, - {157821, 0.997577764543694}, - {158631, 0.99758898682852}, - {159448, 0.997600209113346}, - {160275, 0.997611431398172}, - {161109, 0.997622653682998}, - {161953, 0.997633875967824}, - {162805, 0.99764509825265}, - {163667, 0.997656320537476}, - {164537, 0.997667542822302}, - {165417, 0.997678765107128}, - {166306, 0.997689987391954}, - {167205, 0.997701209676779}, - {168114, 0.997712431961605}, - {169033, 0.997723654246431}, - {169961, 0.997734876531257}, - {170901, 0.997746098816083}, - {171850, 0.997757321100909}, - {172810, 0.997768543385735}, - {173781, 0.997779765670561}, - {174763, 0.997790987955387}, - {175756, 0.997802210240213}, - {176760, 0.997813432525039}, - {177776, 0.997824654809865}, - {178803, 0.997835877094691}, - {179843, 0.997847099379517}, - {180895, 0.997858321664343}, - 
{181959, 0.997869543949169}, - {183035, 0.997880766233995}, - {184125, 0.99789198851882}, - {185227, 0.997903210803646}, - {186343, 0.997914433088472}, - {187473, 0.997925655373298}, - {188616, 0.997936877658124}, - {189773, 0.99794809994295}, - {190944, 0.997959322227776}, - {192130, 0.997970544512602}, - {193331, 0.997981766797428}, - {194547, 0.997992989082254}, - {195778, 0.99800421136708}, - {197025, 0.998015433651906}, - {198288, 0.998026655936732}, - {199568, 0.998037878221558}, - {200864, 0.998049100506384}, - {202176, 0.99806032279121}, - {203507, 0.998071545076035}, - {204854, 0.998082767360861}, - {206220, 0.998093989645687}, - {207604, 0.998105211930513}, - {209007, 0.998116434215339}, - {210429, 0.998127656500165}, - {211870, 0.998138878784991}, - {213331, 0.998150101069817}, - {214812, 0.998161323354643}, - {216315, 0.998172545639469}, - {217838, 0.998183767924295}, - {219383, 0.998194990209121}, - {220950, 0.998206212493947}, - {222540, 0.998217434778773}, - {224152, 0.998228657063599}, - {225788, 0.998239879348425}, - {227448, 0.99825110163325}, - {229133, 0.998262323918076}, - {230843, 0.998273546202902}, - {232579, 0.998284768487728}, - {234341, 0.998295990772554}, - {236130, 0.99830721305738}, - {237946, 0.998318435342206}, - {239791, 0.998329657627032}, - {241664, 0.998340879911858}, - {243567, 0.998352102196684}, - {245500, 0.99836332448151}, - {247464, 0.998374546766336}, - {249460, 0.998385769051162}, - {251488, 0.998396991335988}, - {253549, 0.998408213620814}, - {255645, 0.99841943590564}, - {257775, 0.998430658190466}, - {259941, 0.998441880475291}, - {262144, 0.998453102760117}, - {263003, 0.998459933656693}, - {263869, 0.998466764553268}, - {264739, 0.998473595449844}, - {265616, 0.998480426346419}, - {266499, 0.998487257242994}, - {267387, 0.99849408813957}, - {268281, 0.998500919036145}, - {269181, 0.998507749932721}, - {270088, 0.998514580829296}, - {271000, 0.998521411725872}, - {271919, 0.998528242622447}, - {272844, 0.998535073519022}, - {273775, 0.998541904415598}, - {274713, 0.998548735312173}, - {275657, 0.998555566208749}, - {276607, 0.998562397105324}, - {277564, 0.998569228001899}, - {278528, 0.998576058898475}, - {279498, 0.99858288979505}, - {280476, 0.998589720691626}, - {281460, 0.998596551588201}, - {282451, 0.998603382484777}, - {283449, 0.998610213381352}, - {284454, 0.998617044277927}, - {285466, 0.998623875174503}, - {286486, 0.998630706071078}, - {287513, 0.998637536967654}, - {288547, 0.998644367864229}, - {289589, 0.998651198760804}, - {290638, 0.99865802965738}, - {291695, 0.998664860553955}, - {292759, 0.998671691450531}, - {293832, 0.998678522347106}, - {294912, 0.998685353243681}, - {296000, 0.998692184140257}, - {297097, 0.998699015036832}, - {298201, 0.998705845933408}, - {299314, 0.998712676829983}, - {300435, 0.998719507726558}, - {301564, 0.998726338623134}, - {302702, 0.998733169519709}, - {303849, 0.998740000416285}, - {305004, 0.99874683131286}, - {306168, 0.998753662209436}, - {307341, 0.998760493106011}, - {308523, 0.998767324002586}, - {309715, 0.998774154899162}, - {310915, 0.998780985795737}, - {312125, 0.998787816692313}, - {313344, 0.998794647588888}, - {314573, 0.998801478485463}, - {315811, 0.998808309382039}, - {317060, 0.998815140278614}, - {318318, 0.99882197117519}, - {319586, 0.998828802071765}, - {320864, 0.99883563296834}, - {322153, 0.998842463864916}, - {323452, 0.998849294761491}, - {324761, 0.998856125658067}, - {326082, 0.998862956554642}, - {327413, 0.998869787451218}, - {328754, 0.998876618347793}, - 
{330107, 0.998883449244368}, - {331471, 0.998890280140944}, - {332847, 0.998897111037519}, - {334234, 0.998903941934095}, - {335632, 0.99891077283067}, - {337042, 0.998917603727245}, - {338464, 0.998924434623821}, - {339899, 0.998931265520396}, - {341345, 0.998938096416972}, - {342804, 0.998944927313547}, - {344275, 0.998951758210122}, - {345759, 0.998958589106698}, - {347256, 0.998965420003273}, - {348765, 0.998972250899849}, - {350288, 0.998979081796424}, - {351825, 0.998985912693}, - {353375, 0.998992743589575}, - {354938, 0.99899957448615}, - {356516, 0.999006405382726}, - {358107, 0.999013236279301}, - {359713, 0.999020067175877}, - {361334, 0.999026898072452}, - {362969, 0.999033728969027}, - {364618, 0.999040559865603}, - {366283, 0.999047390762178}, - {367964, 0.999054221658754}, - {369659, 0.999061052555329}, - {371371, 0.999067883451904}, - {373098, 0.99907471434848}, - {374841, 0.999081545245055}, - {376601, 0.999088376141631}, - {378378, 0.999095207038206}, - {380171, 0.999102037934781}, - {381981, 0.999108868831357}, - {383809, 0.999115699727932}, - {385654, 0.999122530624508}, - {387517, 0.999129361521083}, - {389398, 0.999136192417658}, - {391298, 0.999143023314234}, - {393216, 0.999149854210809}, - {395153, 0.999156685107385}, - {397109, 0.99916351600396}, - {399085, 0.999170346900536}, - {401080, 0.999177177797111}, - {403096, 0.999184008693686}, - {405132, 0.999190839590262}, - {407188, 0.999197670486837}, - {409266, 0.999204501383413}, - {411364, 0.999211332279988}, - {413485, 0.999218163176564}, - {415627, 0.999224994073139}, - {417792, 0.999231824969714}, - {419979, 0.99923865586629}, - {422190, 0.999245486762865}, - {424424, 0.999252317659441}, - {426681, 0.999259148556016}, - {428963, 0.999265979452591}, - {431269, 0.999272810349167}, - {433600, 0.999279641245742}, - {435957, 0.999286472142318}, - {438339, 0.999293303038893}, - {440748, 0.999300133935468}, - {443183, 0.999306964832044}, - {445645, 0.999313795728619}, - {448134, 0.999320626625195}, - {450652, 0.99932745752177}, - {453198, 0.999334288418345}, - {455773, 0.999341119314921}, - {458378, 0.999347950211496}, - {461012, 0.999354781108072}, - {463677, 0.999361612004647}, - {466372, 0.999368442901223}, - {469100, 0.999375273797798}, - {471859, 0.999382104694373}, - {474651, 0.999388935590949}, - {477477, 0.999395766487524}, - {480336, 0.9994025973841}, - {483229, 0.999409428280675}, - {486158, 0.99941625917725}, - {489122, 0.999423090073826}, - {492123, 0.999429920970401}, - {495161, 0.999436751866977}, - {498236, 0.999443582763552}, - {501350, 0.999450413660127}, - {504504, 0.999457244556703}, - {507697, 0.999464075453278}, - {510930, 0.999470906349854}, - {514206, 0.999477737246429}, - {517523, 0.999484568143004}, - {520884, 0.99949139903958}, - {524288, 0.999498229936155}, - {526844, 0.999500751393763}, - {529425, 0.99950327285137}, - {532031, 0.999505794308977}, - {534663, 0.999508315766584}, - {537322, 0.999510837224192}, - {540006, 0.999513358681799}, - {542718, 0.999515880139406}, - {545458, 0.999518401597013}, - {548225, 0.999520923054621}, - {551020, 0.999523444512228}, - {553844, 0.999525965969835}, - {556697, 0.999528487427442}, - {559579, 0.99953100888505}, - {562492, 0.999533530342657}, - {565435, 0.999536051800264}, - {568408, 0.999538573257872}, - {571414, 0.999541094715479}, - {574451, 0.999543616173086}, - {577521, 0.999546137630693}, - {580624, 0.9995486590883}, - {583760, 0.999551180545908}, - {586931, 0.999553702003515}, - {590136, 0.999556223461122}, - {593376, 0.99955874491873}, - 
{596652, 0.999561266376337}, - {599964, 0.999563787833944}, - {603313, 0.999566309291551}, - {606700, 0.999568830749159}, - {610125, 0.999571352206766}, - {613590, 0.999573873664373}, - {617093, 0.99957639512198}, - {620637, 0.999578916579588}, - {624222, 0.999581438037195}, - {627848, 0.999583959494802}, - {631517, 0.999586480952409}, - {635229, 0.999589002410017}, - {638985, 0.999591523867624}, - {642786, 0.999594045325231}, - {646632, 0.999596566782838}, - {650524, 0.999599088240446}, - {654463, 0.999601609698053}, - {658451, 0.99960413115566}, - {662487, 0.999606652613268}, - {666573, 0.999609174070875}, - {670710, 0.999611695528482}, - {674899, 0.999614216986089}, - {679140, 0.999616738443697}, - {683435, 0.999619259901304}, - {687784, 0.999621781358911}, - {692190, 0.999624302816518}, - {696652, 0.999626824274126}, - {701172, 0.999629345731733}, - {705750, 0.99963186718934}, - {710390, 0.999634388646947}, - {715090, 0.999636910104555}, - {719853, 0.999639431562162}, - {724680, 0.999641953019769}, - {729573, 0.999644474477376}, - {734531, 0.999646995934984}, - {739558, 0.999649517392591}, - {744654, 0.999652038850198}, - {749820, 0.999654560307805}, - {755059, 0.999657081765413}, - {760371, 0.99965960322302}, - {765759, 0.999662124680627}, - {771224, 0.999664646138234}, - {776767, 0.999667167595842}, - {782390, 0.999669689053449}, - {788096, 0.999672210511056}, - {793885, 0.999674731968664}, - {799760, 0.999677253426271}, - {805723, 0.999679774883878}, - {811775, 0.999682296341485}, - {817919, 0.999684817799093}, - {824156, 0.9996873392567}, - {830490, 0.999689860714307}, - {836921, 0.999692382171914}, - {843453, 0.999694903629522}, - {850088, 0.999697425087129}, - {856827, 0.999699946544736}, - {863675, 0.999702468002343}, - {870633, 0.999704989459951}, - {877704, 0.999707510917558}, - {884890, 0.999710032375165}, - {892196, 0.999712553832772}, - {899623, 0.99971507529038}, - {907174, 0.999717596747987}, - {914854, 0.999720118205594}, - {922664, 0.999722639663201}, - {930609, 0.999725161120809}, - {938693, 0.999727682578416}, - {946917, 0.999730204036023}, - {955288, 0.99973272549363}, - {963807, 0.999735246951238}, - {972480, 0.999737768408845}, - {981310, 0.999740289866452}, - {990302, 0.99974281132406}, - {999461, 0.999745332781667}, - {1008790, 0.999747854239274}, - {1018296, 0.999750375696881}, - {1027982, 0.999752897154489}, - {1037854, 0.999755418612096}, - {1047917, 0.999757940069703}, - {1058178, 0.99976046152731}, - {1068642, 0.999762982984918}, - {1079314, 0.999765504442525}, - {1090202, 0.999768025900132}, - {1101312, 0.999770547357739}, - {1112651, 0.999773068815347}, - {1124225, 0.999775590272954}, - {1136043, 0.999778111730561}, - {1148112, 0.999780633188168}, - {1160440, 0.999783154645776}, - {1173036, 0.999785676103383}, - {1185908, 0.99978819756099}, - {1199066, 0.999790719018598}, - {1212519, 0.999793240476205}, - {1226277, 0.999795761933812}, - {1240351, 0.999798283391419}, - {1254752, 0.999800804849026}, - {1269492, 0.999803326306634}, - {1284581, 0.999805847764241}, - {1300034, 0.999808369221848}, - {1315863, 0.999810890679455}, - {1332082, 0.999813412137063}, - {1348706, 0.99981593359467}, - {1365751, 0.999818455052277}, - {1383231, 0.999820976509885}, - {1401165, 0.999823497967492}, - {1419570, 0.999826019425099}, - {1438465, 0.999828540882706}, - {1457870, 0.999831062340314}, - {1477805, 0.999833583797921}, - {1498294, 0.999836105255528}, - {1519358, 0.999838626713135}, - {1541023, 0.999841148170743}, - {1563315, 0.99984366962835}, - {1586261, 
0.999846191085957}, - {1609891, 0.999848712543564}, - {1634235, 0.999851234001172}, - {1659327, 0.999853755458779}, - {1685202, 0.999856276916386}, - {1711896, 0.999858798373993}, - {1739450, 0.999861319831601}, - {1767905, 0.999863841289208}, - {1797307, 0.999866362746815}, - {1827703, 0.999868884204423}, - {1859145, 0.99987140566203}, - {1891687, 0.999873927119637}, - {1925389, 0.999876448577244}, - {1960314, 0.999878970034851}, - {1996529, 0.999881491492459}, - {2034108, 0.999884012950066}, - {2073128, 0.999886534407673}, - {2113675, 0.999889055865281}, - {2155839, 0.999891577322888}, - {2199720, 0.999894098780495}, - {2245424, 0.999896620238102}, - {2293067, 0.99989914169571}, - {2342777, 0.999901663153317}, - {2394689, 0.999904184610924}, - {2448954, 0.999906706068531}, - {2505735, 0.999909227526139}, - {2565212, 0.999911748983746}, - {2627581, 0.999914270441353}, - {2693059, 0.99991679189896}, - {2761883, 0.999919313356568}, - {2834317, 0.999921834814175}, - {2910653, 0.999924356271782}, - {2991214, 0.999926877729389}, - {3076362, 0.999929399186997}, - {3166500, 0.999931920644604}, - {3262080, 0.999934442102211}, - {3363608, 0.999936963559819}, - {3471660, 0.999939485017426}, - {3586885, 0.999942006475033}, - {3710020, 0.99994452793264}, - {3841911, 0.999947049390248}, - {3983524, 0.999949570847855}, - {4135977, 0.999952092305462}, - {4300563, 0.999954613763069}, - {4478791, 0.999957135220677}, - {4672430, 0.999959656678284}, - {4883570, 0.999962178135891}, - {5114695, 0.999964699593498}, - {5368784, 0.999967221051106}, - {5649438, 0.999969742508713}, - {5961053, 0.99997226396632}, - {6309051, 0.999974785423927}, - {6700199, 0.999977306881535}, - {7143054, 0.999979828339142}, - {7648594, 0.999982349796749}, - {8231141, 0.999984871254356}, - {8909743, 0.999987392711964}, - {9710291, 0.999989914169571}, - {10668901, 0.999992435627178}, - {11837511, 0.999994957084785}, - {13293619, 0.999997478542393}, - {15158197, 1.0}, +dist_point_gen::weight dist_point_gen::w3[] = { + {8, 0.064882623003}, + {32, 0.032441311501}, + {36, 0.011589360408}, + {40, 0.011589360408}, + {46, 0.011589360408}, + {53, 0.011589360408}, + {64, 0.011589360408}, + {70, 0.032717067661}, + {77, 0.032717067661}, + {85, 0.032717067661}, + {96, 0.032717067661}, + {110, 0.032717067661}, + {128, 0.032717067661}, + {137, 0.018290952605}, + {146, 0.018290952605}, + {158, 0.018290952605}, + {171, 0.018290952605}, + {186, 0.018290952605}, + {205, 0.018290952605}, + {228, 0.018290952605}, + {256, 0.018290952605}, + {268, 0.026092771524}, + {282, 0.026092771524}, + {296, 0.026092771524}, + {313, 0.026092771524}, + {331, 0.026092771524}, + {352, 0.026092771524}, + {375, 0.026092771524}, + {402, 0.026092771524}, + {433, 0.026092771524}, + {469, 0.026092771524}, + {512, 0.026092771524}, + {531, 0.005591308448}, + {551, 0.005591308448}, + {573, 0.005591308448}, + {597, 0.005591308448}, + {623, 0.005591308448}, + {652, 0.005591308448}, + {683, 0.005591308448}, + {717, 0.005591308448}, + {755, 0.005591308448}, + {796, 0.005591308448}, + {843, 0.005591308448}, + {896, 0.005591308448}, + {956, 0.005591308448}, + {1024, 0.005591308448}, + {1053, 0.002536797540}, + {1084, 0.002536797540}, + {1117, 0.002536797540}, + {1152, 0.002536797540}, + {1189, 0.002536797540}, + {1229, 0.002536797540}, + {1271, 0.002536797540}, + {1317, 0.002536797540}, + {1365, 0.002536797540}, + {1418, 0.002536797540}, + {1475, 0.002536797540}, + {1536, 0.002536797540}, + {1603, 0.002536797540}, + {1676, 0.002536797540}, + {1755, 0.002536797540}, + {1843, 
0.002536797540}, + {1940, 0.002536797540}, + {2048, 0.002536797540}, + {2092, 0.001087196886}, + {2137, 0.001087196886}, + {2185, 0.001087196886}, + {2234, 0.001087196886}, + {2286, 0.001087196886}, + {2341, 0.001087196886}, + {2398, 0.001087196886}, + {2458, 0.001087196886}, + {2521, 0.001087196886}, + {2587, 0.001087196886}, + {2657, 0.001087196886}, + {2731, 0.001087196886}, + {2809, 0.001087196886}, + {2891, 0.001087196886}, + {2979, 0.001087196886}, + {3072, 0.001087196886}, + {3171, 0.001087196886}, + {3277, 0.001087196886}, + {3390, 0.001087196886}, + {3511, 0.001087196886}, + {3641, 0.001087196886}, + {3781, 0.001087196886}, + {3932, 0.001087196886}, + {4096, 0.001087196886}, + {4163, 0.000946916618}, + {4233, 0.000946916618}, + {4304, 0.000946916618}, + {4378, 0.000946916618}, + {4455, 0.000946916618}, + {4535, 0.000946916618}, + {4617, 0.000946916618}, + {4703, 0.000946916618}, + {4792, 0.000946916618}, + {4884, 0.000946916618}, + {4979, 0.000946916618}, + {5079, 0.000946916618}, + {5183, 0.000946916618}, + {5291, 0.000946916618}, + {5403, 0.000946916618}, + {5521, 0.000946916618}, + {5643, 0.000946916618}, + {5772, 0.000946916618}, + {5906, 0.000946916618}, + {6046, 0.000946916618}, + {6194, 0.000946916618}, + {6349, 0.000946916618}, + {6512, 0.000946916618}, + {6683, 0.000946916618}, + {6864, 0.000946916618}, + {7054, 0.000946916618}, + {7256, 0.000946916618}, + {7469, 0.000946916618}, + {7696, 0.000946916618}, + {7936, 0.000946916618}, + {8192, 0.000946916618}, + {8293, 0.000497194551}, + {8397, 0.000497194551}, + {8503, 0.000497194551}, + {8612, 0.000497194551}, + {8724, 0.000497194551}, + {8839, 0.000497194551}, + {8957, 0.000497194551}, + {9078, 0.000497194551}, + {9202, 0.000497194551}, + {9330, 0.000497194551}, + {9461, 0.000497194551}, + {9596, 0.000497194551}, + {9735, 0.000497194551}, + {9879, 0.000497194551}, + {10026, 0.000497194551}, + {10178, 0.000497194551}, + {10335, 0.000497194551}, + {10496, 0.000497194551}, + {10663, 0.000497194551}, + {10835, 0.000497194551}, + {11012, 0.000497194551}, + {11196, 0.000497194551}, + {11385, 0.000497194551}, + {11582, 0.000497194551}, + {11785, 0.000497194551}, + {11995, 0.000497194551}, + {12214, 0.000497194551}, + {12440, 0.000497194551}, + {12674, 0.000497194551}, + {12918, 0.000497194551}, + {13171, 0.000497194551}, + {13435, 0.000497194551}, + {13709, 0.000497194551}, + {13995, 0.000497194551}, + {14292, 0.000497194551}, + {14603, 0.000497194551}, + {14928, 0.000497194551}, + {15267, 0.000497194551}, + {15622, 0.000497194551}, + {15994, 0.000497194551}, + {16384, 0.000497194551}, + {16540, 0.000107694235}, + {16699, 0.000107694235}, + {16861, 0.000107694235}, + {17027, 0.000107694235}, + {17195, 0.000107694235}, + {17367, 0.000107694235}, + {17542, 0.000107694235}, + {17721, 0.000107694235}, + {17904, 0.000107694235}, + {18091, 0.000107694235}, + {18281, 0.000107694235}, + {18476, 0.000107694235}, + {18674, 0.000107694235}, + {18877, 0.000107694235}, + {19085, 0.000107694235}, + {19297, 0.000107694235}, + {19514, 0.000107694235}, + {19735, 0.000107694235}, + {19962, 0.000107694235}, + {20194, 0.000107694235}, + {20432, 0.000107694235}, + {20675, 0.000107694235}, + {20924, 0.000107694235}, + {21179, 0.000107694235}, + {21441, 0.000107694235}, + {21709, 0.000107694235}, + {21984, 0.000107694235}, + {22265, 0.000107694235}, + {22555, 0.000107694235}, + {22851, 0.000107694235}, + {23156, 0.000107694235}, + {23469, 0.000107694235}, + {23790, 0.000107694235}, + {24121, 0.000107694235}, + {24461, 0.000107694235}, + {24810, 
0.000107694235}, + {25170, 0.000107694235}, + {25540, 0.000107694235}, + {25921, 0.000107694235}, + {26314, 0.000107694235}, + {26719, 0.000107694235}, + {27136, 0.000107694235}, + {27567, 0.000107694235}, + {28011, 0.000107694235}, + {28471, 0.000107694235}, + {28945, 0.000107694235}, + {29436, 0.000107694235}, + {29943, 0.000107694235}, + {30468, 0.000107694235}, + {31013, 0.000107694235}, + {31576, 0.000107694235}, + {32161, 0.000107694235}, + {32768, 0.000107694235}, + {33007, 0.000041360737}, + {33250, 0.000041360737}, + {33496, 0.000041360737}, + {33746, 0.000041360737}, + {34000, 0.000041360737}, + {34257, 0.000041360737}, + {34519, 0.000041360737}, + {34784, 0.000041360737}, + {35054, 0.000041360737}, + {35328, 0.000041360737}, + {35606, 0.000041360737}, + {35889, 0.000041360737}, + {36176, 0.000041360737}, + {36468, 0.000041360737}, + {36764, 0.000041360737}, + {37065, 0.000041360737}, + {37372, 0.000041360737}, + {37683, 0.000041360737}, + {38000, 0.000041360737}, + {38322, 0.000041360737}, + {38649, 0.000041360737}, + {38983, 0.000041360737}, + {39322, 0.000041360737}, + {39667, 0.000041360737}, + {40018, 0.000041360737}, + {40375, 0.000041360737}, + {40739, 0.000041360737}, + {41109, 0.000041360737}, + {41486, 0.000041360737}, + {41870, 0.000041360737}, + {42262, 0.000041360737}, + {42660, 0.000041360737}, + {43067, 0.000041360737}, + {43481, 0.000041360737}, + {43903, 0.000041360737}, + {44333, 0.000041360737}, + {44772, 0.000041360737}, + {45220, 0.000041360737}, + {45677, 0.000041360737}, + {46143, 0.000041360737}, + {46618, 0.000041360737}, + {47104, 0.000041360737}, + {47600, 0.000041360737}, + {48106, 0.000041360737}, + {48623, 0.000041360737}, + {49152, 0.000041360737}, + {49692, 0.000041360737}, + {50244, 0.000041360737}, + {50809, 0.000041360737}, + {51386, 0.000041360737}, + {51977, 0.000041360737}, + {52581, 0.000041360737}, + {53200, 0.000041360737}, + {53833, 0.000041360737}, + {54482, 0.000041360737}, + {55146, 0.000041360737}, + {55827, 0.000041360737}, + {56525, 0.000041360737}, + {57240, 0.000041360737}, + {57974, 0.000041360737}, + {58727, 0.000041360737}, + {59500, 0.000041360737}, + {60293, 0.000041360737}, + {61108, 0.000041360737}, + {61945, 0.000041360737}, + {62805, 0.000041360737}, + {63690, 0.000041360737}, + {64600, 0.000041360737}, + {65536, 0.000041360737}, + {65902, 0.000043034963}, + {66272, 0.000043034963}, + {66647, 0.000043034963}, + {67025, 0.000043034963}, + {67408, 0.000043034963}, + {67796, 0.000043034963}, + {68188, 0.000043034963}, + {68584, 0.000043034963}, + {68985, 0.000043034963}, + {69391, 0.000043034963}, + {69802, 0.000043034963}, + {70217, 0.000043034963}, + {70638, 0.000043034963}, + {71063, 0.000043034963}, + {71494, 0.000043034963}, + {71930, 0.000043034963}, + {72371, 0.000043034963}, + {72818, 0.000043034963}, + {73270, 0.000043034963}, + {73728, 0.000043034963}, + {74192, 0.000043034963}, + {74661, 0.000043034963}, + {75137, 0.000043034963}, + {75618, 0.000043034963}, + {76106, 0.000043034963}, + {76601, 0.000043034963}, + {77101, 0.000043034963}, + {77608, 0.000043034963}, + {78122, 0.000043034963}, + {78643, 0.000043034963}, + {79171, 0.000043034963}, + {79706, 0.000043034963}, + {80248, 0.000043034963}, + {80798, 0.000043034963}, + {81355, 0.000043034963}, + {81920, 0.000043034963}, + {82493, 0.000043034963}, + {83074, 0.000043034963}, + {83663, 0.000043034963}, + {84261, 0.000043034963}, + {84867, 0.000043034963}, + {85482, 0.000043034963}, + {86106, 0.000043034963}, + {86739, 0.000043034963}, + {87381, 0.000043034963}, 
+ {88033, 0.000043034963}, + {88695, 0.000043034963}, + {89367, 0.000043034963}, + {90049, 0.000043034963}, + {90742, 0.000043034963}, + {91446, 0.000043034963}, + {92160, 0.000043034963}, + {92886, 0.000043034963}, + {93623, 0.000043034963}, + {94372, 0.000043034963}, + {95133, 0.000043034963}, + {95906, 0.000043034963}, + {96692, 0.000043034963}, + {97492, 0.000043034963}, + {98304, 0.000043034963}, + {99130, 0.000043034963}, + {99970, 0.000043034963}, + {100825, 0.000043034963}, + {101694, 0.000043034963}, + {102578, 0.000043034963}, + {103478, 0.000043034963}, + {104394, 0.000043034963}, + {105326, 0.000043034963}, + {106275, 0.000043034963}, + {107241, 0.000043034963}, + {108225, 0.000043034963}, + {109227, 0.000043034963}, + {110247, 0.000043034963}, + {111288, 0.000043034963}, + {112347, 0.000043034963}, + {113428, 0.000043034963}, + {114529, 0.000043034963}, + {115652, 0.000043034963}, + {116797, 0.000043034963}, + {117965, 0.000043034963}, + {119156, 0.000043034963}, + {120372, 0.000043034963}, + {121613, 0.000043034963}, + {122880, 0.000043034963}, + {124173, 0.000043034963}, + {125494, 0.000043034963}, + {126844, 0.000043034963}, + {128223, 0.000043034963}, + {129632, 0.000043034963}, + {131072, 0.000043034963}, + {131630, 0.000011222285}, + {132192, 0.000011222285}, + {132760, 0.000011222285}, + {133332, 0.000011222285}, + {133909, 0.000011222285}, + {134491, 0.000011222285}, + {135079, 0.000011222285}, + {135671, 0.000011222285}, + {136269, 0.000011222285}, + {136872, 0.000011222285}, + {137480, 0.000011222285}, + {138094, 0.000011222285}, + {138713, 0.000011222285}, + {139338, 0.000011222285}, + {139968, 0.000011222285}, + {140605, 0.000011222285}, + {141247, 0.000011222285}, + {141894, 0.000011222285}, + {142548, 0.000011222285}, + {143208, 0.000011222285}, + {143874, 0.000011222285}, + {144547, 0.000011222285}, + {145225, 0.000011222285}, + {145910, 0.000011222285}, + {146602, 0.000011222285}, + {147300, 0.000011222285}, + {148005, 0.000011222285}, + {148716, 0.000011222285}, + {149435, 0.000011222285}, + {150160, 0.000011222285}, + {150893, 0.000011222285}, + {151632, 0.000011222285}, + {152379, 0.000011222285}, + {153134, 0.000011222285}, + {153895, 0.000011222285}, + {154665, 0.000011222285}, + {155442, 0.000011222285}, + {156227, 0.000011222285}, + {157020, 0.000011222285}, + {157821, 0.000011222285}, + {158631, 0.000011222285}, + {159448, 0.000011222285}, + {160275, 0.000011222285}, + {161109, 0.000011222285}, + {161953, 0.000011222285}, + {162805, 0.000011222285}, + {163667, 0.000011222285}, + {164537, 0.000011222285}, + {165417, 0.000011222285}, + {166306, 0.000011222285}, + {167205, 0.000011222285}, + {168114, 0.000011222285}, + {169033, 0.000011222285}, + {169961, 0.000011222285}, + {170901, 0.000011222285}, + {171850, 0.000011222285}, + {172810, 0.000011222285}, + {173781, 0.000011222285}, + {174763, 0.000011222285}, + {175756, 0.000011222285}, + {176760, 0.000011222285}, + {177776, 0.000011222285}, + {178803, 0.000011222285}, + {179843, 0.000011222285}, + {180895, 0.000011222285}, + {181959, 0.000011222285}, + {183035, 0.000011222285}, + {184125, 0.000011222285}, + {185227, 0.000011222285}, + {186343, 0.000011222285}, + {187473, 0.000011222285}, + {188616, 0.000011222285}, + {189773, 0.000011222285}, + {190944, 0.000011222285}, + {192130, 0.000011222285}, + {193331, 0.000011222285}, + {194547, 0.000011222285}, + {195778, 0.000011222285}, + {197025, 0.000011222285}, + {198288, 0.000011222285}, + {199568, 0.000011222285}, + {200864, 0.000011222285}, + {202176, 
0.000011222285}, + {203507, 0.000011222285}, + {204854, 0.000011222285}, + {206220, 0.000011222285}, + {207604, 0.000011222285}, + {209007, 0.000011222285}, + {210429, 0.000011222285}, + {211870, 0.000011222285}, + {213331, 0.000011222285}, + {214812, 0.000011222285}, + {216315, 0.000011222285}, + {217838, 0.000011222285}, + {219383, 0.000011222285}, + {220950, 0.000011222285}, + {222540, 0.000011222285}, + {224152, 0.000011222285}, + {225788, 0.000011222285}, + {227448, 0.000011222285}, + {229133, 0.000011222285}, + {230843, 0.000011222285}, + {232579, 0.000011222285}, + {234341, 0.000011222285}, + {236130, 0.000011222285}, + {237946, 0.000011222285}, + {239791, 0.000011222285}, + {241664, 0.000011222285}, + {243567, 0.000011222285}, + {245500, 0.000011222285}, + {247464, 0.000011222285}, + {249460, 0.000011222285}, + {251488, 0.000011222285}, + {253549, 0.000011222285}, + {255645, 0.000011222285}, + {257775, 0.000011222285}, + {259941, 0.000011222285}, + {262144, 0.000011222285}, + {263003, 0.000006830897}, + {263869, 0.000006830897}, + {264739, 0.000006830897}, + {265616, 0.000006830897}, + {266499, 0.000006830897}, + {267387, 0.000006830897}, + {268281, 0.000006830897}, + {269181, 0.000006830897}, + {270088, 0.000006830897}, + {271000, 0.000006830897}, + {271919, 0.000006830897}, + {272844, 0.000006830897}, + {273775, 0.000006830897}, + {274713, 0.000006830897}, + {275657, 0.000006830897}, + {276607, 0.000006830897}, + {277564, 0.000006830897}, + {278528, 0.000006830897}, + {279498, 0.000006830897}, + {280476, 0.000006830897}, + {281460, 0.000006830897}, + {282451, 0.000006830897}, + {283449, 0.000006830897}, + {284454, 0.000006830897}, + {285466, 0.000006830897}, + {286486, 0.000006830897}, + {287513, 0.000006830897}, + {288547, 0.000006830897}, + {289589, 0.000006830897}, + {290638, 0.000006830897}, + {291695, 0.000006830897}, + {292759, 0.000006830897}, + {293832, 0.000006830897}, + {294912, 0.000006830897}, + {296000, 0.000006830897}, + {297097, 0.000006830897}, + {298201, 0.000006830897}, + {299314, 0.000006830897}, + {300435, 0.000006830897}, + {301564, 0.000006830897}, + {302702, 0.000006830897}, + {303849, 0.000006830897}, + {305004, 0.000006830897}, + {306168, 0.000006830897}, + {307341, 0.000006830897}, + {308523, 0.000006830897}, + {309715, 0.000006830897}, + {310915, 0.000006830897}, + {312125, 0.000006830897}, + {313344, 0.000006830897}, + {314573, 0.000006830897}, + {315811, 0.000006830897}, + {317060, 0.000006830897}, + {318318, 0.000006830897}, + {319586, 0.000006830897}, + {320864, 0.000006830897}, + {322153, 0.000006830897}, + {323452, 0.000006830897}, + {324761, 0.000006830897}, + {326082, 0.000006830897}, + {327413, 0.000006830897}, + {328754, 0.000006830897}, + {330107, 0.000006830897}, + {331471, 0.000006830897}, + {332847, 0.000006830897}, + {334234, 0.000006830897}, + {335632, 0.000006830897}, + {337042, 0.000006830897}, + {338464, 0.000006830897}, + {339899, 0.000006830897}, + {341345, 0.000006830897}, + {342804, 0.000006830897}, + {344275, 0.000006830897}, + {345759, 0.000006830897}, + {347256, 0.000006830897}, + {348765, 0.000006830897}, + {350288, 0.000006830897}, + {351825, 0.000006830897}, + {353375, 0.000006830897}, + {354938, 0.000006830897}, + {356516, 0.000006830897}, + {358107, 0.000006830897}, + {359713, 0.000006830897}, + {361334, 0.000006830897}, + {362969, 0.000006830897}, + {364618, 0.000006830897}, + {366283, 0.000006830897}, + {367964, 0.000006830897}, + {369659, 0.000006830897}, + {371371, 0.000006830897}, + {373098, 0.000006830897}, + 
{374841, 0.000006830897}, + {376601, 0.000006830897}, + {378378, 0.000006830897}, + {380171, 0.000006830897}, + {381981, 0.000006830897}, + {383809, 0.000006830897}, + {385654, 0.000006830897}, + {387517, 0.000006830897}, + {389398, 0.000006830897}, + {391298, 0.000006830897}, + {393216, 0.000006830897}, + {395153, 0.000006830897}, + {397109, 0.000006830897}, + {399085, 0.000006830897}, + {401080, 0.000006830897}, + {403096, 0.000006830897}, + {405132, 0.000006830897}, + {407188, 0.000006830897}, + {409266, 0.000006830897}, + {411364, 0.000006830897}, + {413485, 0.000006830897}, + {415627, 0.000006830897}, + {417792, 0.000006830897}, + {419979, 0.000006830897}, + {422190, 0.000006830897}, + {424424, 0.000006830897}, + {426681, 0.000006830897}, + {428963, 0.000006830897}, + {431269, 0.000006830897}, + {433600, 0.000006830897}, + {435957, 0.000006830897}, + {438339, 0.000006830897}, + {440748, 0.000006830897}, + {443183, 0.000006830897}, + {445645, 0.000006830897}, + {448134, 0.000006830897}, + {450652, 0.000006830897}, + {453198, 0.000006830897}, + {455773, 0.000006830897}, + {458378, 0.000006830897}, + {461012, 0.000006830897}, + {463677, 0.000006830897}, + {466372, 0.000006830897}, + {469100, 0.000006830897}, + {471859, 0.000006830897}, + {474651, 0.000006830897}, + {477477, 0.000006830897}, + {480336, 0.000006830897}, + {483229, 0.000006830897}, + {486158, 0.000006830897}, + {489122, 0.000006830897}, + {492123, 0.000006830897}, + {495161, 0.000006830897}, + {498236, 0.000006830897}, + {501350, 0.000006830897}, + {504504, 0.000006830897}, + {507697, 0.000006830897}, + {510930, 0.000006830897}, + {514206, 0.000006830897}, + {517523, 0.000006830897}, + {520884, 0.000006830897}, + {524288, 0.000006830897}, + {526844, 0.000002521458}, + {529425, 0.000002521458}, + {532031, 0.000002521458}, + {534663, 0.000002521458}, + {537322, 0.000002521458}, + {540006, 0.000002521458}, + {542718, 0.000002521458}, + {545458, 0.000002521458}, + {548225, 0.000002521458}, + {551020, 0.000002521458}, + {553844, 0.000002521458}, + {556697, 0.000002521458}, + {559579, 0.000002521458}, + {562492, 0.000002521458}, + {565435, 0.000002521458}, + {568408, 0.000002521458}, + {571414, 0.000002521458}, + {574451, 0.000002521458}, + {577521, 0.000002521458}, + {580624, 0.000002521458}, + {583760, 0.000002521458}, + {586931, 0.000002521458}, + {590136, 0.000002521458}, + {593376, 0.000002521458}, + {596652, 0.000002521458}, + {599964, 0.000002521458}, + {603313, 0.000002521458}, + {606700, 0.000002521458}, + {610125, 0.000002521458}, + {613590, 0.000002521458}, + {617093, 0.000002521458}, + {620637, 0.000002521458}, + {624222, 0.000002521458}, + {627848, 0.000002521458}, + {631517, 0.000002521458}, + {635229, 0.000002521458}, + {638985, 0.000002521458}, + {642786, 0.000002521458}, + {646632, 0.000002521458}, + {650524, 0.000002521458}, + {654463, 0.000002521458}, + {658451, 0.000002521458}, + {662487, 0.000002521458}, + {666573, 0.000002521458}, + {670710, 0.000002521458}, + {674899, 0.000002521458}, + {679140, 0.000002521458}, + {683435, 0.000002521458}, + {687784, 0.000002521458}, + {692190, 0.000002521458}, + {696652, 0.000002521458}, + {701172, 0.000002521458}, + {705750, 0.000002521458}, + {710390, 0.000002521458}, + {715090, 0.000002521458}, + {719853, 0.000002521458}, + {724680, 0.000002521458}, + {729573, 0.000002521458}, + {734531, 0.000002521458}, + {739558, 0.000002521458}, + {744654, 0.000002521458}, + {749820, 0.000002521458}, + {755059, 0.000002521458}, + {760371, 0.000002521458}, + {765759, 0.000002521458}, 
+ {771224, 0.000002521458}, + {776767, 0.000002521458}, + {782390, 0.000002521458}, + {788096, 0.000002521458}, + {793885, 0.000002521458}, + {799760, 0.000002521458}, + {805723, 0.000002521458}, + {811775, 0.000002521458}, + {817919, 0.000002521458}, + {824156, 0.000002521458}, + {830490, 0.000002521458}, + {836921, 0.000002521458}, + {843453, 0.000002521458}, + {850088, 0.000002521458}, + {856827, 0.000002521458}, + {863675, 0.000002521458}, + {870633, 0.000002521458}, + {877704, 0.000002521458}, + {884890, 0.000002521458}, + {892196, 0.000002521458}, + {899623, 0.000002521458}, + {907174, 0.000002521458}, + {914854, 0.000002521458}, + {922664, 0.000002521458}, + {930609, 0.000002521458}, + {938693, 0.000002521458}, + {946917, 0.000002521458}, + {955288, 0.000002521458}, + {963807, 0.000002521458}, + {972480, 0.000002521458}, + {981310, 0.000002521458}, + {990302, 0.000002521458}, + {999461, 0.000002521458}, + {1008790, 0.000002521458}, + {1018296, 0.000002521458}, + {1027982, 0.000002521458}, + {1037854, 0.000002521458}, + {1047917, 0.000002521458}, + {1058178, 0.000002521458}, + {1068642, 0.000002521458}, + {1079314, 0.000002521458}, + {1090202, 0.000002521458}, + {1101312, 0.000002521458}, + {1112651, 0.000002521458}, + {1124225, 0.000002521458}, + {1136043, 0.000002521458}, + {1148112, 0.000002521458}, + {1160440, 0.000002521458}, + {1173036, 0.000002521458}, + {1185908, 0.000002521458}, + {1199066, 0.000002521458}, + {1212519, 0.000002521458}, + {1226277, 0.000002521458}, + {1240351, 0.000002521458}, + {1254752, 0.000002521458}, + {1269492, 0.000002521458}, + {1284581, 0.000002521458}, + {1300034, 0.000002521458}, + {1315863, 0.000002521458}, + {1332082, 0.000002521458}, + {1348706, 0.000002521458}, + {1365751, 0.000002521458}, + {1383231, 0.000002521458}, + {1401165, 0.000002521458}, + {1419570, 0.000002521458}, + {1438465, 0.000002521458}, + {1457870, 0.000002521458}, + {1477805, 0.000002521458}, + {1498294, 0.000002521458}, + {1519358, 0.000002521458}, + {1541023, 0.000002521458}, + {1563315, 0.000002521458}, + {1586261, 0.000002521458}, + {1609891, 0.000002521458}, + {1634235, 0.000002521458}, + {1659327, 0.000002521458}, + {1685202, 0.000002521458}, + {1711896, 0.000002521458}, + {1739450, 0.000002521458}, + {1767905, 0.000002521458}, + {1797307, 0.000002521458}, + {1827703, 0.000002521458}, + {1859145, 0.000002521458}, + {1891687, 0.000002521458}, + {1925389, 0.000002521458}, + {1960314, 0.000002521458}, + {1996529, 0.000002521458}, + {2034108, 0.000002521458}, + {2073128, 0.000002521458}, + {2113675, 0.000002521458}, + {2155839, 0.000002521458}, + {2199720, 0.000002521458}, + {2245424, 0.000002521458}, + {2293067, 0.000002521458}, + {2342777, 0.000002521458}, + {2394689, 0.000002521458}, + {2448954, 0.000002521458}, + {2505735, 0.000002521458}, + {2565212, 0.000002521458}, + {2627581, 0.000002521458}, + {2693059, 0.000002521458}, + {2761883, 0.000002521458}, + {2834317, 0.000002521458}, + {2910653, 0.000002521458}, + {2991214, 0.000002521458}, + {3076362, 0.000002521458}, + {3166500, 0.000002521458}, + {3262080, 0.000002521458}, + {3363608, 0.000002521458}, + {3471660, 0.000002521458}, + {3586885, 0.000002521458}, + {3710020, 0.000002521458}, + {3841911, 0.000002521458}, + {3983524, 0.000002521458}, + {4135977, 0.000002521458}, + {4300563, 0.000002521458}, + {4478791, 0.000002521458}, + {4672430, 0.000002521458}, + {4883570, 0.000002521458}, + {5114695, 0.000002521458}, + {5368784, 0.000002521458}, + {5649438, 0.000002521458}, + {5961053, 0.000002521458}, + {6309051, 
0.000002521458}, + {6700199, 0.000002521458}, + {7143054, 0.000002521458}, + {7648594, 0.000002521458}, + {8231141, 0.000002521458}, + {8909743, 0.000002521458}, + {9710291, 0.000002521458}, + {10668901, 0.000002521458}, + {11837511, 0.000002521458}, + {13293619, 0.000002521458}, + {15158197, 0.000002521458}, + {0, 0}, }; -dist_point_gen::cdf_point w4[] = { - {53, 0.00074}, - {56, 0.00148}, - {60, 0.00222}, - {64, 0.00296}, - {68, 0.0037}, - {72, 0.00444}, - {77, 0.00518}, - {81, 0.00592}, - {87, 0.00666}, - {92, 0.0074}, - {100, 0.00815}, - {109, 0.0089}, - {119, 0.00965}, - {130, 0.0104}, - {141, 0.01115}, - {154, 0.0119}, - {168, 0.01265}, - {183, 0.0134}, - {199, 0.01415}, - {217, 0.0149}, - {222, 0.0175}, - {227, 0.0201}, - {232, 0.0227}, - {237, 0.0253}, - {243, 0.0279}, - {248, 0.0305}, - {254, 0.0331}, - {259, 0.0357}, - {265, 0.0383}, - {271, 0.0409}, - {274, 0.04201}, - {277, 0.04312}, - {279, 0.04423}, - {282, 0.04534}, - {285, 0.04645}, - {288, 0.04756}, - {291, 0.04867}, - {294, 0.04978}, - {297, 0.05089}, - {300, 0.052}, - {303, 0.06055}, - {305, 0.0691}, - {308, 0.07765}, - {310, 0.0862}, - {313, 0.09475}, - {315, 0.1033}, - {318, 0.11185}, - {321, 0.1204}, - {323, 0.12895}, - {326, 0.1375}, - {331, 0.14401}, - {335, 0.15052}, - {340, 0.15703}, - {345, 0.16354}, - {350, 0.17005}, - {355, 0.17656}, - {360, 0.18307}, - {365, 0.18958}, - {371, 0.19609}, - {376, 0.2026}, - {381, 0.20464}, - {385, 0.20668}, - {390, 0.20872}, - {395, 0.21076}, - {400, 0.2128}, - {405, 0.21484}, - {410, 0.21688}, - {415, 0.21892}, - {420, 0.22096}, - {425, 0.223}, - {430, 0.22709}, - {435, 0.23118}, - {441, 0.23527}, - {446, 0.23936}, - {452, 0.24345}, - {457, 0.24754}, - {463, 0.25163}, - {468, 0.25572}, - {474, 0.25981}, - {480, 0.2639}, - {491, 0.28342}, - {502, 0.30294}, - {513, 0.32246}, - {525, 0.34198}, - {537, 0.3615}, - {549, 0.38102}, - {561, 0.40054}, - {574, 0.42006}, - {587, 0.43958}, - {600, 0.4591}, - {607, 0.46486}, - {615, 0.47062}, - {623, 0.47638}, - {630, 0.48214}, - {638, 0.4879}, - {646, 0.49366}, - {654, 0.49942}, - {662, 0.50518}, - {671, 0.51094}, - {679, 0.5167}, - {685, 0.52023}, - {690, 0.52376}, - {696, 0.52729}, - {702, 0.53082}, - {707, 0.53435}, - {713, 0.53788}, - {719, 0.54141}, - {725, 0.54494}, - {731, 0.54847}, - {737, 0.552}, - {743, 0.55442}, - {749, 0.55684}, - {755, 0.55926}, - {762, 0.56168}, - {768, 0.5641}, - {774, 0.56652}, - {781, 0.56894}, - {787, 0.57136}, - {793, 0.57378}, - {800, 0.5762}, - {808, 0.5777}, - {816, 0.5792}, - {825, 0.5807}, - {833, 0.5822}, - {841, 0.5837}, - {850, 0.5852}, - {859, 0.5867}, - {867, 0.5882}, - {876, 0.5897}, - {885, 0.5912}, - {900, 0.59305}, - {914, 0.5949}, - {929, 0.59675}, - {945, 0.5986}, - {960, 0.60045}, - {976, 0.6023}, - {992, 0.60415}, - {1009, 0.606}, - {1025, 0.60785}, - {1042, 0.6097}, - {1063, 0.61155}, - {1085, 0.6134}, - {1108, 0.61525}, - {1130, 0.6171}, - {1154, 0.61895}, - {1177, 0.6208}, - {1201, 0.62265}, - {1226, 0.6245}, - {1251, 0.62635}, - {1277, 0.6282}, - {1290, 0.62932}, - {1303, 0.63044}, - {1316, 0.63156}, - {1330, 0.63268}, - {1343, 0.6338}, - {1357, 0.63492}, - {1371, 0.63604}, - {1385, 0.63716}, - {1399, 0.63828}, - {1413, 0.6394}, - {1425, 0.64014}, - {1436, 0.64088}, - {1448, 0.64162}, - {1460, 0.64236}, - {1472, 0.6431}, - {1484, 0.64384}, - {1497, 0.64458}, - {1509, 0.64532}, - {1521, 0.64606}, - {1534, 0.6468}, - {1547, 0.64885}, - {1559, 0.6509}, - {1572, 0.65295}, - {1585, 0.655}, - {1598, 0.65705}, - {1611, 0.6591}, - {1624, 0.66115}, - {1637, 0.6632}, - {1651, 0.66525}, - 
{1664, 0.6673}, - {1712, 0.66804}, - {1762, 0.66878}, - {1813, 0.66952}, - {1865, 0.67026}, - {1919, 0.671}, - {1975, 0.67174}, - {2032, 0.67248}, - {2091, 0.67322}, - {2152, 0.67396}, - {2214, 0.6747}, - {2311, 0.67563}, - {2412, 0.67656}, - {2517, 0.67749}, - {2627, 0.67842}, - {2742, 0.67935}, - {2862, 0.68028}, - {2987, 0.68121}, - {3118, 0.68214}, - {3254, 0.68307}, - {3396, 0.684}, - {3544, 0.68493}, - {3699, 0.68586}, - {3861, 0.68679}, - {4030, 0.68772}, - {4206, 0.68865}, - {4390, 0.68958}, - {4582, 0.69051}, - {4783, 0.69144}, - {4992, 0.69237}, - {5210, 0.6933}, - {5427, 0.69479}, - {5652, 0.69628}, - {5887, 0.69777}, - {6132, 0.69926}, - {6387, 0.70075}, - {6653, 0.70224}, - {6929, 0.70373}, - {7217, 0.70522}, - {7517, 0.70671}, - {7830, 0.7082}, - {9012, 0.7095}, - {10373, 0.7108}, - {11939, 0.7121}, - {13741, 0.7134}, - {15816, 0.7147}, - {18203, 0.716}, - {20952, 0.7173}, - {24115, 0.7186}, - {27756, 0.7199}, - {31946, 0.7212}, - {32405, 0.72213}, - {32871, 0.72306}, - {33343, 0.72399}, - {33822, 0.72492}, - {34308, 0.72585}, - {34801, 0.72678}, - {35301, 0.72771}, - {35808, 0.72864}, - {36322, 0.72957}, - {36844, 0.7305}, - {37146, 0.73143}, - {37450, 0.73236}, - {37756, 0.73329}, - {38065, 0.73422}, - {38377, 0.73515}, - {38691, 0.73608}, - {39007, 0.73701}, - {39327, 0.73794}, - {39648, 0.73887}, - {39973, 0.7398}, - {40382, 0.74166}, - {40796, 0.74352}, - {41214, 0.74538}, - {41636, 0.74724}, - {42062, 0.7491}, - {42493, 0.75096}, - {42928, 0.75282}, - {43367, 0.75468}, - {43811, 0.75654}, - {44260, 0.7584}, - {44531, 0.76044}, - {44804, 0.76248}, - {45079, 0.76452}, - {45356, 0.76656}, - {45634, 0.7686}, - {45913, 0.77064}, - {46195, 0.77268}, - {46478, 0.77472}, - {46763, 0.77676}, - {47050, 0.7788}, - {47435, 0.78252}, - {47823, 0.78624}, - {48215, 0.78996}, - {48609, 0.79368}, - {49007, 0.7974}, - {49408, 0.80112}, - {49813, 0.80484}, - {50221, 0.80856}, - {50632, 0.81228}, - {51046, 0.816}, - {52097, 0.81674}, - {53169, 0.81748}, - {54264, 0.81822}, - {55381, 0.81896}, - {56521, 0.8197}, - {57684, 0.82044}, - {58872, 0.82118}, - {60084, 0.82192}, - {61321, 0.82266}, - {62583, 0.8234}, - {63353, 0.82433}, - {64132, 0.82526}, - {64921, 0.82619}, - {65720, 0.82712}, - {66528, 0.82805}, - {67347, 0.82898}, - {68175, 0.82991}, - {69014, 0.83084}, - {69863, 0.83177}, - {70722, 0.8327}, - {71156, 0.83363}, - {71592, 0.83456}, - {72031, 0.83549}, - {72473, 0.83642}, - {72917, 0.83735}, - {73364, 0.83828}, - {73814, 0.83921}, - {74266, 0.84014}, - {74722, 0.84107}, - {75180, 0.842}, - {75795, 0.84497}, - {76416, 0.84794}, - {77041, 0.85091}, - {77672, 0.85388}, - {78307, 0.85685}, - {78948, 0.85982}, - {79595, 0.86279}, - {80246, 0.86576}, - {80903, 0.86873}, - {81565, 0.8717}, - {83414, 0.87338}, - {85305, 0.87506}, - {87238, 0.87674}, - {89216, 0.87842}, - {91238, 0.8801}, - {93306, 0.88178}, - {95421, 0.88346}, - {97584, 0.88514}, - {99796, 0.88682}, - {102058, 0.8885}, - {103313, 0.88943}, - {104584, 0.89036}, - {105871, 0.89129}, - {107173, 0.89222}, - {108492, 0.89315}, - {109826, 0.89408}, - {111177, 0.89501}, - {112545, 0.89594}, - {113929, 0.89687}, - {115331, 0.8978}, - {116988, 0.89872}, - {118669, 0.89964}, - {120373, 0.90056}, - {122103, 0.90148}, - {123857, 0.9024}, - {125636, 0.90332}, - {127441, 0.90424}, - {129272, 0.90516}, - {131129, 0.90608}, - {133013, 0.907}, - {135199, 0.90794}, - {137421, 0.90888}, - {139679, 0.90982}, - {141975, 0.91076}, - {144308, 0.9117}, - {146680, 0.91264}, - {149091, 0.91358}, - {151541, 0.91452}, - {154032, 0.91546}, - 
{156563, 0.9164}, - {159136, 0.91733}, - {161752, 0.91826}, - {164410, 0.91919}, - {167112, 0.92012}, - {169859, 0.92105}, - {172651, 0.92198}, - {175488, 0.92291}, - {178373, 0.92384}, - {181304, 0.92477}, - {184284, 0.9257}, - {186932, 0.92662}, - {189618, 0.92754}, - {192343, 0.92846}, - {195107, 0.92938}, - {197911, 0.9303}, - {200755, 0.93122}, - {203640, 0.93214}, - {206566, 0.93306}, - {209535, 0.93398}, - {212546, 0.9349}, - {215160, 0.93583}, - {217805, 0.93676}, - {220484, 0.93769}, - {223195, 0.93862}, - {225940, 0.93955}, - {228718, 0.94048}, - {231530, 0.94141}, - {234377, 0.94234}, - {237259, 0.94327}, - {240177, 0.9442}, - {244622, 0.94513}, - {249150, 0.94606}, - {253761, 0.94699}, - {258458, 0.94792}, - {263241, 0.94885}, - {268113, 0.94978}, - {273075, 0.95071}, - {278129, 0.95164}, - {283277, 0.95257}, - {288520, 0.9535}, - {302981, 0.95443}, - {318166, 0.95536}, - {334113, 0.95629}, - {350858, 0.95722}, - {368444, 0.95815}, - {386910, 0.95908}, - {406302, 0.96001}, - {426666, 0.96094}, - {448051, 0.96187}, - {470507, 0.9628}, - {492079, 0.96336}, - {514641, 0.96392}, - {538237, 0.96448}, - {562914, 0.96504}, - {588723, 0.9656}, - {615716, 0.96616}, - {643946, 0.96672}, - {673470, 0.96728}, - {704348, 0.96784}, - {736642, 0.9684}, - {754877, 0.96877}, - {773563, 0.96914}, - {792711, 0.96951}, - {812333, 0.96988}, - {832442, 0.97025}, - {853048, 0.97062}, - {874164, 0.97099}, - {895802, 0.97136}, - {917977, 0.97173}, - {940700, 0.9721}, - - /* Alternate form that doesn't flatten out when the maximum message - * size is 1000000. - */ -#if 0 - {920000, 0.9721}, - {925000, 0.974}, - {930000, 0.976}, - {935000, 0.978}, - {940000, 0.980}, - {945000, 0.982}, - {950000, 0.984}, - {955000, 0.986}, - {960000, 0.988}, - {965000, 0.990}, - {970000, 0.992}, - {975000, 0.994}, - {980000, 0.996}, - {990000, 0.998}, - {1000000, 1.0}, -#else - {1002039, 0.97303}, - {1067379, 0.97396}, - {1136978, 0.97489}, - {1211116, 0.97582}, - {1290088, 0.97675}, - {1374210, 0.97768}, - {1463817, 0.97861}, - {1559267, 0.97954}, - {1660940, 0.98047}, - {1769244, 0.9814}, - {1805665, 0.98196}, - {1842837, 0.98252}, - {1880773, 0.98308}, - {1919490, 0.98364}, - {1959005, 0.9842}, - {1999332, 0.98476}, - {2040490, 0.98532}, - {2082496, 0.98588}, - {2125366, 0.98644}, - {2169118, 0.987}, - {2527289, 0.9883}, - {2944602, 0.9896}, - {3430822, 0.9909}, - {3997329, 0.9922}, - {4657379, 0.9935}, - {5426418, 0.9948}, - {6322443, 0.9961}, - {7366423, 0.9974}, - {8582787, 0.9987}, - {10000000, 1.0}, -#endif +dist_point_gen::weight dist_point_gen::w4[] = { + {53, 0.000740000000}, + {56, 0.000740000000}, + {60, 0.000740000000}, + {64, 0.000740000000}, + {68, 0.000740000000}, + {72, 0.000740000000}, + {77, 0.000740000000}, + {81, 0.000740000000}, + {87, 0.000740000000}, + {92, 0.000740000000}, + {100, 0.000750000000}, + {109, 0.000750000000}, + {119, 0.000750000000}, + {130, 0.000750000000}, + {141, 0.000750000000}, + {154, 0.000750000000}, + {168, 0.000750000000}, + {183, 0.000750000000}, + {199, 0.000750000000}, + {217, 0.000750000000}, + {222, 0.002600000000}, + {227, 0.002600000000}, + {232, 0.002600000000}, + {237, 0.002600000000}, + {243, 0.002600000000}, + {248, 0.002600000000}, + {254, 0.002600000000}, + {259, 0.002600000000}, + {265, 0.002600000000}, + {271, 0.002600000000}, + {274, 0.001110000000}, + {277, 0.001110000000}, + {279, 0.001110000000}, + {282, 0.001110000000}, + {285, 0.001110000000}, + {288, 0.001110000000}, + {291, 0.001110000000}, + {294, 0.001110000000}, + {297, 0.001110000000}, + {300, 
0.001110000000}, + {303, 0.008550000000}, + {305, 0.008550000000}, + {308, 0.008550000000}, + {310, 0.008550000000}, + {313, 0.008550000000}, + {315, 0.008550000000}, + {318, 0.008550000000}, + {321, 0.008550000000}, + {323, 0.008550000000}, + {326, 0.008550000000}, + {331, 0.006510000000}, + {335, 0.006510000000}, + {340, 0.006510000000}, + {345, 0.006510000000}, + {350, 0.006510000000}, + {355, 0.006510000000}, + {360, 0.006510000000}, + {365, 0.006510000000}, + {371, 0.006510000000}, + {376, 0.006510000000}, + {381, 0.002040000000}, + {385, 0.002040000000}, + {390, 0.002040000000}, + {395, 0.002040000000}, + {400, 0.002040000000}, + {405, 0.002040000000}, + {410, 0.002040000000}, + {415, 0.002040000000}, + {420, 0.002040000000}, + {425, 0.002040000000}, + {430, 0.004090000000}, + {435, 0.004090000000}, + {441, 0.004090000000}, + {446, 0.004090000000}, + {452, 0.004090000000}, + {457, 0.004090000000}, + {463, 0.004090000000}, + {468, 0.004090000000}, + {474, 0.004090000000}, + {480, 0.004090000000}, + {491, 0.019520000000}, + {502, 0.019520000000}, + {513, 0.019520000000}, + {525, 0.019520000000}, + {537, 0.019520000000}, + {549, 0.019520000000}, + {561, 0.019520000000}, + {574, 0.019520000000}, + {587, 0.019520000000}, + {600, 0.019520000000}, + {607, 0.005760000000}, + {615, 0.005760000000}, + {623, 0.005760000000}, + {630, 0.005760000000}, + {638, 0.005760000000}, + {646, 0.005760000000}, + {654, 0.005760000000}, + {662, 0.005760000000}, + {671, 0.005760000000}, + {679, 0.005760000000}, + {685, 0.003530000000}, + {690, 0.003530000000}, + {696, 0.003530000000}, + {702, 0.003530000000}, + {707, 0.003530000000}, + {713, 0.003530000000}, + {719, 0.003530000000}, + {725, 0.003530000000}, + {731, 0.003530000000}, + {737, 0.003530000000}, + {743, 0.002420000000}, + {749, 0.002420000000}, + {755, 0.002420000000}, + {762, 0.002420000000}, + {768, 0.002420000000}, + {774, 0.002420000000}, + {781, 0.002420000000}, + {787, 0.002420000000}, + {793, 0.002420000000}, + {800, 0.002420000000}, + {808, 0.001500000000}, + {816, 0.001500000000}, + {825, 0.001500000000}, + {833, 0.001500000000}, + {841, 0.001500000000}, + {850, 0.001500000000}, + {859, 0.001500000000}, + {867, 0.001500000000}, + {876, 0.001500000000}, + {885, 0.001500000000}, + {900, 0.001850000000}, + {914, 0.001850000000}, + {929, 0.001850000000}, + {945, 0.001850000000}, + {960, 0.001850000000}, + {976, 0.001850000000}, + {992, 0.001850000000}, + {1009, 0.001850000000}, + {1025, 0.001850000000}, + {1042, 0.001850000000}, + {1063, 0.001850000000}, + {1085, 0.001850000000}, + {1108, 0.001850000000}, + {1130, 0.001850000000}, + {1154, 0.001850000000}, + {1177, 0.001850000000}, + {1201, 0.001850000000}, + {1226, 0.001850000000}, + {1251, 0.001850000000}, + {1277, 0.001850000000}, + {1290, 0.001120000000}, + {1303, 0.001120000000}, + {1316, 0.001120000000}, + {1330, 0.001120000000}, + {1343, 0.001120000000}, + {1357, 0.001120000000}, + {1371, 0.001120000000}, + {1385, 0.001120000000}, + {1399, 0.001120000000}, + {1413, 0.001120000000}, + {1425, 0.000740000000}, + {1436, 0.000740000000}, + {1448, 0.000740000000}, + {1460, 0.000740000000}, + {1472, 0.000740000000}, + {1484, 0.000740000000}, + {1497, 0.000740000000}, + {1509, 0.000740000000}, + {1521, 0.000740000000}, + {1534, 0.000740000000}, + {1547, 0.002050000000}, + {1559, 0.002050000000}, + {1572, 0.002050000000}, + {1585, 0.002050000000}, + {1598, 0.002050000000}, + {1611, 0.002050000000}, + {1624, 0.002050000000}, + {1637, 0.002050000000}, + {1651, 0.002050000000}, + {1664, 
0.002050000000}, + {1712, 0.000740000000}, + {1762, 0.000740000000}, + {1813, 0.000740000000}, + {1865, 0.000740000000}, + {1919, 0.000740000000}, + {1975, 0.000740000000}, + {2032, 0.000740000000}, + {2091, 0.000740000000}, + {2152, 0.000740000000}, + {2214, 0.000740000000}, + {2311, 0.000930000000}, + {2412, 0.000930000000}, + {2517, 0.000930000000}, + {2627, 0.000930000000}, + {2742, 0.000930000000}, + {2862, 0.000930000000}, + {2987, 0.000930000000}, + {3118, 0.000930000000}, + {3254, 0.000930000000}, + {3396, 0.000930000000}, + {3544, 0.000930000000}, + {3699, 0.000930000000}, + {3861, 0.000930000000}, + {4030, 0.000930000000}, + {4206, 0.000930000000}, + {4390, 0.000930000000}, + {4582, 0.000930000000}, + {4783, 0.000930000000}, + {4992, 0.000930000000}, + {5210, 0.000930000000}, + {5427, 0.001490000000}, + {5652, 0.001490000000}, + {5887, 0.001490000000}, + {6132, 0.001490000000}, + {6387, 0.001490000000}, + {6653, 0.001490000000}, + {6929, 0.001490000000}, + {7217, 0.001490000000}, + {7517, 0.001490000000}, + {7830, 0.001490000000}, + {9012, 0.001300000000}, + {10373, 0.001300000000}, + {11939, 0.001300000000}, + {13741, 0.001300000000}, + {15816, 0.001300000000}, + {18203, 0.001300000000}, + {20952, 0.001300000000}, + {24115, 0.001300000000}, + {27756, 0.001300000000}, + {31946, 0.001300000000}, + {32405, 0.000930000000}, + {32871, 0.000930000000}, + {33343, 0.000930000000}, + {33822, 0.000930000000}, + {34308, 0.000930000000}, + {34801, 0.000930000000}, + {35301, 0.000930000000}, + {35808, 0.000930000000}, + {36322, 0.000930000000}, + {36844, 0.000930000000}, + {37146, 0.000930000000}, + {37450, 0.000930000000}, + {37756, 0.000930000000}, + {38065, 0.000930000000}, + {38377, 0.000930000000}, + {38691, 0.000930000000}, + {39007, 0.000930000000}, + {39327, 0.000930000000}, + {39648, 0.000930000000}, + {39973, 0.000930000000}, + {40382, 0.001860000000}, + {40796, 0.001860000000}, + {41214, 0.001860000000}, + {41636, 0.001860000000}, + {42062, 0.001860000000}, + {42493, 0.001860000000}, + {42928, 0.001860000000}, + {43367, 0.001860000000}, + {43811, 0.001860000000}, + {44260, 0.001860000000}, + {44531, 0.002040000000}, + {44804, 0.002040000000}, + {45079, 0.002040000000}, + {45356, 0.002040000000}, + {45634, 0.002040000000}, + {45913, 0.002040000000}, + {46195, 0.002040000000}, + {46478, 0.002040000000}, + {46763, 0.002040000000}, + {47050, 0.002040000000}, + {47435, 0.003720000000}, + {47823, 0.003720000000}, + {48215, 0.003720000000}, + {48609, 0.003720000000}, + {49007, 0.003720000000}, + {49408, 0.003720000000}, + {49813, 0.003720000000}, + {50221, 0.003720000000}, + {50632, 0.003720000000}, + {51046, 0.003720000000}, + {52097, 0.000740000000}, + {53169, 0.000740000000}, + {54264, 0.000740000000}, + {55381, 0.000740000000}, + {56521, 0.000740000000}, + {57684, 0.000740000000}, + {58872, 0.000740000000}, + {60084, 0.000740000000}, + {61321, 0.000740000000}, + {62583, 0.000740000000}, + {63353, 0.000930000000}, + {64132, 0.000930000000}, + {64921, 0.000930000000}, + {65720, 0.000930000000}, + {66528, 0.000930000000}, + {67347, 0.000930000000}, + {68175, 0.000930000000}, + {69014, 0.000930000000}, + {69863, 0.000930000000}, + {70722, 0.000930000000}, + {71156, 0.000930000000}, + {71592, 0.000930000000}, + {72031, 0.000930000000}, + {72473, 0.000930000000}, + {72917, 0.000930000000}, + {73364, 0.000930000000}, + {73814, 0.000930000000}, + {74266, 0.000930000000}, + {74722, 0.000930000000}, + {75180, 0.000930000000}, + {75795, 0.002970000000}, + {76416, 0.002970000000}, + {77041, 
0.002970000000}, + {77672, 0.002970000000}, + {78307, 0.002970000000}, + {78948, 0.002970000000}, + {79595, 0.002970000000}, + {80246, 0.002970000000}, + {80903, 0.002970000000}, + {81565, 0.002970000000}, + {83414, 0.001680000000}, + {85305, 0.001680000000}, + {87238, 0.001680000000}, + {89216, 0.001680000000}, + {91238, 0.001680000000}, + {93306, 0.001680000000}, + {95421, 0.001680000000}, + {97584, 0.001680000000}, + {99796, 0.001680000000}, + {102058, 0.001680000000}, + {103313, 0.000930000000}, + {104584, 0.000930000000}, + {105871, 0.000930000000}, + {107173, 0.000930000000}, + {108492, 0.000930000000}, + {109826, 0.000930000000}, + {111177, 0.000930000000}, + {112545, 0.000930000000}, + {113929, 0.000930000000}, + {115331, 0.000930000000}, + {116988, 0.000920000000}, + {118669, 0.000920000000}, + {120373, 0.000920000000}, + {122103, 0.000920000000}, + {123857, 0.000920000000}, + {125636, 0.000920000000}, + {127441, 0.000920000000}, + {129272, 0.000920000000}, + {131129, 0.000920000000}, + {133013, 0.000920000000}, + {135199, 0.000940000000}, + {137421, 0.000940000000}, + {139679, 0.000940000000}, + {141975, 0.000940000000}, + {144308, 0.000940000000}, + {146680, 0.000940000000}, + {149091, 0.000940000000}, + {151541, 0.000940000000}, + {154032, 0.000940000000}, + {156563, 0.000940000000}, + {159136, 0.000930000000}, + {161752, 0.000930000000}, + {164410, 0.000930000000}, + {167112, 0.000930000000}, + {169859, 0.000930000000}, + {172651, 0.000930000000}, + {175488, 0.000930000000}, + {178373, 0.000930000000}, + {181304, 0.000930000000}, + {184284, 0.000930000000}, + {186932, 0.000920000000}, + {189618, 0.000920000000}, + {192343, 0.000920000000}, + {195107, 0.000920000000}, + {197911, 0.000920000000}, + {200755, 0.000920000000}, + {203640, 0.000920000000}, + {206566, 0.000920000000}, + {209535, 0.000920000000}, + {212546, 0.000920000000}, + {215160, 0.000930000000}, + {217805, 0.000930000000}, + {220484, 0.000930000000}, + {223195, 0.000930000000}, + {225940, 0.000930000000}, + {228718, 0.000930000000}, + {231530, 0.000930000000}, + {234377, 0.000930000000}, + {237259, 0.000930000000}, + {240177, 0.000930000000}, + {244622, 0.000930000000}, + {249150, 0.000930000000}, + {253761, 0.000930000000}, + {258458, 0.000930000000}, + {263241, 0.000930000000}, + {268113, 0.000930000000}, + {273075, 0.000930000000}, + {278129, 0.000930000000}, + {283277, 0.000930000000}, + {288520, 0.000930000000}, + {302981, 0.000930000000}, + {318166, 0.000930000000}, + {334113, 0.000930000000}, + {350858, 0.000930000000}, + {368444, 0.000930000000}, + {386910, 0.000930000000}, + {406302, 0.000930000000}, + {426666, 0.000930000000}, + {448051, 0.000930000000}, + {470507, 0.000930000000}, + {492079, 0.000560000000}, + {514641, 0.000560000000}, + {538237, 0.000560000000}, + {562914, 0.000560000000}, + {588723, 0.000560000000}, + {615716, 0.000560000000}, + {643946, 0.000560000000}, + {673470, 0.000560000000}, + {704348, 0.000560000000}, + {736642, 0.000560000000}, + {754877, 0.000370000000}, + {773563, 0.000370000000}, + {792711, 0.000370000000}, + {812333, 0.000370000000}, + {832442, 0.000370000000}, + {853048, 0.000370000000}, + {874164, 0.000370000000}, + {895802, 0.000370000000}, + {917977, 0.000370000000}, + {940700, 0.000370000000}, + {1002039, 0.000930000000}, + {1067379, 0.000930000000}, + {1136978, 0.000930000000}, + {1211116, 0.000930000000}, + {1290088, 0.000930000000}, + {1374210, 0.000930000000}, + {1463817, 0.000930000000}, + {1559267, 0.000930000000}, + {1660940, 0.000930000000}, + {1769244, 
0.000930000000}, + {1805665, 0.000560000000}, + {1842837, 0.000560000000}, + {1880773, 0.000560000000}, + {1919490, 0.000560000000}, + {1959005, 0.000560000000}, + {1999332, 0.000560000000}, + {2040490, 0.000560000000}, + {2082496, 0.000560000000}, + {2125366, 0.000560000000}, + {2169118, 0.000560000000}, + {2527289, 0.001300000000}, + {2944602, 0.001300000000}, + {3430822, 0.001300000000}, + {3997329, 0.001300000000}, + {4657379, 0.001300000000}, + {5426418, 0.001300000000}, + {6322443, 0.001300000000}, + {7366423, 0.001300000000}, + {8582787, 0.001300000000}, + {10000000, 0.001300000000}, + {0, 0}, }; -dist_point_gen::cdf_point w5[] = { - {1430, 0.025000}, - {2860, 0.050000}, - {4290, 0.075000}, - {5720, 0.100000}, - {7150, 0.125000}, - {8580, 0.150000}, - {10010, 0.157143}, - {11440, 0.164286}, - {13230, 0.171429}, - {14700, 0.178571}, - {16170, 0.185714}, - {17640, 0.192857}, - {19110, 0.200000}, - {20580, 0.216667}, - {22050, 0.233333}, - {23520, 0.250000}, - {24990, 0.266667}, - {26460, 0.283333}, - {27930, 0.300000}, - {29400, 0.307143}, - {30870, 0.314286}, - {32340, 0.321429}, - {33810, 0.328571}, - {35280, 0.335714}, - {36750, 0.342857}, - {38220, 0.350000}, - {39690, 0.357143}, - {41160, 0.364286}, - {42630, 0.371429}, - {44100, 0.378571}, - {45570, 0.385714}, - {47040, 0.392857}, - {48510, 0.400000}, - {49980, 0.406500}, - {51450, 0.413000}, - {52920, 0.419500}, - {54390, 0.426000}, - {55860, 0.432500}, - {57330, 0.439000}, - {58800, 0.445500}, - {60270, 0.452000}, - {61740, 0.458500}, - {63210, 0.465000}, - {64680, 0.471500}, - {66150, 0.478000}, - {67620, 0.484500}, - {69090, 0.491000}, - {70560, 0.497500}, - {72030, 0.504000}, - {73500, 0.510500}, - {74970, 0.517000}, - {76440, 0.523500}, - {77910, 0.530000}, - {79380, 0.530875}, - {80850, 0.531750}, - {82320, 0.532625}, - {83790, 0.533500}, - {85260, 0.534375}, - {86730, 0.535250}, - {88200, 0.536125}, - {89670, 0.537000}, - {91140, 0.537875}, - {92610, 0.538750}, - {94080, 0.539625}, - {95550, 0.540500}, - {97020, 0.541375}, - {98490, 0.542250}, - {99960, 0.543125}, - {101430, 0.544000}, - {102900, 0.544875}, - {104370, 0.545750}, - {105840, 0.546625}, - {107310, 0.547500}, - {108780, 0.548375}, - {110250, 0.549250}, - {111720, 0.550125}, - {113190, 0.551000}, - {114660, 0.551875}, - {116130, 0.552750}, - {117600, 0.553625}, - {119070, 0.554500}, - {120540, 0.555375}, - {122010, 0.556250}, - {123480, 0.557125}, - {124950, 0.558000}, - {126420, 0.558875}, - {127890, 0.559750}, - {129360, 0.560625}, - {130830, 0.561500}, - {132300, 0.562375}, - {133770, 0.563250}, - {135240, 0.564125}, - {136710, 0.565000}, - {138180, 0.565875}, - {139650, 0.566750}, - {141120, 0.567625}, - {142590, 0.568500}, - {144060, 0.569375}, - {145530, 0.570250}, - {147000, 0.571125}, - {148470, 0.572000}, - {149940, 0.572875}, - {151410, 0.573750}, - {152880, 0.574625}, - {154350, 0.575500}, - {155820, 0.576375}, - {157290, 0.577250}, - {158760, 0.578125}, - {160230, 0.579000}, - {161700, 0.579875}, - {163170, 0.580750}, - {164640, 0.581625}, - {166110, 0.582500}, - {167580, 0.583375}, - {169050, 0.584250}, - {170520, 0.585125}, - {171990, 0.586000}, - {173460, 0.586875}, - {174930, 0.587750}, - {176400, 0.588625}, - {177870, 0.589500}, - {179340, 0.590375}, - {180810, 0.591250}, - {182280, 0.592125}, - {183750, 0.593000}, - {185220, 0.593875}, - {186690, 0.594750}, - {188160, 0.595625}, - {189630, 0.596500}, - {191100, 0.597375}, - {192570, 0.598250}, - {194040, 0.599125}, - {195510, 0.600000}, - {196980, 0.600187}, - {198450, 0.600375}, - {199920, 
0.600562}, - {201390, 0.600749}, - {202860, 0.600936}, - {204330, 0.601124}, - {205800, 0.601311}, - {207270, 0.601498}, - {208740, 0.601685}, - {210210, 0.601873}, - {211680, 0.602060}, - {213150, 0.602247}, - {214620, 0.602434}, - {216090, 0.602622}, - {217560, 0.602809}, - {219030, 0.602996}, - {220500, 0.603184}, - {221970, 0.603371}, - {223440, 0.603558}, - {224910, 0.603745}, - {226380, 0.603933}, - {227850, 0.604120}, - {229320, 0.604307}, - {230790, 0.604494}, - {232260, 0.604682}, - {233730, 0.604869}, - {235200, 0.605056}, - {236670, 0.605243}, - {238140, 0.605431}, - {239610, 0.605618}, - {241080, 0.605805}, - {242550, 0.605993}, - {244020, 0.606180}, - {245490, 0.606367}, - {246960, 0.606554}, - {248430, 0.606742}, - {249900, 0.606929}, - {251370, 0.607116}, - {252840, 0.607303}, - {254310, 0.607491}, - {255780, 0.607678}, - {257250, 0.607865}, - {258720, 0.608052}, - {260190, 0.608240}, - {261660, 0.608427}, - {263130, 0.608614}, - {264600, 0.608801}, - {266070, 0.608989}, - {267540, 0.609176}, - {269010, 0.609363}, - {270480, 0.609551}, - {271950, 0.609738}, - {273420, 0.609925}, - {274890, 0.610112}, - {276360, 0.610300}, - {277830, 0.610487}, - {279300, 0.610674}, - {280770, 0.610861}, - {282240, 0.611049}, - {283710, 0.611236}, - {285180, 0.611423}, - {286650, 0.611610}, - {288120, 0.611798}, - {289590, 0.611985}, - {291060, 0.612172}, - {292530, 0.612360}, - {294000, 0.612547}, - {295470, 0.612734}, - {296940, 0.612921}, - {298410, 0.613109}, - {299880, 0.613296}, - {301350, 0.613483}, - {302820, 0.613670}, - {304290, 0.613858}, - {305760, 0.614045}, - {307230, 0.614232}, - {308700, 0.614419}, - {310170, 0.614607}, - {311640, 0.614794}, - {313110, 0.614981}, - {314580, 0.615169}, - {316050, 0.615356}, - {317520, 0.615543}, - {318990, 0.615730}, - {320460, 0.615918}, - {321930, 0.616105}, - {323400, 0.616292}, - {324870, 0.616479}, - {326340, 0.616667}, - {327810, 0.616854}, - {329280, 0.617041}, - {330750, 0.617228}, - {332220, 0.617416}, - {333690, 0.617603}, - {335160, 0.617790}, - {336630, 0.617978}, - {338100, 0.618165}, - {339570, 0.618352}, - {341040, 0.618539}, - {342510, 0.618727}, - {343980, 0.618914}, - {345450, 0.619101}, - {346920, 0.619288}, - {348390, 0.619476}, - {349860, 0.619663}, - {351330, 0.619850}, - {352800, 0.620037}, - {354270, 0.620225}, - {355740, 0.620412}, - {357210, 0.620599}, - {358680, 0.620787}, - {360150, 0.620974}, - {361620, 0.621161}, - {363090, 0.621348}, - {364560, 0.621536}, - {373380, 0.622659}, - {392490, 0.625094}, - {411600, 0.627528}, - {432180, 0.630150}, - {451290, 0.632584}, - {470400, 0.635019}, - {490980, 0.637640}, - {510090, 0.640075}, - {529200, 0.642509}, - {549780, 0.645131}, - {568890, 0.647566}, - {588000, 0.650000}, - {608580, 0.652622}, - {627690, 0.655056}, - {648270, 0.657678}, - {667380, 0.660112}, - {686490, 0.662547}, - {707070, 0.665169}, - {726180, 0.667603}, - {745290, 0.670037}, - {765870, 0.672659}, - {784980, 0.675094}, - {804090, 0.677528}, - {824670, 0.680150}, - {843780, 0.682584}, - {862890, 0.685019}, - {883470, 0.687640}, - {902580, 0.690075}, - {921690, 0.692509}, - {942270, 0.695131}, - {961380, 0.697566}, - {980490, 0.700000}, - {1005480, 0.702553}, - {1030470, 0.705105}, - {1053990, 0.707508}, - {1078980, 0.710060}, - {1103970, 0.712613}, - {1127490, 0.715015}, - {1152480, 0.717568}, - {1177470, 0.720120}, - {1200990, 0.722523}, - {1225980, 0.725075}, - {1250970, 0.727628}, - {1274490, 0.730030}, - {1299480, 0.732583}, - {1324470, 0.735135}, - {1347990, 0.737538}, - {1372980, 0.740090}, - 
{1397970, 0.742643}, - {1421490, 0.745045}, - {1446480, 0.747598}, - {1470000, 0.750000}, - {1494990, 0.752553}, - {1519980, 0.755105}, - {1543500, 0.757508}, - {1568490, 0.760060}, - {1593480, 0.762613}, - {1617000, 0.765015}, - {1641990, 0.767568}, - {1666980, 0.770120}, - {1690500, 0.772523}, - {1715490, 0.775075}, - {1740480, 0.777628}, - {1764000, 0.780030}, - {1788990, 0.782583}, - {1813980, 0.785135}, - {1837500, 0.787538}, - {1862490, 0.790090}, - {1887480, 0.792643}, - {1911000, 0.795045}, - {1935990, 0.797598}, - {1959510, 0.800000}, - {2033010, 0.802500}, - {2106510, 0.805000}, - {2180010, 0.807500}, - {2253510, 0.810000}, - {2327010, 0.812500}, - {2400510, 0.815000}, - {2474010, 0.817500}, - {2547510, 0.820000}, - {2621010, 0.822500}, - {2694510, 0.825000}, - {2768010, 0.827500}, - {2841510, 0.830000}, - {2915010, 0.832500}, - {2988510, 0.835000}, - {3062010, 0.837500}, - {3135510, 0.840000}, - {3209010, 0.842500}, - {3282510, 0.845000}, - {3356010, 0.847500}, - {3429510, 0.850000}, - {3503010, 0.852500}, - {3576510, 0.855000}, - {3650010, 0.857500}, - {3723510, 0.860000}, - {3797010, 0.862500}, - {3870510, 0.865000}, - {3944010, 0.867500}, - {4017510, 0.870000}, - {4091010, 0.872500}, - {4164510, 0.875000}, - {4238010, 0.877500}, - {4311510, 0.880000}, - {4385010, 0.882500}, - {4458510, 0.885000}, - {4532010, 0.887500}, - {4605510, 0.890000}, - {4679010, 0.892500}, - {4752510, 0.895000}, - {4826010, 0.897500}, - {4899510, 0.900000}, - {10291470, 0.902505}, - {10780980, 0.905003}, - {11270490, 0.907500}, - {11761470, 0.910005}, - {12250980, 0.912503}, - {12740490, 0.915000}, - {13231470, 0.917505}, - {13720980, 0.920003}, - {14210490, 0.922501}, - {14701470, 0.925006}, - {15190980, 0.927503}, - {15680490, 0.930001}, - {16171470, 0.932506}, - {16660980, 0.935003}, - {17150490, 0.937501}, - {17641470, 0.940006}, - {18130980, 0.942504}, - {18620490, 0.945001}, - {19111470, 0.947506}, - {19600980, 0.950004}, - {20090490, 0.952501}, - {20581470, 0.955006}, - {21070980, 0.957504}, - {21560490, 0.960002}, - {22051470, 0.962507}, - {22540980, 0.965004}, - {23030490, 0.967502}, - {23521470, 0.970007}, - {24010980, 0.972504}, - {24500490, 0.975002}, - {24991470, 0.977507}, - {25480980, 0.980005}, - {25970490, 0.982502}, - {26461470, 0.985007}, - {26950980, 0.987505}, - {27440490, 0.990002}, - {27931470, 0.992507}, - {28420980, 0.995005}, - {28910490, 0.997502}, - {29400000, 1.000000}, +dist_point_gen::weight dist_point_gen::w5[] = { + {1430, 0.025000000000}, + {2860, 0.025000000000}, + {4290, 0.025000000000}, + {5720, 0.025000000000}, + {7150, 0.025000000000}, + {8580, 0.025000000000}, + {10010, 0.007143000000}, + {11440, 0.007143000000}, + {13230, 0.007143000000}, + {14700, 0.007142000000}, + {16170, 0.007143000000}, + {17640, 0.007143000000}, + {19110, 0.007143000000}, + {20580, 0.016667000000}, + {22050, 0.016666000000}, + {23520, 0.016667000000}, + {24990, 0.016667000000}, + {26460, 0.016666000000}, + {27930, 0.016667000000}, + {29400, 0.007143000000}, + {30870, 0.007143000000}, + {32340, 0.007143000000}, + {33810, 0.007142000000}, + {35280, 0.007143000000}, + {36750, 0.007143000000}, + {38220, 0.007143000000}, + {39690, 0.007143000000}, + {41160, 0.007143000000}, + {42630, 0.007143000000}, + {44100, 0.007142000000}, + {45570, 0.007143000000}, + {47040, 0.007143000000}, + {48510, 0.007143000000}, + {49980, 0.006500000000}, + {51450, 0.006500000000}, + {52920, 0.006500000000}, + {54390, 0.006500000000}, + {55860, 0.006500000000}, + {57330, 0.006500000000}, + {58800, 0.006500000000}, + 
{60270, 0.006500000000}, + {61740, 0.006500000000}, + {63210, 0.006500000000}, + {64680, 0.006500000000}, + {66150, 0.006500000000}, + {67620, 0.006500000000}, + {69090, 0.006500000000}, + {70560, 0.006500000000}, + {72030, 0.006500000000}, + {73500, 0.006500000000}, + {74970, 0.006500000000}, + {76440, 0.006500000000}, + {77910, 0.006500000000}, + {79380, 0.000875000000}, + {80850, 0.000875000000}, + {82320, 0.000875000000}, + {83790, 0.000875000000}, + {85260, 0.000875000000}, + {86730, 0.000875000000}, + {88200, 0.000875000000}, + {89670, 0.000875000000}, + {91140, 0.000875000000}, + {92610, 0.000875000000}, + {94080, 0.000875000000}, + {95550, 0.000875000000}, + {97020, 0.000875000000}, + {98490, 0.000875000000}, + {99960, 0.000875000000}, + {101430, 0.000875000000}, + {102900, 0.000875000000}, + {104370, 0.000875000000}, + {105840, 0.000875000000}, + {107310, 0.000875000000}, + {108780, 0.000875000000}, + {110250, 0.000875000000}, + {111720, 0.000875000000}, + {113190, 0.000875000000}, + {114660, 0.000875000000}, + {116130, 0.000875000000}, + {117600, 0.000875000000}, + {119070, 0.000875000000}, + {120540, 0.000875000000}, + {122010, 0.000875000000}, + {123480, 0.000875000000}, + {124950, 0.000875000000}, + {126420, 0.000875000000}, + {127890, 0.000875000000}, + {129360, 0.000875000000}, + {130830, 0.000875000000}, + {132300, 0.000875000000}, + {133770, 0.000875000000}, + {135240, 0.000875000000}, + {136710, 0.000875000000}, + {138180, 0.000875000000}, + {139650, 0.000875000000}, + {141120, 0.000875000000}, + {142590, 0.000875000000}, + {144060, 0.000875000000}, + {145530, 0.000875000000}, + {147000, 0.000875000000}, + {148470, 0.000875000000}, + {149940, 0.000875000000}, + {151410, 0.000875000000}, + {152880, 0.000875000000}, + {154350, 0.000875000000}, + {155820, 0.000875000000}, + {157290, 0.000875000000}, + {158760, 0.000875000000}, + {160230, 0.000875000000}, + {161700, 0.000875000000}, + {163170, 0.000875000000}, + {164640, 0.000875000000}, + {166110, 0.000875000000}, + {167580, 0.000875000000}, + {169050, 0.000875000000}, + {170520, 0.000875000000}, + {171990, 0.000875000000}, + {173460, 0.000875000000}, + {174930, 0.000875000000}, + {176400, 0.000875000000}, + {177870, 0.000875000000}, + {179340, 0.000875000000}, + {180810, 0.000875000000}, + {182280, 0.000875000000}, + {183750, 0.000875000000}, + {185220, 0.000875000000}, + {186690, 0.000875000000}, + {188160, 0.000875000000}, + {189630, 0.000875000000}, + {191100, 0.000875000000}, + {192570, 0.000875000000}, + {194040, 0.000875000000}, + {195510, 0.000875000000}, + {196980, 0.000187000000}, + {198450, 0.000188000000}, + {199920, 0.000187000000}, + {201390, 0.000187000000}, + {202860, 0.000187000000}, + {204330, 0.000188000000}, + {205800, 0.000187000000}, + {207270, 0.000187000000}, + {208740, 0.000187000000}, + {210210, 0.000188000000}, + {211680, 0.000187000000}, + {213150, 0.000187000000}, + {214620, 0.000187000000}, + {216090, 0.000188000000}, + {217560, 0.000187000000}, + {219030, 0.000187000000}, + {220500, 0.000188000000}, + {221970, 0.000187000000}, + {223440, 0.000187000000}, + {224910, 0.000187000000}, + {226380, 0.000188000000}, + {227850, 0.000187000000}, + {229320, 0.000187000000}, + {230790, 0.000187000000}, + {232260, 0.000188000000}, + {233730, 0.000187000000}, + {235200, 0.000187000000}, + {236670, 0.000187000000}, + {238140, 0.000188000000}, + {239610, 0.000187000000}, + {241080, 0.000187000000}, + {242550, 0.000188000000}, + {244020, 0.000187000000}, + {245490, 0.000187000000}, + {246960, 0.000187000000}, 
+ {248430, 0.000188000000}, + {249900, 0.000187000000}, + {251370, 0.000187000000}, + {252840, 0.000187000000}, + {254310, 0.000188000000}, + {255780, 0.000187000000}, + {257250, 0.000187000000}, + {258720, 0.000187000000}, + {260190, 0.000188000000}, + {261660, 0.000187000000}, + {263130, 0.000187000000}, + {264600, 0.000187000000}, + {266070, 0.000188000000}, + {267540, 0.000187000000}, + {269010, 0.000187000000}, + {270480, 0.000188000000}, + {271950, 0.000187000000}, + {273420, 0.000187000000}, + {274890, 0.000187000000}, + {276360, 0.000188000000}, + {277830, 0.000187000000}, + {279300, 0.000187000000}, + {280770, 0.000187000000}, + {282240, 0.000188000000}, + {283710, 0.000187000000}, + {285180, 0.000187000000}, + {286650, 0.000187000000}, + {288120, 0.000188000000}, + {289590, 0.000187000000}, + {291060, 0.000187000000}, + {292530, 0.000188000000}, + {294000, 0.000187000000}, + {295470, 0.000187000000}, + {296940, 0.000187000000}, + {298410, 0.000188000000}, + {299880, 0.000187000000}, + {301350, 0.000187000000}, + {302820, 0.000187000000}, + {304290, 0.000188000000}, + {305760, 0.000187000000}, + {307230, 0.000187000000}, + {308700, 0.000187000000}, + {310170, 0.000188000000}, + {311640, 0.000187000000}, + {313110, 0.000187000000}, + {314580, 0.000188000000}, + {316050, 0.000187000000}, + {317520, 0.000187000000}, + {318990, 0.000187000000}, + {320460, 0.000188000000}, + {321930, 0.000187000000}, + {323400, 0.000187000000}, + {324870, 0.000187000000}, + {326340, 0.000188000000}, + {327810, 0.000187000000}, + {329280, 0.000187000000}, + {330750, 0.000187000000}, + {332220, 0.000188000000}, + {333690, 0.000187000000}, + {335160, 0.000187000000}, + {336630, 0.000188000000}, + {338100, 0.000187000000}, + {339570, 0.000187000000}, + {341040, 0.000187000000}, + {342510, 0.000188000000}, + {343980, 0.000187000000}, + {345450, 0.000187000000}, + {346920, 0.000187000000}, + {348390, 0.000188000000}, + {349860, 0.000187000000}, + {351330, 0.000187000000}, + {352800, 0.000187000000}, + {354270, 0.000188000000}, + {355740, 0.000187000000}, + {357210, 0.000187000000}, + {358680, 0.000188000000}, + {360150, 0.000187000000}, + {361620, 0.000187000000}, + {363090, 0.000187000000}, + {364560, 0.000188000000}, + {373380, 0.001123000000}, + {392490, 0.002435000000}, + {411600, 0.002434000000}, + {432180, 0.002622000000}, + {451290, 0.002434000000}, + {470400, 0.002435000000}, + {490980, 0.002621000000}, + {510090, 0.002435000000}, + {529200, 0.002434000000}, + {549780, 0.002622000000}, + {568890, 0.002435000000}, + {588000, 0.002434000000}, + {608580, 0.002622000000}, + {627690, 0.002434000000}, + {648270, 0.002622000000}, + {667380, 0.002434000000}, + {686490, 0.002435000000}, + {707070, 0.002622000000}, + {726180, 0.002434000000}, + {745290, 0.002434000000}, + {765870, 0.002622000000}, + {784980, 0.002435000000}, + {804090, 0.002434000000}, + {824670, 0.002622000000}, + {843780, 0.002434000000}, + {862890, 0.002435000000}, + {883470, 0.002621000000}, + {902580, 0.002435000000}, + {921690, 0.002434000000}, + {942270, 0.002622000000}, + {961380, 0.002435000000}, + {980490, 0.002434000000}, + {1005480, 0.002553000000}, + {1030470, 0.002552000000}, + {1053990, 0.002403000000}, + {1078980, 0.002552000000}, + {1103970, 0.002553000000}, + {1127490, 0.002402000000}, + {1152480, 0.002553000000}, + {1177470, 0.002552000000}, + {1200990, 0.002403000000}, + {1225980, 0.002552000000}, + {1250970, 0.002553000000}, + {1274490, 0.002402000000}, + {1299480, 0.002553000000}, + {1324470, 0.002552000000}, + {1347990, 
0.002403000000}, + {1372980, 0.002552000000}, + {1397970, 0.002553000000}, + {1421490, 0.002402000000}, + {1446480, 0.002553000000}, + {1470000, 0.002402000000}, + {1494990, 0.002553000000}, + {1519980, 0.002552000000}, + {1543500, 0.002403000000}, + {1568490, 0.002552000000}, + {1593480, 0.002553000000}, + {1617000, 0.002402000000}, + {1641990, 0.002553000000}, + {1666980, 0.002552000000}, + {1690500, 0.002403000000}, + {1715490, 0.002552000000}, + {1740480, 0.002553000000}, + {1764000, 0.002402000000}, + {1788990, 0.002553000000}, + {1813980, 0.002552000000}, + {1837500, 0.002403000000}, + {1862490, 0.002552000000}, + {1887480, 0.002553000000}, + {1911000, 0.002402000000}, + {1935990, 0.002553000000}, + {1959510, 0.002402000000}, + {2033010, 0.002500000000}, + {2106510, 0.002500000000}, + {2180010, 0.002500000000}, + {2253510, 0.002500000000}, + {2327010, 0.002500000000}, + {2400510, 0.002500000000}, + {2474010, 0.002500000000}, + {2547510, 0.002500000000}, + {2621010, 0.002500000000}, + {2694510, 0.002500000000}, + {2768010, 0.002500000000}, + {2841510, 0.002500000000}, + {2915010, 0.002500000000}, + {2988510, 0.002500000000}, + {3062010, 0.002500000000}, + {3135510, 0.002500000000}, + {3209010, 0.002500000000}, + {3282510, 0.002500000000}, + {3356010, 0.002500000000}, + {3429510, 0.002500000000}, + {3503010, 0.002500000000}, + {3576510, 0.002500000000}, + {3650010, 0.002500000000}, + {3723510, 0.002500000000}, + {3797010, 0.002500000000}, + {3870510, 0.002500000000}, + {3944010, 0.002500000000}, + {4017510, 0.002500000000}, + {4091010, 0.002500000000}, + {4164510, 0.002500000000}, + {4238010, 0.002500000000}, + {4311510, 0.002500000000}, + {4385010, 0.002500000000}, + {4458510, 0.002500000000}, + {4532010, 0.002500000000}, + {4605510, 0.002500000000}, + {4679010, 0.002500000000}, + {4752510, 0.002500000000}, + {4826010, 0.002500000000}, + {4899510, 0.002500000000}, + {10291470, 0.002505000000}, + {10780980, 0.002498000000}, + {11270490, 0.002497000000}, + {11761470, 0.002505000000}, + {12250980, 0.002498000000}, + {12740490, 0.002497000000}, + {13231470, 0.002505000000}, + {13720980, 0.002498000000}, + {14210490, 0.002498000000}, + {14701470, 0.002505000000}, + {15190980, 0.002497000000}, + {15680490, 0.002498000000}, + {16171470, 0.002505000000}, + {16660980, 0.002497000000}, + {17150490, 0.002498000000}, + {17641470, 0.002505000000}, + {18130980, 0.002498000000}, + {18620490, 0.002497000000}, + {19111470, 0.002505000000}, + {19600980, 0.002498000000}, + {20090490, 0.002497000000}, + {20581470, 0.002505000000}, + {21070980, 0.002498000000}, + {21560490, 0.002498000000}, + {22051470, 0.002505000000}, + {22540980, 0.002497000000}, + {23030490, 0.002498000000}, + {23521470, 0.002505000000}, + {24010980, 0.002497000000}, + {24500490, 0.002498000000}, + {24991470, 0.002505000000}, + {25480980, 0.002498000000}, + {25970490, 0.002497000000}, + {26461470, 0.002505000000}, + {26950980, 0.002498000000}, + {27440490, 0.002497000000}, + {27931470, 0.002505000000}, + {28420980, 0.002498000000}, + {28910490, 0.002497000000}, + {29400000, 0.002498000000}, + {0, 0}, }; diff --git a/util/dist.h b/util/dist.h index 6557f1a9..ae17038b 100644 --- a/util/dist.h +++ b/util/dist.h @@ -60,5 +60,25 @@ class dist_point_gen { std::uniform_real_distribution uniform_dist; static int dist_msg_overhead(int length, int mtu); + + /** + * Used for entering raw data (weights instead of CDF, this is usually + * more convenient when entering distributions by hand). 
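+ * For example, a hypothetical two-point table (not one of the + * built-in workloads) might look like this: + * + * dist_point_gen::weight example[] = { + * {1000, 2.0}, // 1000-byte msgs, twice as frequent + * {5000, 1.0}, // as 5000-byte msgs + * {0, 0}, // tables end with a {0, 0} sentinel + * }; + * + * Weights are relative frequencies, so they need not sum to 1.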
+ */ + struct weight { + /** @length: message length, in bytes. */ + size_t length; + + /** + * @freq: relative frequency of messages of this length. + */ + double freq; + + weight(size_t length, double freq) + : length(length), freq(freq) + {} + }; + + static struct weight w1[], w2[], w3[], w4[], w5[]; }; #endif /* _DIST_H */ From e3cce741fed9f95709dfea810ffd7c2fa0d564be Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 10 Jun 2025 16:12:36 -0700 Subject: [PATCH 363/625] Fix 2 bugs in homa_grant_check_rpc * Was issuing grants (at high priority!) to RPCs with rank -1. * If needy RPCs didn't get checked because total_incoming was maxed, left stalled_rank in the reset position, losing information about the stalled RPCs. --- homa_grant.c | 8 +++- test/unit_homa_grant.c | 96 ++++++++++++++++++++++++++++++++---------- 2 files changed, 79 insertions(+), 25 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 93436973..983a32cd 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -741,11 +741,11 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) } rank = READ_ONCE(rpc->msgin.rank); - stalled_rank = atomic_xchg(&grant->stalled_rank, INT_MAX); + stalled_rank = atomic_read(&grant->stalled_rank); if (stalled_rank < needy_rank) needy_rank = stalled_rank; - if (rank <= needy_rank) { + if (rank >= 0 && rank <= needy_rank) { int priority; /* Fast path. */ @@ -771,7 +771,11 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) if (needy_rank < INT_MAX && atomic_read(&grant->total_incoming) < grant->max_incoming) { + UNIT_HOOK("grant_check_needy"); /* Situations 1 and 2. */ + stalled_rank = atomic_xchg(&grant->stalled_rank, INT_MAX); + if (stalled_rank < needy_rank) + needy_rank = stalled_rank; homa_grant_cand_init(&cand); locked = 1; tt_record3("homa_grant_check_rpc acquiring grant lock, needy_rank %d, id %d, num_active %d", diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 12841769..a590dac0 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -37,6 +37,14 @@ static void grant_spinlock_hook(char *id) hook_spinlock_count++; } +static struct homa_grant *hook_grant; +static void grant_check_stalled_hook(char *id) +{ + if (strcmp(id, "grant_check_needy") != 0) + return; + atomic_dec(&hook_grant->stalled_rank); +} + FIXTURE(homa_grant) { struct in6_addr client_ip[5]; int client_port; @@ -1077,6 +1085,47 @@ TEST_F(homa_grant, homa_grant_check_rpc__fast_path) EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_locked); EXPECT_EQ(10000, rpc->msgin.granted); } +TEST_F(homa_grant, homa_grant_check_rpc__skip_fast_path_rpc_not_active) +{ + struct homa_rpc *rpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, + self->client_ip, self->server_ip, self->server_port, + 100, 1000, 20000); + + homa_message_in_init(rpc, 20000, 0); + EXPECT_EQ(0, rpc->msgin.rank); + rpc->msgin.rank = -1; + + unit_log_clear(); + homa_rpc_lock(rpc); + + homa_grant_check_rpc(rpc); + homa_rpc_unlock(rpc); + EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(0, rpc->msgin.granted); +} +TEST_F(homa_grant, homa_grant_check_rpc__skip_fast_path_because_of_stalled_rpc) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3; + + rpc1 = test_rpc_init(self, 100, self->server_ip, 20000); + rpc2 = test_rpc_init(self, 102, self->server_ip, 30000); + rpc3 = test_rpc_init(self, 104, self->server_ip, 40000); + atomic_set(&self->homa.grant->total_incoming, + self->homa.grant->max_incoming - 15000); + + unit_log_clear(); + atomic_set(&self->homa.grant->stalled_rank, 1); + homa_rpc_lock(rpc3); + homa_grant_check_rpc(rpc3); + homa_rpc_unlock(rpc3); + 
EXPECT_STREQ("xmit GRANT 10000@1; xmit GRANT 5000@0", unit_log_get()); + EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_locked); + EXPECT_EQ(2, atomic_read(&self->homa.grant->stalled_rank)); + EXPECT_EQ(0, rpc1->msgin.granted); + EXPECT_EQ(10000, rpc2->msgin.granted); + EXPECT_EQ(5000, rpc3->msgin.granted); + EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_others); +} TEST_F(homa_grant, homa_grant_check_rpc__fast_path_grants_to_end_of_message) { struct homa_rpc *rpc = test_rpc_init(self, 100, self->server_ip, 6000); @@ -1115,29 +1164,6 @@ TEST_F(homa_grant, homa_grant_check_rpc__fast_path_promote_other_message) EXPECT_STREQ("active[0]: id 102 ungranted 15000", unit_log_get()); EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_locked); } -TEST_F(homa_grant, homa_grant_check_rpc__skip_fast_path_because_of_stalled_rank) -{ - struct homa_rpc *rpc1, *rpc2, *rpc3; - - rpc1 = test_rpc_init(self, 100, self->server_ip, 20000); - rpc2 = test_rpc_init(self, 102, self->server_ip, 30000); - rpc3 = test_rpc_init(self, 104, self->server_ip, 40000); - atomic_set(&self->homa.grant->total_incoming, - self->homa.grant->max_incoming - 15000); - - unit_log_clear(); - atomic_set(&self->homa.grant->stalled_rank, 1); - homa_rpc_lock(rpc3); - homa_grant_check_rpc(rpc3); - homa_rpc_unlock(rpc3); - EXPECT_STREQ("xmit GRANT 10000@1; xmit GRANT 5000@0", unit_log_get()); - EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_locked); - EXPECT_EQ(2, atomic_read(&self->homa.grant->stalled_rank)); - EXPECT_EQ(0, rpc1->msgin.granted); - EXPECT_EQ(10000, rpc2->msgin.granted); - EXPECT_EQ(5000, rpc3->msgin.granted); - EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_others); -} TEST_F(homa_grant, homa_grant_check_rpc__dont_check_needy_if_incoming_maxed) { struct homa_rpc *rpc; @@ -1157,6 +1183,30 @@ TEST_F(homa_grant, homa_grant_check_rpc__dont_check_needy_if_incoming_maxed) EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_locked); EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_others); } +TEST_F(homa_grant, homa_grant_check_rpc__reread_stalled_rank_before_checking_needy) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3; + + rpc1 = test_rpc_init(self, 100, self->server_ip, 20000); + rpc2 = test_rpc_init(self, 102, self->server_ip, 30000); + rpc3 = test_rpc_init(self, 104, self->server_ip, 40000); + atomic_set(&self->homa.grant->total_incoming, + self->homa.grant->max_incoming - 5000); + + unit_hook_register(grant_check_stalled_hook); + hook_grant = self->homa.grant; + + unit_log_clear(); + atomic_set(&self->homa.grant->stalled_rank, 1); + homa_rpc_lock(rpc3); + homa_grant_check_rpc(rpc3); + homa_rpc_unlock(rpc3); + EXPECT_EQ(1, homa_metrics_per_cpu()->grant_check_others); + EXPECT_EQ(0, atomic_read(&self->homa.grant->stalled_rank)); + EXPECT_EQ(5000, rpc1->msgin.granted); + EXPECT_EQ(0, rpc2->msgin.granted); + EXPECT_EQ(0, rpc3->msgin.granted); +} TEST_F(homa_grant, homa_grant_check_rpc__skip_rpc_with_too_much_incoming) { struct homa_rpc *rpc2, *rpc3; From 9e71ef350eba3bc8188e116e358691e52747ea64 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 11 Jun 2025 10:31:38 -0700 Subject: [PATCH 364/625] Fix bugs in tthoma.py --- util/tthoma.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index 95f046ee..aa8402c6 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -1358,7 +1358,7 @@ def __resend_tx(self, trace, time, core, match, interests): patterns.append({ 'name': 'resend_tx', 'regexp': 'Sending RESEND for id ([0-9]+), peer ([^,]+), ' - 'offset ([0-9]+), length ([0-9]+)' 
+ 'offset ([0-9]+), length ([-0-9]+)' }) def __resend_rx(self, trace, time, core, match, interests): @@ -1371,7 +1371,7 @@ def __resend_rx(self, trace, time, core, match, interests): patterns.append({ 'name': 'resend_rx', 'regexp': 'resend request for id ([0-9]+), offset ([0-9]+), ' - 'length ([0-9]+)' + 'length ([-0-9]+)' }) def __retransmit(self, trace, time, core, match, interests): @@ -4349,7 +4349,8 @@ def analyze(self): # rx_grantable in_length = rpc['in_length'] if rpc['send_grant'] or (('unsched' in rpc) and (in_length != None) - and (in_length > rpc['unsched'])): + and (in_length > rpc['unsched']) or (('granted' in rpc) + and (in_length != None) and (rpc['granted'] < in_length))): start = traces[rpc['node']]['first_time'] if rpc['softirq_data_pkts']: start = rpc['softirq_data_pkts'][0]['softirq'] From 477c81f580edee6bcbc08d9f576b30ed19aa733f Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 11 Jun 2025 10:32:14 -0700 Subject: [PATCH 365/625] Create "starve" workload in dist.cc --- util/dist.cc | 21 ++++++++++++++++++++- util/dist.h | 2 +- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/util/dist.cc b/util/dist.cc index eaa0f0ee..68c26ada 100644 --- a/util/dist.cc +++ b/util/dist.cc @@ -59,9 +59,11 @@ dist_point_gen::dist_point_gen(const char* dist, size_t max_length, points = w4; } else if (strcmp(dist, "w5") == 0) { points = w5; + } else if (strcmp(dist, "starve") == 0) { + points = starve; } else { fprintf(stderr, "Invalid workload %s; must be w1, " - "w2, w3, w4, w5, or a number\n", dist); + "w2, w3, w4, w5, starve, or a number\n", dist); abort(); } @@ -2395,3 +2397,20 @@ dist_point_gen::weight dist_point_gen::w5[] = { {29400000, 0.002498000000}, {0, 0}, }; + +/* The distribution below is not representative of any real workload; it + * is intended to maximize the likelihood that large requests starve. Run + * it at 100% network load for 30 seconds or more. 
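+ * All ten lengths (100 KB through 1 MB) carry equal weight; since Homa + * grants preferentially to messages with the fewest bytes remaining, the + * longer messages in this mix should consistently lose grant priority to + * shorter ones.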
+ */ +dist_point_gen::weight dist_point_gen::starve[] = { + {100000, 100}, + {200000, 100}, + {300000, 100}, + {400000, 100}, + {500000, 100}, + {600000, 100}, + {700000, 100}, + {800000, 100}, + {900000, 100}, + {1000000, 100}, + {0, 0}, +}; diff --git a/util/dist.h b/util/dist.h index ae17038b..15e8c7dd 100644 --- a/util/dist.h +++ b/util/dist.h @@ -79,6 +79,6 @@ class dist_point_gen { {} }; - static struct weight w1[], w2[], w3[], w4[], w5[]; + static struct weight w1[], w2[], w3[], w4[], w5[], starve[]; }; #endif /* _DIST_H */ From 28d81d0341d05583449c19d54e8e183e68eb82b3 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 13 Jun 2025 13:52:28 -0700 Subject: [PATCH 366/625] Cleanup issues from net-next code review * Remove unused variables * Fix spelling errors * Fix reverse-Xmas-tree violations * Use kzalloc instead of kmalloc(__GFP_ZERO) * Don't log error messages after memory allocation failures --- homa_grant.c | 10 ++++------ homa_incoming.c | 14 ++++---------- homa_metrics.c | 4 +--- homa_offload.c | 2 +- homa_outgoing.c | 13 ++----------- homa_pacer.c | 12 +++++------- homa_peer.c | 8 +++----- homa_peer.h | 6 +++--- homa_plumbing.c | 18 +++++++----------- homa_pool.c | 4 ++-- homa_rpc.c | 8 ++++---- homa_skb.c | 4 ++-- homa_sock.h | 2 +- homa_timer.c | 16 +++++----------- homa_utils.c | 5 +---- test/unit_homa_utils.c | 8 -------- timetrace.c | 4 +--- 17 files changed, 46 insertions(+), 92 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 983a32cd..04f719ae 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -86,11 +86,9 @@ struct homa_grant *homa_grant_alloc(void) struct homa_grant *grant; int err; - grant = kmalloc(sizeof(*grant), GFP_KERNEL | __GFP_ZERO); - if (!grant) { - pr_err("%s couldn't allocate grant structure\n", __func__); + grant = kzalloc(sizeof(*grant), GFP_KERNEL); + if (!grant) return ERR_PTR(-ENOMEM); - } atomic_set(&grant->stalled_rank, INT_MAX); grant->max_incoming = 400000; spin_lock_init(&grant->lock); @@ -999,8 +997,8 @@ void homa_grant_cand_check(struct homa_grant_candidates *cand, struct homa_grant *grant) { struct homa_rpc *rpc; - bool locked; int priority; + bool locked; while (cand->removes < cand->inserts) { rpc = cand->rpcs[cand->removes & HOMA_CAND_MASK]; @@ -1084,8 +1082,8 @@ void homa_grant_update_sysctl_deps(struct homa_grant *grant) int homa_grant_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct homa_grant *grant; struct ctl_table table_copy; + struct homa_grant *grant; int result; grant = homa_net_from_net(current->nsproxy->net_ns)->homa->grant; diff --git a/homa_incoming.c b/homa_incoming.c index 6b7918bd..9ac167bc 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -177,7 +177,6 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) /* Packet creates a new gap. */ if (!homa_gap_alloc(&rpc->msgin.gaps, rpc->msgin.recv_end, start)) { - pr_err("Homa couldn't allocate gap: insufficient memory\n"); tt_record2("Couldn't allocate gap for id %d (start %d): no memory", rpc->id, start); goto discard; @@ -230,7 +229,6 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) /* Packet is in the middle of the gap; must split the gap. 
*/ gap2 = homa_gap_alloc(&gap->links, gap->start, start); if (!gap2) { - pr_err("Homa couldn't allocate gap for split: insufficient memory\n"); tt_record2("Couldn't allocate gap for split for id %d (start %d): no memory", rpc->id, end); goto discard; @@ -286,10 +284,10 @@ int homa_copy_to_user(struct homa_rpc *rpc) int end_offset = 0; #endif /* See strip.py */ int error = 0; + int n = 0; /* Number of filled entries in skbs. */ #ifndef __STRIP__ /* See strip.py */ u64 start; #endif /* See strip.py */ - int n = 0; /* Number of filled entries in skbs. */ int i; /* Tricky note: we can't hold the RPC lock while we're actually @@ -509,8 +507,6 @@ void homa_dispatch_pkts(struct sk_buff *skb) h, &created); if (IS_ERR(rpc)) { - pr_warn("homa_pkt_dispatch couldn't create server rpc: error %lu", - -PTR_ERR(rpc)); INC_METRIC(server_cant_create_rpcs, 1); rpc = NULL; goto discard; @@ -966,8 +962,8 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, struct homa_common_hdr *h = (struct homa_common_hdr *)skb->data; const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); u64 id = homa_local_id(h->sender_id); - struct homa_peer *peer; struct homa_ack_hdr ack; + struct homa_peer *peer; tt_record1("Received NEED_ACK for id %d", id); @@ -1147,12 +1143,10 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) */ struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking) { + IF_NO_STRIP(int avail_immediately = 1); struct homa_interest interest; + IF_NO_STRIP(int blocked = 0); struct homa_rpc *rpc; -#ifndef __STRIP__ /* See strip.py */ - int avail_immediately = 1; - int blocked = 0; -#endif /* See strip.py */ int result; INIT_LIST_HEAD(&interest.links); diff --git a/homa_metrics.c b/homa_metrics.c index 484d5717..1645caa4 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -93,10 +93,8 @@ void homa_metric_append(const char *format, ...) /* Not enough room; expand buffer capacity. 
*/ homa_mout.capacity *= 2; new_buffer = kmalloc(homa_mout.capacity, GFP_KERNEL); - if (!new_buffer) { - pr_warn("%s couldn't allocate memory\n", __func__); + if (!new_buffer) return; - } memcpy(new_buffer, homa_mout.output, homa_mout.length); kfree(homa_mout.output); homa_mout.output = new_buffer; diff --git a/homa_offload.c b/homa_offload.c index d75cf263..e5bf9aea 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -506,8 +506,8 @@ void homa_gro_gen2(struct homa *homa, struct sk_buff *skb) */ struct homa_data_hdr *h = (struct homa_data_hdr *)skb_transport_header(skb); - int this_core = smp_processor_id(); struct homa_offload_core *offload_core; + int this_core = smp_processor_id(); int candidate = this_core; u64 now = homa_clock(); int i; diff --git a/homa_outgoing.c b/homa_outgoing.c index 34d70ea9..c2fc833b 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -449,15 +449,11 @@ int homa_xmit_control(enum homa_packet_type type, void *contents, int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, struct homa_sock *hsk) { -#ifndef __STRIP__ /* See strip.py */ - struct netdev_queue *txq; -#endif /* See strip.py */ + IF_NO_STRIP(struct netdev_queue *txq); + IF_NO_STRIP(int priority); struct homa_common_hdr *h; struct sk_buff *skb; int extra_bytes; -#ifndef __STRIP__ /* See strip.py */ - int priority; -#endif /* See strip.py */ int result; skb = homa_skb_alloc_tx(HOMA_MAX_HEADER); @@ -816,11 +812,6 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end) + seg_length); #endif /* See strip.py */ if (unlikely(!new_skb)) { -#ifndef __STRIP__ /* See strip.py */ - if (rpc->hsk->homa->verbose) - pr_notice("%s couldn't allocate skb\n", - __func__); -#endif /* See strip.py */ UNIT_LOG("; ", "skb allocation error"); goto resend_done; } diff --git a/homa_pacer.c b/homa_pacer.c index dbcd1a93..67f75a21 100644 --- a/homa_pacer.c +++ b/homa_pacer.c @@ -57,11 +57,9 @@ struct homa_pacer *homa_pacer_alloc(struct homa *homa) struct homa_pacer *pacer; int err; - pacer = kmalloc(sizeof(*pacer), GFP_KERNEL | __GFP_ZERO); - if (!pacer) { - pr_err("%s couldn't allocate homa_pacer struct\n", __func__); + pacer = kzalloc(sizeof(*pacer), GFP_KERNEL); + if (!pacer) return ERR_PTR(-ENOMEM); - } pacer->homa = homa; spin_lock_init(&pacer->mutex); pacer->fifo_count = 1000; @@ -335,7 +333,7 @@ void homa_pacer_manage_rpc(struct homa_rpc *rpc) struct homa_pacer *pacer = rpc->hsk->homa->pacer; struct homa_rpc *candidate; int bytes_left; - int checks = 0; + IF_NO_STRIP(int checks = 0); IF_NO_STRIP(u64 now); if (!list_empty(&rpc->throttled_links)) @@ -352,7 +350,7 @@ void homa_pacer_manage_rpc(struct homa_rpc *rpc) throttled_links) { int bytes_left_cand; - checks++; + IF_NO_STRIP(checks++); /* Watch out: the pacer might have just transmitted the last * packet from candidate. 
@@ -431,8 +429,8 @@ void homa_pacer_update_sysctl_deps(struct homa_pacer *pacer) int homa_pacer_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct homa_pacer *pacer; struct ctl_table table_copy; + struct homa_pacer *pacer; int result; pacer = homa_net_from_net(current->nsproxy->net_ns)->homa->pacer; diff --git a/homa_peer.c b/homa_peer.c index d1676220..d2feb336 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -77,11 +77,9 @@ struct homa_peertab *homa_peer_alloc_peertab(void) struct homa_peertab *peertab; int err; - peertab = kmalloc(sizeof(*peertab), GFP_KERNEL | __GFP_ZERO); - if (!peertab) { - pr_err("%s couldn't create peertab: kmalloc failure", __func__); + peertab = kzalloc(sizeof(*peertab), GFP_KERNEL); + if (!peertab) return ERR_PTR(-ENOMEM); - } spin_lock_init(&peertab->lock); err = rhashtable_init(&peertab->ht, &ht_params); @@ -432,7 +430,7 @@ struct homa_peer *homa_peer_alloc(struct homa_sock *hsk, struct homa_peer *peer; struct dst_entry *dst; - peer = kmalloc(sizeof(*peer), GFP_ATOMIC | __GFP_ZERO); + peer = kzalloc(sizeof(*peer), GFP_ATOMIC); if (!peer) { INC_METRIC(peer_kmalloc_errors, 1); return (struct homa_peer *)ERR_PTR(-ENOMEM); } diff --git a/homa_peer.h b/homa_peer.h index 5ac76899..cec167d3 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -73,7 +73,7 @@ struct homa_peertab { /** * @net_max: If the number of peers for a homa_net exceeds this number, - * work aggressivley to reclaim peers for that homa_net. Set + * work aggressively to reclaim peers for that homa_net. Set * externally via sysctl. */ int net_max; @@ -429,7 +429,7 @@ static inline u32 homa_peer_hash(const void *data, u32 dummy, u32 seed) return h; } /** * homa_peer_compare() - Comparison function for entries in @peertab->ht. * @arg: Contains one of the keys to compare. * @obj: homa_peer object containing the other key to compare. 
@@ -438,8 +438,8 @@ static inline u32 homa_peer_hash(const void *data, u32 dummy, u32 seed) static inline int homa_peer_compare(struct rhashtable_compare_arg *arg, const void *obj) { - const struct homa_peer *peer = obj; const struct homa_peer_key *key = arg->key; + const struct homa_peer *peer = obj; return !(ipv6_addr_equal(&key->addr, &peer->ht_key.addr) && peer->ht_key.hnet == key->hnet); diff --git a/homa_plumbing.c b/homa_plumbing.c index e16747df..071ff722 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -593,7 +593,7 @@ int __init homa_load(void) timer_kthread = kthread_run(homa_timer_main, homa, "homa_timer"); if (IS_ERR(timer_kthread)) { status = PTR_ERR(timer_kthread); - pr_err("couldn't create Homa pacer thread: error %d\n", + pr_err("couldn't create Homa timer thread: error %d\n", status); timer_kthread = NULL; goto timer_err; @@ -768,10 +768,10 @@ int homa_shutdown(struct socket *sock, int how) */ int homa_ioc_abort(struct sock *sk, int *karg) { - int ret = 0; struct homa_sock *hsk = homa_sk(sk); struct homa_abort_args args; struct homa_rpc *rpc; + int ret = 0; if (unlikely(copy_from_user(&args, (void __user *)karg, sizeof(args)))) return -EFAULT; @@ -807,8 +807,8 @@ int homa_ioc_abort(struct sock *sk, int *karg) int homa_ioctl(struct sock *sk, int cmd, int *karg) { #ifndef __STRIP__ /* See strip.py */ - int result; u64 start = homa_clock(); + int result; if (cmd == HOMAIOCABORT) { result = homa_ioc_abort(sk, karg); @@ -927,8 +927,8 @@ int homa_getsockopt(struct sock *sk, int level, int optname, { struct homa_sock *hsk = homa_sk(sk); struct homa_rcvbuf_args rcvbuf_args; - void *result; int is_server; + void *result; int len; if (copy_from_sockptr(&len, USER_SOCKPTR(optlen), sizeof(int))) @@ -975,12 +975,10 @@ int homa_getsockopt(struct sock *sk, int level, int optname, */ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) { + IF_NO_STRIP(u64 start = homa_clock()); struct homa_sock *hsk = homa_sk(sk); struct homa_sendmsg_args args; union sockaddr_in_union *addr; -#ifndef __STRIP__ /* See strip.py */ - u64 start = homa_clock(); -#endif /* See strip.py */ struct homa_rpc *rpc = NULL; int result = 0; #ifndef __STRIP__ /* See strip.py */ @@ -1139,14 +1137,12 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { + IF_NO_STRIP(u64 start = homa_clock()); struct homa_sock *hsk = homa_sk(sk); struct homa_recvmsg_args control; + IF_NO_STRIP(u64 finish); struct homa_rpc *rpc; int nonblocking; -#ifndef __STRIP__ /* See strip.py */ - u64 start = homa_clock(); - u64 finish; -#endif /* See strip.py */ int result; INC_METRIC(recv_calls, 1); diff --git a/homa_pool.c b/homa_pool.c index 901d30cc..6f3e57be 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -87,7 +87,7 @@ int homa_pool_set_region(struct homa_sock *hsk, void __user *region, if (num_bpages < MIN_POOL_SIZE) return -EINVAL; descriptors = kmalloc_array(num_bpages, sizeof(struct homa_bpage), - __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); if (!descriptors) return -ENOMEM; cores = alloc_percpu_gfp(struct homa_pool_core, __GFP_ZERO); @@ -291,8 +291,8 @@ int homa_pool_alloc_msg(struct homa_rpc *rpc) { struct homa_pool *pool = rpc->hsk->buffer_pool; int full_pages, partial, i, core_id; - u32 pages[HOMA_MAX_BPAGES]; struct homa_pool_core *core; + u32 pages[HOMA_MAX_BPAGES]; struct homa_bpage *bpage; struct homa_rpc *other; diff --git a/homa_rpc.c b/homa_rpc.c index 26c40f67..b3d0285a 100644 --- a/homa_rpc.c +++ 
b/homa_rpc.c @@ -36,7 +36,7 @@ struct homa_rpc *homa_rpc_alloc_client(struct homa_sock *hsk, struct homa_rpc *crpc; int err; - crpc = kmalloc(sizeof(*crpc), GFP_KERNEL | __GFP_ZERO); + crpc = kzalloc(sizeof(*crpc), GFP_KERNEL); if (unlikely(!crpc)) return ERR_PTR(-ENOMEM); @@ -142,7 +142,7 @@ struct homa_rpc *homa_rpc_alloc_server(struct homa_sock *hsk, } /* Initialize fields that don't require the socket lock. */ - srpc = kmalloc(sizeof(*srpc), GFP_ATOMIC | __GFP_ZERO); + srpc = kzalloc(sizeof(*srpc), GFP_ATOMIC); if (!srpc) { err = -ENOMEM; goto error; @@ -361,8 +361,8 @@ void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr, int port, int error) { struct homa_socktab_scan scan; - struct homa_rpc *rpc; struct homa_sock *hsk; + struct homa_rpc *rpc; for (hsk = homa_socktab_start_scan(homa->socktab, &scan); hsk; hsk = homa_socktab_next(&scan)) { @@ -467,8 +467,8 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) struct homa_rpc *tmp; int i, batch_size; int skbs_to_reap; - int rx_frees; int result = 0; + int rx_frees; INC_METRIC(reaper_calls, 1); INC_METRIC(reaper_dead_skbs, hsk->dead_skbs); diff --git a/homa_skb.c b/homa_skb.c index 0817577a..9953b985 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -52,7 +52,7 @@ int homa_skb_init(struct homa *homa) if (!homa->page_pools[numa]) { struct homa_page_pool *pool; - pool = kmalloc(sizeof(*pool), GFP_ATOMIC | __GFP_ZERO); + pool = kzalloc(sizeof(*pool), GFP_ATOMIC); if (!pool) return -ENOMEM; homa->page_pools[numa] = pool; @@ -323,8 +323,8 @@ bool homa_skb_page_alloc(struct homa *homa, struct homa_skb_core *skb_core) int homa_skb_append_to_frag(struct homa *homa, struct sk_buff *skb, void *buf, int length) { - char *src = buf; int chunk_length; + char *src = buf; char *dst; while (length > 0) { diff --git a/homa_sock.h b/homa_sock.h index 447b253c..2e4439f7 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -85,7 +85,7 @@ struct homa_rpc_bucket { * in hundreds or thousands of RPCs accumulating before RCU allows * them to be deleted. * This approach has the disadvantage that RPCs within a bucket share - * locks and thus may not be able to work concurently, but there are + * locks and thus may not be able to work concurrently, but there are * enough buckets in the table to make such colllisions rare. 
* * See "Homa Locking Strategy" in homa_impl.h for more info about diff --git a/homa_timer.c b/homa_timer.c index b1543145..023c42f7 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -123,23 +123,17 @@ void homa_timer_check_rpc(struct homa_rpc *rpc) void homa_timer(struct homa *homa) { struct homa_socktab_scan scan; + struct homa_sock *hsk; + struct homa_rpc *rpc; + int rpc_count = 0; #ifndef __STRIP__ /* See strip.py */ static u64 prev_grant_count; int total_incoming_rpcs = 0; int sum_incoming_rec = 0; -#endif /* See strip.py */ - struct homa_sock *hsk; -#ifndef __STRIP__ /* See strip.py */ static int zero_count; -#endif /* See strip.py */ - struct homa_rpc *rpc; -#ifndef __STRIP__ /* See strip.py */ int sum_incoming = 0; - u64 total_grants; -#endif /* See strip.py */ int total_rpcs = 0; - int rpc_count = 0; -#ifndef __STRIP__ /* See strip.py */ + u64 total_grants; cycles_t start; cycles_t end; int core; @@ -203,7 +197,7 @@ void homa_timer(struct homa *homa) continue; rcu_read_lock(); list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { - total_rpcs++; + IF_NO_STRIP(total_rpcs++); homa_rpc_lock(rpc); if (rpc->state == RPC_IN_SERVICE) { rpc->silent_ticks = 0; diff --git a/homa_utils.c b/homa_utils.c index 55db0ea5..cf22c0d1 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -54,11 +54,8 @@ int homa_init(struct homa *homa) return err; } homa->socktab = kmalloc(sizeof(*homa->socktab), GFP_KERNEL); - if (!homa->socktab) { - pr_err("%s couldn't create socktab: kmalloc failure", - __func__); + if (!homa->socktab) return -ENOMEM; - } homa_socktab_init(homa->socktab); #ifndef __STRIP__ /* See strip.py */ err = homa_skb_init(homa); diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index e1b63b5d..1238aa35 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -66,8 +66,6 @@ TEST_F(homa_utils, homa_init__grant_alloc_failure) mock_kmalloc_errors = 1; unit_log_clear(); EXPECT_EQ(ENOMEM, -homa_init(&homa2)); - EXPECT_SUBSTR("homa_grant_alloc couldn't allocate grant structure", - mock_printk_output); EXPECT_EQ(NULL, homa2.grant); homa_destroy(&homa2); } @@ -83,8 +81,6 @@ TEST_F(homa_utils, homa_init__pacer_alloc_failure) #endif/* See strip.py */ unit_log_clear(); EXPECT_EQ(ENOMEM, -homa_init(&homa2)); - EXPECT_SUBSTR("homa_pacer_alloc couldn't allocate homa_pacer struct", - mock_printk_output); EXPECT_EQ(NULL, homa2.pacer); homa_destroy(&homa2); } @@ -99,8 +95,6 @@ TEST_F(homa_utils, homa_init__peertab_alloc_failure) #endif/* See strip.py */ unit_log_clear(); EXPECT_EQ(ENOMEM, -homa_init(&homa2)); - EXPECT_SUBSTR("homa_peer_alloc_peertab couldn't create peertab: kmalloc failure", - mock_printk_output); EXPECT_EQ(NULL, homa2.peertab); homa_destroy(&homa2); } @@ -115,8 +109,6 @@ TEST_F(homa_utils, homa_init__cant_allocate_port_map) #endif/* See strip.py */ unit_log_clear(); EXPECT_EQ(ENOMEM, -homa_init(&homa2)); - EXPECT_SUBSTR("homa_init couldn't create socktab: kmalloc failure", - mock_printk_output); EXPECT_EQ(NULL, homa2.socktab); homa_destroy(&homa2); } diff --git a/timetrace.c b/timetrace.c index 973906af..38446cd2 100644 --- a/timetrace.c +++ b/timetrace.c @@ -118,10 +118,8 @@ int tt_init(char *proc_file) struct tt_buffer *buffer; buffer = kmalloc(sizeof(*buffer), GFP_KERNEL); - if (!buffer) { - pr_err("%s couldn't allocate tt_buffers\n", __func__); + if (!buffer) goto error; - } memset(buffer, 0, sizeof(*buffer)); tt_buffers[i] = buffer; } From 737f180f120025828df4b5a6b95dfcde6a592697 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 13 Jun 2025 13:54:53 
-0700 Subject: [PATCH 367/625] Update notes.txt --- notes.txt | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/notes.txt b/notes.txt index 5de431b7..c984bf47 100755 --- a/notes.txt +++ b/notes.txt @@ -1,9 +1,12 @@ Notes for Homa implementation in Linux: --------------------------------------- -* Failure modes: - * homa_grant_add_rpc: list has a loop, or encounter a null list link - * stack corruption under homa_recvmsg after socket shutdown. +* Loose ends as of 6/13/2025: + * Analyze poor throughput under w4 with -b30 + * Reimplement FIFO grants + * Refactor RPC reference counts: acquire at top-level so no-one else + has to worry about them. + * Check out performance on c6525-25g. * Move interest cleanup code from homa_sock to a new function in homa_interest. Also move wakeup code from homa_rpc_handoff. From 06c89d09a4c172178652dc5ad05f0d2734988292 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 16 Jun 2025 11:14:48 -0700 Subject: [PATCH 368/625] Add rxlongterm analyzer to tthoma.py * Add new struct homa_rx_snapshot to homa_devel.c, along with functions homa_snapshot_rx and homa_rx_snapshot_log_tt * Add new metrics rx_msgs_started, rx_msgs_ended, rx_msg_bytes_started, rx_msg_bytes_retired. * Fix bug in 'MsgRate' metric in the activity analyzer for tthoma.py. --- homa_devel.c | 93 ++++++++++++ homa_devel.h | 23 +++ homa_incoming.c | 4 + homa_metrics.c | 8 ++ homa_metrics.h | 30 ++++ homa_plumbing.c | 2 + homa_rpc.c | 4 + homa_timer.c | 1 + util/tthoma.py | 373 +++++++++++++++++++++++++++++++++++++++++++++--- 9 files changed, 520 insertions(+), 18 deletions(-) diff --git a/homa_devel.c b/homa_devel.c index e5853962..6f85d584 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -29,6 +29,20 @@ static int drop_count; static u32 seed; #endif /* See strip.py */ +/* Used to record a history of rx state. */ +#define MAX_RX_SNAPSHOTS 1000 +static struct homa_rx_snapshot rx_snapshots[MAX_RX_SNAPSHOTS]; +static int next_snapshot; + +/* homa_clock() time when most recent rx snapshot was taken. */ +u64 snapshot_time; + +/* Interval between rx snapshots in ms. */ +#define RX_SNAPSHOT_INTERVAL 20 + +/* Interval between rx snapshots, in homa_clock() units. */ +u64 snapshot_interval; + /** * homa_print_ipv4_addr() - Convert an IPV4 address to the standard string * representation. @@ -914,3 +928,82 @@ int homa_drop_packet(struct homa *homa) } } #endif /* See strip.py */ + +/** + * homa_snapshot_rx() - This function is called by homa_timer; it collects + * data about the backlog of partially received incoming messages. + */ +void homa_snapshot_rx(void) +{ + struct homa_rx_snapshot *snap; + u64 now = homa_clock(); + int core; + + if (snapshot_interval == 0) + snapshot_interval = homa_clock_khz() * RX_SNAPSHOT_INTERVAL; + + if (now < snapshot_time + snapshot_interval) + return; + snapshot_time = now; + snap = &rx_snapshots[next_snapshot]; + snap->clock = now; + snap->msgs_started = 0; + snap->msgs_ended = 0; + snap->bytes_started = 0; + snap->bytes_retired = 0; + for (core = 0; core < nr_cpu_ids; core++) { + struct homa_metrics *m = &per_cpu(homa_metrics, core); + + snap->msgs_started += m->rx_msgs_started; + snap->msgs_ended += m->rx_msgs_ended; + snap->bytes_started += m->rx_msg_bytes_started; + snap->bytes_retired += m->rx_msg_bytes_retired; + } + next_snapshot++; + if (next_snapshot >= MAX_RX_SNAPSHOTS) + next_snapshot = 0; +} + +/** + * homa_rx_snapshot_log_tt() - Dump all of the snapshot data for incoming + * messages to the timetrace. 
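+ * Records are emitted oldest-first, in the format matched by the + * rx_snapshot1/rx_snapshot2 patterns in util/tthoma.py; the usecs + * values are negative offsets relative to the time of the dump.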
+ */ +void homa_rx_snapshot_log_tt(void) +{ + struct homa_rx_snapshot *snap; + u64 now = homa_clock(); + u64 mbase, bbase; + u64 usecs; + int i; + + i = next_snapshot; + + /* Adjust all the output values to start at 0, in order to avoid + * wraparound in 32-bit timetrace values. + */ + mbase = rx_snapshots[i].msgs_ended; + bbase = rx_snapshots[i].bytes_retired; + do { + snap = &rx_snapshots[i]; + + /* Compute how many microseconds before now this snapshot + * was taken. + */ + usecs = 1000*(now - snap->clock); + do_div(usecs, homa_clock_khz()); + + tt_record3("rx snapshot part 1, usecs %d, msgs_started %d, msgs_ended %d", + -usecs, snap->msgs_started - mbase, + snap->msgs_ended - mbase); + tt_record3("rx snapshot part 2, usecs %d, 4kbytes_started %d, 4kbytes_retired %d", + -usecs, (snap->bytes_started - bbase) >> 12, + (snap->bytes_retired - bbase) >> 12); + tt_record2("rx snapshot time: 0x%x%08x", snap->clock >> 32, + snap->clock & 0xffffffff); + + i++; + if (i >= MAX_RX_SNAPSHOTS) + i = 0; + } while (i != next_snapshot); + +} diff --git a/homa_devel.h b/homa_devel.h index 7bbd3ace..ee74752f 100644 --- a/homa_devel.h +++ b/homa_devel.h @@ -42,6 +42,27 @@ enum homa_freeze_type { NEED_ACK_MISSING_DATA = 5, }; +/** + * struct homa_rx_snapshot - Captures the state of incoming messages at a + * point in time. + */ +struct homa_rx_snapshot { + /** @clock: homa_clock() value when data was gathered. */ + u64 clock; + + /** @msgs_started: sum of all rx_msgs_started metrics. */ + u64 msgs_started; + + /** @msgs_ended: sum of all rx_msgs_ended metrics. */ + u64 msgs_ended; + + /** @bytes_started: sum of all rx_msg_bytes_started metrics. */ + u64 bytes_started; + + /** @bytes_retired: sum of all rx_msg_bytes_retired metrics. */ + u64 bytes_retired; +}; + /** * tt_addr() - Given an address, return a 4-byte id that will (hopefully) * provide a unique identifier for the address in a timetrace record. @@ -94,6 +115,8 @@ void homa_rpc_log(struct homa_rpc *rpc); void homa_rpc_log_active(struct homa *homa, uint64_t id); void homa_rpc_log_tt(struct homa_rpc *rpc); void homa_rpc_log_active_tt(struct homa *homa, int freeze_count); +void homa_rx_snapshot_log_tt(void); +void homa_snapshot_rx(void); int homa_snprintf(char *buffer, int size, int used, const char *format, ...) 
__printf(4, 5); char *homa_symbol_for_type(uint8_t type); diff --git a/homa_incoming.c b/homa_incoming.c index 9ac167bc..97aa74fe 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -66,6 +66,8 @@ int homa_message_in_init(struct homa_rpc *rpc, int length) INC_METRIC(large_msg_count, 1); INC_METRIC(large_msg_bytes, length); } + INC_METRIC(rx_msgs_started, 1); + INC_METRIC(rx_msg_bytes_started, length); #endif /* See strip.py */ return 0; } @@ -257,6 +259,8 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) #endif /* See strip.py */ __skb_queue_tail(&rpc->msgin.packets, skb); rpc->msgin.bytes_remaining -= length; + INC_METRIC(rx_msg_bytes_retired, length); + INC_METRIC(rx_msgs_ended, rpc->msgin.bytes_remaining == 0); } /** diff --git a/homa_metrics.c b/homa_metrics.c index 1645caa4..510ecae0 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -141,6 +141,14 @@ char *homa_metrics_print(void) m->large_msg_count, lower); M("large_msg_bytes %15llu Bytes in incoming messages >= %d bytes\n", m->large_msg_bytes, lower); + M("rx_msgs_started %15llu Messages for which at least one packet was received\n", + m->rx_msgs_started); + M("rx_msg_bytes_started %15llu Total bytes in new message starts\n", + m->rx_msg_bytes_started); + M("rx_msg_bytes_retired %15llu Incoming message bytes either received or aborted\n", + m->rx_msg_bytes_retired); + M("rx_msgs_ended %15llu Incoming messages completed or aborted\n", + m->rx_msgs_ended); M("sent_msg_bytes %15llu Total bytes in all outgoing messages\n", m->sent_msg_bytes); for (i = DATA; i <= MAX_OP; i++) { diff --git a/homa_metrics.h b/homa_metrics.h index 54090d63..d2b33ca3 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -49,6 +49,36 @@ struct homa_metrics { */ u64 large_msg_bytes; + /** + * @rx_msgs_started: incremented whenever the first packet is received + * for a new incoming message. + */ + u64 rx_msgs_started; + + /** + * @rx_msg_bytes_started: total number of incoming message bytes for + * which at least one packet of the message has been received + * (incremented by the length of the message when the first packet is + * received). + */ + u64 rx_msg_bytes_started; + + /** + * @rx_msg_bytes_retired: cumulative count of incoming message bytes + * that were either (a) successfully received (counts only goodput, + * not retransmits) or (b) abandoned because the message was deleted + * before they were received. Or, think of this as the bytes from + * @rx_msg_bytes_started that we're no longer waiting to receive. + */ + u64 rx_msg_bytes_retired; + + /** + * @rx_msgs_ended: incremented whenever an input message is "retired", + * either because it was completed or because it was destroyed before + * it completed. + */ + u64 rx_msgs_ended; + /** * @sent_msg_bytes: The total number of bytes in outbound * messages. 
diff --git a/homa_plumbing.c b/homa_plumbing.c index 071ff722..2bbfc802 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1390,6 +1390,7 @@ int homa_softirq(struct sk_buff *skb) if (unlikely(h->type == FREEZE)) { if (!atomic_read(&tt_frozen)) { homa_rpc_log_active_tt(homa_from_skb(skb), 0); + homa_rx_snapshot_log_tt(); tt_record4("Freezing because of request on port %d from 0x%x:%d, id %d", ntohs(h->dport), tt_addr(skb_canonical_ipv6_saddr(skb)), @@ -1653,6 +1654,7 @@ int homa_dointvec(const struct ctl_table *table, int write, tt_freeze(); } else if (homa->sysctl_action == 7) { homa_rpc_log_active_tt(homa, 0); + homa_rx_snapshot_log_tt(); tt_record("Freezing cluster because of action 7"); homa_freeze_peers(); tt_record("Finished freezing cluster"); diff --git a/homa_rpc.c b/homa_rpc.c index b3d0285a..e3551b19 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -598,6 +598,10 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) } tt_record2("homa_rpc_reap finished reaping id %d, socket %d", rpc->id, rpc->hsk->port); + INC_METRIC(rx_msg_bytes_retired, + rpc->msgin.bytes_remaining); + INC_METRIC(rx_msgs_ended, + rpc->msgin.bytes_remaining != 0); rpc->state = 0; kfree(rpc); } diff --git a/homa_timer.c b/homa_timer.c index 023c42f7..a266a55f 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -238,6 +238,7 @@ void homa_timer(struct homa *homa) homa_skb_release_pages(homa); homa_peer_gc(homa->peertab); #ifndef __STRIP__ /* See strip.py */ + homa_snapshot_rx(); end = homa_clock(); INC_METRIC(timer_cycles, end - start); #endif /* See strip.py */ diff --git a/util/tthoma.py b/util/tthoma.py index aa8402c6..b8727291 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -11,6 +11,7 @@ from collections import defaultdict from functools import cmp_to_key from glob import glob +from itertools import count from optparse import OptionParser import math from operator import itemgetter @@ -1597,6 +1598,34 @@ def __tcp_xmit(self, trace, time, core, match, interests): 'regexp': '__tcp_transmit_skb sent packet with ([0-9]+) bytes' }) + def __rx_snapshot1(self, trace, time, core, match, interests): + usecs = int(match.group(1)) + msgs_started = int(match.group(2)) + msgs_ended = int(match.group(3)) + for interest in interests: + interest.tt_rx_snapshot1(trace, time, core, usecs, msgs_started, + msgs_ended) + + patterns.append({ + 'name': 'rx_snapshot1', + 'regexp': 'rx snapshot part 1, usecs (-[0-9]+), msgs_started ([0-9]+), ' + 'msgs_ended ([0-9]+)' + }) + + def __rx_snapshot2(self, trace, time, core, match, interests): + usecs = int(match.group(1)) + bytes_started = int(match.group(2)) * 4096 + bytes_ended = int(match.group(3)) * 4096 + for interest in interests: + interest.tt_rx_snapshot2(trace, time, core, usecs, bytes_started, + bytes_ended) + + patterns.append({ + 'name': 'rx_snapshot2', + 'regexp': 'rx snapshot part 2, usecs (-[0-9]+), 4kbytes_started ' + '([-0-9]+), 4kbytes_retired ([-0-9]+)' + }) + #------------------------------------------------ # Analyzer: activity #------------------------------------------------ @@ -1626,9 +1655,17 @@ def analyze(self): # Node name -> list of events for input messages on that server. self.node_in_msgs = {} + # Node name -> count of new incoming messages that started during + # the trace. + self.node_in_starts = defaultdict(lambda: 0) + # Node name -> list of events for output messages on that server. self.node_out_msgs = {} + # Node name -> count of new outgoing messages that started during + # the trace. 
+ self.node_out_starts = defaultdict(lambda: 0) + # Node name -> dictionary that maps from core number to total GRO data # received by that core self.node_core_in_bytes = {} @@ -1668,11 +1705,19 @@ def analyze(self): self.node_in_msgs[node].append([in_start, 'start']) self.node_in_msgs[node].append([in_end, 'end']) + for time, offset, priority in rpc['gro_data']: + if offset == 0: + self.node_in_starts[node] += 1 + break + if 'tx_live' in rpc: out_start, out_end = rpc['tx_live'] self.node_out_msgs[node].append([out_start, 'start']) self.node_out_msgs[node].append([out_end, 'end']) + if 'sendmsg' in rpc: + self.node_out_starts[node] += 1 + sender_id = id^1 if sender_id in rpcs: sender = rpcs[sender_id] @@ -1734,21 +1779,12 @@ def sum_list(self, events): def output(self): global rpcs, traces - def print_list(node, events, num_bytes, extra): - global traces - msgs, liveFrac, avgLive = self.sum_list(events) - rate = msgs/(events[-1][0] - events[0][0]) - gbps = num_bytes*8e-3/(traces[node]['elapsed_time']) - print('%-10s %6d %7.3f %9.3f %8.2f %7.2f %7.2f%s' % ( - node, msgs, rate, liveFrac, avgLive, gbps, - gbps/liveFrac, extra)) - print('\n-------------------') print('Analyzer: activity') print('-------------------\n') print('Msgs: Total number of incoming/outgoing messages that were') print(' live at some point during the traces') - print('MsgRate: Rate at which new messages arrived (M/sec)') + print('MsgRate: Rate at which new messages were initiated (K/sec)') print('LiveFrac: Fraction of time when at least one message was live') print('AvgLive: Average number of live messages') print('Gbps: Total message throughput (Gbps)') @@ -1761,7 +1797,7 @@ def print_list(node, events, num_bytes, extra): print(' handled by a single GRO core\n') print('Incoming messages:') print('Node Msgs MsgRate LiveFrac AvgLive Gbps LiveGbps' - ' MaxCore MaxFrac MaxPFrac') + ' MaxCore MaxFrac MaxPFrac') print('-------------------------------------------------------------' '--------------------------------------') for node in get_sorted_nodes(): @@ -1793,14 +1829,20 @@ def print_list(node, events, num_bytes, extra): if rpcs > max_rpcs: max_rpcs = rpcs max_rpcs_core = core - # print('core_peers for %s: %s' % (node, core_peers)) - extra = ' %7.2f (C%02d) %4.3f (C%02d) %4.3f (C%02d)' % ( + + elapsed = traces[node]['elapsed_time'] + msgs, liveFrac, avgLive = self.sum_list(events) + rate = 1e3 * self.node_in_starts[node] / elapsed + gbps = total_bytes*8e-3 / elapsed + print('%-10s %6d %7.2f %9.3f %8.2f %7.2f %7.2f' % ( + node, msgs, rate, liveFrac, avgLive, gbps, + gbps/liveFrac), end='') + print(' %5.2f (C%02d) %6.3f (C%02d) %6.3f (C%02d)' % ( max_gbps, max_core, max_rpcs/total_rpcs if total_rpcs != 0 else 0, max_rpcs_core, max_pending/total_pending if total_pending != 0 else 0, - max_pending_core) - print_list(node, events, total_bytes, extra) + max_pending_core)) print('\nOutgoing messages:') print('Node Msgs MsgRate LiveFrac AvgLive Gbps LiveGbps') print('-------------------------------------------------------------') @@ -1808,12 +1850,19 @@ def print_list(node, events, num_bytes, extra): if not node in self.node_out_msgs: continue bytes = self.node_out_bytes[node] - print_list(node, sorted(self.node_out_msgs[node]), bytes, "") + elapsed = traces[node]['elapsed_time'] + events = sorted(self.node_out_msgs[node]) + msgs, liveFrac, avgLive = self.sum_list(events) + rate = 1e3 * self.node_out_starts[node] / elapsed + gbps = bytes*8e-3 / elapsed + print('%-10s %6d %7.2f %9.3f %8.2f %7.2f %7.2f' % ( + node, msgs, rate, 
liveFrac, avgLive, gbps,
+                    gbps/liveFrac))
 
         if options.data:
             for node in get_sorted_nodes():
                 f = open('%s/activity_%s.dat' % (options.data, node), 'w')
-                f.write('# Node: %s\n' % (name))
+                f.write('# Node: %s\n' % (node))
                 f.write('# Generated at %s.\n' %
                         (time.strftime('%I:%M %p on %m/%d/%Y')))
                 f.write('# Statistics about RPC and packet activity on the ')
@@ -1828,7 +1877,7 @@ def print_list(node, events, num_bytes, extra):
             f.write('# RxGts:    Number of incoming RPCS with outstanding grants at the\n')
             f.write('#           end of the interval (doesn\'t include unscheduled)\n')
             f.write('# RxGtKB:   Number of KB for which grants have been sent but data\n')
-            f.write('            not yet received at the end of the interval\n')
+            f.write('#           not yet received at the end of the interval\n')
             f.write('# RxPkts:   Number of data packets received during the interval\n')
             f.write('# RxGbps:   Throughput of received data during the interval\n')
             f.write('# Incoming: KB of data that had been transmitted but not yet\n')
@@ -6450,6 +6499,294 @@ def output(self):
             print('%8d %20s %10s %4s %9.3f %9.3f %7.1f' % (active, pkid,
                     node, core_id, gro_time, time, time - gro_time))
 
+#------------------------------------------------
+# Analyzer: rxlongterm
+#------------------------------------------------
+class AnalyzeRxlongterm:
+    """
+    Uses data recorded by homa_rx_snapshot_log_tt to analyze incoming RPC
+    traffic for each node over a much longer time period than covered by
+    the traces themselves. Provides information about backlog (incomplete
+    incoming RPCs) as well as arrival rates of new RPCs and service rates.
+    This analyzer will not work unless homa_rx_snapshot_log_tt was invoked
+    before reading the timetraces. If --data is specified then more detailed
+    node-specific files are generated in the data directory.
+    """
+
+    def __init__(self, dispatcher):
+        # Node name -> list of records for that node. Each record has
+        # the following fields:
+        # time:     Time when the record was generated.
+        # mstarts:  Value of the msgs_started field from the homa_rx_snapshot
+        # mends:    Value of the msgs_ended field from the homa_rx_snapshot
+        # bstarts:  Value of the msg_bytes_started field from the
+        #           homa_rx_snapshot
+        # bends:    Value of the msg_bytes_retired field from the
+        #           homa_rx_snapshot
+        #
+        self.node_records = defaultdict(list)
+
+        # A list with one entry for each interval of backlog data (not the
+        # same intervals as the global variable "intervals"). Each entry
+        # is a list with two values:
+        # time:     The time that the interval represents
+        # indexes:  A list with one entry for each element in
+        #           get_sorted_nodes, which is the index of the first
+        #           element in node_records whose time is at or after
+        #           time, or -1 if there is no such entry or if the
+        #           index would be zero (so there is no preceding entry)
+        self.intervals = []
+
+        # Elapsed time between elements of self.intervals
+        self.interval = None
+
+    def init_trace(self, trace):
+        # Time of the first snapshot record encountered for this node;
+        # serves as a reference point for time values in the records.
+        self.ref_time = None
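The two handlers below pair up the two-line snapshot records through their usecs offset: tt_rx_snapshot1 creates a record, and tt_rx_snapshot2 fills in the byte counts only when its offset maps to the same record time. A minimal standalone sketch of that pairing logic (hypothetical values and helper names; not part of tthoma.py):

    records = []
    ref_time = 50000.0            # time of first snapshot line in the trace

    def part1(usecs, mstarts, mends):
        # Part 1 always appends a new (still incomplete) record.
        records.append({'time': ref_time + usecs, 'mstarts': mstarts,
                        'mends': mends, 'usecs': usecs})

    def part2(usecs, bstarts, bends):
        # Part 2 completes the record only if its time offset matches.
        if records and records[-1]['time'] == ref_time + usecs:
            records[-1].update(bstarts=bstarts, bends=bends)

    part1(-2000, 10, 8)           # usecs offsets are negative (before "now")
    part2(-2000, 300 * 4096, 250 * 4096)
    assert 'bstarts' in records[-1]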
+    def tt_rx_snapshot1(self, trace, t, core, usecs, msg_starts, msg_ends):
+        if self.ref_time == None:
+            self.ref_time = t
+        records = self.node_records[trace['node']]
+        if (len(records) > 0 and not 'bstarts' in records[-1]):
+            # Previous record was incomplete, so just remove it.
+            print('Removing incomplete rx_snapshot record for node %s at '
+                    'usecs %d' % (trace['node'], usecs))
+            del records[-1]
+        records.append({'time': self.ref_time + usecs, 'mstarts': msg_starts,
+                'mends': msg_ends, 'usecs': usecs})
+
+    def tt_rx_snapshot2(self, trace, t, core, usecs, byte_starts, byte_ends):
+        if self.ref_time == None:
+            self.ref_time = t
+        record_time = self.ref_time + usecs
+        records = self.node_records[trace['node']]
+        if records:
+            record = records[-1]
+            if (record['time'] != record_time):
+                print('Ignoring rx_snapshot2 record for node %s at usecs %d '
+                        'because of time mismatch: expected %.2f, got %.2f' %
+                        (trace['node'], usecs, record_time, record['time']))
+            else:
+                record['bstarts'] = byte_starts
+                record['bends'] = byte_ends
+
+    def analyze(self):
+        """
+        Computes self.intervals: a list with one entry for each interval.
+        Each entry is a list with two values:
+        time:     The time that the interval represents
+        indexes:  A list with one entry for each element in
+                  get_sorted_nodes, which is the index of the first
+                  element in node_records whose time is at or after
+                  time, or -1 if there is no such entry or if the
+                  index would be zero (so there is no preceding entry)
+        """
+
+        nodes = get_sorted_nodes()
+        start = 1e20
+        end = -1e20
+        interval = None
+        for node in nodes:
+            records = self.node_records[node]
+            if records[0]['time'] < start:
+                start = records[0]['time']
+            if records[-1]['time'] > end:
+                end = records[-1]['time']
+
+            # Figure out the interval for records on this node (round to
+            # an integer that is all zeroes except the high-order digit)
+            tend = records[-1]['time']
+            tstart = records[0]['time']
+            node_interval = (tend - tstart) / (len(records) - 1)
+            node_interval = int(float('%.0g' % (node_interval)))
+            if interval == None:
+                interval = node_interval
+            elif interval != node_interval:
+                print('%s has a different interval for rx backlog records than %s (%d vs %d)' %
+                        (node, nodes[0], node_interval, interval),
+                        file=sys.stderr)
+
+        start = int(start) // interval * interval
+
+        # Each iteration of the following loop generates one list of indexes
+        # for the result.
+        next = [1] * len(nodes)
+        self.intervals = []
+        for t in count(start, interval):
+            if t > end:
+                break
+            indices = []
+            for i in range(0, len(nodes)):
+                records = self.node_records[nodes[i]]
+                if records[0]['time'] >= t or records[-1]['time'] < t:
+                    indices.append(-1)
+                    continue
+                while records[next[i]]['time'] < t:
+                    next[i] += 1
+                indices.append(next[i])
+                # print('Index %d for %s has interval %d, time %d, usecs %d' % (
+                #         next[i], nodes[i], t, records[next[i]]['time'],
+                #         records[next[i]]['usecs']))
+            self.intervals.append([t, indices])
+
+        self.interval = interval
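To make the index bookkeeping in analyze() concrete, here is a toy run under assumed inputs (one node, hypothetical record times; not part of tthoma.py) that mirrors the loop above:

    from itertools import count

    node_records = {'node1': [{'time': 100}, {'time': 200}, {'time': 300}]}
    start, end, interval = 100, 300, 100
    nxt = [1]                     # next candidate record index per node
    intervals = []
    for t in count(start, interval):
        if t > end:
            break
        recs = node_records['node1']
        if recs[0]['time'] >= t or recs[-1]['time'] < t:
            indices = [-1]        # no record precedes this interval time
        else:
            while recs[nxt[0]]['time'] < t:
                nxt[0] += 1
            indices = [nxt[0]]
        intervals.append([t, indices])

    assert intervals == [[100, [-1]], [200, [1]], [300, [2]]]

Each interval thus points at the first record at or after its time, and the preceding record is available at index - 1 for computing deltas.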
+    def output_node_data(self, node, node_index):
+        """
+        Generates a node-specific data file with details about that
+        particular node.
+        node:        Name of node for which to print data
+        node_index:  Index of info for this node in various arrays
+        """
+
+        f = open('%s/rxlongterm_%s.dat' % (options.data, node), 'w')
+        f.write('# Node: %s\n' % (node))
+        f.write('# Generated at %s.\n' %
+                (time.strftime('%I:%M %p on %m/%d/%Y')))
+        f.write('# Interval-based statistics about incoming RPCs on a single node:\n')
+        f.write('# Time:     Time in seconds. The actual interval for the '
+                'data spans this\n')
+        f.write('#           time and its length is approximately the same '
+                'as the time between\n')
+        f.write('#           consecutive lines, but its end time could be '
+                'anywhere from the\n')
+        f.write('#           given time up to the next time\n')
+        f.write('# MStart:   New incoming messages that started during '
+                'the interval\n')
+        f.write('# MStartR:  Rate at which new messages started (K/sec)\n')
+        f.write('# MEnd:     Messages for which the last byte was received\n')
+        f.write('# MEndR:    Rate at which messages ended (K/sec)\n')
+        f.write('# DStart:   Total data in new messages that started during '
+                'the interval (MB)\n')
+        f.write('# DStartR:  Rate corresponding to DStart (Gbps)\n')
+        f.write('# DRecv:    Data that was successfully received in the '
+                'interval (goodput, MB)\n')
+        f.write('# DRecvR:   Rate corresponding to DRecv (Gbps)\n')
+        f.write('\n')
+        f.write('#     Time MStart MStartR  MEnd  MEndR DStart DStartR  DRecv DRecvR\n')
+
+        records = self.node_records[node]
+        for interval in self.intervals:
+            t = interval[0]
+            rec_index = interval[1][node_index]
+            if rec_index < 0:
+                continue
+            cur = records[rec_index]
+            prev = records[rec_index - 1]
+            elapsed_secs = 1e-6 * (cur['time'] - prev['time'])
+            mstarts = cur['mstarts'] - prev['mstarts']
+            mends = cur['mends'] - prev['mends']
+            f.write('%10.3f %6d %6.2f %5d %6.2f' % (1e-06 * t,
+                    mstarts, 1e-3 * (mstarts / elapsed_secs),
+                    mends, 1e-3 * (mends / elapsed_secs),
+                    ))
+            bstarts = cur['bstarts'] - prev['bstarts']
+            bends = cur['bends'] - prev['bends']
+            f.write('  %6.2f %7.2f %6.2f %6.2f\n' % (
+                    1e-6 * bstarts, 8e-9 * (bstarts / elapsed_secs),
+                    1e-6 * bends, 8e-9 * (bends / elapsed_secs),
+                    ))
+
+    def output(self):
+        print('\n--------------------')
+        print('Analyzer: rxlongterm')
+        print('--------------------\n')
+
+        nodes = get_sorted_nodes()
+
+        print('Overall rates of incoming messages for each node:')
+        print('Secs:   Time period over which averages were computed (seconds)')
+        print('Mstart: Average rate at which incoming messages were '
+                'initiated (first')
+        print('        packet arrived, K/sec)')
+        print('Mend:   Average rate at which incoming messages were '
+                'completed (last byte')
+        print('        of data arrived, K/sec)')
+        print('Bstart: Average rate at which incoming messages were '
+                'initiated, weighted')
+        print('        by amount of data in the message (Gbps)')
+        print('Brecv:  Average rate at which data was successfully '
+                'received for incoming')
+        print('        messages (goodput only, Gbps)')
+        print('')
+        print('Node         Secs   Mstart     Mend   Bstart    Brecv')
+        print('-----------------------------------------------------')
+
+        sum_mstarts = 0
+        sum_mends = 0
+        sum_bstarts = 0
+        sum_bends = 0
+        sum_secs = 0
+        for node in nodes:
+            first = self.node_records[node][0]
+            last = self.node_records[node][-1]
+            records = self.node_records[node]
+            secs = 1e-6 * (last['time'] - first['time'])
+            if secs <= 0:
+                continue
+            sum_secs += secs
+            mstarts = last['mstarts'] - first['mstarts']
+            sum_mstarts += mstarts
+            mends = last['mends'] - first['mends']
+            sum_mends += mends
+            bstarts = last['bstarts'] - first['bstarts']
+            sum_bstarts += bstarts
+            bends = last['bends'] - first['bends']
+            sum_bends += bends
+            print('%-10s %6.2f %8.2f %8.2f %8.2f %8.2f' % (
+                    node, secs,
+                    1e-3 * mstarts / secs,
+                    1e-3 * mends / secs,
+                    8e-9 * bstarts / secs,
+                    8e-9 * bends / secs
+                    ))
+        if sum_secs != 0:
+            print('Average    %6.2f %8.2f %8.2f %8.2f %8.2f' % (
+                    sum_secs / len(nodes),
+                    1e-3 * sum_mstarts / sum_secs,
+                    1e-3 * sum_mends / sum_secs,
+                    8e-9 * sum_bstarts / sum_secs,
+                    8e-9 *
sum_bends / sum_secs + )) + + print('') + print('The number of active incoming RPCs on each node (those for ' + 'which at least') + print('one packet has been received, but some data is still ' + 'outstanding) as a') + print('function of time (in seconds).') + print('') + + print(' Time', end='') + for node in nodes: + print('%10s' % (node), end='') + print('') + print('-' * 10 * (len(nodes) + 1)) + for interval in self.intervals: + any_data = False + line = '%10.4f' % (interval[0] * 1e-6) + for i, index in zip(range(0, len(interval[1])), interval[1]): + if index == -1: + line += ' ' * 10 + else: + record = self.node_records[nodes[i]][index] + if record['time'] < interval[0] or ( + self.node_records[nodes[i]][index-1]['time'] >= + interval[0]): + print('Index %d for %s has out-of range time ' + '(time %d, interval time %.4f)' % ( + index, nodes[i], record['time'] * 1e-6, + interval[0]), file=sys.stderr) + line += ' %8d' % (record['mstarts'] - record['mends']) + any_data = True + if any_data: + print(line.rstrip()) + + if options.data: + for i in range(0, len(nodes)): + self.output_node_data(nodes[i], i) + #------------------------------------------------ # Analyzer: rxsnapshot #------------------------------------------------ From d0bd36b1b1ee0b971ccaf8f50c7da01acabaf988 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 2 Jul 2025 11:15:56 -0700 Subject: [PATCH 369/625] Eliminate use of "context" for lock annotations --- homa_grant.c | 4 ++-- homa_impl.h | 6 ------ homa_incoming.c | 22 +++++++++++----------- homa_outgoing.c | 12 ++++++------ homa_pacer.c | 4 ++-- homa_rpc.c | 12 ++++++------ homa_rpc.h | 6 +++--- homa_sock.c | 2 +- homa_sock.h | 8 ++++---- 9 files changed, 35 insertions(+), 41 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 04f719ae..261ee4ca 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -143,7 +143,7 @@ void homa_grant_free(struct homa_grant *grant) * @unsched: Number of unscheduled bytes in the incoming message for @rpc. */ void homa_grant_init_rpc(struct homa_rpc *rpc, int unsched) - __must_hold(rpc_bucket_lock) + __must_hold(rpc->bucket->lock) { rpc->msgin.rank = -1; if (rpc->msgin.num_bpages == 0) @@ -167,7 +167,7 @@ void homa_grant_init_rpc(struct homa_rpc *rpc, int unsched) * may release and then reacquire the lock. */ void homa_grant_end_rpc(struct homa_rpc *rpc) - __must_hold(rpc_bucket_lock) + __must_hold(rpc->bucket->lock) { struct homa_grant *grant = rpc->hsk->homa->grant; struct homa_grant_candidates cand; diff --git a/homa_impl.h b/homa_impl.h index 43434ad5..38190201 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -80,12 +80,6 @@ struct homa_sock; void homa_throttle_lock_slow(struct homa *homa); #endif /* See strip.py */ -#ifdef __CHECKER__ -#define __context__(x, y, z) __attribute__((context(x, y, z))) -#else -#define __context__(...) -#endif /* __CHECKER__ */ - /** * union sockaddr_in_union - Holds either an IPv4 or IPv6 address (smaller * and easier to use than sockaddr_storage). diff --git a/homa_incoming.c b/homa_incoming.c index 97aa74fe..d3a189e9 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -40,7 +40,7 @@ int homa_message_in_init(struct homa_rpc *rpc, int length, int unsched) */ int homa_message_in_init(struct homa_rpc *rpc, int length) #endif /* See strip.py */ - __must_hold(rpc_bucket_lock) + __must_hold(rpc->bucket->lock) { int err; @@ -102,7 +102,7 @@ struct homa_gap *homa_gap_alloc(struct list_head *next, int start, int end) * caller. 
*/ void homa_request_retrans(struct homa_rpc *rpc) - __must_hold(rpc_bucket_lock) + __must_hold(rpc->bucket->lock) { struct homa_resend_hdr resend; struct homa_gap *gap; @@ -155,7 +155,7 @@ void homa_request_retrans(struct homa_rpc *rpc) * (the packet will either be freed or added to rpc->msgin.packets). */ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) - __must_hold(rpc_bucket_lock) + __must_hold(rpc->bucket->lock) { struct homa_data_hdr *h = (struct homa_data_hdr *)skb->data; struct homa_gap *gap, *dummy, *gap2; @@ -275,7 +275,7 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) * if all available packets have been copied out. */ int homa_copy_to_user(struct homa_rpc *rpc) - __must_hold(rpc_bucket_lock) + __must_hold(rpc->bucket->lock) { #ifdef __UNIT_TEST__ #define MAX_SKBS 3 @@ -647,7 +647,7 @@ void homa_dispatch_pkts(struct sk_buff *skb) * Must be locked by the caller. */ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) - __must_hold(rpc_bucket_lock) + __must_hold(rpc->bucket->lock) { struct homa_data_hdr *h = (struct homa_data_hdr *)skb->data; #ifndef __STRIP__ /* See strip.py */ @@ -749,7 +749,7 @@ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) * Must be locked by caller. */ void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc) - __must_hold(rpc_bucket_lock) + __must_hold(rpc->bucket->lock) { struct homa_grant_hdr *h = (struct homa_grant_hdr *)skb->data; int new_offset = ntohl(h->offset); @@ -785,7 +785,7 @@ void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc) */ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, struct homa_sock *hsk) - __must_hold(rpc_bucket_lock) + __must_hold(rpc->bucket->lock) { struct homa_resend_hdr *h = (struct homa_resend_hdr *)skb->data; int offset = htonl(h->offset); @@ -875,7 +875,7 @@ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, * be locked by caller. */ void homa_rpc_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc) - __must_hold(rpc_bucket_lock) + __must_hold(rpc->bucket->lock) { tt_record3("Received unknown for id %llu, peer %x:%d", rpc->id, tt_addr(rpc->peer->addr), rpc->dport); @@ -961,7 +961,7 @@ void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk) */ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, struct homa_rpc *rpc) - __must_hold(rpc_bucket_lock) + __must_hold(rpc->bucket->lock) { struct homa_common_hdr *h = (struct homa_common_hdr *)skb->data; const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); @@ -1021,7 +1021,7 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, */ void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, struct homa_rpc *rpc) - __must_hold(rpc_bucket_lock) + __must_hold(rpc->bucket->lock) { const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); struct homa_ack_hdr *h = (struct homa_ack_hdr *)skb->data; @@ -1247,7 +1247,7 @@ struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking) * @rpc: RPC to handoff; must be locked. */ void homa_rpc_handoff(struct homa_rpc *rpc) - __must_hold(rpc_bucket_lock) + __must_hold(rpc->bucket->lock) { struct homa_sock *hsk = rpc->hsk; struct homa_interest *interest; diff --git a/homa_outgoing.c b/homa_outgoing.c index c2fc833b..ff1dbfc9 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -24,7 +24,7 @@ * @length: Number of bytes that will eventually be in rpc->msgout. 
*/ void homa_message_out_init(struct homa_rpc *rpc, int length) - __must_hold(rpc_bucket_lock) + __must_hold(rpc->bucket->lock) { memset(&rpc->msgout, 0, sizeof(rpc->msgout)); rpc->msgout.length = length; @@ -70,7 +70,7 @@ void homa_message_out_init(struct homa_rpc *rpc, int length) #endif /* See strip.py */ int homa_fill_data_interleaved(struct homa_rpc *rpc, struct sk_buff *skb, struct iov_iter *iter) - __must_hold(rpc_bucket_lock) + __must_hold(rpc->bucket->lock) { struct homa_skb_info *homa_info = homa_get_skb_info(skb); int seg_length = homa_info->seg_length; @@ -127,7 +127,7 @@ int homa_fill_data_interleaved(struct homa_rpc *rpc, struct sk_buff *skb, struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc, struct iov_iter *iter, int offset, int length, int max_seg_data) - __must_hold(rpc_bucket_lock) + __must_hold(rpc->bucket->lock) { struct homa_skb_info *homa_info; struct homa_data_hdr *h; @@ -246,7 +246,7 @@ struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc, * rpc->state will be RPC_DEAD. */ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) - __must_hold(rpc_bucket_lock) + __must_hold(rpc->bucket->lock) { /* Geometry information for packets: * mtu: largest size for an on-the-wire packet (including @@ -589,7 +589,7 @@ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk) * the NIC queue is sufficiently long. */ void homa_xmit_data(struct homa_rpc *rpc, bool force) - __must_hold(rpc_bucket_lock) + __must_hold(rpc->bucket->lock) { struct homa *homa = rpc->hsk->homa; #ifndef __STRIP__ /* See strip.py */ @@ -757,7 +757,7 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end, */ void homa_resend_data(struct homa_rpc *rpc, int start, int end) #endif /* See strip.py */ - __must_hold(rpc_bucket_lock) + __must_hold(rpc->bucket->lock) { struct homa_skb_info *homa_info; struct sk_buff *skb; diff --git a/homa_pacer.c b/homa_pacer.c index 67f75a21..5aa0abe7 100644 --- a/homa_pacer.c +++ b/homa_pacer.c @@ -328,7 +328,7 @@ void homa_pacer_xmit(struct homa_pacer *pacer) * sent because of NIC queue restrictions. Must be locked by caller. */ void homa_pacer_manage_rpc(struct homa_rpc *rpc) - __must_hold(rpc_bucket_lock) + __must_hold(rpc->bucket->lock) { struct homa_pacer *pacer = rpc->hsk->homa->pacer; struct homa_rpc *candidate; @@ -378,7 +378,7 @@ void homa_pacer_manage_rpc(struct homa_rpc *rpc) * @rpc: RPC of interest. */ void homa_pacer_unmanage_rpc(struct homa_rpc *rpc) - __must_hold(rpc_bucket_lock) + __must_hold(rpc->bucket->lock) { struct homa_pacer *pacer = rpc->hsk->homa->pacer; diff --git a/homa_rpc.c b/homa_rpc.c index e3551b19..f7201f0d 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -29,7 +29,7 @@ */ struct homa_rpc *homa_rpc_alloc_client(struct homa_sock *hsk, const union sockaddr_in_union *dest) - __acquires(rpc_bucket_lock) + __acquires(crpc->bucket->lock) { struct in6_addr dest_addr_as_ipv6 = canonical_ipv6_addr(dest); struct homa_rpc_bucket *bucket; @@ -114,7 +114,7 @@ struct homa_rpc *homa_rpc_alloc_client(struct homa_sock *hsk, struct homa_rpc *homa_rpc_alloc_server(struct homa_sock *hsk, const struct in6_addr *source, struct homa_data_hdr *h, int *created) - __acquires(rpc_bucket_lock) + __acquires(srpc->bucket->lock) { u64 id = homa_local_id(h->common.sender_id); struct homa_rpc_bucket *bucket; @@ -255,7 +255,7 @@ void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, * use the RPC except to unlock it. 
*/ void homa_rpc_end(struct homa_rpc *rpc) - __must_hold(rpc_bucket_lock) + __must_hold(rpc->bucket->lock) { /* The goal for this function is to make the RPC inaccessible, * so that no other code will ever access it again. However, don't @@ -333,7 +333,7 @@ void homa_rpc_end(struct homa_rpc *rpc) * we just free the RPC. */ void homa_rpc_abort(struct homa_rpc *rpc, int error) - __must_hold(rpc_bucket_lock) + __must_hold(rpc->bucket->lock) { if (!homa_is_client(rpc->id)) { INC_METRIC(server_rpc_discards, 1); @@ -667,7 +667,7 @@ void homa_abort_sock_rpcs(struct homa_sock *hsk, int error) * by invoking homa_rpc_unlock. */ struct homa_rpc *homa_rpc_find_client(struct homa_sock *hsk, u64 id) - __cond_acquires(rpc_bucket_lock) + __cond_acquires(crpc->bucket->lock) { struct homa_rpc_bucket *bucket = homa_client_rpc_bucket(hsk, id); struct homa_rpc *crpc; @@ -694,7 +694,7 @@ struct homa_rpc *homa_rpc_find_client(struct homa_sock *hsk, u64 id) */ struct homa_rpc *homa_rpc_find_server(struct homa_sock *hsk, const struct in6_addr *saddr, u64 id) - __cond_acquires(rpc_bucket_lock) + __cond_acquires(srpc->bucket->lock) { struct homa_rpc_bucket *bucket = homa_server_rpc_bucket(hsk, id); struct homa_rpc *srpc; diff --git a/homa_rpc.h b/homa_rpc.h index 1f95276f..6d2725e7 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -442,7 +442,7 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all); * @rpc: RPC to lock. */ static inline void homa_rpc_lock(struct homa_rpc *rpc) - __acquires(rpc_bucket_lock) + __acquires(rpc->bucket->lock) { homa_bucket_lock(rpc->bucket, rpc->id); } @@ -454,7 +454,7 @@ static inline void homa_rpc_lock(struct homa_rpc *rpc) * currently owned by someone else. */ static inline int homa_rpc_try_lock(struct homa_rpc *rpc) - __cond_acquires(rpc_bucket_lock) + __cond_acquires(rpc->bucket->lock) { if (!spin_trylock_bh(&rpc->bucket->lock)) return 0; @@ -466,7 +466,7 @@ static inline int homa_rpc_try_lock(struct homa_rpc *rpc) * @rpc: RPC to unlock. */ static inline void homa_rpc_unlock(struct homa_rpc *rpc) - __releases(rpc_bucket_lock) + __releases(rpc->bucket->lock) { homa_bucket_unlock(rpc->bucket, rpc->id); } diff --git a/homa_sock.c b/homa_sock.c index 29e4c3e1..8da0129a 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -465,7 +465,7 @@ void homa_sock_lock_slow(struct homa_sock *hsk) * Used only for metrics. */ void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, u64 id) - __acquires(rpc_bucket_lock) + __acquires(bucket->lock) { u64 start = homa_clock(); diff --git a/homa_sock.h b/homa_sock.h index 2e4439f7..1b342055 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -91,7 +91,7 @@ struct homa_rpc_bucket { * See "Homa Locking Strategy" in homa_impl.h for more info about * locking. */ - spinlock_t lock __context__(rpc_bucket_lock, 1, 1); + spinlock_t lock; /** * @id: identifier for this bucket, used in error messages etc. @@ -384,7 +384,7 @@ static inline struct homa_rpc_bucket * Used only for metrics. */ static inline void homa_bucket_lock(struct homa_rpc_bucket *bucket, u64 id) - __acquires(rpc_bucket_lock) + __acquires(bucket->lock) { if (!spin_trylock_bh(&bucket->lock)) homa_bucket_lock_slow(bucket, id); @@ -397,7 +397,7 @@ static inline void homa_bucket_lock(struct homa_rpc_bucket *bucket, u64 id) * Used only for metrics. 
*/ static inline void homa_bucket_lock(struct homa_rpc_bucket *bucket, u64 id) - __acquires(rpc_bucket_lock) + __acquires(bucket->lock) { spin_lock_bh(&bucket->lock); } @@ -409,7 +409,7 @@ static inline void homa_bucket_lock(struct homa_rpc_bucket *bucket, u64 id) * @id: ID of the RPC that was using the lock. */ static inline void homa_bucket_unlock(struct homa_rpc_bucket *bucket, u64 id) - __releases(rpc_bucket_lock) + __releases(bucket->lock) { spin_unlock_bh(&bucket->lock); } From 3aa0eec7ee413d511e5b58a51bc597f484c5aadf Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 2 Jul 2025 11:54:20 -0700 Subject: [PATCH 370/625] Fix checkpatch issues --- Makefile | 2 +- homa_grant.c | 4 ++-- homa_skb.c | 2 +- timetrace.h | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 795b6cfa..b4ad9a14 100644 --- a/Makefile +++ b/Makefile @@ -57,7 +57,7 @@ kdoc: $(LINUX_SRC_DIR)/scripts/kernel-doc -none $(CHECK_SRCS) checkpatch: - $(LINUX_SRC_DIR)/scripts/checkpatch.pl --file --strict $(CHECK_SRCS) + $(LINUX_SRC_DIR)/scripts/checkpatch.pl --file --strict --codespell $(CHECK_SRCS) # Copy stripped source files to a Linux source tree HOMA_TARGET ?= $(LINUX_SRC_DIR)/net/homa diff --git a/homa_grant.c b/homa_grant.c index 261ee4ca..d3f9fbb3 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -755,8 +755,8 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) homa_grant_unmanage_rpc(rpc, &cand); /* Sending a grant is slow, so release the RPC lock while - * sending the grant to reduce contention. - */ + * sending the grant to reduce contention. + */ homa_rpc_hold(rpc); homa_rpc_unlock(rpc); homa_grant_send(rpc, priority); diff --git a/homa_skb.c b/homa_skb.c index 9953b985..c37be6da 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -66,7 +66,7 @@ int homa_skb_init(struct homa *homa) /** * homa_skb_cleanup() - Invoked when a struct homa is deleted; cleans * up information related to skb allocation. - * @homa: Overall inforamtion about the Homa transport. + * @homa: Overall information about the Homa transport. */ void homa_skb_cleanup(struct homa *homa) { diff --git a/timetrace.h b/timetrace.h index 3856b195..23a76561 100644 --- a/timetrace.h +++ b/timetrace.h @@ -52,7 +52,7 @@ struct tt_event { #define TT_BUF_SIZE BIT(TT_BUF_SIZE_EXP) /* Represents a sequence of events, typically consisting of all those - * generated by one thread. Has a fixed capacity, so slots are re-used + * generated by one thread. Has a fixed capacity, so slots are reused * on a circular basis. This class is not thread-safe. */ struct tt_buffer { From 102d7a044f2f51d1a73f96062c275b2ac91b5327 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 2 Jul 2025 13:24:45 -0700 Subject: [PATCH 371/625] Fix issues found by sparse --- homa_incoming.c | 5 +++-- homa_peer.c | 2 +- homa_plumbing.c | 4 ++-- homa_rpc.c | 4 ++-- homa_sock.c | 2 +- 5 files changed, 9 insertions(+), 8 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index d3a189e9..fa8d9471 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -788,8 +788,8 @@ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, __must_hold(rpc->bucket->lock) { struct homa_resend_hdr *h = (struct homa_resend_hdr *)skb->data; - int offset = htonl(h->offset); - int length = htonl(h->length); + int offset = ntohl(h->offset); + int length = ntohl(h->length); int end = offset + length; struct homa_busy_hdr busy; @@ -1146,6 +1146,7 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) * is returned it will be locked and the caller must unlock. 
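 * (The __cond_acquires(rpc->bucket->lock) annotation added below makes
 * this conditional locking contract checkable by sparse.)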
*/ struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking) + __cond_acquires(rpc->bucket->lock) { IF_NO_STRIP(int avail_immediately = 1); struct homa_interest interest; diff --git a/homa_peer.c b/homa_peer.c index d2feb336..b8f6f6dc 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -19,7 +19,7 @@ #define rhashtable_walk_next mock_rht_walk_next #endif /* __UNIT_TEST__ */ -const struct rhashtable_params ht_params = { +static const struct rhashtable_params ht_params = { .key_len = sizeof(struct homa_peer_key), .key_offset = offsetof(struct homa_peer, ht_key), .head_offset = offsetof(struct homa_peer, ht_linkage), diff --git a/homa_plumbing.c b/homa_plumbing.c index 2bbfc802..619d1a8f 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -39,7 +39,7 @@ static struct homa homa_data; * variable is used only by a few functions called from Linux where there * is no struct homa* available. */ -struct homa *global_homa = &homa_data; +static struct homa *global_homa = &homa_data; /* This structure defines functions that handle various operations on * Homa sockets. These functions are relatively generic: they are called @@ -735,7 +735,7 @@ void homa_close(struct sock *sk, long timeout) struct homa_sock *hsk = homa_sk(sk); #ifndef __UPSTREAM__ /* See strip.py */ int port = hsk->port; -#endif/* See strip.py */ +#endif /* See strip.py */ homa_sock_shutdown(hsk); sk_common_release(sk); diff --git a/homa_rpc.c b/homa_rpc.c index f7201f0d..f4f4622d 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -29,7 +29,7 @@ */ struct homa_rpc *homa_rpc_alloc_client(struct homa_sock *hsk, const union sockaddr_in_union *dest) - __acquires(crpc->bucket->lock) + __cond_acquires(crpc->bucket->lock) { struct in6_addr dest_addr_as_ipv6 = canonical_ipv6_addr(dest); struct homa_rpc_bucket *bucket; @@ -114,7 +114,7 @@ struct homa_rpc *homa_rpc_alloc_client(struct homa_sock *hsk, struct homa_rpc *homa_rpc_alloc_server(struct homa_sock *hsk, const struct in6_addr *source, struct homa_data_hdr *h, int *created) - __acquires(srpc->bucket->lock) + __cond_acquires(srpc->bucket->lock) { u64 id = homa_local_id(h->common.sender_id); struct homa_rpc_bucket *bucket; diff --git a/homa_sock.c b/homa_sock.c index 8da0129a..63e19954 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -319,7 +319,7 @@ void homa_sock_shutdown(struct homa_sock *hsk) * homa_sock_destroy() - Release all of the internal resources associated * with a socket; is invoked at time when that is safe (i.e., all references * on the socket have been dropped). - * @hsk: Socket to destroy. + * @sk: Socket to destroy. 
*/ void homa_sock_destroy(struct sock *sk) { From f4dac32940aead749386e18bbddc2dd127946930 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 3 Jul 2025 08:51:35 -0700 Subject: [PATCH 372/625] Update rsync-exclude.txt --- rsync-exclude.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rsync-exclude.txt b/rsync-exclude.txt index 19f6df50..8a8bf776 100644 --- a/rsync-exclude.txt +++ b/rsync-exclude.txt @@ -4,10 +4,9 @@ nbproject private cloudlab +patches reports *traces* -bytedance -mle __pycache__ *.data *.pyc From 564b52562f21c536ced8163b91c57c631a849056 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 3 Jul 2025 08:52:13 -0700 Subject: [PATCH 373/625] Don't copy strip-decl.py to net-next --- Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Makefile b/Makefile index b4ad9a14..9a67a1e4 100644 --- a/Makefile +++ b/Makefile @@ -73,8 +73,7 @@ CP_HDRS := homa_impl.h \ CP_SRCS := $(patsubst %.o,%.c,$(filter-out homa_devel.o homa_grant.o \ homa_metrics.o homa_offload.o homa_skb.o timetrace.o, $(HOMA_OBJS))) CP_EXTRAS := Kconfig \ - Makefile \ - strip_decl.py + Makefile CP_TARGETS := $(patsubst %,$(HOMA_TARGET)/%,$(CP_HDRS) $(CP_SRCS) $(CP_EXTRAS)) net-next: $(CP_TARGETS) $(LINUX_SRC_DIR)/include/uapi/linux/homa.h $(HOMA_TARGET)/%: % util/strip.py From 21995ff4b9c8dde37462f2060f4624c053783ad0 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 3 Jul 2025 08:52:38 -0700 Subject: [PATCH 374/625] Add padding to struct homa_recvmsg_args to compile for 32 bits --- homa.h | 5 ++++- homa_plumbing.c | 2 +- test/unit_homa_plumbing.c | 6 ++++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/homa.h b/homa.h index c33d1163..d7603488 100644 --- a/homa.h +++ b/homa.h @@ -70,7 +70,7 @@ struct homa_sendmsg_args { */ __u32 flags; - /** @reserved: Not currently used. */ + /** @reserved: Not currently used, must be 0. */ __u32 reserved; }; @@ -105,6 +105,9 @@ struct homa_recvmsg_args { */ __u32 num_bpages; + /** @reserved: Not currently used, must be 0. */ + __u32 reserved; + /** * @bpage_offsets: (in/out) Each entry is an offset into the buffer * region for the socket pool. 
When returned from recvmsg, the diff --git a/homa_plumbing.c b/homa_plumbing.c index 619d1a8f..09923283 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1164,7 +1164,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, tt_record2("homa_recvmsg starting, port %d, pid %d", hsk->port, current->pid); - if (control.num_bpages > HOMA_MAX_BPAGES) { + if (control.num_bpages > HOMA_MAX_BPAGES || control.reserved != 0) { result = -EINVAL; goto done; } diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 97e58579..3ff08a12 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -691,6 +691,12 @@ TEST_F(homa_plumbing, homa_recvmsg__num_bpages_too_large) EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, 0, 0, &self->recvmsg_hdr.msg_namelen)); } +TEST_F(homa_plumbing, homa_recvmsg__reserved_not_zero) +{ + self->recvmsg_args.reserved = 1; + EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, + 0, 0, &self->recvmsg_hdr.msg_namelen)); +} TEST_F(homa_plumbing, homa_recvmsg__no_buffer_pool) { struct homa_pool *saved_pool = self->hsk.buffer_pool; From e18d6baa717513558992df21490cbefbe4d02700 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 3 Jul 2025 08:53:33 -0700 Subject: [PATCH 375/625] Fix checkpatch issues --- homa_interest.c | 3 ++- homa_pacer.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/homa_interest.c b/homa_interest.c index 69fb00b8..9e1591cd 100644 --- a/homa_interest.c +++ b/homa_interest.c @@ -12,7 +12,8 @@ #endif /* See strip.py */ /** - * homa_interest_init_shared() - Initialize an interest and queue it up on a socket. + * homa_interest_init_shared() - Initialize an interest and queue it up on + * a socket. * @interest: Interest to initialize * @hsk: Socket on which the interests should be queued. Must be locked * by caller. diff --git a/homa_pacer.c b/homa_pacer.c index 5aa0abe7..291c8a02 100644 --- a/homa_pacer.c +++ b/homa_pacer.c @@ -247,7 +247,8 @@ void homa_pacer_xmit(struct homa_pacer *pacer) return; while (1) { - queue_cycles = atomic64_read(&pacer->link_idle_time) - homa_clock(); + queue_cycles = atomic64_read(&pacer->link_idle_time) - + homa_clock(); if (queue_cycles >= pacer->max_nic_queue_cycles) break; if (list_empty(&pacer->throttled_rpcs)) From 0a26cf3c4ad48e43b86bb91243a2b7f348d008e8 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Sat, 5 Jul 2025 17:50:13 -0700 Subject: [PATCH 376/625] Refactor long-term metrics Add more metrics in order to separate requests from responses and client RPCs from server RPCs. --- homa_devel.c | 173 ++++++++++++++++++++------ homa_devel.h | 45 ++++--- homa_incoming.c | 24 +++- homa_metrics.c | 254 +++++++++++++++++++++----------------- homa_metrics.h | 113 ++++++++++++++--- homa_outgoing.c | 31 +++-- homa_plumbing.c | 10 +- homa_rpc.c | 33 ++++- homa_timer.c | 2 +- test/unit_homa_incoming.c | 66 +++++++++- test/unit_homa_outgoing.c | 40 ++++++ test/unit_homa_plumbing.c | 22 ++++ test/unit_homa_rpc.c | 85 +++++++++++++ util/metrics.py | 30 ++--- 14 files changed, 698 insertions(+), 230 deletions(-) diff --git a/homa_devel.c b/homa_devel.c index 6f85d584..a9d37cfa 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -31,7 +31,7 @@ static u32 seed; /* Used to record a history of rx state. 
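 * Snapshots are kept in a fixed-size ring: next_snapshot wraps back to
 * zero when it reaches MAX_RX_SNAPSHOTS, so only the most recent
 * snapshots are retained.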
*/ #define MAX_RX_SNAPSHOTS 1000 -static struct homa_rx_snapshot rx_snapshots[MAX_RX_SNAPSHOTS]; +static struct homa_rpc_snapshot rpc_snapshots[MAX_RX_SNAPSHOTS]; static int next_snapshot; /* homa_clock() time when most recent rx snapshot was taken. */ @@ -930,61 +930,97 @@ int homa_drop_packet(struct homa *homa) #endif /* See strip.py */ /** - * homa_snapshot_rx() - This function is called by homa_timer; it collects - * data about the backlog of partially received incoming messages. + * homa_snapshot_get_stats() - Fill in a homa_rpc_snapshot with the latest + * statistics. + * @snap: Structure to fill in. */ -void homa_snapshot_rx(void) +void homa_snapshot_get_stats(struct homa_rpc_snapshot *snap) { - struct homa_rx_snapshot *snap; - u64 now = homa_clock(); int core; + memset(snap, 0, sizeof(*snap)); + snap->clock = homa_clock(); + for (core = 0; core < nr_cpu_ids; core++) { + struct homa_metrics *m = &per_cpu(homa_metrics, core); + + snap->client_requests_started += m->client_requests_started; + snap->client_request_bytes_started += + m->client_request_bytes_started; + snap->client_request_bytes_done += m->client_request_bytes_done; + snap->client_requests_done += m->client_requests_done; + + snap->client_responses_started += m->client_responses_started; + snap->client_response_bytes_started += + m->client_response_bytes_started; + snap->client_response_bytes_done += + m->client_response_bytes_done; + snap->client_responses_done += m->client_responses_done; + + snap->server_requests_started += m->server_requests_started; + snap->server_request_bytes_started += + m->server_request_bytes_started; + snap->server_request_bytes_done += m->server_request_bytes_done; + snap->server_requests_done += m->server_requests_done; + + snap->server_responses_started += m->server_responses_started; + snap->server_response_bytes_started += + m->server_response_bytes_started; + snap->server_response_bytes_done += + m->server_response_bytes_done; + snap->server_responses_done += m->server_responses_done; + } +} + +/** + * homa_snapshot_rpcs() - This function is called by homa_timer; it collects + * data about overall progress of client and server RPCs. + */ +void homa_snapshot_rpcs(void) +{ + struct homa_rpc_snapshot *snap; + u64 now = homa_clock(); + if (snapshot_interval == 0) snapshot_interval = homa_clock_khz() * RX_SNAPSHOT_INTERVAL; if (now < snapshot_time + snapshot_interval) return; snapshot_time = now; - snap = &rx_snapshots[next_snapshot]; - snap->clock = now; - snap->msgs_started = 0; - snap->msgs_ended = 0; - snap->bytes_started = 0; - snap->bytes_retired = 0; - for (core = 0; core < nr_cpu_ids; core++) { - struct homa_metrics *m = &per_cpu(homa_metrics, core); - - snap->msgs_started += m->rx_msgs_started; - snap->msgs_ended += m->rx_msgs_ended; - snap->bytes_started += m->rx_msg_bytes_started; - snap->bytes_retired += m->rx_msg_bytes_retired; - } + snap = &rpc_snapshots[next_snapshot]; + homa_snapshot_get_stats(snap); next_snapshot++; if (next_snapshot >= MAX_RX_SNAPSHOTS) next_snapshot = 0; } /** - * homa_rx_snapshot_log_tt() - Dump all of the snapshot data for incoming - * messages to the timetrace. + * homa_rpc_snapshot_log_tt() - Dump all of the RPC snapshot data to the + * timetrace. 
*/ -void homa_rx_snapshot_log_tt(void) +void homa_rpc_snapshot_log_tt(void) { - struct homa_rx_snapshot *snap; + struct homa_rpc_snapshot *snap; u64 now = homa_clock(); - u64 mbase, bbase; + u64 creq_base, creq_bbase, cresp_base, cresp_bbase; + u64 sreq_base, sreq_bbase, sresp_base, sresp_bbase; u64 usecs; int i; i = next_snapshot; - /* Adjust all the output values to start at 0, in order to avoid + /* Offset all the output values to start at 0, in order to avoid * wraparound in 32-bit timetrace values. */ - mbase = rx_snapshots[i].msgs_ended; - bbase = rx_snapshots[i].bytes_retired; + creq_base = rpc_snapshots[i].client_requests_done; + creq_bbase = rpc_snapshots[i].client_request_bytes_done; + cresp_base = rpc_snapshots[i].client_responses_done; + cresp_bbase = rpc_snapshots[i].client_response_bytes_done; + sreq_base = rpc_snapshots[i].server_requests_done; + sreq_bbase = rpc_snapshots[i].server_request_bytes_done; + sresp_base = rpc_snapshots[i].server_responses_done; + sresp_bbase = rpc_snapshots[i].server_response_bytes_done; do { - snap = &rx_snapshots[i]; + snap = &rpc_snapshots[i]; /* Compute how many microseconds before now this snapshot * was taken. @@ -992,18 +1028,79 @@ void homa_rx_snapshot_log_tt(void) usecs = 1000*(now - snap->clock); do_div(usecs, homa_clock_khz()); - tt_record3("rx snapshot part 1, usecs %d, msgs_started %d, msgs_ended %d", - -usecs, snap->msgs_started - mbase, - snap->msgs_ended - mbase); - tt_record3("rx snapshot part 2, usecs %d, 4kbytes_started %d, 4kbytes_retired %d", - -usecs, (snap->bytes_started - bbase) >> 12, - (snap->bytes_retired - bbase) >> 12); - tt_record2("rx snapshot time: 0x%x%08x", snap->clock >> 32, - snap->clock & 0xffffffff); + tt_record1("rpc snapshot usecs %d", -usecs); + tt_record4("rpc snapshot client requests started %d, kbytes_started %d, kbytes_done %d, done %d", + snap->client_requests_started - creq_base, + (snap->client_request_bytes_started - + creq_bbase) >> 10, + (snap->client_request_bytes_done - + creq_bbase) >> 10, + snap->client_requests_done - creq_base); + tt_record4("rpc snapshot client responses started %d, kbytes_started %d, kbytes_done %d, done %d", + snap->client_responses_started - cresp_base, + (snap->client_response_bytes_started - + cresp_bbase) >> 10, + (snap->client_response_bytes_done - + cresp_bbase) >> 10, + snap->client_responses_done - cresp_base); + tt_record4("rpc snapshot server requests started %d, kbytes_started %d, kbytes_done %d, done %d", + snap->server_requests_started - sreq_base, + (snap->server_request_bytes_started - + sreq_bbase) >> 10, + (snap->server_request_bytes_done - + sreq_bbase) >> 10, + snap->server_requests_done - sreq_base); + tt_record4("rpc snapshot server responses started %d, kbytes_started %d, kbytes_done %d, done %d", + snap->server_responses_started - sresp_base, + (snap->server_response_bytes_started - + sresp_bbase) >> 10, + (snap->server_response_bytes_done - + sresp_bbase) >> 10, + snap->server_responses_done - sresp_base); i++; if (i >= MAX_RX_SNAPSHOTS) i = 0; } while (i != next_snapshot); - +} +/** + * homa_rpc_stats_log() - Print statistics on RPC progress to the system log. 
+ */ +void homa_rpc_stats_log(void) +{ + struct homa_rpc_snapshot snap; + + homa_snapshot_get_stats(&snap); + pr_notice("Client requests: started %llu, done %llu, delta %llu\n", + snap.client_requests_started, snap.client_requests_done, + snap.client_requests_started - snap.client_requests_done); + pr_notice("Client request bytes: started %llu, bytes_done %llu, delta %llu\n", + snap.client_request_bytes_started, + snap.client_request_bytes_done, + snap.client_request_bytes_started - + snap.client_request_bytes_done); + pr_notice("Client responses: started %llu, done %llu, delta %llu\n", + snap.client_responses_started, snap.client_responses_done, + snap.client_responses_started - snap.client_responses_done); + pr_notice("Client response bytes: started %llu, bytes_done %llu, delta %llu\n", + snap.client_response_bytes_started, + snap.client_response_bytes_done, + snap.client_response_bytes_started - + snap.client_response_bytes_done); + pr_notice("Server requests: started %llu, done %llu, delta %llu\n", + snap.server_requests_started, snap.server_requests_done, + snap.server_requests_started - snap.server_requests_done); + pr_notice("Server request bytes: started %llu, bytes_done %llu, delta %llu\n", + snap.server_request_bytes_started, + snap.server_request_bytes_done, + snap.server_request_bytes_started - + snap.server_request_bytes_done); + pr_notice("Server responses: started %llu, done %llu, delta %llu\n", + snap.server_responses_started, snap.server_responses_done, + snap.server_responses_started - snap.server_responses_done); + pr_notice("Server response bytes: started %llu, bytes_done %llu, delta %llu\n", + snap.server_response_bytes_started, + snap.server_response_bytes_done, + snap.server_response_bytes_started - + snap.server_response_bytes_done); } diff --git a/homa_devel.h b/homa_devel.h index ee74752f..40f4e65c 100644 --- a/homa_devel.h +++ b/homa_devel.h @@ -43,24 +43,35 @@ enum homa_freeze_type { }; /** - * struct homa_rx_state - Captures the state of incoming messages at a - * point in time. + * struct homa_rpc_snapshot - Captures the state of RPCs (both client and + * server) on a node at a given point in time. */ -struct homa_rx_snapshot { +struct homa_rpc_snapshot { /** @clock: homa_clock() value when data was gathered. */ u64 clock; - /** @msgs_started: sum of all rx_msgs_started metrics. */ - u64 msgs_started; - - /** @msgs_ended: sum of all rx_msgs_ended metrics. */ - u64 msgs_ended; - - /** @bytes_started: sum of all rx_msg_bytes_started metrics. */ - u64 bytes_started; - - /** @bytes_retired: sum of all rx_msg_bytes_retired metrics. */ - u64 bytes_retired; + /* Each value below is the sum (across all cores) of the metric with + * the same name. 
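+ * They are filled in by homa_snapshot_get_stats(), which sums the
+ * corresponding per-core homa_metrics counters.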
+ */ + u64 client_requests_started; + u64 client_request_bytes_started; + u64 client_request_bytes_done; + u64 client_requests_done; + + u64 client_responses_started; + u64 client_response_bytes_started; + u64 client_response_bytes_done; + u64 client_responses_done; + + u64 server_requests_started; + u64 server_request_bytes_started; + u64 server_request_bytes_done; + u64 server_requests_done; + + u64 server_responses_started; + u64 server_response_bytes_started; + u64 server_response_bytes_done; + u64 server_responses_done; }; /** @@ -115,8 +126,10 @@ void homa_rpc_log(struct homa_rpc *rpc); void homa_rpc_log_active(struct homa *homa, uint64_t id); void homa_rpc_log_tt(struct homa_rpc *rpc); void homa_rpc_log_active_tt(struct homa *homa, int freeze_count); -void homa_rx_snapshot_log_tt(void); -void homa_snapshot_rx(void); +void homa_rpc_snapshot_log_tt(void); +void homa_rpc_stats_log(void); +void homa_snapshot_get_stats(struct homa_rpc_snapshot *snap); +void homa_snapshot_rpcs(void); int homa_snprintf(char *buffer, int size, int used, const char *format, ...) __printf(4, 5); char *homa_symbol_for_type(uint8_t type); diff --git a/homa_incoming.c b/homa_incoming.c index fa8d9471..80ea5d6f 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -66,8 +66,13 @@ int homa_message_in_init(struct homa_rpc *rpc, int length) INC_METRIC(large_msg_count, 1); INC_METRIC(large_msg_bytes, length); } - INC_METRIC(rx_msgs_started, 1); - INC_METRIC(rx_msg_bytes_started, length); + if (homa_is_client(rpc->id)) { + INC_METRIC(client_responses_started, 1); + INC_METRIC(client_response_bytes_started, length); + } else { + INC_METRIC(server_requests_started, 1); + INC_METRIC(server_request_bytes_started, length); + } #endif /* See strip.py */ return 0; } @@ -253,14 +258,21 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb) return; keep: + __skb_queue_tail(&rpc->msgin.packets, skb); + rpc->msgin.bytes_remaining -= length; #ifndef __STRIP__ /* See strip.py */ if (h->retransmit) INC_METRIC(resent_packets_used, 1); + if (homa_is_client(rpc->id)) { + INC_METRIC(client_response_bytes_done, length); + INC_METRIC(client_responses_done, + rpc->msgin.bytes_remaining == 0); + } else { + INC_METRIC(server_request_bytes_done, length); + INC_METRIC(server_requests_done, + rpc->msgin.bytes_remaining == 0); + } #endif /* See strip.py */ - __skb_queue_tail(&rpc->msgin.packets, skb); - rpc->msgin.bytes_remaining -= length; - INC_METRIC(rx_msg_bytes_retired, length); - INC_METRIC(rx_msgs_ended, rpc->msgin.bytes_remaining == 0); } /** diff --git a/homa_metrics.c b/homa_metrics.c index 510ecae0..0af43c89 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -114,104 +114,128 @@ char *homa_metrics_print(void) homa_mout.length = 0; #define M(...) 
homa_metric_append(__VA_ARGS__) - M("time_cycles %20llu homa_clock() time when metrics were gathered\n", + M("time_cycles %20llu homa_clock() time when metrics were gathered\n", homa_clock()); - M("cpu_khz %15llu Clock rate in khz\n", + M("cpu_khz %15llu Clock rate in khz\n", homa_clock_khz()); for (core = 0; core < nr_cpu_ids; core++) { struct homa_metrics *m = &per_cpu(homa_metrics, core); s64 delta; - M("core %15d Core id for following metrics\n", + M("core %15d Core id for following metrics\n", core); for (i = 0; i < HOMA_NUM_SMALL_COUNTS; i++) { - M("msg_bytes_%-9d %15llu Bytes in incoming messages containing %d-%d bytes\n", + M("msg_bytes_%-9d %15llu Bytes in incoming messages containing %d-%d bytes\n", (i + 1) * 64, m->small_msg_bytes[i], lower, (i + 1) * 64); lower = (i + 1) * 64 + 1; } for (i = (HOMA_NUM_SMALL_COUNTS * 64) / 1024; i < HOMA_NUM_MEDIUM_COUNTS; i++) { - M("msg_bytes_%-9d %15llu Bytes in incoming messages containing %d-%d bytes\n", + M("msg_bytes_%-9d %15llu Bytes in incoming messages containing %d-%d bytes\n", (i + 1) * 1024, m->medium_msg_bytes[i], lower, (i + 1) * 1024); lower = (i + 1) * 1024 + 1; } - M("large_msg_count %15llu # of incoming messages >= %d bytes\n", + M("large_msg_count %15llu # of incoming messages >= %d bytes\n", m->large_msg_count, lower); - M("large_msg_bytes %15llu Bytes in incoming messages >= %d bytes\n", + M("large_msg_bytes %15llu Bytes in incoming messages >= %d bytes\n", m->large_msg_bytes, lower); - M("rx_msgs_started %15llu Messages for which at least one packet was received\n", - m->rx_msgs_started); - M("rx_msg_bytes_started %15llu Total bytes in new message starts\n", - m->rx_msg_bytes_started); - M("rx_msg_bytes_retired %15llu Incoming message bytes either received or aborted\n", - m->rx_msg_bytes_retired); - M("rx_msgs_ended %15llu Incoming messages completed or aborted\n", - m->rx_msgs_ended); - M("sent_msg_bytes %15llu Total bytes in all outgoing messages\n", + M("client_requests_started %15llu Client RPCs initiated\n", + m->client_requests_started); + M("client_request_bytes_started %15llu Request bytes in all initiated client RPCs\n", + m->client_request_bytes_started); + M("client_request_bytes_done %15llu Transmitted request bytes in all client RPCs\n", + m->client_request_bytes_done); + M("client_requests_done %15llu Client RPC requests fully transmitted\n", + m->client_requests_done); + M("client_responses_started %15llu Client RPCs for which at least one response pkt recvd\n", + m->client_responses_started); + M("client_response_bytes_started %15llu Response bytes in all RPCS in client_responses_started\n", + m->client_response_bytes_started); + M("client_response_bytes_done %15llu Response bytes received for all client RPCs\n", + m->client_response_bytes_done); + M("client_responses_done %15llu Client RPC responses fully received\n", + m->client_responses_done); + M("server_requests_started %15llu Server RPCs for which at least one request pkt rcvd\n", + m->server_requests_started); + M("server_request_bytes_started %15llu Request bytes in all RPCS in server_requests_started\n", + m->server_request_bytes_started); + M("server_request_bytes_done %15llu Request bytes received for all server RPCs\n", + m->server_request_bytes_done); + M("server_requests_done %15llu Server RPC requests fully received\n", + m->server_requests_done); + M("server_responses_started %15llu Server RPCs for which response was initiated\n", + m->server_responses_started); + M("server_response_bytes_started %15llu Message bytes in all initiated 
server responses\n", + m->server_response_bytes_started); + M("server_response_bytes_done %15llu Transmitted response bytes in all server RPCs\n", + m->server_response_bytes_done); + M("server_responses_done %15llu Server RPC responses fully transmitted\n", + m->server_responses_done); + M("sent_msg_bytes %15llu Total bytes in all outgoing messages\n", m->sent_msg_bytes); for (i = DATA; i <= MAX_OP; i++) { char *symbol = homa_symbol_for_type(i); - M("packets_sent_%-7s %15llu %s packets sent\n", + M("packets_sent_%-12s %15llu %s packets sent\n", symbol, m->packets_sent[i - DATA], symbol); } for (i = DATA; i <= MAX_OP; i++) { char *symbol = homa_symbol_for_type(i); - M("packets_rcvd_%-7s %15llu %s packets received\n", + M("packets_rcvd_%-12s %15llu %s packets received\n", symbol, m->packets_received[i - DATA], symbol); } for (i = 0; i < HOMA_MAX_PRIORITIES; i++) { - M("priority%d_bytes %15llu Bytes sent at priority %d (including headers)\n", + M("priority%d_bytes %15llu Bytes sent at priority %d (including headers)\n", i, m->priority_bytes[i], i); } for (i = 0; i < HOMA_MAX_PRIORITIES; i++) { - M("priority%d_packets %15llu Packets sent at priority %d\n", + M("priority%d_packets %15llu Packets sent at priority %d\n", i, m->priority_packets[i], i); } - M("skb_allocs %15llu sk_buffs allocated\n", + M("skb_allocs %15llu sk_buffs allocated\n", m->skb_allocs); - M("skb_alloc_cycles %15llu Time spent allocating sk_buffs\n", + M("skb_alloc_cycles %15llu Time spent allocating sk_buffs\n", m->skb_alloc_cycles); - M("skb_frees %15llu Data sk_buffs freed in normal paths\n", + M("skb_frees %15llu Data sk_buffs freed in normal paths\n", m->skb_frees); - M("skb_free_cycles %15llu Time spent freeing data sk_buffs\n", + M("skb_free_cycles %15llu Time spent freeing data sk_buffs\n", m->skb_free_cycles); - M("skb_page_allocs %15llu Pages allocated for sk_buff frags\n", + M("skb_page_allocs %15llu Pages allocated for sk_buff frags\n", m->skb_page_allocs); - M("skb_page_alloc_cycles %15llu Time spent allocating pages for sk_buff frags\n", + M("skb_page_alloc_cycles %15llu Time spent allocating pages for sk_buff frags\n", m->skb_page_alloc_cycles); - M("requests_received %15llu Incoming request messages\n", + M("requests_received %15llu Incoming request messages\n", m->requests_received); - M("responses_received %15llu Incoming response messages\n", + M("responses_received %15llu Incoming response messages\n", m->responses_received); - M("wait_none %15llu Messages received without blocking or polling\n", + M("wait_none %15llu Messages received without blocking or polling\n", m->wait_none); - M("wait_fast %15llu Messages received while polling\n", + M("wait_fast %15llu Messages received while polling\n", m->wait_fast); - M("wait_block %15llu Messages received after thread went to sleep\n", + M("wait_block %15llu Messages received after thread went to sleep\n", m->wait_block); - M("handoffs_thread_waiting %15llu RPC handoffs to waiting threads (vs. queue)\n", + M("handoffs_thread_waiting %15llu RPC handoffs to waiting threads (vs. queue)\n", m->handoffs_thread_waiting); - M("handoffs_alt_thread %15llu RPC handoffs not to first on list (avoid busy core)\n", + M("handoffs_alt_thread %15llu RPC handoffs not to first on list (avoid busy core)\n", m->handoffs_alt_thread); - M("poll_cycles %15llu Time spent polling for incoming messages\n", + M("poll_cycles %15llu Time spent polling for incoming messages\n", m->poll_cycles); - M("softirq_calls %15llu Calls to homa_softirq (i.e. 
# GRO pkts received)\n", + M("softirq_calls %15llu Calls to homa_softirq (i.e. # GRO pkts received)\n", m->softirq_calls); - M("softirq_cycles %15llu Time spent in homa_softirq during SoftIRQ\n", + M("softirq_cycles %15llu Time spent in homa_softirq during SoftIRQ\n", m->softirq_cycles); - M("bypass_softirq_cycles %15llu Time spent in homa_softirq during bypass from GRO\n", + M("bypass_softirq_cycles %15llu Time spent in homa_softirq during bypass from GRO\n", m->bypass_softirq_cycles); - M("linux_softirq_cycles %15llu Time spent in all Linux SoftIRQ\n", + M("linux_softirq_cycles %15llu Time spent in all Linux SoftIRQ\n", m->linux_softirq_cycles); - M("napi_cycles %15llu Time spent in NAPI-level packet handling\n", + M("napi_cycles %15llu Time spent in NAPI-level packet handling\n", m->napi_cycles); - M("send_cycles %15llu Time spent in homa_sendmsg for requests\n", + M("send_cycles %15llu Time spent in homa_sendmsg for requests\n", m->send_cycles); - M("send_calls %15llu Total invocations of homa_sendmsg for equests\n", + M("send_calls %15llu Total invocations of homa_sendmsg for equests\n", m->send_calls); // It is possible for us to get here at a time when a // thread has been blocked for a long time and has @@ -222,157 +246,157 @@ char *homa_metrics_print(void) delta = m->recv_cycles - m->blocked_cycles; if (delta < 0) delta = 0; - M("recv_cycles %15llu Unblocked time spent in recvmsg kernel call\n", + M("recv_cycles %15llu Unblocked time spent in recvmsg kernel call\n", delta); - M("recv_calls %15llu Total invocations of recvmsg kernel call\n", + M("recv_calls %15llu Total invocations of recvmsg kernel call\n", m->recv_calls); - M("blocked_cycles %15llu Time spent blocked in homa_recvmsg\n", + M("blocked_cycles %15llu Time spent blocked in homa_recvmsg\n", m->blocked_cycles); - M("reply_cycles %15llu Time spent in homa_sendmsg for responses\n", + M("reply_cycles %15llu Time spent in homa_sendmsg for responses\n", m->reply_cycles); - M("reply_calls %15llu Total invocations of homa_sendmsg for responses\n", + M("reply_calls %15llu Total invocations of homa_sendmsg for responses\n", m->reply_calls); - M("abort_cycles %15llu Time spent in homa_ioc_abort kernel call\n", + M("abort_cycles %15llu Time spent in homa_ioc_abort kernel call\n", m->reply_cycles); - M("abort_calls %15llu Total invocations of abort kernel call\n", + M("abort_calls %15llu Total invocations of abort kernel call\n", m->reply_calls); - M("so_set_buf_cycles %15llu Time spent in setsockopt SO_HOMA_RCVBUF\n", + M("so_set_buf_cycles %15llu Time spent in setsockopt SO_HOMA_RCVBUF\n", m->so_set_buf_cycles); - M("so_set_buf_calls %15llu Total invocations of setsockopt SO_HOMA_RCVBUF\n", + M("so_set_buf_calls %15llu Total invocations of setsockopt SO_HOMA_RCVBUF\n", m->so_set_buf_calls); - M("grant_lock_cycles %15llu Time spent with grant lock locked\n", + M("grant_lock_cycles %15llu Time spent with grant lock locked\n", m->grant_lock_cycles); - M("timer_cycles %15llu Time spent in homa_timer\n", + M("timer_cycles %15llu Time spent in homa_timer\n", m->timer_cycles); - M("timer_reap_cycles %15llu Time in homa_timer spent reaping RPCs\n", + M("timer_reap_cycles %15llu Time in homa_timer spent reaping RPCs\n", m->timer_reap_cycles); - M("data_pkt_reap_cycles %15llu Time in homa_data_pkt spent reaping RPCs\n", + M("data_pkt_reap_cycles %15llu Time in homa_data_pkt spent reaping RPCs\n", m->data_pkt_reap_cycles); - M("pacer_cycles %15llu Time spent in homa_pacer_main\n", + M("pacer_cycles %15llu Time spent in homa_pacer_main\n", 
m->pacer_cycles); - M("homa_cycles %15llu Total time in all Homa-related functions\n", + M("homa_cycles %15llu Total time in all Homa-related functions\n", m->softirq_cycles + m->napi_cycles + m->send_cycles + m->recv_cycles + m->reply_cycles - m->blocked_cycles + m->timer_cycles + m->pacer_cycles); - M("pacer_lost_cycles %15llu Lost transmission time because pacer was slow\n", + M("pacer_lost_cycles %15llu Lost transmission time because pacer was slow\n", m->pacer_lost_cycles); - M("pacer_bytes %15llu Bytes transmitted when the pacer was active\n", + M("pacer_bytes %15llu Bytes transmitted when the pacer was active\n", m->pacer_bytes); - M("pacer_skipped_rpcs %15llu Pacer aborts because of locked RPCs\n", + M("pacer_skipped_rpcs %15llu Pacer aborts because of locked RPCs\n", m->pacer_skipped_rpcs); - M("pacer_needed_help %15llu homa_pacer_xmit invocations from homa_check_pacer\n", + M("pacer_needed_help %15llu homa_pacer_xmit invocations from homa_check_pacer\n", m->pacer_needed_help); - M("throttled_cycles %15llu Time when the throttled queue was nonempty\n", + M("throttled_cycles %15llu Time when the throttled queue was nonempty\n", m->throttled_cycles); - M("resent_packets %15llu DATA packets sent in response to RESENDs\n", + M("resent_packets %15llu DATA packets sent in response to RESENDs\n", m->resent_packets); - M("peer_allocs %15llu New entries created in peer table\n", + M("peer_allocs %15llu New entries created in peer table\n", m->peer_allocs); - M("peer_kmalloc_errors %15llu kmalloc failures creating peer table entries\n", + M("peer_kmalloc_errors %15llu kmalloc failures creating peer table entries\n", m->peer_kmalloc_errors); - M("peer_route_errors %15llu Routing failures creating peer table entries\n", + M("peer_route_errors %15llu Routing failures creating peer table entries\n", m->peer_route_errors); - M("peer_dst_refreshes %15llu Obsolete dsts had to be regenerated\n", + M("peer_dst_refreshes %15llu Obsolete dsts had to be regenerated\n", m->peer_dst_refreshes); - M("control_xmit_errors %15llu Errors sending control packets\n", + M("control_xmit_errors %15llu Errors sending control packets\n", m->control_xmit_errors); - M("data_xmit_errors %15llu Errors sending data packets\n", + M("data_xmit_errors %15llu Errors sending data packets\n", m->data_xmit_errors); - M("unknown_rpcs %15llu Non-grant packets discarded because RPC unknown\n", + M("unknown_rpcs %15llu Non-grant packets discarded because RPC unknown\n", m->unknown_rpcs); - M("server_cant_create_rpcs %15llu Packets discarded because server couldn't create RPC\n", + M("server_cant_create_rpcs %15llu Packets discarded because server couldn't create RPC\n", m->server_cant_create_rpcs); - M("unknown_packet_types %15llu Packets discarded because of unsupported type\n", + M("unknown_packet_types %15llu Packets discarded because of unsupported type\n", m->unknown_packet_types); - M("short_packets %15llu Packets discarded because too short\n", + M("short_packets %15llu Packets discarded because too short\n", m->short_packets); - M("packet_discards %15llu Non-resent packets discarded because data already received\n", + M("packet_discards %15llu Non-resent packets discarded because data already received\n", m->packet_discards); - M("resent_discards %15llu Resent packets discarded because data already received\n", + M("resent_discards %15llu Resent packets discarded because data already received\n", m->resent_discards); - M("resent_packets_used %15llu Retransmitted packets that were actually used\n", + M("resent_packets_used 
%15llu Retransmitted packets that were actually used\n", m->resent_packets_used); - M("rpc_timeouts %15llu RPCs aborted because peer was nonresponsive\n", + M("rpc_timeouts %15llu RPCs aborted because peer was nonresponsive\n", m->rpc_timeouts); - M("server_rpc_discards %15llu RPCs discarded by server because of errors\n", + M("server_rpc_discards %15llu RPCs discarded by server because of errors\n", m->server_rpc_discards); - M("server_rpcs_unknown %15llu RPCs aborted by server because unknown to client\n", + M("server_rpcs_unknown %15llu RPCs aborted by server because unknown to client\n", m->server_rpcs_unknown); - M("client_lock_misses %15llu Bucket lock misses for client RPCs\n", + M("client_lock_misses %15llu Bucket lock misses for client RPCs\n", m->client_lock_misses); - M("client_lock_miss_cycles %15llu Time lost waiting for client bucket locks\n", + M("client_lock_miss_cycles %15llu Time lost waiting for client bucket locks\n", m->client_lock_miss_cycles); - M("server_lock_misses %15llu Bucket lock misses for server RPCs\n", + M("server_lock_misses %15llu Bucket lock misses for server RPCs\n", m->server_lock_misses); - M("server_lock_miss_cycles %15llu Time lost waiting for server bucket locks\n", + M("server_lock_miss_cycles %15llu Time lost waiting for server bucket locks\n", m->server_lock_miss_cycles); - M("socket_lock_misses %15llu Socket lock misses\n", + M("socket_lock_misses %15llu Socket lock misses\n", m->socket_lock_misses); - M("socket_lock_miss_cycles %15llu Time lost waiting for socket locks\n", + M("socket_lock_miss_cycles %15llu Time lost waiting for socket locks\n", m->socket_lock_miss_cycles); - M("throttle_lock_misses %15llu Throttle lock misses\n", + M("throttle_lock_misses %15llu Throttle lock misses\n", m->throttle_lock_misses); - M("throttle_lock_miss_cycles %15llu Time lost waiting for throttle locks\n", + M("throttle_lock_miss_cycles %15llu Time lost waiting for throttle locks\n", m->throttle_lock_miss_cycles); - M("peer_ack_lock_misses %15llu Misses on peer ack locks\n", + M("peer_ack_lock_misses %15llu Misses on peer ack locks\n", m->peer_ack_lock_misses); - M("peer_ack_lock_miss_cycles %15llu Time lost waiting for peer ack locks\n", + M("peer_ack_lock_miss_cycles %15llu Time lost waiting for peer ack locks\n", m->peer_ack_lock_miss_cycles); - M("grant_lock_misses %15llu Grant lock misses\n", + M("grant_lock_misses %15llu Grant lock misses\n", m->grant_lock_misses); - M("grant_lock_miss_cycles %15llu Time lost waiting for grant lock\n", + M("grant_lock_miss_cycles %15llu Time lost waiting for grant lock\n", m->grant_lock_miss_cycles); - M("grantable_rpcs_integral %15llu Integral of homa->num_grantable_rpcs*dt\n", + M("grantable_rpcs_integral %15llu Integral of homa->num_grantable_rpcs*dt\n", m->grantable_rpcs_integral); - M("grant_check_calls %15llu Number of calls to homa_grant_check_rpc\n", + M("grant_check_calls %15llu Number of calls to homa_grant_check_rpc\n", m->grant_check_calls); - M("grant_check_locked %15llu Number of calls to homa_grant_check_rpc that acquired grant lock\n", + M("grant_check_locked %15llu Number of calls to homa_grant_check_rpc that acquired grant lock\n", m->grant_check_locked); - M("grant_check_others %15llu Number of times homa_grant_check_rpc checked non-caller RPCs for grants\n", + M("grant_check_others %15llu Number of times homa_grant_check_rpc checked non-caller RPCs for grants\n", m->grant_check_others); - M("grant_check_recalcs %15llu Number of times homa_grant_check_rpc updated grant priority order\n", + 
M("grant_check_recalcs %15llu Number of times homa_grant_check_rpc updated grant priority order\n", m->grant_check_recalcs); - M("grant_priority_bumps %15llu Number of times an RPC moved up in the grant priority order\n", + M("grant_priority_bumps %15llu Number of times an RPC moved up in the grant priority order\n", m->grant_priority_bumps); - M("fifo_grants %15llu Grants issued using FIFO priority\n", + M("fifo_grants %15llu Grants issued using FIFO priority\n", m->fifo_grants); - M("fifo_grants_no_incoming %15llu FIFO grants to messages with no outstanding grants\n", + M("fifo_grants_no_incoming %15llu FIFO grants to messages with no outstanding grants\n", m->fifo_grants_no_incoming); - M("disabled_reaps %15llu Reaper invocations that were disabled\n", + M("disabled_reaps %15llu Reaper invocations that were disabled\n", m->disabled_reaps); - M("deferred_rpc_reaps %15llu RPCs skipped by reaper because still in use\n", + M("deferred_rpc_reaps %15llu RPCs skipped by reaper because still in use\n", m->deferred_rpc_reaps); - M("reaper_calls %15llu Reaper invocations that were not disabled\n", + M("reaper_calls %15llu Reaper invocations that were not disabled\n", m->reaper_calls); - M("reaper_dead_skbs %15llu Sum of hsk->dead_skbs across all reaper calls\n", + M("reaper_dead_skbs %15llu Sum of hsk->dead_skbs across all reaper calls\n", m->reaper_dead_skbs); - M("throttle_list_adds %15llu Calls to homa_add_to_throttled\n", + M("throttle_list_adds %15llu Calls to homa_add_to_throttled\n", m->throttle_list_adds); - M("throttle_list_checks %15llu List elements checked in homa_add_to_throttled\n", + M("throttle_list_checks %15llu List elements checked in homa_add_to_throttled\n", m->throttle_list_checks); - M("ack_overflows %15llu Explicit ACKs sent because peer->acks was full\n", + M("ack_overflows %15llu Explicit ACKs sent because peer->acks was full\n", m->ack_overflows); - M("ignored_need_acks %15llu NEED_ACKs ignored because RPC result not yet received\n", + M("ignored_need_acks %15llu NEED_ACKs ignored because RPC result not yet received\n", m->ignored_need_acks); - M("bpage_reuses %15llu Buffer page could be reused because ref count was zero\n", + M("bpage_reuses %15llu Buffer page could be reused because ref count was zero\n", m->bpage_reuses); - M("buffer_alloc_failures %15llu homa_pool_alloc_msg didn't find enough buffer space for an RPC\n", + M("buffer_alloc_failures %15llu homa_pool_alloc_msg didn't find enough buffer space for an RPC\n", m->buffer_alloc_failures); - M("linux_pkt_alloc_bytes %15llu Bytes allocated in new packets by NIC driver due to cache overflows\n", + M("linux_pkt_alloc_bytes %15llu Bytes allocated in new packets by NIC driver due to cache overflows\n", m->linux_pkt_alloc_bytes); - M("dropped_data_no_bufs %15llu Data bytes dropped because app buffers full\n", + M("dropped_data_no_bufs %15llu Data bytes dropped because app buffers full\n", m->dropped_data_no_bufs); - M("gen3_handoffs %15llu GRO->SoftIRQ handoffs made by Gen3 balancer\n", + M("gen3_handoffs %15llu GRO->SoftIRQ handoffs made by Gen3 balancer\n", m->gen3_handoffs); - M("gen3_alt_handoffs %15llu Gen3 handoffs to secondary core (primary was busy)\n", + M("gen3_alt_handoffs %15llu Gen3 handoffs to secondary core (primary was busy)\n", m->gen3_alt_handoffs); - M("gro_grant_bypasses %15llu Grant packets passed directly to homa_softirq by homa_gro_receive\n", + M("gro_grant_bypasses %15llu Grant packets passed directly to homa_softirq by homa_gro_receive\n", m->gro_grant_bypasses); - M("gro_data_bypasses 
%15llu Data packets passed directly to homa_softirq by homa_gro_receive\n",
+ M("gro_data_bypasses %15llu Data packets passed directly to homa_softirq by homa_gro_receive\n",
 m->gro_data_bypasses);
 for (i = 0; i < NUM_TEMP_METRICS; i++)
- M("temp%-2d %15llu Temporary use in testing\n",
+ M("temp%-2d %15llu Temporary use in testing\n",
 i, m->temp[i]);
 }
diff --git a/homa_metrics.h b/homa_metrics.h
index d2b33ca3..fc5f355f 100644
--- a/homa_metrics.h
+++ b/homa_metrics.h
@@ -50,34 +50,111 @@ struct homa_metrics {
 u64 large_msg_bytes;
 /**
- * @rx_msgs_started: incremented whenever the first packet is received
- * for a new incoming message.
+ * @client_requests_started: cumulative count of all client RPCs
+ * that have been initiated on this node.
 */
- u64 rx_msgs_started;
+ u64 client_requests_started;
 /**
- * @rx_msg_bytes_started: total number of incoming message bytes for
- * which at least one packet of the message has been received
- * (incremented by the length of the message when the first packet is
- * received).
+ * @client_request_bytes_started: total number of bytes in the
+ * request messages for all client RPCs that have been initiated on
+ * this node.
 */
- u64 rx_msg_bytes_started;
+ u64 client_request_bytes_started;
 /**
- * @rx_msg_bytes_retired: cumulative count of incoming message bytes
- * that were either (a) successfully received (counts only goodput,
- * not retransmits) or (b) abandoned because the message was deleted
- * before they were received. Or, think of this as the bytes from
- * @rx_msg_bytes_started that we're no longer waiting to receive.
+ * @client_request_bytes_done: total number of bytes in request
+ * messages that no longer need to be transmitted (for the first time)
+ * either because they were transmitted or because the RPC was aborted.
+ * Always <= client_request_bytes_started.
 */
- u64 rx_msg_bytes_retired;
+ u64 client_request_bytes_done;
 /**
- * @rx_msgs_ended: incremented whenever an input message is "retired",
- * either because it was completed or because it was destroyed before
- * it completed.
+ * @client_requests_done: cumulative count of all client RPCs
+ * whose request messages have been completely transmitted (or the RPC
+ * was aborted).
 */
- u64 rx_msgs_ended;
+ u64 client_requests_done;
+
+ /**
+ * @client_responses_started: cumulative count of all client RPCs
+ * for which at least one packet of the response has been received.
+ */
+ u64 client_responses_started;
+
+ /**
+ * @client_response_bytes_started: total number of bytes in
+ * response messages for client RPCs for which at least one byte
+ * of response has been received.
+ */
+ u64 client_response_bytes_started;
+
+ /**
+ * @client_response_bytes_done: cumulative count of bytes in
+ * @client_response_bytes_started that no longer need to be received
+ * (either they were received or the RPC was aborted).
+ */
+ u64 client_response_bytes_done;
+
+ /**
+ * @client_responses_done: cumulative count of all client RPCs
+ * that have been completed on this node (either successfully or
+ * with errors).
+ */
+ u64 client_responses_done;
+
+ /**
+ * @server_requests_started: cumulative count of all server RPCs
+ * for which at least one packet of the request has been received.
+ */
+ u64 server_requests_started;
+
+ /**
+ * @server_request_bytes_started: total number of bytes in the
+ * request messages for server RPCs counted by @server_requests_started.
+ */
+ u64 server_request_bytes_started;
+
+ /**
+ * @server_request_bytes_done: total number of bytes in
+ * @server_request_bytes_started that no longer need to be received
+ * (either they were received or the RPC was aborted).
+ */
+ u64 server_request_bytes_done;
+
+ /**
+ * @server_requests_done: cumulative count of all server RPCs
+ * whose request messages have been completely received (or the RPC
+ * was aborted).
+ */
+ u64 server_requests_done;
+
+ /**
+ * @server_responses_started: cumulative count of all server RPCs
+ * for which transmission of the response has begun.
+ */
+ u64 server_responses_started;
+
+ /**
+ * @server_response_bytes_started: total number of bytes in
+ * the messages counted by @server_responses_started.
+ */
+ u64 server_response_bytes_started;
+
+ /**
+ * @server_response_bytes_done: total number of bytes in
+ * @server_response_bytes_started that no longer need to be transmitted
+ * (either they were transmitted at least once or the RPC was aborted).
+ */
+ u64 server_response_bytes_done;
+
+ /**
+ * @server_responses_done: total number of server RPCs in
+ * @server_requests_started that are no longer active (either the
+ * response was completely sent or the RPC was aborted).
+ */
+ u64 server_responses_done;
 /**
 * @sent_msg_bytes: The total number of bytes in outbound
diff --git a/homa_outgoing.c b/homa_outgoing.c
index ff1dbfc9..27a0f62c 100644
--- a/homa_outgoing.c
+++ b/homa_outgoing.c
@@ -592,16 +592,13 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force)
 __must_hold(rpc->bucket->lock)
 {
 struct homa *homa = rpc->hsk->homa;
-#ifndef __STRIP__ /* See strip.py */
- struct netdev_queue *txq;
-#endif /* See strip.py */
+ IF_NO_STRIP(struct netdev_queue *txq);
+ int length;
 homa_rpc_hold(rpc);
 while (*rpc->msgout.next_xmit) {
-#ifndef __STRIP__ /* See strip.py */
- int priority;
-#endif /* See strip.py */
 struct sk_buff *skb = *rpc->msgout.next_xmit;
+ IF_NO_STRIP(int priority);
 #ifndef __STRIP__ /* See strip.py */
 if (rpc->msgout.next_xmit_offset >= rpc->msgout.granted) {
@@ -623,16 +620,28 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force)
 }
 #ifndef __STRIP__ /* See strip.py */
- if (rpc->msgout.next_xmit_offset < rpc->msgout.unscheduled) {
+ if (rpc->msgout.next_xmit_offset < rpc->msgout.unscheduled)
 priority = homa_unsched_priority(homa, rpc->peer,
 rpc->msgout.length);
- } else {
+ else
 priority = rpc->msgout.sched_priority;
- }
 #endif /* See strip.py */
 rpc->msgout.next_xmit = &(homa_get_skb_info(skb)->next_skb);
- rpc->msgout.next_xmit_offset +=
- homa_get_skb_info(skb)->data_bytes;
+ length = homa_get_skb_info(skb)->data_bytes;
+ rpc->msgout.next_xmit_offset += length;
+#ifndef __STRIP__ /* See strip.py */
+ if (homa_is_client(rpc->id)) {
+ INC_METRIC(client_request_bytes_done, length);
+ INC_METRIC(client_requests_done,
+ rpc->msgout.next_xmit_offset ==
+ rpc->msgout.length);
+ } else {
+ INC_METRIC(server_response_bytes_done, length);
+ INC_METRIC(server_responses_done,
+ rpc->msgout.next_xmit_offset ==
+ rpc->msgout.length);
+ }
+#endif /* See strip.py */
 homa_rpc_hold(rpc);
 homa_rpc_unlock(rpc);
diff --git a/homa_plumbing.c b/homa_plumbing.c
index 09923283..2212d1f1 100644
--- a/homa_plumbing.c
+++ b/homa_plumbing.c
@@ -1063,6 +1063,8 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length)
 finish = homa_clock();
 #endif /* See strip.py */
 INC_METRIC(send_cycles, finish - start);
+ INC_METRIC(client_requests_started, 1);
+ INC_METRIC(client_request_bytes_started, length);
 } else {
 /* This is a response message.
*/ struct in6_addr canonical_dest; @@ -1110,6 +1112,8 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) finish = homa_clock(); #endif /* See strip.py */ INC_METRIC(reply_cycles, finish - start); + INC_METRIC(server_responses_started, 1); + INC_METRIC(server_response_bytes_started, length); } tt_record1("homa_sendmsg finished, id %d", args.id); return 0; @@ -1390,7 +1394,7 @@ int homa_softirq(struct sk_buff *skb) if (unlikely(h->type == FREEZE)) { if (!atomic_read(&tt_frozen)) { homa_rpc_log_active_tt(homa_from_skb(skb), 0); - homa_rx_snapshot_log_tt(); + homa_rpc_snapshot_log_tt(); tt_record4("Freezing because of request on port %d from 0x%x:%d, id %d", ntohs(h->dport), tt_addr(skb_canonical_ipv6_saddr(skb)), @@ -1654,7 +1658,7 @@ int homa_dointvec(const struct ctl_table *table, int write, tt_freeze(); } else if (homa->sysctl_action == 7) { homa_rpc_log_active_tt(homa, 0); - homa_rx_snapshot_log_tt(); + homa_rpc_snapshot_log_tt(); tt_record("Freezing cluster because of action 7"); homa_freeze_peers(); tt_record("Finished freezing cluster"); @@ -1663,7 +1667,7 @@ int homa_dointvec(const struct ctl_table *table, int write, pr_notice("homa_total_incoming is %d\n", atomic_read(&homa->grant->total_incoming)); } else if (homa->sysctl_action == 9) { - tt_print_file("/users/ouster/node.tt"); + homa_rpc_stats_log(); } else { homa_rpc_log_active(homa, homa->sysctl_action); } diff --git a/homa_rpc.c b/homa_rpc.c index f4f4622d..8973f2a0 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -571,9 +571,10 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) homa_sock_unlock(hsk); homa_skb_free_many_tx(hsk->homa, skbs, num_skbs); for (i = 0; i < num_rpcs; i++) { + IF_NO_STRIP(int tx_left); rpc = rpcs[i]; - UNIT_LOG("; ", "reaped %llu", rpc->id); + UNIT_LOG("; ", "reaped %llu", rpc->id); if (unlikely(rpc->msgin.num_bpages)) homa_pool_release_buffers(rpc->hsk->buffer_pool, rpc->msgin.num_bpages, @@ -598,10 +599,32 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) } tt_record2("homa_rpc_reap finished reaping id %d, socket %d", rpc->id, rpc->hsk->port); - INC_METRIC(rx_msg_bytes_retired, - rpc->msgin.bytes_remaining); - INC_METRIC(rx_msgs_ended, - rpc->msgin.bytes_remaining != 0); +#ifndef __STRIP__ /* See strip.py */ + + tx_left = rpc->msgout.length - + rpc->msgout.next_xmit_offset; + if (homa_is_client(rpc->id)) { + INC_METRIC(client_response_bytes_done, + rpc->msgin.bytes_remaining); + INC_METRIC(client_responses_done, + rpc->msgin.bytes_remaining != 0); + if (tx_left > 0) { + INC_METRIC(client_request_bytes_done, + tx_left); + INC_METRIC(client_requests_done, 1); + } + } else { + INC_METRIC(server_request_bytes_done, + rpc->msgin.bytes_remaining); + INC_METRIC(server_requests_done, + rpc->msgin.bytes_remaining != 0); + if (tx_left > 0) { + INC_METRIC(server_response_bytes_done, + tx_left); + INC_METRIC(server_responses_done, 1); + } + } +#endif /* See strip.py */ rpc->state = 0; kfree(rpc); } diff --git a/homa_timer.c b/homa_timer.c index a266a55f..9ca6d906 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -238,7 +238,7 @@ void homa_timer(struct homa *homa) homa_skb_release_pages(homa); homa_peer_gc(homa->peertab); #ifndef __STRIP__ /* See strip.py */ - homa_snapshot_rx(); + homa_snapshot_rpcs(); end = homa_clock(); INC_METRIC(timer_cycles, end - start); #endif /* See strip.py */ diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 46ab4297..7f3b39e0 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -179,7 +179,7 @@ 
TEST_F(homa_incoming, homa_message_in_init__no_buffers_available) #endif /* See strip.py */ } #ifndef __STRIP__ /* See strip.py */ -TEST_F(homa_incoming, homa_message_in_init__update_metrics) +TEST_F(homa_incoming, homa_message_in_init__update_message_length_metrics) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, @@ -198,6 +198,27 @@ TEST_F(homa_incoming, homa_message_in_init__update_metrics) EXPECT_EQ(0, homa_metrics_per_cpu()->medium_msg_bytes[15]); EXPECT_EQ(1900000, homa_metrics_per_cpu()->large_msg_bytes); } +TEST_F(homa_incoming, homa_message_in_init__update_client_rpc_metrics) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, 98, 1000, 1000); + + EXPECT_EQ(0, homa_message_in_init(crpc, 5000, 1000)); + EXPECT_EQ(1, homa_metrics_per_cpu()->client_responses_started); + EXPECT_EQ(5000, homa_metrics_per_cpu()->client_response_bytes_started); +} +TEST_F(homa_incoming, homa_message_in_init__update_server_rpc_metrics) +{ + struct homa_rpc *srpc; + + srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, + self->server_ip, self->client_port, + self->server_id, 4000, 10000); + EXPECT_FALSE(srpc == NULL); + EXPECT_EQ(1, homa_metrics_per_cpu()->server_requests_started); + EXPECT_EQ(4000, homa_metrics_per_cpu()->server_request_bytes_started); +} #endif /* See strip.py */ TEST_F(homa_incoming, homa_request_retrans__request_gaps) @@ -642,7 +663,7 @@ TEST_F(homa_incoming, homa_add_packet__scan_multiple_gaps) EXPECT_STREQ("start 0, end 1400", unit_print_gaps(crpc)); } #ifndef __STRIP__ /* See strip.py */ -TEST_F(homa_incoming, homa_add_packet__metrics) +TEST_F(homa_incoming, homa_add_packet__discard_metrics) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, @@ -670,6 +691,47 @@ TEST_F(homa_incoming, homa_add_packet__metrics) EXPECT_EQ(1, skb_queue_len(&crpc->msgin.packets)); EXPECT_EQ(1, homa_metrics_per_cpu()->resent_packets_used); } +TEST_F(homa_incoming, homa_add_packet__client_rpc_metrics) +{ + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 1000, 2000); + + homa_message_in_init(crpc, 2000, 0); + + /* First packet doesn't complete message. */ + self->data.seg.offset = htonl(0); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 1400, 0)); + EXPECT_EQ(1400, homa_metrics_per_cpu()->client_response_bytes_done); + EXPECT_EQ(0, homa_metrics_per_cpu()->client_responses_done); + + /* Second packet completes message. */ + self->data.seg.offset = htonl(1400); + homa_add_packet(crpc, mock_skb_alloc(self->client_ip, + &self->data.common, 600, 0)); + EXPECT_EQ(2000, homa_metrics_per_cpu()->client_response_bytes_done); + EXPECT_EQ(1, homa_metrics_per_cpu()->client_responses_done); +} +TEST_F(homa_incoming, homa_add_packet__server_rpc_metrics) +{ + struct homa_rpc *srpc; + + srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, + self->server_ip, self->client_port, + self->server_id, 2000, 10000); + EXPECT_EQ(1400, homa_metrics_per_cpu()->server_request_bytes_done); + EXPECT_EQ(0, homa_metrics_per_cpu()->server_requests_done); + + /* Second packet completes message. 
*/ + self->data.seg.offset = htonl(1400); + homa_add_packet(srpc, mock_skb_alloc(self->server_ip, + &self->data.common, 600, 0)); + EXPECT_EQ(2000, homa_metrics_per_cpu()->server_request_bytes_done); + EXPECT_EQ(1, homa_metrics_per_cpu()->server_requests_done); +} #endif /* See strip.py */ TEST_F(homa_incoming, homa_copy_to_user__basics) diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 734ff55e..105de871 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -901,6 +901,46 @@ TEST_F(homa_outgoing, homa_xmit_data__throttle) unit_log_throttled(&self->homa); EXPECT_STREQ("request id 1234, next_offset 2800", unit_log_get()); } +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_outgoing, homa_xmit_data__metrics_for_client_rpc) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 6000, 1000); + + crpc->msgout.granted = 4000; + homa_rpc_lock(crpc); + homa_xmit_data(crpc, false); + EXPECT_EQ(4200, homa_metrics_per_cpu()->client_request_bytes_done); + EXPECT_EQ(0, homa_metrics_per_cpu()->client_requests_done); + + crpc->msgout.granted = 6000; + homa_xmit_data(crpc, false); + EXPECT_EQ(6000, homa_metrics_per_cpu()->client_request_bytes_done); + EXPECT_EQ(1, homa_metrics_per_cpu()->client_requests_done); + homa_rpc_unlock(crpc); +} +TEST_F(homa_outgoing, homa_xmit_data__metrics_for_server_rpc) +{ + struct homa_rpc *srpc; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->client_port, + self->server_id, 1000, 10000); + + srpc->msgout.granted = 4000; + homa_rpc_lock(srpc); + homa_xmit_data(srpc, false); + EXPECT_EQ(4200, homa_metrics_per_cpu()->server_response_bytes_done); + EXPECT_EQ(0, homa_metrics_per_cpu()->server_responses_done); + + srpc->msgout.granted = 9900; + homa_xmit_data(srpc, false); + EXPECT_EQ(10000, homa_metrics_per_cpu()->server_response_bytes_done); + EXPECT_EQ(1, homa_metrics_per_cpu()->server_responses_done); + homa_rpc_unlock(srpc); +} +#endif /* See strip.py */ TEST_F(homa_outgoing, homa_xmit_data__rpc_freed) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 3ff08a12..039da384 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -572,6 +572,15 @@ TEST_F(homa_plumbing, homa_sendmsg__request_sent_successfully) EXPECT_EQ(88888, crpc->completion_cookie); homa_rpc_unlock(crpc); } +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_plumbing, homa_sendmsg__request_metrics) +{ + EXPECT_EQ(0, -homa_sendmsg(&self->hsk.inet.sk, + &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_EQ(1, homa_metrics_per_cpu()->client_requests_started); + EXPECT_EQ(200, homa_metrics_per_cpu()->client_request_bytes_started); +} +#endif /* See strip.py */ TEST_F(homa_plumbing, homa_sendmsg__response_nonzero_completion_cookie) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, @@ -662,6 +671,19 @@ TEST_F(homa_plumbing, homa_sendmsg__response_succeeds) EXPECT_EQ(RPC_OUTGOING, srpc->state); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); } +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_plumbing, homa_sendmsg__response_metrics) +{ + unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 2000, 100); + self->sendmsg_args.id = self->server_id; + EXPECT_EQ(0, -homa_sendmsg(&self->hsk.inet.sk, + &self->sendmsg_hdr, 
self->sendmsg_hdr.msg_iter.count)); + EXPECT_EQ(1, homa_metrics_per_cpu()->server_responses_started); + EXPECT_EQ(200, homa_metrics_per_cpu()->server_response_bytes_started); +} +#endif /* See strip.py */ TEST_F(homa_plumbing, homa_recvmsg__wrong_args_length) { diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 3848efa0..fb29230f 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -765,6 +765,91 @@ TEST_F(homa_rpc, homa_rpc_reap__release_peer_ref) EXPECT_EQ(0, atomic_read(&peer->refs)); EXPECT_EQ(NULL, crpc->peer); } +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_rpc, homa_rpc_reap__metrics_for_client_response) +{ + struct homa_rpc *crpc, *crpc2; + + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, + self->server_ip, 4000, 98, 4000, 10000); + ASSERT_NE(NULL, crpc); + EXPECT_EQ(1400, homa_metrics_per_cpu()->client_response_bytes_done); + + homa_rpc_end(crpc); + homa_rpc_reap(&self->hsk, false); + EXPECT_EQ(10000, homa_metrics_per_cpu()->client_response_bytes_done); + EXPECT_EQ(1, homa_metrics_per_cpu()->client_responses_done); + + /* Second RPC has already completed, so no need to increment metrics. */ + homa_metrics_per_cpu()->client_response_bytes_done = 0; + homa_metrics_per_cpu()->client_responses_done = 0; + crpc2 = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, + self->server_ip, 4000, 98, 4000, 1400); + ASSERT_NE(NULL, crpc2); + EXPECT_EQ(1400, homa_metrics_per_cpu()->client_response_bytes_done); + EXPECT_EQ(1, homa_metrics_per_cpu()->client_responses_done); + + homa_rpc_end(crpc2); + homa_rpc_reap(&self->hsk, false); + EXPECT_EQ(1400, homa_metrics_per_cpu()->client_response_bytes_done); + EXPECT_EQ(1, homa_metrics_per_cpu()->client_responses_done); +} +TEST_F(homa_rpc, homa_rpc_reap__metrics_for_client_request) +{ + struct homa_rpc *crpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, 4000, 98, 4000, 10000); + ASSERT_NE(NULL, crpc); + crpc->msgout.granted = 1000; + homa_rpc_lock(crpc); + homa_xmit_data(crpc, false); + homa_rpc_unlock(crpc); + EXPECT_EQ(1400, homa_metrics_per_cpu()->client_request_bytes_done); + EXPECT_EQ(0, homa_metrics_per_cpu()->client_requests_done); + + homa_rpc_end(crpc); + homa_rpc_reap(&self->hsk, false); + EXPECT_EQ(4000, homa_metrics_per_cpu()->client_request_bytes_done); + EXPECT_EQ(1, homa_metrics_per_cpu()->client_requests_done); +} +TEST_F(homa_rpc, homa_rpc_reap__metrics_for_server_request) +{ + struct homa_rpc *srpc; + + srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, + self->server_ip, self->client_port, + self->server_id, 5000, 10000); + ASSERT_NE(NULL, srpc); + EXPECT_EQ(1400, homa_metrics_per_cpu()->server_request_bytes_done); + EXPECT_EQ(0, homa_metrics_per_cpu()->server_requests_done); + + homa_rpc_end(srpc); + homa_rpc_reap(&self->hsk, false); + EXPECT_EQ(5000, homa_metrics_per_cpu()->server_request_bytes_done); + EXPECT_EQ(1, homa_metrics_per_cpu()->server_requests_done); +} +TEST_F(homa_rpc, homa_rpc_reap__metrics_for_server_response) +{ + struct homa_rpc *srpc; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->client_port, + self->server_id, 5000, 10000); + ASSERT_NE(NULL, srpc); + srpc->msgout.granted = 1000; + homa_rpc_lock(srpc); + homa_xmit_data(srpc, false); + homa_rpc_unlock(srpc); + EXPECT_EQ(1400, homa_metrics_per_cpu()->server_response_bytes_done); + EXPECT_EQ(0, homa_metrics_per_cpu()->server_responses_done); + + homa_rpc_end(srpc); + 
homa_rpc_reap(&self->hsk, false); + EXPECT_EQ(10000, homa_metrics_per_cpu()->server_response_bytes_done); + EXPECT_EQ(1, homa_metrics_per_cpu()->server_responses_done); +} +#endif /* See strip.py */ TEST_F(homa_rpc, homa_rpc_reap__call_homa_sock_wakeup_wmem) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, diff --git a/util/metrics.py b/util/metrics.py index c35227e9..7299e66b 100755 --- a/util/metrics.py +++ b/util/metrics.py @@ -142,10 +142,10 @@ def scale_number(number): pad = pad.ljust(13) secs = "(%.1f s)" % (elapsed_secs) secs = secs.ljust(12) - print("%-28s %15d %s %s" % ("time_cycles", time_delta, secs, + print("%-30s %15d %s %s" % ("time_cycles", time_delta, secs, docs["time_cycles"])) else: - print("%-15s %28d %s%s" % ("time_cycles", cur[0]["time_cycles"], + print("%-17s %28d %s%s" % ("time_cycles", cur[0]["time_cycles"], "", docs["time_cycles"])) for symbol in symbols: @@ -164,7 +164,7 @@ def scale_number(number): if symbol.endswith("_cycles") and (time_delta != 0): percent = "(%.1f%%)" % (100.0*delta/time_delta) percent = percent.ljust(12) - print("%-28s %15d %s %s" % (symbol, delta, percent, doc)) + print("%-30s %15d %s %s" % (symbol, delta, percent, doc)) elif symbol.endswith("_queued") and (time_delta != 0): received = deltas[symbol[:-7] + "_received"] if received != 0: @@ -172,35 +172,35 @@ def scale_number(number): else: percent = " " percent = percent.ljust(12) - print("%-28s %15d %s %s" % (symbol, delta, percent, doc)) + print("%-30s %15d %s %s" % (symbol, delta, percent, doc)) else: - print("%-28s %15d %s%s" % (symbol, delta, rate_info, doc)) + print("%-30s %15d %s%s" % (symbol, delta, rate_info, doc)) if symbol.startswith("packets_rcvd_"): total_packets += delta if symbol == "softirq_calls": gro_packets = delta if (symbol == "reaper_dead_skbs") and ("reaper_calls" in deltas): - print("%-28s %6.1f %sAvg. hsk->dead_skbs in reaper" % ( + print("%-30s %6.1f %sAvg. hsk->dead_skbs in reaper" % ( "avg_dead_skbs", delta/deltas["reaper_calls"], pad)) if symbol.endswith("_miss_cycles") and (time_delta != 0): prefix = symbol[:-12] if ((prefix + "_misses") in deltas) and (deltas[prefix + "_misses"] != 0): ns = (delta/deltas[prefix + "_misses"])/(cpu_khz * 1e-06) - print("%-28s %6.1f %sAvg. wait time per %s miss (ns)" % ( + print("%-30s %6.1f %sAvg. 
wait time per %s miss (ns)" % ( prefix + "_miss_delay", ns, pad, prefix)) if (symbol == "large_msg_bytes") and (total_received_bytes != 0) \ and (time_delta != 0): rate = float(total_received_bytes)/elapsed_secs rate_info = ("(%s/s) " % (scale_number(rate))).ljust(13) - print("%-28s %15d %s%s" % ("received_msg_bytes", total_received_bytes, + print("%-30s %15d %s%s" % ("received_msg_bytes", total_received_bytes, rate_info, "Total bytes in all incoming messages")) if gro_packets != 0: - print("%-28s %6.2f %sHoma packets per homa_softirq call" % ( + print("%-30s %6.2f %sHoma packets per homa_softirq call" % ( "gro_benefit", float(total_packets)/float(gro_packets), pad)) avg_grantable_rpcs = 0.0 if ("grantable_rpcs_integral" in deltas) and (time_delta != 0): avg_grantable_rpcs = float(deltas["grantable_rpcs_integral"])/time_delta - print("%-28s %6.2f %sAverage number of grantable incoming RPCs" % ( + print("%-30s %6.2f %sAverage number of grantable incoming RPCs" % ( "avg_grantable_rpcs", avg_grantable_rpcs, pad)) if elapsed_secs != 0: @@ -449,7 +449,7 @@ def scale_number(number): continue rate = float(deltas[symbol])/elapsed_secs rate_info = ("(%s/s) " % (scale_number(rate))).ljust(13) - print("%-28s %15d %s%s" % (symbol, deltas[symbol], + print("%-30s %15d %s%s" % (symbol, deltas[symbol], rate_info, docs[symbol])) for symbol in ["pacer_lost_cycles", "timer_reap_cycles", "data_pkt_reap_cycles", "grant_lock_cycles"]: @@ -458,18 +458,18 @@ def scale_number(number): continue percent = "(%.1f%%)" % (100.0*delta/time_delta) percent = percent.ljust(12) - print("%-28s %15d %s %s" % (symbol, delta, percent, docs[symbol])) + print("%-30s %15d %s %s" % (symbol, delta, percent, docs[symbol])) if deltas["throttle_list_adds"] > 0: - print("%-28s %15.1f List traversals per throttle " + print("%-30s %15.1f List traversals per throttle " "list insert" % ("checks_per_throttle_insert", deltas["throttle_list_checks"]/deltas["throttle_list_adds"])) if deltas["responses_received"] > 0: - print("%-28s %15.1f ACK packets sent per 1000 client RPCs" + print("%-30s %15.1f ACK packets sent per 1000 client RPCs" % ("acks_per_krpc", 1000.0 * deltas["packets_sent_ACK"] / deltas["responses_received"])) if avg_grantable_rpcs > 1.0: - print("%-28s %6.2f %sAverage number of grantable incoming RPCs" % ( + print("%-30s %6.2f %sAverage number of grantable incoming RPCs" % ( "avg_grantable_rpcs", avg_grantable_rpcs, pad)) From 93bdded89f4d1412c000349380ec9724a7a217ea Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Sat, 5 Jul 2025 17:51:18 -0700 Subject: [PATCH 377/625] Replace rxlongterm analyzer in tthoma.py with longterm The new analyzer takes advantage of the more granular RPC statistics now provided by Homa. 
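To make the new analyzer's arithmetic concrete, here is a minimal sketch
(illustrative only, not part of this patch) of how a pair of successive
cumulative snapshot records can be turned into interval statistics for
client requests. The field names mirror the record layout documented in
the analyzer's comments; the sample numbers are made up:

    def client_request_rates(prev, cur):
        # 'time' is in usecs; the counters are cumulative, so activity
        # during the interval is the difference between the two records.
        secs = 1e-6 * (cur['time'] - prev['time'])
        return {
            # Messages started but not yet fully transmitted.
            'active_msgs': cur['creq_start'] - cur['creq_done'],
            # Untransmitted data in active messages (MB).
            'pending_mb': 1e-3 * (cur['creq_kbstart'] - cur['creq_kbdone']),
            # New messages started (K/sec).
            'msg_start_krate': 1e-3 * (cur['creq_start'] - prev['creq_start']) / secs,
            # Data started and data transmitted (Gbps; 8e-6 converts
            # KB/sec to Gbit/sec).
            'data_start_gbps': 8e-6 * (cur['creq_kbstart'] - prev['creq_kbstart']) / secs,
            'data_done_gbps': 8e-6 * (cur['creq_kbdone'] - prev['creq_kbdone']) / secs,
            # Messages completed (K/sec).
            'msg_done_krate': 1e-3 * (cur['creq_done'] - prev['creq_done']) / secs,
        }

    prev = {'time': 0, 'creq_start': 100, 'creq_kbstart': 5000,
            'creq_kbdone': 4000, 'creq_done': 90}
    cur = {'time': 1000000, 'creq_start': 250, 'creq_kbstart': 12000,
           'creq_kbdone': 11000, 'creq_done': 240}
    print(client_request_rates(prev, cur))

The same differencing is applied to the cresp_*, sreq_*, and sresp_*
counters to produce the client-response, server-request, and
server-response statistics.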
--- util/tthoma.py | 859 +++++++++++++++++++++++++++++++------------------ 1 file changed, 554 insertions(+), 305 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index b8727291..f89bf365 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -1598,32 +1598,75 @@ def __tcp_xmit(self, trace, time, core, match, interests): 'regexp': '__tcp_transmit_skb sent packet with ([0-9]+) bytes' }) - def __rx_snapshot1(self, trace, time, core, match, interests): + def __snapshot_clock(self, trace, time, core, match, interests): usecs = int(match.group(1)) - msgs_started = int(match.group(2)) - msgs_ended = int(match.group(3)) for interest in interests: - interest.tt_rx_snapshot1(trace, time, core, usecs, msgs_started, - msgs_ended) + interest.tt_snapshot_clock(trace, time, core, usecs) patterns.append({ - 'name': 'rx_snapshot1', - 'regexp': 'rx snapshot part 1, usecs (-[0-9]+), msgs_started ([0-9]+), ' - 'msgs_ended ([0-9]+)' + 'name': 'snapshot_clock', + 'regexp': 'rpc snapshot usecs ([0-9-]+)' }) - def __rx_snapshot2(self, trace, time, core, match, interests): - usecs = int(match.group(1)) - bytes_started = int(match.group(2)) * 4096 - bytes_ended = int(match.group(3)) * 4096 + + def __snapshot_client_request(self, trace, time, core, match, interests): + msgs_started = int(match.group(1)) + bytes_started = int(match.group(2)) + bytes_done = int(match.group(3)) + msgs_done = int(match.group(4)) + for interest in interests: + interest.tt_snapshot_client_request(trace, time, core, msgs_started, + bytes_started, bytes_done, msgs_done) + + patterns.append({ + 'name': 'snapshot_client_request', + 'regexp': 'rpc snapshot client requests started ([0-9]+), ' + 'kbytes_started ([0-9]+), kbytes_done ([0-9]+), done ([0-9]+)' + }) + + def __snapshot_client_response(self, trace, time, core, match, interests): + msgs_started = int(match.group(1)) + bytes_started = int(match.group(2)) + bytes_done = int(match.group(3)) + msgs_done = int(match.group(4)) + for interest in interests: + interest.tt_snapshot_client_response(trace, time, core, msgs_started, + bytes_started, bytes_done, msgs_done) + + patterns.append({ + 'name': 'snapshot_client_response', + 'regexp': 'rpc snapshot client responses started ([0-9]+), ' + 'kbytes_started ([0-9]+), kbytes_done ([0-9]+), done ([0-9]+)' + }) + + def __snapshot_server_request(self, trace, time, core, match, interests): + msgs_started = int(match.group(1)) + bytes_started = int(match.group(2)) + bytes_done = int(match.group(3)) + msgs_done = int(match.group(4)) + for interest in interests: + interest.tt_snapshot_server_request(trace, time, core, msgs_started, + bytes_started, bytes_done, msgs_done) + + patterns.append({ + 'name': 'snapshot_server_request', + 'regexp': 'rpc snapshot server requests started ([0-9]+), ' + 'kbytes_started ([0-9]+), kbytes_done ([0-9]+), done ([0-9]+)' + }) + + def __snapshot_server_response(self, trace, time, core, match, interests): + msgs_started = int(match.group(1)) + bytes_started = int(match.group(2)) + bytes_done = int(match.group(3)) + msgs_done = int(match.group(4)) for interest in interests: - interest.tt_rx_snapshot2(trace, time, core, usecs, bytes_started, - bytes_ended) + interest.tt_snapshot_server_response(trace, time, core, msgs_started, + bytes_started, bytes_done, msgs_done) patterns.append({ - 'name': 'rx_snapshot2', - 'regexp': 'rx snapshot part 2, usecs (-[0-9]+), 4kbytes_started ' - '([-0-9]+), 4kbytes_retired ([-0-9]+)' + 'name': 'snapshot_server_response', + 'regexp': 'rpc snapshot server responses started ([0-9]+), ' + 
'kbytes_started ([0-9]+), kbytes_done ([0-9]+), done ([0-9]+)'
 })
 #------------------------------------------------
@@ -4434,6 +4477,500 @@ def analyze(self):
 prev_time = itime
 interval['tx_q'] = cur_queue
+#------------------------------------------------
+# Analyzer: longterm
+#------------------------------------------------
+class AnalyzeLongterm:
+    """
+    Uses data recorded by homa_rpc_snapshot_log_tt to analyze statistics on
+    RPC progress for each node over a much longer time period than covered by
+    the traces themselves. Generates data about active messages as well as
+    arrival and service rates, with separate statistics for client vs. server
+    RPCs and requests vs. responses. This analyzer will not work unless
+    homa_rpc_snapshot_log_tt was invoked before freezing the timetraces.
+    If --data is specified then more detailed node-specific files are generated
+    in the data directory.
+    """
+
+    def __init__(self, dispatcher):
+        # Node name -> list of records for that node. Each record has
+        # the following fields:
+        # time:          Time when the record was generated.
+        # creq_start:    The client_requests_started Homa metric
+        # creq_kbstart:  The client_request_bytes_started Homa metric,
+        #                except units are KB, not bytes
+        # creq_kbdone:   The client_request_bytes_done Homa metric,
+        #                except units are KB, not bytes
+        # creq_done:     The client_requests_done Homa metric
+        # cresp_start:   The client_responses_started Homa metric
+        # cresp_kbstart: The client_response_bytes_started Homa metric,
+        #                except units are KB, not bytes
+        # cresp_kbdone:  The client_response_bytes_done Homa metric,
+        #                except units are KB, not bytes
+        # cresp_done:    The client_responses_done Homa metric
+        # sreq_start:    The server_requests_started Homa metric
+        # sreq_kbstart:  The server_request_bytes_started Homa metric,
+        #                except units are KB, not bytes
+        # sreq_kbdone:   The server_request_bytes_done Homa metric,
+        #                except units are KB, not bytes
+        # sreq_done:     The server_requests_done Homa metric
+        # sresp_start:   The server_responses_started Homa metric
+        # sresp_kbstart: The server_response_bytes_started Homa metric,
+        #                except units are KB, not bytes
+        # sresp_kbdone:  The server_response_bytes_done Homa metric,
+        #                except units are KB, not bytes
+        # sresp_done:    The server_responses_done Homa metric
+        self.node_records = defaultdict(list)
+
+        # A list with one entry for each interval of snapshot data (not the
+        # same intervals as the global variable "intervals"). Each entry
+        # is a list with two values:
+        # time:     The time that the interval represents
+        # indexes:  A list with one entry for each element in
+        #           get_sorted_nodes, which is the index of the first
+        #           element in node_records whose time is at or after
+        #           time, or -1 if there is no such entry or if the
+        #           index would be zero (so there is no preceding entry)
+        self.intervals = []
+
+        # Elapsed time between elements of self.intervals
+        self.interval = None
+
+    def init_trace(self, trace):
+        # Time of the first snapshot record encountered for this node;
+        # serves as a reference point for time values in the records.
+        self.ref_time = None
+
+    def tt_snapshot_clock(self, trace, t, core, usecs):
+        if self.ref_time == None:
+            self.ref_time = t
+        records = self.node_records[trace['node']]
+        if len(records) > 0 and (not 'creq_start' in records[-1]
+                or not 'cresp_start' in records[-1]
+                or not 'sreq_start' in records[-1]
+                or not 'sresp_start' in records[-1]):
+            # Previous record was incomplete, so just remove it.
+ print('Removing incomplete snapshot record for node %s at ' + 'usecs %d' % (trace['node'], usecs)) + del records[-1] + records.append({'time': self.ref_time + usecs}) + + def tt_snapshot_client_request(self, trace, t, core, msgs_started, + bytes_started, bytes_done, msgs_done): + records = self.node_records[trace['node']] + if records: + record = records[-1] + if 'time' in record and not 'creq_start' in record: + record['creq_start'] = msgs_started + record['creq_kbstart'] = bytes_started + record['creq_kbdone'] = bytes_done + record['creq_done'] = msgs_done + + def tt_snapshot_client_response(self, trace, t, core, msgs_started, + bytes_started, bytes_done, msgs_done): + records = self.node_records[trace['node']] + if records: + record = records[-1] + if ('creq_start' in record and + not 'cresp_start' in record): + record['cresp_start'] = msgs_started + record['cresp_kbstart'] = bytes_started + record['cresp_kbdone'] = bytes_done + record['cresp_done'] = msgs_done + + def tt_snapshot_server_request(self, trace, t, core, msgs_started, + bytes_started, bytes_done, msgs_done): + records = self.node_records[trace['node']] + if records: + record = records[-1] + if ('cresp_start' in record and + not 'sreq_start' in record): + record['sreq_start'] = msgs_started + record['sreq_kbstart'] = bytes_started + record['sreq_kbdone'] = bytes_done + record['sreq_done'] = msgs_done + + def tt_snapshot_server_response(self, trace, t, core, msgs_started, + bytes_started, bytes_done, msgs_done): + records = self.node_records[trace['node']] + if records: + record = records[-1] + if ('sreq_start' in record and + not 'sresp_start' in record): + record['sresp_start'] = msgs_started + record['sresp_kbstart'] = bytes_started + record['sresp_kbdone'] = bytes_done + record['sresp_done'] = msgs_done + + def analyze(self): + """ + Determines the length of the intervals in the data and returns a + list with one entry for each interval. Each entry is a list with + two values: + time: The time that the interval represents + indexes: A list with one entry for each element in + get_sorted_nodes, which is the index of the first + element in node_records whose time is at or after + time, or -1 if there is no such entry or if the + index would be zero (so there is no preceding entry) + """ + + nodes = get_sorted_nodes() + start = 1e20 + end = -1e20 + interval = None + for node in nodes: + records = self.node_records[node] + if records[0]['time'] < start: + start = records[0]['time'] + if records[-1]['time'] > end: + end = records[-1]['time'] + + # Figure out the interval for records on this node (round to + # an integer that is all zeroes except the high-order digit) + tend = records[-1]['time'] + tstart = records[0]['time'] + node_interval = (tend - tstart) / (len(records) - 1) + node_interval = int(float('%.0g' % (node_interval))) + if interval == None: + interval = node_interval + elif interval != node_interval: + print('%s has a different interval for rx backlog records than %s (%d vs %d)' % + (node, nodes[0], node_interval, interval), file=sys.stderr) + + start = int(start) // interval * interval + + # Each iteration of the following loop generates one list of indexes + # for the result. 
+ next = [1] * len(nodes) + self.intervals = [] + for t in count(start, interval): + if t > end: + break + indices = [] + for i in range(0, len(nodes)): + records = self.node_records[nodes[i]] + if records[0]['time'] >= t or records[-1]['time'] < t: + indices.append(-1) + continue + while records[next[i]]['time'] < t: + next[i] += 1 + indices.append(next[i]) + # print('Index %d for %s has interval %d, time %d, usecs %d' % ( + # next[i], nodes[i], t, records[next[i]]['time'], + # records[next[i]]['usecs'])) + self.intervals.append([t, indices]) + + self.interval = interval + + def output_node_client_data(self, node, node_index): + """ + Generates a node-specific data file with time series data about + client RPCs issued by that node. + node: Name of node for which to print data + node_index: Index of info for this node in various arrays + """ + + f = open('%s/longterm_client_%s.dat' % (options.data, node), 'w') + f.write('# Node: %s\n' % (node)) + f.write('# Generated at %s.\n' % + (time.strftime('%I:%M %p on %m/%d/%Y'))) + f.write('# Interval-based statistics about outgoing RPCs issued by ' + '%s\n' % (node)) + f.write('# Time: Time in seconds. The actual interval for the ' + 'data spans this\n') + f.write('# time and its length is approximately the same ' + 'as the time between\n') + f.write('# consecutive lines, but its end time could be ' + 'anywhere from the\n') + f.write('# given time up to the next time\n') + f.write('# ActvReq: Number of active request messages as of this interval\n') + f.write('# ReqMB: Pending request data as of this interval ' + '(untransmitted data in\n') + f.write('# active messages, Mbytes)\n') + f.write('# ReqStart: Rate at which new requests started in the ' + 'interval (K/sec)\n') + f.write('# ReqDStart: Total data in new requests that started in the ' + 'interval,\n') + f.write('# expressed as a rate (Gbps)\n') + f.write('# ReqDDone Rate at which request data was transmitted in the ' + 'interval (Gbps)\n') + f.write('# ReqDone: Rate at which request messages completed in the ' + 'interval (K/sec)\n') + f.write('# ActvResp: Number of active response messages as of this interval\n') + f.write('# RspMB: Unreceived response data as of this interval (MB)\n') + f.write('# RspStart: Rate at which new responses started in the ' + 'interval (K/sec)\n') + f.write('# RspDStart: Total data in new responses that started in the ' + 'interval,\n') + f.write('# expressed as a rate (Gbps)\n') + f.write('# RspDDone Rate at which response data was received in the ' + 'interval (Gbps)\n') + f.write('# RspDone: Rate at which response messages completed in the ' + 'interval (K/sec)\n') + f.write('\n') + f.write('# Time ActvReq ReqMB ReqStart ReqDStart ReqDDone ReqDone') + f.write(' ActvRsp RspMB RspStart RspDStart RspDDone RspDone\n') + + records = self.node_records[node] + for interval in self.intervals: + t = interval[0] + record_index = interval[1][node_index] + if record_index < 0: + continue + cur = records[record_index] + prev = records[record_index - 1] + elapsed_secs = 1e-6 * (cur['time'] - prev['time']) + mpending = cur['creq_start'] - cur['creq_done'] + kbpending = cur['creq_kbstart'] - cur['creq_kbdone'] + mstarts = cur['creq_start'] - prev['creq_start'] + kbstarts = cur['creq_kbstart'] - prev['creq_kbstart'] + kbdone = cur['creq_kbdone'] - prev['creq_kbdone'] + mdone = cur['creq_done'] - prev['creq_done'] + f.write('%10.3f %7d %6.2f %8.2f %9.2f %8.2f %7.2f' % (1e-06 * t, + mpending, 1e-3 * kbpending, 1e-3 * (mstarts / elapsed_secs), + 8e-6 * (kbstarts / 
elapsed_secs), + 8e-6 * (kbdone / elapsed_secs), + 1e-3 * (mdone / elapsed_secs) + )) + mpending = cur['cresp_start'] - cur['cresp_done'] + kbpending = cur['cresp_kbstart'] - cur['cresp_kbdone'] + mstarts = cur['cresp_start'] - prev['cresp_start'] + kbstarts = cur['cresp_kbstart'] - prev['cresp_kbstart'] + kbdone = cur['cresp_kbdone'] - prev['cresp_kbdone'] + mdone = cur['cresp_done'] - prev['cresp_done'] + f.write(' %7d %6.2f %8.2f %9.2f %8.2f %7.2f\n' % ( + mpending, 1e-3 * kbpending, 1e-3 * (mstarts / elapsed_secs), + 8e-6 * (kbstarts / elapsed_secs), + 8e-6 * (kbdone / elapsed_secs), + 1e-3 * (mdone / elapsed_secs) + )) + + def output_node_server_data(self, node, node_index): + """ + Generates a node-specific data file with time series data about + server RPCs handled by that node. + node: Name of node for which to print data + node_index: Index of info for this node in various arrays + """ + + f = open('%s/longterm_server_%s.dat' % (options.data, node), 'w') + f.write('# Node: %s\n' % (node)) + f.write('# Generated at %s.\n' % + (time.strftime('%I:%M %p on %m/%d/%Y'))) + f.write('# Interval-based statistics about incoming RPCs served by ' + '%s\n' % (node)) + f.write('# Time: Time in seconds. The actual interval for the ' + 'data spans this\n') + f.write('# time and its length is approximately the same ' + 'as the time between\n') + f.write('# consecutive lines, but its end time could be ' + 'anywhere from the\n') + f.write('# given time up to the next time\n') + f.write('# ActvReq: Number of active request messages as of this interval\n') + f.write('# ReqMB: Pending request data as of this interval ' + '(unreceived data in\n') + f.write('# active messages, Mbytes)\n') + f.write('# ReqStart: Rate at which new requests started in the ' + 'interval (K/sec)\n') + f.write('# ReqDStart: Total data in new requests that started in the ' + 'interval,\n') + f.write('# expressed as a rate (Gbps)\n') + f.write('# ReqDDone Rate at which request data was received in the ' + 'interval (Gbps)\n') + f.write('# ReqDone: Rate at which request messages completed in the ' + 'interval (K/sec)\n') + f.write('# ActvResp: Number of active response messages as of this interval\n') + f.write('# RspMB: Untransmitted response data as of this interval (MB)\n') + f.write('# RspStart: Rate at which new responses started in the ' + 'interval (K/sec)\n') + f.write('# RspDStart: Total data in new responses that started in the ' + 'interval,\n') + f.write('# expressed as a rate (Gbps)\n') + f.write('# RspDDone Rate at which response data was transmitted in the ' + 'interval (Gbps)\n') + f.write('# RspDone: Rate at which response messages completed in the ' + 'interval (K/sec)\n') + f.write('\n') + f.write('# Time ActvReq ReqMB ReqStart ReqDStart ReqDDone ReqDone') + f.write(' ActvRsp RspMB RspStart RspDStart RspDDone RspDone\n') + + records = self.node_records[node] + for interval in self.intervals: + t = interval[0] + record_index = interval[1][node_index] + if record_index < 0: + continue + cur = records[record_index] + prev = records[record_index - 1] + elapsed_secs = 1e-6 * (cur['time'] - prev['time']) + mpending = cur['sreq_start'] - cur['sreq_done'] + kbpending = cur['sreq_kbstart'] - cur['sreq_kbdone'] + mstarts = cur['sreq_start'] - prev['sreq_start'] + kbstarts = cur['sreq_kbstart'] - prev['sreq_kbstart'] + kbdone = cur['sreq_kbdone'] - prev['sreq_kbdone'] + mdone = cur['sreq_done'] - prev['sreq_done'] + f.write('%10.3f %7d %6.2f %8.2f %9.2f %8.2f %7.2f' % (1e-06 * t, + mpending, 1e-3 * kbpending, 1e-3 * 
(mstarts / elapsed_secs), + 8e-6 * (kbstarts / elapsed_secs), + 8e-6 * (kbdone / elapsed_secs), + 1e-3 * (mdone / elapsed_secs) + )) + mpending = cur['sresp_start'] - cur['sresp_done'] + kbpending = cur['sresp_kbstart'] - cur['sresp_kbdone'] + mstarts = cur['sresp_start'] - prev['sresp_start'] + kbstarts = cur['sresp_kbstart'] - prev['sresp_kbstart'] + kbdone = cur['sresp_kbdone'] - prev['sresp_kbdone'] + mdone = cur['sresp_done'] - prev['sresp_done'] + f.write(' %7d %6.2f %8.2f %9.2f %8.2f %7.2f\n' % ( + mpending, 1e-3 * kbpending, 1e-3 * (mstarts / elapsed_secs), + 8e-6 * (kbstarts / elapsed_secs), + 8e-6 * (kbdone / elapsed_secs), + 1e-3 * (mdone / elapsed_secs) + )) + + def output(self): + print('\n--------------------') + print('Analyzer: longterm') + print('--------------------\n') + + nodes = get_sorted_nodes() + + print('# Activity for client requests issued by each node over the ' + 'last 2 seconds:') + print('# Node: Name of node') + print('# Active: Number of active request messages at the end ' + 'of the traces') + print('# PendMB: Pending (untransmitted) data in active messages ' + 'at the') + print('# end of the traces (Mbytes)') + print('# MStart: Average rate at which new request messages ' + 'started (K/sec)') + print('# DStart: Total data in new requests that started, ' + 'expressed as a rate') + print('# (Gbps)') + print('# DDone Average rate at which request data was ' + 'transmitted (Gbps)') + print('# MDone: Average rate at which request messages ' + 'completed (K/sec)') + print('\n# Node Active PendMB MStart DStart DDone MDone') + + for node in nodes: + records = self.node_records[node] + cur = records[-1] + prev_index = len(records) - 1 - int(2.0 / (self.interval / 1e6)) + if prev_index < 0: + prev_index = 0 + prev = records[prev_index] + elapsed_secs = 1e-6 * (cur['time'] - prev['time']) + mpending = cur['creq_start'] - cur['creq_done'] + kbpending = cur['creq_kbstart'] - cur['creq_kbdone'] + mstarts = cur['creq_start'] - prev['creq_start'] + kbstarts = cur['creq_kbstart'] - prev['creq_kbstart'] + kbdone = cur['creq_kbdone'] - prev['creq_kbdone'] + mdone = cur['creq_done'] - prev['creq_done'] + print('%-10s %7d %7.2f %7.2f %7.2f %7.2f %7.2f' % (node, + mpending, 1e-3 * kbpending, 1e-3 * (mstarts / elapsed_secs), + 8e-6 * (kbstarts / elapsed_secs), + 8e-6 * (kbdone / elapsed_secs), + 1e-3 * (mdone / elapsed_secs) + )) + + print() + print('# Activity for client responses received by each node ' + 'over the last 2 seconds:') + print('# Node: Name of node') + print('# Active: Number of active response messages as of the end ' + 'of the traces') + print('# PendMB: Pending (unreceived) data in active messages ' + 'at the') + print('# end of the traces (Mbytes)') + print('# MStart: Average rate at which new response messages started (K/sec)') + print('# DStart: Total data in new responses that started, ' + 'expressed as a rate') + print('# (Gbps)') + print('# DDone Average rate at which response data was received (Gbps)') + print('# MDone: Average rate at which response messages completed (K/sec)') + print('\n# Node Active PendMB MStart DStart DDone MDone') + + for node in nodes: + records = self.node_records[node] + cur = records[-1] + prev_index = len(records) - 1 - int(2.0 / (self.interval / 1e6)) + if prev_index < 0: + prev_index = 0 + prev = records[prev_index] + elapsed_secs = 1e-6 * (cur['time'] - prev['time']) + mpending = cur['cresp_start'] - cur['cresp_done'] + kbpending = cur['cresp_kbstart'] - cur['cresp_kbdone'] + mstarts = cur['cresp_start'] - 
prev['cresp_start'] + kbstarts = cur['cresp_kbstart'] - prev['cresp_kbstart'] + kbdone = cur['cresp_kbdone'] - prev['cresp_kbdone'] + mdone = cur['cresp_done'] - prev['cresp_done'] + print('%-10s %7d %7.2f %7.2f %7.2f %7.2f %7.2f' % (node, + mpending, 1e-3 * kbpending, 1e-3 * (mstarts / elapsed_secs), + 8e-6 * (kbstarts / elapsed_secs), + 8e-6 * (kbdone / elapsed_secs), + 1e-3 * (mdone / elapsed_secs) + )) + + print('\n# Activity for server requests received by each node over ' + 'the last 2 seconds') + print('# (Columns are the same as for client responses)') + print('\n# Node Active PendMB MStart DStart DDone MDone') + + for node in nodes: + records = self.node_records[node] + cur = records[-1] + prev_index = len(records) - 1 - int(2.0 / (self.interval / 1e6)) + if prev_index < 0: + prev_index = 0 + prev = records[prev_index] + elapsed_secs = 1e-6 * (cur['time'] - prev['time']) + mpending = cur['sreq_start'] - cur['sreq_done'] + kbpending = cur['sreq_kbstart'] - cur['sreq_kbdone'] + mstarts = cur['sreq_start'] - prev['sreq_start'] + kbstarts = cur['sreq_kbstart'] - prev['sreq_kbstart'] + kbdone = cur['sreq_kbdone'] - prev['sreq_kbdone'] + mdone = cur['sreq_done'] - prev['sreq_done'] + print('%-10s %7d %7.2f %7.2f %7.2f %7.2f %7.2f' % (node, + mpending, 1e-3 * kbpending, 1e-3 * (mstarts / elapsed_secs), + 8e-6 * (kbstarts / elapsed_secs), + 8e-6 * (kbdone / elapsed_secs), + 1e-3 * (mdone / elapsed_secs) + )) + + print('\n# Activity for server responses transmitted by each node over ' + 'the last 2 seconds') + print('# (Columns are the same as for client requests)') + print('\n# Node Active PendMB MStart DStart DDone MDone') + + for node in nodes: + records = self.node_records[node] + cur = records[-1] + prev_index = len(records) - 1 - int(2.0 / (self.interval / 1e6)) + if prev_index < 0: + prev_index = 0 + prev = records[prev_index] + elapsed_secs = 1e-6 * (cur['time'] - prev['time']) + mpending = cur['sresp_start'] - cur['sresp_done'] + kbpending = cur['sresp_kbstart'] - cur['sresp_kbdone'] + mstarts = cur['sresp_start'] - prev['sresp_start'] + kbstarts = cur['sresp_kbstart'] - prev['sresp_kbstart'] + kbdone = cur['sresp_kbdone'] - prev['sresp_kbdone'] + mdone = cur['sresp_done'] - prev['sresp_done'] + print('%-10s %7d %7.2f %7.2f %7.2f %7.2f %7.2f' % (node, + mpending, 1e-3 * kbpending, 1e-3 * (mstarts / elapsed_secs), + 8e-6 * (kbstarts / elapsed_secs), + 8e-6 * (kbdone / elapsed_secs), + 1e-3 * (mdone / elapsed_secs) + )) + + if options.data: + for i in range(0, len(nodes)): + self.output_node_client_data(nodes[i], i) + self.output_node_server_data(nodes[i], i) + #------------------------------------------------ # Analyzer: lost #------------------------------------------------ @@ -6499,294 +7036,6 @@ def output(self): print('%8d %20s %10s %4s %9.3f %9.3f %7.1f' % (active, pkid, node, core_id, gro_time, time, time - gro_time)) -#------------------------------------------------ -# Analyzer: rxlongterm -#------------------------------------------------ -class AnalyzeRxlongterm: - """ - Uses data recorded by homa_rx_snapshot_log_tt to analyze incoming RPC - traffic for each node over a much longer time period than covered by - the traces themselves. Provides information about backlog (incomplete - incoming RPCs) as well as arrival rates of new RPCs and service rates. - This analyzer will not work unless homa_rx_snapshot_log_tt was invoked - before reading the timetraces. If --data is specified then more detailed - node-specific files are generated in the data directory. 
- """ - - def __init__(self, dispatcher): - # Node name -> list of records for that node. Each record has - # the following fields: - # time: Time when the record was generated. - # mstarts: Value of the msgs_started field from the homa_rx_snapshot - # mends: Value of the msgs_ended field from the homa_rx_snapshot - # bstarts: Value of the msg_bytes_started field from the - # homa_rx_snapshot - # bends: Value of the msg_bytes_retired field from the - # homa_rx_snapshot - # - self.node_records = defaultdict(list) - - # A list with one entry for each interval of backlog data (not the - # same intervals as the global variable "intervals"). Each entry - # is a list with two values: - # time: The time that the interval represents - # indexes: A list with one entry for each element in - # get_sorted_nodes, which is the index of the first - # element in node_records whose time is at or after - # time, or -1 if there is no such entry or if the - # index would be zero (so there is no preceding entry) - self.intervals = [] - - # Elepased time between elements of self.intervals - self.interval = None - - def init_trace(self, trace): - # Time of the first snapshot record encountered for this node; - # serves as a reference point for time values in the records. - self.ref_time = None - - def tt_rx_snapshot1(self, trace, t, core, usecs, msg_starts, msg_ends): - if self.ref_time == None: - self.ref_time = t - records = self.node_records[trace['node']] - if (len(records) > 0 and not 'bstarts' in records[-1]): - # Previous record was incomplete, so just remove it. - print('Removing incomplete rx_snapshot record for node %s at ' - 'usecs %d' % (trace['node'], usecs)) - del records[-1] - records.append({'time': self.ref_time + usecs, 'mstarts': msg_starts, - 'mends': msg_ends, 'usecs': usecs}) - - def tt_rx_snapshot2(self, trace, t, core, usecs, byte_starts, byte_ends): - if self.ref_time == None: - self.ref_time = t - record_time = self.ref_time + usecs - records = self.node_records[trace['node']] - if records: - record = records[-1] - if (record['time'] != record_time): - print('Ignoring rx_snapshot2 record for node %s at usecs %d ' - 'because of time mismatch: expected %.2f, got %.2f' & - (trace['node'], usecs, record_time, record['time'])) - else: - record['bstarts'] = byte_starts - record['bends'] = byte_ends - - def analyze(self): - """ - Returns a list with one entry for each interval. 
Each entry is a list - with two values: - time: The time that the interval represents - indexes: A list with one entry for each element in - get_sorted_nodes, which is the index of the first - element in node_records whose time is at or after - time, or -1 if there is no such entry or if the - index would be zero (so there is no preceding entry) - """ - - nodes = get_sorted_nodes() - start = 1e20 - end = -1e20 - interval = None - for node in nodes: - records = self.node_records[node] - if records[0]['time'] < start: - start = records[0]['time'] - if records[-1]['time'] > end: - end = records[-1]['time'] - - # Figure out the interval for records on this node (round to - # an integer that is all zeroes except the high-order digit) - tend = records[-1]['time'] - tstart = records[0]['time'] - node_interval = (tend - tstart) / (len(records) - 1) - node_interval = int(float('%.0g' % (node_interval))) - if interval == None: - interval = node_interval - elif interval != node_interval: - print('%s has a different interval for rx backlog records than %s (%d vs %d)' % - (node, nodes[0], node_interval, interval), file=sys.stderr) - - start = int(start) // interval * interval - - # Each iteration of the following loop generates one list of indexes - # for the resut. - next = [1] * len(nodes) - self.intervals = [] - for t in count(start, interval): - if t > end: - break - indices = [] - for i in range(0, len(nodes)): - records = self.node_records[nodes[i]] - if records[0]['time'] >= t or records[-1]['time'] < t: - indices.append(-1) - continue - while records[next[i]]['time'] < t: - next[i] += 1 - indices.append(next[i]) - # print('Index %d for %s has interval %d, time %d, usecs %d' % ( - # next[i], nodes[i], t, records[next[i]]['time'], - # records[next[i]]['usecs'])) - self.intervals.append([t, indices]) - - self.interval = interval - - def output_node_data(self, node, node_index): - """ - Generates a node-specific data file with details about that - particular node. - node: Name of node for which to print data - node_index: Index of info for this node in various arrays - """ - - f = open('%s/rxlongterm_%s.dat' % (options.data, node), 'w') - f.write('# Node: %s\n' % (node)) - f.write('# Generated at %s.\n' % - (time.strftime('%I:%M %p on %m/%d/%Y'))) - f.write('# Interval-based statistics about incoming RPCs on a single node:\n') - f.write('# Time: Time in seconds. 
The actual interval for the ' - 'data spans this\n') - f.write(' time and its length is approximately the same ' - 'as the time between\n') - f.write(' consecutive lines, but its end time could be ' - 'anywhere from the\n') - f.write(' given time up to the next time\n') - f.write('# MStart: New incoming messages that started during ' - 'the interval\n') - f.write('# MStartR Rate at which new messages started (K/sec)\n') - f.write('# MEnd: Messages for which the last byte was received\n') - f.write('# MEndR Rate at which messages ended (K/sec)\n') - f.write('# DStart: Total data in new messages that started during ' - 'the interval (MB)\n') - f.write('# DStartR Rate coresponding to DStart (Gbps)\n') - f.write('# DRecv: Data that was successfully received in the ' - 'interval (goodput, MB)\n') - f.write('# DRecvR Rate corresonding to DEnd (K/sec)\n') - f.write('\n') - f.write('# Time MStart MStartR MEnd MEndR DStart DStartR DRecv DRecvR\n') - - records = self.node_records[node] - for interval in self.intervals: - t = interval[0] - rec_index = interval[1][node_index] - if rec_index < 0: - continue - cur = records[rec_index] - prev = records[rec_index - 1] - elapsed_secs = 1e-6 * (cur['time'] - prev['time']) - mstarts = cur['mstarts'] - prev['mstarts'] - mends = cur['mends'] - prev['mends'] - f.write('%10.3f %6d %6.2f %5d %6.2f' % (1e-06 * t, - mstarts, 1e-3 * (mstarts / elapsed_secs), - mends, 1e-3 * (mends / elapsed_secs), - )) - bstarts = cur['bstarts'] - prev['bstarts'] - bends = cur['bends'] - prev['bends'] - f.write(' %6.2f %7.2f %6.2f %6.2f\n' % ( - 1e-6 * bstarts, 8e-9 * (bstarts / elapsed_secs), - 1e-6 * bends, 8e-9 * (bends / elapsed_secs), - )) - - def output(self): - print('\n--------------------') - print('Analyzer: rxlongterm') - print('--------------------\n') - - nodes = get_sorted_nodes() - - print('Overall rates of incoming messages for each node:') - print('Secs: Time period over which averages were computed (seconds)') - print('Mstart: Average rate at which incoming messages were ' - 'initiated (first') - print(' packet arrived, K/sec)') - print('Mend: Average rate at which incoming messages were ' - 'completed (last byte') - print(' of data arrived, K/sec)') - print('Bstart: Average rate at which incoming messages were ' - 'initiated, weighted') - print(' by amount of data in the message (Gbps)') - print('Brecv: Average rate at which data was successfully ' - 'received for incoming') - print(' messages (goodput only, Gbps)') - print('') - print('Node Secs Mstart Mend Bstart Brecv') - print('-----------------------------------------------------') - - sum_mstarts = 0 - sum_mends = 0 - sum_bstarts = 0 - sum_bends = 0 - sum_secs = 0 - for node in nodes: - first = self.node_records[node][0] - last = self.node_records[node][-1] - records = self.node_records[node] - secs = 1e-6 * (last['time'] - first['time']) - if secs <= 0: - continue - sum_secs += secs - mstarts = last['mstarts'] - first['mstarts'] - sum_mstarts += mstarts - mends = last['mends'] - first['mends'] - sum_mends += mends - bstarts = last['bstarts'] - first['bstarts'] - sum_bstarts += bstarts - bends = last['bends'] - first['bends'] - sum_bends += bends - print('%-10s %6.2f %8.2f %8.2f %8.2f %8.2f' % ( - node, secs, - 1e-3 * mstarts / secs, - 1e-3 * mends / secs, - 8e-9 * bstarts / secs, - 8e-9 * bends / secs - )) - if sum_secs != 0: - print('Average %6.2f %8.2f %8.2f %8.2f %8.2f' % ( - sum_secs / len(nodes), - 1e-3 * sum_mstarts / sum_secs, - 1e-3 * sum_mends / sum_secs, - 8e-9 * sum_bstarts / sum_secs, - 8e-9 * 
sum_bends / sum_secs - )) - - print('') - print('The number of active incoming RPCs on each node (those for ' - 'which at least') - print('one packet has been received, but some data is still ' - 'outstanding) as a') - print('function of time (in seconds).') - print('') - - print(' Time', end='') - for node in nodes: - print('%10s' % (node), end='') - print('') - print('-' * 10 * (len(nodes) + 1)) - for interval in self.intervals: - any_data = False - line = '%10.4f' % (interval[0] * 1e-6) - for i, index in zip(range(0, len(interval[1])), interval[1]): - if index == -1: - line += ' ' * 10 - else: - record = self.node_records[nodes[i]][index] - if record['time'] < interval[0] or ( - self.node_records[nodes[i]][index-1]['time'] >= - interval[0]): - print('Index %d for %s has out-of range time ' - '(time %d, interval time %.4f)' % ( - index, nodes[i], record['time'] * 1e-6, - interval[0]), file=sys.stderr) - line += ' %8d' % (record['mstarts'] - record['mends']) - any_data = True - if any_data: - print(line.rstrip()) - - if options.data: - for i in range(0, len(nodes)): - self.output_node_data(nodes[i], i) - #------------------------------------------------ # Analyzer: rxsnapshot #------------------------------------------------ From 16db534fbf20001b0eedd763bf0a5215cff4b962 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Sun, 6 Jul 2025 20:43:50 -0700 Subject: [PATCH 378/625] Remove unneeded tt_record calls --- homa_incoming.c | 2 -- homa_interest.h | 1 - 2 files changed, 3 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index 80ea5d6f..25d87aca 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -555,8 +555,6 @@ void homa_dispatch_pkts(struct sk_buff *skb) goto discard; } } else { - tt_record1("homa_dispatch_pkts has rpc lock for id %d", - rpc->id); if (h->common.type == DATA || #ifndef __STRIP__ /* See strip.py */ h->common.type == GRANT || diff --git a/homa_interest.h b/homa_interest.h index 4ba81250..3947470c 100644 --- a/homa_interest.h +++ b/homa_interest.h @@ -72,7 +72,6 @@ struct homa_interest { */ static inline void homa_interest_unlink_shared(struct homa_interest *interest) { - tt_record("homa_interest_unlink_shared invoked"); if (!list_empty(&interest->links)) { homa_sock_lock(interest->hsk); list_del_init(&interest->links); From 63883fec82f8a691982b1e61dba7190500bc8330 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 7 Jul 2025 09:06:35 -0700 Subject: [PATCH 379/625] Change when rpc->msgin.birth is set Previously it was set when the input message was queued for grants; now it is set when the first data packet arrives. The earlier approach caused messages to have a lower grant priority if they blocked waiting for buffer pool space, which caused overloaded servers to get poor service. 
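In condensed form (taken from the homa_incoming.c hunk below), the
timestamp now gets recorded in homa_message_in_init, when the first
data packet of a message arrives, rather than in homa_grant_manage_rpc
when the message is queued for grants:

    int homa_message_in_init(struct homa_rpc *rpc, int length)
    {
            ...
            rpc->msgin.bytes_remaining = length;

            /* Taken at arrival of the first data packet, so an RPC
             * that stalls waiting for buffer pool space keeps its
             * true age (and hence its age-based grant priority).
             */
            rpc->msgin.birth = homa_clock();
            ...
    }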
--- homa_grant.c | 1 - homa_incoming.c | 1 + homa_rpc.h | 5 ++--- test/unit_homa_grant.c | 2 -- test/unit_homa_incoming.c | 2 ++ 5 files changed, 5 insertions(+), 6 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index d3f9fbb3..0d6a3b2f 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -428,7 +428,6 @@ void homa_grant_manage_rpc(struct homa_rpc *rpc) grant->num_grantable_rpcs, rpc->id); if (grant->num_grantable_rpcs > grant->max_grantable_rpcs) grant->max_grantable_rpcs = grant->num_grantable_rpcs; - rpc->msgin.birth = time; bumped = homa_grant_insert_active(rpc); if (bumped) diff --git a/homa_incoming.c b/homa_incoming.c index 25d87aca..e549a198 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -51,6 +51,7 @@ int homa_message_in_init(struct homa_rpc *rpc, int length) skb_queue_head_init(&rpc->msgin.packets); INIT_LIST_HEAD(&rpc->msgin.gaps); rpc->msgin.bytes_remaining = length; + rpc->msgin.birth = homa_clock(); err = homa_pool_alloc_msg(rpc); if (err != 0) { rpc->msgin.length = -1; diff --git a/homa_rpc.h b/homa_rpc.h index 6d2725e7..dd6533c6 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -191,9 +191,8 @@ struct homa_message_in { int rec_incoming; /** - * @birth: homa_clock() time when homa_grant_manage_rpc was invoked - * for this RPC. Managed by homa_grant.c. Only set if the RPC needs - * grants. + * @birth: homa_clock() time when this structure was initialized + * (i.e. first data packet was received for message). */ u64 birth; diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index a590dac0..cc245f80 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -567,9 +567,7 @@ TEST_F(homa_grant, homa_grant_manage_rpc__insert_and_bump_to_grantables) EXPECT_EQ(850, homa_metrics_per_cpu()->grantable_rpcs_integral); EXPECT_EQ(300, self->homa.grant->last_grantable_change); EXPECT_EQ(-1, rpc1->msgin.rank); - EXPECT_EQ(200, rpc1->msgin.birth); EXPECT_EQ(0, rpc2->msgin.rank); - EXPECT_EQ(300, rpc2->msgin.birth); unit_log_clear(); unit_log_grantables(&self->homa); EXPECT_STREQ("active[0]: id 102 ungranted 19000; " diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 7f3b39e0..e3260e26 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -135,8 +135,10 @@ TEST_F(homa_incoming, homa_message_in_init__basics) UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 99, 1000, 1000); + mock_clock = 200; EXPECT_EQ(0, homa_message_in_init(crpc, 127, 100)); EXPECT_EQ(100, crpc->msgin.granted); + EXPECT_EQ(200, crpc->msgin.birth); EXPECT_EQ(0, homa_message_in_init(crpc, 128, 500)); EXPECT_EQ(128, crpc->msgin.granted); EXPECT_EQ(1, crpc->msgin.num_bpages); From 04df9e05b7fcc7c0675178b701848d09247eb77d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 7 Jul 2025 11:01:19 -0700 Subject: [PATCH 380/625] Change wakeup mechanism for RPCs waiting for buffer pool space Instead of using the grant mechanism to request retransmission, issue a RESEND packet for the discarded packets. The grant approach effectively deprioritized RPCs relative to new RPCs that didn't have to wait for buffer space (the waking RPCs have no received data whereas normal RPCs have received unscheduled data). This caused server backups under high load. This change eliminates the resend_all fields in homa_grant_hdr and homa_message_in. 
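In condensed form, the replacement wakeup logic in
homa_pool_check_waiting (shown in full in the homa_pool.c hunk below)
is:

    if (rpc->msgin.num_bpages > 0) {
            struct homa_resend_hdr resend;

            /* Buffer space is now available; request retransmission
             * of all the packets that were dropped while the RPC was
             * waiting. The next-to-highest priority level gives the
             * resent data a boost without interfering with control
             * packets.
             */
            resend.offset = htonl(0);
            resend.length = htonl(-1);
            resend.priority = (rpc->hsk->homa->num_priorities < 2) ? 0 :
                            rpc->hsk->homa->num_priorities - 2;
            homa_xmit_control(RESEND, &resend, sizeof(resend), rpc);
            if (rpc->msgin.granted < rpc->msgin.length)
                    homa_grant_manage_rpc(rpc);
    }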
--- homa_devel.c | 10 ++++----- homa_grant.c | 12 +++++----- homa_incoming.c | 4 ---- homa_pool.c | 20 +++++++++++++---- homa_rpc.h | 3 --- homa_wire.h | 7 ------ test/unit_homa_grant.c | 40 +++++++++++---------------------- test/unit_homa_incoming.c | 41 ++-------------------------------- test/unit_homa_offload.c | 1 - test/unit_homa_outgoing.c | 6 +---- test/unit_homa_pool.c | 47 +++++++++++++++++++++++++++++++++++++-- 11 files changed, 86 insertions(+), 105 deletions(-) diff --git a/homa_devel.c b/homa_devel.c index a9d37cfa..32701246 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -217,11 +217,10 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) #ifndef __STRIP__ /* See strip.py */ case GRANT: { struct homa_grant_hdr *h = (struct homa_grant_hdr *)header; - char *resend = (h->resend_all) ? ", resend_all" : ""; used = homa_snprintf(buffer, buf_len, used, - ", offset %d, grant_prio %u%s", - ntohl(h->offset), h->priority, resend); + ", offset %d, grant_prio %u", + ntohl(h->offset), h->priority); break; } #endif /* See strip.py */ @@ -349,10 +348,9 @@ char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len) #ifndef __STRIP__ /* See strip.py */ case GRANT: { struct homa_grant_hdr *h = (struct homa_grant_hdr *)header; - char *resend = h->resend_all ? " resend_all" : ""; - snprintf(buffer, buf_len, "GRANT %d@%d%s", ntohl(h->offset), - h->priority, resend); + snprintf(buffer, buf_len, "GRANT %d@%d", ntohl(h->offset), + h->priority); break; } #endif /* See strip.py */ diff --git a/homa_grant.c b/homa_grant.c index 0d6a3b2f..0cb88c77 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -146,9 +146,6 @@ void homa_grant_init_rpc(struct homa_rpc *rpc, int unsched) __must_hold(rpc->bucket->lock) { rpc->msgin.rank = -1; - if (rpc->msgin.num_bpages == 0) - /* Can't issue grants until buffer space becomes available. */ - return; if (unsched >= rpc->msgin.length) { rpc->msgin.granted = rpc->msgin.length; rpc->msgin.prev_grant = rpc->msgin.granted; @@ -156,7 +153,11 @@ void homa_grant_init_rpc(struct homa_rpc *rpc, int unsched) } rpc->msgin.granted = unsched; rpc->msgin.prev_grant = unsched; - homa_grant_manage_rpc(rpc); + if (rpc->msgin.num_bpages != 0) + /* Can't issue grants unless buffer space has been allocated + * for the message. 
+ */ + homa_grant_manage_rpc(rpc); } /** @@ -656,9 +657,6 @@ void homa_grant_send(struct homa_rpc *rpc, int priority) grant.offset = htonl(rpc->msgin.granted); grant.priority = priority; - grant.resend_all = rpc->msgin.resend_all; - if (grant.resend_all) - rpc->msgin.resend_all = 0; tt_record4("sending grant for id %d, offset %d, priority %d, increment %d", rpc->id, rpc->msgin.granted, grant.priority, rpc->msgin.granted - rpc->msgin.prev_grant); diff --git a/homa_incoming.c b/homa_incoming.c index e549a198..12a3290c 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -769,10 +769,6 @@ void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc) homa_local_id(h->common.sender_id), ntohl(h->offset), h->priority, new_offset - rpc->msgout.granted); if (rpc->state == RPC_OUTGOING) { - if (h->resend_all) - homa_resend_data(rpc, 0, rpc->msgout.next_xmit_offset, - h->priority); - if (new_offset > rpc->msgout.granted) { rpc->msgout.granted = new_offset; if (new_offset > rpc->msgout.length) diff --git a/homa_pool.c b/homa_pool.c index 6f3e57be..da1d492e 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -527,10 +527,22 @@ void homa_pool_check_waiting(struct homa_pool *pool) homa_pool_alloc_msg(rpc); #ifndef __STRIP__ /* See strip.py */ if (rpc->msgin.num_bpages > 0) { - /* Allocation succeeded; "wake up" the RPC. */ - rpc->msgin.resend_all = 1; - homa_grant_init_rpc(rpc, 0); - homa_grant_check_rpc(rpc); + struct homa_resend_hdr resend; + + /* To "wake up" the RPC, request retransmission of + * all the packets that were dropped. Use the + * next-to-highest priority level to provide a priority + * boost without interfering with the highest priority + * traffic such as control packets. + */ + resend.offset = htonl(0); + resend.length = htonl(-1); + resend.priority = (rpc->hsk->homa->num_priorities < 2) ? + 0 : + rpc->hsk->homa->num_priorities - 2; + homa_xmit_control(RESEND, &resend, sizeof(resend), rpc); + if (rpc->msgin.granted < rpc->msgin.length) + homa_grant_manage_rpc(rpc); } #endif /* See strip.py */ homa_rpc_unlock(rpc); diff --git a/homa_rpc.h b/homa_rpc.h index dd6533c6..4b0e92aa 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -195,9 +195,6 @@ struct homa_message_in { * (i.e. first data packet was received for message). */ u64 birth; - - /** @resend_all: if nonzero, set resend_all in the next grant packet. */ - u8 resend_all; #endif /* See strip.py */ }; diff --git a/homa_wire.h b/homa_wire.h index cce00b64..2c386fed 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -388,13 +388,6 @@ struct homa_grant_hdr { * with higher offset. Larger numbers indicate higher priorities. */ u8 priority; - - /** - * @resend_all: Nonzero means that the sender should resend all previously - * transmitted data, starting at the beginning of the message (assume - * that no packets have been successfully received). 
- */ - u8 resend_all; } __packed; #endif /* See strip.py */ diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index cc245f80..540a30ba 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -187,32 +187,30 @@ TEST_F(homa_grant, homa_grant_free__sysctls_not_registered) EXPECT_STREQ("", unit_log_get()); } -TEST_F(homa_grant, homa_grant_init_rpc__no_bpages_available) +TEST_F(homa_grant, homa_grant_init_rpc__grants_not_needed) { struct homa_rpc *rpc; rpc= unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 100, 1000, 20000); - - atomic_set(&self->hsk.buffer_pool->free_bpages, 0); - homa_message_in_init(rpc, 20000, 10000); - EXPECT_EQ(0, rpc->msgin.num_bpages); + homa_message_in_init(rpc, 2000, 2000); EXPECT_EQ(-1, rpc->msgin.rank); - EXPECT_EQ(0, rpc->msgin.granted); + EXPECT_EQ(2000, rpc->msgin.granted); } -TEST_F(homa_grant, homa_grant_init_rpc__grants_not_needed) +TEST_F(homa_grant, homa_grant_init_rpc__grants_needed) { struct homa_rpc *rpc; rpc= unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, 100, 1000, 20000); - homa_message_in_init(rpc, 2000, 2000); - EXPECT_EQ(-1, rpc->msgin.rank); + + homa_message_in_init(rpc, 5000, 2000); + EXPECT_EQ(0, rpc->msgin.rank); EXPECT_EQ(2000, rpc->msgin.granted); } -TEST_F(homa_grant, homa_grant_init_rpc__grants_needed) +TEST_F(homa_grant, homa_grant_init_rpc__no_bpages_available) { struct homa_rpc *rpc; @@ -220,9 +218,11 @@ TEST_F(homa_grant, homa_grant_init_rpc__grants_needed) self->server_ip, self->server_port, 100, 1000, 20000); - homa_message_in_init(rpc, 5000, 2000); - EXPECT_EQ(0, rpc->msgin.rank); - EXPECT_EQ(2000, rpc->msgin.granted); + atomic_set(&self->hsk.buffer_pool->free_bpages, 0); + homa_message_in_init(rpc, 20000, 10000); + EXPECT_EQ(0, rpc->msgin.num_bpages); + EXPECT_EQ(-1, rpc->msgin.rank); + EXPECT_EQ(10000, rpc->msgin.granted); } TEST_F(homa_grant, homa_grant_end_rpc__basics) @@ -953,20 +953,6 @@ TEST_F(homa_grant, homa_grant_send__basics) homa_grant_send(rpc, 3); EXPECT_SUBSTR("id 100, offset 2600, grant_prio 3", unit_log_get()); } -TEST_F(homa_grant, homa_grant_send__resend_all) -{ - struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 20000); - - mock_xmit_log_verbose = 1; - rpc->msgin.granted = 9999; - rpc->msgin.rank = 0; - rpc->msgin.resend_all = 1; - unit_log_clear(); - homa_grant_send(rpc, 1); - EXPECT_SUBSTR("id 100, offset 9999, grant_prio 1, resend_all", - unit_log_get()); - EXPECT_EQ(0, rpc->msgin.resend_all); -} TEST_F(homa_grant, homa_grant_check_rpc__msgin_not_initialized) { diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index e3260e26..60df73ae 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -176,9 +176,6 @@ TEST_F(homa_incoming, homa_message_in_init__no_buffers_available) atomic_set(&self->hsk.buffer_pool->free_bpages, 0); EXPECT_EQ(0, homa_message_in_init(crpc, HOMA_BPAGE_SIZE*2, 10000)); EXPECT_EQ(0, crpc->msgin.num_bpages); -#ifndef __STRIP__ /* See strip.py */ - EXPECT_EQ(0, crpc->msgin.granted); -#endif /* See strip.py */ } #ifndef __STRIP__ /* See strip.py */ TEST_F(homa_incoming, homa_message_in_init__update_message_length_metrics) @@ -1183,7 +1180,7 @@ TEST_F(homa_incoming, homa_dispatch_pkts__reset_counters) .dport = htons(self->hsk.port), .sender_id = cpu_to_be64(self->server_id), .type = GRANT}, - .offset = htonl(12600), .priority = 3, .resend_all = 0}; + .offset = htonl(12600), .priority = 3}; ASSERT_NE(NULL, crpc); EXPECT_EQ(10000, 
crpc->msgout.granted); @@ -1574,8 +1571,7 @@ TEST_F(homa_incoming, homa_grant_pkt__basics) .sender_id = cpu_to_be64(self->client_id), .type = GRANT}, .offset = htonl(11000), - .priority = 3, - .resend_all = 0}; + .priority = 3}; ASSERT_NE(NULL, srpc); homa_rpc_lock(srpc); @@ -1605,39 +1601,6 @@ TEST_F(homa_incoming, homa_grant_pkt__basics) /* Must restore old state to avoid potential crashes. */ srpc->state = RPC_OUTGOING; } -TEST_F(homa_incoming, homa_grant_pkt__reset) -{ - struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 100, 20000); - struct homa_grant_hdr h = {{.sport = htons(srpc->dport), - .dport = htons(self->hsk.port), - .sender_id = cpu_to_be64(self->client_id), - .type = GRANT}, - .offset = htonl(3000), - .priority = 2, - .resend_all = 1}; - - ASSERT_NE(NULL, srpc); - homa_rpc_lock(srpc); - homa_xmit_data(srpc, false); - homa_rpc_unlock(srpc); - unit_log_clear(); - EXPECT_EQ(10000, srpc->msgout.granted); - EXPECT_EQ(10000, srpc->msgout.next_xmit_offset); - - homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); - EXPECT_EQ(10000, srpc->msgout.granted); - EXPECT_EQ(10000, srpc->msgout.next_xmit_offset); - EXPECT_STREQ("xmit DATA retrans 1400@0; " - "xmit DATA retrans 1400@1400; " - "xmit DATA retrans 1400@2800; " - "xmit DATA retrans 1400@4200; " - "xmit DATA retrans 1400@5600; " - "xmit DATA retrans 1400@7000; " - "xmit DATA retrans 1400@8400; " - "xmit DATA retrans 200@9800", unit_log_get()); -} TEST_F(homa_incoming, homa_grant_pkt__grant_past_end_of_message) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index ffa0677f..820d9d57 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -337,7 +337,6 @@ TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization) h.common.type = GRANT; h.offset = htonl(11000); h.priority = 3; - h.resend_all = 0; /* First attempt: HOMA_GRO_FAST_GRANTS not enabled. 
*/ self->homa.gro_policy = 0; diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 105de871..9d59e201 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -116,7 +116,6 @@ TEST_F(homa_outgoing, set_priority__priority_mapping) h.offset = htonl(12345); h.priority = 4; - h.resend_all = 0; EXPECT_EQ(0, homa_xmit_control(GRANT, &h, sizeof(h), srpc)); self->homa.priority_map[7] = 3; EXPECT_EQ(0, homa_xmit_control(GRANT, &h, sizeof(h), srpc)); @@ -725,7 +724,6 @@ TEST_F(homa_outgoing, __homa_xmit_control__ipv4_error) h.offset = htonl(12345); h.priority = 4; - h.resend_all = 0; mock_xmit_log_verbose = 1; mock_ip_queue_xmit_errors = 1; EXPECT_EQ(ENETDOWN, -homa_xmit_control(GRANT, &h, sizeof(h), srpc)); @@ -749,7 +747,6 @@ TEST_F(homa_outgoing, __homa_xmit_control__ipv6_error) h.offset = htonl(12345); h.priority = 4; - h.resend_all = 0; mock_xmit_log_verbose = 1; mock_ip6_xmit_errors = 1; EXPECT_EQ(ENETDOWN, -homa_xmit_control(GRANT, &h, sizeof(h), srpc)); @@ -763,8 +760,7 @@ TEST_F(homa_outgoing, homa_xmit_unknown) .dport = htons(self->server_port), .sender_id = cpu_to_be64(99990), .type = GRANT}, - .offset = htonl(11200), - .resend_all = 0}; + .offset = htonl(11200)}; struct sk_buff *skb; mock_xmit_log_verbose = 1; diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index 045057a2..40dbdced 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -675,15 +675,58 @@ TEST_F(homa_pool, homa_pool_check_waiting__wake_up_waiting_rpc) ASSERT_NE(NULL, crpc); EXPECT_EQ(0, crpc->msgin.num_bpages); EXPECT_EQ(2, pool->bpages_needed); + EXPECT_EQ(-1, crpc->msgin.rank); /* Free the required pages. */ unit_log_clear(); atomic_set(&pool->free_bpages, 2); homa_pool_check_waiting(pool); EXPECT_EQ(2, crpc->msgin.num_bpages); + EXPECT_STREQ("xmit RESEND 0--2@6", unit_log_get()); EXPECT_EQ(0, crpc->msgin.rank); - EXPECT_STREQ("xmit GRANT 10000@0 resend_all", - unit_log_get()); +} +TEST_F(homa_pool, homa_pool_check_waiting__wake_up_waiting_rpc_only_one_priority_level) +{ + struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc; + + /* Queue up an RPC that needs 2 bpages. */ + atomic_set(&pool->free_bpages, 0); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2*HOMA_BPAGE_SIZE); + ASSERT_NE(NULL, crpc); + EXPECT_EQ(0, crpc->msgin.num_bpages); + EXPECT_EQ(2, pool->bpages_needed); + self->homa.num_priorities = 1; + + /* Free the required pages. */ + unit_log_clear(); + atomic_set(&pool->free_bpages, 2); + homa_pool_check_waiting(pool); + EXPECT_EQ(2, crpc->msgin.num_bpages); + EXPECT_EQ(0, crpc->msgin.rank); + EXPECT_STREQ("xmit RESEND 0--2@0", unit_log_get()); +} +TEST_F(homa_pool, homa_pool_check_waiting__wake_up_waiting_rpc_no_need_for_grants) +{ + struct homa_pool *pool = self->hsk.buffer_pool; + struct homa_rpc *crpc; + + atomic_set(&pool->free_bpages, 0); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 5000); + ASSERT_NE(NULL, crpc); + EXPECT_EQ(0, crpc->msgin.num_bpages); + EXPECT_EQ(1, pool->bpages_needed); + EXPECT_EQ(-1, crpc->msgin.rank); + + /* Free the required pages. 
*/ + unit_log_clear(); + atomic_set(&pool->free_bpages, 2); + homa_pool_check_waiting(pool); + EXPECT_EQ(1, crpc->msgin.num_bpages); + EXPECT_STREQ("xmit RESEND 0--2@6", unit_log_get()); + EXPECT_EQ(-1, crpc->msgin.rank); } #endif /* See strip.py */ TEST_F(homa_pool, homa_pool_check_waiting__reallocation_fails) From 2565840a843fd607ff1137db89893861d50c3e88 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 7 Jul 2025 11:41:57 -0700 Subject: [PATCH 381/625] Clean up sparse annotations Use the name of the lock variable, not its address. --- homa_grant.c | 22 +++++++++++----------- homa_grant.h | 4 ++-- homa_incoming.c | 2 +- homa_interest.c | 8 ++++---- homa_interest.h | 2 +- homa_pacer.c | 2 +- homa_pacer.h | 6 +++--- homa_peer.c | 4 ++-- homa_peer.h | 6 +++--- homa_pool.c | 2 +- homa_sock.c | 2 +- homa_sock.h | 6 +++--- homa_timer.c | 2 +- 13 files changed, 34 insertions(+), 34 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 0cb88c77..d53ef471 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -270,7 +270,7 @@ int homa_grant_priority(struct homa *homa, int rank) * that @rpc displaced). */ struct homa_rpc *homa_grant_insert_active(struct homa_rpc *rpc) - __must_hold(&rpc->hsk->homa->grant->lock) + __must_hold(rpc->hsk->homa->grant->lock) { struct homa_grant *grant = rpc->hsk->homa->grant; struct homa_rpc *other, *result; @@ -350,7 +350,7 @@ struct homa_rpc *homa_grant_insert_active(struct homa_rpc *rpc) * or grantable_peers. */ void homa_grant_insert_grantable(struct homa_rpc *rpc) - __must_hold(&rpc->hsk->homa->grant->lock) + __must_hold(rpc->hsk->homa->grant->lock) { struct homa_grant *grant = rpc->hsk->homa->grant; struct homa_peer *peer = rpc->peer; @@ -411,7 +411,7 @@ void homa_grant_insert_grantable(struct homa_rpc *rpc) * @rpc: The RPC to add. Must be locked by caller. */ void homa_grant_manage_rpc(struct homa_rpc *rpc) - __must_hold(&rpc->bucket->lock) + __must_hold(rpc->bucket->lock) { struct homa_grant *grant = rpc->hsk->homa->grant; struct homa_rpc *bumped; @@ -445,7 +445,7 @@ void homa_grant_manage_rpc(struct homa_rpc *rpc) * a grantable list. */ void homa_grant_remove_grantable(struct homa_rpc *rpc) - __must_hold(&rpc->hsk->homa->grant->lock) + __must_hold(rpc->hsk->homa->grant->lock) { struct homa_grant *grant = rpc->hsk->homa->grant; struct homa_peer *peer = rpc->peer; @@ -493,7 +493,7 @@ void homa_grant_remove_grantable(struct homa_rpc *rpc) */ void homa_grant_remove_active(struct homa_rpc *rpc, struct homa_grant_candidates *cand) - __must_hold(&rpc->hsk->homa->grant->lock) + __must_hold(rpc->hsk->homa->grant->lock) { struct homa_grant *grant = rpc->hsk->homa->grant; struct homa_peer *peer; @@ -538,7 +538,7 @@ void homa_grant_remove_active(struct homa_rpc *rpc, */ void homa_grant_unmanage_rpc(struct homa_rpc *rpc, struct homa_grant_candidates *cand) - __must_hold(&rpc->bucket->lock) + __must_hold(rpc->bucket->lock) { struct homa_grant *grant = rpc->hsk->homa->grant; u64 time = homa_clock(); @@ -570,7 +570,7 @@ void homa_grant_unmanage_rpc(struct homa_rpc *rpc, * @grant: Grant information for a Homa transport. */ void homa_grant_update_incoming(struct homa_rpc *rpc, struct homa_grant *grant) - __must_hold(&rpc->bucket->lock) + __must_hold(rpc->bucket->lock) { int incoming, delta; @@ -596,7 +596,7 @@ void homa_grant_update_incoming(struct homa_rpc *rpc, struct homa_grant *grant) * and no grant should be sent. 
*/ int homa_grant_update_granted(struct homa_rpc *rpc, struct homa_grant *grant) - __must_hold(&rpc->bucket->lock) + __must_hold(rpc->bucket->lock) { int received, new_grant_offset, incoming_delta, avl_incoming, rank; int prev_stalled; @@ -672,7 +672,7 @@ void homa_grant_send(struct homa_rpc *rpc, int priority) * @rpc: RPC to check. Must be locked by the caller. */ void homa_grant_check_rpc(struct homa_rpc *rpc) - __must_hold(&rpc->bucket->lock) + __must_hold(rpc->bucket->lock) { struct homa_grant *grant = rpc->hsk->homa->grant; int needy_rank, stalled_rank, rank; @@ -808,7 +808,7 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) * or INT_MAX if no RPCs were promoted. */ int homa_grant_fix_order(struct homa_grant *grant) - __must_hold(&grant->lock) + __must_hold(grant->lock) { struct homa_rpc *rpc, *other; int result = INT_MAX; @@ -1028,7 +1028,7 @@ void homa_grant_cand_check(struct homa_grant_candidates *cand, * @grant: Grant management information. */ void homa_grant_lock_slow(struct homa_grant *grant) - __acquires(&grant->lock) + __acquires(grant->lock) { u64 start = homa_clock(); diff --git a/homa_grant.h b/homa_grant.h index 6507ea64..da49e5df 100644 --- a/homa_grant.h +++ b/homa_grant.h @@ -288,7 +288,7 @@ static inline bool homa_grant_cand_empty(struct homa_grant_candidates *cand) * @grant: Grant management info. */ static inline void homa_grant_lock(struct homa_grant *grant) - __acquires(&grant->lock) + __acquires(grant->lock) { if (!spin_trylock_bh(&grant->lock)) homa_grant_lock_slow(grant); @@ -300,7 +300,7 @@ static inline void homa_grant_lock(struct homa_grant *grant) * @grant: Grant management info. */ static inline void homa_grant_unlock(struct homa_grant *grant) - __releases(&grant->grant_lock) + __releases(grant->grant_lock) { INC_METRIC(grant_lock_cycles, homa_clock() - grant->lock_time); spin_unlock_bh(&grant->lock); diff --git a/homa_incoming.c b/homa_incoming.c index 12a3290c..33390c09 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -1071,7 +1071,7 @@ void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, * a negative errno. */ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) - __must_hold(&rpc->bucket->lock) + __must_hold(rpc->bucket->lock) { struct homa_interest interest; #ifndef __STRIP__ /* See strip.py */ diff --git a/homa_interest.c b/homa_interest.c index 9e1591cd..a3e43573 100644 --- a/homa_interest.c +++ b/homa_interest.c @@ -20,7 +20,7 @@ */ void homa_interest_init_shared(struct homa_interest *interest, struct homa_sock *hsk) - __must_hold(&hsk->lock) + __must_hold(hsk->lock) { interest->rpc = NULL; atomic_set(&interest->ready, 0); @@ -42,7 +42,7 @@ void homa_interest_init_shared(struct homa_interest *interest, */ int homa_interest_init_private(struct homa_interest *interest, struct homa_rpc *rpc) - __must_hold(&rpc->bucket->lock) + __must_hold(rpc->bucket->lock) { if (rpc->private_interest) return -EINVAL; @@ -138,7 +138,7 @@ int homa_interest_wait(struct homa_interest *interest, int nonblocking) * locked by the caller. */ void homa_interest_notify_private(struct homa_rpc *rpc) - __must_hold(&rpc->bucket->lock) + __must_hold(rpc->bucket->lock) { if (rpc->private_interest) { atomic_set_release(&rpc->private_interest->ready, 1); @@ -157,7 +157,7 @@ void homa_interest_notify_private(struct homa_rpc *rpc) * currently busy doing Homa transport work. 
*/ struct homa_interest *homa_choose_interest(struct homa_sock *hsk) - __must_hold(&hsk->lock) + __must_hold(hsk->lock) { u64 busy_time = homa_clock() - hsk->homa->busy_cycles; struct homa_interest *interest, *first; diff --git a/homa_interest.h b/homa_interest.h index 3947470c..645d25d5 100644 --- a/homa_interest.h +++ b/homa_interest.h @@ -88,7 +88,7 @@ static inline void homa_interest_unlink_shared(struct homa_interest *interest) * the caller. */ static inline void homa_interest_unlink_private(struct homa_interest *interest) - __must_hold(&interest->rpc->bucket->lock) + __must_hold(interest->rpc->bucket->lock) { if (interest == interest->rpc->private_interest) interest->rpc->private_interest = NULL; diff --git a/homa_pacer.c b/homa_pacer.c index 291c8a02..8082df11 100644 --- a/homa_pacer.c +++ b/homa_pacer.c @@ -487,7 +487,7 @@ void homa_pacer_log_throttled(struct homa_pacer *pacer) * @pacer: Pacer information for a Homa transport. */ void homa_pacer_throttle_lock_slow(struct homa_pacer *pacer) - __acquires(&pacer->throttle_lock) + __acquires(pacer->throttle_lock) { u64 start = homa_clock(); diff --git a/homa_pacer.h b/homa_pacer.h index 8611908a..1f4476d4 100644 --- a/homa_pacer.h +++ b/homa_pacer.h @@ -195,7 +195,7 @@ static inline void homa_pacer_check(struct homa_pacer *pacer) * @pacer: Pacer information for a Homa transport. */ static inline void homa_pacer_throttle_lock(struct homa_pacer *pacer) - __acquires(&pacer->throttle_lock) + __acquires(pacer->throttle_lock) { if (!spin_trylock_bh(&pacer->throttle_lock)) homa_pacer_throttle_lock_slow(pacer); @@ -206,7 +206,7 @@ static inline void homa_pacer_throttle_lock(struct homa_pacer *pacer) * @pacer: Pacer information for a Homa transport. */ static inline void homa_pacer_throttle_lock(struct homa_pacer *pacer) - __acquires(&pacer->throttle_lock) + __acquires(pacer->throttle_lock) { spin_lock_bh(&pacer->throttle_lock); } @@ -217,7 +217,7 @@ static inline void homa_pacer_throttle_lock(struct homa_pacer *pacer) * @pacer: Pacer information for a Homa transport. */ static inline void homa_pacer_throttle_unlock(struct homa_pacer *pacer) - __releases(&pacer->throttle_lock) + __releases(pacer->throttle_lock) { spin_unlock_bh(&pacer->throttle_lock); } diff --git a/homa_peer.c b/homa_peer.c index b8f6f6dc..08616ac5 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -220,7 +220,7 @@ void homa_peer_rcu_callback(struct rcu_head *head) * @peertab: Check the dead peers here. */ void homa_peer_free_dead(struct homa_peertab *peertab) - __must_hold(&peertab->lock) + __must_hold(peertab->lock) { struct homa_peer *peer, *tmp; @@ -689,7 +689,7 @@ void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, int c2, * @peer: Peer to lock. */ void homa_peer_lock_slow(struct homa_peer *peer) - __acquires(&peer->ack_lock) + __acquires(peer->ack_lock) { u64 start = homa_clock(); diff --git a/homa_peer.h b/homa_peer.h index cec167d3..df39db05 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -321,7 +321,7 @@ void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, * @peer: Peer to lock. */ static inline void homa_peer_lock(struct homa_peer *peer) - __acquires(&peer->ack_lock) + __acquires(peer->ack_lock) { if (!spin_trylock_bh(&peer->ack_lock)) homa_peer_lock_slow(peer); @@ -332,7 +332,7 @@ static inline void homa_peer_lock(struct homa_peer *peer) * @peer: Peer to lock. 
*/ static inline void homa_peer_lock(struct homa_peer *peer) - __acquires(&peer->ack_lock) + __acquires(peer->ack_lock) { spin_lock_bh(&peer->ack_lock); } @@ -343,7 +343,7 @@ static inline void homa_peer_lock(struct homa_peer *peer) * @peer: Peer to lock. */ static inline void homa_peer_unlock(struct homa_peer *peer) - __releases(&peer->ack_lock) + __releases(peer->ack_lock) { spin_unlock_bh(&peer->ack_lock); } diff --git a/homa_pool.c b/homa_pool.c index da1d492e..32c4191a 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -287,7 +287,7 @@ int homa_pool_get_pages(struct homa_pool *pool, int num_pages, u32 *pages, * returned. */ int homa_pool_alloc_msg(struct homa_rpc *rpc) - __must_hold(&rpc->bucket->lock) + __must_hold(rpc->bucket->lock) { struct homa_pool *pool = rpc->hsk->buffer_pool; int full_pages, partial, i, core_id; diff --git a/homa_sock.c b/homa_sock.c index 63e19954..47884748 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -444,7 +444,7 @@ struct homa_sock *homa_sock_find(struct homa_net *hnet, u16 port) * @hsk: socket to lock. */ void homa_sock_lock_slow(struct homa_sock *hsk) - __acquires(&hsk->lock) + __acquires(hsk->lock) { u64 start = homa_clock(); diff --git a/homa_sock.h b/homa_sock.h index 1b342055..acbd8d2e 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -294,7 +294,7 @@ struct homa_sock *homa_socktab_start_scan(struct homa_socktab *socktab, * @hsk: Socket to lock. */ static inline void homa_sock_lock(struct homa_sock *hsk) - __acquires(&hsk->lock) + __acquires(hsk->lock) { if (!spin_trylock_bh(&hsk->lock)) homa_sock_lock_slow(hsk); @@ -305,7 +305,7 @@ static inline void homa_sock_lock(struct homa_sock *hsk) * @hsk: Socket to lock. */ static inline void homa_sock_lock(struct homa_sock *hsk) - __acquires(&hsk->lock) + __acquires(hsk->lock) { spin_lock_bh(&hsk->lock); } @@ -316,7 +316,7 @@ static inline void homa_sock_lock(struct homa_sock *hsk) * @hsk: Socket to lock. */ static inline void homa_sock_unlock(struct homa_sock *hsk) - __releases(&hsk->lock) + __releases(hsk->lock) { spin_unlock_bh(&hsk->lock); } diff --git a/homa_timer.c b/homa_timer.c index 9ca6d906..0f7dce83 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -25,7 +25,7 @@ * @rpc: RPC to check; must be locked by the caller. */ void homa_timer_check_rpc(struct homa_rpc *rpc) - __must_hold(&rpc->bucket->lock) + __must_hold(rpc->bucket->lock) { struct homa *homa = rpc->hsk->homa; From 1a959a37e88588ac722fbef534f2c79c9208a182 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 7 Jul 2025 13:25:43 -0700 Subject: [PATCH 382/625] Add new function homa_grant_adjust_peer Replaces functionality in homa_grant_insert_grantable and homa_grant_remove_grantable; will be used when reimplementing FIFO grants. --- homa_grant.c | 132 +++++++++++++------------ homa_grant.h | 2 + test/unit_homa_grant.c | 219 +++++++++++++++++++++++++---------------- 3 files changed, 206 insertions(+), 147 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index d53ef471..36678a8a 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -343,6 +343,72 @@ struct homa_rpc *homa_grant_insert_active(struct homa_rpc *rpc) return result; } +/** + * homa_grant_adjust_peer() - This function is invoked when the contents + * of a peer's grantable_rpcs list has changed, so it's possible that + * the position of this peer in grantable_peers is no longer correct. The + * function adjusts the position of peer in grantable_peers (which could + * include adding or removing the peer to/from grantable_peers). 
+ * @grant: Overall information about grants + * @peer: Peer to adjust + */ +void homa_grant_adjust_peer(struct homa_grant *grant, struct homa_peer *peer) + __must_hold(&grant->lock) +{ + struct homa_rpc *head, *other_rpc; + struct homa_peer *other_peer; + + if (list_empty(&peer->grantable_rpcs)) { + list_del_init(&peer->grantable_links); + return; + } + + head = list_first_entry(&peer->grantable_rpcs, + struct homa_rpc, grantable_links); + if (list_empty(&peer->grantable_links)) { + /* Must add peer to grantable_peers. */ + list_for_each_entry(other_peer, &grant->grantable_peers, + grantable_links) { + other_rpc = list_first_entry(&other_peer->grantable_rpcs, + struct homa_rpc, + grantable_links); + if (homa_grant_outranks(head, other_rpc)) { + list_add_tail(&peer->grantable_links, + &other_peer->grantable_links); + return; + } + } + list_add_tail(&peer->grantable_links, &grant->grantable_peers); + return; + } + + /* The peer is on grantable_peers; this loop moves it upward, if + * needed. + */ + while (peer != list_first_entry(&grant->grantable_peers, + struct homa_peer, grantable_links)) { + other_peer = list_prev_entry(peer, grantable_links); + other_rpc = list_first_entry(&other_peer->grantable_rpcs, + struct homa_rpc, grantable_links); + if (!homa_grant_outranks(head, other_rpc)) + break; + __list_del_entry(&other_peer->grantable_links); + list_add(&other_peer->grantable_links, &peer->grantable_links); + } + + /* This loop moves the peer downward in grantable_peers, if needed. */ + while (peer != list_last_entry(&grant->grantable_peers, + struct homa_peer, grantable_links)) { + other_peer = list_next_entry(peer, grantable_links); + other_rpc = list_first_entry(&other_peer->grantable_rpcs, + struct homa_rpc, grantable_links); + if (!homa_grant_outranks(other_rpc, head)) + break; + __list_del_entry(&peer->grantable_links); + list_add(&peer->grantable_links, &other_peer->grantable_links); + } +} + /** * homa_grant_insert_grantable() - Insert an RPC into the grantable list * for its peer. @@ -354,7 +420,6 @@ void homa_grant_insert_grantable(struct homa_rpc *rpc) { struct homa_grant *grant = rpc->hsk->homa->grant; struct homa_peer *peer = rpc->peer; - struct homa_peer *other_peer; struct homa_rpc *other; /* Insert @rpc in the right place in the grantable_rpcs list for @@ -370,38 +435,7 @@ void homa_grant_insert_grantable(struct homa_rpc *rpc) list_add_tail(&rpc->grantable_links, &peer->grantable_rpcs); position_peer: - /* At this point rpc is positioned correctly on the list for its peer. - * However, the peer may need to be added to, or moved upward in, - * grantable_peers. - */ - if (list_empty(&peer->grantable_links)) { - /* Must add peer to grantable_peers. */ - list_for_each_entry(other_peer, &grant->grantable_peers, - grantable_links) { - other = list_first_entry(&other_peer->grantable_rpcs, - struct homa_rpc, - grantable_links); - if (homa_grant_outranks(rpc, other)) { - list_add_tail(&peer->grantable_links, - &other_peer->grantable_links); - return; - } - } - list_add_tail(&peer->grantable_links, &grant->grantable_peers); - return; - } - /* The peer is on grantable_peers, but it may need to move upward. 
*/ - while (peer != list_first_entry(&grant->grantable_peers, - struct homa_peer, grantable_links)) { - struct homa_peer *prev_peer = list_prev_entry(peer, - grantable_links); - other = list_first_entry(&prev_peer->grantable_rpcs, - struct homa_rpc, grantable_links); - if (!homa_grant_outranks(rpc, other)) - break; - __list_del_entry(&prev_peer->grantable_links); - list_add(&prev_peer->grantable_links, &peer->grantable_links); - } + homa_grant_adjust_peer(grant, peer); } /** @@ -447,42 +481,14 @@ void homa_grant_manage_rpc(struct homa_rpc *rpc) void homa_grant_remove_grantable(struct homa_rpc *rpc) __must_hold(rpc->hsk->homa->grant->lock) { - struct homa_grant *grant = rpc->hsk->homa->grant; struct homa_peer *peer = rpc->peer; - struct homa_rpc *other; struct homa_rpc *head; head = list_first_entry(&peer->grantable_rpcs, struct homa_rpc, grantable_links); list_del_init(&rpc->grantable_links); - if (rpc != head) - return; - - /* The removed RPC was at the front of the peer's list. This means - * we may have to adjust the position of the peer in the peer list, - * or perhaps remove it. - */ - if (list_empty(&peer->grantable_rpcs)) { - list_del_init(&peer->grantable_links); - return; - } - - /* The peer may have to move down in Homa's list (its highest priority - * may now be lower). - */ - head = list_first_entry(&peer->grantable_rpcs, - struct homa_rpc, grantable_links); - while (peer != list_last_entry(&grant->grantable_peers, - struct homa_peer, grantable_links)) { - struct homa_peer *next_peer = list_next_entry(peer, - grantable_links); - other = list_first_entry(&next_peer->grantable_rpcs, - struct homa_rpc, grantable_links); - if (!homa_grant_outranks(other, head)) - break; - __list_del_entry(&peer->grantable_links); - list_add(&peer->grantable_links, &next_peer->grantable_links); - } + if (rpc == head) + homa_grant_adjust_peer(rpc->hsk->homa->grant, peer); } /** diff --git a/homa_grant.h b/homa_grant.h index da49e5df..123ae874 100644 --- a/homa_grant.h +++ b/homa_grant.h @@ -226,6 +226,8 @@ struct homa_grant_candidates { struct homa_grant *homa_grant_alloc(void); +void homa_grant_adjust_peer(struct homa_grant *grant, + struct homa_peer *peer); void homa_grant_cand_add(struct homa_grant_candidates *cand, struct homa_rpc *rpc); void homa_grant_cand_check(struct homa_grant_candidates *cand, diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 540a30ba..ca83df1b 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -129,6 +129,12 @@ static struct homa_rpc *test_rpc(FIXTURE_DATA(homa_grant) *self, return rpc; } +/* Add an RPC to the front of its peer's grantable_rpcs. */ +// static void add_to_peer(struct homa_rpc *rpc) +// { +// list_add_tail(&rpc->grantable_links, &rpc->peer->grantable_rpcs); +// } + /* Create a client RPC whose msgin is properly initialized with no * unscheduled bytes and no packets received. 
*/ @@ -457,6 +463,134 @@ TEST_F(homa_grant, homa_grant_insert_active__insert_in_middle_no_bump) EXPECT_EQ(4, rpc1->peer->active_rpcs); } +TEST_F(homa_grant, homa_grant_adjust_peer__remove_peer_from_grantable_peers) +{ + struct homa_rpc *rpc = test_rpc(self, 200, self->server_ip, 100000); + struct homa_peer *peer = rpc->peer; + + list_add_tail(&peer->grantable_links, + &self->homa.grant->grantable_peers); + EXPECT_EQ(1, list_empty(&peer->grantable_rpcs)); + EXPECT_EQ(0, list_empty(&peer->grantable_links)); + EXPECT_EQ(0, list_empty(&self->homa.grant->grantable_peers)); + + homa_grant_adjust_peer(self->homa.grant, peer); + EXPECT_EQ(1, list_empty(&peer->grantable_links)); + EXPECT_EQ(1, list_empty(&self->homa.grant->grantable_peers)); +} +TEST_F(homa_grant, homa_grant_adjust_peer__insert_in_grantable_peers) +{ + struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 70000); + + homa_grant_insert_grantable(test_rpc(self, 200, self->server_ip + 1, + 100000)); + homa_grant_insert_grantable(test_rpc(self, 300, self->server_ip + 2, + 50000)); + list_add_tail(&rpc->grantable_links, &rpc->peer->grantable_rpcs); + homa_grant_adjust_peer(self->homa.grant, rpc->peer); + + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("peer 3.2.3.4: id 300 ungranted 49000; " + "peer 1.2.3.4: id 100 ungranted 69000; " + "peer 2.2.3.4: id 200 ungranted 99000", + unit_log_get()); +} +TEST_F(homa_grant, homa_grant_adjust_peer__append_to_grantable_peers) +{ + struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 120000); + + homa_grant_insert_grantable(test_rpc(self, 200, self->server_ip + 1, + 100000)); + homa_grant_insert_grantable(test_rpc(self, 300, self->server_ip + 2, + 50000)); + list_add_tail(&rpc->grantable_links, &rpc->peer->grantable_rpcs); + homa_grant_adjust_peer(self->homa.grant, rpc->peer); + + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("peer 3.2.3.4: id 300 ungranted 49000; " + "peer 2.2.3.4: id 200 ungranted 99000; " + "peer 1.2.3.4: id 100 ungranted 119000", + unit_log_get()); +} +TEST_F(homa_grant, homa_grant_adjust_peer__move_peer_upwards) +{ + struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 120000); + + homa_grant_insert_grantable(rpc); + homa_grant_insert_grantable(test_rpc(self, 200, self->server_ip + 1, + 100000)); + homa_grant_insert_grantable(test_rpc(self, 300, self->server_ip + 2, + 50000)); + homa_grant_insert_grantable(test_rpc(self, 400, self->server_ip + 3, + 80000)); + rpc->msgin.granted += 45000; + homa_grant_adjust_peer(self->homa.grant, rpc->peer); + + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("peer 3.2.3.4: id 300 ungranted 49000; " + "peer 1.2.3.4: id 100 ungranted 74000; " + "peer 4.2.3.4: id 400 ungranted 79000; " + "peer 2.2.3.4: id 200 ungranted 99000", + unit_log_get()); +} +TEST_F(homa_grant, homa_grant_adjust_peer__move_peer_to_front) +{ + struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 100000); + + homa_grant_insert_grantable(rpc); + homa_grant_insert_grantable(test_rpc(self, 200, self->server_ip + 1, + 50000)); + rpc->msgin.granted += 55000; + homa_grant_adjust_peer(self->homa.grant, rpc->peer); + + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("peer 1.2.3.4: id 100 ungranted 44000; " + "peer 2.2.3.4: id 200 ungranted 49000", + unit_log_get()); +} +TEST_F(homa_grant, homa_grant_adjust_peer__move_peer_downwards) +{ + struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 40000); + + homa_grant_insert_grantable(rpc); + 
homa_grant_insert_grantable(test_rpc(self, 200, self->server_ip + 1, + 100000)); + homa_grant_insert_grantable(test_rpc(self, 300, self->server_ip + 2, + 50000)); + homa_grant_insert_grantable(test_rpc(self, 400, self->server_ip + 3, + 80000)); + rpc->msgin.length += 41000; + homa_grant_adjust_peer(self->homa.grant, rpc->peer); + + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("peer 3.2.3.4: id 300 ungranted 49000; " + "peer 4.2.3.4: id 400 ungranted 79000; " + "peer 1.2.3.4: id 100 ungranted 80000; " + "peer 2.2.3.4: id 200 ungranted 99000", + unit_log_get()); +} +TEST_F(homa_grant, homa_grant_adjust_peer__move_peer_to_back) +{ + struct homa_rpc *rpc = test_rpc(self, 100, self->server_ip, 50000); + + homa_grant_insert_grantable(rpc); + homa_grant_insert_grantable(test_rpc(self, 200, self->server_ip + 1, + 100000)); + rpc->msgin.length += 55000; + homa_grant_adjust_peer(self->homa.grant, rpc->peer); + + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("peer 2.2.3.4: id 200 ungranted 99000; " + "peer 1.2.3.4: id 100 ungranted 104000", + unit_log_get()); +} + TEST_F(homa_grant, homa_grant_insert_grantable__insert_in_peer_list) { homa_grant_insert_grantable(test_rpc(self, 100, self->server_ip, @@ -494,42 +628,6 @@ TEST_F(homa_grant, homa_grant_insert_grantable__insert_peer_in_grantable_peers) "peer 3.2.3.4: id 400 ungranted 119000", unit_log_get()); } -TEST_F(homa_grant, homa_grant_insert_grantable__move_peer_in_grantable_peers) -{ - homa_grant_insert_grantable(test_rpc(self, 200, self->server_ip, - 20000)); - homa_grant_insert_grantable(test_rpc(self, 300, self->server_ip+1, - 30000)); - homa_grant_insert_grantable(test_rpc(self, 400, self->server_ip+2, - 40000)); - homa_grant_insert_grantable(test_rpc(self, 500, self->server_ip+3, - 50000)); - - /* This insertion moves the peer upwards in the list. */ - homa_grant_insert_grantable(test_rpc(self, 600, self->server_ip+3, - 25000)); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("peer 1.2.3.4: id 200 ungranted 19000; " - "peer 4.2.3.4: id 600 ungranted 24000 " - "id 500 ungranted 49000; " - "peer 2.2.3.4: id 300 ungranted 29000; " - "peer 3.2.3.4: id 400 ungranted 39000", - unit_log_get()); - - /* This insertion moves the peer to the front of the list. 
*/ - homa_grant_insert_grantable(test_rpc(self, 700, self->server_ip+3, - 10000)); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("peer 4.2.3.4: id 700 ungranted 9000 " - "id 600 ungranted 24000 " - "id 500 ungranted 49000; " - "peer 1.2.3.4: id 200 ungranted 19000; " - "peer 2.2.3.4: id 300 ungranted 29000; " - "peer 3.2.3.4: id 400 ungranted 39000", - unit_log_get()); -} TEST_F(homa_grant, homa_grant_manage_rpc__update_metrics) { @@ -609,7 +707,7 @@ TEST_F(homa_grant, homa_grant_remove_grantable__not_first_in_peer_list) "peer 2.2.3.4: id 400 ungranted 24000", unit_log_get()); } -TEST_F(homa_grant, homa_grant_remove_grantable__only_entry_in_peer_list) +TEST_F(homa_grant, homa_grant_remove_grantable__remove_peer_from_grantable_peers) { struct homa_rpc *rpc = test_rpc(self, 200, self->server_ip, 30000); @@ -633,53 +731,6 @@ TEST_F(homa_grant, homa_grant_remove_grantable__only_entry_in_peer_list) "peer 2.2.3.4: id 300 ungranted 39000", unit_log_get()); } -TEST_F(homa_grant, homa_grant_remove_grantable__reposition_peer_in_grantable_peers) -{ - struct homa_rpc *rpc1 = test_rpc(self, 200, self->server_ip, 20000); - struct homa_rpc *rpc2 = test_rpc(self, 202, self->server_ip, 35000); - - homa_grant_insert_grantable(rpc1); - homa_grant_insert_grantable(rpc2); - homa_grant_insert_grantable(test_rpc(self, 204, self->server_ip, - 60000)); - homa_grant_insert_grantable(test_rpc(self, 300, self->server_ip+1, - 30000)); - homa_grant_insert_grantable(test_rpc(self, 400, self->server_ip+2, - 40000)); - homa_grant_insert_grantable(test_rpc(self, 500, self->server_ip+3, - 50000)); - - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("peer 1.2.3.4: id 200 ungranted 19000 " - "id 202 ungranted 34000 " - "id 204 ungranted 59000; " - "peer 2.2.3.4: id 300 ungranted 29000; " - "peer 3.2.3.4: id 400 ungranted 39000; " - "peer 4.2.3.4: id 500 ungranted 49000", - unit_log_get()); - - /* First removal moves peer down, but not to end of list. */ - homa_grant_remove_grantable(rpc1); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("peer 2.2.3.4: id 300 ungranted 29000; " - "peer 1.2.3.4: id 202 ungranted 34000 " - "id 204 ungranted 59000; " - "peer 3.2.3.4: id 400 ungranted 39000; " - "peer 4.2.3.4: id 500 ungranted 49000", - unit_log_get()); - - /* Second removal moves peer to end of list. */ - homa_grant_remove_grantable(rpc2); - unit_log_clear(); - unit_log_grantables(&self->homa); - EXPECT_STREQ("peer 2.2.3.4: id 300 ungranted 29000; " - "peer 3.2.3.4: id 400 ungranted 39000; " - "peer 4.2.3.4: id 500 ungranted 49000; " - "peer 1.2.3.4: id 204 ungranted 59000", - unit_log_get()); -} TEST_F(homa_grant, homa_grant_remove_active__copy_existing_rpcs) { From 28a12b1fe48258edd4a052660cd2684de866112f Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 7 Jul 2025 14:50:04 -0700 Subject: [PATCH 383/625] Resurrect homa_grant_find_oldest --- homa_grant.c | 41 +++++++++++----- homa_grant.h | 2 +- test/unit_homa_grant.c | 107 +++++++++++++++++++++++++++-------------- 3 files changed, 101 insertions(+), 49 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 36678a8a..7a652d62 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -839,24 +839,23 @@ int homa_grant_fix_order(struct homa_grant *grant) } /** - * homa_grant_find_oldest() - Recompute the value of homa->oldest_rpc. - * @homa: Overall data about the Homa protocol implementation. The - * grant_lock must be held by the caller. 
+ * homa_grant_find_oldest() - Recompute the value of homa->grant->oldest_rpc. + * @homa: Overall data about the Homa protocol implementation. */ -void homa_grant_find_oldest(struct homa *homa) +void homa_grant_find_oldest(struct homa_grant *grant) + __must_hold(grant->lock) { - int max_incoming = homa->grant->window + 2 * homa->grant->fifo_grant_increment; + int max_incoming = grant->window + 2 * grant->fifo_grant_increment; struct homa_rpc *rpc, *oldest; struct homa_peer *peer; u64 oldest_birth; + int i; oldest = NULL; oldest_birth = ~0; - /* Find the oldest message that doesn't currently have an - * outstanding "pity grant". - */ - list_for_each_entry(peer, &homa->grant->grantable_peers, grantable_links) { + /* Check the grantable lists. */ + list_for_each_entry(peer, &grant->grantable_peers, grantable_links) { list_for_each_entry(rpc, &peer->grantable_rpcs, grantable_links) { int received, incoming; @@ -869,7 +868,7 @@ void homa_grant_find_oldest(struct homa *homa) incoming = rpc->msgin.granted - received; if (incoming >= max_incoming) { /* This RPC has been granted way more bytes - * than by the grant window. This can only + * than the grant window. This can only * happen for FIFO grants, and it means the * peer isn't responding to grants we've sent. * Pick a different "oldest" RPC. @@ -880,7 +879,27 @@ void homa_grant_find_oldest(struct homa *homa) oldest_birth = rpc->msgin.birth; } } - homa->grant->oldest_rpc = oldest; + + /* Check the active RPCs. */ + for (i = 0; i < grant->num_active_rpcs; i++) { + int received, incoming; + + rpc = grant->active_rpcs[i]; + if (rpc->msgin.birth >= oldest_birth) + continue; + + received = (rpc->msgin.length + - rpc->msgin.bytes_remaining); + incoming = rpc->msgin.granted - received; + if (incoming >= max_incoming) { + continue; + } + oldest = rpc; + oldest_birth = rpc->msgin.birth; + } + + /* Check active RPCs. */ + grant->oldest_rpc = oldest; } #ifndef __STRIP__ /* See strip.py */ diff --git a/homa_grant.h b/homa_grant.h index 123ae874..9a2186c5 100644 --- a/homa_grant.h +++ b/homa_grant.h @@ -236,7 +236,7 @@ void homa_grant_check_rpc(struct homa_rpc *rpc); int homa_grant_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); void homa_grant_end_rpc(struct homa_rpc *rpc); -void homa_grant_find_oldest(struct homa *homa); +void homa_grant_find_oldest(struct homa_grant *grant); int homa_grant_fix_order(struct homa_grant *grant); void homa_grant_free(struct homa_grant *grant); void homa_grant_init_rpc(struct homa_rpc *rpc, int unsched); diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index ca83df1b..9c7483cc 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -129,12 +129,6 @@ static struct homa_rpc *test_rpc(FIXTURE_DATA(homa_grant) *self, return rpc; } -/* Add an RPC to the front of its peer's grantable_rpcs. */ -// static void add_to_peer(struct homa_rpc *rpc) -// { -// list_add_tail(&rpc->grantable_links, &rpc->peer->grantable_rpcs); -// } - /* Create a client RPC whose msgin is properly initialized with no * unscheduled bytes and no packets received. 
*/ @@ -1323,48 +1317,87 @@ TEST_F(homa_grant, homa_grant_fix_order) EXPECT_EQ(3, homa_metrics_per_cpu()->grant_priority_bumps); } -#if 0 -TEST_F(homa_grant, homa_grant_find_oldest__basics) +TEST_F(homa_grant, homa_grant_find_oldest__check_grantable_lists) { - mock_clock_tick = 10; - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 11, 40000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+1, - self->server_ip, self->client_port, 33, 30000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 55, 20000, 100); + struct homa_rpc *rpc1, *rpc2, *rpc3; - unit_log_clear(); - homa_grant_find_oldest(&self->homa); - EXPECT_NE(NULL, self->homa.oldest_rpc); - EXPECT_EQ(11, self->homa.oldest_rpc->id); + rpc1 = test_rpc(self, 100, self->server_ip, 40000); + rpc1->msgin.birth = 100; + rpc2 = test_rpc(self, 102, self->server_ip, 20000); + rpc2->msgin.birth = 200; + rpc3 = test_rpc(self, 104, self->server_ip + 1, 30000); + rpc3->msgin.birth = 300; + homa_grant_insert_grantable(rpc1); + homa_grant_insert_grantable(rpc2); + homa_grant_insert_grantable(rpc3); + + homa_grant_find_oldest(self->homa.grant); + ASSERT_NE(NULL, self->homa.grant->oldest_rpc); + EXPECT_EQ(100, self->homa.grant->oldest_rpc->id); } TEST_F(homa_grant, homa_grant_find_oldest__fifo_grant_unused) { - struct homa_rpc *srpc1, *srpc2; + struct homa_rpc *rpc1, *rpc2, *rpc3; - mock_clock_tick = 10; - srpc1 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 11, 400000, 100); - srpc2 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip+1, - self->server_ip, self->client_port, 33, 300000, 100); - unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, - self->server_ip, self->client_port, 55, 200000, 100); - ASSERT_NE(NULL, srpc1); - ASSERT_NE(NULL, srpc2); - srpc1->msgin.granted += + 2*self->homa.fifo_grant_increment; + rpc1 = test_rpc(self, 100, self->server_ip, 400000); + rpc1->msgin.birth = 100; + self->homa.grant->fifo_grant_increment = 10000; + rpc1->msgin.granted += 20000 + self->homa.grant->window; + rpc2 = test_rpc(self, 102, self->server_ip, 20000); + rpc2->msgin.birth = 200; + rpc3 = test_rpc(self, 104, self->server_ip + 1, 30000); + rpc3->msgin.birth = 300; + homa_grant_insert_grantable(rpc1); + homa_grant_insert_grantable(rpc2); + homa_grant_insert_grantable(rpc3); - unit_log_clear(); - homa_grant_find_oldest(&self->homa); - EXPECT_NE(NULL, self->homa.oldest_rpc); - EXPECT_EQ(33, self->homa.oldest_rpc->id); + homa_grant_find_oldest(self->homa.grant); + ASSERT_NE(NULL, self->homa.grant->oldest_rpc); + EXPECT_EQ(102, self->homa.grant->oldest_rpc->id); +} +TEST_F(homa_grant, homa_grant_find_oldest__check_active_rpcs) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3; + + rpc1 = test_rpc_init(self, 100, self->server_ip, 40000); + rpc1->msgin.birth = 100; + rpc2 = test_rpc_init(self, 102, self->server_ip, 20000); + rpc2->msgin.birth = 200; + rpc3 = test_rpc(self, 104, self->server_ip + 1, 30000); + rpc3->msgin.birth = 300; + homa_grant_insert_grantable(rpc3); + EXPECT_EQ(2, self->homa.grant->num_active_rpcs); + + homa_grant_find_oldest(self->homa.grant); + ASSERT_NE(NULL, self->homa.grant->oldest_rpc); + EXPECT_EQ(100, self->homa.grant->oldest_rpc->id); +} +TEST_F(homa_grant, homa_grant_find_oldest__active_rpc_has_unused_fifo_grant) +{ + struct homa_rpc *rpc1, *rpc2, *rpc3; + + rpc1 = test_rpc_init(self, 100, self->server_ip, 400000); + 
rpc1->msgin.birth = 100; + self->homa.grant->fifo_grant_increment = 10000; + rpc1->msgin.granted += 20000 + self->homa.grant->window; + rpc2 = test_rpc_init(self, 102, self->server_ip, 20000); + rpc2->msgin.birth = 200; + rpc3 = test_rpc(self, 104, self->server_ip + 1, 30000); + rpc3->msgin.birth = 300; + homa_grant_insert_grantable(rpc3); + EXPECT_EQ(2, self->homa.grant->num_active_rpcs); + + homa_grant_find_oldest(self->homa.grant); + ASSERT_NE(NULL, self->homa.grant->oldest_rpc); + EXPECT_EQ(102, self->homa.grant->oldest_rpc->id); } TEST_F(homa_grant, homa_grant_find_oldest__no_good_candidates) { - homa_grant_find_oldest(&self->homa); - EXPECT_EQ(NULL, self->homa.oldest_rpc); + self->homa.grant->oldest_rpc = + test_rpc(self, 100, self->server_ip, 40000); + homa_grant_find_oldest(self->homa.grant); + EXPECT_EQ(NULL, self->homa.grant->oldest_rpc); } -#endif TEST_F(homa_grant, homa_grant_cand_add__basics) { From 72ee3417de05b0234cf74916629fa5f1e19ea381 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 8 Jul 2025 09:22:31 -0700 Subject: [PATCH 384/625] Introduce homa_high_priority function --- homa_impl.h | 14 ++++++++++++++ homa_pool.c | 4 +--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 38190201..b1841dbd 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -905,6 +905,20 @@ static inline u64 homa_usecs_to_cycles(u64 usecs) #endif /* __UNIT_TEST__ */ } +/** + * homa_high_priority() - Return the next-to-highest available priority + * level. Used in situations where we want to boost the priority of + * something but don't want to interfere with the highest priority packets + * such as control packets. + * @homa: Overall information about the Homa protocol. + * Return: See above. + * + */ +static inline int homa_high_priority(struct homa *homa) +{ + return (homa->num_priorities <= 2) ? 0 : homa->num_priorities - 2; +} + /* Homa Locking Strategy: * * (Note: this documentation is referenced in several other places in the diff --git a/homa_pool.c b/homa_pool.c index 32c4191a..8651c98e 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -537,9 +537,7 @@ void homa_pool_check_waiting(struct homa_pool *pool) */ resend.offset = htonl(0); resend.length = htonl(-1); - resend.priority = (rpc->hsk->homa->num_priorities < 2) ? 
- 0 : - rpc->hsk->homa->num_priorities - 2; + resend.priority = homa_high_priority(rpc->hsk->homa); homa_xmit_control(RESEND, &resend, sizeof(resend), rpc); if (rpc->msgin.granted < rpc->msgin.length) homa_grant_manage_rpc(rpc); From e24d820dfd86a0bbffbc5583b463cc1e904126a1 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 8 Jul 2025 15:51:06 -0700 Subject: [PATCH 385/625] Modify "starve" distribution to make starvation even worse --- util/dist.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/dist.cc b/util/dist.cc index 68c26ada..e6dfca3c 100644 --- a/util/dist.cc +++ b/util/dist.cc @@ -2412,5 +2412,5 @@ dist_point_gen::weight dist_point_gen::starve[] = { {700000, 100}, {800000, 100}, {900000, 100}, - {1000000, 100}, + {1000000, 50}, }; From 4e44872fa09a94556baf03defccf6e71e5a2b235 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 8 Jul 2025 16:35:12 -0700 Subject: [PATCH 386/625] Fix bit-rot in cp_ scripts * Resuscitate cp_config: it was broken * Minor fix in cp_vs_tcp --- util/cp_config | 22 +++------------------- util/cp_vs_tcp | 3 ++- util/cperf.py | 7 ++++--- 3 files changed, 9 insertions(+), 23 deletions(-) diff --git a/util/cp_config b/util/cp_config index 0ee4859e..2aabc125 100755 --- a/util/cp_config +++ b/util/cp_config @@ -283,19 +283,6 @@ if not options.plot_only: 'label': '%.1f MB' % (mb), 'switch_buffer': mb}) - # Run unloaded experiment - o = copy.deepcopy(options) - o.protocol = 'homa' - o.workload = workload - o.seconds = seconds - o.client_ports = 1 - o.client_max = 1 - o.server_ports = 1 - o.servers = options.nodes[1:2] - o.unloaded = 500 - start_servers(o.servers, o) - run_experiment("unloaded_" + workload, o.clients[0:1], o) - for spec in specs: o = copy.deepcopy(options) o.workload = workload @@ -307,7 +294,8 @@ if not options.plot_only: name = spec['sysctl'][i] value = spec['sysctl'][i+1] if name not in old_values: - old_values[name] = get_sysctl_parameter(name) + old_values[name] = get_sysctl_parameter(name, + options.nodes[0]) log("Setting %s = %s" % (name, value)) set_sysctl_parameter(name, value, options.nodes) if 'options' in spec: @@ -325,7 +313,7 @@ if not options.plot_only: do_ssh(["config", "mtu", str(spec['mtu'])], options.nodes) if 'lb' in spec: do_ssh(["config", "lb", spec['lb']], options.nodes) - start_servers(o.servers, o) + start_servers(exp_name, o.servers, o) run_experiment(exp_name, o.clients, o) except Exception as e: log(traceback.format_exc()) @@ -344,8 +332,6 @@ if switch: # Generate plots and reports for workload, bw, seconds in load_info: - set_unloaded("unloaded_" + workload) - # Generate slowdown plot. 
log("Generating slowdown plot for %s" % (workload)) title = "%s %d nodes, %.1f Gbps" % (workload.capitalize(), @@ -375,8 +361,6 @@ for workload, bw, seconds in load_info: exp_name = "%s_%s" % (spec['exp_name'], workload) x, y = get_short_cdf(exp_name) plt.plot(x, y, label=spec['label']) - x, y = get_short_cdf("unloaded_" + workload) - plt.plot(x, y, label="Homa best case") plt.legend(loc="upper right", prop={'size': 9}) plt.savefig("%s/reports/%s_%s_cdfs.pdf" % diff --git a/util/cp_vs_tcp b/util/cp_vs_tcp index 18effb9a..650fe516 100755 --- a/util/cp_vs_tcp +++ b/util/cp_vs_tcp @@ -51,7 +51,8 @@ if options.workload != "": # First, run all of the experiments if not options.plot_only: - congestion = get_sysctl_parameter("net.ipv4.tcp_congestion_control") + congestion = get_sysctl_parameter("net.ipv4.tcp_congestion_control", + options.nodes[0]) for workload, bw, seconds in load_info: options.workload = workload options.gbps = bw * bw_multiplier diff --git a/util/cperf.py b/util/cperf.py index 0bb90a3c..910fa4e2 100644 --- a/util/cperf.py +++ b/util/cperf.py @@ -565,14 +565,15 @@ def do_ssh(command, nodes): for id in nodes: do_subprocess(["ssh", "node%d" % id] + command) -def get_sysctl_parameter(name): +def get_sysctl_parameter(name, node): """ Retrieve the value of a particular system parameter using sysctl on - the current host, and return the value as a string. + the given node, and return the value as a string. name: name of the desired configuration parameter + node: node number on which the value should be retrieved """ - output = do_subprocess(["sysctl", name]) + output = do_subprocess(["ssh", "node%d" % node, "sysctl", name]) match = re.match('.*= (.*)', output) if not match: raise Exception("Couldn't parse sysctl output: %s" % output) From 22f6b68679a30e00eb5ffe655b46d5e68792a2fa Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 8 Jul 2025 16:36:56 -0700 Subject: [PATCH 387/625] Reimplement FIFO grants * Remove metrics fifo_grants and fifo_grants_no_incoming * Add new metric fifo_grant_bytes --- homa_grant.c | 242 +++++++++++++++++------------ homa_grant.h | 24 +-- homa_impl.h | 10 +- homa_metrics.c | 6 +- homa_metrics.h | 13 +- homa_pacer.c | 8 +- homa_pacer.h | 10 ++ homa_sock.h | 5 +- homa_utils.c | 14 +- test/unit_homa_grant.c | 336 ++++++++++++++++++++++++++++++++++++++--- test/unit_homa_utils.c | 16 +- 11 files changed, 520 insertions(+), 164 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 7a652d62..aa3d66a5 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -6,6 +6,7 @@ #include "homa_impl.h" #include "homa_grant.h" +#include "homa_pacer.h" #include "homa_peer.h" #include "homa_rpc.h" #include "homa_wire.h" @@ -79,9 +80,10 @@ static struct ctl_table grant_ctl_table[] = { /** * homa_grant_alloc() - Allocate and initialize a new grant object, which * will hold grant management information for @homa. + * @homa: The struct homa that the new object is associated with. * Return: A pointer to the new struct grant, or a negative errno. 
*/ -struct homa_grant *homa_grant_alloc(void) +struct homa_grant *homa_grant_alloc(struct homa *homa) { struct homa_grant *grant; int err; @@ -89,6 +91,7 @@ struct homa_grant *homa_grant_alloc(void) grant = kzalloc(sizeof(*grant), GFP_KERNEL); if (!grant) return ERR_PTR(-ENOMEM); + grant->homa = homa; atomic_set(&grant->stalled_rank, INT_MAX); grant->max_incoming = 400000; spin_lock_init(&grant->lock); @@ -97,7 +100,7 @@ struct homa_grant *homa_grant_alloc(void) grant->max_rpcs_per_peer = 1; grant->max_overcommit = 8; grant->recalc_usecs = 20; - grant->fifo_grant_increment = 10000; + grant->fifo_grant_increment = 50000; grant->fifo_fraction = 50; #ifndef __STRIP__ /* See strip.py */ @@ -563,6 +566,10 @@ void homa_grant_unmanage_rpc(struct homa_rpc *rpc, if (!list_empty(&rpc->grantable_links)) homa_grant_remove_grantable(rpc); grant->window = homa_grant_window(grant); + if (rpc == grant->oldest_rpc) { + homa_rpc_put(rpc); + grant->oldest_rpc = NULL; + } homa_grant_unlock(grant); } @@ -713,7 +720,7 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) * priority) in order to prevent starvation. * * Each of these situations requires the grant lock. - **/ + */ if (rpc->msgin.length < 0 || rpc->msgin.num_bpages <= 0 || rpc->state == RPC_DEAD) @@ -765,6 +772,7 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) homa_grant_send(rpc, priority); if (!homa_grant_cand_empty(&cand)) homa_grant_cand_check(&cand, grant); + homa_grant_check_fifo(grant); homa_rpc_lock(rpc); homa_rpc_put(rpc); } @@ -840,7 +848,8 @@ int homa_grant_fix_order(struct homa_grant *grant) /** * homa_grant_find_oldest() - Recompute the value of homa->grant->oldest_rpc. - * @homa: Overall data about the Homa protocol implementation. + * @grant: Overall grant management information. @grant->oldest_rpc + * must be NULL. */ void homa_grant_find_oldest(struct homa_grant *grant) __must_hold(grant->lock) @@ -858,15 +867,9 @@ void homa_grant_find_oldest(struct homa_grant *grant) list_for_each_entry(peer, &grant->grantable_peers, grantable_links) { list_for_each_entry(rpc, &peer->grantable_rpcs, grantable_links) { - int received, incoming; - if (rpc->msgin.birth >= oldest_birth) continue; - - received = (rpc->msgin.length - - rpc->msgin.bytes_remaining); - incoming = rpc->msgin.granted - received; - if (incoming >= max_incoming) { + if (rpc->msgin.rec_incoming >= max_incoming) { /* This RPC has been granted way more bytes * than the grant window. This can only * happen for FIFO grants, and it means the @@ -880,112 +883,151 @@ void homa_grant_find_oldest(struct homa_grant *grant) } } - /* Check the active RPCs. */ - for (i = 0; i < grant->num_active_rpcs; i++) { - int received, incoming; - + /* Check the active RPCs (skip the highest priority one, since + * it is already getting lots of grants). + */ + for (i = 1; i < grant->num_active_rpcs; i++) { rpc = grant->active_rpcs[i]; if (rpc->msgin.birth >= oldest_birth) continue; - - received = (rpc->msgin.length - - rpc->msgin.bytes_remaining); - incoming = rpc->msgin.granted - received; - if (incoming >= max_incoming) { + if (rpc->msgin.rec_incoming >= max_incoming) { continue; } oldest = rpc; oldest_birth = rpc->msgin.birth; } - /* Check active RPCs. */ + if (oldest) + homa_rpc_hold(oldest); grant->oldest_rpc = oldest; } -#ifndef __STRIP__ /* See strip.py */ -#if 0 /** - * homa_choose_fifo_grant() - This function is invoked occasionally to give - * a high-priority grant to the oldest incoming message. We do this in - * order to reduce the starvation that SRPT can cause for long messages. 
- * Note: this method is obsolete and should never be invoked; it's code is - * being retained until fifo grants are reimplemented using the new grant - * mechanism. - * @homa: Overall data about the Homa protocol implementation. The - * grant lock must be held by the caller. - * Return: An RPC to which to send a FIFO grant, or NULL if there is - * no appropriate RPC. This method doesn't actually send a grant, - * but it updates @msgin.granted to reflect the desired grant. - * Also updates homa->total_incoming. + * homa_grant_promote_rpc() - This function is invoked when the grant priority + * of an RPC has increased (e.g., because it received a FIFO grant); it adjusts + * the position of the RPC within the grantable lists and may promote it into + * grant->active_rpcs. This function does not promote within grant->active_rpcs: + * that is handled by homa_grant_fix_order. + * @rpc: The RPC to consider for promotion. Must currently be managed for + * grants. */ -struct homa_rpc *homa_choose_fifo_grant(struct homa *homa) +void homa_grant_promote_rpc(struct homa_grant *grant, struct homa_rpc *rpc) + __must_hold(rpc->bucket->lock) { - struct homa_rpc *rpc, *oldest; - u64 oldest_birth; - int granted; + struct homa_peer *peer = rpc->peer; + struct homa_rpc *other, *bumped; - oldest = NULL; - oldest_birth = ~0; + homa_grant_lock(grant); + if (rpc->msgin.rank >= 0) + goto done; + + /* Promote into active_rpcs if appropriate. */ + if (grant->num_active_rpcs < grant->max_overcommit || + homa_grant_outranks(rpc, grant->active_rpcs[grant->num_active_rpcs - + 1])) { + homa_grant_remove_grantable(rpc); + bumped = homa_grant_insert_active(rpc); + if (bumped) + homa_grant_insert_grantable(bumped); + goto done; + } + + /* Promote within the grantable lists. */ + while (rpc != list_first_entry(&peer->grantable_rpcs, + struct homa_rpc, grantable_links)) { + other = list_prev_entry(rpc, grantable_links); + if (!homa_grant_outranks(rpc, other)) + goto done; + list_del(&rpc->grantable_links); + list_add_tail(&rpc->grantable_links, &other->grantable_links); + } - /* Find the oldest message that doesn't currently have an - * outstanding "pity grant". + /* The RPC is now at the head of its peer list, so the peer may need + * to be promoted also. */ - list_for_each_entry(rpc, &homa->grantable_rpcs, grantable_links) { - int received, on_the_way; + homa_grant_adjust_peer(grant, peer); - if (rpc->msgin.birth >= oldest_birth) - continue; +done: + homa_grant_unlock(grant); +} - received = (rpc->msgin.length - - rpc->msgin.bytes_remaining); - on_the_way = rpc->msgin.granted - received; - if (on_the_way > homa->unsched_bytes) { - /* The last "pity" grant hasn't been used - * up yet. - */ - continue; - } - oldest = rpc; - oldest_birth = rpc->msgin.birth; - } - if (!oldest) - return NULL; - INC_METRIC(fifo_grants, 1); - if ((oldest->msgin.length - oldest->msgin.bytes_remaining) - == oldest->msgin.granted) - INC_METRIC(fifo_grants_no_incoming, 1); - - oldest->silent_ticks = 0; - granted = homa->fifo_grant_increment; - oldest->msgin.granted += granted; - if (oldest->msgin.granted >= oldest->msgin.length) { - granted -= oldest->msgin.granted - oldest->msgin.length; - oldest->msgin.granted = oldest->msgin.length; - // homa_remove_grantable_locked(homa, oldest); - } +/** + * homa_grant_check_fifo() - Check to see if it is time to make the next + * FIFO grant; if so, make the grant. FIFO grants keep long messages from + * being starved by Homa's SRPT grant mechanism. + * @grant: Overall grant management information. 
+*/ +void homa_grant_check_fifo(struct homa_grant *grant) +{ + struct homa_grant_candidates cand; + struct homa_rpc *rpc; + u64 now; - /* Try to update homa->total_incoming; if we can't lock - * the RPC, just skip it (waiting could deadlock), and it - * will eventually get updated elsewhere. + /* Note: placing this check before locking saves lock overhead + * in the normal case where it's not yet time for the next FIFO + * grant. This results in a race (2 cores could simultaneously + * decide to make FIFO grants) but that is relatively harmless + * (an occasional extra FIFO grant). */ - if (homa_rpc_try_lock(oldest)) { - homa_grant_update_incoming(oldest, homa); - homa_rpc_unlock(oldest); + now = homa_clock(); + if (now < grant->fifo_grant_time) + return; + homa_grant_lock(grant); + grant->fifo_grant_time = now + grant->fifo_grant_interval; + if (grant->fifo_fraction == 0 || grant->fifo_grant_increment == 0) { + homa_grant_unlock(grant); + return; } - if (oldest->msgin.granted < (oldest->msgin.length - - oldest->msgin.bytes_remaining)) { - /* We've already received all of the bytes in the new - * grant; most likely this means that the sender sent extra - * data after the last fifo grant (e.g. by rounding up to a - * TSO packet). Don't send this grant. + /* See if there is an RPC to grant. */ + rpc = grant->oldest_rpc; + if (rpc) { + /* If the oldest RPC hasn't been responding to FIFO grants + * then switch to a different RPC. */ - return NULL; + int max_incoming = grant->window + 2 * + grant->fifo_grant_increment; + if (rpc->msgin.rec_incoming >= max_incoming) { + grant->oldest_rpc = NULL; + homa_rpc_put(rpc); + rpc = NULL; + } + } + if (!rpc) { + homa_grant_find_oldest(grant); + rpc = grant->oldest_rpc; + if (!rpc) { + homa_grant_unlock(grant); + return; + } } - return oldest; + + /* Trickiness here: must release the grant lock before acquiring + * the RPC lock. Must acquire a reference on the RPC to keep it + * from being deleted in the gap where no lock is held. 
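+	 * (Taking an RPC lock while holding the grant lock would invert the
+	 * order used in homa_grant_promote_rpc above, where the grant lock
+	 * is acquired while the RPC lock is held.)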
+ */ + homa_rpc_hold(rpc); + homa_grant_unlock(grant); + homa_rpc_lock(rpc); + homa_grant_cand_init(&cand); + rpc->msgin.granted += grant->fifo_grant_increment; + if (rpc->msgin.granted >= rpc->msgin.length) { + INC_METRIC(fifo_grant_bytes, grant->fifo_grant_increment + + rpc->msgin.length - + rpc->msgin.granted); + rpc->msgin.granted = rpc->msgin.length; + homa_grant_unmanage_rpc(rpc, &cand); + } else { + INC_METRIC(fifo_grant_bytes, grant->fifo_grant_increment); + homa_grant_promote_rpc(grant, rpc); + } + homa_grant_update_incoming(rpc, grant); + homa_rpc_unlock(rpc); + homa_grant_send(rpc, homa_high_priority(grant->homa)); + homa_rpc_put(rpc); + if (!homa_grant_cand_empty(&cand)) + homa_grant_cand_check(&cand, grant); } -#endif -#endif /* See strip.py */ /** * homa_grant_cand_add() - Add an RPC into the struct, if there is @@ -1071,18 +1113,26 @@ void homa_grant_lock_slow(struct homa_grant *grant) */ void homa_grant_update_sysctl_deps(struct homa_grant *grant) { - u64 tmp; + u64 fifo_mbps, clocks_per_fifo_mbit, interval; if (grant->max_overcommit > HOMA_MAX_GRANTS) grant->max_overcommit = HOMA_MAX_GRANTS; if (grant->fifo_fraction > 500) grant->fifo_fraction = 500; - tmp = grant->fifo_fraction; - if (tmp != 0) - tmp = (1000 * grant->fifo_grant_increment) / tmp - - grant->fifo_grant_increment; - grant->grant_nonfifo = tmp; + fifo_mbps = (u64)homa_pacer_get_link_mbps(grant->homa->pacer) * + grant->fifo_fraction; + do_div(fifo_mbps, 1000); + if (fifo_mbps > 0 && grant->fifo_grant_increment > 0) { + clocks_per_fifo_mbit = 1000 * homa_clock_khz(); + do_div(clocks_per_fifo_mbit, fifo_mbps); + interval = clocks_per_fifo_mbit * grant->fifo_grant_increment * + 8; + do_div(interval, 1000000); + grant->fifo_grant_interval = interval; + } else { + grant->fifo_grant_interval = 1000 * homa_clock_khz(); + } grant->recalc_cycles = homa_usecs_to_cycles(grant->recalc_usecs); diff --git a/homa_grant.h b/homa_grant.h index 9a2186c5..d64a4753 100644 --- a/homa_grant.h +++ b/homa_grant.h @@ -20,6 +20,9 @@ * stored in each struct homa. */ struct homa_grant { + /** @homa: The struct homa that this object belongs to. */ + struct homa *homa; + /** * @total_incoming: the total number of bytes that we expect to receive * (across all messages) even if we don't send out any more grants @@ -162,23 +165,22 @@ struct homa_grant { int fifo_fraction; /** - * @grant_nonfifo: How many bytes should be granted using the - * normal priority system between grants to the oldest message. + * @fifo_grant_interval: The time (in homa_clock units) between + * successive FIFO grants. */ - int grant_nonfifo; + u64 fifo_grant_interval; /** - * @grant_nonfifo_left: Counts down bytes granted using the normal - * priority mechanism. When this reaches zero, it's time to grant - * to the oldest message. + * @fifo_grant_time: The time when we should issue the next FIFO + * grant. */ - int grant_nonfifo_left; + u64 fifo_grant_time; /** * @oldest_rpc: The RPC with incoming data whose start_cycles is * farthest in the past). NULL means either there are no incoming - * RPCs or the oldest needs to be recomputed. Must hold grant_lock - * to update. + * RPCs or the oldest needs to be recomputed. There is always a + * reference taken for this RPC. Must hold grant_lock to update. 
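+	 * The reference is dropped when the RPC is unmanaged or when a
+	 * different oldest RPC is chosen.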
*/ struct homa_rpc *oldest_rpc; @@ -225,13 +227,14 @@ struct homa_grant_candidates { }; struct homa_grant - *homa_grant_alloc(void); + *homa_grant_alloc(struct homa *homa); void homa_grant_adjust_peer(struct homa_grant *grant, struct homa_peer *peer); void homa_grant_cand_add(struct homa_grant_candidates *cand, struct homa_rpc *rpc); void homa_grant_cand_check(struct homa_grant_candidates *cand, struct homa_grant *grant); +void homa_grant_check_fifo(struct homa_grant *grant); void homa_grant_check_rpc(struct homa_rpc *rpc); int homa_grant_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); @@ -250,6 +253,7 @@ int homa_grant_outranks(struct homa_rpc *rpc1, struct homa_rpc *rpc2); void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc); int homa_grant_priority(struct homa *homa, int rank); +void homa_grant_promote_rpc(struct homa_grant *grant, struct homa_rpc *rpc); void homa_grant_remove_active(struct homa_rpc *rpc, struct homa_grant_candidates *cand); void homa_grant_remove_grantable(struct homa_rpc *rpc); diff --git a/homa_impl.h b/homa_impl.h index b1841dbd..d30d3731 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -108,6 +108,11 @@ struct homa { */ atomic64_t next_outgoing_id; + /** + * @pacer: Information related to the pacer; managed by homa_pacer.c. + */ + struct homa_pacer *pacer; + #ifndef __STRIP__ /* See strip.py */ /** * @grant: Contains information used by homa_grant.c to manage @@ -116,11 +121,6 @@ struct homa { struct homa_grant *grant; #endif /* See strip.py */ - /** - * @pacer: Information related to the pacer; managed by homa_pacer.c. - */ - struct homa_pacer *pacer; - /** * @peertab: Info about all the other hosts we have communicated with; * includes peers from all network namespaces. diff --git a/homa_metrics.c b/homa_metrics.c index 0af43c89..a82b5257 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -359,10 +359,8 @@ char *homa_metrics_print(void) m->grant_check_recalcs); M("grant_priority_bumps %15llu Number of times an RPC moved up in the grant priority order\n", m->grant_priority_bumps); - M("fifo_grants %15llu Grants issued using FIFO priority\n", - m->fifo_grants); - M("fifo_grants_no_incoming %15llu FIFO grants to messages with no outstanding grants\n", - m->fifo_grants_no_incoming); + M("fifo_grant_bytes %15llu Bytes of grants issued using the FIFO mechanism\n", + m->fifo_grant_bytes); M("disabled_reaps %15llu Reaper invocations that were disabled\n", m->disabled_reaps); M("deferred_rpc_reaps %15llu RPCs skipped by reaper because still in use\n", diff --git a/homa_metrics.h b/homa_metrics.h index fc5f355f..cca1eb2e 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -622,17 +622,10 @@ struct homa_metrics { u64 grant_priority_bumps; /** - * @fifo_grants: total number of times that grants were sent to - * the oldest message. + * @fifo_grant_bytes: total number of bytes of grants issued via + * the FIFO granting mechanism */ - u64 fifo_grants; - - /** - * @fifo_grants_no_incoming: total number of times that, when a - * FIFO grant was issued, the message had no outstanding grants - * (everything granted had been received). - */ - u64 fifo_grants_no_incoming; + u64 fifo_grant_bytes; /** * @disabled_reaps: total number of times that the reaper couldn't diff --git a/homa_pacer.c b/homa_pacer.c index 8082df11..fed0a559 100644 --- a/homa_pacer.c +++ b/homa_pacer.c @@ -5,6 +5,8 @@ * the buildup of large queues in the NIC. 
*/ +#include "homa_impl.h" +#include "homa_grant.h" #include "homa_pacer.h" #include "homa_rpc.h" @@ -443,8 +445,12 @@ int homa_pacer_dointvec(const struct ctl_table *table, int write, table_copy.data = ((char *)pacer) + (uintptr_t)table_copy.data; result = proc_dointvec(&table_copy, write, buffer, lenp, ppos); - if (write) + if (write) { homa_pacer_update_sysctl_deps(pacer); + + /* Grant info depends on link speed. */ + homa_grant_update_sysctl_deps(pacer->homa->grant); + } return result; } diff --git a/homa_pacer.h b/homa_pacer.h index 1f4476d4..7e2a9c83 100644 --- a/homa_pacer.h +++ b/homa_pacer.h @@ -222,4 +222,14 @@ static inline void homa_pacer_throttle_unlock(struct homa_pacer *pacer) spin_unlock_bh(&pacer->throttle_lock); } +/** + * homa_pacer_get_link_mbps() - Return the link speed for this transport. + * @pacer: Pacer information for a Homa transport. + * Return: The link speed, in units of 1e6 bits per second. + */ +static inline int homa_pacer_get_link_mbps(struct homa_pacer *pacer) +{ + return pacer->link_mbps; +} + #endif /* _HOMA_PACER_H */ diff --git a/homa_sock.h b/homa_sock.h index acbd8d2e..3c8830e7 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -9,10 +9,6 @@ struct homa; struct homa_pool; -#ifndef __STRIP__ /* See strip.py */ -void homa_sock_lock_slow(struct homa_sock *hsk); -#endif /* See strip.py */ - /* Number of hash buckets in a homa_socktab. Must be a power of 2. */ #define HOMA_SOCKTAB_BUCKET_BITS 10 #define HOMA_SOCKTAB_BUCKETS BIT(HOMA_SOCKTAB_BUCKET_BITS) @@ -270,6 +266,7 @@ struct homa_v6_sock { #ifndef __STRIP__ /* See strip.py */ void homa_bucket_lock_slow(struct homa_rpc_bucket *bucket, u64 id); +void homa_sock_lock_slow(struct homa_sock *hsk); #endif /* See strip.py */ int homa_sock_bind(struct homa_net *hnet, struct homa_sock *hsk, u16 port); diff --git a/homa_utils.c b/homa_utils.c index cf22c0d1..bb688b1a 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -33,20 +33,20 @@ int homa_init(struct homa *homa) memset(homa, 0, sizeof(*homa)); atomic64_set(&homa->next_outgoing_id, 2); + homa->pacer = homa_pacer_alloc(homa); + if (IS_ERR(homa->pacer)) { + err = PTR_ERR(homa->pacer); + homa->pacer = NULL; + return err; + } #ifndef __STRIP__ /* See strip.py */ - homa->grant = homa_grant_alloc(); + homa->grant = homa_grant_alloc(homa); if (IS_ERR(homa->grant)) { err = PTR_ERR(homa->grant); homa->grant = NULL; return err; } #endif /* See strip.py */ - homa->pacer = homa_pacer_alloc(homa); - if (IS_ERR(homa->pacer)) { - err = PTR_ERR(homa->pacer); - homa->pacer = NULL; - return err; - } homa->peertab = homa_peer_alloc_peertab(); if (IS_ERR(homa->peertab)) { err = PTR_ERR(homa->peertab); diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 9c7483cc..93ac7f67 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -146,7 +146,7 @@ TEST_F(homa_grant, homa_grant_alloc__success) { struct homa_grant *grant; - grant = homa_grant_alloc(); + grant = homa_grant_alloc(&self->homa); EXPECT_EQ(50, grant->fifo_fraction); homa_grant_free(grant); } @@ -155,7 +155,7 @@ TEST_F(homa_grant, homa_grant_alloc__cant_allocate_memory) struct homa_grant *grant; mock_kmalloc_errors = 1; - grant = homa_grant_alloc(); + grant = homa_grant_alloc(&self->homa); EXPECT_TRUE(IS_ERR(grant)); EXPECT_EQ(ENOMEM, -PTR_ERR(grant)); } @@ -164,7 +164,7 @@ TEST_F(homa_grant, homa_grant_alloc__cant_register_sysctls) struct homa_grant *grant; mock_register_sysctl_errors = 1; - grant = homa_grant_alloc(); + grant = homa_grant_alloc(&self->homa); EXPECT_TRUE(IS_ERR(grant)); 
EXPECT_EQ(ENOMEM, -PTR_ERR(grant)); } @@ -173,7 +173,7 @@ TEST_F(homa_grant, homa_grant_free__basics) { struct homa_grant *grant; - grant = homa_grant_alloc(); + grant = homa_grant_alloc(&self->homa); homa_grant_free(grant); EXPECT_STREQ("unregister_net_sysctl_table", unit_log_get()); } @@ -181,7 +181,7 @@ TEST_F(homa_grant, homa_grant_free__sysctls_not_registered) { struct homa_grant *grant; - grant = homa_grant_alloc(); + grant = homa_grant_alloc(&self->homa); grant->sysctl_header = NULL; homa_grant_free(grant); EXPECT_STREQ("", unit_log_get()); @@ -832,7 +832,7 @@ TEST_F(homa_grant, homa_grant_remove_active__skip_overactive_peer) EXPECT_FALSE(homa_grant_cand_empty(&self->cand)); } -TEST_F(homa_grant, homa_grant_unmanage_rpc) +TEST_F(homa_grant, homa_grant_unmanage_rpc__basics) { struct homa_rpc *rpc; @@ -871,6 +871,20 @@ TEST_F(homa_grant, homa_grant_unmanage_rpc) EXPECT_EQ(0, self->homa.grant->num_grantable_rpcs); EXPECT_EQ(60000, self->homa.grant->window); } +TEST_F(homa_grant, homa_grant_unmanage_rpc__remove_from_oldest_rpc) +{ + struct homa_rpc *rpc; + + rpc = test_rpc(self, 200, self->server_ip, 30000); + homa_grant_manage_rpc(rpc); + self->homa.grant->oldest_rpc = rpc; + homa_rpc_hold(rpc); + EXPECT_EQ(1, rpc->refs.counter); + + homa_grant_unmanage_rpc(rpc, &self->cand); + EXPECT_EQ(NULL, self->homa.grant->oldest_rpc); + EXPECT_EQ(0, rpc->refs.counter); +} TEST_F(homa_grant, homa_grant_update_incoming) { @@ -1193,6 +1207,26 @@ TEST_F(homa_grant, homa_grant_check_rpc__fast_path_promote_other_message) EXPECT_STREQ("active[0]: id 102 ungranted 15000", unit_log_get()); EXPECT_EQ(0, homa_metrics_per_cpu()->grant_check_locked); } +TEST_F(homa_grant, homa_grant_check_rpc__fast_path_issue_fifo_grant) +{ + struct homa_rpc *rpc1, *rpc2; + + rpc1 = test_rpc_init(self, 100, self->server_ip, 50000); + rpc2 = test_rpc_init(self, 102, self->server_ip, 100000); + + self->homa.grant->fifo_grant_time = 0; + self->homa.grant->fifo_grant_interval = 10000; + self->homa.grant->fifo_grant_increment = 20000; + self->homa.grant->fifo_fraction = 50; + + unit_log_clear(); + homa_rpc_lock(rpc1); + homa_grant_check_rpc(rpc1); + EXPECT_STREQ("xmit GRANT 10000@1; xmit GRANT 20000@0", unit_log_get()); + EXPECT_EQ(10000, rpc1->msgin.granted); + EXPECT_EQ(20000, rpc2->msgin.granted); + homa_rpc_unlock(rpc1); +} TEST_F(homa_grant, homa_grant_check_rpc__dont_check_needy_if_incoming_maxed) { struct homa_rpc *rpc; @@ -1342,7 +1376,7 @@ TEST_F(homa_grant, homa_grant_find_oldest__fifo_grant_unused) rpc1 = test_rpc(self, 100, self->server_ip, 400000); rpc1->msgin.birth = 100; self->homa.grant->fifo_grant_increment = 10000; - rpc1->msgin.granted += 20000 + self->homa.grant->window; + rpc1->msgin.rec_incoming = 20000 + self->homa.grant->window; rpc2 = test_rpc(self, 102, self->server_ip, 20000); rpc2->msgin.birth = 200; rpc3 = test_rpc(self, 104, self->server_ip + 1, 30000); @@ -1379,9 +1413,12 @@ TEST_F(homa_grant, homa_grant_find_oldest__active_rpc_has_unused_fifo_grant) rpc1 = test_rpc_init(self, 100, self->server_ip, 400000); rpc1->msgin.birth = 100; self->homa.grant->fifo_grant_increment = 10000; - rpc1->msgin.granted += 20000 + self->homa.grant->window; + rpc1->msgin.rec_incoming = 20000 + self->homa.grant->window; + + /* This RPC will be skipped because it has rank 0. 
*/ rpc2 = test_rpc_init(self, 102, self->server_ip, 20000); rpc2->msgin.birth = 200; + rpc3 = test_rpc(self, 104, self->server_ip + 1, 30000); rpc3->msgin.birth = 300; homa_grant_insert_grantable(rpc3); @@ -1389,7 +1426,7 @@ TEST_F(homa_grant, homa_grant_find_oldest__active_rpc_has_unused_fifo_grant) homa_grant_find_oldest(self->homa.grant); ASSERT_NE(NULL, self->homa.grant->oldest_rpc); - EXPECT_EQ(102, self->homa.grant->oldest_rpc->id); + EXPECT_EQ(104, self->homa.grant->oldest_rpc->id); } TEST_F(homa_grant, homa_grant_find_oldest__no_good_candidates) { @@ -1398,6 +1435,266 @@ TEST_F(homa_grant, homa_grant_find_oldest__no_good_candidates) homa_grant_find_oldest(self->homa.grant); EXPECT_EQ(NULL, self->homa.grant->oldest_rpc); } +TEST_F(homa_grant, homa_grant_find_oldest__take_reference) +{ + struct homa_rpc *rpc; + + rpc = test_rpc(self, 100, self->server_ip, 40000); + homa_grant_insert_grantable(rpc); + ASSERT_EQ(0, rpc->refs.counter); + + homa_grant_find_oldest(self->homa.grant); + ASSERT_EQ(rpc, self->homa.grant->oldest_rpc); + ASSERT_EQ(1, rpc->refs.counter); +} + +TEST_F(homa_grant, homa_grant_promote_rpc__rpc_is_active) +{ + struct homa_rpc *rpc; + + test_rpc_init(self, 100, self->server_ip, 30000); + rpc = test_rpc_init(self, 102, self->server_ip, 40000); + rpc->msgin.granted += 15000; + EXPECT_EQ(1, rpc->msgin.rank); + + homa_grant_promote_rpc(self->homa.grant, rpc); + EXPECT_EQ(1, rpc->msgin.rank); +} +TEST_F(homa_grant, homa_grant_promote_rpc__promote_into_active_space_available) +{ + struct homa_rpc *rpc1, *rpc2; + + rpc1 = test_rpc_init(self, 100, self->server_ip, 30000); + + rpc2 = test_rpc(self, 102, self->server_ip, 40000); + homa_grant_insert_grantable(rpc2); + + homa_grant_promote_rpc(self->homa.grant, rpc2); + EXPECT_EQ(0, rpc1->msgin.rank); + EXPECT_EQ(1, rpc2->msgin.rank); +} +TEST_F(homa_grant, homa_grant_promote_rpc__promote_into_active_bump_existing) +{ + struct homa_rpc *rpc1, *rpc2; + + self->homa.grant->max_overcommit = 1; + rpc1 = test_rpc_init(self, 100, self->server_ip, 30000); + rpc2 = test_rpc_init(self, 102, self->server_ip, 40000); + EXPECT_EQ(0, rpc1->msgin.rank); + EXPECT_EQ(-1, rpc2->msgin.rank); + rpc2->msgin.granted += 15000; + + homa_grant_promote_rpc(self->homa.grant, rpc2); + EXPECT_EQ(-1, rpc1->msgin.rank); + EXPECT_EQ(0, rpc2->msgin.rank); +} +TEST_F(homa_grant, homa_grant_promote_rpc__promote_within_peer_list) +{ + struct homa_rpc *rpc; + + self->homa.grant->max_overcommit = 1; + test_rpc_init(self, 100, self->server_ip, 30000); + test_rpc_init(self, 102, self->server_ip, 40000); + test_rpc_init(self, 104, self->server_ip, 50000); + test_rpc_init(self, 106, self->server_ip, 60000); + rpc = test_rpc_init(self, 108, self->server_ip, 70000); + rpc->msgin.granted += 25000; + + homa_grant_promote_rpc(self->homa.grant, rpc); + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("active[0]: id 100 ungranted 30000; " + "peer 1.2.3.4: id 102 ungranted 40000 " + "id 108 ungranted 45000 " + "id 104 ungranted 50000 " + "id 106 ungranted 60000", unit_log_get()); +} +TEST_F(homa_grant, homa_grant_promote_rpc__promote_to_top_of_peer_list_and_adjust_peer) +{ + struct homa_rpc *rpc; + + self->homa.grant->max_overcommit = 1; + test_rpc_init(self, 100, self->server_ip, 30000); + test_rpc_init(self, 102, self->server_ip + 1, 40000); + test_rpc_init(self, 104, self->server_ip + 2, 50000); + test_rpc_init(self, 106, self->server_ip + 2, 60000); + rpc = test_rpc_init(self, 108, self->server_ip + 2, 70000); + rpc->msgin.granted += 35000; + + 
homa_grant_promote_rpc(self->homa.grant, rpc); + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("active[0]: id 100 ungranted 30000; " + "peer 3.2.3.4: id 108 ungranted 35000 " + "id 104 ungranted 50000 " + "id 106 ungranted 60000; " + "peer 2.2.3.4: id 102 ungranted 40000", unit_log_get()); +} + +TEST_F(homa_grant, homa_grant_check_fifo__basics) +{ + struct homa_rpc *rpc; + + mock_clock = 1000; + self->homa.num_priorities = 5; + self->homa.grant->max_overcommit = 1; + self->homa.grant->fifo_grant_time = 0; + self->homa.grant->fifo_grant_interval = 10000; + self->homa.grant->fifo_grant_increment = 20000; + self->homa.grant->fifo_fraction = 50; + + test_rpc_init(self, 100, self->server_ip, 30000); + rpc = test_rpc_init(self, 102, self->server_ip, 400000); + EXPECT_EQ(-1, rpc->msgin.rank); + EXPECT_EQ(0, rpc->msgin.granted); + + unit_log_clear(); + homa_grant_check_fifo(self->homa.grant); + EXPECT_EQ(20000, rpc->msgin.granted); + EXPECT_STREQ("xmit GRANT 20000@3", unit_log_get()); + EXPECT_EQ(rpc, self->homa.grant->oldest_rpc); + EXPECT_EQ(11000, self->homa.grant->fifo_grant_time); + EXPECT_EQ(20000, rpc->msgin.rec_incoming); + EXPECT_EQ(20000, atomic_read(&self->homa.grant->total_incoming)); + EXPECT_EQ(20000, homa_metrics_per_cpu()->fifo_grant_bytes); +} +TEST_F(homa_grant, homa_grant_check_fifo__not_yet_time_for_a_fifo_grant) +{ + struct homa_rpc *rpc; + + mock_clock = 1000; + self->homa.grant->max_overcommit = 1; + self->homa.grant->fifo_grant_time = 1001; + self->homa.grant->fifo_grant_increment = 20000; + + test_rpc_init(self, 100, self->server_ip, 30000); + rpc = test_rpc_init(self, 102, self->server_ip, 400000); + EXPECT_EQ(0, rpc->msgin.granted); + + unit_log_clear(); + homa_grant_check_fifo(self->homa.grant); + EXPECT_EQ(0, rpc->msgin.granted); + EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(NULL, self->homa.grant->oldest_rpc); + EXPECT_EQ(1001, self->homa.grant->fifo_grant_time); +} +TEST_F(homa_grant, homa_grant_check_fifo__fifo_grants_disabled) +{ + struct homa_rpc *rpc; + + mock_clock = 1000; + self->homa.grant->max_overcommit = 1; + self->homa.grant->fifo_grant_time = 1000; + self->homa.grant->fifo_grant_increment = 0; + self->homa.grant->fifo_grant_interval = 2000; + self->homa.grant->fifo_fraction = 50; + + test_rpc_init(self, 100, self->server_ip, 30000); + rpc = test_rpc_init(self, 102, self->server_ip, 400000); + EXPECT_EQ(0, rpc->msgin.granted); + + unit_log_clear(); + homa_grant_check_fifo(self->homa.grant); + EXPECT_EQ(0, rpc->msgin.granted); + EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(NULL, self->homa.grant->oldest_rpc); + EXPECT_EQ(3000, self->homa.grant->fifo_grant_time); +} +TEST_F(homa_grant, homa_grant_check_fifo__oldest_rpc_not_responsive) +{ + struct homa_rpc *rpc1, *rpc2; + + mock_clock = 1000; + self->homa.grant->max_overcommit = 1; + self->homa.grant->fifo_grant_time = 1000; + self->homa.grant->fifo_grant_increment = 20000; + self->homa.grant->fifo_fraction = 50; + + mock_clock = 1000; + test_rpc_init(self, 100, self->server_ip, 30000); + mock_clock = 2000; + rpc1 = test_rpc_init(self, 102, self->server_ip, 400000); + mock_clock = 3000; + rpc2 = test_rpc_init(self, 104, self->server_ip, 300000); + homa_grant_find_oldest(self->homa.grant); + EXPECT_EQ(102, self->homa.grant->oldest_rpc->id); + rpc1->msgin.rec_incoming = 40000 + self->homa.grant->window; + + unit_log_clear(); + homa_grant_check_fifo(self->homa.grant); + EXPECT_EQ(0, rpc1->msgin.granted); + EXPECT_EQ(20000, rpc2->msgin.granted); + EXPECT_STREQ("xmit GRANT 20000@0", 
unit_log_get()); + EXPECT_EQ(104, self->homa.grant->oldest_rpc->id); +} +TEST_F(homa_grant, homa_grant_check_fifo__no_suitable_rpc) +{ + mock_clock = 1000; + self->homa.grant->max_overcommit = 1; + self->homa.grant->fifo_grant_time = 1000; + self->homa.grant->fifo_grant_increment = 20000; + self->homa.grant->fifo_fraction = 50; + + test_rpc_init(self, 100, self->server_ip, 30000); + + unit_log_clear(); + homa_grant_check_fifo(self->homa.grant); + EXPECT_EQ(NULL, self->homa.grant->oldest_rpc); + EXPECT_STREQ("", unit_log_get()); +} +TEST_F(homa_grant, homa_grant_check_fifo__rpc_becomes_fully_granted_so_promote_another) +{ + struct homa_rpc *rpc; + + self->homa.grant->max_overcommit = 2; + self->homa.grant->fifo_grant_increment = 50000; + self->homa.grant->fifo_fraction = 50; + + mock_clock = 1000; + rpc = test_rpc_init(self, 100, self->server_ip, 40000); + mock_clock = 2000; + test_rpc_init(self, 102, self->server_ip, 30000); + mock_clock = 3000; + test_rpc_init(self, 104, self->server_ip, 50000); + EXPECT_EQ(1, rpc->msgin.rank); + + unit_log_clear(); + homa_grant_check_fifo(self->homa.grant); + EXPECT_EQ(40000, rpc->msgin.granted); + EXPECT_EQ(-1, rpc->msgin.rank); + EXPECT_STREQ("xmit GRANT 40000@0; xmit GRANT 10000@0", unit_log_get()); + EXPECT_EQ(NULL, self->homa.grant->oldest_rpc); + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("active[0]: id 102 ungranted 30000; " + "active[1]: id 104 ungranted 40000", unit_log_get()); + EXPECT_EQ(40000, homa_metrics_per_cpu()->fifo_grant_bytes); +} +TEST_F(homa_grant, homa_grant_check_fifo__promote_after_fifo_grant) +{ + struct homa_rpc *rpc; + + self->homa.grant->max_overcommit = 1; + self->homa.grant->fifo_grant_increment = 15000; + self->homa.grant->fifo_fraction = 50; + + mock_clock = 1000; + rpc = test_rpc_init(self, 100, self->server_ip, 50000); + mock_clock = 2000; + test_rpc_init(self, 102, self->server_ip, 30000); + mock_clock = 3000; + test_rpc_init(self, 104, self->server_ip, 40000); + + unit_log_clear(); + homa_grant_check_fifo(self->homa.grant); + EXPECT_EQ(15000, rpc->msgin.granted); + EXPECT_STREQ("xmit GRANT 15000@0", unit_log_get()); + unit_log_clear(); + unit_log_grantables(&self->homa); + EXPECT_STREQ("active[0]: id 102 ungranted 30000; " + "peer 1.2.3.4: id 100 ungranted 35000 " + "id 104 ungranted 40000", unit_log_get()); +} TEST_F(homa_grant, homa_grant_cand_add__basics) { @@ -1537,7 +1834,7 @@ TEST_F(homa_grant, homa_grant_update_sysctl_deps__max_overcommit) homa_grant_update_sysctl_deps(self->homa.grant); EXPECT_EQ(HOMA_MAX_GRANTS, self->homa.grant->max_overcommit); } -TEST_F(homa_grant, homa_grant_update_sysctl_deps__grant_fifo_fraction) +TEST_F(homa_grant, homa_grant_update_sysctl_deps__fifo_fraction) { self->homa.grant->fifo_fraction = 499; homa_grant_update_sysctl_deps(self->homa.grant); @@ -1547,16 +1844,21 @@ TEST_F(homa_grant, homa_grant_update_sysctl_deps__grant_fifo_fraction) homa_grant_update_sysctl_deps(self->homa.grant); EXPECT_EQ(500, self->homa.grant->fifo_fraction); } -TEST_F(homa_grant, homa_grant_update_sysctl_deps__grant_nonfifo) +TEST_F(homa_grant, homa_grant_update_sysctl_deps__fifo_interval) { - self->homa.grant->fifo_grant_increment = 10000; - self->homa.grant->fifo_fraction = 0; + self->homa.grant->fifo_grant_increment = 20000; + self->homa.grant->fifo_fraction = 500; + self->homa.pacer->link_mbps = 8000; homa_grant_update_sysctl_deps(self->homa.grant); - EXPECT_EQ(0, self->homa.grant->grant_nonfifo); - - self->homa.grant->fifo_fraction = 100; + EXPECT_EQ(40000, 
self->homa.grant->fifo_grant_interval); +} +TEST_F(homa_grant, homa_grant_update_sysctl_deps__fifo_interval_no_fifo_grants) +{ + self->homa.grant->fifo_grant_increment = 20000; + self->homa.grant->fifo_fraction = 0; + self->homa.pacer->link_mbps = 8000; homa_grant_update_sysctl_deps(self->homa.grant); - EXPECT_EQ(90000, self->homa.grant->grant_nonfifo); + EXPECT_EQ(1000000000, self->homa.grant->fifo_grant_interval); } TEST_F(homa_grant, homa_grant_update_sysctl_deps__recalc_cycles) { diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index 1238aa35..0fb7be60 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -58,32 +58,28 @@ static void set_cutoffs(struct homa *homa, int c0, int c1, int c2, } #endif /* See strip.py */ -#ifndef __STRIP__ /* See strip.py */ -TEST_F(homa_utils, homa_init__grant_alloc_failure) +TEST_F(homa_utils, homa_init__pacer_alloc_failure) { struct homa homa2; mock_kmalloc_errors = 1; unit_log_clear(); EXPECT_EQ(ENOMEM, -homa_init(&homa2)); - EXPECT_EQ(NULL, homa2.grant); + EXPECT_EQ(NULL, homa2.pacer); homa_destroy(&homa2); } -#endif /* See strip.py */ -TEST_F(homa_utils, homa_init__pacer_alloc_failure) +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_utils, homa_init__grant_alloc_failure) { struct homa homa2; -#ifndef __STRIP__ /* See strip.py */ mock_kmalloc_errors = 2; -#else /* See strip.py */ - mock_kmalloc_errors = 1; -#endif/* See strip.py */ unit_log_clear(); EXPECT_EQ(ENOMEM, -homa_init(&homa2)); - EXPECT_EQ(NULL, homa2.pacer); + EXPECT_EQ(NULL, homa2.grant); homa_destroy(&homa2); } +#endif /* See strip.py */ TEST_F(homa_utils, homa_init__peertab_alloc_failure) { struct homa homa2; From 16bca3ecbb445efb26d751808d2d780782cc3e37 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 9 Jul 2025 10:14:25 -0700 Subject: [PATCH 388/625] Update perf.txt with measurements of FIFO grants --- perf.txt | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/perf.txt b/perf.txt index 339cb309..b06af455 100644 --- a/perf.txt +++ b/perf.txt @@ -2,6 +2,46 @@ This file contains various notes and lessons learned concerning performance of the Homa Linux kernel module. The notes are in reverse chronological order. +60. (July 2025) Measured impact of new FIFO grant mechanism on xl170 +cluster using "-w starve -b 40 -s 30 -n 6" (priorities were not enabled). +Slowdowns as a function of message length: + + grant_fifo_fraction = 0 grant_fifo_fraction = 50 grant_fifo_fraction = 100 +# length s50 s99 s999 s50 s99 s999 s50 s99 s999 + 100000 13.7 25.5 86.8 13.3 21.7 31.9 13.2 22.2 32.7 + 200000 13.0 32.2 75.7 12.7 21.2 29.0 12.6 21.5 30.7 + 300000 13.4 30.2 64.5 13.1 22.1 28.2 13.0 22.7 30.2 + 400000 14.3 30.9 60.1 14.0 24.5 30.6 14.1 25.9 33.3 + 500000 16.1 35.0 83.0 15.9 30.5 37.4 16.4 32.8 41.6 + 600000 19.0 49.3 185.7 19.5 41.2 53.1 20.8 47.7 62.2 + 700000 24.1 70.1 222.0 26.7 67.8 91.4 30.8 88.6 122.2 + 800000 34.8 121.2 282.6 47.5 178.9 268.4 67.9 315.6 470.3 + 900000 72.6 307.5 470.5 1155.3 2139.8 2314.1 1477.2 1746.0 1823.7 + 1000000 3093.4 12063.2 13050.8 1982.2 2354.0 2482.9 1467.0 1647.1 1709.4 + +Even shorter messages seem to benefit from the FIFO mechanism (not sure why...). +Increasing the FIFO fraction from 5% to 10% doesn't make much difference and +starts to penalize smaller messages more. 
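+
+(For calibration, not a measurement: grant_fifo_fraction = 50 reserves 5%
+of link bandwidth for FIFO grants, so with the default 50000-byte
+fifo_grant_increment and an assumed 25 Gbps link,
+homa_grant_update_sysctl_deps spaces FIFO grants 50000*8/(.05*25e9) = 320
+usec apart.)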
+ +FIFO also helps even when the cluster isn't overloaded: slowdown at +"-w starve -b 20 -s 30 -n 6": + + grant_fifo_fraction = 0 grant_fifo_fraction = 50 +# length s50 s99 s999 s50 s99 s999 + 100000 11.0 20.0 27.4 11.1 20.0 28.1 + 200000 10.5 19.5 26.0 10.5 19.6 26.1 + 300000 10.6 20.4 25.7 10.7 20.8 25.8 + 400000 11.1 22.0 26.8 11.2 22.7 28.5 + 500000 11.8 24.6 31.3 12.1 26.9 35.4 + 600000 13.0 30.3 39.2 13.4 33.5 45.9 + 700000 14.6 39.4 53.5 15.2 46.9 67.8 + 800000 16.9 55.3 82.7 17.6 63.8 92.7 + 900000 20.2 93.6 147.9 20.8 80.5 112.2 + 1000000 23.4 155.6 250.4 23.3 93.3 128.7 + +When the cluster isn't overloaded, short messages get a bit worse when FIFO +is enabled. + 59. (May 2025) Measured overhead to read various clocks on 2.4 GHz Xeon E5-2640 (note: measured when CPU is active, hence running in fastest mode): From cf7d66022c7d7634686cac3f5db4b3c744e7da66 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 9 Jul 2025 11:40:29 -0700 Subject: [PATCH 389/625] Cleanup and simplify use of RPC reference counts Previously, reference counts were only acquired when RPC locks were released temporarily. This resulted in lots of code. Now, references are taken by top-level functions that identify RPCs to manipulate. Any function that receives an RPC as an argument can assume that a reference has been taken on the RPC, so there is no need to take a reference when an RPC lock is released. --- homa_grant.c | 6 ------ homa_incoming.c | 21 ++++++++++---------- homa_outgoing.c | 7 ------- homa_plumbing.c | 40 +++++++++++++++++++++++++-------------- homa_rpc.h | 13 ++++++++++--- test/unit_homa_incoming.c | 6 ++++++ 6 files changed, 52 insertions(+), 41 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index aa3d66a5..12b86d82 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -180,11 +180,9 @@ void homa_grant_end_rpc(struct homa_rpc *rpc) homa_grant_cand_init(&cand); homa_grant_unmanage_rpc(rpc, &cand); if (!homa_grant_cand_empty(&cand)) { - homa_rpc_hold(rpc); homa_rpc_unlock(rpc); homa_grant_cand_check(&cand, grant); homa_rpc_lock(rpc); - homa_rpc_put(rpc); } } @@ -767,14 +765,12 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) /* Sending a grant is slow, so release the RPC lock while * sending the grant to reduce contention. */ - homa_rpc_hold(rpc); homa_rpc_unlock(rpc); homa_grant_send(rpc, priority); if (!homa_grant_cand_empty(&cand)) homa_grant_cand_check(&cand, grant); homa_grant_check_fifo(grant); homa_rpc_lock(rpc); - homa_rpc_put(rpc); } } @@ -801,11 +797,9 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) tt_record1("homa_grant_check_rpc released grant lock (id %d)", rpc->id); if (!homa_grant_cand_empty(&cand)) { - homa_rpc_hold(rpc); homa_rpc_unlock(rpc); homa_grant_cand_check(&cand, grant); homa_rpc_lock(rpc); - homa_rpc_put(rpc); } INC_METRIC(grant_check_others, 1); } diff --git a/homa_incoming.c b/homa_incoming.c index 33390c09..6dca3477 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -338,7 +338,6 @@ int homa_copy_to_user(struct homa_rpc *rpc) * run out of packets); copy any available packets out to * user space. */ - homa_rpc_hold(rpc); homa_rpc_unlock(rpc); tt_record1("starting copy to user space for id %d", @@ -416,7 +415,6 @@ int homa_copy_to_user(struct homa_rpc *rpc) atomic_or(APP_NEEDS_LOCK, &rpc->flags); homa_rpc_lock(rpc); atomic_andnot(APP_NEEDS_LOCK, &rpc->flags); - homa_rpc_put(rpc); if (error) break; } @@ -506,11 +504,13 @@ void homa_dispatch_pkts(struct sk_buff *skb) * the lock more quickly. 
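+		 * With a reference now held on the RPC it cannot be reaped
+		 * during the spin, so we can simply reacquire the lock here
+		 * instead of looking the RPC up again.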
*/ homa_spin(100); - rpc = NULL; + homa_rpc_lock(rpc); } } - /* Find and lock the RPC if we haven't already done so. */ + /* If we don't already have an RPC, find it, lock it, + * and create a reference on it. + */ if (!rpc) { if (!homa_is_client(id)) { /* We are the server for this RPC. */ @@ -535,6 +535,8 @@ void homa_dispatch_pkts(struct sk_buff *skb) } else { rpc = homa_rpc_find_client(hsk, id); } + if (rpc) + homa_rpc_hold(rpc); } if (unlikely(!rpc)) { #ifndef __STRIP__ /* See strip.py */ @@ -626,6 +628,7 @@ void homa_dispatch_pkts(struct sk_buff *skb) } if (rpc) { IF_NO_STRIP(homa_grant_check_rpc(rpc)); + homa_rpc_put(rpc); homa_rpc_unlock(rpc); } @@ -1045,12 +1048,10 @@ void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, /* Must temporarily release rpc's lock because * homa_rpc_acked needs to acquire RPC locks. */ - homa_rpc_hold(rpc); homa_rpc_unlock(rpc); for (i = 0; i < count; i++) homa_rpc_acked(hsk, &saddr, &h->acks[i]); homa_rpc_lock(rpc); - homa_rpc_put(rpc); } else { for (i = 0; i < count; i++) homa_rpc_acked(hsk, &saddr, &h->acks[i]); @@ -1083,8 +1084,6 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) if (!(atomic_read(&rpc->flags) & RPC_PRIVATE)) return -EINVAL; - homa_rpc_hold(rpc); - /* Each iteration through this loop waits until rpc needs attention * in some way (e.g. packets have arrived), then deals with that need * (e.g. copy to user space). It may take many iterations until the @@ -1138,7 +1137,6 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) else INC_METRIC(wait_fast, 1); #endif /* See strip.py */ - homa_rpc_put(rpc); return result; } @@ -1150,7 +1148,8 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) * * Return: Pointer to an RPC with a complete incoming message or nonzero * error field, or a negative errno (usually -EINTR). If an RPC - * is returned it will be locked and the caller must unlock. + * is returned it will be locked and referenced; the caller + * must release the lock and the reference. 
*/ struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking) __cond_acquires(rpc->bucket->lock) @@ -1224,7 +1223,6 @@ struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking) atomic_or(APP_NEEDS_LOCK, &rpc->flags); homa_rpc_lock(rpc); atomic_andnot(APP_NEEDS_LOCK, &rpc->flags); - homa_rpc_put(rpc); if (!rpc->error) rpc->error = homa_copy_to_user(rpc); if (rpc->error) { @@ -1233,6 +1231,7 @@ struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking) } else if (rpc->msgin.bytes_remaining == 0 && skb_queue_len(&rpc->msgin.packets) == 0) break; + homa_rpc_put(rpc); homa_rpc_unlock(rpc); } diff --git a/homa_outgoing.c b/homa_outgoing.c index 27a0f62c..421562b7 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -270,7 +270,6 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) int gso_size; int err; - homa_rpc_hold(rpc); if (unlikely(iter->count > HOMA_MAX_MESSAGE_LENGTH || iter->count == 0)) { tt_record2("homa_message_out_fill found bad length %d for id %d", @@ -391,14 +390,12 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) rpc->id, rpc->msgout.length); INC_METRIC(sent_msg_bytes, rpc->msgout.length); refcount_add(rpc->msgout.skb_memory, &rpc->hsk->sock.sk_wmem_alloc); - homa_rpc_put(rpc); if (!overlap_xmit && xmit) homa_xmit_data(rpc, false); return 0; error: refcount_add(rpc->msgout.skb_memory, &rpc->hsk->sock.sk_wmem_alloc); - homa_rpc_put(rpc); return err; } @@ -595,7 +592,6 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) IF_NO_STRIP(struct netdev_queue *txq); int length; - homa_rpc_hold(rpc); while (*rpc->msgout.next_xmit) { struct sk_buff *skb = *rpc->msgout.next_xmit; IF_NO_STRIP(int priority); @@ -643,7 +639,6 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) } #endif /* See strip.py */ - homa_rpc_hold(rpc); homa_rpc_unlock(rpc); skb_get(skb); #ifndef __STRIP__ /* See strip.py */ @@ -658,11 +653,9 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) #endif /* See strip.py */ force = false; homa_rpc_lock(rpc); - homa_rpc_put(rpc); if (rpc->state == RPC_DEAD) break; } - homa_rpc_put(rpc); } #ifndef __STRIP__ /* See strip.py */ diff --git a/homa_plumbing.c b/homa_plumbing.c index 2212d1f1..df74e1c8 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1037,6 +1037,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) rpc = NULL; goto error; } + homa_rpc_hold(rpc); if (args.flags & HOMA_SENDMSG_PRIVATE) atomic_or(RPC_PRIVATE, &rpc->flags); INC_METRIC(send_calls, 1); @@ -1051,14 +1052,14 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) goto error; args.id = rpc->id; homa_rpc_unlock(rpc); /* Locked by homa_rpc_alloc_client. */ - rpc = NULL; if (unlikely(copy_to_user((void __user *)msg->msg_control, &args, sizeof(args)))) { - rpc = homa_rpc_find_client(hsk, args.id); + homa_rpc_lock(rpc); result = -EFAULT; goto error; } + homa_rpc_put(rpc); #ifndef __STRIP__ /* See strip.py */ finish = homa_clock(); #endif /* See strip.py */ @@ -1089,6 +1090,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) args.id, tt_addr(canonical_dest)); return 0; } + homa_rpc_hold(rpc); if (rpc->error) { result = rpc->error; goto error; @@ -1096,17 +1098,15 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) if (rpc->state != RPC_IN_SERVICE) { tt_record2("homa_sendmsg error: RPC id %d in bad state %d", rpc->id, rpc->state); - /* Locked by homa_rpc_find_server. 
*/ - homa_rpc_unlock(rpc); - rpc = NULL; result = -EINVAL; - goto error; + goto error_dont_end_rpc; } rpc->state = RPC_OUTGOING; result = homa_message_out_fill(rpc, &msg->msg_iter, 1); if (result && rpc->state != RPC_DEAD) goto error; + homa_rpc_put(rpc); homa_rpc_unlock(rpc); /* Locked by homa_rpc_find_server. */ #ifndef __STRIP__ /* See strip.py */ finish = homa_clock(); @@ -1119,9 +1119,15 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) return 0; error: - if (rpc) { + if (rpc) homa_rpc_end(rpc); - homa_rpc_unlock(rpc); /* Locked by homa_rpc_find_server. */ + +error_dont_end_rpc: + if (rpc) { + homa_rpc_put(rpc); + + /* Locked by homa_rpc_find_server or homa_rpc_alloc_client. */ + homa_rpc_unlock(rpc); } tt_record2("homa_sendmsg returning error %d for id %d", result, args.id); @@ -1144,8 +1150,8 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, IF_NO_STRIP(u64 start = homa_clock()); struct homa_sock *hsk = homa_sk(sk); struct homa_recvmsg_args control; + struct homa_rpc *rpc = NULL; IF_NO_STRIP(u64 finish); - struct homa_rpc *rpc; int nonblocking; int result; @@ -1189,9 +1195,9 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, result = -EINVAL; goto done; } + homa_rpc_hold(rpc); result = homa_wait_private(rpc, nonblocking); if (result != 0) { - homa_rpc_unlock(rpc); control.id = 0; goto done; } @@ -1203,6 +1209,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, * in the RPC itself are handled below. */ result = PTR_ERR(rpc); + rpc = NULL; goto done; } } @@ -1258,9 +1265,6 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, */ rpc->msgin.num_bpages = 0; - /* Must release the RPC lock (and potentially free the RPC) before - * copying the results back to user space. - */ if (homa_is_client(rpc->id)) { homa_peer_add_ack(rpc); homa_rpc_end(rpc); @@ -1270,7 +1274,6 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, else rpc->state = RPC_IN_SERVICE; } - homa_rpc_unlock(rpc); /* Locked by homa_wait_shared/private. */ if (test_bit(SOCK_NOSPACE, &hsk->sock.sk_socket->flags)) { /* There are tasks waiting for tx memory, so reap @@ -1280,6 +1283,15 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, } done: + /* Note: must release the RPC lock before copying results to user + * space. + */ + if (rpc) { + homa_rpc_put(rpc); + + /* Locked by homa_rpc_find_client or homa_wait_shared. */ + homa_rpc_unlock(rpc); + } if (unlikely(copy_to_user((__force void __user *)msg->msg_control, &control, sizeof(control)))) { #ifndef __UPSTREAM__ /* See strip.py */ diff --git a/homa_rpc.h b/homa_rpc.h index 4b0e92aa..cc30a8b4 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -505,9 +505,16 @@ static inline void homa_unprotect_rpcs(struct homa_sock *hsk) #ifndef __UNIT_TEST__ /** * homa_rpc_hold() - Increment the reference count on an RPC, which will - * prevent it from being freed until homa_rpc_put() is called. Used in - * situations where a pointer to the RPC needs to be retained during a - * period where it is unprotected by locks. + * prevent it from being freed until homa_rpc_put() is called. References + * are taken in two situations: + * 1. An RPC is going to be manipulated by a collection of functions. 
In + * this case the top-most function that identifies the RPC takes the + * reference; any function that receives an RPC as an argument can + * assume that a reference has been taken on the RPC by some higher + * function on the call stack. + * 2. A pointer to an RPC is stored in an object for use later, such as + * an interest. A reference must be held as long as the pointer remains + * accessible in the object. * @rpc: RPC on which to take a reference. */ static inline void homa_rpc_hold(struct homa_rpc *rpc) diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 60df73ae..a2c6f276 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -2468,6 +2468,7 @@ TEST_F(homa_incoming, homa_wait_shared__rpc_already_ready) EXPECT_EQ(crpc, rpc); EXPECT_EQ(0, crpc->msgin.packets.qlen); IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_none)); + homa_rpc_put(rpc); homa_rpc_unlock(rpc); } TEST_F(homa_incoming, homa_wait_shared__multiple_rpcs_already_ready) @@ -2487,6 +2488,7 @@ TEST_F(homa_incoming, homa_wait_shared__multiple_rpcs_already_ready) rpc = homa_wait_shared(&self->hsk, 0); ASSERT_FALSE(IS_ERR(rpc)); EXPECT_EQ(crpc, rpc); + homa_rpc_put(rpc); homa_rpc_unlock(rpc); EXPECT_SUBSTR("sk->sk_data_ready invoked", unit_log_get()); } @@ -2517,6 +2519,7 @@ TEST_F(homa_incoming, homa_wait_shared__signal_race_with_handoff) EXPECT_EQ(crpc, rpc); EXPECT_EQ(ENOENT, -rpc->error); IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_block)); + homa_rpc_put(rpc); homa_rpc_unlock(rpc); } TEST_F(homa_incoming, homa_wait_shared__socket_shutdown_while_blocked) @@ -2548,6 +2551,7 @@ TEST_F(homa_incoming, homa_wait_shared__copy_to_user_fails) rpc = homa_wait_shared(&self->hsk, 0); EXPECT_EQ(crpc, rpc); EXPECT_EQ(EFAULT, -rpc->error); + homa_rpc_put(rpc); homa_rpc_unlock(rpc); } TEST_F(homa_incoming, homa_wait_shared__rpc_has_error) @@ -2564,6 +2568,7 @@ TEST_F(homa_incoming, homa_wait_shared__rpc_has_error) rpc = homa_wait_shared(&self->hsk, 0); EXPECT_EQ(crpc, rpc); EXPECT_EQ(2, crpc->msgin.packets.qlen); + homa_rpc_put(rpc); homa_rpc_unlock(rpc); } TEST_F(homa_incoming, homa_wait_shared__rpc_dead) @@ -2582,6 +2587,7 @@ TEST_F(homa_incoming, homa_wait_shared__rpc_dead) rpc = homa_wait_shared(&self->hsk, 0); EXPECT_EQ(crpc2, rpc); + homa_rpc_put(rpc); homa_rpc_unlock(rpc); } From 0e936b1ce6e0c44f55dc27b3caef59db06897a4d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 9 Jul 2025 11:59:18 -0700 Subject: [PATCH 390/625] Update notes.txt --- notes.txt | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/notes.txt b/notes.txt index c984bf47..c572fae5 100755 --- a/notes.txt +++ b/notes.txt @@ -1,13 +1,6 @@ Notes for Homa implementation in Linux: --------------------------------------- -* Loose ends as of 6/13/2025: - * Analyze poor throughput under w4 with -b30 - * Reimplement FIFO grants - * Refactor RPC reference counts: acquire at top-level so no-one else - has to worry about them. - * Check out peformance on c6525-25g. - * Move interest cleanup code from homa_sock to a new function in homa_interest. Also move wakeup code from homa_rpc_handoff. @@ -72,18 +65,6 @@ Notes for Homa implementation in Linux: * The grant will be sent more quickly by the server because it doesn't have to process a batch of data packets first -* homa_grant_recalc is being invoked a *lot* and it takes a lot of time each - call (13 usec/call, duty cycle > 100%): - * Does it need to relock every ranked RPC every call? 
- * About 25% of calls are failed attempts to promote an unranked RPC
-   (perhaps because the slots for its host are all taken?)
-  * Could it abort early if there is no incoming headroom?
-
-* Notes on refactoring of grant mechanism:
-  * Need to reimplement FIFO grants
-  * Replace fifo_grant_increment with fifo_grant_interval
-  * Refactor so that the msgin structure is always properly initialized?
-
 * The current implementation can execute RPCs multiple times on the server:
   * The initial RPC arrives, but takes the slow path to SoftIRQ, which
     can take many ms.
@@ -189,13 +170,6 @@ Notes for Homa implementation in Linux:
 * See if error checking made syscalls slower.
 * GSO always uses SKB_GSO_TCPV6; sometimes it should be V4.
-* Refactor of granting mechanism:
-  * Eliminate grant_increment: change to fifo_grant_increment instead
-  * grant_non_fifo may need to grant to a message that is also receiving
-    regular grants
-  * What if a message receives data beyond incoming, which completes the
-    message?
-
 * Pinning memory: see mm.h and mm/gup.c
   * get_user_page
   * get_user_pages

From cbce44f27523d9bf2f27401404cae4653abc8dfa Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Wed, 9 Jul 2025 11:59:31 -0700
Subject: [PATCH 391/625] Fix regexp string syntax error in switch.py

---
 cloudlab/bin/switch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cloudlab/bin/switch.py b/cloudlab/bin/switch.py
index ad7d0e0b..0f9e3228 100755
--- a/cloudlab/bin/switch.py
+++ b/cloudlab/bin/switch.py
@@ -72,7 +72,7 @@ def do_cmd(self, command, time_limit=5.0):
                 output += data
             if re.search('Unrecognized command.*xyzzy.*help', output,
                     flags=re.DOTALL):
-                return output;
+                return output
             time.sleep(0.1)

     def get_max_buffer_usage(self):
@@ -80,7 +80,7 @@ def get_max_buffer_usage(self):
         """
         Return the maximum total buffer usage (across all egress ports).
         """
         output = self.do_cmd("show buffers pools ePool0")
-        match = re.search('.*ePool0\s+egress.*[0-9.]+M?\s+[0-9.]+M?\s+([0-9.]+)([MK]?)',
+        match = re.search(r'.*ePool0\s+egress.*[0-9.]+M?\s+[0-9.]+M?\s+([0-9.]+)([MK]?)',
                 output)
         if match:
             if match.group(2) == 'M':

From b60dd5798d40209e5169356b3ab06ccaf497f0fd Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 11 Jul 2025 09:19:56 -0700
Subject: [PATCH 392/625] Clean up homa_load error handling

Use variables to keep track of what has been initialized, instead of a
complex and confusing set of goto labels.
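
In outline, the new pattern looks like the following minimal sketch
(setup_a, teardown_a, etc. are illustrative names, not the actual Homa
initialization steps):

    /* Illustrative stubs standing in for the real setup/cleanup steps. */
    extern int setup_a(void), setup_b(void);
    extern void teardown_a(void), teardown_b(void);

    int example_load(void)
    {
            /* One flag per initialization step that may need undoing. */
            bool init_a = false;
            bool init_b = false;
            int status;

            status = setup_a();
            if (status != 0)
                    goto error;
            init_a = true;

            status = setup_b();
            if (status != 0)
                    goto error;
            init_b = true;
            return 0;

    error:
            /* Undo only the steps that completed, in reverse order. */
            if (init_b)
                    teardown_b();
            if (init_a)
                    teardown_a();
            return status;
    }

A new step can be added anywhere in the sequence without reordering a
chain of cleanup labels: the single error label undoes exactly those
steps whose flags are set.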
--- homa_plumbing.c | 103 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 69 insertions(+), 34 deletions(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index df74e1c8..a032eb0d 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -466,7 +466,18 @@ static int timer_thread_exit; */ int __init homa_load(void) { + IF_NO_STRIP(bool init_metrics = false); + IF_NO_STRIP(bool init_offload = false); + IF_NO_STRIP(bool init_sysctl = false); struct homa *homa = global_homa; + bool init_protocol6 = false; + bool init_protosw6 = false; + bool init_protocol = false; + bool init_protosw = false; + bool init_net_ops = false; + bool init_proto6 = false; + bool init_proto = false; + bool init_homa = false; int status; /* Compile-time validations that no packet header is longer @@ -511,7 +522,7 @@ int __init homa_load(void) #endif /* See strip.py */ pr_err("Homa module loading\n"); -#ifndef __STRIP__ /* See strip.py */ +#ifndef __UPSTREAM__ /* See strip.py */ pr_notice("Homa structure sizes: homa_data_hdr %lu, homa_seg_hdr %lu, ack %lu, peer %lu, ip_hdr %lu flowi %lu ipv6_hdr %lu, flowi6 %lu tcp_sock %lu homa_rpc %lu sk_buff %lu rcvmsg_control %lu union sockaddr_in_union %lu HOMA_MAX_BPAGES %u NR_CPUS %u nr_cpu_ids %u, MAX_NUMNODES %d\n", sizeof(struct homa_data_hdr), sizeof(struct homa_seg_hdr), @@ -534,69 +545,87 @@ int __init homa_load(void) status = proto_register(&homa_prot, 1); if (status != 0) { pr_err("proto_register failed for homa_prot: %d\n", status); - goto proto_register_err; + goto error; } + init_proto = true; + status = proto_register(&homav6_prot, 1); if (status != 0) { pr_err("proto_register failed for homav6_prot: %d\n", status); - goto proto_register_v6_err; + goto error; } + init_proto6 = true; + inet_register_protosw(&homa_protosw); + init_protosw = true; + status = inet6_register_protosw(&homav6_protosw); if (status != 0) { pr_err("inet6_register_protosw failed in %s: %d\n", __func__, status); - goto register_protosw_v6_err; + goto error; } + init_protosw6 = true; + status = inet_add_protocol(&homa_protocol, IPPROTO_HOMA); if (status != 0) { pr_err("inet_add_protocol failed in %s: %d\n", __func__, status); - goto add_protocol_err; + goto error; } + init_protocol = true; + status = inet6_add_protocol(&homav6_protocol, IPPROTO_HOMA); if (status != 0) { pr_err("inet6_add_protocol failed in %s: %d\n", __func__, status); - goto add_protocol_v6_err; + goto error; } + init_protocol6 = true; + status = homa_init(homa); if (status) - goto homa_init_err; + goto error; + init_homa = true; #ifndef __STRIP__ /* See strip.py */ status = homa_metrics_init(); if (status != 0) - goto metrics_err; + goto error; + init_metrics = true; homa_ctl_header = register_net_sysctl(&init_net, "net/homa", homa_ctl_table); if (!homa_ctl_header) { pr_err("couldn't register Homa sysctl parameters\n"); status = -ENOMEM; - goto sysctl_err; + goto error; } + init_sysctl = true; status = homa_offload_init(); if (status != 0) { pr_err("Homa couldn't init offloads\n"); - goto offload_err; + goto error; } + init_offload = true; #endif /* See strip.py */ status = register_pernet_subsys(&homa_net_ops); if (status != 0) { pr_err("Homa got error from register_pernet_subsys: %d\n", status); - goto net_err; + goto error; } + init_net_ops = true; + timer_kthread = kthread_run(homa_timer_main, homa, "homa_timer"); if (IS_ERR(timer_kthread)) { status = PTR_ERR(timer_kthread); pr_err("couldn't create Homa timer thread: error %d\n", status); timer_kthread = NULL; - goto timer_err; + goto error; } #ifndef __STRIP__ 
/* See strip.py */ @@ -609,30 +638,36 @@ int __init homa_load(void) return 0; -timer_err: - unregister_pernet_subsys(&homa_net_ops); -net_err: +error: + if (timer_kthread) { + timer_thread_exit = 1; + wake_up_process(timer_kthread); + wait_for_completion(&timer_thread_done); + } #ifndef __STRIP__ /* See strip.py */ - homa_offload_end(); -offload_err: - unregister_net_sysctl_table(homa_ctl_header); -sysctl_err: - homa_metrics_end(); -metrics_err: + if (init_offload) + homa_offload_end(); + if (init_sysctl) + unregister_net_sysctl_table(homa_ctl_header); + if (init_metrics) + homa_metrics_end(); #endif /* See strip.py */ - homa_destroy(homa); -homa_init_err: - inet6_del_protocol(&homav6_protocol, IPPROTO_HOMA); -add_protocol_v6_err: - inet_del_protocol(&homa_protocol, IPPROTO_HOMA); -add_protocol_err: - inet6_unregister_protosw(&homav6_protosw); -register_protosw_v6_err: - inet_unregister_protosw(&homa_protosw); - proto_unregister(&homav6_prot); -proto_register_v6_err: - proto_unregister(&homa_prot); -proto_register_err: + if (init_net_ops) + unregister_pernet_subsys(&homa_net_ops); + if (init_homa) + homa_destroy(homa); + if (init_protocol) + inet_del_protocol(&homa_protocol, IPPROTO_HOMA); + if (init_protocol6) + inet6_del_protocol(&homav6_protocol, IPPROTO_HOMA); + if (init_protosw) + inet_unregister_protosw(&homa_protosw); + if (init_protosw6) + inet6_unregister_protosw(&homav6_protosw); + if (init_proto) + proto_unregister(&homa_prot); + if (init_proto6) + proto_unregister(&homav6_prot); return status; } From 991c055b7e9df52cd655b76e622c7d37d7801362 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 11 Jul 2025 10:42:59 -0700 Subject: [PATCH 393/625] Take a reference on the RPC in homa_pacer_xmit Needed to conform to new RPC reference count rules. This protects against RPC reaping if homa_xmit_data releases the RPC lock. It also makes it easier to lock the RPC in homa_pacer_xmit. --- homa_metrics.c | 2 -- homa_metrics.h | 6 ------ homa_pacer.c | 23 +++++++++-------------- test/unit_homa_pacer.c | 23 ----------------------- 4 files changed, 9 insertions(+), 45 deletions(-) diff --git a/homa_metrics.c b/homa_metrics.c index a82b5257..f3cf5064 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -283,8 +283,6 @@ char *homa_metrics_print(void) m->pacer_lost_cycles); M("pacer_bytes %15llu Bytes transmitted when the pacer was active\n", m->pacer_bytes); - M("pacer_skipped_rpcs %15llu Pacer aborts because of locked RPCs\n", - m->pacer_skipped_rpcs); M("pacer_needed_help %15llu homa_pacer_xmit invocations from homa_check_pacer\n", m->pacer_needed_help); M("throttled_cycles %15llu Time when the throttled queue was nonempty\n", diff --git a/homa_metrics.h b/homa_metrics.h index cca1eb2e..f2766158 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -386,12 +386,6 @@ struct homa_metrics { */ u64 pacer_bytes; - /** - * @pacer_skipped_rpcs: total number of times that the pacer had to - * abort because it couldn't lock an RPC. - */ - u64 pacer_skipped_rpcs; - /** * @pacer_needed_help: total number of times that homa_check_pacer * found that the pacer was running behind, so it actually invoked diff --git a/homa_pacer.c b/homa_pacer.c index fed0a559..0ba9f1ff 100644 --- a/homa_pacer.c +++ b/homa_pacer.c @@ -256,14 +256,12 @@ void homa_pacer_xmit(struct homa_pacer *pacer) if (list_empty(&pacer->throttled_rpcs)) break; - /* Lock the first throttled RPC. 
This may not be possible - * because we have to hold throttle_lock while locking - * the RPC; that means we can't wait for the RPC lock because - * of lock ordering constraints (see "Homa Locking Strategy" in - * homa_impl.h). Thus, if the RPC lock isn't available, do - * nothing. Holding the throttle lock while locking the RPC - * is important because it keeps the RPC from being deleted - * before it can be locked. + /* Select an RPC to transmit (either SRPT or FIFO) and + * take a reference on it. Must do this while holding the + * throttle_lock to prevent the RPC from being reaped. Then + * release the throttle lock and lock the RPC (can't acquire + * the RPC lock while holding the throttle lock; see "Homa + * Locking Strategy" in homa_impl.h). */ homa_pacer_throttle_lock(pacer); pacer->fifo_count -= pacer->fifo_fraction; @@ -289,13 +287,9 @@ void homa_pacer_xmit(struct homa_pacer *pacer) homa_pacer_throttle_unlock(pacer); break; } - if (!homa_rpc_try_lock(rpc)) { - homa_pacer_throttle_unlock(pacer); - INC_METRIC(pacer_skipped_rpcs, 1); - break; - } + homa_rpc_hold(rpc); homa_pacer_throttle_unlock(pacer); - + homa_rpc_lock(rpc); tt_record4("pacer calling homa_xmit_data for rpc id %llu, port %d, offset %d, bytes_left %d", rpc->id, rpc->hsk->port, rpc->msgout.next_xmit_offset, @@ -319,6 +313,7 @@ void homa_pacer_xmit(struct homa_pacer *pacer) homa_pacer_unmanage_rpc(rpc); } homa_rpc_unlock(rpc); + homa_rpc_put(rpc); } spin_unlock_bh(&pacer->mutex); } diff --git a/test/unit_homa_pacer.c b/test/unit_homa_pacer.c index 986273dd..ce1254cb 100644 --- a/test/unit_homa_pacer.c +++ b/test/unit_homa_pacer.c @@ -460,29 +460,6 @@ TEST_F(homa_pacer, homa_pacer_xmit__rpc_removed_from_queue_before_locked) unit_log_throttled(&self->homa); EXPECT_STREQ("", unit_log_get()); } -TEST_F(homa_pacer, homa_pacer_xmit__rpc_locked) -{ - struct homa_rpc *crpc; - - crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, - self->server_ip, self->server_port, - self->client_id, 5000, 1000); - - homa_pacer_manage_rpc(crpc); - self->homa.pacer->max_nic_queue_cycles = 2000; - unit_log_clear(); - mock_trylock_errors = ~1; - homa_pacer_xmit(self->homa.pacer); - EXPECT_STREQ("", unit_log_get()); -#ifndef __STRIP__ /* See strip.py */ - EXPECT_EQ(1, homa_metrics_per_cpu()->pacer_skipped_rpcs); -#endif /* See strip.py */ - unit_log_clear(); - mock_trylock_errors = 0; - homa_pacer_xmit(self->homa.pacer); - EXPECT_STREQ("xmit DATA 1400@0; xmit DATA 1400@1400", - unit_log_get()); -} TEST_F(homa_pacer, homa_pacer_xmit__remove_from_queue) { struct homa_rpc *crpc1, *crpc2; From d66abf6ee98f9e3732fb777aab8526690ea728dd Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 11 Jul 2025 10:46:19 -0700 Subject: [PATCH 394/625] Move RPC_DEAD check in homa_xmit_data --- homa_outgoing.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/homa_outgoing.c b/homa_outgoing.c index 421562b7..d9b28ff7 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -592,7 +592,7 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) IF_NO_STRIP(struct netdev_queue *txq); int length; - while (*rpc->msgout.next_xmit) { + while (*rpc->msgout.next_xmit && rpc->state != RPC_DEAD) { struct sk_buff *skb = *rpc->msgout.next_xmit; IF_NO_STRIP(int priority); @@ -653,8 +653,6 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) #endif /* See strip.py */ force = false; homa_rpc_lock(rpc); - if (rpc->state == RPC_DEAD) - break; } } From 2d091cdd7e992362602c5e9a9a5bf3a811469dde Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: 
Fri, 11 Jul 2025 13:15:49 -0700 Subject: [PATCH 395/625] Fix issues with __STRIP__ --- homa_devel.c | 4 +++- homa_impl.h | 2 ++ homa_incoming.c | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/homa_devel.c b/homa_devel.c index 32701246..be4886a0 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -934,10 +934,11 @@ int homa_drop_packet(struct homa *homa) */ void homa_snapshot_get_stats(struct homa_rpc_snapshot *snap) { - int core; + IF_NO_STRIP(int core); memset(snap, 0, sizeof(*snap)); snap->clock = homa_clock(); +#ifndef __STRIP__ /* See strip.py */ for (core = 0; core < nr_cpu_ids; core++) { struct homa_metrics *m = &per_cpu(homa_metrics, core); @@ -967,6 +968,7 @@ void homa_snapshot_get_stats(struct homa_rpc_snapshot *snap) m->server_response_bytes_done; snap->server_responses_done += m->server_responses_done; } +#endif /* See strip.py */ } /** diff --git a/homa_impl.h b/homa_impl.h index d30d3731..26370a1a 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -905,6 +905,7 @@ static inline u64 homa_usecs_to_cycles(u64 usecs) #endif /* __UNIT_TEST__ */ } +#ifndef __STRIP__ /* See strip.py */ /** * homa_high_priority() - Return the next-to-highest available priority * level. Used in situations where we want to boost the priority of @@ -918,6 +919,7 @@ static inline int homa_high_priority(struct homa *homa) { return (homa->num_priorities <= 2) ? 0 : homa->num_priorities - 2; } +#endif /* See strip.py */ /* Homa Locking Strategy: * diff --git a/homa_incoming.c b/homa_incoming.c index 6dca3477..f503cfcb 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -51,7 +51,7 @@ int homa_message_in_init(struct homa_rpc *rpc, int length) skb_queue_head_init(&rpc->msgin.packets); INIT_LIST_HEAD(&rpc->msgin.gaps); rpc->msgin.bytes_remaining = length; - rpc->msgin.birth = homa_clock(); + IF_NO_STRIP(rpc->msgin.birth = homa_clock()); err = homa_pool_alloc_msg(rpc); if (err != 0) { rpc->msgin.length = -1; From c546b159c3a59c221a498a6ded552ea371910725 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 11 Jul 2025 13:49:55 -0700 Subject: [PATCH 396/625] Fix checkpatch.pl issues --- homa_grant.c | 7 +++---- homa_incoming.c | 5 +++-- homa_outgoing.c | 9 ++++++--- homa_pacer.c | 1 + homa_plumbing.c | 26 +++++++++++++++----------- homa_rpc.c | 13 +++++++------ homa_sock.c | 1 + homa_timer.c | 1 + homa_utils.c | 1 + 9 files changed, 38 insertions(+), 26 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 12b86d82..01e799d2 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -884,9 +884,8 @@ void homa_grant_find_oldest(struct homa_grant *grant) rpc = grant->active_rpcs[i]; if (rpc->msgin.birth >= oldest_birth) continue; - if (rpc->msgin.rec_incoming >= max_incoming) { + if (rpc->msgin.rec_incoming >= max_incoming) continue; - } oldest = rpc; oldest_birth = rpc->msgin.birth; } @@ -950,7 +949,7 @@ void homa_grant_promote_rpc(struct homa_grant *grant, struct homa_rpc *rpc) * FIFO grant; if so, make the grant. FIFO grants keep long messages from * being starved by Homa's SRPT grant mechanism. * @grant: Overall grant management information. 
-*/ + */ void homa_grant_check_fifo(struct homa_grant *grant) { struct homa_grant_candidates cand; @@ -1007,7 +1006,7 @@ void homa_grant_check_fifo(struct homa_grant *grant) rpc->msgin.granted += grant->fifo_grant_increment; if (rpc->msgin.granted >= rpc->msgin.length) { INC_METRIC(fifo_grant_bytes, grant->fifo_grant_increment + - rpc->msgin.length - + rpc->msgin.length - rpc->msgin.granted); rpc->msgin.granted = rpc->msgin.length; homa_grant_unmanage_rpc(rpc, &cand); diff --git a/homa_incoming.c b/homa_incoming.c index f503cfcb..377bebe8 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -1154,12 +1154,13 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking) __cond_acquires(rpc->bucket->lock) { - IF_NO_STRIP(int avail_immediately = 1); struct homa_interest interest; - IF_NO_STRIP(int blocked = 0); struct homa_rpc *rpc; int result; + IF_NO_STRIP(int avail_immediately = 1); + IF_NO_STRIP(int blocked = 0); + INIT_LIST_HEAD(&interest.links); init_waitqueue_head(&interest.wait_queue); /* Each iteration through this loop waits until an RPC needs attention diff --git a/homa_outgoing.c b/homa_outgoing.c index d9b28ff7..a433ea5d 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -446,13 +446,14 @@ int homa_xmit_control(enum homa_packet_type type, void *contents, int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, struct homa_sock *hsk) { - IF_NO_STRIP(struct netdev_queue *txq); - IF_NO_STRIP(int priority); struct homa_common_hdr *h; struct sk_buff *skb; int extra_bytes; int result; + IF_NO_STRIP(struct netdev_queue *txq); + IF_NO_STRIP(int priority); + skb = homa_skb_alloc_tx(HOMA_MAX_HEADER); if (unlikely(!skb)) return -ENOBUFS; @@ -589,11 +590,13 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) __must_hold(rpc->bucket->lock) { struct homa *homa = rpc->hsk->homa; - IF_NO_STRIP(struct netdev_queue *txq); int length; + IF_NO_STRIP(struct netdev_queue *txq); + while (*rpc->msgout.next_xmit && rpc->state != RPC_DEAD) { struct sk_buff *skb = *rpc->msgout.next_xmit; + IF_NO_STRIP(int priority); #ifndef __STRIP__ /* See strip.py */ diff --git a/homa_pacer.c b/homa_pacer.c index 0ba9f1ff..b8d7e466 100644 --- a/homa_pacer.c +++ b/homa_pacer.c @@ -331,6 +331,7 @@ void homa_pacer_manage_rpc(struct homa_rpc *rpc) struct homa_pacer *pacer = rpc->hsk->homa->pacer; struct homa_rpc *candidate; int bytes_left; + IF_NO_STRIP(int checks = 0); IF_NO_STRIP(u64 now); diff --git a/homa_plumbing.c b/homa_plumbing.c index a032eb0d..a53002c7 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -466,9 +466,6 @@ static int timer_thread_exit; */ int __init homa_load(void) { - IF_NO_STRIP(bool init_metrics = false); - IF_NO_STRIP(bool init_offload = false); - IF_NO_STRIP(bool init_sysctl = false); struct homa *homa = global_homa; bool init_protocol6 = false; bool init_protosw6 = false; @@ -480,6 +477,10 @@ int __init homa_load(void) bool init_homa = false; int status; + IF_NO_STRIP(bool init_metrics = false); + IF_NO_STRIP(bool init_offload = false); + IF_NO_STRIP(bool init_sysctl = false); + /* Compile-time validations that no packet header is longer * than HOMA_MAX_HEADER. 
*/ @@ -1010,15 +1011,16 @@ int homa_getsockopt(struct sock *sk, int level, int optname, */ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) { - IF_NO_STRIP(u64 start = homa_clock()); struct homa_sock *hsk = homa_sk(sk); struct homa_sendmsg_args args; union sockaddr_in_union *addr; struct homa_rpc *rpc = NULL; int result = 0; -#ifndef __STRIP__ /* See strip.py */ - u64 finish; + IF_NO_STRIP(u64 start = homa_clock()); + IF_NO_STRIP(u64 finish); + +#ifndef __STRIP__ /* See strip.py */ per_cpu(homa_offload_core, raw_smp_processor_id()).last_app_active = start; #endif /* See strip.py */ @@ -1182,14 +1184,15 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { - IF_NO_STRIP(u64 start = homa_clock()); struct homa_sock *hsk = homa_sk(sk); struct homa_recvmsg_args control; struct homa_rpc *rpc = NULL; - IF_NO_STRIP(u64 finish); int nonblocking; int result; + IF_NO_STRIP(u64 start = homa_clock()); + IF_NO_STRIP(u64 finish); + INC_METRIC(recv_calls, 1); #ifndef __STRIP__ /* See strip.py */ per_cpu(homa_offload_core, raw_smp_processor_id()).last_app_active = start; @@ -1374,12 +1377,13 @@ int homa_softirq(struct sk_buff *skb) { struct sk_buff *packets, *other_pkts, *next; struct sk_buff **prev_link, **other_link; - IF_NO_STRIP(struct homa *homa = homa_from_skb(skb)); struct homa_common_hdr *h; int header_offset; -#ifndef __STRIP__ /* See strip.py */ - u64 start; + IF_NO_STRIP(struct homa *homa = homa_from_skb(skb)); + IF_NO_STRIP(u64 start); + +#ifndef __STRIP__ /* See strip.py */ start = homa_clock(); per_cpu(homa_offload_core, raw_smp_processor_id()).last_active = start; #endif /* See strip.py */ diff --git a/homa_rpc.c b/homa_rpc.c index 8973f2a0..0d7edf19 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -572,6 +572,7 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) homa_skb_free_many_tx(hsk->homa, skbs, num_skbs); for (i = 0; i < num_rpcs; i++) { IF_NO_STRIP(int tx_left); + rpc = rpcs[i]; UNIT_LOG("; ", "reaped %llu", rpc->id); @@ -605,22 +606,22 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) rpc->msgout.next_xmit_offset; if (homa_is_client(rpc->id)) { INC_METRIC(client_response_bytes_done, - rpc->msgin.bytes_remaining); + rpc->msgin.bytes_remaining); INC_METRIC(client_responses_done, - rpc->msgin.bytes_remaining != 0); + rpc->msgin.bytes_remaining != 0); if (tx_left > 0) { INC_METRIC(client_request_bytes_done, - tx_left); + tx_left); INC_METRIC(client_requests_done, 1); } } else { INC_METRIC(server_request_bytes_done, - rpc->msgin.bytes_remaining); + rpc->msgin.bytes_remaining); INC_METRIC(server_requests_done, - rpc->msgin.bytes_remaining != 0); + rpc->msgin.bytes_remaining != 0); if (tx_left > 0) { INC_METRIC(server_response_bytes_done, - tx_left); + tx_left); INC_METRIC(server_responses_done, 1); } } diff --git a/homa_sock.c b/homa_sock.c index 47884748..606b506a 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -324,6 +324,7 @@ void homa_sock_shutdown(struct homa_sock *hsk) void homa_sock_destroy(struct sock *sk) { struct homa_sock *hsk = homa_sk(sk); + IF_NO_STRIP(int i = 0); if (!hsk->homa) diff --git a/homa_timer.c b/homa_timer.c index 0f7dce83..4f561f73 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -198,6 +198,7 @@ void homa_timer(struct homa *homa) rcu_read_lock(); list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { IF_NO_STRIP(total_rpcs++); + homa_rpc_lock(rpc); if (rpc->state == RPC_IN_SERVICE) { rpc->silent_ticks = 0; diff 
--git a/homa_utils.c b/homa_utils.c index bb688b1a..97508a1f 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -28,6 +28,7 @@ int homa_init(struct homa *homa) { int err; + IF_NO_STRIP(int i); memset(homa, 0, sizeof(*homa)); From 169f882ebb6d7ca2e712b7d06c2bd60c5c8a9a9b Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 11 Jul 2025 14:11:56 -0700 Subject: [PATCH 397/625] Move link_mbps from homa_pacer back to struct homa --- homa_grant.c | 4 ++-- homa_impl.h | 6 ++++++ homa_pacer.c | 17 ++--------------- homa_pacer.h | 16 ---------------- homa_plumbing.c | 7 +++++++ homa_utils.c | 1 + test/unit_homa_grant.c | 4 ++-- test/unit_homa_pacer.c | 6 +++--- 8 files changed, 23 insertions(+), 38 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 01e799d2..3f447bec 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -901,6 +901,7 @@ void homa_grant_find_oldest(struct homa_grant *grant) * the position of the RPC within the grantable lists and may promote it into * grant->active_rpcs. This function does not promote within grant->active_rpcs: * that is handled by homa_grant_fix_order. + * @grant: Overall grant management information. * @rpc: The RPC to consider for promotion. Must currently be managed for * grants. */ @@ -1113,8 +1114,7 @@ void homa_grant_update_sysctl_deps(struct homa_grant *grant) if (grant->fifo_fraction > 500) grant->fifo_fraction = 500; - fifo_mbps = (u64)homa_pacer_get_link_mbps(grant->homa->pacer) * - grant->fifo_fraction; + fifo_mbps = (u64)grant->homa->link_mbps * grant->fifo_fraction; do_div(fifo_mbps, 1000); if (fifo_mbps > 0 && grant->fifo_grant_increment > 0) { clocks_per_fifo_mbit = 1000 * homa_clock_khz(); diff --git a/homa_impl.h b/homa_impl.h index 26370a1a..a90bb73d 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -197,6 +197,12 @@ struct homa { */ int unsched_bytes; + /** + * @link_mbps: The raw bandwidth of the network uplink, in + * units of 1e06 bits per second. Set externally via sysctl. + */ + int link_mbps; + /** * @poll_usecs: Amount of time (in microseconds) that a thread * will spend busy-waiting for an incoming messages before diff --git a/homa_pacer.c b/homa_pacer.c index b8d7e466..2b6f82a3 100644 --- a/homa_pacer.c +++ b/homa_pacer.c @@ -6,7 +6,6 @@ */ #include "homa_impl.h" -#include "homa_grant.h" #include "homa_pacer.h" #include "homa_rpc.h" @@ -17,13 +16,6 @@ */ #define OFFSET(field) ((void *)offsetof(struct homa_pacer, field)) static struct ctl_table pacer_ctl_table[] = { - { - .procname = "link_mbps", - .data = OFFSET(link_mbps), - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = homa_pacer_dointvec - }, { .procname = "max_nic_queue_ns", .data = OFFSET(max_nic_queue_ns), @@ -69,7 +61,6 @@ struct homa_pacer *homa_pacer_alloc(struct homa *homa) INIT_LIST_HEAD_RCU(&pacer->throttled_rpcs); pacer->fifo_fraction = 50; pacer->max_nic_queue_ns = 5000; - pacer->link_mbps = 25000; pacer->throttle_min_bytes = 1000; pacer->exit = false; init_waitqueue_head(&pacer->wait_queue); @@ -409,7 +400,7 @@ void homa_pacer_update_sysctl_deps(struct homa_pacer *pacer) /* Underestimate link bandwidth (overestimate time) by 1%. 
*/ tmp = 101 * 8000 * (u64)homa_clock_khz(); - do_div(tmp, pacer->link_mbps * 100); + do_div(tmp, pacer->homa->link_mbps * 100); pacer->cycles_per_mbyte = tmp; } @@ -441,12 +432,8 @@ int homa_pacer_dointvec(const struct ctl_table *table, int write, table_copy.data = ((char *)pacer) + (uintptr_t)table_copy.data; result = proc_dointvec(&table_copy, write, buffer, lenp, ppos); - if (write) { + if (write) homa_pacer_update_sysctl_deps(pacer); - - /* Grant info depends on link speed. */ - homa_grant_update_sysctl_deps(pacer->homa->grant); - } return result; } diff --git a/homa_pacer.h b/homa_pacer.h index 7e2a9c83..67584ab3 100644 --- a/homa_pacer.h +++ b/homa_pacer.h @@ -83,12 +83,6 @@ struct homa_pacer { */ int max_nic_queue_cycles; - /** - * @link_mbps: The raw bandwidth of the network uplink, in - * units of 1e06 bits per second. Set externally via sysctl. - */ - int link_mbps; - /** * @cycles_per_mbyte: the number of homa_clock() cycles that it takes to * transmit 10**6 bytes on our uplink. This is actually a slight @@ -222,14 +216,4 @@ static inline void homa_pacer_throttle_unlock(struct homa_pacer *pacer) spin_unlock_bh(&pacer->throttle_lock); } -/** - * homa_pacer_get_link_mbps() - Return the link speed for this transport. - * @pacer: Pacer information for a Homa transport. - * Return: The link speed, in units of 1e6 bits per second. - */ -static inline int homa_pacer_get_link_mbps(struct homa_pacer *pacer) -{ - return pacer->link_mbps; -} - #endif /* _HOMA_PACER_H */ diff --git a/homa_plumbing.c b/homa_plumbing.c index a53002c7..217604c6 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -268,6 +268,13 @@ static struct ctl_table homa_ctl_table[] = { .mode = 0644, .proc_handler = homa_dointvec }, + { + .procname = "link_mbps", + .data = OFFSET(link_mbps), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_dointvec + }, { .procname = "max_dead_buffs", .data = OFFSET(max_dead_buffs), diff --git a/homa_utils.c b/homa_utils.c index 97508a1f..a83150ee 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -34,6 +34,7 @@ int homa_init(struct homa *homa) memset(homa, 0, sizeof(*homa)); atomic64_set(&homa->next_outgoing_id, 2); + homa->link_mbps = 25000; homa->pacer = homa_pacer_alloc(homa); if (IS_ERR(homa->pacer)) { err = PTR_ERR(homa->pacer); diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 93ac7f67..f810e864 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -1848,7 +1848,7 @@ TEST_F(homa_grant, homa_grant_update_sysctl_deps__fifo_interval) { self->homa.grant->fifo_grant_increment = 20000; self->homa.grant->fifo_fraction = 500; - self->homa.pacer->link_mbps = 8000; + self->homa.link_mbps = 8000; homa_grant_update_sysctl_deps(self->homa.grant); EXPECT_EQ(40000, self->homa.grant->fifo_grant_interval); } @@ -1856,7 +1856,7 @@ TEST_F(homa_grant, homa_grant_update_sysctl_deps__fifo_interval_no_fifo_grants) { self->homa.grant->fifo_grant_increment = 20000; self->homa.grant->fifo_fraction = 0; - self->homa.pacer->link_mbps = 8000; + self->homa.link_mbps = 8000; homa_grant_update_sysctl_deps(self->homa.grant); EXPECT_EQ(1000000000, self->homa.grant->fifo_grant_interval); } diff --git a/test/unit_homa_pacer.c b/test/unit_homa_pacer.c index ce1254cb..f7562b6c 100644 --- a/test/unit_homa_pacer.c +++ b/test/unit_homa_pacer.c @@ -640,16 +640,16 @@ TEST_F(homa_pacer, homa_pacer_unmanage_rpc__metrics) TEST_F(homa_pacer, homa_pacer_update_sysctl_deps) { self->homa.pacer->max_nic_queue_ns = 6000; - self->homa.pacer->link_mbps = 10000; + self->homa.link_mbps = 
10000; homa_pacer_update_sysctl_deps(self->homa.pacer); EXPECT_EQ(6000, self->homa.pacer->max_nic_queue_cycles); EXPECT_EQ(808000, self->homa.pacer->cycles_per_mbyte); - self->homa.pacer->link_mbps = 1000; + self->homa.link_mbps = 1000; homa_pacer_update_sysctl_deps(self->homa.pacer); EXPECT_EQ(8080000, self->homa.pacer->cycles_per_mbyte); - self->homa.pacer->link_mbps = 40000; + self->homa.link_mbps = 40000; homa_pacer_update_sysctl_deps(self->homa.pacer); EXPECT_EQ(202000, self->homa.pacer->cycles_per_mbyte); } \ No newline at end of file From 21038bb610ed66da17c1d2ab6b8b0bc94d2c3f0d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 11 Jul 2025 15:19:05 -0700 Subject: [PATCH 398/625] Fix issue with __STRIP__ --- homa_impl.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/homa_impl.h b/homa_impl.h index a90bb73d..638dea4c 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -196,6 +196,7 @@ struct homa { * @unsched_bytes and @window). Set externally via sysctl. */ int unsched_bytes; +#endif /* See strip.py */ /** * @link_mbps: The raw bandwidth of the network uplink, in @@ -203,6 +204,7 @@ struct homa { */ int link_mbps; +#ifndef __STRIP__ /* See strip.py */ /** * @poll_usecs: Amount of time (in microseconds) that a thread * will spend busy-waiting for an incoming messages before From c99d5fb53d5c690a3fe7535567ee81a8cf8ec5b0 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 15 Jul 2025 13:29:54 -0700 Subject: [PATCH 399/625] Refactor code for printing metrics Make code easier to maintain by pulling most of the formatting specifics out into the homa_metric_append function. --- homa_metrics.c | 545 ++++++++++++++++++++------------------- homa_metrics.h | 2 +- test/unit_homa_metrics.c | 26 +- 3 files changed, 293 insertions(+), 280 deletions(-) diff --git a/homa_metrics.c b/homa_metrics.c index f3cf5064..78ab3372 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -53,13 +53,15 @@ void homa_metrics_end(void) } /** - * homa_metric_append() - Formats a new metric and appends it to - * homa_mout.output. - * @format: Standard printf-style format string describing the - * new metric. Arguments after this provide the usual - * values expected for printf-like functions. + * homa_metric_append() - Format a metric and append it to homa_mout.output. + * @name: Name of the metric + * @value: Value of the metric + * @format: Standard printf-style format string providing a human- + * readable description of the metric. Arguments after this + * provide the usual values expected for printf-like functions, + * if needed. */ -void homa_metric_append(const char *format, ...) +void homa_metric_append(const char *name, u64 value, const char *format, ...) { char *new_buffer; size_t new_chars; @@ -67,7 +69,7 @@ void homa_metric_append(const char *format, ...) if (!homa_mout.output) { #ifdef __UNIT_TEST__ - homa_mout.capacity = 30; + homa_mout.capacity = 200; #else homa_mout.capacity = 4096; #endif @@ -77,19 +79,7 @@ void homa_metric_append(const char *format, ...) homa_mout.length = 0; } - /* May have to execute this loop multiple times if we run out - * of space in homa_mout.output; each iteration expands the storage, - * until eventually it is large enough. - */ - while (true) { - va_start(ap, format); - new_chars = vsnprintf(homa_mout.output + homa_mout.length, - homa_mout.capacity - - homa_mout.length, format, ap); - va_end(ap); - if ((homa_mout.length + new_chars) < homa_mout.capacity) - break; - + while (homa_mout.capacity < homa_mout.length + 200) { /* Not enough room; expand buffer capacity. 
*/ homa_mout.capacity *= 2; new_buffer = kmalloc(homa_mout.capacity, GFP_KERNEL); @@ -99,7 +89,15 @@ void homa_metric_append(const char *format, ...) kfree(homa_mout.output); homa_mout.output = new_buffer; } - homa_mout.length += new_chars; + + new_chars = snprintf(homa_mout.output + homa_mout.length, 60, + "%-30s %20llu ", name, value); + homa_mout.length += (new_chars > 60) ? 60 : new_chars; + va_start(ap, format); + new_chars = vsnprintf(homa_mout.output + homa_mout.length, 120, + format, ap); + va_end(ap); + homa_mout.length += (new_chars > 120) ? 120 : new_chars; } /** @@ -111,132 +109,145 @@ void homa_metric_append(const char *format, ...) char *homa_metrics_print(void) { int core, i, lower = 0; + char name[30]; homa_mout.length = 0; #define M(...) homa_metric_append(__VA_ARGS__) - M("time_cycles %20llu homa_clock() time when metrics were gathered\n", - homa_clock()); - M("cpu_khz %15llu Clock rate in khz\n", - homa_clock_khz()); + M("time_cycles", homa_clock(), + "homa_clock() time when metrics were gathered\n"); + M("cpu_khz", homa_clock_khz(), + "Clock rate in khz\n"); for (core = 0; core < nr_cpu_ids; core++) { struct homa_metrics *m = &per_cpu(homa_metrics, core); s64 delta; - M("core %15d Core id for following metrics\n", - core); + M("core", core, + "Core id for following metrics\n"); for (i = 0; i < HOMA_NUM_SMALL_COUNTS; i++) { - M("msg_bytes_%-9d %15llu Bytes in incoming messages containing %d-%d bytes\n", - (i + 1) * 64, m->small_msg_bytes[i], lower, - (i + 1) * 64); + snprintf(name, sizeof(name), "msg_bytes_%d", + (i + 1) * 64); + M(name, m->small_msg_bytes[i], + "Bytes in incoming messages containing %d-%d bytes\n", + lower, (i + 1) * 64); lower = (i + 1) * 64 + 1; } for (i = (HOMA_NUM_SMALL_COUNTS * 64) / 1024; i < HOMA_NUM_MEDIUM_COUNTS; i++) { - M("msg_bytes_%-9d %15llu Bytes in incoming messages containing %d-%d bytes\n", - (i + 1) * 1024, m->medium_msg_bytes[i], lower, - (i + 1) * 1024); + snprintf(name, sizeof(name), "msg_bytes_%d", + (i + 1) * 1024); + M(name, m->medium_msg_bytes[i], + "Bytes in incoming messages containing %d-%d bytes\n", + lower, (i + 1) * 1024); lower = (i + 1) * 1024 + 1; } - M("large_msg_count %15llu # of incoming messages >= %d bytes\n", - m->large_msg_count, lower); - M("large_msg_bytes %15llu Bytes in incoming messages >= %d bytes\n", - m->large_msg_bytes, lower); - M("client_requests_started %15llu Client RPCs initiated\n", - m->client_requests_started); - M("client_request_bytes_started %15llu Request bytes in all initiated client RPCs\n", - m->client_request_bytes_started); - M("client_request_bytes_done %15llu Transmitted request bytes in all client RPCs\n", - m->client_request_bytes_done); - M("client_requests_done %15llu Client RPC requests fully transmitted\n", - m->client_requests_done); - M("client_responses_started %15llu Client RPCs for which at least one response pkt recvd\n", - m->client_responses_started); - M("client_response_bytes_started %15llu Response bytes in all RPCS in client_responses_started\n", - m->client_response_bytes_started); - M("client_response_bytes_done %15llu Response bytes received for all client RPCs\n", - m->client_response_bytes_done); - M("client_responses_done %15llu Client RPC responses fully received\n", - m->client_responses_done); - M("server_requests_started %15llu Server RPCs for which at least one request pkt rcvd\n", - m->server_requests_started); - M("server_request_bytes_started %15llu Request bytes in all RPCS in server_requests_started\n", - m->server_request_bytes_started); - 
M("server_request_bytes_done %15llu Request bytes received for all server RPCs\n", - m->server_request_bytes_done); - M("server_requests_done %15llu Server RPC requests fully received\n", - m->server_requests_done); - M("server_responses_started %15llu Server RPCs for which response was initiated\n", - m->server_responses_started); - M("server_response_bytes_started %15llu Message bytes in all initiated server responses\n", - m->server_response_bytes_started); - M("server_response_bytes_done %15llu Transmitted response bytes in all server RPCs\n", - m->server_response_bytes_done); - M("server_responses_done %15llu Server RPC responses fully transmitted\n", - m->server_responses_done); - M("sent_msg_bytes %15llu Total bytes in all outgoing messages\n", - m->sent_msg_bytes); + M("large_msg_count", m->large_msg_count, + "# of incoming messages >= %d bytes\n", lower); + M("large_msg_bytes", m->large_msg_bytes, + "Bytes in incoming messages >= %d bytes\n", lower); + M("client_requests_started", m->client_requests_started, + "Client RPCs initiated\n"); + M("client_request_bytes_started", + m->client_request_bytes_started, + "Request bytes in all initiated client RPCs\n"); + M("client_request_bytes_done", m->client_request_bytes_done, + "Transmitted request bytes in all client RPCs\n"); + M("client_requests_done", m->client_requests_done, + "Client RPC requests fully transmitted\n"); + + M("client_responses_started", m->client_responses_started, + "Client RPCs for which at least one response pkt recvd\n"); + M("client_response_bytes_started", + m->client_response_bytes_started, + "Response bytes in all RPCS in client_responses_started\n"); + M("client_response_bytes_done", m->client_response_bytes_done, + "Response bytes received for all client RPCs\n"); + M("client_responses_done", m->client_responses_done, + "Client RPC responses fully received\n"); + M("server_requests_started", m->server_requests_started, + "Server RPCs for which at least one request pkt rcvd\n"); + M("server_request_bytes_started", + m->server_request_bytes_started, + "Request bytes in all RPCS in server_requests_started\n"); + M("server_request_bytes_done", m->server_request_bytes_done, + "Request bytes received for all server RPCs\n"); + M("server_requests_done", m->server_requests_done, + "Server RPC requests fully received\n"); + M("server_responses_started", m->server_responses_started, + "Server RPCs for which response was initiated\n"); + M("server_response_bytes_started",\ + m->server_response_bytes_started, + "Message bytes in all initiated server responses\n"); + M("server_response_bytes_done", m->server_response_bytes_done, + "Transmitted response bytes in all server RPCs\n"); + M("server_responses_done", m->server_responses_done, + "Server RPC responses fully transmitted\n"); + M("sent_msg_bytes", m->sent_msg_bytes, + "Total bytes in all outgoing messages\n"); for (i = DATA; i <= MAX_OP; i++) { char *symbol = homa_symbol_for_type(i); - M("packets_sent_%-12s %15llu %s packets sent\n", - symbol, m->packets_sent[i - DATA], symbol); + snprintf(name, sizeof(name), "packets_sent_%s", symbol); + M(name, m->packets_sent[i - DATA], + "%s packets sent\n", symbol); } for (i = DATA; i <= MAX_OP; i++) { char *symbol = homa_symbol_for_type(i); - M("packets_rcvd_%-12s %15llu %s packets received\n", - symbol, m->packets_received[i - DATA], symbol); + snprintf(name, sizeof(name), "packets_rcvd_%s", symbol); + M(name, m->packets_received[i - DATA], + "%s packets received\n", symbol); } for (i = 0; i < HOMA_MAX_PRIORITIES; i++) { - 
M("priority%d_bytes %15llu Bytes sent at priority %d (including headers)\n", - i, m->priority_bytes[i], i); + snprintf(name, sizeof(name), "priority%d_bytes", i); + M(name, m->priority_bytes[i], + "Bytes sent at priority %d (including headers)\n", i); } for (i = 0; i < HOMA_MAX_PRIORITIES; i++) { - M("priority%d_packets %15llu Packets sent at priority %d\n", - i, m->priority_packets[i], i); + snprintf(name, sizeof(name), "priority%d_packets", i); + M(name, m->priority_packets[i], + "Packets sent at priority %d\n", i); } - M("skb_allocs %15llu sk_buffs allocated\n", - m->skb_allocs); - M("skb_alloc_cycles %15llu Time spent allocating sk_buffs\n", - m->skb_alloc_cycles); - M("skb_frees %15llu Data sk_buffs freed in normal paths\n", - m->skb_frees); - M("skb_free_cycles %15llu Time spent freeing data sk_buffs\n", - m->skb_free_cycles); - M("skb_page_allocs %15llu Pages allocated for sk_buff frags\n", - m->skb_page_allocs); - M("skb_page_alloc_cycles %15llu Time spent allocating pages for sk_buff frags\n", - m->skb_page_alloc_cycles); - M("requests_received %15llu Incoming request messages\n", - m->requests_received); - M("responses_received %15llu Incoming response messages\n", - m->responses_received); - M("wait_none %15llu Messages received without blocking or polling\n", - m->wait_none); - M("wait_fast %15llu Messages received while polling\n", - m->wait_fast); - M("wait_block %15llu Messages received after thread went to sleep\n", - m->wait_block); - M("handoffs_thread_waiting %15llu RPC handoffs to waiting threads (vs. queue)\n", - m->handoffs_thread_waiting); - M("handoffs_alt_thread %15llu RPC handoffs not to first on list (avoid busy core)\n", - m->handoffs_alt_thread); - M("poll_cycles %15llu Time spent polling for incoming messages\n", - m->poll_cycles); - M("softirq_calls %15llu Calls to homa_softirq (i.e. # GRO pkts received)\n", - m->softirq_calls); - M("softirq_cycles %15llu Time spent in homa_softirq during SoftIRQ\n", - m->softirq_cycles); - M("bypass_softirq_cycles %15llu Time spent in homa_softirq during bypass from GRO\n", - m->bypass_softirq_cycles); - M("linux_softirq_cycles %15llu Time spent in all Linux SoftIRQ\n", - m->linux_softirq_cycles); - M("napi_cycles %15llu Time spent in NAPI-level packet handling\n", - m->napi_cycles); - M("send_cycles %15llu Time spent in homa_sendmsg for requests\n", - m->send_cycles); - M("send_calls %15llu Total invocations of homa_sendmsg for equests\n", - m->send_calls); + M("skb_allocs", m->skb_allocs, "sk_buffs allocated\n"); + M("skb_alloc_cycles", m->skb_alloc_cycles, + "Time spent allocating sk_buffs\n"); + M("skb_frees", m->skb_frees, + "Data sk_buffs freed in normal paths\n"); + M("skb_free_cycles", m->skb_free_cycles, + "Time spent freeing data sk_buffs\n"); + M("skb_page_allocs", m->skb_page_allocs, + "Pages allocated for sk_buff frags\n"); + M("skb_page_alloc_cycles", m->skb_page_alloc_cycles, + "Time spent allocating pages for sk_buff frags\n"); + M("requests_received", m->requests_received, + "Incoming request messages\n"); + M("responses_received", m->responses_received, + "Incoming response messages\n"); + M("wait_none", m->wait_none, + "Messages received without blocking or polling\n"); + M("wait_fast", m->wait_fast, + "Messages received while polling\n"); + M("wait_block", m->wait_block, + "Messages received after thread went to sleep\n"); + M("handoffs_thread_waiting", m->handoffs_thread_waiting, + "RPC handoffs to waiting threads (vs. 
queue)\n");
+ M("handoffs_alt_thread", m->handoffs_alt_thread,
+ "RPC handoffs not to first on list (avoid busy core)\n");
+ M("poll_cycles", m->poll_cycles,
+ "Time spent polling for incoming messages\n");
+ M("softirq_calls", m->softirq_calls,
+ "Calls to homa_softirq (i.e. # GRO pkts received)\n");
+ M("softirq_cycles", m->softirq_cycles,
+ "Time spent in homa_softirq during SoftIRQ\n");
+ M("bypass_softirq_cycles", m->bypass_softirq_cycles,
+ "Time spent in homa_softirq during bypass from GRO\n");
+ M("linux_softirq_cycles", m->linux_softirq_cycles,
+ "Time spent in all Linux SoftIRQ\n");
+ M("napi_cycles", m->napi_cycles,
+ "Time spent in NAPI-level packet handling\n");
+ M("send_cycles", m->send_cycles,
+ "Time spent in homa_sendmsg for requests\n");
+ M("send_calls", m->send_calls,
+ "Total invocations of homa_sendmsg for requests\n");
 // It is possible for us to get here at a time when a
 // thread has been blocked for a long time and has
 // recorded blocked_cycles, but hasn't finished the
@@ -246,154 +257,156 @@ char *homa_metrics_print(void)
 delta = m->recv_cycles - m->blocked_cycles;
 if (delta < 0)
 delta = 0;
- M("recv_cycles %15llu Unblocked time spent in recvmsg kernel call\n",
- delta);
- M("recv_calls %15llu Total invocations of recvmsg kernel call\n",
- m->recv_calls);
- M("blocked_cycles %15llu Time spent blocked in homa_recvmsg\n",
- m->blocked_cycles);
- M("reply_cycles %15llu Time spent in homa_sendmsg for responses\n",
- m->reply_cycles);
- M("reply_calls %15llu Total invocations of homa_sendmsg for responses\n",
- m->reply_calls);
- M("abort_cycles %15llu Time spent in homa_ioc_abort kernel call\n",
- m->reply_cycles);
- M("abort_calls %15llu Total invocations of abort kernel call\n",
- m->reply_calls);
- M("so_set_buf_cycles %15llu Time spent in setsockopt SO_HOMA_RCVBUF\n",
- m->so_set_buf_cycles);
- M("so_set_buf_calls %15llu Total invocations of setsockopt SO_HOMA_RCVBUF\n",
- m->so_set_buf_calls);
- M("grant_lock_cycles %15llu Time spent with grant lock locked\n",
- m->grant_lock_cycles);
- M("timer_cycles %15llu Time spent in homa_timer\n",
- m->timer_cycles);
- M("timer_reap_cycles %15llu Time in homa_timer spent reaping RPCs\n",
- m->timer_reap_cycles);
- M("data_pkt_reap_cycles %15llu Time in homa_data_pkt spent reaping RPCs\n",
- m->data_pkt_reap_cycles);
- M("pacer_cycles %15llu Time spent in homa_pacer_main\n",
- m->pacer_cycles);
- M("homa_cycles %15llu Total time in all Homa-related functions\n",
+ M("recv_cycles", delta,
+ "Unblocked time spent in recvmsg kernel call\n");
+ M("recv_calls", m->recv_calls,
+ "Total invocations of recvmsg kernel call\n");
+ M("blocked_cycles", m->blocked_cycles,
+ "Time spent blocked in homa_recvmsg\n");
+ M("reply_cycles", m->reply_cycles,
+ "Time spent in homa_sendmsg for responses\n");
+ M("reply_calls", m->reply_calls,
+ "Total invocations of homa_sendmsg for responses\n");
+ M("abort_cycles", m->reply_cycles,
+ "Time spent in homa_ioc_abort kernel call\n");
+ M("abort_calls", m->reply_calls,
+ "Total invocations of abort kernel call\n");
+ M("so_set_buf_cycles", m->so_set_buf_cycles,
+ "Time spent in setsockopt SO_HOMA_RCVBUF\n");
+ M("so_set_buf_calls", m->so_set_buf_calls,
+ "Total invocations of setsockopt SO_HOMA_RCVBUF\n");
+ M("grant_lock_cycles", m->grant_lock_cycles,
+ "Time spent with grant lock locked\n");
+ M("timer_cycles", m->timer_cycles,
+ "Time spent in homa_timer\n");
+ M("timer_reap_cycles", m->timer_reap_cycles,
+ "Time in homa_timer spent reaping RPCs\n");
+ M("data_pkt_reap_cycles",
m->data_pkt_reap_cycles, + "Time in homa_data_pkt spent reaping RPCs\n"); + M("pacer_cycles", m->pacer_cycles, + "Time spent in homa_pacer_main\n"); + M("homa_cycles", m->softirq_cycles + m->napi_cycles + m->send_cycles + m->recv_cycles + m->reply_cycles - m->blocked_cycles + - m->timer_cycles + m->pacer_cycles); - M("pacer_lost_cycles %15llu Lost transmission time because pacer was slow\n", - m->pacer_lost_cycles); - M("pacer_bytes %15llu Bytes transmitted when the pacer was active\n", - m->pacer_bytes); - M("pacer_needed_help %15llu homa_pacer_xmit invocations from homa_check_pacer\n", - m->pacer_needed_help); - M("throttled_cycles %15llu Time when the throttled queue was nonempty\n", - m->throttled_cycles); - M("resent_packets %15llu DATA packets sent in response to RESENDs\n", - m->resent_packets); - M("peer_allocs %15llu New entries created in peer table\n", - m->peer_allocs); - M("peer_kmalloc_errors %15llu kmalloc failures creating peer table entries\n", - m->peer_kmalloc_errors); - M("peer_route_errors %15llu Routing failures creating peer table entries\n", - m->peer_route_errors); - M("peer_dst_refreshes %15llu Obsolete dsts had to be regenerated\n", - m->peer_dst_refreshes); - M("control_xmit_errors %15llu Errors sending control packets\n", - m->control_xmit_errors); - M("data_xmit_errors %15llu Errors sending data packets\n", - m->data_xmit_errors); - M("unknown_rpcs %15llu Non-grant packets discarded because RPC unknown\n", - m->unknown_rpcs); - M("server_cant_create_rpcs %15llu Packets discarded because server couldn't create RPC\n", - m->server_cant_create_rpcs); - M("unknown_packet_types %15llu Packets discarded because of unsupported type\n", - m->unknown_packet_types); - M("short_packets %15llu Packets discarded because too short\n", - m->short_packets); - M("packet_discards %15llu Non-resent packets discarded because data already received\n", - m->packet_discards); - M("resent_discards %15llu Resent packets discarded because data already received\n", - m->resent_discards); - M("resent_packets_used %15llu Retransmitted packets that were actually used\n", - m->resent_packets_used); - M("rpc_timeouts %15llu RPCs aborted because peer was nonresponsive\n", - m->rpc_timeouts); - M("server_rpc_discards %15llu RPCs discarded by server because of errors\n", - m->server_rpc_discards); - M("server_rpcs_unknown %15llu RPCs aborted by server because unknown to client\n", - m->server_rpcs_unknown); - M("client_lock_misses %15llu Bucket lock misses for client RPCs\n", - m->client_lock_misses); - M("client_lock_miss_cycles %15llu Time lost waiting for client bucket locks\n", - m->client_lock_miss_cycles); - M("server_lock_misses %15llu Bucket lock misses for server RPCs\n", - m->server_lock_misses); - M("server_lock_miss_cycles %15llu Time lost waiting for server bucket locks\n", - m->server_lock_miss_cycles); - M("socket_lock_misses %15llu Socket lock misses\n", - m->socket_lock_misses); - M("socket_lock_miss_cycles %15llu Time lost waiting for socket locks\n", - m->socket_lock_miss_cycles); - M("throttle_lock_misses %15llu Throttle lock misses\n", - m->throttle_lock_misses); - M("throttle_lock_miss_cycles %15llu Time lost waiting for throttle locks\n", - m->throttle_lock_miss_cycles); - M("peer_ack_lock_misses %15llu Misses on peer ack locks\n", - m->peer_ack_lock_misses); - M("peer_ack_lock_miss_cycles %15llu Time lost waiting for peer ack locks\n", - m->peer_ack_lock_miss_cycles); - M("grant_lock_misses %15llu Grant lock misses\n", - m->grant_lock_misses); - M("grant_lock_miss_cycles 
%15llu Time lost waiting for grant lock\n", - m->grant_lock_miss_cycles); - M("grantable_rpcs_integral %15llu Integral of homa->num_grantable_rpcs*dt\n", - m->grantable_rpcs_integral); - M("grant_check_calls %15llu Number of calls to homa_grant_check_rpc\n", - m->grant_check_calls); - M("grant_check_locked %15llu Number of calls to homa_grant_check_rpc that acquired grant lock\n", - m->grant_check_locked); - M("grant_check_others %15llu Number of times homa_grant_check_rpc checked non-caller RPCs for grants\n", - m->grant_check_others); - M("grant_check_recalcs %15llu Number of times homa_grant_check_rpc updated grant priority order\n", - m->grant_check_recalcs); - M("grant_priority_bumps %15llu Number of times an RPC moved up in the grant priority order\n", - m->grant_priority_bumps); - M("fifo_grant_bytes %15llu Bytes of grants issued using the FIFO mechanism\n", - m->fifo_grant_bytes); - M("disabled_reaps %15llu Reaper invocations that were disabled\n", - m->disabled_reaps); - M("deferred_rpc_reaps %15llu RPCs skipped by reaper because still in use\n", - m->deferred_rpc_reaps); - M("reaper_calls %15llu Reaper invocations that were not disabled\n", - m->reaper_calls); - M("reaper_dead_skbs %15llu Sum of hsk->dead_skbs across all reaper calls\n", - m->reaper_dead_skbs); - M("throttle_list_adds %15llu Calls to homa_add_to_throttled\n", - m->throttle_list_adds); - M("throttle_list_checks %15llu List elements checked in homa_add_to_throttled\n", - m->throttle_list_checks); - M("ack_overflows %15llu Explicit ACKs sent because peer->acks was full\n", - m->ack_overflows); - M("ignored_need_acks %15llu NEED_ACKs ignored because RPC result not yet received\n", - m->ignored_need_acks); - M("bpage_reuses %15llu Buffer page could be reused because ref count was zero\n", - m->bpage_reuses); - M("buffer_alloc_failures %15llu homa_pool_alloc_msg didn't find enough buffer space for an RPC\n", - m->buffer_alloc_failures); - M("linux_pkt_alloc_bytes %15llu Bytes allocated in new packets by NIC driver due to cache overflows\n", - m->linux_pkt_alloc_bytes); - M("dropped_data_no_bufs %15llu Data bytes dropped because app buffers full\n", - m->dropped_data_no_bufs); - M("gen3_handoffs %15llu GRO->SoftIRQ handoffs made by Gen3 balancer\n", - m->gen3_handoffs); - M("gen3_alt_handoffs %15llu Gen3 handoffs to secondary core (primary was busy)\n", - m->gen3_alt_handoffs); - M("gro_grant_bypasses %15llu Grant packets passed directly to homa_softirq by homa_gro_receive\n", - m->gro_grant_bypasses); - M("gro_data_bypasses %15llu Data packets passed directly to homa_softirq by homa_gro_receive\n", - m->gro_data_bypasses); - for (i = 0; i < NUM_TEMP_METRICS; i++) - M("temp%-2d %15llu Temporary use in testing\n", - i, m->temp[i]); + m->timer_cycles + m->pacer_cycles, + "Total time in all Homa-related functions\n"); + M("pacer_lost_cycles", m->pacer_lost_cycles, + "Lost transmission time because pacer was slow\n"); + M("pacer_bytes", m->pacer_bytes, + "Bytes transmitted when the pacer was active\n"); + M("pacer_needed_help", m->pacer_needed_help, + "homa_pacer_xmit invocations from homa_check_pacer\n"); + M("throttled_cycles", m->throttled_cycles, + "Time when the throttled queue was nonempty\n"); + M("resent_packets", m->resent_packets, + "DATA packets sent in response to RESENDs\n"); + M("peer_allocs", m->peer_allocs, + "New entries created in peer table\n"); + M("peer_kmalloc_errors", m->peer_kmalloc_errors, + "kmalloc failures creating peer table entries\n"); + M("peer_route_errors", m->peer_route_errors, + "Routing 
failures creating peer table entries\n");
+	M("peer_dst_refreshes", m->peer_dst_refreshes,
+	  "Obsolete dsts had to be regenerated\n");
+	M("control_xmit_errors", m->control_xmit_errors,
+	  "Errors sending control packets\n");
+	M("data_xmit_errors", m->data_xmit_errors,
+	  "Errors sending data packets\n");
+	M("unknown_rpcs", m->unknown_rpcs,
+	  "Non-grant packets discarded because RPC unknown\n");
+	M("server_cant_create_rpcs", m->server_cant_create_rpcs,
+	  "Packets discarded because server couldn't create RPC\n");
+	M("unknown_packet_types", m->unknown_packet_types,
+	  "Packets discarded because of unsupported type\n");
+	M("short_packets", m->short_packets,
+	  "Packets discarded because too short\n");
+	M("packet_discards", m->packet_discards,
+	  "Non-resent packets discarded because data already received\n");
+	M("resent_discards", m->resent_discards,
+	  "Resent packets discarded because data already received\n");
+	M("resent_packets_used", m->resent_packets_used,
+	  "Retransmitted packets that were actually used\n");
+	M("rpc_timeouts", m->rpc_timeouts,
+	  "RPCs aborted because peer was nonresponsive\n");
+	M("server_rpc_discards", m->server_rpc_discards,
+	  "RPCs discarded by server because of errors\n");
+	M("server_rpcs_unknown", m->server_rpcs_unknown,
+	  "RPCs aborted by server because unknown to client\n");
+	M("client_lock_misses", m->client_lock_misses,
+	  "Bucket lock misses for client RPCs\n");
+	M("client_lock_miss_cycles", m->client_lock_miss_cycles,
+	  "Time lost waiting for client bucket locks\n");
+	M("server_lock_misses", m->server_lock_misses,
+	  "Bucket lock misses for server RPCs\n");
+	M("server_lock_miss_cycles", m->server_lock_miss_cycles,
+	  "Time lost waiting for server bucket locks\n");
+	M("socket_lock_misses", m->socket_lock_misses,
+	  "Socket lock misses\n");
+	M("socket_lock_miss_cycles", m->socket_lock_miss_cycles,
+	  "Time lost waiting for socket locks\n");
+	M("throttle_lock_misses", m->throttle_lock_misses,
+	  "Throttle lock misses\n");
+	M("throttle_lock_miss_cycles", m->throttle_lock_miss_cycles,
+	  "Time lost waiting for throttle locks\n");
+	M("peer_ack_lock_misses", m->peer_ack_lock_misses,
+	  "Misses on peer ack locks\n");
+	M("peer_ack_lock_miss_cycles", m->peer_ack_lock_miss_cycles,
+	  "Time lost waiting for peer ack locks\n");
+	M("grant_lock_misses", m->grant_lock_misses,
+	  "Grant lock misses\n");
+	M("grant_lock_miss_cycles", m->grant_lock_miss_cycles,
+	  "Time lost waiting for grant lock\n");
+	M("grantable_rpcs_integral", m->grantable_rpcs_integral,
+	  "Integral of homa->num_grantable_rpcs*dt\n");
+	M("grant_check_calls", m->grant_check_calls,
+	  "Number of calls to homa_grant_check_rpc\n");
+	M("grant_check_locked", m->grant_check_locked,
+	  "Number of calls to homa_grant_check_rpc that acquired grant lock\n");
+	M("grant_check_others", m->grant_check_others,
+	  "Number of times homa_grant_check_rpc checked non-caller RPCs for grants\n");
+	M("grant_check_recalcs", m->grant_check_recalcs,
+	  "Number of times homa_grant_check_rpc updated grant priority order\n");
+	M("grant_priority_bumps", m->grant_priority_bumps,
+	  "Number of times an RPC moved up in the grant priority order\n");
+	M("fifo_grant_bytes", m->fifo_grant_bytes,
+	  "Bytes of grants issued using the FIFO mechanism\n");
+	M("disabled_reaps", m->disabled_reaps,
+	  "Reaper invocations that were disabled\n");
+	M("deferred_rpc_reaps", m->deferred_rpc_reaps,
+	  "RPCs skipped by reaper because still in use\n");
+	M("reaper_calls", m->reaper_calls,
+	  "Reaper invocations that were not disabled\n");
+
M("reaper_dead_skbs", m->reaper_dead_skbs, + "Sum of hsk->dead_skbs across all reaper calls\n"); + M("throttle_list_adds", m->throttle_list_adds, + "Calls to homa_add_to_throttled\n"); + M("throttle_list_checks", m->throttle_list_checks, + "List elements checked in homa_add_to_throttled\n"); + M("ack_overflows", m->ack_overflows, + "Explicit ACKs sent because peer->acks was full\n"); + M("ignored_need_acks", m->ignored_need_acks, + "NEED_ACKs ignored because RPC result not yet received\n"); + M("bpage_reuses", m->bpage_reuses, + "Buffer page could be reused because ref count was zero\n"); + M("buffer_alloc_failures", m->buffer_alloc_failures, + "homa_pool_alloc_msg didn't find enough buffer space for an RPC\n"); + M("linux_pkt_alloc_bytes", m->linux_pkt_alloc_bytes, + "Bytes allocated in new packets by NIC driver due to cache overflows\n"); + M("dropped_data_no_bufs", m->dropped_data_no_bufs, + "Data bytes dropped because app buffers full\n"); + M("gen3_handoffs", m->gen3_handoffs, + "GRO->SoftIRQ handoffs made by Gen3 balancer\n"); + M("gen3_alt_handoffs", m->gen3_alt_handoffs, + "Gen3 handoffs to secondary core (primary was busy)\n"); + M("gro_grant_bypasses", m->gro_grant_bypasses, + "Grant packets passed directly to homa_softirq by homa_gro_receive\n"); + M("gro_data_bypasses", m->gro_data_bypasses, + "Data packets passed directly to homa_softirq by homa_gro_receive\n"); + for (i = 0; i < NUM_TEMP_METRICS; i++) { + snprintf(name, sizeof(name), "temp%d", i); + M(name, m->temp[i], "Temporary use in testing\n"); + } } return homa_mout.output; diff --git a/homa_metrics.h b/homa_metrics.h index f2766158..b0b28a26 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -790,7 +790,7 @@ static inline struct homa_metrics *homa_metrics_per_cpu(void) extern struct homa_metrics_output homa_mout; -void homa_metric_append(const char *format, ...); +void homa_metric_append(const char *name, u64 value, const char *format, ...); void homa_metrics_end(void); int homa_metrics_init(void); loff_t homa_metrics_lseek(struct file *file, loff_t offset, diff --git a/test/unit_homa_metrics.c b/test/unit_homa_metrics.c index 86f9a304..523f1e8a 100644 --- a/test/unit_homa_metrics.c +++ b/test/unit_homa_metrics.c @@ -24,21 +24,21 @@ FIXTURE_TEARDOWN(homa_metrics) TEST_F(homa_metrics, homa_metric_append) { homa_mout.length = 0; - homa_metric_append("x: %d, y: %d", 10, 20); - EXPECT_EQ(12, homa_mout.length); - EXPECT_STREQ("x: 10, y: 20", homa_mout.output); + homa_metric_append("metric1", 12345, "Description 1\n"); + EXPECT_EQ(200, homa_mout.capacity); + EXPECT_EQ(66, homa_mout.length); + EXPECT_STREQ("metric1 12345 Description 1\n", + homa_mout.output); - homa_metric_append(", z: %d", 12345); - EXPECT_EQ(22, homa_mout.length); - EXPECT_STREQ("x: 10, y: 20, z: 12345", homa_mout.output); - EXPECT_EQ(30, homa_mout.capacity); - - homa_metric_append(", q: %050d", 88); - EXPECT_EQ(77, homa_mout.length); - EXPECT_STREQ("x: 10, y: 20, z: 12345, q: 00000000000000000000000000000000000000000000000088", - homa_mout.output); - EXPECT_EQ(120, homa_mout.capacity); + homa_metric_append("value with long name", 8, "Value %d, value 2 %08d\n", + 16, 44); + EXPECT_EQ(400, homa_mout.capacity); + EXPECT_EQ(145, homa_mout.length); + EXPECT_STREQ("metric1 12345 Description 1\n" + "value with long name 8 Value 16, value 2 00000044\n", + homa_mout.output); } + TEST_F(homa_metrics, homa_metrics_open) { EXPECT_EQ(0, homa_metrics_open(NULL, NULL)); From f9d152ebe3384ff2cba557dbebfdc479bdf43526 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: 
Tue, 15 Jul 2025 13:56:32 -0700
Subject: [PATCH 400/625] Fix deadlock in homa_recvmsg

homa_rpc_reap was being invoked while holding an RPC lock. In
addition, specifying "reap all" seemed likely to cause problems
as well.
---
 homa_plumbing.c           | 19 ++++++++++---------
 test/unit_homa_plumbing.c | 14 ++++++++++++++
 2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/homa_plumbing.c b/homa_plumbing.c
index 217604c6..cb12326e 100644
--- a/homa_plumbing.c
+++ b/homa_plumbing.c
@@ -1320,16 +1320,9 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
 		rpc->state = RPC_IN_SERVICE;
 	}
 
-	if (test_bit(SOCK_NOSPACE, &hsk->sock.sk_socket->flags)) {
-		/* There are tasks waiting for tx memory, so reap
-		 * immediately.
-		 */
-		homa_rpc_reap(hsk, true);
-	}
-
 done:
-	/* Note: must release the RPC lock before copying results to user
-	 * space.
+	/* Note: must release the RPC lock before calling homa_rpc_reap
+	 * or copying results to user space.
 	 */
 	if (rpc) {
 		homa_rpc_put(rpc);
@@ -1337,6 +1330,14 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
 		/* Locked by homa_rpc_find_client or homa_wait_shared. */
 		homa_rpc_unlock(rpc);
 	}
+
+	if (test_bit(SOCK_NOSPACE, &hsk->sock.sk_socket->flags)) {
+		/* There are tasks waiting for tx memory, so reap
+		 * immediately.
+		 */
+		homa_rpc_reap(hsk, false);
+	}
+
 	if (unlikely(copy_to_user((__force void __user *)msg->msg_control,
 				  &control, sizeof(control)))) {
 #ifndef __UPSTREAM__ /* See strip.py */
diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c
index 039da384..dd98739b 100644
--- a/test/unit_homa_plumbing.c
+++ b/test/unit_homa_plumbing.c
@@ -919,6 +919,20 @@ TEST_F(homa_plumbing, homa_recvmsg__delete_server_rpc_after_error)
 	EXPECT_EQ(RPC_DEAD, srpc->state);
 	EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs));
 }
+TEST_F(homa_plumbing, homa_recvmsg__reap_because_of_SOCK_NOSPACE)
+{
+	struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG,
+			self->client_ip, self->server_ip, self->server_port,
+			self->client_id, 100, 2000);
+
+	EXPECT_NE(NULL, crpc);
+	EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs));
+
+	set_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags);
+	EXPECT_EQ(2000, homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr,
+			0, 0, &self->recvmsg_hdr.msg_namelen));
+	IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->reaper_calls));
+}
 TEST_F(homa_plumbing, homa_recvmsg__error_copying_out_args)
 {
 	struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG,

From 728cb4529ef9363b1eecdece408cb1be3a77194a Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Tue, 15 Jul 2025 14:29:02 -0700
Subject: [PATCH 401/625] Use tsc_khz instead of cpu_khz

Also, move the definition of tsc_khz for unit tests from timetrace.c
to mock.c
---
 homa_impl.h | 14 +++-----------
 test/mock.c |  2 ++
 timetrace.c | 10 +++-------
 3 files changed, 8 insertions(+), 18 deletions(-)

diff --git a/homa_impl.h b/homa_impl.h
index 638dea4c..1299ed4e 100644
--- a/homa_impl.h
+++ b/homa_impl.h
@@ -860,7 +860,7 @@ static inline u64 homa_clock_khz(void)
 	return 1000000;
 #else /* __UNIT_TEST__ */
 #ifdef CONFIG_X86_TSC
-	return cpu_khz;
+	return tsc_khz;
 #else
 	return 1000000;
 #endif /* CONFIG_X86_TSC */
@@ -878,15 +878,11 @@ static inline u64 homa_ns_to_cycles(u64 ns)
 #ifdef __UNIT_TEST__
 	return ns;
 #else /* __UNIT_TEST__ */
-#ifdef CONFIG_X86_TSC
 	u64 tmp;
 
-	tmp = ns * cpu_khz;
+	tmp = ns * homa_clock_khz();
 	do_div(tmp, 1000000);
 	return tmp;
-#else
-	return ns;
-#endif /* CONFIG_X86_TSC */
 #endif /*
__UNIT_TEST__ */ } @@ -901,15 +897,11 @@ static inline u64 homa_usecs_to_cycles(u64 usecs) #ifdef __UNIT_TEST__ return usecs * 1000; #else /* __UNIT_TEST__ */ -#ifdef CONFIG_X86_TSC u64 tmp; - tmp = usecs * cpu_khz; + tmp = usecs * homa_clock_khz(); do_div(tmp, 1000); return tmp; -#else - return usecs * 1000; -#endif /* CONFIG_X86_TSC */ #endif /* __UNIT_TEST__ */ } diff --git a/test/mock.c b/test/mock.c index 5e1af8c3..ee2cb3b7 100644 --- a/test/mock.c +++ b/test/mock.c @@ -182,6 +182,8 @@ int mock_num_clock_vals = 0; /* Used as the return value for tt_get_cycles. */ u64 mock_tt_cycles; +unsigned int tsc_khz = 1000000; + /* Indicates whether we should be simulation IPv6 or IPv4 in the * current test. Can be overridden by a test. */ diff --git a/timetrace.c b/timetrace.c index 38446cd2..201cca2c 100644 --- a/timetrace.c +++ b/timetrace.c @@ -92,10 +92,6 @@ int tt_pf_storage = TT_PF_BUF_SIZE; /* Set during tests to disable "cpu_khz" line in trace output. */ bool tt_test_no_khz; -#ifdef __UNIT_TEST__ -unsigned int cpu_khz = 1000000; -#endif - /** * tt_init(): Enable time tracing, create /proc file for reading traces. * @proc_file: Name of a file in /proc; this file can be read to extract @@ -365,7 +361,7 @@ int tt_proc_open(struct inode *inode, struct file *file) if (!tt_test_no_khz) { pf->bytes_available = snprintf(pf->msg_storage, TT_PF_BUF_SIZE, - "cpu_khz: %u\n", cpu_khz); + "cpu_khz: %u\n", tsc_khz); } done: @@ -605,7 +601,7 @@ void tt_print_file(char *path) bytes_used += snprintf(buffer + bytes_used, sizeof(buffer) - bytes_used, - "cpu_khz: %u\n", cpu_khz); + "cpu_khz: %u\n", tsc_khz); /* Each iteration of this loop printk's one event. */ while (true) { @@ -727,7 +723,7 @@ void tt_printk(void) oldest[i] = (pos[i] - 200) & (TT_BUF_SIZE - 1); } - pr_err("cpu_khz: %u, start: %llu\n", cpu_khz, start_time); + pr_err("cpu_khz: %u, start: %llu\n", tsc_khz, start_time); /* Each iteration of this loop printk's one event. 
*/ while (true) { From 46775dee5a153eeb22a498fc52ae842650030d3e Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 15 Jul 2025 15:23:48 -0700 Subject: [PATCH 402/625] Check for dead RPC in homa_grant_check_fifo --- homa_grant.c | 5 +++++ test/unit_homa_grant.c | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/homa_grant.c b/homa_grant.c index 3f447bec..44aab6b7 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -1003,6 +1003,11 @@ void homa_grant_check_fifo(struct homa_grant *grant) homa_rpc_hold(rpc); homa_grant_unlock(grant); homa_rpc_lock(rpc); + if (rpc->state == RPC_DEAD) { + homa_rpc_unlock(rpc); + homa_rpc_put(rpc); + return; + } homa_grant_cand_init(&cand); rpc->msgin.granted += grant->fifo_grant_increment; if (rpc->msgin.granted >= rpc->msgin.length) { diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index f810e864..14df6514 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -45,6 +45,19 @@ static void grant_check_stalled_hook(char *id) atomic_dec(&hook_grant->stalled_rank); } +static struct homa_rpc *hook_end_rpc; +static int hook_end_lock_count; +static void grant_spinlock_end_hook(char *id) +{ + if (strcmp(id, "spin_lock") != 0) + return; + if (hook_end_lock_count > 0) { + hook_end_lock_count--; + if (hook_end_lock_count == 0) + homa_rpc_end(hook_end_rpc); + } +} + FIXTURE(homa_grant) { struct in6_addr client_ip[5]; int client_port; @@ -1642,6 +1655,32 @@ TEST_F(homa_grant, homa_grant_check_fifo__no_suitable_rpc) EXPECT_EQ(NULL, self->homa.grant->oldest_rpc); EXPECT_STREQ("", unit_log_get()); } +TEST_F(homa_grant, homa_grant_check_fifo__rpc_dead) +{ + struct homa_rpc *rpc; + + mock_clock = 1000; + self->homa.grant->max_overcommit = 1; + self->homa.grant->fifo_grant_time = 0; + self->homa.grant->fifo_grant_increment = 20000; + self->homa.grant->fifo_fraction = 50; + + test_rpc_init(self, 100, self->server_ip, 30000); + rpc = test_rpc_init(self, 102, self->server_ip, 400000); + EXPECT_EQ(-1, rpc->msgin.rank); + EXPECT_EQ(0, rpc->msgin.granted); + self->homa.grant->oldest_rpc = rpc; + homa_rpc_hold(rpc); + hook_end_rpc = rpc; + hook_end_lock_count = 2; + unit_hook_register(grant_spinlock_end_hook); + + unit_log_clear(); + homa_grant_check_fifo(self->homa.grant); + EXPECT_EQ(0, rpc->msgin.granted); + EXPECT_EQ(0, homa_metrics_per_cpu()->fifo_grant_bytes); + EXPECT_EQ(RPC_DEAD, rpc->state); +} TEST_F(homa_grant, homa_grant_check_fifo__rpc_becomes_fully_granted_so_promote_another) { struct homa_rpc *rpc; From 2b60f2c3e6788d4d397d61e85c8624d6940b7d50 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 16 Jul 2025 16:01:57 -0700 Subject: [PATCH 403/625] Use kthread_should_stop() to control pacer thread exit This simplifies the code (eliminates the exit and kthread_done fields in struct homa_pacer) and conforms to expected Linux usage. 
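
For reference, the idiom this patch adopts looks roughly like the sketch
below (illustrative only: example_main, example_has_work, and struct
example_state are placeholder names, not code from homa_pacer.c).
kthread_stop() wakes the target thread with wake_up_process(), causes
kthread_should_stop() to return true, and then waits for the thread to
exit, so no separate exit flag or completion object is needed:

#include <linux/kthread.h>
#include <linux/wait.h>

/* Hypothetical per-thread state; not a Homa structure. */
struct example_state {
	wait_queue_head_t wait_queue;
	struct task_struct *kthread;
};

/* Placeholder for a real "is there work to do?" test. */
static bool example_has_work(struct example_state *state)
{
	return false;
}

static int example_main(void *arg)
{
	struct example_state *state = arg;

	while (!kthread_should_stop()) {
		/* ... process any available work here ... */

		/* kthread_stop() uses wake_up_process(), which wakes an
		 * interruptible sleep, so the wait condition only needs
		 * to recheck kthread_should_stop().
		 */
		wait_event_interruptible(state->wait_queue,
					 kthread_should_stop() ||
					 example_has_work(state));
	}
	/* Simply return: kthread_stop() waits for the thread to exit
	 * and collects this return value, so no completion is needed.
	 */
	return 0;
}

Shutdown then reduces to a single kthread_stop(state->kthread) call,
which is the simplification made in homa_pacer_free() below.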
--- homa_pacer.c | 11 +++-------- homa_pacer.h | 11 ----------- test/mock.c | 11 +++++++++++ test/mock.h | 1 + test/unit_homa_pacer.c | 4 ++-- 5 files changed, 17 insertions(+), 21 deletions(-) diff --git a/homa_pacer.c b/homa_pacer.c index 2b6f82a3..c14dde0d 100644 --- a/homa_pacer.c +++ b/homa_pacer.c @@ -62,7 +62,6 @@ struct homa_pacer *homa_pacer_alloc(struct homa *homa) pacer->fifo_fraction = 50; pacer->max_nic_queue_ns = 5000; pacer->throttle_min_bytes = 1000; - pacer->exit = false; init_waitqueue_head(&pacer->wait_queue); pacer->kthread = kthread_run(homa_pacer_main, pacer, "homa_pacer"); if (IS_ERR(pacer->kthread)) { @@ -70,7 +69,6 @@ struct homa_pacer *homa_pacer_alloc(struct homa *homa) pr_err("Homa couldn't create pacer thread: error %d\n", err); goto error; } - init_completion(&pacer->kthread_done); atomic64_set(&pacer->link_idle_time, homa_clock()); #ifndef __STRIP__ /* See strip.py */ @@ -98,7 +96,6 @@ struct homa_pacer *homa_pacer_alloc(struct homa *homa) */ void homa_pacer_free(struct homa_pacer *pacer) { - pacer->exit = true; #ifndef __STRIP__ /* See strip.py */ if (pacer->sysctl_header) { unregister_net_sysctl_table(pacer->sysctl_header); @@ -106,9 +103,7 @@ void homa_pacer_free(struct homa_pacer *pacer) } #endif /* See strip.py */ if (pacer->kthread) { - wake_up(&pacer->wait_queue); kthread_stop(pacer->kthread); - wait_for_completion(&pacer->kthread_done); pacer->kthread = NULL; } kfree(pacer); @@ -188,7 +183,7 @@ int homa_pacer_main(void *arg) int status; while (1) { - if (pacer->exit) + if (kthread_should_stop()) break; pacer->wake_time = homa_clock(); homa_pacer_xmit(pacer); @@ -206,12 +201,12 @@ int homa_pacer_main(void *arg) tt_record("pacer sleeping"); status = wait_event_interruptible(pacer->wait_queue, - pacer->exit || !list_empty(&pacer->throttled_rpcs)); + kthread_should_stop() || + !list_empty(&pacer->throttled_rpcs)); tt_record1("pacer woke up with status %d", status); if (status != 0 && status != -ERESTARTSYS) break; } - kthread_complete_and_exit(&pacer->kthread_done, 0); return 0; } diff --git a/homa_pacer.h b/homa_pacer.h index 67584ab3..d19e25da 100644 --- a/homa_pacer.h +++ b/homa_pacer.h @@ -110,12 +110,6 @@ struct homa_pacer { struct ctl_table_header *sysctl_header; #endif /* See strip.py */ - /** - * @exit: true means that the pacer thread should exit as - * soon as possible. - */ - bool exit; - /** * @wait_queue: Used to block the pacer thread when there * are no throttled RPCs. @@ -129,11 +123,6 @@ struct homa_pacer { */ struct task_struct *kthread; - /** - * @kthread_done: Used to wait for @kthread to exit. - */ - struct completion kthread_done; - /** * @link_idle_time: The homa_clock() time at which we estimate * that all of the packets we have passed to the NIC for transmission diff --git a/test/mock.c b/test/mock.c index ee2cb3b7..49ff3c0e 100644 --- a/test/mock.c +++ b/test/mock.c @@ -184,6 +184,11 @@ u64 mock_tt_cycles; unsigned int tsc_khz = 1000000; +/* True means that kthread_stop has been invoked for some thread, + * so kthread_should_stop should return true. + */ +bool mock_exit_thread; + /* Indicates whether we should be simulation IPv6 or IPv4 in the * current test. Can be overridden by a test. 
*/ @@ -962,9 +967,14 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), return &mock_task; } +bool kthread_should_stop(void) { + return mock_exit_thread; +} + int kthread_stop(struct task_struct *k) { unit_log_printf("; ", "kthread_stop"); + mock_exit_thread = true; return 0; } @@ -2124,6 +2134,7 @@ void mock_teardown(void) mock_next_clock_val = 0; mock_num_clock_vals = 0; mock_tt_cycles = 0; + mock_exit_thread = false; mock_ipv6 = mock_ipv6_default; mock_dst_check_errors = 0; mock_import_ubuf_errors = 0; diff --git a/test/mock.h b/test/mock.h index 9f508b33..2428c411 100644 --- a/test/mock.h +++ b/test/mock.h @@ -121,6 +121,7 @@ extern int mock_copy_to_user_dont_copy; extern int mock_copy_to_user_errors; extern int mock_cpu_idle; extern int mock_dst_check_errors; +extern bool mock_exit_thread; extern int mock_import_iovec_errors; extern int mock_import_ubuf_errors; extern int mock_ip6_xmit_errors; diff --git a/test/unit_homa_pacer.c b/test/unit_homa_pacer.c index f7562b6c..bd69d504 100644 --- a/test/unit_homa_pacer.c +++ b/test/unit_homa_pacer.c @@ -26,14 +26,14 @@ static struct homa_pacer *hook_pacer; static void exit_hook(char *id) { mock_clock += mock_clock_tick; if (mock_clock >= hook_exit_cycles) - hook_pacer->exit = true; + mock_exit_thread = true; } static void exit_idle_hook(char *id) { if (strcmp(id, "schedule") == 0) unit_log_printf("; ", "time %llu", mock_clock); if (list_empty(&hook_pacer->throttled_rpcs)) - hook_pacer->exit = true; + mock_exit_thread = true; } static void manage_hook(char *id) From a867522334f1b499709ae7523953bd14a6f4ebc1 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 16 Jul 2025 14:07:25 -0700 Subject: [PATCH 404/625] Partial work on homa_qdisc.c Not yet complete, but a fair amount of functionality has been implemented and unit-tested. --- Makefile | 1 + cloudlab/bin/config | 171 ++++++++--- cloudlab/bin/update_qdisc | 37 +++ homa_impl.h | 38 +++ homa_metrics.c | 2 + homa_metrics.h | 7 + homa_outgoing.c | 2 + homa_plumbing.c | 12 + homa_qdisc.c | 614 ++++++++++++++++++++++++++++++++++++++ homa_qdisc.h | 138 +++++++++ homa_utils.c | 4 + homa_wire.h | 9 +- test/Makefile | 4 +- test/mock.c | 131 +++++++- test/mock.h | 16 + test/unit_homa_qdisc.c | 451 ++++++++++++++++++++++++++++ 16 files changed, 1594 insertions(+), 43 deletions(-) create mode 100755 cloudlab/bin/update_qdisc create mode 100755 homa_qdisc.c create mode 100644 homa_qdisc.h create mode 100644 test/unit_homa_qdisc.c diff --git a/Makefile b/Makefile index 9a67a1e4..0e460059 100644 --- a/Makefile +++ b/Makefile @@ -20,6 +20,7 @@ else HOMA_OBJS += homa_grant.o \ homa_metrics.o \ homa_offload.o \ + homa_qdisc.o \ homa_skb.o endif diff --git a/cloudlab/bin/config b/cloudlab/bin/config index 4f50270e..02eb11ca 100755 --- a/cloudlab/bin/config +++ b/cloudlab/bin/config @@ -109,6 +109,20 @@ def get_cpu_type(): raise Exception("Couldn't find 'model name' line in /proc/cpuinfo") return cpu_type +def get_exp_ports(): + """ + Return a list containing all of the switch egress port numbers used by + nodes in the current CloudLab experiment. 
+    """
+
+    ports = []
+    for name in get_node_names():
+        match = re.match('[^0-9]+([0-9]+)', name)
+        if not match:
+            raise Exception("bad host name %s: expected number at end" % (name))
+        ports.append(int(match.group(1)) % 40)
+    return ports
+
 def get_interfaces():
     """
     Runs ifconfig and parses its output to identify the key network
@@ -121,7 +135,7 @@
     if interface:
         return [interface, vlan]
     available = ""
-    for line in subprocess.run(["ifconfig"], stdout=subprocess.PIPE,
+    for line in exec_cmd(["ifconfig"], stdout=subprocess.PIPE,
             encoding="utf-8", check=True).stdout.splitlines():
         match = re.match('^([a-z0-9]*):', line)
         if match:
@@ -156,7 +170,7 @@
     nic = get_interfaces()[0]
     num_channels = -1
-    for line in subprocess.run(["ethtool", nic], stdout=subprocess.PIPE,
+    for line in exec_cmd(["ethtool", nic], stdout=subprocess.PIPE,
             stderr=subprocess.STDOUT, encoding="utf-8",
             check=True).stdout.splitlines():
         match = re.match('.*Speed: ([0-9]+)Mb/s', line)
@@ -196,7 +210,7 @@
     global node_num
     if node_num != None:
         return node_num
-    hostname = subprocess.run(["hostname"], stdout=subprocess.PIPE,
+    hostname = exec_cmd(["hostname"], stdout=subprocess.PIPE,
             encoding="utf-8", check=True).stdout
     match = re.match(r'node([0-9]+)\.', hostname)
     if not match:
@@ -261,19 +275,37 @@
         f.close()
     return num_nodes
 
-def get_exp_ports():
+def get_qdisc_config():
     """
-    Return a list containing all of the switch egress port numbers used by
-    nodes in the current CloudLab experiment.
+    Returns a dictionary with the following keys, which describe the current
+    configuration of queuing disciplines for the current interface:
+    root_handle: The handle (e.g. "0:") of the root qdisc for the interface
+    children:    A list of dictionaries describing the queue-specific qdiscs:
+        type:    The type of the qdisc (e.g. fq_codel or homa)
+        handle:  The handle for that qdisc
+        queue:   The queue number that the qdisc is associated with
+                 (e.g.
if the parent handle is "1:4", the queue + will be 4) """ - ports = [] - for name in get_node_names(): - match = re.match('[^0-9]+([0-9]+)', name) - if not match: - raise Exception("bad host name %s: expected number at end" % (name)) - ports.append(int(match.group(1)) % 40) - return ports + result = {} + result['children'] = [] + nic = get_interfaces()[0] + for line in exec_cmd(['tc', 'qdisc', 'show', 'dev', nic], + stdout=subprocess.PIPE, encoding='utf-8', + check=True).stdout.splitlines(): + match = re.match('qdisc mq ([0-9a-f]+:[0-9a-f]*) root', line) + if match: + result['root_handle'] = match.group(1) + match = re.match('qdisc ([^ ]+) ([0-9a-f]+:[0-9a-f]*) ' + 'parent [^:]*:([0-9a-f]+)', line); + if match: + type = match.group(1) + handle = match.group(2) + queue = int(match.group(3), 16) + result['children'].append({'type': type, 'handle': handle, + 'queue': queue}) + return result def print_rss(): """ @@ -284,7 +316,7 @@ def print_rss(): nic = get_interfaces()[0] irqs = get_nic_irqs() num_channels = -1 - for line in subprocess.run(["ethtool", "-l", nic], stdout=subprocess.PIPE, + for line in exec_cmd(["ethtool", "-l", nic], stdout=subprocess.PIPE, encoding="utf-8", check=True).stdout.splitlines(): match = re.match('Combined:[^0-9]+([0-9]+)', line) if match: @@ -366,12 +398,21 @@ def add_ipv6_to_etc_hosts(num_hosts): input = "" for i in range(first, num_hosts): input += "fd00::%d node%d\n" % (i+1, i) - subprocess.run(["sudo", "bash", "-c", "cat >> /etc/hosts"], + exec_cmd(["sudo", "bash", "-c", "cat >> /etc/hosts"], input=input, encoding="utf-8", check=True) else: print("/etc/hosts already contains IPv6 addresses for nodes 0-%d" % ( num_hosts-1)) +def exec_cmd(*args, **kwargs): + """ + This method is a wrapper around subprocess.run, which logs the command + before executing it. The arguments are the same as those to + subprocess.run. + """ + print("%% %s" % (" ".join(args[0]))) + return subprocess.run(*args, **kwargs) + def set_sysctl(name, value): """ Set a Homa sysctl configuration option as given by name and value. @@ -380,7 +421,7 @@ def set_sysctl(name, value): if not sysctl_avl: return - subprocess.run(["sudo", "sysctl", ".net.homa.%s=%s" % (name, value)], + exec_cmd(["sudo", "sysctl", ".net.homa.%s=%s" % (name, value)], check=True) def config_homa(mod): @@ -389,15 +430,16 @@ def config_homa(mod): this node type. mod: the path to the Homa module '.ko' file """ + type = get_node_type() print("Installing Homa kernel module from %s" % (mod)) - subprocess.run(["sudo", "rmmod", "homa"], check=False) - subprocess.run(["sudo", "bash", "-c", "insmod %s" % (mod)], + exec_cmd(["sudo", "rmmod", "homa"], check=False) + exec_cmd(["sudo", "bash", "-c", "insmod %s" % (mod)], check=True) # See if Homa supports sysctls (if it has been stripped down for Linux # upstreaming, it might not). 
- result = subprocess.run(["sysctl", ".net.homa.num_priorities"], + result = exec_cmd(["sysctl", ".net.homa.num_priorities"], capture_output=True) if result.returncode != 0: global sysctl_avl @@ -407,20 +449,18 @@ def config_homa(mod): set_sysctl("num_priorities", "8") link_mbps = get_link_speed() set_sysctl ("link_mbps", str(link_mbps)) + set_sysctl("max_nic_queue_ns", "5000") if link_mbps == 10000: - set_sysctl("max_nic_queue_ns", "5000") set_sysctl("unsched_bytes", "30000") set_sysctl("window", "50000") set_sysctl("max_incoming", "400000") set_sysctl("max_gso_size", "10000") elif link_mbps == 25000: - set_sysctl("max_nic_queue_ns", "5000") set_sysctl("unsched_bytes", "60000") set_sysctl("window", "100000") set_sysctl("max_incoming", "480000") set_sysctl("max_gso_size", "10000") elif link_mbps == 100000: - set_sysctl("max_nic_queue_ns", "5000") set_sysctl("unsched_bytes", "60000") set_sysctl("window", "200000") set_sysctl("max_incoming", "1600000") @@ -449,14 +489,14 @@ def config_ipv6(num_hosts, vlan): """ vlan = get_interfaces()[1] # Configure ifconfig and route if not already done. - if "inet6 fd00::" in subprocess.run(["ifconfig", vlan], + if "inet6 fd00::" in exec_cmd(["ifconfig", vlan], stdout=subprocess.PIPE, encoding="utf-8", check=True).stdout: print("IPv6 already configured") else: print("Configuring IPv6:") - subprocess.run(["sudo", "ifconfig", vlan, "add", "fd00::%d/64" % ( + exec_cmd(["sudo", "ifconfig", vlan, "add", "fd00::%d/64" % ( get_node_num() + 1)], check=True) - subprocess.run(["sudo", "route", "-6", "add", "fd00::/16", vlan], + exec_cmd(["sudo", "route", "-6", "add", "fd00::/16", vlan], check=True) add_ipv6_to_etc_hosts(num_hosts) @@ -499,7 +539,7 @@ def config_lb(config): raise Exception('Bad load balancing config "%s"; must be ' 'xl170_default, gen2, gen3, or gen3_alt' % (config)) - subprocess.run(["sudo", "ethtool", "-L", get_interfaces()[0], "combined", + exec_cmd(["sudo", "ethtool", "-L", get_interfaces()[0], "combined", str(len(cores))], check=True) irqs = get_nic_irqs() @@ -516,12 +556,12 @@ def config_lb(config): softirq_msg = '' if softirq_cores: softirq_msg = ", SoftIRQ on cores %s" % (softirq_cores) - subprocess.run(["sudo", "sysctl", + exec_cmd(["sudo", "sysctl", ".net.homa.gen3_softirq_cores=%d %d %d %d" % (cores[i], softirq_cores[0], softirq_cores[1], softirq_cores[2])], check=True) - subprocess.run(["sudo", "bash", "-c", + exec_cmd(["sudo", "bash", "-c", "echo %d > /proc/irq/%s/smp_affinity_list" % (cores[i], irqs[i])], check=True) @@ -533,9 +573,9 @@ def config_mtu(size): Set the maximum allowable packet length for this node to size. """ [interface, vlan] = get_interfaces() - subprocess.run(["sudo", "ip", "link", "set", interface, "mtu", str(size)], + exec_cmd(["sudo", "ip", "link", "set", interface, "mtu", str(size)], check=True) - subprocess.run(["sudo", "ip", "link", "set", vlan, "mtu", str(size)], + exec_cmd(["sudo", "ip", "link", "set", vlan, "mtu", str(size)], check=True) print("MTU set to %d bytes" % (size)) @@ -545,9 +585,9 @@ def config_nic(): mechanisms). 
""" interface = get_interfaces()[0] - subprocess.run(["sudo", "ethtool", "-C", interface, "adaptive-rx", "off"], + exec_cmd(["sudo", "ethtool", "-C", interface, "adaptive-rx", "off"], check=False) - subprocess.run(["sudo", "ethtool", "-C", interface, "rx-usecs", "5", + exec_cmd(["sudo", "ethtool", "-C", interface, "rx-usecs", "5", "rx-frames", "1"], check=False) def config_power(): @@ -560,13 +600,59 @@ def config_power(): # are disabled, then so is Turbo mode, and that will hurt peak peformance. print("Configuring power settings for Intel CPUs") try: - subprocess.run(["sudo", "cpupower", "frequency-set", "-g", + exec_cmd(["sudo", "cpupower", "frequency-set", "-g", "performance"], check=True) except subprocess.CalledProcessError: print("*** cpupower error; ignoring for now") else: print("Skipping power settings (non-Intel CPU type)") +def config_qdisc(): + """ + Install Homa's queuing discipline for all of the queues of the NIC. + This will only work if the Homa version of tc is in the search path. + """ + + nic = get_interfaces()[0] + config = get_qdisc_config() + root = config['root_handle'] + if root == '0:': + # Must reset the root qdisc (it isn't possible to modify the + # default one) + exec_cmd(['sudo', 'tc', 'qdisc', 'add', 'dev', nic, 'root', 'handle', + '1:', 'mq'], check=True) + root = '1:' + + # Replace the qdisc for each NIC queue with a homa one. + for child in config['children']: + if child['handle'] != '0:': + exec_cmd(['sudo', 'tc', 'qdisc', 'del', 'dev', nic, + 'parent', '%s%x' % (root, child['queue']), 'handle', + child['handle']], check=True) + exec_cmd(['sudo', 'tc', 'qdisc', 'add', 'dev', nic, + 'parent', '%s%x' % (root, child['queue']), 'handle', + '%x:' % (64 + child['queue']), 'homa'], check=True) + +def config_reset_qdisc(): + """ + Reset the qdisc configuration by replacing any homa qdiscs with + fq_codel ones (this assumes that fq_codel is the default). 
+ """ + + nic = get_interfaces()[0] + config = get_qdisc_config() + root = config['root_handle'] + + for child in config['children']: + if child['type'] != 'homa': + continue + exec_cmd(['sudo', 'tc', 'qdisc', 'del', 'dev', nic, + 'parent', '%s%x' % (root, child['queue']), 'handle', + '%x:' % (64 + child['queue'])], check=True) + exec_cmd(['sudo', 'tc', 'qdisc', 'replace', 'dev', nic, + 'parent', '%s%x' % (root, child['queue']), 'handle', + '%x:' % (64 + child['queue']), 'fq_codel'], check=True) + def config_reset_switch_all_ports(): """ Reset the configuration of all egress ports at the top-of-rack switch @@ -594,17 +680,17 @@ def config_rps(): interface = get_interfaces()[0] mask = get_core_mask() - subprocess.run(["sudo", "sysctl", "-w", + exec_cmd(["sudo", "sysctl", "-w", "net.core.rps_sock_flow_entries=32768"], check=True) flow_cnt = 0 for file in glob("/sys/class/net/%s/queues/rx-*/rps_flow_cnt" % (interface)): - subprocess.run(["sudo", "bash", "-c", "echo 2048 > %s" % (file)], + exec_cmd(["sudo", "bash", "-c", "echo 2048 > %s" % (file)], check=True) flow_cnt += 1 cpus = 0 for file in glob("/sys/class/net/%s/queues/rx-*/rps_cpus" % (interface)): - subprocess.run(["sudo", "bash", "-c", "echo %s > %s" % (mask, file)], + exec_cmd(["sudo", "bash", "-c", "echo %s > %s" % (mask, file)], check=True) cpus += 1 print("Configured RPS and RFS: %d rps_flow_cnt files and %d rps_cpus files" @@ -647,7 +733,8 @@ def print_help(): print("\nEach feature may be one of the following:") print(" --help Print this help text and exit") print(" default Normal configuration for Homa: equivalent to") - print(" 'homa ~/bin/homa.ko ipv6 nic power rps'") + print(" 'reset_qdisc homa ~/bin/homa.ko ipv6 nic power") + print(" rps'") print(" ecn_threshold KB Set the ECN marking threshold for all ports in") print(" the experiment to KB (Kbytes)") print(" homa HHH Install and configure the Homa kernel driver;") @@ -665,6 +752,11 @@ def print_help(): print(" power Configure power management (e.g., C-states)") print(" for best Homa performance") print(" print_rss Print out current RSS configuration") + print(" qdisc Install Homa's queuing discipline for all") + print(" of the NIC tx queues") + print(" reset_qdisc Uninstall Homa's queuing discipline for any") + print(" NIC queues where it is installed, and install") + print(" fq_codel in its place") print(" reset_switch_all_ports Issue commands to TOR switch to restore original") print(" port settings for all ports on the switch") print(" (even those not used by current experiment)") @@ -692,6 +784,7 @@ while i < len(sys.argv): print_help() exit(0) elif arg == "default": + config_reset_qdisc() config_homa("~/bin/homa.ko") config_ipv6(get_num_nodes(), vlan) config_nic() @@ -735,6 +828,10 @@ while i < len(sys.argv): config_power() elif arg == "print_rss": print_rss() + elif arg == "qdisc": + config_qdisc() + elif arg == "reset_qdisc": + config_reset_qdisc() elif arg == "reset_switch_all_ports": config_reset_switch_all_ports() elif arg == "reset_switch_ports": diff --git a/cloudlab/bin/update_qdisc b/cloudlab/bin/update_qdisc new file mode 100755 index 00000000..7f4bae0d --- /dev/null +++ b/cloudlab/bin/update_qdisc @@ -0,0 +1,37 @@ +#!/bin/bash + +# Copyright (c) 2024 Homa Developers +# SPDX-License-Identifier: BSD-1-Clause + +# Update qdisc modules on one or more machines, based on information +# on the current machine. 
Must run this in the root Linux build directory
+#
+# Usage:
+# update_qdisc num_nodes [first]
+#
+# The "num_nodes" argument indicates how many nodes the command should
+# be run on (starting at node1). The "first" argument is optional; it is
+# an integer identifying the first node on which installation will occur
+# (e.g. "update_qdisc 4 2" means node2 through node5 will be updated).
+# "first" defaults to 1.
+
+v=`uname -r`
+#v=5.17.7+
+
+if [ $# -eq 2 ]; then
+    first=$2
+elif [ $# -eq 1 ]; then
+    first=1
+else
+    echo "Usage: update_qdisc num_nodes [first]"
+    exit 1
+fi
+last=`expr $first + $1 - 1`
+
+for ((i = $first ; i <= $last; i++)); do
+    node=node$i
+    echo
+    echo $node
+    rsync -rtv net/sched/sch_fq_codel.ko $node:sched/
+    ssh $node 'sudo rsync -rtv sched/ /lib/modules/`uname -r`/kernel/net/sched/'
+done
diff --git a/homa_impl.h b/homa_impl.h
index 1299ed4e..31b4a162 100644
--- a/homa_impl.h
+++ b/homa_impl.h
@@ -520,6 +520,19 @@ struct homa_net {
 	 * for this namespace. Managed by homa_peer.c under the peertab lock.
 	 */
 	int num_peers;
+
+#ifndef __STRIP__ /* See strip.py */
+	/**
+	 * @qdisc_devs: List of all homa_qdisc_dev objects that exist for
+	 * this namespace. Protected by qdisc_devs_lock.
+	 */
+	struct list_head qdisc_devs;
+
+	/**
+	 * @qdisc_devs_lock: Must hold when reading or writing @qdisc_devs.
+	 */
+	spinlock_t qdisc_devs_lock;
+#endif /* See strip.py */
 };
 
 /**
@@ -556,6 +569,31 @@ struct homa_skb_info {
 	 * this packet.
 	 */
 	int offset;
+
+	/**
+	 * @bytes_left: number of bytes in this packet and all later packets
+	 * in the same message. Used to prioritize packets for SRPT.
+	 */
+	int bytes_left;
+
+	/**
+	 * @rpc: RPC that this packet came from. Used only as a unique
+	 * identifier: it is not safe to dereference this pointer (the RPC
+	 * may no longer exist).
+	 */
+	void *rpc;
+
+	/**
+	 * @next_sibling: next packet in @rpc that has been deferred in
+	 * homa_qdisc because the NIC queue was too long, or NULL if none.
+	 */
+	struct sk_buff *next_sibling;
+
+	/**
+	 * @last_sibling: last packet in @next_sibling list. Only valid
+	 * for the "head" packet (which is in qdev->homa_deferred).
+	 */
+	struct sk_buff *last_sibling;
 };
 
 /**
diff --git a/homa_metrics.c b/homa_metrics.c
index 78ab3372..d6a960db 100644
--- a/homa_metrics.c
+++ b/homa_metrics.c
@@ -283,6 +283,8 @@ char *homa_metrics_print(void)
 	  "Time in homa_timer spent reaping RPCs\n");
 	M("data_pkt_reap_cycles", m->data_pkt_reap_cycles,
 	  "Time in homa_data_pkt spent reaping RPCs\n");
+	M("idle_time_conflicts", m->idle_time_conflicts,
+	  "Cache conflicts when updating link_idle_time\n");
 	M("pacer_cycles", m->pacer_cycles,
 	  "Time spent in homa_pacer_main\n");
 	M("homa_cycles",
diff --git a/homa_metrics.h b/homa_metrics.h
index b0b28a26..664915b7 100644
--- a/homa_metrics.h
+++ b/homa_metrics.h
@@ -367,6 +367,13 @@ struct homa_metrics {
 	 */
 	u64 data_pkt_reap_cycles;
 
+	/**
+	 * @idle_time_conflicts: total number of times that an update to
+	 * link_idle_time in homa_qdisc_update_link_idle failed because
+	 * of a conflicting access.
+	 */
+	u64 idle_time_conflicts;
+
 	/**
 	 * @pacer_cycles: total time spent executing in homa_pacer_main
 	 * (not including blocked time).
diff --git a/homa_outgoing.c b/homa_outgoing.c
index a433ea5d..2bd06cb9 100644
--- a/homa_outgoing.c
+++ b/homa_outgoing.c
@@ -188,6 +188,8 @@ struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc,
 	homa_info->data_bytes = length;
 	homa_info->seg_length = max_seg_data;
 	homa_info->offset = offset;
+	homa_info->bytes_left = rpc->msgout.length - offset;
+	homa_info->rpc = rpc;
 #ifndef __STRIP__ /* See strip.py */
 	if (segs > 1 && rpc->hsk->sock.sk_protocol != IPPROTO_TCP) {
diff --git a/homa_plumbing.c b/homa_plumbing.c
index cb12326e..502bf8ce 100644
--- a/homa_plumbing.c
+++ b/homa_plumbing.c
@@ -8,6 +8,7 @@
 #ifndef __STRIP__ /* See strip.py */
 #include "homa_grant.h"
 #include "homa_offload.h"
+#include "homa_qdisc.h"
 #endif /* See strip.py */
 #include "homa_pacer.h"
 #include "homa_peer.h"
@@ -481,6 +482,7 @@ int __init homa_load(void)
 	bool init_net_ops = false;
 	bool init_proto6 = false;
 	bool init_proto = false;
+	bool init_qdisc = false;
 	bool init_homa = false;
 	int status;
 
@@ -617,6 +619,13 @@ int __init homa_load(void)
 		goto error;
 	}
 	init_offload = true;
+
+	status = homa_qdisc_register();
+	if (status != 0) {
+		pr_err("Homa couldn't load its qdisc: error %d\n", status);
+		goto error;
+	}
+	init_qdisc = true;
 #endif /* See strip.py */
 	status = register_pernet_subsys(&homa_net_ops);
@@ -653,6 +662,8 @@ int __init homa_load(void)
 		wait_for_completion(&timer_thread_done);
 	}
 #ifndef __STRIP__ /* See strip.py */
+	if (init_qdisc)
+		homa_qdisc_unregister();
 	if (init_offload)
 		homa_offload_end();
 	if (init_sysctl)
@@ -695,6 +706,7 @@ void __exit homa_unload(void)
 		wake_up_process(timer_kthread);
 		wait_for_completion(&timer_thread_done);
 	}
+	homa_qdisc_unregister();
 	if (homa_offload_end() != 0)
 		pr_err("Homa couldn't stop offloads\n");
 	unregister_net_sysctl_table(homa_ctl_header);
diff --git a/homa_qdisc.c b/homa_qdisc.c
new file mode 100755
index 00000000..c7b2e151
--- /dev/null
+++ b/homa_qdisc.c
@@ -0,0 +1,614 @@
+// SPDX-License-Identifier: BSD-2-Clause
+
+/* This file implements a special-purpose queuing discipline for Homa.
+ * This queuing discipline serves the following purposes:
+ * - It paces output traffic so that queues do not build up in the NIC
+ *   (they build up here instead).
+ * - It implements the SRPT policy for Homa traffic (highest priority goes
+ *   to the message with the fewest bytes remaining to transmit).
+ * - It manages TCP traffic as well as Homa traffic, so that TCP doesn't
+ *   result in long NIC queues.
+ * - When queues do build up, it balances output traffic between Homa and TCP.
+ */
+
+#include "homa_impl.h"
+#include "homa_pacer.h"
+#include "homa_qdisc.h"
+#include "timetrace.h"
+
+#include
+#include
+
+static struct Qdisc_ops homa_qdisc_ops __read_mostly = {
+	.id = "homa",
+	.priv_size = sizeof(struct homa_qdisc),
+	.enqueue = homa_qdisc_enqueue,
+	.dequeue = qdisc_dequeue_head,
+	.peek = qdisc_peek_head,
+	.init = homa_qdisc_init,
+	.reset = qdisc_reset_queue,
+	.destroy = homa_qdisc_destroy,
+	.owner = THIS_MODULE,
+};
+
+/**
+ * homa_qdisc_register() - Invoked when the Homa module is loaded; makes
+ * the homa qdisc known to Linux.
+ * Return: 0 for success or a negative errno if an error occurred.
+ */
+int homa_qdisc_register(void)
+{
+	return register_qdisc(&homa_qdisc_ops);
+}
+
+/**
+ * homa_qdisc_unregister() - Invoked when the Homa module is about to be
+ * unloaded: deletes all information related to the homa qdisc.
+ */
+void homa_qdisc_unregister(void)
+{
+	unregister_qdisc(&homa_qdisc_ops);
+}
+
+/**
+ * homa_qdisc_init() - Initialize a new instance of this queuing discipline.
+ * @sch: Qdisc to initialize.
+ * @opt: Options for this qdisc; not currently used.
+ * @extack: For reporting detailed information relating to errors; not used.
+ * Return: 0 for success, otherwise a negative errno.
+ */
+int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt,
+		    struct netlink_ext_ack *extack)
+{
+	struct homa_qdisc *q = qdisc_priv(sch);
+	struct homa_qdisc_dev *qdev;
+	struct homa_net *hnet;
+	bool found = false;
+
+	hnet = homa_net_from_net(dev_net(sch->dev_queue->dev));
+	spin_lock_bh(&hnet->qdisc_devs_lock);
+	list_for_each_entry(qdev, &hnet->qdisc_devs, links) {
+		if (qdev->dev == sch->dev_queue->dev) {
+			found = true;
+			break;
+		}
+	}
+	if (!found) {
+		qdev = homa_qdisc_qdev_new(hnet, sch->dev_queue->dev);
+		if (IS_ERR(qdev)) {
+			spin_unlock_bh(&hnet->qdisc_devs_lock);
+			return PTR_ERR(qdev);
+		}
+	} else {
+		qdev->num_qdiscs++;
+	}
+	spin_unlock_bh(&hnet->qdisc_devs_lock);
+
+	q->qdev = qdev;
+	skb_queue_head_init(&q->queue);
+
+	sch->limit = 10*1024;
+	return 0;
+}
+
+/**
+ * homa_qdisc_qdev_new() - Allocate and initialize a new homa_qdisc_dev.
+ * @hnet: Network namespace for the homa_qdisc_dev.
+ * @dev: NIC that the homa_qdisc_dev will manage.
+ * Return: A pointer to the new homa_qdisc_dev, or a PTR_ERR errno.
+ */
+struct homa_qdisc_dev *homa_qdisc_qdev_new(struct homa_net *hnet,
+					   struct net_device *dev)
+	__must_hold(hnet->qdisc_devs_lock)
+{
+	struct homa_qdisc_dev *qdev;
+
+	qdev = kzalloc(sizeof(*qdev), GFP_ATOMIC);
+	if (!qdev)
+		return ERR_PTR(-ENOMEM);
+	qdev->dev = dev;
+	qdev->hnet = hnet;
+	qdev->num_qdiscs = 1;
+	homa_qdisc_update_sysctl(qdev);
+	INIT_LIST_HEAD(&qdev->links);
+	skb_queue_head_init(&qdev->homa_deferred);
+	skb_queue_head_init(&qdev->tcp_deferred);
+	init_waitqueue_head(&qdev->pacer_sleep);
+	spin_lock_init(&qdev->pacer_mutex);
+
+	qdev->pacer_kthread = kthread_run(homa_qdisc_pacer_main, qdev,
+					  "homa_qdisc_pacer");
+	if (IS_ERR(qdev->pacer_kthread)) {
+		int error = PTR_ERR(qdev->pacer_kthread);
+
+		pr_err("couldn't create homa qdisc pacer thread: error %d\n",
+		       error);
+		kfree(qdev);
+		return ERR_PTR(error);
+	}
+	list_add(&qdev->links, &hnet->qdisc_devs);
+	return qdev;
+}
+
+/**
+ * homa_qdisc_destroy() - This function is invoked to perform final cleanup
+ * before a qdisc is deleted.
+ * @sch: Qdisc that is being deleted.
+ */
+void homa_qdisc_destroy(struct Qdisc *sch)
+{
+	struct homa_qdisc *q = qdisc_priv(sch);
+	struct homa_qdisc_dev *qdev = q->qdev;
+
+	spin_lock_bh(&qdev->hnet->qdisc_devs_lock);
+	qdev->num_qdiscs--;
+	if (qdev->num_qdiscs == 0)
+		homa_qdisc_qdev_destroy(qdev);
+	spin_unlock_bh(&qdev->hnet->qdisc_devs_lock);
+}
+
+/**
+ * homa_qdisc_qdev_destroy() - Cleanup and release memory for a homa_qdisc_dev.
+ * @qdev: Object to destroy; its memory will be freed.
+ */
+void homa_qdisc_qdev_destroy(struct homa_qdisc_dev *qdev)
+	__must_hold(qdev->hnet->qdisc_devs_lock)
+{
+	kthread_stop(qdev->pacer_kthread);
+	qdev->pacer_kthread = NULL;
+
+	__list_del_entry(&qdev->links);
+	homa_qdisc_srpt_free(&qdev->homa_deferred);
+	skb_queue_purge(&qdev->tcp_deferred);
+	kfree(qdev);
+}
+
+/**
+ * homa_qdisc_enqueue() - Add a packet to the queue for this qdisc.
+ * @skb: Packet to enqueue.
+ * @sch: Qdisc on which to enqueue @skb.
+ * @to_free: Used when dropping packets.
+ */ +int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct homa_qdisc *q = qdisc_priv(sch); + struct homa_qdisc_dev *qdev = q->qdev; + struct homa *homa = qdev->hnet->homa; + int pkt_len; + + if (skb == q->qdev->pacer_skb) { + q->qdev->pacer_skb = NULL; + goto enqueue; + } + + /* The packet length computed by Linux didn't include overheads + * such as inter-frame gap; add that in here. + */ + pkt_len = qdisc_skb_cb(skb)->pkt_len + HOMA_ETH_FRAME_OVERHEAD; + if (pkt_len < homa->pacer->throttle_min_bytes) { + homa_qdisc_update_link_idle(q->qdev, pkt_len, -1); + goto enqueue; + } + + if (!is_homa_pkt(skb)) { + homa_qdisc_update_link_idle(q->qdev, pkt_len, -1); + goto enqueue; + } + + if (homa_qdisc_update_link_idle(q->qdev, pkt_len, + homa->pacer->max_nic_queue_cycles)) + goto enqueue; + + /* This packet needs to be deferred until the NIC queue has + * been drained a bit. + */ + homa_qdisc_srpt_enqueue(&qdev->homa_deferred, skb); + wake_up(&qdev->pacer_sleep); + +enqueue: + if (likely(sch->q.qlen < READ_ONCE(sch->limit))) + return qdisc_enqueue_tail(skb, sch); + return qdisc_drop(skb, sch, to_free); +} + +/** + * homa_qdisc_srpt_enqueue() - Add a Homa packet to an skb queue in SRPT + * priority order. + * @list: List on which to enqueue packet (usually &qdev->homa_deferred). + * @skb: Packet to enqueue. + */ +void homa_qdisc_srpt_enqueue(struct sk_buff_head *list, struct sk_buff *skb) +{ + struct homa_skb_info *info = homa_get_skb_info(skb); + struct sk_buff *other; + unsigned long flags; + + /* Tricky point: only one packet from an RPC may appear in + * qdev->homa_deferred at once (the earliest one in the message). + * If later packets from the same message were also in the queue, + * they would have higher priorities and would get transmitted + * first, which we don't want. So, if more than one packet from + * a message is waiting, only the first appears in qdev->homa_deferred; + * the others are queued up using links in the homa_skb_info of + * the first packet. + * + * This also means that we must scan the list starting at the + * low-priority end, so we'll notice if there is an earlier + * (lower priority) packet for the same RPC already in the list. + */ + + info->next_sibling = NULL; + info->last_sibling = NULL; + spin_lock_irqsave(&list->lock, flags); + if (skb_queue_empty(list)) { + __skb_queue_head(list, skb); + goto done; + } + skb_queue_reverse_walk(list, other) { + struct homa_skb_info *other_info = homa_get_skb_info(other); + + if (other_info->rpc == info->rpc) { + if (!other_info->last_sibling) + other_info->next_sibling = skb; + else + homa_get_skb_info(other_info->last_sibling)-> + next_sibling = skb; + other_info->last_sibling = skb; + break; + } + + if (other_info->bytes_left <= info->bytes_left) { + __skb_queue_after(list, other, skb); + break; + } + + if (skb_queue_is_first(list, other)) { + __skb_queue_head(list, skb); + break; + } + } + +done: + spin_unlock_irqrestore(&list->lock, flags); +} + +/** + * homa_qdisc_srpt_dequeue() - Remove the frontmost packet from a list that + * is managed with SRPT priority. + * @list: List from which to remove packet. + * Return: The frontmost packet from the list, or NULL if the list was empty. + */ +struct sk_buff *homa_qdisc_srpt_dequeue(struct sk_buff_head *list) +{ + struct homa_skb_info *sibling_info; + struct sk_buff *skb, *sibling; + struct homa_skb_info *info; + unsigned long flags; + + /* The only tricky element about this function is that skb may + * have a sibling list. 
If so, we need to enqueue the next
+	 * sibling.
+	 */
+	spin_lock_irqsave(&list->lock, flags);
+	if (skb_queue_empty(list)) {
+		spin_unlock_irqrestore(&list->lock, flags);
+		return NULL;
+	}
+	skb = list->next;
+	__skb_unlink(skb, list);
+	info = homa_get_skb_info(skb);
+	if (info->next_sibling) {
+		/* This is a "compound" packet, containing multiple
+		 * packets from the same RPC. Put the next packet
+		 * back on the list at the front (it should have even
+		 * higher priority than skb, since it is later in the
+		 * message).
+		 */
+		sibling = info->next_sibling;
+		sibling_info = homa_get_skb_info(sibling);
+		sibling_info->last_sibling = info->last_sibling;
+		__skb_queue_head(list, sibling);
+	}
+
+	spin_unlock_irqrestore(&list->lock, flags);
+	return skb;
+}
+
+/**
+ * homa_qdisc_srpt_free() - Free all of the packets on @list,
+ * including siblings that are nested inside packets on the list.
+ * @list: List containing packets to free, which is managed
+ *        by homa_qdisc_srpt_enqueue and homa_qdisc_srpt_dequeue;
+ *        it will be empty on return.
+ */
+void homa_qdisc_srpt_free(struct sk_buff_head *list)
+{
+	struct sk_buff *skb;
+
+	while (1) {
+		skb = homa_qdisc_srpt_dequeue(list);
+		if (!skb)
+			break;
+		kfree_skb(skb);
+	}
+}
+
+/**
+ * homa_qdisc_update_link_idle() - This function is invoked before transmitting
+ * a packet. If the current NIC queue length is no more than @max_queue_cycles
+ * then it updates @qdev->link_idle_time to include @bytes; otherwise it does
+ * nothing.
+ * @qdev: Information about the device.
+ * @bytes: Size of a packet that is about to be transmitted;
+ *        includes all headers out through the Ethernet header,
+ *        but not additional overhead such as CRC and gap
+ *        between packets.
+ * @max_queue_cycles: If it will take longer than this amount of time for
+ *        previously queued bytes to be transmitted, then don't
+ *        update @qdev->link_idle_time. A negative value means
+ *        any length queue is OK.
+ * Return: Nonzero if @qdev->link_idle_time was updated, zero
+ *        if the queue was too long.
+ */
+int homa_qdisc_update_link_idle(struct homa_qdisc_dev *qdev, int bytes,
+				int max_queue_cycles)
+{
+	u64 idle, new_idle, clock, cycles_for_packet;
+
+	cycles_for_packet = qdev->cycles_per_mibyte;
+	cycles_for_packet = (cycles_for_packet *
+			(bytes + HOMA_ETH_FRAME_OVERHEAD)) >> 20;
+
+	/* The following loop may be executed multiple times if there
+	 * are conflicting updates to qdev->link_idle_time.
+	 */
+	while (1) {
+		clock = homa_clock();
+		idle = atomic64_read(&qdev->link_idle_time);
+		if (idle < clock) {
+			new_idle = clock + cycles_for_packet;
+		} else {
+			if (max_queue_cycles >= 0 && (idle - clock) >
+					max_queue_cycles)
+				return 0;
+			new_idle = idle + cycles_for_packet;
+		}
+
+		if (atomic64_cmpxchg_relaxed(&qdev->link_idle_time, idle,
+					     new_idle) == idle)
+			break;
+		INC_METRIC(idle_time_conflicts, 1);
+	}
+	return 1;
+}
+
+/**
+ * homa_qdisc_pacer_main() - Top-level function for a device-specific
+ * thread that is responsible for transmitting deferred packets on that
+ * device.
+ * @device: Pointer to a struct homa_qdisc_dev.
+ * Return: Always 0.
+ */ +int homa_qdisc_pacer_main(void *device) +{ + struct homa_qdisc_dev *qdev = device; + int status; + u64 start; + + while (1) { + if (kthread_should_stop()) + break; + start = homa_clock(); + homa_qdisc_pacer(qdev); + INC_METRIC(pacer_cycles, homa_clock() - start); + + if (!skb_queue_empty(&qdev->homa_deferred) || + !skb_queue_empty(&qdev->tcp_deferred)) { + /* There are more packets to transmit (the NIC queue + * must be full); call the pacer again, but first + * give other threads a chance to run (otherwise + * low-level packet processing such as softirq could + * starve). + */ + schedule(); + continue; + } + + tt_record("homa_qdisc pacer sleeping"); + status = wait_event_interruptible(qdev->pacer_sleep, + kthread_should_stop() || + !skb_queue_empty(&qdev->homa_deferred) || + !skb_queue_empty(&qdev->tcp_deferred)); + tt_record1("homa_qdisc pacer woke up with status %d", status); + if (status != 0 && status != -ERESTARTSYS) + break; + } + return 0; +} + +/** + * homa_qdisc_pacer() - Transmit a few packets from the homa_deferred and + * tcp_deferred lists while keeping NIC queue short. There may still be + * deferred packets when this function returns. + * + * Note: this function may be invoked from places other than + * homa_qdisc_pacer_main. The reason for this is that (as of 10/2019) + * Linux's thread scheduler is unpredictable and could neglect the thread + * for long periods of time (e.g., because it is assigned to the same + * CPU as a busy interrupt handler). This can result in poor utilization + * of the network link. So, this method gets invoked from other places as + * well, to increase the likelihood that we keep the link busy. Those other + * invocations are not guaranteed to happen, so the pacer thread provides a + * backstop. + * @homa: Overall data about the Homa protocol implementation. + */ +void homa_qdisc_pacer(struct homa_qdisc_dev *qdev) +{ + int i; + + /* Make sure only one instance of this function executes at a + * time. + */ + if (!spin_trylock_bh(&qdev->pacer_mutex)) + return; + + /* Each iteration through the following loop sends one packet. We + * limit the number of passes through this loop in order to cap the + * time spent in one call to this function (see note in + * homa_qdisc_pacer_main about interfering with softirq handlers). + */ + for (i = 0; i < 5; i++) { + struct sk_buff *skb; + u64 idle_time, now; + + /* If the NIC queue is too long, wait until it gets shorter. */ + now = homa_clock(); + idle_time = atomic64_read(&qdev->link_idle_time); + while ((now + qdev->hnet->homa->pacer->max_nic_queue_cycles) < + idle_time) { + /* If we've xmitted at least one packet then + * return (this helps with testing and also + * allows homa_qdisc_pacer_main to yield the core). + */ + if (i != 0) + goto done; + now = homa_clock(); + } + + /* Note: when we get here, it's possible that the NIC queue is + * still too long because other threads have queued packets, + * but we transmit anyway so the pacer thread doesn't starve. + */ + skb = homa_qdisc_srpt_dequeue(&qdev->homa_deferred); + if (!skb) + break; + homa_qdisc_update_link_idle(qdev, qdisc_skb_cb(skb)->pkt_len, + -1); + + /* Resubmit the packet. Concentrate all of the (long) + * resubmitted packets on device queue 0, in order + * to reduce contention between them and short packets + * on other queues. 
+ */ + qdev->pacer_skb = skb; + homa_qdisc_resubmit_skb(skb, qdev->dev, 0); + qdev->pacer_skb = NULL; + } +done: + spin_unlock_bh(&qdev->pacer_mutex); +} + +/** + * homa_qdisc_resubmit_skb() - This function is called by the pacer to + * restart the transmission of an skb that was deferred because of NIC + * queue length. The packet may be dropped under various error conditions. + * @skb: Packet to resubmit. + * @dev: Network device to which the packet should be resubmitted. + * @queue: Index of desired tx queue on @dev. + * Return: Zero for success, otherwise a negative errno. + */ +void homa_qdisc_resubmit_skb(struct sk_buff *skb, struct net_device *dev, + int queue) +{ + /* The code of this function was extracted from __dev_xmit_skb + * (with RCU lock/unlock from __dev_queue_xmit). Ideally this + * module would simply invoke __dev_xmit_skb, but it isn't + * globally available. + */ + struct sk_buff *to_free = NULL; + struct netdev_queue *txq; + spinlock_t *root_lock; + struct Qdisc *q; + bool contended; + + rcu_read_lock_bh(); + txq = netdev_get_tx_queue(dev, 0); + q = rcu_dereference_bh(txq->qdisc); + root_lock = qdisc_lock(q); + + contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT); + if (unlikely(contended)) + spin_lock(&q->busylock); + + spin_lock(root_lock); + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { + __qdisc_drop(skb, &to_free); + } else { + WRITE_ONCE(q->owner, smp_processor_id()); + q->enqueue(skb, q, &to_free); + WRITE_ONCE(q->owner, -1); + if (qdisc_run_begin(q)) { + if (unlikely(contended)) { + spin_unlock(&q->busylock); + contended = false; + } + // __qdisc_run(q); + qdisc_run_end(q); + } + } + spin_unlock(root_lock); + if (unlikely(to_free)) + kfree_skb_list_reason(to_free, + tcf_get_drop_reason(to_free)); + if (unlikely(contended)) + spin_unlock(&q->busylock); + rcu_read_unlock_bh(); +} + +/** + * homa_qdisc_update_sysctl() - Recompute information in a homa_qdisc_dev + * that depends on sysctl parameters. + * @homa: Used to fetch current sysctl parameter values. + * @qdev: Update information here that depends on sysctl values. + */ +void homa_qdisc_update_sysctl(struct homa_qdisc_dev *qdev) +{ + struct ethtool_link_ksettings ksettings; + struct homa *homa = qdev->hnet->homa; + const struct ethtool_ops *ops; + u64 tmp; + + qdev->link_mbps = homa->link_mbps; + ops = qdev->dev->ethtool_ops; + if (ops && ops->get_link_ksettings) { + if (ops->get_link_ksettings(qdev->dev, &ksettings) == 0) + qdev->link_mbps = ksettings.base.speed; + } + + /* Underestimate link bandwidth (overestimate time) by 1%. + * + * cycles/sec + * cycles/mibyte = (101/100) * ------------- + * mibytes/sec + * + * 101 * homa_clock_khz() * 1000 + * = --------------------------------------- + * 100 * link_mbps * (1<<20 / 1000000) / 8 + * + * 8 * 1010 * homa_clock_khz() 1<<20 + * = ----------------------------- * --------- + * link_mbps 1000000 + */ + tmp = 8ULL * 1010; + tmp *= homa_clock_khz(); + do_div(tmp, qdev->link_mbps); + tmp <<= 20; + do_div(tmp, 1000000); + qdev->cycles_per_mibyte = tmp; +} + +/** + * homa_qdisc_update_all_sysctl() - Invoked whenever a sysctl value is changed; + * updates all qdisc structures to reflect new values. + * @homa: Overall data about the Homa protocol implementation. 
+ */ +void homa_qdisc_update_all_sysctl(struct homa_net *hnet) +{ + struct homa_qdisc_dev *qdev; + + spin_lock_bh(&hnet->qdisc_devs_lock); + list_for_each_entry(qdev, &hnet->qdisc_devs, links) + homa_qdisc_update_sysctl(qdev); + spin_unlock_bh(&hnet->qdisc_devs_lock); +} diff --git a/homa_qdisc.h b/homa_qdisc.h new file mode 100644 index 00000000..8f16a627 --- /dev/null +++ b/homa_qdisc.h @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ + +/* This file contains definitions related to Homa's special-purpose + * queuing discipline + */ + +#ifdef __UNIT_TEST__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" +#endif /* __UNIT_TEST__*/ +#include +#ifdef __UNIT_TEST__ +#pragma GCC diagnostic pop +#endif /* __UNIT_TEST__*/ + +#ifndef _HOMA_QDISC_H +#define _HOMA_QDISC_H + +/** + * struct homa_qdisc - Contains Homa-specific data for a single instance of + * the homa queuing discipline + */ +struct homa_qdisc { + /** @dev: Info shared among all qdiscs for a net_device. */ + struct homa_qdisc_dev *qdev; + + /** @queue: Packets waiting to be transmitted. */ + struct sk_buff_head queue; +}; + +/** + * struct homa_qdisc_dev - Contains information shared across all of the + * homa_qdiscs associated with a net_device. + */ +struct homa_qdisc_dev { + /** @dev: Device common to all qdiscs using this struct. */ + struct net_device *dev; + + /** + * @homa_net: Homa's information about the network namesapce + * this object belongs to. + */ + struct homa_net *hnet; + + /** + * @num_qdiscs: Number of homa_qdisc objects referencing this struct. + * Access only when holding homa->qdisc_devs_lock. + */ + int num_qdiscs; + + /** @link_mbps: Speed of the link associated with @dev, in Mbps. */ + int link_mbps; + + /** + * @cycles_per_mibyte: The number of homa_clock cycles that it takes + * to transmit 2**20 bytes on the link associated with @dev; computed + * from @link_mbps. This is actually a slight overestimate (if we + * underestimate, the link queue could grow without bound during + * periods of high traffic). + */ + int cycles_per_mibyte; + + /** + * @link_idle_time: The time, measured by homa_clock, at which we + * estimate that all of the packets passed to @dev will have been + * transmitted. May be in the past. + */ + atomic64_t link_idle_time __aligned(L1_CACHE_BYTES); + + /** @links: Used to link this struct into homa->qdisc_devs. */ + struct list_head links; + + /** + * @homa_deferred: Homa packets whose transmission was deferred + * because the NIC queue was too long. The queue is in SRPT order. + */ + struct sk_buff_head homa_deferred; + + /** + * @tcp_deferred: TCP packets whose transmission was deferred + * because the NIC queue was too long. The queue is in order of + * packet arrival at the qdisc. + */ + struct sk_buff_head tcp_deferred; + + /** + * @pacer_kthread: Kernel thread that eventually transmits packets + * on homa_deferred and tcp_deferred. + */ + struct task_struct *pacer_kthread; + + /** + * @pacer_sleep: Used to block the pacer thread when there + * are no throttled RPCs. + */ + struct wait_queue_head pacer_sleep; + + /** + * @pacer_mutex: Ensures that only one instance of + * homa_qdisc_pacer runs at a time. Only used in "try" mode: + * never block on this. + */ + spinlock_t pacer_mutex __aligned(L1_CACHE_BYTES); + + /** + * @pacer_skb: The current skb that the pacer has selected for + * transmission and is pushing through __dev_xmit_skb. This skb + * should be transmitted without any further delay or accounting. 
+ */ + struct sk_buff *pacer_skb; +}; + +void homa_qdisc_destroy(struct Qdisc *sch); +int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free); +int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack); +int homa_qdisc_pacer_main(void *device); +void homa_qdisc_qdev_destroy(struct homa_qdisc_dev *qdev); +struct homa_qdisc_dev * + homa_qdisc_qdev_new(struct homa_net *hnet, + struct net_device *dev); +int homa_qdisc_register(void); +void homa_qdisc_resubmit_skb(struct sk_buff *skb, + struct net_device *dev, int queue); +void homa_qdisc_srpt_enqueue(struct sk_buff_head *list, + struct sk_buff *skb); +struct sk_buff * + homa_qdisc_srpt_dequeue(struct sk_buff_head *list); +void homa_qdisc_srpt_free(struct sk_buff_head *list); +void homa_qdisc_unregister(void); +void homa_qdisc_update_all_sysctl(struct homa_net *hnet); +int homa_qdisc_update_link_idle(struct homa_qdisc_dev *qdev, + int bytes, int max_queue_ns); +void homa_qdisc_update_sysctl(struct homa_qdisc_dev *qdev); +void homa_qdisc_pacer(struct homa_qdisc_dev *qdev); + +#endif /* _HOMA_QDISC_H */ \ No newline at end of file diff --git a/homa_utils.c b/homa_utils.c index a83150ee..14a6b2e2 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -165,6 +165,10 @@ int homa_net_init(struct homa_net *hnet, struct net *net, struct homa *homa) hnet->net = net; hnet->homa = homa; hnet->prev_default_port = HOMA_MIN_DEFAULT_PORT - 1; +#ifndef __STRIP__ /* See strip.py */ + INIT_LIST_HEAD(&hnet->qdisc_devs); + spin_lock_init(&hnet->qdisc_devs_lock); +#endif /* See strip.py */ return 0; } diff --git a/homa_wire.h b/homa_wire.h index 2c386fed..f4296c15 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -52,11 +52,18 @@ enum homa_packet_type { */ #define HOMA_SKB_EXTRA MAX_TCP_HEADER +/** + * define HOMA_ETH_FRAME_OVERHEAD - Additional overhead bytes for each + * Ethernet packet that are not included in the packet header (preamble, + * start frame delimiter, CRC, and inter-packet gap). + */ +#define HOMA_ETH_FRAME_OVERHEAD 24 + /** * define HOMA_ETH_OVERHEAD - Number of bytes per Ethernet packet for Ethernet * header, CRC, preamble, and inter-packet gap. 
*/ -#define HOMA_ETH_OVERHEAD 42 +#define HOMA_ETH_OVERHEAD (18 + HOMA_ETH_FRAME_OVERHEAD) /** * define HOMA_MIN_PKT_LENGTH - Every Homa packet must be padded to at least diff --git a/test/Makefile b/test/Makefile index 0b04acf8..cafa34ae 100644 --- a/test/Makefile +++ b/test/Makefile @@ -54,8 +54,9 @@ TEST_SRCS := unit_homa_incoming.c \ unit_timetrace.c ifeq ($(__STRIP__),) TEST_SRCS += unit_homa_grant.c \ - unit_homa_offload.c \ unit_homa_metrics.c \ + unit_homa_offload.c \ + unit_homa_qdisc.c \ unit_homa_skb.c endif TEST_OBJS := $(patsubst %.c,%.o,$(TEST_SRCS)) @@ -68,6 +69,7 @@ HOMA_SRCS := homa_devel.c \ homa_peer.c \ homa_pool.c \ homa_plumbing.c \ + homa_qdisc.c \ homa_rpc.c \ homa_sock.c \ homa_timer.c \ diff --git a/test/mock.c b/test/mock.c index 49ff3c0e..a381d64d 100644 --- a/test/mock.c +++ b/test/mock.c @@ -8,6 +8,7 @@ #include "homa_impl.h" #include "homa_pool.h" #ifndef __STRIP__ /* See strip.py */ +#include "homa_qdisc.h" #include "homa_skb.h" #endif /* See strip.py */ #include "ccutils.h" @@ -34,11 +35,13 @@ extern void *memcpy(void *dest, const void *src, size_t n); */ int mock_alloc_page_errors; int mock_alloc_skb_errors; +int mock_cmpxchg_errors; int mock_copy_data_errors; int mock_copy_to_iter_errors; int mock_copy_to_user_errors; int mock_cpu_idle; int mock_dst_check_errors; +int mock_ethtool_ksettings_errors; int mock_import_ubuf_errors; int mock_import_iovec_errors; int mock_ip6_xmit_errors; @@ -47,6 +50,7 @@ int mock_kmalloc_errors; int mock_kthread_create_errors; int mock_prepare_to_wait_errors; int mock_register_protosw_errors; +int mock_register_qdisc_errors; int mock_register_sysctl_errors; int mock_rht_init_errors; int mock_rht_insert_errors; @@ -121,6 +125,11 @@ static struct unit_hash *proc_files_in_use; */ static struct unit_hash *pages_in_use; +/* Number of qdiscs that have been registered but not yet unregistered + * during the current test. Reset for each test. + */ +static int registered_qdiscs; + /* Keeps track of all the results returned by ip_route_output_flow that * have not yet been freed. Reset for each test. */ @@ -238,15 +247,20 @@ __u16 mock_min_default_port = 0x8000; static struct socket mock_socket; #define MOCK_MAX_NETS 10 -static struct net mock_nets[MOCK_MAX_NETS]; -static struct homa_net mock_hnets[MOCK_MAX_NETS]; -static int mock_num_hnets; +struct net mock_nets[MOCK_MAX_NETS]; +struct homa_net mock_hnets[MOCK_MAX_NETS]; +int mock_num_hnets; /* Nonzero means don't generate a unit test failure when freeing peers * if the reference count isn't zero (log a message instead). */ int mock_peer_free_no_fail; +/* Link speed to return from mock_get_link_ksettings. 
*/ +int mock_link_mbps = 10000; + +struct ethtool_ops mock_ethtool_ops = + {.get_link_ksettings = mock_get_link_ksettings}; struct dst_ops mock_dst_ops = { .mtu = mock_get_mtu, .check = mock_dst_check}; @@ -255,7 +269,8 @@ struct net_device mock_net_device = { .gso_max_segs = 1000, .gso_max_size = 0, ._tx = &mock_net_queue, - .nd_net = {.net = &mock_nets[0]} + .nd_net = {.net = &mock_nets[0]}, + .ethtool_ops = &mock_ethtool_ops }; const struct net_offload *inet_offloads[MAX_INET_PROTOS]; const struct net_offload *inet6_offloads[MAX_INET_PROTOS]; @@ -906,6 +921,16 @@ void __kfree_skb(struct sk_buff *skb) free(skb); } +void kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason) +{ + while (segs) { + struct sk_buff *next = segs->next; + + __kfree_skb(segs); + segs = next; + } +} + void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size) { return mock_kmalloc(size, gfpflags); @@ -1068,6 +1093,9 @@ int netif_receive_skb(struct sk_buff *skb) return 0; } +void __netif_schedule(struct Qdisc *q) +{} + void preempt_count_add(int val) { int i; @@ -1203,6 +1231,12 @@ void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) mock_total_spin_locks++; } +unsigned long _raw_spin_lock_irqsave(raw_spinlock_t *lock) +{ + mock_record_locked(lock); + return 1234; +} + void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, struct lock_class_key *key, short inner) {} @@ -1234,6 +1268,15 @@ void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) mock_record_unlocked(lock); } +void _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, + unsigned long flags) +{ + if (flags != 1234) + FAIL("incorrect flags %ld returned to %sa (expected 1234)", + flags, __func__); + mock_record_unlocked(lock); +} + int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) { UNIT_HOOK("spin_lock"); @@ -1283,6 +1326,14 @@ int register_pernet_subsys(struct pernet_operations *) return 0; } +int register_qdisc(struct Qdisc_ops *qops) +{ + if (mock_check_error(&mock_register_qdisc_errors)) + return -EINVAL; + registered_qdiscs++; + return 0; +} + void release_sock(struct sock *sk) { mock_active_locks--; @@ -1293,6 +1344,25 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) {} +int rtnl_is_locked(void) { + return 0; +} + +void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail) +{ + if (!head || !tail) + return; + + while (true) { + struct sk_buff *next = head->next; + + __kfree_skb(head); + if (head == tail) + break; + head = next; + } +} + void schedule(void) { UNIT_HOOK("schedule"); @@ -1505,6 +1575,11 @@ void unregister_net_sysctl_table(struct ctl_table_header *header) void unregister_pernet_subsys(struct pernet_operations *) {} +void unregister_qdisc(struct Qdisc_ops *qops) +{ + registered_qdiscs--; +} + void vfree(const void *block) { if (!vmallocs_in_use || unit_hash_get(vmallocs_in_use, block) == NULL) { @@ -1615,6 +1690,17 @@ int mock_check_error(int *errorMask) return result; } +/** + * mock_cmpxchg() - Replacement for atomic64_cmpxchg_relaxed. + */ +s64 mock_cmpxchg(atomic64_t *target, s64 old, s64 new) +{ + if (mock_check_error(&mock_cmpxchg_errors)) + return old+1; + atomic64_set(target, new); + return old; +} + /** * mock_clear_xmit_prios() - Remove all information from the list of * transmit priorities. 
@@ -1704,6 +1790,16 @@ void mock_get_page(struct page *page) unit_hash_set(pages_in_use, page, (void *) (ref_count+1)); } +int mock_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *settings) +{ + if (mock_check_error(&mock_ethtool_ksettings_errors)) + return -EOPNOTSUPP; + memset(settings, 0, sizeof(*settings)); + settings->base.speed = mock_link_mbps; + return 0; +} + void *mock_net_generic(const struct net *net, unsigned int id) { struct homa_net *hnet; @@ -1777,6 +1873,24 @@ void mock_put_page(struct page *page) } } +/** + * mock_qdisc_new() - Allocate and initialize a new Qdisc suitable for + * use in unit tests as a homa qdisc. + * Return: The new Qdisc. The memory is dynamically allocated and must + * be kfree-d by the caller. homa_qdisc_init has not been invoked on + * this Qdisc yet. + */ +struct Qdisc *mock_qdisc_new(struct netdev_queue *dev_queue) +{ + struct Qdisc *sch; + + sch = kmalloc(sizeof(struct Qdisc) + sizeof(struct homa_qdisc), + GFP_ATOMIC); + sch->dev_queue = dev_queue; + mock_net_queue.dev = &mock_net_device; + return sch; +} + /** * mock_rcu_read_lock() - Called instead of rcu_read_lock when Homa is compiled * for unit testing. @@ -2124,6 +2238,7 @@ void mock_teardown(void) pcpu_hot.current_task = &mock_task; mock_alloc_page_errors = 0; mock_alloc_skb_errors = 0; + mock_cmpxchg_errors = 0; mock_copy_data_errors = 0; mock_copy_to_iter_errors = 0; mock_copy_to_user_errors = 0; @@ -2134,6 +2249,7 @@ void mock_teardown(void) mock_next_clock_val = 0; mock_num_clock_vals = 0; mock_tt_cycles = 0; + mock_ethtool_ksettings_errors = 0; mock_exit_thread = false; mock_ipv6 = mock_ipv6_default; mock_dst_check_errors = 0; @@ -2145,6 +2261,7 @@ void mock_teardown(void) mock_kthread_create_errors = 0; mock_prepare_to_wait_errors = 0; mock_register_protosw_errors = 0; + mock_register_qdisc_errors = 0; mock_register_sysctl_errors = 0; mock_rht_init_errors = 0; mock_rht_insert_errors = 0; @@ -2176,6 +2293,7 @@ void mock_teardown(void) homa_net_id = 0; mock_num_hnets = 0; mock_peer_free_no_fail = 0; + mock_link_mbps = 10000; mock_net_device.gso_max_size = 0; mock_net_device.gso_max_segs = 1000; memset(inet_offloads, 0, sizeof(inet_offloads)); @@ -2209,6 +2327,11 @@ void mock_teardown(void) unit_hash_free(pages_in_use); pages_in_use = NULL; + if (registered_qdiscs != 0) + FAIL(" %d qdiscs still registered after test", + registered_qdiscs); + registered_qdiscs = 0; + count = unit_hash_size(proc_files_in_use); if (count > 0) FAIL(" %u proc file(s) still allocated after test", count); diff --git a/test/mock.h b/test/mock.h index 2428c411..db8b3adb 100644 --- a/test/mock.h +++ b/test/mock.h @@ -4,10 +4,14 @@ #ifndef _HOMA_MOCK_H #define _HOMA_MOCK_H +#include + /* Replace various Linux variables and functions with mocked ones. 
*/ #undef alloc_pages #define alloc_pages mock_alloc_pages +#define atomic64_cmpxchg_relaxed mock_cmpxchg + #undef alloc_percpu_gfp #define alloc_percpu_gfp(type, flags) mock_kmalloc(10 * sizeof(type), flags) @@ -115,12 +119,14 @@ extern int mock_bpage_size; extern int mock_bpage_shift; extern u64 mock_clock; extern u64 mock_clock_tick; +extern int mock_cmpxchg_errors; extern int mock_compound_order_mask; extern int mock_copy_data_errors; extern int mock_copy_to_user_dont_copy; extern int mock_copy_to_user_errors; extern int mock_cpu_idle; extern int mock_dst_check_errors; +extern int mock_ethtool_ksettings_errors; extern bool mock_exit_thread; extern int mock_import_iovec_errors; extern int mock_import_ubuf_errors; @@ -130,8 +136,10 @@ extern bool mock_ipv6; extern bool mock_ipv6_default; extern int mock_kmalloc_errors; extern int mock_kthread_create_errors; +extern int mock_link_mbps; extern int mock_prepare_to_wait_errors; extern int mock_register_protosw_errors; +extern int mock_register_qdisc_errors; extern int mock_register_sysctl_errors; extern int mock_wait_intr_irq_errors; extern char mock_xmit_prios[]; @@ -143,6 +151,9 @@ extern __u16 mock_min_default_port; extern int mock_mtu; extern struct net_device mock_net_device; +extern struct netdev_queue + mock_net_queue; +extern struct net mock_nets[]; extern int mock_numa_mask; extern int mock_page_nid_mask; extern int mock_peer_free_no_fail; @@ -173,12 +184,15 @@ struct homa_net *mock_alloc_hnet(struct homa *homa); int mock_check_error(int *errorMask); void mock_clear_xmit_prios(void); +s64 mock_cmpxchg(atomic64_t *target, s64 old, s64 new); unsigned int mock_compound_order(struct page *page); int mock_cpu_to_node(int core); void mock_data_ready(struct sock *sk); struct dst_entry *mock_dst_check(struct dst_entry *, __u32 cookie); cycles_t mock_get_cycles(void); +int mock_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *settings); unsigned int mock_get_mtu(const struct dst_entry *dst); void mock_get_page(struct page *page); @@ -191,6 +205,8 @@ void mock_preempt_disable(void); void mock_preempt_enable(void); int mock_processor_id(void); void mock_put_page(struct page *page); +struct Qdisc + *mock_qdisc_new(struct netdev_queue *dev_queue); void mock_rcu_read_lock(void); void mock_rcu_read_unlock(void); void mock_record_locked(void *lock); diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c new file mode 100644 index 00000000..51347d82 --- /dev/null +++ b/test/unit_homa_qdisc.c @@ -0,0 +1,451 @@ +// SPDX-License-Identifier: BSD-2-Clause + +#include "homa_impl.h" +#include "homa_qdisc.h" +#define KSELFTEST_NOT_MAIN 1 +#include "kselftest_harness.h" +#include "ccutils.h" +#include "mock.h" +#include "utils.h" + +#include + +/** + * new_test_skb() - Create a new skb for use in testing qdisc stuff. + * The skb will have a small data area plus homa_skb_info and + * @rpc_name: Store this as the rpc field in homa_skb_info. This string + * will be included in messages generated about the skb. + * @bytes_left: Store this as the @bytes_left field in homa_skb_info. + */ +static struct sk_buff *new_test_skb(char *rpc_name, int bytes_left) +{ + struct homa_skb_info *info; + struct sk_buff *skb; + + skb = alloc_skb(100 + sizeof(struct homa_skb_info), GFP_ATOMIC); + info = homa_get_skb_info(skb); + info->rpc = rpc_name; + info->bytes_left = bytes_left; + return skb; +} + +/** + * log_skb_list() - Print info to the unit test log describing a list of + * skb's (including sibling sub-lists)a. 
+ * @list: List to print out. + */ +void log_skb_list(struct sk_buff_head *list) +{ + struct homa_skb_info *info; + struct sk_buff *skb; + + skb_queue_walk(list, skb) { + info = homa_get_skb_info(skb); + unit_log_printf("; ", "%s:%d", (char *)info->rpc, + info->bytes_left); + if (info->next_sibling) { + struct sk_buff *sibling = info->next_sibling; + char *separator = " ["; + + while (sibling) { + struct homa_skb_info *sibling_info = + homa_get_skb_info(sibling); + + unit_log_printf(separator, "%s:%d", + (char *)sibling_info->rpc, + sibling_info->bytes_left); + separator = " "; + sibling = sibling_info->next_sibling; + } + unit_log_printf("", "]"); + } + } +} + +static struct homa_qdisc_dev *hook_qdev; +static int hook_sleep_count; +static void pacer_sleep_hook(char *id) { + if (strcmp(id, "prepare_to_wait") != 0) + return; + if (hook_sleep_count > 0) { + hook_sleep_count--; + if (hook_sleep_count == 0) + mock_exit_thread = true; + } +} + +FIXTURE(homa_qdisc) { + struct homa homa; + struct homa_net *hnet; + struct Qdisc *qdisc; +}; +FIXTURE_SETUP(homa_qdisc) +{ + homa_init(&self->homa); + self->hnet = mock_alloc_hnet(&self->homa); + self->qdisc = mock_qdisc_new(&mock_net_queue); + mock_clock = 10000; + unit_log_clear(); +} +FIXTURE_TEARDOWN(homa_qdisc) +{ + kfree(self->qdisc); + homa_destroy(&self->homa); + unit_teardown(); +} + +TEST_F(homa_qdisc, homa_qdisc_init__basics) +{ + struct homa_qdisc_dev *qdev; + + EXPECT_EQ(0, homa_qdisc_init(self->qdisc, NULL, NULL)); + qdev = list_first_entry_or_null(&self->hnet->qdisc_devs, + struct homa_qdisc_dev, links); + ASSERT_NE(NULL, qdev); + EXPECT_EQ(1, qdev->num_qdiscs); + EXPECT_EQ(10000, qdev->link_mbps); + EXPECT_EQ(10240, self->qdisc->limit); + homa_qdisc_destroy(self->qdisc); +} +TEST_F(homa_qdisc, homa_qdisc_init__cant_create_new_qdisc_dev) +{ + struct homa_qdisc_dev *qdev; + + mock_kmalloc_errors = 1; + EXPECT_EQ(ENOMEM, -homa_qdisc_init(self->qdisc, NULL, NULL)); + qdev = list_first_entry_or_null(&self->hnet->qdisc_devs, + struct homa_qdisc_dev, links); + EXPECT_EQ(NULL, qdev); +} +TEST_F(homa_qdisc, homa_qdisc_init__existing_qdisc_dev) +{ + struct homa_qdisc_dev *qdev; + struct Qdisc *sch2; + + EXPECT_EQ(0, homa_qdisc_init(self->qdisc, NULL, NULL)); + qdev = list_first_entry_or_null(&self->hnet->qdisc_devs, + struct homa_qdisc_dev, links); + EXPECT_NE(NULL, qdev); + EXPECT_EQ(1, qdev->num_qdiscs); + + sch2 = mock_qdisc_new(&mock_net_queue); + EXPECT_EQ(0, homa_qdisc_init(sch2, NULL, NULL)); + EXPECT_EQ(2, qdev->num_qdiscs); + homa_qdisc_destroy(sch2); + kfree(sch2); + homa_qdisc_destroy(self->qdisc); +} + +TEST_F(homa_qdisc, homa_qdisc_qdev_new__success) +{ + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_new(self->hnet, &mock_net_device); + EXPECT_FALSE(IS_ERR(qdev)); + + homa_qdisc_qdev_destroy(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_qdev_new__kmalloc_failure) +{ + struct homa_qdisc_dev *qdev; + + mock_kmalloc_errors = 1; + qdev = homa_qdisc_qdev_new(self->hnet, &mock_net_device); + EXPECT_TRUE(IS_ERR(qdev)); + EXPECT_EQ(ENOMEM, -PTR_ERR(qdev)); +} +TEST_F(homa_qdisc, homa_qdisc_qdev_new__cant_create_thread) +{ + struct homa_qdisc_dev *qdev; + + mock_kthread_create_errors = 1; + qdev = homa_qdisc_qdev_new(self->hnet, &mock_net_device); + EXPECT_TRUE(IS_ERR(qdev)); + EXPECT_EQ(EACCES, -PTR_ERR(qdev)); +} + +TEST_F(homa_qdisc, homa_qdisc_destroy) +{ + struct homa_qdisc_dev *qdev; + struct Qdisc *sch2; + + EXPECT_EQ(0, homa_qdisc_init(self->qdisc, NULL, NULL)); + sch2 = mock_qdisc_new(&mock_net_queue); + EXPECT_EQ(0, 
homa_qdisc_init(sch2, NULL, NULL)); + qdev = list_first_entry_or_null(&self->hnet->qdisc_devs, + struct homa_qdisc_dev, links); + EXPECT_NE(NULL, qdev); + EXPECT_EQ(2, qdev->num_qdiscs); + + homa_qdisc_destroy(sch2); + EXPECT_EQ(1, qdev->num_qdiscs); + kfree(sch2); + + homa_qdisc_destroy(self->qdisc); + qdev = list_first_entry_or_null(&self->hnet->qdisc_devs, + struct homa_qdisc_dev, links); + EXPECT_EQ(NULL, qdev); +} + +TEST_F(homa_qdisc, homa_qdisc_qdev_destroy) +{ + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_new(self->hnet, &mock_net_device); + EXPECT_FALSE(IS_ERR(qdev)); + + /* The test infrastructure will warn if these packets aren't all + * freed. + */ + homa_qdisc_srpt_enqueue(&qdev->homa_deferred, new_test_skb("msg1", 80)); + homa_qdisc_srpt_enqueue(&qdev->homa_deferred, new_test_skb("msg1", 60)); + homa_qdisc_srpt_enqueue(&qdev->tcp_deferred, new_test_skb("msg3", 20)); + + homa_qdisc_qdev_destroy(qdev); +} + +TEST_F(homa_qdisc, homa_qdisc_srpt_enqueue__basics) +{ + struct sk_buff_head list; + + skb_queue_head_init(&list); + homa_qdisc_srpt_enqueue(&list, new_test_skb("msg1", 1000)); + homa_qdisc_srpt_enqueue(&list, new_test_skb("msg2", 2000)); + homa_qdisc_srpt_enqueue(&list, new_test_skb("msg3", 500)); + homa_qdisc_srpt_enqueue(&list, new_test_skb("msg4", 1000)); + log_skb_list(&list); + EXPECT_STREQ("msg3:500; msg1:1000; msg4:1000; msg2:2000", unit_log_get()); + homa_qdisc_srpt_free(&list); +} +TEST_F(homa_qdisc, homa_qdisc_srpt_enqueue__multiple_pkts_for_rpc) +{ + struct sk_buff_head list; + + skb_queue_head_init(&list); + homa_qdisc_srpt_enqueue(&list, new_test_skb("msg1", 1000)); + homa_qdisc_srpt_enqueue(&list, new_test_skb("msg2", 2000)); + homa_qdisc_srpt_enqueue(&list, new_test_skb("msg1", 800)); + homa_qdisc_srpt_enqueue(&list, new_test_skb("msg1", 600)); + homa_qdisc_srpt_enqueue(&list, new_test_skb("msg1", 400)); + log_skb_list(&list); + EXPECT_STREQ("msg1:1000 [msg1:800 msg1:600 msg1:400]; msg2:2000", + unit_log_get()); + homa_qdisc_srpt_free(&list); +} + +TEST_F(homa_qdisc, homa_qdisc_srpt_dequeue__list_empty) +{ + struct sk_buff_head list; + + skb_queue_head_init(&list); + EXPECT_EQ(NULL, homa_qdisc_srpt_dequeue(&list)); +} +TEST_F(homa_qdisc, homa_qdisc_srpt_dequeue__no_siblings) +{ + struct sk_buff *skb; + struct sk_buff_head list; + + skb_queue_head_init(&list); + skb = new_test_skb("msg1", 1000); + homa_qdisc_srpt_enqueue(&list, skb); + homa_qdisc_srpt_enqueue(&list, new_test_skb("msg2", 2000)); + homa_qdisc_srpt_enqueue(&list, new_test_skb("msg3", 3000)); + log_skb_list(&list); + EXPECT_STREQ("msg1:1000; msg2:2000; msg3:3000", unit_log_get()); + + EXPECT_EQ(skb, homa_qdisc_srpt_dequeue(&list)); + unit_log_clear(); + log_skb_list(&list); + EXPECT_STREQ("msg2:2000; msg3:3000", unit_log_get()); + kfree_skb(skb); + homa_qdisc_srpt_free(&list); +} +TEST_F(homa_qdisc, homa_qdisc_srpt_dequeue__siblings) +{ + struct sk_buff *skb1, *skb2; + struct sk_buff_head list; + + skb_queue_head_init(&list); + skb1 = new_test_skb("msg1", 1000); + homa_qdisc_srpt_enqueue(&list, skb1); + skb2 = new_test_skb("msg2", 2000); + homa_qdisc_srpt_enqueue(&list, skb2); + homa_qdisc_srpt_enqueue(&list, new_test_skb("msg3", 3000)); + log_skb_list(&list); + EXPECT_STREQ("msg1:1000; msg2:2000; msg3:3000", unit_log_get()); + + EXPECT_EQ(skb1, homa_qdisc_srpt_dequeue(&list)); + unit_log_clear(); + log_skb_list(&list); + EXPECT_STREQ("msg2:2000; msg3:3000", unit_log_get()); + kfree_skb(skb1); + + EXPECT_EQ(skb2, homa_qdisc_srpt_dequeue(&list)); + unit_log_clear(); + 
log_skb_list(&list); + EXPECT_STREQ("msg3:3000", unit_log_get()); + kfree_skb(skb2); + homa_qdisc_srpt_free(&list); +} + +TEST_F(homa_qdisc, homa_qdisc_srpt_free) +{ + struct sk_buff_head list; + + skb_queue_head_init(&list); + homa_qdisc_srpt_enqueue(&list, new_test_skb("msg1", 500)); + homa_qdisc_srpt_enqueue(&list, new_test_skb("msg2", 1000)); + homa_qdisc_srpt_enqueue(&list, new_test_skb("msg2", 600)); + homa_qdisc_srpt_enqueue(&list, new_test_skb("msg2", 400)); + homa_qdisc_srpt_enqueue(&list, new_test_skb("msg3", 2000)); + log_skb_list(&list); + EXPECT_STREQ("msg1:500; msg2:1000 [msg2:600 msg2:400]; msg3:2000", + unit_log_get()); + + homa_qdisc_srpt_free(&list); + unit_log_clear(); + log_skb_list(&list); + EXPECT_STREQ("", unit_log_get()); +} + +TEST_F(homa_qdisc, homa_qdisc_update_link_idle__nic_idle) +{ + struct homa_qdisc_dev qdev; + + memset(&qdev, 0, sizeof(qdev)); + qdev.cycles_per_mibyte = 1 << 20; /* 1 cycle per byte. */ + mock_clock = 1000; + + EXPECT_EQ(1, homa_qdisc_update_link_idle(&qdev, 200, 0)); + EXPECT_EQ(1200 + HOMA_ETH_FRAME_OVERHEAD, + atomic64_read(&qdev.link_idle_time)); +} +TEST_F(homa_qdisc, homa_qdisc_update_link_idle__queue_too_long) +{ + struct homa_qdisc_dev qdev; + + memset(&qdev, 0, sizeof(qdev)); + qdev.cycles_per_mibyte = 1 << 20; /* 1 cycle per byte. */ + mock_clock = 1000; + atomic64_set(&qdev.link_idle_time, 1100); + + /* First attempt: queue too long. */ + EXPECT_EQ(0, homa_qdisc_update_link_idle(&qdev, 200, 99)); + EXPECT_EQ(1100, atomic64_read(&qdev.link_idle_time)); + + /* Second attempt tolerates longer queue. */ + EXPECT_EQ(1, homa_qdisc_update_link_idle(&qdev, 200, 110)); + EXPECT_EQ(1300 + HOMA_ETH_FRAME_OVERHEAD, + atomic64_read(&qdev.link_idle_time)); +} +TEST_F(homa_qdisc, homa_qdisc_update_link_idle__ignore_queue_length) +{ + struct homa_qdisc_dev qdev; + + memset(&qdev, 0, sizeof(qdev)); + qdev.cycles_per_mibyte = 1 << 20; /* 1 cycle per byte. */ + mock_clock = 1000; + atomic64_set(&qdev.link_idle_time, 1200); + + EXPECT_EQ(1, homa_qdisc_update_link_idle(&qdev, 120, -1)); + EXPECT_EQ(1320 + HOMA_ETH_FRAME_OVERHEAD, + atomic64_read(&qdev.link_idle_time)); +} +TEST_F(homa_qdisc, homa_qdisc_update_link_idle__cmpxchg_conflicts) +{ + struct homa_qdisc_dev qdev; + + memset(&qdev, 0, sizeof(qdev)); + qdev.cycles_per_mibyte = 1 << 20; /* 1 cycle per byte. 
*/ + mock_clock = 1000; + mock_cmpxchg_errors = 0xf; + + EXPECT_EQ(1, homa_qdisc_update_link_idle(&qdev, 200, 0)); + EXPECT_EQ(1200 + HOMA_ETH_FRAME_OVERHEAD, + atomic64_read(&qdev.link_idle_time)); + EXPECT_EQ(4, homa_metrics_per_cpu()->idle_time_conflicts); +} + +TEST_F(homa_qdisc, homa_qdisc_pacer_main__basics) +{ + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_new(self->hnet, &mock_net_device); + EXPECT_FALSE(IS_ERR(qdev)); + + unit_hook_register(pacer_sleep_hook); + hook_qdev = qdev; + hook_sleep_count = 3; + mock_clock_tick = 200; + + homa_qdisc_pacer_main(qdev); + EXPECT_EQ(400, homa_metrics_per_cpu()->pacer_cycles); + + homa_qdisc_qdev_destroy(qdev); +} + +TEST_F(homa_qdisc, homa_qdisc_update_sysctl__basics) +{ + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_new(self->hnet, &mock_net_device); + EXPECT_FALSE(IS_ERR(qdev)); + + self->homa.link_mbps = 25000; + mock_link_mbps = 8000; + homa_qdisc_update_sysctl(qdev); + EXPECT_EQ(8000, qdev->link_mbps); + EXPECT_EQ(1059061, qdev->cycles_per_mibyte); + + homa_qdisc_qdev_destroy(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_update_sysctl__cant_get_link_speed_from_dev) +{ + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_new(self->hnet, &mock_net_device); + EXPECT_FALSE(IS_ERR(qdev)); + + self->homa.link_mbps = 16000; + mock_link_mbps = 8000; + mock_ethtool_ksettings_errors = 1; + homa_qdisc_update_sysctl(qdev); + EXPECT_EQ(16000, qdev->link_mbps); + EXPECT_EQ(529530, qdev->cycles_per_mibyte); + + homa_qdisc_qdev_destroy(qdev); +} + +TEST_F(homa_qdisc, homa_qdisc_update_all_sysctl) +{ + struct homa_qdisc *q, *q2; + struct netdev_queue net_queue2; + struct net_device net_device2; + struct Qdisc *sch2; + + memset(&net_queue2, 0, sizeof(net_queue2)); + memset(&net_device2, 0, sizeof(net_device2)); + net_queue2.dev = &net_device2; + net_device2.nd_net.net = &mock_nets[0]; + sch2 = mock_qdisc_new(&net_queue2); + self->homa.link_mbps = 16000; + mock_link_mbps = 40000; + + EXPECT_EQ(0, homa_qdisc_init(self->qdisc, NULL, NULL)); + EXPECT_EQ(0, homa_qdisc_init(sch2, NULL, NULL)); + q = qdisc_priv(self->qdisc); + q2 = qdisc_priv(sch2); + EXPECT_EQ(40000, q->qdev->link_mbps); + EXPECT_EQ(16000, q2->qdev->link_mbps); + + self->homa.link_mbps = 25000; + mock_link_mbps = 8000; + homa_qdisc_update_all_sysctl(self->hnet); + + EXPECT_EQ(8000, q->qdev->link_mbps); + EXPECT_EQ(25000, q2->qdev->link_mbps); + + homa_qdisc_destroy(self->qdisc); + homa_qdisc_destroy(sch2); + kfree(sch2); +} \ No newline at end of file From 6dc37c53a92859f44c734a9bccf7b9c55d025444 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 21 Jul 2025 10:00:45 -0700 Subject: [PATCH 405/625] Make is_homa_pkt work for IPv6 as well as IPv4. --- homa_impl.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 31b4a162..cb2eb97b 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -688,14 +688,16 @@ static inline struct in6_addr skb_canonical_ipv6_saddr(struct sk_buff *skb) */ static inline bool is_homa_pkt(struct sk_buff *skb) { - struct iphdr *iph = ip_hdr(skb); + int protocol; + protocol = (skb_is_ipv6(skb)) ? 
ipv6_hdr(skb)->nexthdr : + ip_hdr(skb)->protocol; #ifndef __STRIP__ /* See strip.py */ - return ((iph->protocol == IPPROTO_HOMA) || - ((iph->protocol == IPPROTO_TCP) && - (tcp_hdr(skb)->urg_ptr == htons(HOMA_TCP_URGENT)))); + return (protocol == IPPROTO_HOMA || + (protocol == IPPROTO_TCP && + tcp_hdr(skb)->urg_ptr == htons(HOMA_TCP_URGENT))); #else /* See strip.py */ - return iph->protocol == IPPROTO_HOMA; + return protocol == IPPROTO_HOMA; #endif /* See strip.py */ } From 748e7540782cee3065f6956efc8e3d9f0bcc1d22 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 21 Jul 2025 15:22:21 -0700 Subject: [PATCH 406/625] Complete first possibly-working version of homa_qdisc Doesn't attempt to throttle TCP packets yet, but should be functional to run as well as existing pacer. Has full unit tests. --- homa_qdisc.c | 308 +++++++++++--------- homa_qdisc.h | 67 +++-- test/mock.c | 47 +-- test/mock.h | 5 +- test/unit_homa_qdisc.c | 637 +++++++++++++++++++++++++++++++++++------ 5 files changed, 814 insertions(+), 250 deletions(-) diff --git a/homa_qdisc.c b/homa_qdisc.c index c7b2e151..6f3636f3 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -51,64 +51,37 @@ void homa_qdisc_unregister(void) } /** - * homa_qdisc_init() - Initialize a new instance of this queuing discipline. - * @sch: Qdisc to initialize. - * @opt: Options for this qdisc; not currently used. - * @extack: For reporting detailed information relating to errors; not used. - * Return: 0 for success, otherwise a negative errno. + * homa_qdisc_qdev_get() - Find the homa_qdisc_dev to use for a particular + * net_device and increment its reference count. Create a new one if there + * isn't an existing one to use. + * @hnet: Network namespace for the homa_qdisc_dev. + * @dev: NIC that the homa_qdisc_dev will manage. + * Return: A pointer to the new homa_qdisc_dev, or a PTR_ERR errno. */ -int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt, - struct netlink_ext_ack *extack) +struct homa_qdisc_dev *homa_qdisc_qdev_get(struct homa_net *hnet, + struct net_device *dev) { - struct homa_qdisc *q = qdisc_priv(sch); struct homa_qdisc_dev *qdev; - struct homa_net *hnet; - bool found = false; - hnet = homa_net_from_net(dev_net(sch->dev_queue->dev)); spin_lock_bh(&hnet->qdisc_devs_lock); list_for_each_entry(qdev, &hnet->qdisc_devs, links) { - if (qdev->dev == sch->dev_queue->dev) { - found = true; - break; - } - } - if (!found) { - qdev = homa_qdisc_qdev_new(hnet, sch->dev_queue->dev); - if (IS_ERR(qdev)) { - spin_unlock_bh(&hnet->qdisc_devs_lock); - return PTR_ERR(qdev); + if (qdev->dev == dev) { + qdev->refs++; + goto done; } - } else { - qdev->num_qdiscs++; } - spin_unlock_bh(&hnet->qdisc_devs_lock); - - q->qdev = qdev; - skb_queue_head_init(&q->queue); - - sch->limit = 10*1024; - return 0; -} - -/** - * homa_qdisc_qdev_new() - Allocate and initialize a new homa_qdisc_dev. - * @hnet: Network namespace for the homa_qdisc_dev. - * @dev: NIC that the homa_qdisc_dev will manage. - * Return A pointer to the new homa_qdisc_dev, or a PTR_ERR errno. 
- */ -struct homa_qdisc_dev *homa_qdisc_qdev_new(struct homa_net *hnet, - struct net_device *dev) - __must_hold(hnet->qdisc_devs_lock) -{ - struct homa_qdisc_dev *qdev; qdev = kzalloc(sizeof(*qdev), GFP_ATOMIC); - if (!qdev) - return ERR_PTR(-ENOMEM); + if (!qdev) { + qdev = ERR_PTR(-ENOMEM); + goto done; + } qdev->dev = dev; qdev->hnet = hnet; - qdev->num_qdiscs = 1; + qdev->refs = 1; + spin_lock_init(&qdev->lock); + qdev->pacer_qix = -1; + qdev->redirect_qix = -1; homa_qdisc_update_sysctl(qdev); INIT_LIST_HEAD(&qdev->links); skb_queue_head_init(&qdev->homa_deferred); @@ -124,43 +97,123 @@ struct homa_qdisc_dev *homa_qdisc_qdev_new(struct homa_net *hnet, pr_err("couldn't create homa qdisc pacer thread: error %d\n", error); kfree(qdev); - return ERR_PTR(error); + qdev = ERR_PTR(error); + goto done; } list_add(&qdev->links, &hnet->qdisc_devs); + +done: + spin_unlock_bh(&hnet->qdisc_devs_lock); return qdev; } +/** + * homa_qdisc_qdev_put() - Decrement the reference count for a homa_qdisc_qdev + * and free it if the count becomes zero. + * @qdev: Object to unreference. + */ +void homa_qdisc_qdev_put(struct homa_qdisc_dev *qdev) +{ + struct homa_net *hnet = qdev->hnet; + + spin_lock_bh(&hnet->qdisc_devs_lock); + qdev->refs--; + if (qdev->refs == 0) { + kthread_stop(qdev->pacer_kthread); + qdev->pacer_kthread = NULL; + + __list_del_entry(&qdev->links); + homa_qdisc_srpt_free(&qdev->homa_deferred); + skb_queue_purge(&qdev->tcp_deferred); + kfree(qdev); + } + spin_unlock_bh(&hnet->qdisc_devs_lock); +} + +/** + * homa_qdisc_init() - Initialize a new instance of this queuing discipline. + * @sch: Qdisc to initialize. + * @opt: Options for this qdisc; not currently used. + * @extack: For reporting detailed information relating to errors; not used. + * Return: 0 for success, otherwise a negative errno. + */ +int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct homa_qdisc *q = qdisc_priv(sch); + struct homa_qdisc_dev *qdev; + struct homa_net *hnet; + int i; + + hnet = homa_net_from_net(dev_net(sch->dev_queue->dev)); + qdev = homa_qdisc_qdev_get(hnet, sch->dev_queue->dev); + if (IS_ERR(qdev)) + return PTR_ERR(qdev); + + q->qdev = qdev; + q->ix = -1; + for (i = 0; i < qdev->dev->num_tx_queues; i++) { + if (netdev_get_tx_queue(qdev->dev, i) == sch->dev_queue) { + q->ix = i; + break; + } + } + + sch->limit = 10*1024; + return 0; +} + /** * homa_qdisc_destroy() - This function is invoked to perform final cleanup * before a qdisc is deleted. * @sch: Qdisc that is being deleted. */ -void homa_qdisc_destroy(struct Qdisc *sch) +void homa_qdisc_destroy(struct Qdisc *qdisc) { - struct homa_qdisc *q = qdisc_priv(sch); - struct homa_qdisc_dev *qdev = q->qdev; + struct homa_qdisc *q = qdisc_priv(qdisc); - spin_lock_bh(&qdev->hnet->qdisc_devs_lock); - qdev->num_qdiscs--; - if (qdev->num_qdiscs == 0) - homa_qdisc_qdev_destroy(qdev); - spin_unlock_bh(&qdev->hnet->qdisc_devs_lock); + qdisc_reset_queue(qdisc); + homa_qdisc_qdev_put(q->qdev); } /** - * homa_qdisc_qdev_destroy() - Cleanup and release memory for a homa_qdisc_dev. - * @qdev: Object to destroy; its memory will be freed. + * homa_qdisc_set_qixs() - Recompute the @pacer_qix and @redirect_qix + * fields in @qdev. Upon return, both fields will be valid unless there + * are no Homa qdiscs associated with qdev's net_device. + * @qdev: Identifies net_device containing qnetdev_queues to choose + * between. 
*/ -void homa_qdisc_qdev_destroy(struct homa_qdisc_dev *qdev) - __must_hold(qde) +void homa_qdisc_set_qixs(struct homa_qdisc_dev *qdev) { - kthread_stop(qdev->pacer_kthread); - qdev->pacer_kthread = NULL; + int i, pacer_qix, redirect_qix; + struct netdev_queue *txq; + struct Qdisc *qdisc; - __list_del_entry(&qdev->links); - homa_qdisc_srpt_free(&qdev->homa_deferred); - skb_queue_purge(&qdev->tcp_deferred); - kfree(qdev); + /* Note: it's safe for mutliple instances of this function to + * execute concurrently so no synchronization is needed (other + * than using RCU to protect against deletion of the underlying + * data structures). + */ + + pacer_qix = -1; + redirect_qix = -1; + rcu_read_lock(); + for (i = 0; i < qdev->dev->num_tx_queues; i++) { + txq = netdev_get_tx_queue(qdev->dev, i); + qdisc = rcu_dereference_bh(txq->qdisc); + if (!qdisc || qdisc->ops != &homa_qdisc_ops) + continue; + if (pacer_qix == -1) { + pacer_qix = i; + redirect_qix = i; + } else { + redirect_qix = i; + break; + } + } + qdev->pacer_qix = pacer_qix; + qdev->redirect_qix = redirect_qix; + rcu_read_unlock(); } /** @@ -176,11 +229,7 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct homa_qdisc_dev *qdev = q->qdev; struct homa *homa = qdev->hnet->homa; int pkt_len; - - if (skb == q->qdev->pacer_skb) { - q->qdev->pacer_skb = NULL; - goto enqueue; - } + int result; /* The packet length computed by Linux didn't include overheads * such as inter-frame gap; add that in here. @@ -205,11 +254,19 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, */ homa_qdisc_srpt_enqueue(&qdev->homa_deferred, skb); wake_up(&qdev->pacer_sleep); + return NET_XMIT_SUCCESS; enqueue: - if (likely(sch->q.qlen < READ_ONCE(sch->limit))) - return qdisc_enqueue_tail(skb, sch); - return qdisc_drop(skb, sch, to_free); + if (q->ix != qdev->pacer_qix) { + if (unlikely(sch->q.qlen >= READ_ONCE(sch->limit))) + return qdisc_drop(skb, sch, to_free); + spin_lock_bh(qdisc_lock(sch)); + result = qdisc_enqueue_tail(skb, sch); + spin_unlock_bh(qdisc_lock(sch)); + } else { + result = homa_qdisc_enqueue_special(skb, qdev, false); + } + return result; } /** @@ -485,75 +542,62 @@ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev) break; homa_qdisc_update_link_idle(qdev, qdisc_skb_cb(skb)->pkt_len, -1); - - /* Resubmit the packet. Concentrate all of the (long) - * resubmitted packets on device queue 0, in order - * to reduce contention between them and short packets - * on other queues. - */ - qdev->pacer_skb = skb; - homa_qdisc_resubmit_skb(skb, qdev->dev, 0); - qdev->pacer_skb = NULL; + homa_qdisc_enqueue_special(skb, qdev, true); } done: spin_unlock_bh(&qdev->pacer_mutex); } /** - * homa_qdisc_resubmit_skb() - This function is called by the pacer to - * restart the transmission of an skb that was deferred because of NIC - * queue length. The packet may be dropped under various error conditions. + * homa_qdisc_enqueue_special() - This function is called by the pacer to + * enqueue a packet on one of the distinguished transmit queues and wake + * up the queue for transmission. * @skb: Packet to resubmit. - * @dev: Network device to which the packet should be resubmitted. - * @queue: Index of desired tx queue on @dev. - * Return: Zero for success, otherwise a negative errno. + * @qdev: Homa data about the networkd device on which the packet should + * be resubmitted. + * @pacer: True means queue the packet on qdev->pacer_qix, false means + * qdev->redirect_qix. + * Return: Standard enqueue return code (usually NET_XMIT_SUCCESS). 
*/ -void homa_qdisc_resubmit_skb(struct sk_buff *skb, struct net_device *dev, - int queue) +int homa_qdisc_enqueue_special(struct sk_buff *skb, + struct homa_qdisc_dev *qdev, bool pacer) { - /* The code of this function was extracted from __dev_xmit_skb - * (with RCU lock/unlock from __dev_queue_xmit). Ideally this - * module would simply invoke __dev_xmit_skb, but it isn't - * globally available. - */ - struct sk_buff *to_free = NULL; struct netdev_queue *txq; - spinlock_t *root_lock; - struct Qdisc *q; - bool contended; - - rcu_read_lock_bh(); - txq = netdev_get_tx_queue(dev, 0); - q = rcu_dereference_bh(txq->qdisc); - root_lock = qdisc_lock(q); - - contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT); - if (unlikely(contended)) - spin_lock(&q->busylock); - - spin_lock(root_lock); - if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { - __qdisc_drop(skb, &to_free); - } else { - WRITE_ONCE(q->owner, smp_processor_id()); - q->enqueue(skb, q, &to_free); - WRITE_ONCE(q->owner, -1); - if (qdisc_run_begin(q)) { - if (unlikely(contended)) { - spin_unlock(&q->busylock); - contended = false; - } - // __qdisc_run(q); - qdisc_run_end(q); - } - } - spin_unlock(root_lock); - if (unlikely(to_free)) - kfree_skb_list_reason(to_free, - tcf_get_drop_reason(to_free)); - if (unlikely(contended)) - spin_unlock(&q->busylock); - rcu_read_unlock_bh(); + struct Qdisc *qdisc; + int result; + int qix; + int i; + + rcu_read_lock(); + + /* Must make sure that the queue index is still valid (refers + * to a Homa qdisc). + */ + for (i = 0; ; i++) { + qix = pacer ? qdev->pacer_qix : qdev->redirect_qix; + if (qix >= 0 && qix < qdev->dev->num_tx_queues) { + txq = netdev_get_tx_queue(qdev->dev, qix); + qdisc = rcu_dereference_bh(txq->qdisc); + if (qdisc->ops== &homa_qdisc_ops) + break; + } + if (i > 0) { + /* Couldn't find a Homa qdisc to use; drop the skb. */ + kfree_skb(skb); + result = NET_XMIT_DROP; + goto done; + } + homa_qdisc_set_qixs(qdev); + } + + spin_lock_bh(qdisc_lock(qdisc)); + result = qdisc_enqueue_tail(skb, qdisc); + spin_unlock_bh(qdisc_lock(qdisc)); + netif_schedule_queue(txq); + +done: + rcu_read_unlock(); + return result; } /** diff --git a/homa_qdisc.h b/homa_qdisc.h index 8f16a627..65e6757e 100644 --- a/homa_qdisc.h +++ b/homa_qdisc.h @@ -24,8 +24,11 @@ struct homa_qdisc { /** @dev: Info shared among all qdiscs for a net_device. */ struct homa_qdisc_dev *qdev; - /** @queue: Packets waiting to be transmitted. */ - struct sk_buff_head queue; + /** + * @ix: Index of this qdisc's transmit queue among all those for + * its net_device. + */ + int ix; }; /** @@ -43,10 +46,47 @@ struct homa_qdisc_dev { struct homa_net *hnet; /** - * @num_qdiscs: Number of homa_qdisc objects referencing this struct. - * Access only when holding homa->qdisc_devs_lock. + * @refs: Reference count (e.g. includes one reference for each + * homa_qdisc that references this object). Must hold + * hnet->qdisc_devs_lock to access. + */ + int refs; + + /** + * @lock: Used to synchronize access to mutable fields within + * this struct, such as @pacer_qix and @redirect_qix. + */ + spinlock_t lock; + + /** + * @pacer_qix: Index of a netdev_queue within dev that is reserved + * for the pacer to use for transmitting packets. We segregate paced + * traffic (which is almost entirely large packets) from non-paced + * traffic (mostly small packets). 
All the paced traffic goes to a + * single transmit queue, and though we try to limit the length of + * this queue, there are situations where the queue can still build + * up (under some scenarios it appears that NICs cannot actually + * transmit at line rate). If the pacer queue is segregated, queue + * buildup there will not affect non-paced packets. In order to + * reserve pacer_qix for pacer traffic, short-packet traffic that + * is assigned to that queue must be redirected to another queue; + * redirect_qix is used for that. -1 means there currently isn't + * a netdev_queue assigned for pacer traffic. Note: this field is + * a hint; the value must be verified under RCU to have a Homa qdisc + * before using. + */ + int pacer_qix; + + /** + * @redirect_qix: Index of a netdev_queue within dev; packets + * originally passed to pacer_qix are redirected here, so that + * pacer_qix is used only for packets sent by the pacer. -1 means + * there isn't currently a netdev_queue assigned for this purpose. + * This field is a hint that must be verified under RCU before using + * to be sure it still refers to a Homa qdisc. May be the same as + * pacer_qix if there is only one Homa qdisc associated with dev. */ - int num_qdiscs; + int redirect_qix; /** @link_mbps: Speed of the link associated with @dev, in Mbps. */ int link_mbps; @@ -101,28 +141,23 @@ struct homa_qdisc_dev { * never block on this. */ spinlock_t pacer_mutex __aligned(L1_CACHE_BYTES); - - /** - * @pacer_skb: The current skb that the pacer has selected for - * transmission and is pushing through __dev_xmit_skb. This skb - * should be transmitted without any further delay or accounting. - */ - struct sk_buff *pacer_skb; }; void homa_qdisc_destroy(struct Qdisc *sch); int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free); +int homa_qdisc_enqueue_special(struct sk_buff *skb, + struct homa_qdisc_dev *qdev, + bool pacer); int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack); int homa_qdisc_pacer_main(void *device); -void homa_qdisc_qdev_destroy(struct homa_qdisc_dev *qdev); struct homa_qdisc_dev * - homa_qdisc_qdev_new(struct homa_net *hnet, + homa_qdisc_qdev_get(struct homa_net *hnet, struct net_device *dev); +void homa_qdisc_qdev_put(struct homa_qdisc_dev *qdev); int homa_qdisc_register(void); -void homa_qdisc_resubmit_skb(struct sk_buff *skb, - struct net_device *dev, int queue); +void homa_qdisc_set_qixs(struct homa_qdisc_dev *qdev); void homa_qdisc_srpt_enqueue(struct sk_buff_head *list, struct sk_buff *skb); struct sk_buff * diff --git a/test/mock.c b/test/mock.c index a381d64d..5f5d15a0 100644 --- a/test/mock.c +++ b/test/mock.c @@ -130,6 +130,9 @@ static struct unit_hash *pages_in_use; */ static int registered_qdiscs; +/* Registered by most recent call to register_qdisc. */ +static struct Qdisc_ops *qdisc_ops; + /* Keeps track of all the results returned by ip_route_output_flow that * have not yet been freed. Reset for each test. */ @@ -266,12 +269,16 @@ struct dst_ops mock_dst_ops = { .check = mock_dst_check}; struct netdev_queue mock_net_queue = {.state = 0}; struct net_device mock_net_device = { - .gso_max_segs = 1000, - .gso_max_size = 0, - ._tx = &mock_net_queue, - .nd_net = {.net = &mock_nets[0]}, - .ethtool_ops = &mock_ethtool_ops - }; + .gso_max_segs = 1000, + .gso_max_size = 0, + ._tx = &mock_net_queue, + .nd_net = {.net = &mock_nets[0]}, + .ethtool_ops = &mock_ethtool_ops +}; + +/* Number of invocations of netif_schedule_queue. 
*/ +int mock_netif_schedule_calls; + const struct net_offload *inet_offloads[MAX_INET_PROTOS]; const struct net_offload *inet6_offloads[MAX_INET_PROTOS]; struct net_offload tcp_offload; @@ -1065,9 +1072,7 @@ ssize_t __modver_version_show(struct module_attribute *a, void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) -{ - -} +{} #ifdef CONFIG_DEBUG_LOCK_ALLOC void mutex_lock_nested(struct mutex *lock, unsigned int subclass) @@ -1084,6 +1089,11 @@ void mutex_unlock(struct mutex *lock) mock_active_locks--; } +void netif_schedule_queue(struct netdev_queue *txq) +{ + mock_netif_schedule_calls++; +} + int netif_receive_skb(struct sk_buff *skb) { struct homa_data_hdr *h = (struct homa_data_hdr *) @@ -1331,6 +1341,7 @@ int register_qdisc(struct Qdisc_ops *qops) if (mock_check_error(&mock_register_qdisc_errors)) return -EINVAL; registered_qdiscs++; + qdisc_ops = qops; return 0; } @@ -1578,6 +1589,7 @@ void unregister_pernet_subsys(struct pernet_operations *) void unregister_qdisc(struct Qdisc_ops *qops) { registered_qdiscs--; + qdisc_ops = NULL; } void vfree(const void *block) @@ -1874,21 +1886,22 @@ void mock_put_page(struct page *page) } /** - * mock_qdisc_new() - Allocate and initialize a new Qdisc suitable for + * mock_alloc_qdisc() - Allocate and initialize a new Qdisc suitable for * use in unit tests as a homa qdisc. * Return: The new Qdisc. The memory is dynamically allocated and must * be kfree-d by the caller. homa_qdisc_init has not been invoked on * this Qdisc yet. */ -struct Qdisc *mock_qdisc_new(struct netdev_queue *dev_queue) +struct Qdisc *mock_alloc_qdisc(struct netdev_queue *dev_queue) { - struct Qdisc *sch; + struct Qdisc *qdisc; - sch = kmalloc(sizeof(struct Qdisc) + sizeof(struct homa_qdisc), + qdisc = kzalloc(sizeof(struct Qdisc) + sizeof(struct homa_qdisc), GFP_ATOMIC); - sch->dev_queue = dev_queue; - mock_net_queue.dev = &mock_net_device; - return sch; + qdisc->dev_queue = dev_queue; + qdisc->ops = qdisc_ops; + spin_lock_init(&qdisc->q.lock); + return qdisc; } /** @@ -2296,6 +2309,7 @@ void mock_teardown(void) mock_link_mbps = 10000; mock_net_device.gso_max_size = 0; mock_net_device.gso_max_segs = 1000; + mock_netif_schedule_calls = 0; memset(inet_offloads, 0, sizeof(inet_offloads)); inet_offloads[IPPROTO_TCP] = (struct net_offload __rcu *) &tcp_offload; memset(inet6_offloads, 0, sizeof(inet6_offloads)); @@ -2331,6 +2345,7 @@ void mock_teardown(void) FAIL(" %d qdiscs still registered after test", registered_qdiscs); registered_qdiscs = 0; + qdisc_ops = NULL; count = unit_hash_size(proc_files_in_use); if (count > 0) diff --git a/test/mock.h b/test/mock.h index db8b3adb..a32a06b6 100644 --- a/test/mock.h +++ b/test/mock.h @@ -137,6 +137,7 @@ extern bool mock_ipv6_default; extern int mock_kmalloc_errors; extern int mock_kthread_create_errors; extern int mock_link_mbps; +extern int mock_netif_schedule_calls; extern int mock_prepare_to_wait_errors; extern int mock_register_protosw_errors; extern int mock_register_qdisc_errors; @@ -182,6 +183,8 @@ struct page * mock_alloc_pages(gfp_t gfp, unsigned order); struct homa_net *mock_alloc_hnet(struct homa *homa); +struct Qdisc + *mock_alloc_qdisc(struct netdev_queue *dev_queue); int mock_check_error(int *errorMask); void mock_clear_xmit_prios(void); s64 mock_cmpxchg(atomic64_t *target, s64 old, s64 new); @@ -205,8 +208,6 @@ void mock_preempt_disable(void); void mock_preempt_enable(void); int mock_processor_id(void); void mock_put_page(struct page *page); -struct Qdisc - *mock_qdisc_new(struct netdev_queue 
*dev_queue);
void mock_rcu_read_lock(void);
void mock_rcu_read_unlock(void);
void mock_record_locked(void *lock);
diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c
index 51347d82..623a8460 100644
--- a/test/unit_homa_qdisc.c
+++ b/test/unit_homa_qdisc.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: BSD-2-Clause
 
 #include "homa_impl.h"
+#include "homa_pacer.h"
 #include "homa_qdisc.h"
 #define KSELFTEST_NOT_MAIN 1
 #include "kselftest_harness.h"
@@ -77,131 +78,354 @@ static void pacer_sleep_hook(char *id)
 {
 FIXTURE(homa_qdisc) {
 	struct homa homa;
 	struct homa_net *hnet;
-	struct Qdisc *qdisc;
+	struct in6_addr addr;
+	struct net_device dev;
+#define NUM_TXQS 4
+	struct netdev_queue txqs[NUM_TXQS];
+	struct Qdisc *qdiscs[NUM_TXQS];
+	struct ethtool_ops ethtool_ops;
+	struct homa_data_hdr data;
 };
 FIXTURE_SETUP(homa_qdisc)
 {
+	int i;
+
+	homa_qdisc_register();
 	homa_init(&self->homa);
 	self->hnet = mock_alloc_hnet(&self->homa);
-	self->qdisc = mock_qdisc_new(&mock_net_queue);
+	self->addr = unit_get_in_addr("1.2.3.4");
+	memset(&self->dev, 0, sizeof(self->dev));
+	self->dev._tx = self->txqs;
+	self->dev.num_tx_queues = NUM_TXQS;
+	self->dev.nd_net.net = self->hnet->net;
+	self->dev.ethtool_ops = &self->ethtool_ops;
+	memset(&self->ethtool_ops, 0, sizeof(self->ethtool_ops));
+	self->ethtool_ops.get_link_ksettings = mock_get_link_ksettings;
+
+	memset(&self->txqs, 0, sizeof(self->txqs));
+	memset(&self->qdiscs, 0, sizeof(self->qdiscs));
+	for (i = 0; i < NUM_TXQS; i++) {
+		self->txqs[i].state = 0;
+		self->txqs[i].dev = &self->dev;
+		self->qdiscs[i] = mock_alloc_qdisc(&self->txqs[i]);
+		self->txqs[i].qdisc = self->qdiscs[i];
+	}
+	mock_net_queue.dev = &mock_net_device;
+
+	self->data.common = (struct homa_common_hdr){
+		.sport = htons(1000),
+		.dport = htons(2000),
+		.type = DATA,
+		.sender_id = cpu_to_be64(100)
+	};
+	self->data.message_length = htonl(10000);
+
 	mock_clock = 10000;
 	unit_log_clear();
 }
 FIXTURE_TEARDOWN(homa_qdisc)
 {
-	kfree(self->qdisc);
+	int i;
+
+	for (i = 0; i < NUM_TXQS; i++)
+		kfree(self->qdiscs[i]);
 	homa_destroy(&self->homa);
+	homa_qdisc_unregister();
 	unit_teardown();
 }
 
-TEST_F(homa_qdisc, homa_qdisc_init__basics)
+TEST_F(homa_qdisc, homa_qdisc_qdev_get__create_new)
 {
 	struct homa_qdisc_dev *qdev;
 
-	EXPECT_EQ(0, homa_qdisc_init(self->qdisc, NULL, NULL));
-	qdev = list_first_entry_or_null(&self->hnet->qdisc_devs,
-			struct homa_qdisc_dev, links);
-	ASSERT_NE(NULL, qdev);
-	EXPECT_EQ(1, qdev->num_qdiscs);
-	EXPECT_EQ(10000, qdev->link_mbps);
-	EXPECT_EQ(10240, self->qdisc->limit);
-	homa_qdisc_destroy(self->qdisc);
+	qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device);
+	EXPECT_FALSE(IS_ERR(qdev));
+	EXPECT_EQ(1, qdev->refs);
+
+	homa_qdisc_qdev_put(qdev);
 }
-TEST_F(homa_qdisc, homa_qdisc_init__cant_create_new_qdisc_dev)
+TEST_F(homa_qdisc, homa_qdisc_qdev_get__use_existing)
 {
 	struct homa_qdisc_dev *qdev;
 
-	mock_kmalloc_errors = 1;
-	EXPECT_EQ(ENOMEM, -homa_qdisc_init(self->qdisc, NULL, NULL));
-	qdev = list_first_entry_or_null(&self->hnet->qdisc_devs,
-			struct homa_qdisc_dev, links);
-	EXPECT_EQ(NULL, qdev);
+	qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device);
+	EXPECT_FALSE(IS_ERR(qdev));
+	EXPECT_EQ(1, qdev->refs);
+
+	EXPECT_EQ(qdev, homa_qdisc_qdev_get(self->hnet, &mock_net_device));
+	EXPECT_EQ(2, qdev->refs);
+
+	homa_qdisc_qdev_put(qdev);
+	homa_qdisc_qdev_put(qdev);
 }
-TEST_F(homa_qdisc, homa_qdisc_init__existing_qdisc_dev)
+TEST_F(homa_qdisc, homa_qdisc_qdev_get__kmalloc_failure)
 {
 	struct homa_qdisc_dev *qdev;
-	struct Qdisc *sch2;
 
-	EXPECT_EQ(0, 
homa_qdisc_init(self->qdisc, NULL, NULL)); - qdev = list_first_entry_or_null(&self->hnet->qdisc_devs, - struct homa_qdisc_dev, links); - EXPECT_NE(NULL, qdev); - EXPECT_EQ(1, qdev->num_qdiscs); + mock_kmalloc_errors = 1; + qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device); + EXPECT_TRUE(IS_ERR(qdev)); + EXPECT_EQ(ENOMEM, -PTR_ERR(qdev)); +} +TEST_F(homa_qdisc, homa_qdisc_qdev_get__cant_create_thread) +{ + struct homa_qdisc_dev *qdev; - sch2 = mock_qdisc_new(&mock_net_queue); - EXPECT_EQ(0, homa_qdisc_init(sch2, NULL, NULL)); - EXPECT_EQ(2, qdev->num_qdiscs); - homa_qdisc_destroy(sch2); - kfree(sch2); - homa_qdisc_destroy(self->qdisc); + mock_kthread_create_errors = 1; + qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device); + EXPECT_TRUE(IS_ERR(qdev)); + EXPECT_EQ(EACCES, -PTR_ERR(qdev)); } -TEST_F(homa_qdisc, homa_qdisc_qdev_new__success) +TEST_F(homa_qdisc, homa_qdisc_qdev_put) { - struct homa_qdisc_dev *qdev; + struct homa_qdisc_dev *qdev, *qdev2; - qdev = homa_qdisc_qdev_new(self->hnet, &mock_net_device); + qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device); EXPECT_FALSE(IS_ERR(qdev)); - - homa_qdisc_qdev_destroy(qdev); + homa_qdisc_qdev_get(self->hnet, &mock_net_device); + EXPECT_EQ(2, qdev->refs); + + homa_qdisc_qdev_put(qdev); + EXPECT_EQ(1, qdev->refs); + qdev2 = list_first_entry_or_null(&self->hnet->qdisc_devs, + struct homa_qdisc_dev, links); + EXPECT_EQ(qdev, qdev2); + + homa_qdisc_qdev_put(qdev); + qdev2 = list_first_entry_or_null(&self->hnet->qdisc_devs, + struct homa_qdisc_dev, links); + EXPECT_EQ(NULL, qdev2); } -TEST_F(homa_qdisc, homa_qdisc_qdev_new__kmalloc_failure) + +TEST_F(homa_qdisc, homa_qdisc_init__basics) { + struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); struct homa_qdisc_dev *qdev; + struct homa_qdisc *q; - mock_kmalloc_errors = 1; - qdev = homa_qdisc_qdev_new(self->hnet, &mock_net_device); - EXPECT_TRUE(IS_ERR(qdev)); - EXPECT_EQ(ENOMEM, -PTR_ERR(qdev)); + EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); + qdev = list_first_entry_or_null(&self->hnet->qdisc_devs, + struct homa_qdisc_dev, links); + ASSERT_NE(NULL, qdev); + EXPECT_EQ(1, qdev->refs); + EXPECT_EQ(10000, qdev->link_mbps); + EXPECT_EQ(10240, qdisc->limit); + q = qdisc_priv(qdisc); + EXPECT_EQ(-1, q->ix); + homa_qdisc_destroy(qdisc); + kfree(qdisc); } -TEST_F(homa_qdisc, homa_qdisc_qdev_new__cant_create_thread) +TEST_F(homa_qdisc, homa_qdisc_init__cant_create_new_qdisc_dev) { + struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); struct homa_qdisc_dev *qdev; - mock_kthread_create_errors = 1; - qdev = homa_qdisc_qdev_new(self->hnet, &mock_net_device); - EXPECT_TRUE(IS_ERR(qdev)); - EXPECT_EQ(EACCES, -PTR_ERR(qdev)); + mock_kmalloc_errors = 1; + EXPECT_EQ(ENOMEM, -homa_qdisc_init(qdisc, NULL, NULL)); + qdev = list_first_entry_or_null(&self->hnet->qdisc_devs, + struct homa_qdisc_dev, links); + EXPECT_EQ(NULL, qdev); + kfree(qdisc); +} +TEST_F(homa_qdisc, homa_qdisc_init__set_qix) +{ + struct Qdisc *qdisc = mock_alloc_qdisc(&self->txqs[2]); + struct homa_qdisc *q; + + EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); + q = qdisc_priv(qdisc); + EXPECT_EQ(2, q->ix); + homa_qdisc_destroy(qdisc); + kfree(qdisc); } TEST_F(homa_qdisc, homa_qdisc_destroy) { + struct Qdisc *qdisc, *qdisc2; struct homa_qdisc_dev *qdev; - struct Qdisc *sch2; - EXPECT_EQ(0, homa_qdisc_init(self->qdisc, NULL, NULL)); - sch2 = mock_qdisc_new(&mock_net_queue); - EXPECT_EQ(0, homa_qdisc_init(sch2, NULL, NULL)); + qdisc = mock_alloc_qdisc(&mock_net_queue); + EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); + 
qdisc2 = mock_alloc_qdisc(&mock_net_queue);
+	EXPECT_EQ(0, homa_qdisc_init(qdisc2, NULL, NULL));
 	qdev = list_first_entry_or_null(&self->hnet->qdisc_devs,
 			struct homa_qdisc_dev, links);
 	EXPECT_NE(NULL, qdev);
-	EXPECT_EQ(2, qdev->num_qdiscs);
+	EXPECT_EQ(2, qdev->refs);
 
-	homa_qdisc_destroy(sch2);
-	EXPECT_EQ(1, qdev->num_qdiscs);
-	kfree(sch2);
+	homa_qdisc_destroy(qdisc2);
+	EXPECT_EQ(1, qdev->refs);
 
-	homa_qdisc_destroy(self->qdisc);
+	homa_qdisc_destroy(qdisc);
 	qdev = list_first_entry_or_null(&self->hnet->qdisc_devs,
 			struct homa_qdisc_dev, links);
 	EXPECT_EQ(NULL, qdev);
+	kfree(qdisc);
+	kfree(qdisc2);
 }
 
-TEST_F(homa_qdisc, homa_qdisc_qdev_destroy)
+TEST_F(homa_qdisc, homa_qdisc_set_qixs)
 {
 	struct homa_qdisc_dev *qdev;
 
-	qdev = homa_qdisc_qdev_new(self->hnet, &mock_net_device);
-	EXPECT_FALSE(IS_ERR(qdev));
+	qdev = homa_qdisc_qdev_get(self->hnet, &self->dev);
+
+	/* Simple working case. */
+	homa_qdisc_set_qixs(qdev);
+	EXPECT_EQ(0, qdev->pacer_qix);
+	EXPECT_EQ(1, qdev->redirect_qix);
+
+	/* No qdisc in devnet_queue. */
+	self->txqs[0].qdisc = NULL;
+	homa_qdisc_set_qixs(qdev);
+	EXPECT_EQ(1, qdev->pacer_qix);
+	EXPECT_EQ(2, qdev->redirect_qix);
+
+	/* Qdisc isn't Homa. */
+	self->txqs[2].qdisc->ops = NULL;
+	homa_qdisc_set_qixs(qdev);
+	EXPECT_EQ(1, qdev->pacer_qix);
+	EXPECT_EQ(3, qdev->redirect_qix);
+
+	/* Can't find separate qdisc for redirect_qix. */
+	self->txqs[3].qdisc->ops = NULL;
+	homa_qdisc_set_qixs(qdev);
+	EXPECT_EQ(1, qdev->pacer_qix);
+	EXPECT_EQ(1, qdev->redirect_qix);
+
+	/* Can't find any Homa qdiscs. */
+	self->txqs[1].qdisc->ops = NULL;
+	homa_qdisc_set_qixs(qdev);
+	EXPECT_EQ(-1, qdev->pacer_qix);
+	EXPECT_EQ(-1, qdev->redirect_qix);
+
+	homa_qdisc_qdev_put(qdev);
+}
 
-	/* The test infrastructure will warn if these packets aren't all
-	 * freed. 
- */ - homa_qdisc_srpt_enqueue(&qdev->homa_deferred, new_test_skb("msg1", 80)); - homa_qdisc_srpt_enqueue(&qdev->homa_deferred, new_test_skb("msg1", 60)); - homa_qdisc_srpt_enqueue(&qdev->tcp_deferred, new_test_skb("msg3", 20)); +TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_homa_packet) +{ + struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct sk_buff *skb, *to_free; + struct homa_qdisc *q; + u64 idle; + + EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); + q = qdisc_priv(qdisc); + idle = mock_clock + 1 + self->homa.pacer->max_nic_queue_cycles + 1; + atomic64_set(&q->qdev->link_idle_time, idle); + skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); + qdisc_skb_cb(skb)->pkt_len = 1500; + to_free = NULL; + + EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, qdisc, &to_free)); + EXPECT_EQ(NULL, to_free); + EXPECT_EQ(1, q->qdev->homa_deferred.qlen); + EXPECT_STREQ("wake_up_process pid 0", unit_log_get()); + + homa_qdisc_destroy(qdisc); + kfree(qdisc); +} +TEST_F(homa_qdisc, homa_qdisc_enqueue__short_packet) +{ + struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct sk_buff *skb, *to_free; + struct homa_qdisc *q; + + EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); + q = qdisc_priv(qdisc); + atomic64_set(&q->qdev->link_idle_time, 1000000); + q->ix = 3; + skb = mock_skb_alloc(&self->addr, &self->data.common, 100, 0); + qdisc_skb_cb(skb)->pkt_len = 100; + to_free = NULL; + unit_log_clear(); - homa_qdisc_qdev_destroy(qdev); + EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, qdisc, &to_free)); + EXPECT_EQ(NULL, to_free); + EXPECT_EQ(0, q->qdev->homa_deferred.qlen); + EXPECT_EQ(1, qdisc->q.qlen); + EXPECT_STREQ("", unit_log_get()); + EXPECT_LT(1000000, atomic64_read(&q->qdev->link_idle_time)); + + homa_qdisc_destroy(qdisc); + kfree(qdisc); +} +TEST_F(homa_qdisc, homa_qdisc_enqueue__packet_not_homa) +{ + struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct sk_buff *skb, *to_free; + struct homa_qdisc *q; + + EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); + q = qdisc_priv(qdisc); + atomic64_set(&q->qdev->link_idle_time, 1000000); + q->ix = 3; + skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); + qdisc_skb_cb(skb)->pkt_len = 1500; + if (skb_is_ipv6(skb)) + ipv6_hdr(skb)->nexthdr = IPPROTO_TCP; + else + ip_hdr(skb)->protocol = IPPROTO_TCP; + to_free = NULL; + unit_log_clear(); + + homa_qdisc_enqueue(skb, qdisc, &to_free); + EXPECT_EQ(NULL, to_free); + EXPECT_EQ(0, q->qdev->homa_deferred.qlen); + EXPECT_EQ(1, qdisc->q.qlen); + EXPECT_STREQ("", unit_log_get()); + EXPECT_LT(1000000, atomic64_read(&q->qdev->link_idle_time)); + + homa_qdisc_destroy(qdisc); + kfree(qdisc); +} +TEST_F(homa_qdisc, homa_qdisc_enqueue__drop_packet_queue_over_limit) +{ + struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct sk_buff *skb, *to_free; + struct homa_qdisc *q; + + EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); + q = qdisc_priv(qdisc); + q->ix = 3; + skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); + qdisc->limit = 1; + qdisc->q.qlen = 5; + to_free = NULL; + unit_log_clear(); + + EXPECT_EQ(NET_XMIT_DROP, homa_qdisc_enqueue(skb, qdisc, &to_free)); + ASSERT_NE(NULL, to_free); + EXPECT_EQ(0, q->qdev->homa_deferred.qlen); + EXPECT_EQ(5, qdisc->q.qlen); + + kfree_skb(to_free); + homa_qdisc_destroy(qdisc); + kfree(qdisc); +} +TEST_F(homa_qdisc, homa_qdisc_enqueue__use_special_queue) +{ + struct sk_buff *skb, *to_free; + struct homa_qdisc *q; + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[1], NULL, NULL)); + EXPECT_EQ(0, 
homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + q = qdisc_priv(self->qdiscs[1]); + q->qdev->pacer_qix = 1; + q->qdev->redirect_qix = 3; + skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); + unit_log_clear(); + + EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, self->qdiscs[1], + &to_free)); + ASSERT_NE(NULL, to_free); + EXPECT_EQ(0, q->qdev->homa_deferred.qlen); + EXPECT_EQ(0, self->qdiscs[1]->q.qlen); + EXPECT_EQ(1, self->qdiscs[3]->q.qlen); + + homa_qdisc_destroy(self->qdiscs[1]); + homa_qdisc_destroy(self->qdiscs[3]); } TEST_F(homa_qdisc, homa_qdisc_srpt_enqueue__basics) @@ -370,7 +594,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer_main__basics) { struct homa_qdisc_dev *qdev; - qdev = homa_qdisc_qdev_new(self->hnet, &mock_net_device); + qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device); EXPECT_FALSE(IS_ERR(qdev)); unit_hook_register(pacer_sleep_hook); @@ -381,14 +605,256 @@ TEST_F(homa_qdisc, homa_qdisc_pacer_main__basics) homa_qdisc_pacer_main(qdev); EXPECT_EQ(400, homa_metrics_per_cpu()->pacer_cycles); - homa_qdisc_qdev_destroy(qdev); + homa_qdisc_qdev_put(qdev); +} + +TEST_F(homa_qdisc, homa_qdisc_pacer__queue_empty) +{ + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device); + unit_log_clear(); + + homa_qdisc_pacer(qdev); + EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(0, atomic64_read(&qdev->link_idle_time)); + + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_pacer__enqueue_packet) +{ + struct homa_qdisc_dev *qdev; + u64 link_idle; + + qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + link_idle = atomic64_read(&qdev->link_idle_time); + homa_qdisc_srpt_enqueue(&qdev->homa_deferred, + new_test_skb("msg1", 1000)); + EXPECT_EQ(1, qdev->homa_deferred.qlen); + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + qdev->pacer_qix = 3; + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + unit_log_clear(); + + homa_qdisc_pacer(qdev); + EXPECT_EQ(0, qdev->homa_deferred.qlen); + EXPECT_EQ(1, self->qdiscs[3]->q.qlen); + EXPECT_LT(link_idle, atomic64_read(&qdev->link_idle_time)); + + homa_qdisc_destroy(self->qdiscs[3]); + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_pacer__pacer_lock_unavailable) +{ + struct homa_qdisc_dev *qdev; + u64 link_idle; + + qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + link_idle = atomic64_read(&qdev->link_idle_time); + homa_qdisc_srpt_enqueue(&qdev->homa_deferred, + new_test_skb("msg1", 1000)); + EXPECT_EQ(1, qdev->homa_deferred.qlen); + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + qdev->pacer_qix = 3; + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + unit_log_clear(); + + mock_trylock_errors = 1; + homa_qdisc_pacer(qdev); + EXPECT_EQ(1, qdev->homa_deferred.qlen); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + EXPECT_EQ(link_idle, atomic64_read(&qdev->link_idle_time)); + + homa_qdisc_destroy(self->qdiscs[3]); + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_pacer__spin_until_link_idle) +{ + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + qdev->pacer_qix = 3; + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + homa_qdisc_srpt_enqueue(&qdev->homa_deferred, + new_test_skb("msg1", 1000)); + + mock_clock = 0; + mock_clock_tick = 1000; + atomic64_set(&qdev->link_idle_time, 10000); + self->homa.pacer->max_nic_queue_cycles = 3500; + unit_log_clear(); + + homa_qdisc_pacer(qdev); + EXPECT_EQ(0, qdev->homa_deferred.qlen); + EXPECT_EQ(1, self->qdiscs[3]->q.qlen); + + /* 
Packet will get transmitted when mock_clock ticks to 7000, but + * clock ticks once more in homa_qdisc_update_link_idle, then once + * in homa_qdisc_pacer before it returns. + */ + EXPECT_EQ(9000, mock_clock); + + homa_qdisc_destroy(self->qdiscs[3]); + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_pacer__return_after_one_packet) +{ + struct homa_qdisc_dev *qdev; + struct sk_buff *skb; + + qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + qdev->pacer_qix = 3; + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + + skb = new_test_skb("msg1", 1000); + qdisc_skb_cb(skb)->pkt_len = 1500; + homa_qdisc_srpt_enqueue(&qdev->homa_deferred, skb); + skb = new_test_skb("msg2", 1000); + qdisc_skb_cb(skb)->pkt_len = 1500; + homa_qdisc_srpt_enqueue(&qdev->homa_deferred, skb); + EXPECT_EQ(2, qdev->homa_deferred.qlen); + + mock_clock = atomic64_read(&qdev->link_idle_time); + self->homa.pacer->max_nic_queue_cycles = 100; + unit_log_clear(); + + homa_qdisc_pacer(qdev); + EXPECT_EQ(1, qdev->homa_deferred.qlen); + EXPECT_EQ(1, self->qdiscs[3]->q.qlen); + EXPECT_LT(mock_clock + 100, atomic64_read(&qdev->link_idle_time)); + + homa_qdisc_destroy(self->qdiscs[3]); + homa_qdisc_qdev_put(qdev); +} + +TEST_F(homa_qdisc, homa_qdisc_enqueue_special__use_pacer_qix) +{ + struct sk_buff *skb; + struct homa_qdisc_dev *qdev; + int status; + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[1], NULL, NULL)); + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + qdev = ((struct homa_qdisc *) qdisc_priv(self->qdiscs[1]))->qdev; + qdev->pacer_qix = 1; + qdev->redirect_qix = 3; + skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); + unit_log_clear(); + + status = homa_qdisc_enqueue_special(skb, qdev, true); + EXPECT_EQ(NET_XMIT_SUCCESS, status); + EXPECT_EQ(1, self->qdiscs[1]->q.qlen); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + EXPECT_EQ(1, mock_netif_schedule_calls); + + homa_qdisc_destroy(self->qdiscs[1]); + homa_qdisc_destroy(self->qdiscs[3]); +} +TEST_F(homa_qdisc, homa_qdisc_enqueue_special__use_redirect_qix) +{ + struct sk_buff *skb; + struct homa_qdisc_dev *qdev; + int status; + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[1], NULL, NULL)); + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + qdev = ((struct homa_qdisc *) qdisc_priv(self->qdiscs[1]))->qdev; + qdev->pacer_qix = 1; + qdev->redirect_qix = 3; + skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); + unit_log_clear(); + + status = homa_qdisc_enqueue_special(skb, qdev, false); + EXPECT_EQ(NET_XMIT_SUCCESS, status); + EXPECT_EQ(0, self->qdiscs[1]->q.qlen); + EXPECT_EQ(1, self->qdiscs[3]->q.qlen); + + homa_qdisc_destroy(self->qdiscs[1]); + homa_qdisc_destroy(self->qdiscs[3]); +} +TEST_F(homa_qdisc, homa_qdisc_enqueue_special__redirect_qix_invalid) +{ + struct sk_buff *skb; + struct homa_qdisc_dev *qdev; + int status; + int i; + + for (i = 0; i < 4; i++) + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[i], NULL, NULL)); + qdev = ((struct homa_qdisc *) qdisc_priv(self->qdiscs[0]))->qdev; + qdev->pacer_qix = 3; + qdev->redirect_qix = 5; + skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); + unit_log_clear(); + + status = homa_qdisc_enqueue_special(skb, qdev, false); + EXPECT_EQ(NET_XMIT_SUCCESS, status); + EXPECT_EQ(1, self->qdiscs[1]->q.qlen); + EXPECT_EQ(0, qdev->pacer_qix); + EXPECT_EQ(1, qdev->redirect_qix); + + for (i = 0; i < 4; i++) + homa_qdisc_destroy(self->qdiscs[i]); +} +TEST_F(homa_qdisc, 
homa_qdisc_enqueue_special__redirect_qix_not_a_homa_qdisc) +{ + struct sk_buff *skb; + struct homa_qdisc_dev *qdev; + int status; + int i; + + for (i = 0; i < 4; i++) + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[i], NULL, NULL)); + qdev = ((struct homa_qdisc *) qdisc_priv(self->qdiscs[0]))->qdev; + qdev->pacer_qix = 3; + qdev->redirect_qix = 0; + self->qdiscs[0]->ops = NULL; + skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); + unit_log_clear(); + + status = homa_qdisc_enqueue_special(skb, qdev, false); + EXPECT_EQ(NET_XMIT_SUCCESS, status); + EXPECT_EQ(1, self->qdiscs[2]->q.qlen); + EXPECT_EQ(1, qdev->pacer_qix); + EXPECT_EQ(2, qdev->redirect_qix); + + for (i = 0; i < 4; i++) + homa_qdisc_destroy(self->qdiscs[i]); +} +TEST_F(homa_qdisc, homa_qdisc_enqueue_special__no_suitable_qdisc) +{ + struct sk_buff *skb; + struct homa_qdisc_dev *qdev; + int status; + int i; + + for (i = 0; i < 4; i++) { + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[i], NULL, NULL)); + self->qdiscs[i]->ops = NULL; + } + qdev = ((struct homa_qdisc *) qdisc_priv(self->qdiscs[0]))->qdev; + qdev->pacer_qix = 3; + qdev->redirect_qix = 0; + skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); + unit_log_clear(); + + status = homa_qdisc_enqueue_special(skb, qdev, false); + EXPECT_EQ(NET_XMIT_DROP, status); + EXPECT_EQ(-1, qdev->pacer_qix); + EXPECT_EQ(-1, qdev->redirect_qix); + EXPECT_EQ(0, mock_netif_schedule_calls); + + for (i = 0; i < 4; i++) + homa_qdisc_destroy(self->qdiscs[i]); } TEST_F(homa_qdisc, homa_qdisc_update_sysctl__basics) { struct homa_qdisc_dev *qdev; - qdev = homa_qdisc_qdev_new(self->hnet, &mock_net_device); + qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device); EXPECT_FALSE(IS_ERR(qdev)); self->homa.link_mbps = 25000; @@ -397,13 +863,13 @@ TEST_F(homa_qdisc, homa_qdisc_update_sysctl__basics) EXPECT_EQ(8000, qdev->link_mbps); EXPECT_EQ(1059061, qdev->cycles_per_mibyte); - homa_qdisc_qdev_destroy(qdev); + homa_qdisc_qdev_put(qdev); } TEST_F(homa_qdisc, homa_qdisc_update_sysctl__cant_get_link_speed_from_dev) { struct homa_qdisc_dev *qdev; - qdev = homa_qdisc_qdev_new(self->hnet, &mock_net_device); + qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device); EXPECT_FALSE(IS_ERR(qdev)); self->homa.link_mbps = 16000; @@ -413,28 +879,30 @@ TEST_F(homa_qdisc, homa_qdisc_update_sysctl__cant_get_link_speed_from_dev) EXPECT_EQ(16000, qdev->link_mbps); EXPECT_EQ(529530, qdev->cycles_per_mibyte); - homa_qdisc_qdev_destroy(qdev); + homa_qdisc_qdev_put(qdev); } TEST_F(homa_qdisc, homa_qdisc_update_all_sysctl) { - struct homa_qdisc *q, *q2; - struct netdev_queue net_queue2; + struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct netdev_queue txq2; struct net_device net_device2; - struct Qdisc *sch2; + struct homa_qdisc *q, *q2; + struct Qdisc *qdisc2; - memset(&net_queue2, 0, sizeof(net_queue2)); + memset(&txq2, 0, sizeof(txq2)); memset(&net_device2, 0, sizeof(net_device2)); - net_queue2.dev = &net_device2; + txq2.dev = &net_device2; net_device2.nd_net.net = &mock_nets[0]; - sch2 = mock_qdisc_new(&net_queue2); + qdisc2 = mock_alloc_qdisc(&txq2); + self->homa.link_mbps = 16000; mock_link_mbps = 40000; - EXPECT_EQ(0, homa_qdisc_init(self->qdisc, NULL, NULL)); - EXPECT_EQ(0, homa_qdisc_init(sch2, NULL, NULL)); - q = qdisc_priv(self->qdisc); - q2 = qdisc_priv(sch2); + EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); + EXPECT_EQ(0, homa_qdisc_init(qdisc2, NULL, NULL)); + q = qdisc_priv(qdisc); + q2 = qdisc_priv(qdisc2); EXPECT_EQ(40000, q->qdev->link_mbps); EXPECT_EQ(16000, 
q2->qdev->link_mbps); @@ -445,7 +913,8 @@ TEST_F(homa_qdisc, homa_qdisc_update_all_sysctl) EXPECT_EQ(8000, q->qdev->link_mbps); EXPECT_EQ(25000, q2->qdev->link_mbps); - homa_qdisc_destroy(self->qdisc); - homa_qdisc_destroy(sch2); - kfree(sch2); + homa_qdisc_destroy(qdisc); + kfree(qdisc); + homa_qdisc_destroy(qdisc2); + kfree(qdisc2); } \ No newline at end of file From 0f50f0d13d6708064df64178ddd4574fe904a499 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 21 Jul 2025 15:43:06 -0700 Subject: [PATCH 407/625] Don't use pacer if homa_qdisc is in use Let homa_qdisc perform the throttling and SRPT output if it is enabled; no need for the pacer. --- homa_outgoing.c | 10 ++++++++-- test/unit_homa_outgoing.c | 25 +++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/homa_outgoing.c b/homa_outgoing.c index 2bd06cb9..834b4eee 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -610,8 +610,14 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) } #endif /* See strip.py */ - if ((rpc->msgout.length - rpc->msgout.next_xmit_offset) - >= homa->pacer->throttle_min_bytes) { +#ifndef __STRIP__ /* See strip.py */ + if (rpc->msgout.length - rpc->msgout.next_xmit_offset > + homa->pacer->throttle_min_bytes && + list_empty(&rpc->hsk->hnet->qdisc_devs)) { +#else /* See strip.py */ + if (rpc->msgout.length - rpc->msgout.next_xmit_offset > + homa->pacer->throttle_min_bytes) { +#endif /* See strip.py */ if (!homa_pacer_check_nic_q(homa->pacer, skb, force)) { tt_record1("homa_xmit_data adding id %u to throttle queue", rpc->id); diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 9d59e201..47f19a4e 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -6,6 +6,7 @@ #include "homa_peer.h" #include "homa_rpc.h" #ifndef __STRIP__ /* See strip.py */ +#include "homa_qdisc.h" #include "homa_skb.h" #else /* See strip.py */ #include "homa_stub.h" @@ -877,6 +878,30 @@ TEST_F(homa_outgoing, homa_xmit_data__force) EXPECT_STREQ("request id 1234, next_offset 2800; " "request id 1236, next_offset 1400", unit_log_get()); } +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_outgoing, homa_xmit_data__dont_throttle_because_homa_qdisc_in_use) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 2000, 1000); + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device); + unit_log_clear(); + atomic64_set(&self->homa.pacer->link_idle_time, 1000000); + self->homa.pacer->max_nic_queue_cycles = 0; + self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; + + homa_rpc_lock(crpc); + homa_xmit_data(crpc, false); + homa_rpc_unlock(crpc); + EXPECT_STREQ("xmit DATA 1400@0; xmit DATA 600@1400", unit_log_get()); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("", unit_log_get()); + homa_qdisc_qdev_put(qdev); +} +#endif /* See strip.py */ TEST_F(homa_outgoing, homa_xmit_data__throttle) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, From 324273d88725548114709bb0ebc0777d10f6a434 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 22 Jul 2025 09:52:31 -0700 Subject: [PATCH 408/625] Fix trivial typo in tthoma.py --- util/tthoma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/tthoma.py b/util/tthoma.py index f89bf365..528b7f3e 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -7782,7 +7782,7 @@ def output(self): ['softirq gets first grant', 'softirq_grant',lambda x : x[0][0]], ['last request 
packet sent', 'send_data', lambda x : x[-1][0]], ['gro gets first response packet','gro_data', lambda x : x[0][0]], - ['softrq gets first response pkt','softirq_data', lambda x : x[0][0]], + ['softirq gets first response pkt','softirq_data', lambda x : x[0][0]], ['sent grant', 'send_grant', lambda x : x[0][0]], ['gro gets last response packet', 'gro_data', lambda x : x[-1][0]], ['homa_recvmsg returning', 'recvmsg_done', lambda x : x] From fa9b90fb544f36a054fd449f36a54ef4ea6ea58f Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 28 Jul 2025 11:24:46 -0700 Subject: [PATCH 409/625] Add --extra option to ttsyslog.py --- util/ttsyslog.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/util/ttsyslog.py b/util/ttsyslog.py index 47316676..16194ded 100755 --- a/util/ttsyslog.py +++ b/util/ttsyslog.py @@ -9,9 +9,11 @@ out with times in microseconds instead of clock cycles. Usage: -ttsyslog.py [file] +ttsyslog.py [--extra file2] [file] If no file is given, the information is read from standard input. +If "--extra file2" is specified, all of the lines that are *not* valid +timetrace records are output to file file2. """ from __future__ import division, print_function @@ -32,6 +34,11 @@ # Time in cycles of previous event. prev_time = 0 +extra = None +if (len(sys.argv) > 2): + if sys.argv[1] == '--extra': + extra = open(sys.argv[2], 'w') + del sys.argv[1:3] f = sys.stdin if len(sys.argv) > 1: f = open(sys.argv[1]) @@ -67,5 +74,12 @@ (this_time - prev_time)/(1000.0 * cpu_ghz), this_event)) prev_time = this_time +if extra: + for line in lines: + if not re.match('.* ([0-9.]+) (\[C..\] .+)', line): + extra.write(line) + extra.write('\n') + extra.close() + if cpu_ghz == None: print("Couldn't find initial line with clock speed", file=sys.stderr) \ No newline at end of file From 21e725601fb1ab9775308c279c6c53ab42fb5d27 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 28 Jul 2025 11:31:05 -0700 Subject: [PATCH 410/625] Fix checkpatch.pl issues in homa_qdisc* Many whitespace problems (spaces instead of tabs). --- homa_qdisc.c | 626 +++++++++++++++++++++++++-------------------------- homa_qdisc.h | 8 +- 2 files changed, 317 insertions(+), 317 deletions(-) diff --git a/homa_qdisc.c b/homa_qdisc.c index 6f3636f3..d0f90d79 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -20,15 +20,15 @@ #include static struct Qdisc_ops homa_qdisc_ops __read_mostly = { - .id = "homa", - .priv_size = sizeof(struct homa_qdisc), - .enqueue = homa_qdisc_enqueue, - .dequeue = qdisc_dequeue_head, - .peek = qdisc_peek_head, - .init = homa_qdisc_init, - .reset = qdisc_reset_queue, - .destroy = homa_qdisc_destroy, - .owner = THIS_MODULE, + .id = "homa", + .priv_size = sizeof(struct homa_qdisc), + .enqueue = homa_qdisc_enqueue, + .dequeue = qdisc_dequeue_head, + .peek = qdisc_peek_head, + .init = homa_qdisc_init, + .reset = qdisc_reset_queue, + .destroy = homa_qdisc_destroy, + .owner = THIS_MODULE, }; /** @@ -38,7 +38,7 @@ static struct Qdisc_ops homa_qdisc_ops __read_mostly = { */ int homa_qdisc_register(void) { - return register_qdisc(&homa_qdisc_ops); + return register_qdisc(&homa_qdisc_ops); } /** @@ -47,7 +47,7 @@ int homa_qdisc_register(void) */ void homa_qdisc_unregister(void) { - unregister_qdisc(&homa_qdisc_ops); + unregister_qdisc(&homa_qdisc_ops); } /** @@ -59,33 +59,33 @@ void homa_qdisc_unregister(void) * Return: A pointer to the new homa_qdisc_dev, or a PTR_ERR errno. 
*/ struct homa_qdisc_dev *homa_qdisc_qdev_get(struct homa_net *hnet, - struct net_device *dev) + struct net_device *dev) { - struct homa_qdisc_dev *qdev; + struct homa_qdisc_dev *qdev; spin_lock_bh(&hnet->qdisc_devs_lock); - list_for_each_entry(qdev, &hnet->qdisc_devs, links) { - if (qdev->dev == dev) { - qdev->refs++; - goto done; - } - } - - qdev = kzalloc(sizeof(*qdev), GFP_ATOMIC); - if (!qdev) { - qdev = ERR_PTR(-ENOMEM); - goto done; - } - qdev->dev = dev; - qdev->hnet = hnet; - qdev->refs = 1; + list_for_each_entry(qdev, &hnet->qdisc_devs, links) { + if (qdev->dev == dev) { + qdev->refs++; + goto done; + } + } + + qdev = kzalloc(sizeof(*qdev), GFP_ATOMIC); + if (!qdev) { + qdev = ERR_PTR(-ENOMEM); + goto done; + } + qdev->dev = dev; + qdev->hnet = hnet; + qdev->refs = 1; spin_lock_init(&qdev->lock); - qdev->pacer_qix = -1; - qdev->redirect_qix = -1; - homa_qdisc_update_sysctl(qdev); - INIT_LIST_HEAD(&qdev->links); - skb_queue_head_init(&qdev->homa_deferred); - skb_queue_head_init(&qdev->tcp_deferred); + qdev->pacer_qix = -1; + qdev->redirect_qix = -1; + homa_qdisc_update_sysctl(qdev); + INIT_LIST_HEAD(&qdev->links); + skb_queue_head_init(&qdev->homa_deferred); + skb_queue_head_init(&qdev->tcp_deferred); init_waitqueue_head(&qdev->pacer_sleep); spin_lock_init(&qdev->pacer_mutex); @@ -95,16 +95,16 @@ struct homa_qdisc_dev *homa_qdisc_qdev_get(struct homa_net *hnet, int error = PTR_ERR(qdev->pacer_kthread); pr_err("couldn't create homa qdisc pacer thread: error %d\n", - error); - kfree(qdev); - qdev = ERR_PTR(error); - goto done; + error); + kfree(qdev); + qdev = ERR_PTR(error); + goto done; } - list_add(&qdev->links, &hnet->qdisc_devs); + list_add(&qdev->links, &hnet->qdisc_devs); done: - spin_unlock_bh(&hnet->qdisc_devs_lock); - return qdev; + spin_unlock_bh(&hnet->qdisc_devs_lock); + return qdev; } /** @@ -114,19 +114,19 @@ struct homa_qdisc_dev *homa_qdisc_qdev_get(struct homa_net *hnet, */ void homa_qdisc_qdev_put(struct homa_qdisc_dev *qdev) { - struct homa_net *hnet = qdev->hnet; + struct homa_net *hnet = qdev->hnet; spin_lock_bh(&hnet->qdisc_devs_lock); - qdev->refs--; - if (qdev->refs == 0) { - kthread_stop(qdev->pacer_kthread); - qdev->pacer_kthread = NULL; - - __list_del_entry(&qdev->links); - homa_qdisc_srpt_free(&qdev->homa_deferred); - skb_queue_purge(&qdev->tcp_deferred); - kfree(qdev); - } + qdev->refs--; + if (qdev->refs == 0) { + kthread_stop(qdev->pacer_kthread); + qdev->pacer_kthread = NULL; + + __list_del_entry(&qdev->links); + homa_qdisc_srpt_free(&qdev->homa_deferred); + skb_queue_purge(&qdev->tcp_deferred); + kfree(qdev); + } spin_unlock_bh(&hnet->qdisc_devs_lock); } @@ -140,27 +140,27 @@ void homa_qdisc_qdev_put(struct homa_qdisc_dev *qdev) int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { - struct homa_qdisc *q = qdisc_priv(sch); - struct homa_qdisc_dev *qdev; - struct homa_net *hnet; - int i; - - hnet = homa_net_from_net(dev_net(sch->dev_queue->dev)); - qdev = homa_qdisc_qdev_get(hnet, sch->dev_queue->dev); - if (IS_ERR(qdev)) - return PTR_ERR(qdev); - - q->qdev = qdev; - q->ix = -1; - for (i = 0; i < qdev->dev->num_tx_queues; i++) { - if (netdev_get_tx_queue(qdev->dev, i) == sch->dev_queue) { - q->ix = i; - break; - } - } - - sch->limit = 10*1024; - return 0; + struct homa_qdisc *q = qdisc_priv(sch); + struct homa_qdisc_dev *qdev; + struct homa_net *hnet; + int i; + + hnet = homa_net_from_net(dev_net(sch->dev_queue->dev)); + qdev = homa_qdisc_qdev_get(hnet, sch->dev_queue->dev); + if (IS_ERR(qdev)) + return 
PTR_ERR(qdev);
+
+	q->qdev = qdev;
+	q->ix = -1;
+	for (i = 0; i < qdev->dev->num_tx_queues; i++) {
+		if (netdev_get_tx_queue(qdev->dev, i) == sch->dev_queue) {
+			q->ix = i;
+			break;
+		}
+	}
+
+	sch->limit = 10 * 1024;
+	return 0;
 }
 
 /**
@@ -170,10 +170,10 @@
  */
 void homa_qdisc_destroy(struct Qdisc *qdisc)
 {
-	struct homa_qdisc *q = qdisc_priv(qdisc);
+	struct homa_qdisc *q = qdisc_priv(qdisc);
 
-	qdisc_reset_queue(qdisc);
-	homa_qdisc_qdev_put(q->qdev);
+	qdisc_reset_queue(qdisc);
+	homa_qdisc_qdev_put(q->qdev);
 }
 
 /**
@@ -185,35 +185,35 @@
  */
 void homa_qdisc_set_qixs(struct homa_qdisc_dev *qdev)
 {
-	int i, pacer_qix, redirect_qix;
-	struct netdev_queue *txq;
-	struct Qdisc *qdisc;
-
-	/* Note: it's safe for mutliple instances of this function to
-	 * execute concurrently so no synchronization is needed (other
-	 * than using RCU to protect against deletion of the underlying
-	 * data structures).
-	 */
-
-	pacer_qix = -1;
-	redirect_qix = -1;
-	rcu_read_lock();
-	for (i = 0; i < qdev->dev->num_tx_queues; i++) {
-		txq = netdev_get_tx_queue(qdev->dev, i);
-		qdisc = rcu_dereference_bh(txq->qdisc);
-		if (!qdisc || qdisc->ops != &homa_qdisc_ops)
-			continue;
-		if (pacer_qix == -1) {
-			pacer_qix = i;
-			redirect_qix = i;
-		} else {
-			redirect_qix = i;
-			break;
-		}
-	}
-	qdev->pacer_qix = pacer_qix;
-	qdev->redirect_qix = redirect_qix;
-	rcu_read_unlock();
+	int i, pacer_qix, redirect_qix;
+	struct netdev_queue *txq;
+	struct Qdisc *qdisc;
+
+	/* Note: it's safe for multiple instances of this function to
+	 * execute concurrently so no synchronization is needed (other
+	 * than using RCU to protect against deletion of the underlying
+	 * data structures).
+	 */
+
+	pacer_qix = -1;
+	redirect_qix = -1;
+	rcu_read_lock();
+	for (i = 0; i < qdev->dev->num_tx_queues; i++) {
+		txq = netdev_get_tx_queue(qdev->dev, i);
+		qdisc = rcu_dereference_bh(txq->qdisc);
+		if (!qdisc || qdisc->ops != &homa_qdisc_ops)
+			continue;
+		if (pacer_qix == -1) {
+			pacer_qix = i;
+			redirect_qix = i;
+		} else {
+			redirect_qix = i;
+			break;
+		}
+	}
+	qdev->pacer_qix = pacer_qix;
+	qdev->redirect_qix = redirect_qix;
+	rcu_read_unlock();
 }
 
 /**
@@ -223,50 +223,50 @@
  * @to_free: Used when dropping packets.
  */
 int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
-		struct sk_buff **to_free)
+		       struct sk_buff **to_free)
 {
-	struct homa_qdisc *q = qdisc_priv(sch);
-	struct homa_qdisc_dev *qdev = q->qdev;
-	struct homa *homa = qdev->hnet->homa;
-	int pkt_len;
-	int result;
-
-	/* The packet length computed by Linux didn't include overheads
-	 * such as inter-frame gap; add that in here.
-	 */
-	pkt_len = qdisc_skb_cb(skb)->pkt_len + HOMA_ETH_FRAME_OVERHEAD;
-	if (pkt_len < homa->pacer->throttle_min_bytes) {
-		homa_qdisc_update_link_idle(q->qdev, pkt_len, -1);
-		goto enqueue;
-	}
-
-	if (!is_homa_pkt(skb)) {
-		homa_qdisc_update_link_idle(q->qdev, pkt_len, -1);
-		goto enqueue;
-	}
-
-	if (homa_qdisc_update_link_idle(q->qdev, pkt_len,
-			homa->pacer->max_nic_queue_cycles))
-		goto enqueue;
-
-	/* This packet needs to be deferred until the NIC queue has
-	 * been drained a bit. 
- */ - homa_qdisc_srpt_enqueue(&qdev->homa_deferred, skb); + struct homa_qdisc *q = qdisc_priv(sch); + struct homa_qdisc_dev *qdev = q->qdev; + struct homa *homa = qdev->hnet->homa; + int pkt_len; + int result; + + /* The packet length computed by Linux didn't include overheads + * such as inter-frame gap; add that in here. + */ + pkt_len = qdisc_skb_cb(skb)->pkt_len + HOMA_ETH_FRAME_OVERHEAD; + if (pkt_len < homa->pacer->throttle_min_bytes) { + homa_qdisc_update_link_idle(q->qdev, pkt_len, -1); + goto enqueue; + } + + if (!is_homa_pkt(skb)) { + homa_qdisc_update_link_idle(q->qdev, pkt_len, -1); + goto enqueue; + } + + if (homa_qdisc_update_link_idle(q->qdev, pkt_len, + homa->pacer->max_nic_queue_cycles)) + goto enqueue; + + /* This packet needs to be deferred until the NIC queue has + * been drained a bit. + */ + homa_qdisc_srpt_enqueue(&qdev->homa_deferred, skb); wake_up(&qdev->pacer_sleep); - return NET_XMIT_SUCCESS; + return NET_XMIT_SUCCESS; enqueue: - if (q->ix != qdev->pacer_qix) { - if (unlikely(sch->q.qlen >= READ_ONCE(sch->limit))) - return qdisc_drop(skb, sch, to_free); - spin_lock_bh(qdisc_lock(sch)); - result = qdisc_enqueue_tail(skb, sch); - spin_unlock_bh(qdisc_lock(sch)); - } else { - result = homa_qdisc_enqueue_special(skb, qdev, false); - } - return result; + if (q->ix != qdev->pacer_qix) { + if (unlikely(sch->q.qlen >= READ_ONCE(sch->limit))) + return qdisc_drop(skb, sch, to_free); + spin_lock_bh(qdisc_lock(sch)); + result = qdisc_enqueue_tail(skb, sch); + spin_unlock_bh(qdisc_lock(sch)); + } else { + result = homa_qdisc_enqueue_special(skb, qdev, false); + } + return result; } /** @@ -277,54 +277,54 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, */ void homa_qdisc_srpt_enqueue(struct sk_buff_head *list, struct sk_buff *skb) { - struct homa_skb_info *info = homa_get_skb_info(skb); - struct sk_buff *other; + struct homa_skb_info *info = homa_get_skb_info(skb); + struct sk_buff *other; unsigned long flags; - /* Tricky point: only one packet from an RPC may appear in - * qdev->homa_deferred at once (the earliest one in the message). - * If later packets from the same message were also in the queue, - * they would have higher priorities and would get transmitted - * first, which we don't want. So, if more than one packet from - * a message is waiting, only the first appears in qdev->homa_deferred; - * the others are queued up using links in the homa_skb_info of - * the first packet. - * - * This also means that we must scan the list starting at the - * low-priority end, so we'll notice if there is an earlier - * (lower priority) packet for the same RPC already in the list. - */ - - info->next_sibling = NULL; - info->last_sibling = NULL; + /* Tricky point: only one packet from an RPC may appear in + * qdev->homa_deferred at once (the earliest one in the message). + * If later packets from the same message were also in the queue, + * they would have higher priorities and would get transmitted + * first, which we don't want. So, if more than one packet from + * a message is waiting, only the first appears in qdev->homa_deferred; + * the others are queued up using links in the homa_skb_info of + * the first packet. + * + * This also means that we must scan the list starting at the + * low-priority end, so we'll notice if there is an earlier + * (lower priority) packet for the same RPC already in the list. 
+ */ + + info->next_sibling = NULL; + info->last_sibling = NULL; spin_lock_irqsave(&list->lock, flags); - if (skb_queue_empty(list)) { - __skb_queue_head(list, skb); - goto done; - } - skb_queue_reverse_walk(list, other) { - struct homa_skb_info *other_info = homa_get_skb_info(other); - - if (other_info->rpc == info->rpc) { - if (!other_info->last_sibling) - other_info->next_sibling = skb; - else - homa_get_skb_info(other_info->last_sibling)-> - next_sibling = skb; - other_info->last_sibling = skb; - break; - } - - if (other_info->bytes_left <= info->bytes_left) { - __skb_queue_after(list, other, skb); - break; - } - - if (skb_queue_is_first(list, other)) { - __skb_queue_head(list, skb); - break; - } - } + if (skb_queue_empty(list)) { + __skb_queue_head(list, skb); + goto done; + } + skb_queue_reverse_walk(list, other) { + struct homa_skb_info *other_info = homa_get_skb_info(other); + + if (other_info->rpc == info->rpc) { + if (!other_info->last_sibling) + other_info->next_sibling = skb; + else + homa_get_skb_info(other_info->last_sibling)-> + next_sibling = skb; + other_info->last_sibling = skb; + break; + } + + if (other_info->bytes_left <= info->bytes_left) { + __skb_queue_after(list, other, skb); + break; + } + + if (skb_queue_is_first(list, other)) { + __skb_queue_head(list, skb); + break; + } + } done: spin_unlock_irqrestore(&list->lock, flags); @@ -338,38 +338,38 @@ void homa_qdisc_srpt_enqueue(struct sk_buff_head *list, struct sk_buff *skb) */ struct sk_buff *homa_qdisc_srpt_dequeue(struct sk_buff_head *list) { - struct homa_skb_info *sibling_info; - struct sk_buff *skb, *sibling; - struct homa_skb_info *info; + struct homa_skb_info *sibling_info; + struct sk_buff *skb, *sibling; + struct homa_skb_info *info; unsigned long flags; - /* The only tricky element about this function is that skb may - * have a sibling list. If so, we need to enqueue the next - * sibling. - */ + /* The only tricky element about this function is that skb may + * have a sibling list. If so, we need to enqueue the next + * sibling. + */ spin_lock_irqsave(&list->lock, flags); - if (skb_queue_empty(list)) { - spin_unlock_irqrestore(&list->lock, flags); - return NULL; - } - skb = list->next; - __skb_unlink(skb, list); - info = homa_get_skb_info(skb); - if (info->next_sibling) { - /* This is a "compound" packet, containing multiple - * packets from the same RPC. Put the next packet - * back on the list at the front (it should have even - * higher priority than skb, since it is later in the - * message). - */ - sibling = info->next_sibling; - sibling_info = homa_get_skb_info(sibling); - sibling_info->last_sibling = info->last_sibling; - __skb_queue_head(list, sibling); - } + if (skb_queue_empty(list)) { + spin_unlock_irqrestore(&list->lock, flags); + return NULL; + } + skb = list->next; + __skb_unlink(skb, list); + info = homa_get_skb_info(skb); + if (info->next_sibling) { + /* This is a "compound" packet, containing multiple + * packets from the same RPC. Put the next packet + * back on the list at the front (it should have even + * higher priority than skb, since it is later in the + * message). 
+ */ + sibling = info->next_sibling; + sibling_info = homa_get_skb_info(sibling); + sibling_info->last_sibling = info->last_sibling; + __skb_queue_head(list, sibling); + } spin_unlock_irqrestore(&list->lock, flags); - return skb; + return skb; } /** @@ -381,14 +381,14 @@ struct sk_buff *homa_qdisc_srpt_dequeue(struct sk_buff_head *list) */ void homa_qdisc_srpt_free(struct sk_buff_head *list) { - struct sk_buff *skb; - - while (1) { - skb = homa_qdisc_srpt_dequeue(list); - if (!skb) - break; - kfree_skb(skb); - } + struct sk_buff *skb; + + while (1) { + skb = homa_qdisc_srpt_dequeue(list); + if (!skb) + break; + kfree_skb(skb); + } } /** @@ -409,33 +409,33 @@ void homa_qdisc_srpt_free(struct sk_buff_head *list) * if the queue was too long. */ int homa_qdisc_update_link_idle(struct homa_qdisc_dev *qdev, int bytes, - int max_queue_cycles) + int max_queue_cycles) { u64 idle, new_idle, clock, cycles_for_packet; cycles_for_packet = qdev->cycles_per_mibyte; cycles_for_packet = (cycles_for_packet * - (bytes + HOMA_ETH_FRAME_OVERHEAD)) >> 20; + (bytes + HOMA_ETH_FRAME_OVERHEAD)) >> 20; - /* The following loop may be executed multiple times if there - * are conflicting udpates to qdev->link_idle_time. - */ + /* The following loop may be executed multiple times if there + * are conflicting updates to qdev->link_idle_time. + */ while (1) { clock = homa_clock(); idle = atomic64_read(&qdev->link_idle_time); if (idle < clock) { new_idle = clock + cycles_for_packet; } else { - if (max_queue_cycles >= 0 && (idle - clock) > - max_queue_cycles) - return 0; + if (max_queue_cycles >= 0 && (idle - clock) > + max_queue_cycles) + return 0; new_idle = idle + cycles_for_packet; - } + } if (atomic64_cmpxchg_relaxed(&qdev->link_idle_time, idle, new_idle) == idle) break; - INC_METRIC(idle_time_conflicts, 1); + INC_METRIC(idle_time_conflicts, 1); } return 1; } @@ -450,20 +450,20 @@ int homa_qdisc_update_link_idle(struct homa_qdisc_dev *qdev, int bytes, int homa_qdisc_pacer_main(void *device) { struct homa_qdisc_dev *qdev = device; - int status; - u64 start; + int status; + u64 start; while (1) { if (kthread_should_stop()) break; - start = homa_clock(); + start = homa_clock(); homa_qdisc_pacer(qdev); INC_METRIC(pacer_cycles, homa_clock() - start); if (!skb_queue_empty(&qdev->homa_deferred) || - !skb_queue_empty(&qdev->tcp_deferred)) { + !skb_queue_empty(&qdev->tcp_deferred)) { /* There are more packets to transmit (the NIC queue - * must be full); call the pacer again, but first + * must be full); call the pacer again, but first * give other threads a chance to run (otherwise * low-level packet processing such as softirq could * starve). @@ -475,8 +475,8 @@ int homa_qdisc_pacer_main(void *device) tt_record("homa_qdisc pacer sleeping"); status = wait_event_interruptible(qdev->pacer_sleep, kthread_should_stop() || - !skb_queue_empty(&qdev->homa_deferred) || - !skb_queue_empty(&qdev->tcp_deferred)); + !skb_queue_empty(&qdev->homa_deferred) || + !skb_queue_empty(&qdev->tcp_deferred)); tt_record1("homa_qdisc pacer woke up with status %d", status); if (status != 0 && status != -ERESTARTSYS) break; @@ -516,14 +516,14 @@ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev) * homa_qdisc_pacer_main about interfering with softirq handlers). */ for (i = 0; i < 5; i++) { - struct sk_buff *skb; + struct sk_buff *skb; u64 idle_time, now; /* If the NIC queue is too long, wait until it gets shorter. 
*/ now = homa_clock(); idle_time = atomic64_read(&qdev->link_idle_time); while ((now + qdev->hnet->homa->pacer->max_nic_queue_cycles) < - idle_time) { + idle_time) { /* If we've xmitted at least one packet then * return (this helps with testing and also * allows homa_qdisc_pacer_main to yield the core). @@ -537,12 +537,12 @@ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev) * still too long because other threads have queued packets, * but we transmit anyway so the pacer thread doesn't starve. */ - skb = homa_qdisc_srpt_dequeue(&qdev->homa_deferred); - if (!skb) - break; - homa_qdisc_update_link_idle(qdev, qdisc_skb_cb(skb)->pkt_len, - -1); - homa_qdisc_enqueue_special(skb, qdev, true); + skb = homa_qdisc_srpt_dequeue(&qdev->homa_deferred); + if (!skb) + break; + homa_qdisc_update_link_idle(qdev, qdisc_skb_cb(skb)->pkt_len, + -1); + homa_qdisc_enqueue_special(skb, qdev, true); } done: spin_unlock_bh(&qdev->pacer_mutex); @@ -560,44 +560,44 @@ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev) * Return: Standard enqueue return code (usually NET_XMIT_SUCCESS). */ int homa_qdisc_enqueue_special(struct sk_buff *skb, - struct homa_qdisc_dev *qdev, bool pacer) + struct homa_qdisc_dev *qdev, bool pacer) { - struct netdev_queue *txq; - struct Qdisc *qdisc; - int result; - int qix; - int i; + struct netdev_queue *txq; + struct Qdisc *qdisc; + int result; + int qix; + int i; rcu_read_lock(); - /* Must make sure that the queue index is still valid (refers - * to a Homa qdisc). - */ - for (i = 0; ; i++) { - qix = pacer ? qdev->pacer_qix : qdev->redirect_qix; - if (qix >= 0 && qix < qdev->dev->num_tx_queues) { - txq = netdev_get_tx_queue(qdev->dev, qix); - qdisc = rcu_dereference_bh(txq->qdisc); - if (qdisc->ops== &homa_qdisc_ops) - break; - } - if (i > 0) { - /* Couldn't find a Homa qdisc to use; drop the skb. */ - kfree_skb(skb); - result = NET_XMIT_DROP; - goto done; - } - homa_qdisc_set_qixs(qdev); - } - - spin_lock_bh(qdisc_lock(qdisc)); - result = qdisc_enqueue_tail(skb, qdisc); - spin_unlock_bh(qdisc_lock(qdisc)); - netif_schedule_queue(txq); + /* Must make sure that the queue index is still valid (refers + * to a Homa qdisc). + */ + for (i = 0; ; i++) { + qix = pacer ? qdev->pacer_qix : qdev->redirect_qix; + if (qix >= 0 && qix < qdev->dev->num_tx_queues) { + txq = netdev_get_tx_queue(qdev->dev, qix); + qdisc = rcu_dereference_bh(txq->qdisc); + if (qdisc->ops == &homa_qdisc_ops) + break; + } + if (i > 0) { + /* Couldn't find a Homa qdisc to use; drop the skb. */ + kfree_skb(skb); + result = NET_XMIT_DROP; + goto done; + } + homa_qdisc_set_qixs(qdev); + } + + spin_lock_bh(qdisc_lock(qdisc)); + result = qdisc_enqueue_tail(skb, qdisc); + spin_unlock_bh(qdisc_lock(qdisc)); + netif_schedule_queue(txq); done: rcu_read_unlock(); - return result; + return result; } /** @@ -608,38 +608,38 @@ int homa_qdisc_enqueue_special(struct sk_buff *skb, */ void homa_qdisc_update_sysctl(struct homa_qdisc_dev *qdev) { - struct ethtool_link_ksettings ksettings; - struct homa *homa = qdev->hnet->homa; - const struct ethtool_ops *ops; + struct ethtool_link_ksettings ksettings; + struct homa *homa = qdev->hnet->homa; + const struct ethtool_ops *ops; u64 tmp; - qdev->link_mbps = homa->link_mbps; - ops = qdev->dev->ethtool_ops; - if (ops && ops->get_link_ksettings) { - if (ops->get_link_ksettings(qdev->dev, &ksettings) == 0) - qdev->link_mbps = ksettings.base.speed; - } - - /* Underestimate link bandwidth (overestimate time) by 1%. 
- * - * cycles/sec - * cycles/mibyte = (101/100) * ------------- - * mibytes/sec - * - * 101 * homa_clock_khz() * 1000 - * = --------------------------------------- - * 100 * link_mbps * (1<<20 / 1000000) / 8 - * - * 8 * 1010 * homa_clock_khz() 1<<20 - * = ----------------------------- * --------- - * link_mbps 1000000 - */ + qdev->link_mbps = homa->link_mbps; + ops = qdev->dev->ethtool_ops; + if (ops && ops->get_link_ksettings) { + if (ops->get_link_ksettings(qdev->dev, &ksettings) == 0) + qdev->link_mbps = ksettings.base.speed; + } + + /* Underestimate link bandwidth (overestimate time) by 1%. + * + * cycles/sec + * cycles/mibyte = (101/100) * ------------- + * mibytes/sec + * + * 101 * homa_clock_khz() * 1000 + * = --------------------------------------- + * 100 * link_mbps * (1<<20 / 1000000) / 8 + * + * 8 * 1010 * homa_clock_khz() 1<<20 + * = ----------------------------- * --------- + * link_mbps 1000000 + */ tmp = 8ULL * 1010; - tmp *= homa_clock_khz(); - do_div(tmp, qdev->link_mbps); - tmp <<= 20; - do_div(tmp, 1000000); - qdev->cycles_per_mibyte = tmp; + tmp *= homa_clock_khz(); + do_div(tmp, qdev->link_mbps); + tmp <<= 20; + do_div(tmp, 1000000); + qdev->cycles_per_mibyte = tmp; } /** @@ -649,10 +649,10 @@ void homa_qdisc_update_sysctl(struct homa_qdisc_dev *qdev) */ void homa_qdisc_update_all_sysctl(struct homa_net *hnet) { - struct homa_qdisc_dev *qdev; + struct homa_qdisc_dev *qdev; spin_lock_bh(&hnet->qdisc_devs_lock); - list_for_each_entry(qdev, &hnet->qdisc_devs, links) - homa_qdisc_update_sysctl(qdev); + list_for_each_entry(qdev, &hnet->qdisc_devs, links) + homa_qdisc_update_sysctl(qdev); spin_unlock_bh(&hnet->qdisc_devs_lock); } diff --git a/homa_qdisc.h b/homa_qdisc.h index 65e6757e..2424ffe8 100644 --- a/homa_qdisc.h +++ b/homa_qdisc.h @@ -40,7 +40,7 @@ struct homa_qdisc_dev { struct net_device *dev; /** - * @homa_net: Homa's information about the network namesapce + * @homa_net: Homa's information about the network namespace * this object belongs to. */ struct homa_net *hnet; @@ -147,7 +147,7 @@ void homa_qdisc_destroy(struct Qdisc *sch); int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free); int homa_qdisc_enqueue_special(struct sk_buff *skb, - struct homa_qdisc_dev *qdev, + struct homa_qdisc_dev *qdev, bool pacer); int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack); @@ -161,7 +161,7 @@ void homa_qdisc_set_qixs(struct homa_qdisc_dev *qdev); void homa_qdisc_srpt_enqueue(struct sk_buff_head *list, struct sk_buff *skb); struct sk_buff * - homa_qdisc_srpt_dequeue(struct sk_buff_head *list); + homa_qdisc_srpt_dequeue(struct sk_buff_head *list); void homa_qdisc_srpt_free(struct sk_buff_head *list); void homa_qdisc_unregister(void); void homa_qdisc_update_all_sysctl(struct homa_net *hnet); @@ -170,4 +170,4 @@ int homa_qdisc_update_link_idle(struct homa_qdisc_dev *qdev, void homa_qdisc_update_sysctl(struct homa_qdisc_dev *qdev); void homa_qdisc_pacer(struct homa_qdisc_dev *qdev); -#endif /* _HOMA_QDISC_H */ \ No newline at end of file +#endif /* _HOMA_QDISC_H */ From 0f78ab3b88c54b10767042e80291da210eef2baa Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 28 Jul 2025 20:59:07 -0700 Subject: [PATCH 411/625] Add tt_unfreeze method to timetrace.c Also add support in homa_plumbing.c and in cperf.py (unfreeze timetraces at the start of every run). Also make a few other trivial cleanups in timetrace.c. 
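
For reference, the freeze/unfreeze pair is idempotent: both sides use an
atomic exchange on tt_frozen, so only a 0 -> 1 transition increments
tt_freeze_count and only a 1 -> 0 transition decrements it; unbalanced
calls cannot wedge the counter. A minimal usage sketch (illustration
only, not part of the diffs below; the error-path trigger shown is
hypothetical):

	/* Kernel side: stop tracing when something looks wrong. */
	if (unlikely(bad_thing_happened))
		tt_freeze();	/* no-op if already frozen */

	/* User side: resume tracing before the next run with
	 *     sysctl .net.homa.action=10
	 * which reaches homa_dointvec and invokes:
	 */
	tt_unfreeze();		/* no-op if no freeze is in effect */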
--- homa_plumbing.c | 2 ++ timetrace.c | 25 +++++++++++++++++++++---- timetrace.h | 1 + util/cperf.py | 2 ++ 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index 502bf8ce..cda1b8cf 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1739,6 +1739,8 @@ int homa_dointvec(const struct ctl_table *table, int write, atomic_read(&homa->grant->total_incoming)); } else if (homa->sysctl_action == 9) { homa_rpc_stats_log(); + } else if (homa->sysctl_action == 10) { + tt_unfreeze(); } else { homa_rpc_log_active(homa, homa->sysctl_action); } diff --git a/timetrace.c b/timetrace.c index 201cca2c..c7ea7377 100644 --- a/timetrace.c +++ b/timetrace.c @@ -227,11 +227,23 @@ void tt_freeze(void) */ if (atomic_xchg(&tt_frozen, 1) == 0) { tt_record("timetrace frozen"); - pr_notice("%s invoked\n", __func__); + pr_err("%s invoked\n", __func__); atomic_inc(&tt_freeze_count); } } +/** + * tt_unfreeze() - Release any freeze that may be in effect: normal + * timetrace recording will resume if it had stopped. + */ +void tt_unfreeze(void) +{ + if (atomic_xchg(&tt_frozen, 0) == 1) { + pr_err("%s invoked\n", __func__); + atomic_dec(&tt_freeze_count); + } +} + /** * tt_record_buf(): record an event in a core-specific tt_buffer. * @@ -697,6 +709,7 @@ void tt_printk(void) */ static int pos[NR_CPUS]; u64 start_time; + int events; int i; if (atomic_xchg(&active, 1)) { @@ -706,7 +719,9 @@ void tt_printk(void) if (!init) return; atomic_inc(&tt_freeze_count); + pr_err("Dumping timetrace on core %d\n", raw_smp_processor_id()); start_time = tt_find_oldest(oldest); + events = 0; for (i = 0; i < nr_cpu_ids; i++) { if (oldest[i] == tt_buffers[i]->next_index) pos[i] = -1; @@ -715,6 +730,7 @@ void tt_printk(void) (tt_buffer_size - 1); } +#if 0 /* Limit the number of entries logged per core (logging too many * seems to cause entries to be lost). */ @@ -722,6 +738,7 @@ void tt_printk(void) if (((pos[i] - oldest[i]) & (TT_BUF_SIZE - 1)) > 200) oldest[i] = (pos[i] - 200) & (TT_BUF_SIZE - 1); } +#endif pr_err("cpu_khz: %u, start: %llu\n", tsc_khz, start_time); @@ -760,8 +777,9 @@ void tt_printk(void) pr_err("%lu [C%02d] %s\n", (unsigned long)event->timestamp, current_core, msg); + events++; } - pr_err("Finished dumping timetrace to syslog\n"); + pr_err("Finished dumping %d timetrace events to syslog\n", events); atomic_dec(&tt_freeze_count); atomic_set(&active, 0); @@ -843,12 +861,11 @@ void tt_get_messages(char *buffer, size_t length) */ void tt_dbg1(char *msg, ...) { + pr_err("tt_dbg1 starting\n"); if (atomic_read(&tt_frozen)) return; tt_freeze(); - pr_err("Dumping timetrace on core %d\n", raw_smp_processor_id()); tt_printk(); - pr_err("Finished dumping timetrace\n"); } /** diff --git a/timetrace.h b/timetrace.h index 23a76561..08468ad1 100644 --- a/timetrace.h +++ b/timetrace.h @@ -101,6 +101,7 @@ void tt_record_buf(struct tt_buffer *buffer, u64 timestamp, const char *format, u32 arg0, u32 arg1, u32 arg2, u32 arg3); void tt_set_temp(int *temp); +void tt_unfreeze(void); /* Private methods and variables: exposed so they can be accessed * by unit tests. 
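
(Aside, not part of the diffs: each event that tt_printk dumps to the
system log is emitted by the pr_err call shown above,

	pr_err("%lu [C%02d] %s\n", (unsigned long)event->timestamp,
	       current_core, msg);

so a record looks like "1234567 [C03] homa_qdisc pacer sleeping". This
"[Cnn]" layout is what ttsyslog.py's regular expression matches when it
recovers timetrace records from syslog.)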
diff --git a/util/cperf.py b/util/cperf.py index 910fa4e2..24993cc3 100644 --- a/util/cperf.py +++ b/util/cperf.py @@ -747,6 +747,8 @@ def run_experiment(name, clients, options): do_subprocess(["ssh", "node%d" % (id), "metrics.py"]) if not "no_rtt_files" in options: do_cmd("dump_times /dev/null %s" % (name), clients) + log("Unfreezing timetraces on %s" % (nodes)) + set_sysctl_parameter(".net.homa.action", "10", nodes) do_cmd("log Starting measurements for %s experiment" % (name), server_nodes, clients) log("Starting measurements") From e1f1ee66cb8719db17a7a92796fb2923ce915c56 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 28 Jul 2025 21:17:05 -0700 Subject: [PATCH 412/625] Fix bug in is_homa_pkt Didn't properly handle situations where the network header hasn't yet been added. This required changes in the packet format produced by mock_skb_alloc, which led to a few test changes. --- homa_impl.h | 5 +++++ homa_plumbing.c | 5 ++++- test/mock.c | 16 ++++++++++++++-- test/unit_homa_plumbing.c | 6 ++++-- 4 files changed, 27 insertions(+), 5 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index cb2eb97b..f7b96723 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -690,6 +690,11 @@ static inline bool is_homa_pkt(struct sk_buff *skb) { int protocol; + /* If the network header hasn't been created yet, assume it's a + * Homa packet (Homa never generates any non-Homa packets). + */ + if (skb->network_header == 0) + return true; protocol = (skb_is_ipv6(skb)) ? ipv6_hdr(skb)->nexthdr : ip_hdr(skb)->protocol; #ifndef __STRIP__ /* See strip.py */ diff --git a/homa_plumbing.c b/homa_plumbing.c index cda1b8cf..f2ee9ea9 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1551,6 +1551,8 @@ int homa_softirq(struct sk_buff *skb) /** * homa_err_handler_v4() - Invoked by IP to handle an incoming error * packet, such as ICMP UNREACHABLE. + * @skb: The incoming packet; skb->data points to the byte just after + * the ICMP header (the first byte of the embedded packet IP header). * @skb: The incoming packet. * @info: Information about the error that occurred? * @@ -1592,7 +1594,8 @@ int homa_err_handler_v4(struct sk_buff *skb, u32 info) /** * homa_err_handler_v6() - Invoked by IP to handle an incoming error * packet, such as ICMP UNREACHABLE. - * @skb: The incoming packet. + * @skb: The incoming packet; skb->data points to the byte just after + * the ICMP header (the first byte of the embedded packet IP header). * @opt: Not used. * @type: Type of ICMP packet. * @code: Additional information about the error. diff --git a/test/mock.c b/test/mock.c index 5f5d15a0..84e14c32 100644 --- a/test/mock.c +++ b/test/mock.c @@ -2072,6 +2072,11 @@ struct sk_buff *mock_skb_alloc(struct in6_addr *saddr, struct homa_common_hdr *h struct sk_buff *skb; unsigned char *p; + /* Don't let the IP header start at the beginning of the packet + * buffer: that will confuse is_homa_pkt. + */ +#define IP_HDR_OFFSET 4 + if (h) { switch (h->type) { case DATA: @@ -2119,7 +2124,8 @@ struct sk_buff *mock_skb_alloc(struct in6_addr *saddr, struct homa_common_hdr *h unit_hash_set(skbs_in_use, skb, "used"); ip_size = mock_ipv6 ? 
sizeof(struct ipv6hdr) : sizeof(struct iphdr); - data_size = SKB_DATA_ALIGN(ip_size + header_size + extra_bytes); + data_size = SKB_DATA_ALIGN(IP_HDR_OFFSET + ip_size + header_size + + extra_bytes); shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); if (h) { skb->head = malloc(data_size + shinfo_size); @@ -2132,7 +2138,11 @@ struct sk_buff *mock_skb_alloc(struct in6_addr *saddr, struct homa_common_hdr *h skb->data = skb->head; skb_reset_tail_pointer(skb); skb->end = skb->tail + data_size; - skb_reserve(skb, ip_size); + + /* Don't want IP header starting at the beginning of the packet + * buffer (will confuse is_homa_pkt). + */ + skb_reserve(skb, IP_HDR_OFFSET + ip_size); skb_reset_transport_header(skb); if (header_size != 0) { p = skb_put(skb, header_size); @@ -2143,6 +2153,8 @@ struct sk_buff *mock_skb_alloc(struct in6_addr *saddr, struct homa_common_hdr *h unit_fill_data(p, extra_bytes, first_value); } skb->users.refs.counter = 1; + skb_reset_network_header(skb); + skb_set_network_header(skb, -ip_size); if (mock_ipv6) { ipv6_hdr(skb)->version = 6; ipv6_hdr(skb)->saddr = *saddr; diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index dd98739b..bbc8a4f6 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -1214,7 +1214,8 @@ TEST_F(homa_plumbing, homa_err_handler_v6__port_unreachable) ipv6_hdr(failed)->daddr = self->server_ip[0]; icmp = mock_skb_alloc(self->server_ip, NULL, 1000, 0); - memcpy(skb_put(icmp, failed->len), failed->head, failed->len); + memcpy(skb_put(icmp, failed->len), skb_network_header(failed), + failed->len); EXPECT_EQ(0, homa_err_handler_v6(icmp, NULL, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, 111)); @@ -1237,7 +1238,8 @@ TEST_F(homa_plumbing, homa_err_handler_v6__protocol_not_supported) ipv6_hdr(failed)->daddr = self->server_ip[0]; icmp = mock_skb_alloc(self->server_ip, NULL, 1000, 0); - memcpy(skb_put(icmp, failed->len), failed->head, failed->len); + memcpy(skb_put(icmp, failed->len), skb_network_header(failed), + failed->len); EXPECT_EQ(0, homa_err_handler_v6(icmp, NULL, ICMPV6_PARAMPROB, ICMPV6_UNK_NEXTHDR, 0, 111)); From f6957a024e505db9f8936a54690da314963a4791 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 29 Jul 2025 11:39:09 -0700 Subject: [PATCH 413/625] Add minor missing initialization in homa_resend_data This exposed a more serious bug, where homa_resend_data wasn't allocating enough bytes in new_skb. --- homa_outgoing.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/homa_outgoing.c b/homa_outgoing.c index 834b4eee..1d750945 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -816,8 +816,7 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end) /* This segment must be retransmitted. 
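The hunk just below corrects the header-space arithmetic in homa_resend_data; the reasoning, spelled out (inferred from the commit message and the one-line fix):

/* A retransmitted packet carries a complete homa_data_hdr, and
 * homa_data_hdr embeds its homa_seg_hdr, so reserving
 *
 *     sizeof(struct homa_data_hdr) - sizeof(struct homa_seg_hdr)
 *
 * bytes of header room under-allocates new_skb by exactly
 * sizeof(struct homa_seg_hdr) bytes; the fix reserves the full
 * sizeof(struct homa_data_hdr).
 */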
*/ #ifndef __STRIP__ /* See strip.py */ - new_skb = homa_skb_alloc_tx(sizeof(struct homa_data_hdr) - - sizeof(struct homa_seg_hdr)); + new_skb = homa_skb_alloc_tx(sizeof(struct homa_data_hdr)); #else /* See strip.py */ new_skb = homa_skb_alloc_tx(sizeof(struct homa_data_hdr) + seg_length); @@ -845,6 +844,7 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end) } new_homa_info = homa_get_skb_info(new_skb); + new_homa_info->next_skb = NULL; new_homa_info->wire_bytes = rpc->hsk->ip_header_length + sizeof(struct homa_data_hdr) + seg_length + HOMA_ETH_OVERHEAD; From af2bb2fb6730b87a6114a5f237f3032d1f8eba59 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 29 Jul 2025 11:53:49 -0700 Subject: [PATCH 414/625] Update notes.txt --- notes.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/notes.txt b/notes.txt index c572fae5..6acc2c90 100755 --- a/notes.txt +++ b/notes.txt @@ -101,6 +101,8 @@ Notes for Homa implementation in Linux: * CloudLab cluster issues: * amd272 had a problem where all xmits from core 47 incurred a 1-2 ms delay; power-cycling it fixed the problem. + * (July 2025) amd163 was causing cp_vs_tcp to run slowly (max cluster + throughput only 16-18 Gbps). * Notes on performance data from buffer benchmarking 8/2023: * Restricting buffer space ("buffers"): From 5aece8173a81005779a4b32aabb6e0d2c34277fb Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 29 Jul 2025 11:54:17 -0700 Subject: [PATCH 415/625] Fix bugs in homa_qdisc homa_qdisc is now functional (can run the cp_vs_tcp benchmark, though max throughput has dropped a bit) --- homa_impl.h | 7 +++-- homa_outgoing.c | 5 +++- homa_qdisc.c | 54 ++++++++++++++++++++++++++++----------- homa_qdisc.h | 17 +++++++----- homa_utils.c | 2 +- test/unit_homa_outgoing.c | 49 +++++++++++++++-------------------- test/unit_homa_qdisc.c | 2 ++ 7 files changed, 82 insertions(+), 54 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index f7b96723..62be3634 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -529,9 +529,12 @@ struct homa_net { struct list_head qdisc_devs; /** - * @qdisc_dev_lock: Must hold when reading or writing @qdisc_devs. + * @qdisc_devs_mutex: Used to synchronize operations on @qdisc_devs + * (creation and deletion of qdiscs). Must be a mutex rather than + * a spinlock because homa_qdisc_dev_get calls functions that may + * block.
*/ - spinlock_t qdisc_devs_lock; + struct mutex qdisc_devs_mutex; #endif /* See strip.py */ }; diff --git a/homa_outgoing.c b/homa_outgoing.c index 1d750945..e92cc546 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -9,6 +9,7 @@ #include "homa_peer.h" #include "homa_rpc.h" #ifndef __STRIP__ /* See strip.py */ +#include "homa_qdisc.h" #include "homa_skb.h" #endif /* See strip.py */ #include "homa_wire.h" @@ -325,6 +326,8 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) overlap_xmit = rpc->msgout.length > 2 * max_gso_data; #ifndef __STRIP__ /* See strip.py */ + if (homa_qdisc_active(rpc->hsk->hnet)) + overlap_xmit = 0; rpc->msgout.granted = rpc->msgout.unscheduled; #endif /* See strip.py */ homa_skb_stash_pages(rpc->hsk->homa, rpc->msgout.length); @@ -613,7 +616,7 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) #ifndef __STRIP__ /* See strip.py */ if (rpc->msgout.length - rpc->msgout.next_xmit_offset > homa->pacer->throttle_min_bytes && - list_empty(&rpc->hsk->hnet->qdisc_devs)) { + !homa_qdisc_active(rpc->hsk->hnet)) { #else /* See strip.py */ if (rpc->msgout.length - rpc->msgout.next_xmit_offset > homa->pacer->throttle_min_bytes) { diff --git a/homa_qdisc.c b/homa_qdisc.c index d0f90d79..6628fade 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -63,7 +63,7 @@ struct homa_qdisc_dev *homa_qdisc_qdev_get(struct homa_net *hnet, { struct homa_qdisc_dev *qdev; - spin_lock_bh(&hnet->qdisc_devs_lock); + mutex_lock(&hnet->qdisc_devs_mutex); list_for_each_entry(qdev, &hnet->qdisc_devs, links) { if (qdev->dev == dev) { qdev->refs++; @@ -79,7 +79,6 @@ struct homa_qdisc_dev *homa_qdisc_qdev_get(struct homa_net *hnet, qdev->dev = dev; qdev->hnet = hnet; qdev->refs = 1; - spin_lock_init(&qdev->lock); qdev->pacer_qix = -1; qdev->redirect_qix = -1; homa_qdisc_update_sysctl(qdev); @@ -103,7 +102,7 @@ struct homa_qdisc_dev *homa_qdisc_qdev_get(struct homa_net *hnet, list_add(&qdev->links, &hnet->qdisc_devs); done: - spin_unlock_bh(&hnet->qdisc_devs_lock); + mutex_unlock(&hnet->qdisc_devs_mutex); return qdev; } @@ -116,7 +115,7 @@ void homa_qdisc_qdev_put(struct homa_qdisc_dev *qdev) { struct homa_net *hnet = qdev->hnet; - spin_lock_bh(&hnet->qdisc_devs_lock); + mutex_lock(&hnet->qdisc_devs_mutex); qdev->refs--; if (qdev->refs == 0) { kthread_stop(qdev->pacer_kthread); @@ -127,7 +126,7 @@ void homa_qdisc_qdev_put(struct homa_qdisc_dev *qdev) skb_queue_purge(&qdev->tcp_deferred); kfree(qdev); } - spin_unlock_bh(&hnet->qdisc_devs_lock); + mutex_unlock(&hnet->qdisc_devs_mutex); } /** @@ -189,7 +188,7 @@ void homa_qdisc_set_qixs(struct homa_qdisc_dev *qdev) struct netdev_queue *txq; struct Qdisc *qdisc; - /* Note: it's safe for mutltiple instances of this function to + /* Note: it's safe for multiple instances of this function to * execute concurrently so no synchronization is needed (other * than using RCU to protect against deletion of the underlying * data structures). @@ -228,13 +227,11 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct homa_qdisc *q = qdisc_priv(sch); struct homa_qdisc_dev *qdev = q->qdev; struct homa *homa = qdev->hnet->homa; + struct homa_data_hdr *h; int pkt_len; int result; - /* The packet length computed by Linux didn't include overheads - * such as inter-frame gap; add that in here. 
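The qdev registry above follows the standard refcounted get/put pattern, now under a mutex so the creation path may sleep. A stripped-down sketch of the idiom (struct dev_entry, entry_get/entry_put, and entries_mutex are illustrative names, not Homa's types):

struct dev_entry {
        struct list_head links;
        struct net_device *dev;
        int refs;                    /* protected by entries_mutex */
};

static LIST_HEAD(entries);
static DEFINE_MUTEX(entries_mutex);  /* a mutex: creation may sleep */

struct dev_entry *entry_get(struct net_device *dev)
{
        struct dev_entry *e;

        mutex_lock(&entries_mutex);
        list_for_each_entry(e, &entries, links) {
                if (e->dev == dev) {
                        e->refs++;
                        goto done;
                }
        }
        e = kzalloc(sizeof(*e), GFP_KERNEL);   /* may sleep; mutex OK */
        if (e) {
                e->dev = dev;
                e->refs = 1;
                list_add(&e->links, &entries);
        }
done:
        mutex_unlock(&entries_mutex);
        return e;
}

void entry_put(struct dev_entry *e)
{
        mutex_lock(&entries_mutex);
        if (--e->refs == 0) {
                list_del(&e->links);
                kfree(e);
        }
        mutex_unlock(&entries_mutex);
}

Switching qdisc_devs_lock from a spinlock to a mutex is what makes the blocking calls inside the lookup and teardown paths (allocation, kthread_stop) legal.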
- */ - pkt_len = qdisc_skb_cb(skb)->pkt_len + HOMA_ETH_FRAME_OVERHEAD; + pkt_len = qdisc_skb_cb(skb)->pkt_len; if (pkt_len < homa->pacer->throttle_min_bytes) { homa_qdisc_update_link_idle(q->qdev, pkt_len, -1); goto enqueue; @@ -252,19 +249,38 @@ /* This packet needs to be deferred until the NIC queue has * been drained a bit. */ + h = (struct homa_data_hdr *) skb_transport_header(skb); + tt_record4("homa_qdisc_enqueue deferring homa data packet for id %d, offset %d, bytes_left %d on qid %d", + be64_to_cpu(h->common.sender_id), + ntohl(h->seg.offset), + homa_get_skb_info(skb)->bytes_left, qdev->pacer_qix); homa_qdisc_srpt_enqueue(&qdev->homa_deferred, skb); wake_up(&qdev->pacer_sleep); return NET_XMIT_SUCCESS; enqueue: + if (is_homa_pkt(skb)) { + h = (struct homa_data_hdr *) skb_transport_header(skb); + tt_record4("homa_qdisc_enqueue queuing homa data packet for id %d, offset %d, bytes_left %d on qid %d", + be64_to_cpu(h->common.sender_id), + ntohl(h->seg.offset), + homa_get_skb_info(skb)->bytes_left, q->ix); + } else { + tt_record2("homa_qdisc_enqueue queuing non-homa packet, qix %d, pacer_qix %d", + q->ix, qdev->pacer_qix); + } if (q->ix != qdev->pacer_qix) { if (unlikely(sch->q.qlen >= READ_ONCE(sch->limit))) return qdisc_drop(skb, sch, to_free); - spin_lock_bh(qdisc_lock(sch)); result = qdisc_enqueue_tail(skb, sch); - spin_unlock_bh(qdisc_lock(sch)); } else { + /* homa_qdisc_enqueue_special is going to lock a different qdisc, + * so in order to avoid deadlocks we have to release the + * lock for this qdisc. + */ + spin_unlock(qdisc_lock(sch)); result = homa_qdisc_enqueue_special(skb, qdev, false); + spin_lock(qdisc_lock(sch)); } return result; } @@ -498,7 +514,7 @@ int homa_qdisc_pacer_main(void *device) * well, to increase the likelihood that we keep the link busy. Those other * invocations are not guaranteed to happen, so the pacer thread provides a * backstop. - * @homa: Overall data about the Homa protocol implementation. + * @qdev: The device on which to transmit. */ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev) { @@ -516,6 +532,7 @@ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev) * homa_qdisc_pacer_main about interfering with softirq handlers).
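The unlock/relock around the special-queue path in homa_qdisc_enqueue above is the standard way to break an ABBA deadlock: enqueue runs with this qdisc's lock already held by its caller (the unit-test change further below has to add the matching spin_lock around its direct call), while homa_qdisc_enqueue_special takes a different qdisc's lock. Schematically, with hypothetical qdiscs A and B:

/*   CPU 0: enqueue on A             CPU 1: enqueue on B
 *     lock(A)                         lock(B)
 *     redirect -> lock(B)  (waits)    redirect -> lock(A)  (waits)
 *
 * Neither CPU can make progress. Dropping the caller's lock for the
 * duration of the redirect breaks the cycle:
 */
spin_unlock(qdisc_lock(sch));
result = homa_qdisc_enqueue_special(skb, qdev, false);
spin_lock(qdisc_lock(sch));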
*/ for (i = 0; i < 5; i++) { + struct homa_data_hdr *h; struct sk_buff *skb; u64 idle_time, now; @@ -540,8 +557,14 @@ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev) skb = homa_qdisc_srpt_dequeue(&qdev->homa_deferred); if (!skb) break; + homa_qdisc_update_link_idle(qdev, qdisc_skb_cb(skb)->pkt_len, -1); + h = (struct homa_data_hdr *) skb_transport_header(skb); + tt_record4("homa_qdisc_pacer queuing homa data packet for id %d, offset %d, bytes_left %d on qid %d", + be64_to_cpu(h->common.sender_id), + ntohl(h->seg.offset), + homa_get_skb_info(skb)->bytes_left, qdev->pacer_qix); homa_qdisc_enqueue_special(skb, qdev, true); } done: @@ -590,6 +613,7 @@ int homa_qdisc_enqueue_special(struct sk_buff *skb, homa_qdisc_set_qixs(qdev); } + skb_set_queue_mapping(skb, qix); spin_lock_bh(qdisc_lock(qdisc)); result = qdisc_enqueue_tail(skb, qdisc); spin_unlock_bh(qdisc_lock(qdisc)); @@ -651,8 +675,8 @@ void homa_qdisc_update_all_sysctl(struct homa_net *hnet) { struct homa_qdisc_dev *qdev; - spin_lock_bh(&hnet->qdisc_devs_lock); + mutex_lock(&hnet->qdisc_devs_mutex); list_for_each_entry(qdev, &hnet->qdisc_devs, links) homa_qdisc_update_sysctl(qdev); - spin_unlock_bh(&hnet->qdisc_devs_lock); + mutex_unlock(&hnet->qdisc_devs_mutex); } diff --git a/homa_qdisc.h b/homa_qdisc.h index 2424ffe8..11aff643 100644 --- a/homa_qdisc.h +++ b/homa_qdisc.h @@ -52,12 +52,6 @@ struct homa_qdisc_dev { */ int refs; - /** - * @lock: Used to synchronize access to mutable fields within - * this struct, such as @pacer_qix and @redirect_qix. - */ - spinlock_t lock; - /** * @pacer_qix: Index of a netdev_queue within dev that is reserved * for the pacer to use for transmitting packets. We segregate paced @@ -170,4 +164,15 @@ int homa_qdisc_update_link_idle(struct homa_qdisc_dev *qdev, void homa_qdisc_update_sysctl(struct homa_qdisc_dev *qdev); void homa_qdisc_pacer(struct homa_qdisc_dev *qdev); +/** + * homa_qdisc_active() - Return true if homa qdiscs are enabled for @hnet + * (so the old pacer should not be used), false otherwise. + * @hnet: Homa's information about a network namespace. + * Return: See above. 
+ */ +static inline bool homa_qdisc_active(struct homa_net *hnet) +{ + return !list_empty(&hnet->qdisc_devs); +} + #endif /* _HOMA_QDISC_H */ diff --git a/homa_utils.c b/homa_utils.c index 14a6b2e2..bce7dce1 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -167,7 +167,7 @@ int homa_net_init(struct homa_net *hnet, struct net *net, struct homa *homa) hnet->prev_default_port = HOMA_MIN_DEFAULT_PORT - 1; #ifndef __STRIP__ /* See strip.py */ INIT_LIST_HEAD(&hnet->qdisc_devs); - spin_lock_init(&hnet->qdisc_devs_lock); + mutex_init(&hnet->qdisc_devs_mutex); #endif /* See strip.py */ return 0; } diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 47f19a4e..3d1e16f8 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -480,34 +480,6 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_geometry_no_hijacking) homa_rpc_unlock(crpc2); EXPECT_SUBSTR("max_seg_data 1400, max_gso_data 4200", unit_log_get()); } -TEST_F(homa_outgoing, homa_message_out_fill__gso_force_software) -{ - struct homa_rpc *crpc1 = homa_rpc_alloc_client(&self->hsk, - &self->server_addr); - struct homa_rpc *crpc2; - - ASSERT_FALSE(crpc1 == NULL); - mock_net_device.gso_max_size = 10000; - mock_xmit_log_verbose = 1; - self->homa.gso_force_software = 0; - ASSERT_EQ(0, -homa_message_out_fill(crpc1, - unit_iov_iter((void *) 1000, 5000), 0)); - unit_log_clear(); - homa_xmit_data(crpc1, false); - homa_rpc_unlock(crpc1); - EXPECT_SUBSTR("xmit DATA", unit_log_get()); - EXPECT_NOSUBSTR("TSO disabled", unit_log_get()); - - crpc2 = homa_rpc_alloc_client(&self->hsk, &self->server_addr); - ASSERT_FALSE(crpc2 == NULL); - self->homa.gso_force_software = 1; - ASSERT_EQ(0, -homa_message_out_fill(crpc2, - unit_iov_iter((void *) 1000, 5000), 0)); - unit_log_clear(); - homa_xmit_data(crpc2, false); - homa_rpc_unlock(crpc2); - EXPECT_SUBSTR("TSO disabled", unit_log_get()); -} TEST_F(homa_outgoing, homa_message_out_fill__gso_limit_less_than_mtu) { struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, @@ -522,6 +494,25 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_limit_less_than_mtu) homa_rpc_unlock(crpc); EXPECT_SUBSTR("max_seg_data 1400, max_gso_data 1400;", unit_log_get()); } +#ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_outgoing, homa_message_out_fill__disable_overlap_xmit_because_of_homa_qdisc) +{ + struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, + &self->server_addr); + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device); + + ASSERT_FALSE(crpc == NULL); + ASSERT_EQ(0, -homa_message_out_fill(crpc, + unit_iov_iter((void *) 1000, 5000), 1)); + homa_rpc_unlock(crpc); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("", unit_log_get()); + homa_qdisc_qdev_put(qdev); +} +#endif /* See strip.py */ TEST_F(homa_outgoing, homa_message_out_fill__multiple_segs_per_skbuff) { struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, @@ -550,7 +541,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__multiple_segs_per_skbuff) unit_log_get()); EXPECT_EQ(4200, homa_get_skb_info(crpc->msgout.packets)->data_bytes); } -TEST_F(homa_outgoing, homa_message_out_fill__error_in_homa_new_data_packet) +TEST_F(homa_outgoing, homa_message_out_fill__error_in_homa_tx_data_packet_alloc) { struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c index 623a8460..f1e0a925 100644 --- a/test/unit_homa_qdisc.c +++ b/test/unit_homa_qdisc.c @@ -417,8 +417,10 @@ TEST_F(homa_qdisc, 
homa_qdisc_enqueue__use_special_queue) skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); unit_log_clear(); + spin_lock(qdisc_lock(self->qdiscs[1])); EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, self->qdiscs[1], &to_free)); + spin_unlock(qdisc_lock(self->qdiscs[1])); ASSERT_NE(NULL, to_free); EXPECT_EQ(0, q->qdev->homa_deferred.qlen); EXPECT_EQ(0, self->qdiscs[1]->q.qlen); From c820388f34d05e8c88c095f7290cbbb5f9bbb3f8 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 29 Jul 2025 12:01:55 -0700 Subject: [PATCH 416/625] Rename homa_qdisc_enqueue_special -> homa_qdisc_redirect_skb --- homa_qdisc.c | 20 ++++++++++++-------- homa_qdisc.h | 2 +- test/unit_homa_qdisc.c | 20 ++++++++++---------- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/homa_qdisc.c b/homa_qdisc.c index 6628fade..fb339a7e 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -279,7 +279,7 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, * lock for this qdisc. */ spin_unlock(qdisc_lock(sch)); - result = homa_qdisc_enqueue_special(skb, qdev, false); + result = homa_qdisc_redirect_skb(skb, qdev, false); spin_lock(qdisc_lock(sch)); } return result; @@ -565,24 +565,26 @@ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev) be64_to_cpu(h->common.sender_id), ntohl(h->seg.offset), homa_get_skb_info(skb)->bytes_left, qdev->pacer_qix); - homa_qdisc_enqueue_special(skb, qdev, true); + homa_qdisc_redirect_skb(skb, qdev, true); } done: spin_unlock_bh(&qdev->pacer_mutex); } /** - * homa_qdisc_enqueue_special() - This function is called by the pacer to - * enqueue a packet on one of the distinguished transmit queues and wake - * up the queue for transmission. + * homa_qdisc_redirect_skb() - Enqueue a packet on a different queue from + * the one it was originally passed to and wake up that queue for + * transmission. This is used to transmit all pacer packets via a single + * queue and to redirect other packets originally sent to that queue to + * another queue. * @skb: Packet to resubmit. - * @qdev: Homa data about the networkd device on which the packet should + * @qdev: Homa data about the network device on which the packet should * be resubmitted. * @pacer: True means queue the packet on qdev->pacer_qix, false means * qdev->redirect_qix. * Return: Standard enqueue return code (usually NET_XMIT_SUCCESS). */ -int homa_qdisc_enqueue_special(struct sk_buff *skb, +int homa_qdisc_redirect_skb(struct sk_buff *skb, struct homa_qdisc_dev *qdev, bool pacer) { struct netdev_queue *txq; @@ -605,7 +607,9 @@ int homa_qdisc_enqueue_special(struct sk_buff *skb, break; } if (i > 0) { - /* Couldn't find a Homa qdisc to use; drop the skb. */ + /* Couldn't find a Homa qdisc to use; drop the skb. + * Shouldn't ever happen?
+ */ kfree_skb(skb); result = NET_XMIT_DROP; goto done; diff --git a/homa_qdisc.h b/homa_qdisc.h index 11aff643..8302bc41 100644 --- a/homa_qdisc.h +++ b/homa_qdisc.h @@ -140,7 +140,7 @@ struct homa_qdisc_dev { void homa_qdisc_destroy(struct Qdisc *sch); int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free); -int homa_qdisc_enqueue_special(struct sk_buff *skb, +int homa_qdisc_redirect_skb(struct sk_buff *skb, struct homa_qdisc_dev *qdev, bool pacer); int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt, diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c index f1e0a925..91c22877 100644 --- a/test/unit_homa_qdisc.c +++ b/test/unit_homa_qdisc.c @@ -731,7 +731,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__return_after_one_packet) homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_enqueue_special__use_pacer_qix) +TEST_F(homa_qdisc, homa_qdisc_redirect_skb__use_pacer_qix) { struct sk_buff *skb; struct homa_qdisc_dev *qdev; @@ -745,7 +745,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue_special__use_pacer_qix) skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); unit_log_clear(); - status = homa_qdisc_enqueue_special(skb, qdev, true); + status = homa_qdisc_redirect_skb(skb, qdev, true); EXPECT_EQ(NET_XMIT_SUCCESS, status); EXPECT_EQ(1, self->qdiscs[1]->q.qlen); EXPECT_EQ(0, self->qdiscs[3]->q.qlen); @@ -754,7 +754,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue_special__use_pacer_qix) homa_qdisc_destroy(self->qdiscs[1]); homa_qdisc_destroy(self->qdiscs[3]); } -TEST_F(homa_qdisc, homa_qdisc_enqueue_special__use_redirect_qix) +TEST_F(homa_qdisc, homa_qdisc_redirect_skb__use_redirect_qix) { struct sk_buff *skb; struct homa_qdisc_dev *qdev; @@ -768,7 +768,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue_special__use_redirect_qix) skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); unit_log_clear(); - status = homa_qdisc_enqueue_special(skb, qdev, false); + status = homa_qdisc_redirect_skb(skb, qdev, false); EXPECT_EQ(NET_XMIT_SUCCESS, status); EXPECT_EQ(0, self->qdiscs[1]->q.qlen); EXPECT_EQ(1, self->qdiscs[3]->q.qlen); @@ -776,7 +776,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue_special__use_redirect_qix) homa_qdisc_destroy(self->qdiscs[1]); homa_qdisc_destroy(self->qdiscs[3]); } -TEST_F(homa_qdisc, homa_qdisc_enqueue_special__redirect_qix_invalid) +TEST_F(homa_qdisc, homa_qdisc_redirect_skb__redirect_qix_invalid) { struct sk_buff *skb; struct homa_qdisc_dev *qdev; @@ -791,7 +791,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue_special__redirect_qix_invalid) skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); unit_log_clear(); - status = homa_qdisc_enqueue_special(skb, qdev, false); + status = homa_qdisc_redirect_skb(skb, qdev, false); EXPECT_EQ(NET_XMIT_SUCCESS, status); EXPECT_EQ(1, self->qdiscs[1]->q.qlen); EXPECT_EQ(0, qdev->pacer_qix); @@ -800,7 +800,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue_special__redirect_qix_invalid) for (i = 0; i < 4; i++) homa_qdisc_destroy(self->qdiscs[i]); } -TEST_F(homa_qdisc, homa_qdisc_enqueue_special__redirect_qix_not_a_homa_qdisc) +TEST_F(homa_qdisc, homa_qdisc_redirect_skb__redirect_qix_not_a_homa_qdisc) { struct sk_buff *skb; struct homa_qdisc_dev *qdev; @@ -816,7 +816,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue_special__redirect_qix_not_a_homa_qdisc) skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); unit_log_clear(); - status = homa_qdisc_enqueue_special(skb, qdev, false); + status = homa_qdisc_redirect_skb(skb, qdev, false); EXPECT_EQ(NET_XMIT_SUCCESS, status); EXPECT_EQ(1, 
self->qdiscs[2]->q.qlen); EXPECT_EQ(1, qdev->pacer_qix); @@ -825,7 +825,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue_special__redirect_qix_not_a_homa_qdisc) for (i = 0; i < 4; i++) homa_qdisc_destroy(self->qdiscs[i]); } -TEST_F(homa_qdisc, homa_qdisc_enqueue_special__no_suitable_qdisc) +TEST_F(homa_qdisc, homa_qdisc_redirect_skb__no_suitable_qdisc) { struct sk_buff *skb; struct homa_qdisc_dev *qdev; @@ -842,7 +842,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue_special__no_suitable_qdisc) skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); unit_log_clear(); - status = homa_qdisc_enqueue_special(skb, qdev, false); + status = homa_qdisc_redirect_skb(skb, qdev, false); EXPECT_EQ(NET_XMIT_DROP, status); EXPECT_EQ(-1, qdev->pacer_qix); EXPECT_EQ(-1, qdev->redirect_qix); From 893d2c7e1bed0ac54fbbb5fc09fe0771b6d7bf4d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 29 Jul 2025 13:45:12 -0700 Subject: [PATCH 417/625] Improve description of linux_pkt_alloc_bytes metric --- homa_metrics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/homa_metrics.c b/homa_metrics.c index d6a960db..1b3d4f38 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -394,7 +394,7 @@ char *homa_metrics_print(void) M("buffer_alloc_failures", m->buffer_alloc_failures, "homa_pool_alloc_msg didn't find enough buffer space for an RPC\n"); M("linux_pkt_alloc_bytes", m->linux_pkt_alloc_bytes, - "Bytes allocated in new packets by NIC driver due to cache overflows\n"); + "Bytes allocated for rx packets by NIC driver due to cache overflows\n"); M("dropped_data_no_bufs", m->dropped_data_no_bufs, "Data bytes dropped because app buffers full\n"); M("gen3_handoffs", m->gen3_handoffs, From 3b66a5eb5f6b0250bd13dd03ac3df79eeaa89f7c Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 30 Jul 2025 13:49:38 -0700 Subject: [PATCH 418/625] Add metrics to homa_qdisc.c New metrics: pacer_throttled, pacer_lost_cycles, pacer_bytes Refactor several method APIs in order to pass in qdev object needed for metrics. 
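The new throttled_cycles accounting in this patch is interval bookkeeping on qdev->last_defer: wall time is charged only while the deferred queue is known to have been nonempty. A condensed sketch of the state machine (free-standing names; the real code does this inside homa_qdisc_defer_homa and homa_qdisc_dequeue_homa under the queue lock):

u64 last_defer;         /* time of the most recent deferral */
u64 throttled_cycles;   /* total time with a nonempty deferred queue */

void on_defer(u64 now, bool queue_was_empty)
{
        if (!queue_was_empty)   /* nonempty since last_defer: charge it */
                throttled_cycles += now - last_defer;
        last_defer = now;
}

void on_dequeue(u64 now, bool queue_now_empty)
{
        if (queue_now_empty)    /* close out the final interval */
                throttled_cycles += now - last_defer;
}

This is exactly what the unit tests further below check: deferrals at 5000 and 12000 charge 7000 cycles, and draining the queue at 12000 after a deferral at 5000 charges the same 7000.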
--- homa_metrics.c | 2 +- homa_qdisc.c | 81 ++++++++------ homa_qdisc.h | 28 +++-- test/unit_homa_qdisc.c | 232 +++++++++++++++++++++++++++++------------ util/metrics.py | 6 +- 5 files changed, 238 insertions(+), 111 deletions(-) diff --git a/homa_metrics.c b/homa_metrics.c index 1b3d4f38..379dbbf5 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -300,7 +300,7 @@ char *homa_metrics_print(void) M("pacer_needed_help", m->pacer_needed_help, "homa_pacer_xmit invocations from homa_check_pacer\n"); M("throttled_cycles", m->throttled_cycles, - "Time when the throttled queue was nonempty\n"); + "Time when output was throttled because NIC was backlogged\n"); M("resent_packets", m->resent_packets, "DATA packets sent in response to RESENDs\n"); M("peer_allocs", m->peer_allocs, diff --git a/homa_qdisc.c b/homa_qdisc.c index fb339a7e..0a9ad37a 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -122,7 +122,7 @@ void homa_qdisc_qdev_put(struct homa_qdisc_dev *qdev) qdev->pacer_kthread = NULL; __list_del_entry(&qdev->links); - homa_qdisc_srpt_free(&qdev->homa_deferred); + homa_qdisc_free_homa(qdev); skb_queue_purge(&qdev->tcp_deferred); kfree(qdev); } @@ -254,7 +254,7 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, be64_to_cpu(h->common.sender_id), ntohl(h->seg.offset), homa_get_skb_info(skb)->bytes_left, qdev->pacer_qix); - homa_qdisc_srpt_enqueue(&qdev->homa_deferred, skb); + homa_qdisc_defer_homa(qdev, skb); wake_up(&qdev->pacer_sleep); return NET_XMIT_SUCCESS; @@ -286,14 +286,15 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, } /** - * homa_qdisc_srpt_enqueue() - Add a Homa packet to an skb queue in SRPT - * priority order. - * @list: List on which to enqueue packet (usually &qdev->homa_deferred). + * homa_qdisc_defer_homa() - Add a Homa packet to the deferred list for + * a qdev. + * @qdev: Network device for which the packet should be enqueued. * @skb: Packet to enqueue. 
*/ -void homa_qdisc_srpt_enqueue(struct sk_buff_head *list, struct sk_buff *skb) +void homa_qdisc_defer_homa(struct homa_qdisc_dev *qdev, struct sk_buff *skb) { struct homa_skb_info *info = homa_get_skb_info(skb); + u64 now = homa_clock(); struct sk_buff *other; unsigned long flags; @@ -313,12 +314,13 @@ void homa_qdisc_srpt_enqueue(struct sk_buff_head *list, struct sk_buff *skb) info->next_sibling = NULL; info->last_sibling = NULL; - spin_lock_irqsave(&list->lock, flags); - if (skb_queue_empty(list)) { - __skb_queue_head(list, skb); + spin_lock_irqsave(&qdev->homa_deferred.lock, flags); + if (skb_queue_empty(&qdev->homa_deferred)) { + __skb_queue_head(&qdev->homa_deferred, skb); goto done; } - skb_queue_reverse_walk(list, other) { + INC_METRIC(throttled_cycles, now - qdev->last_defer); + skb_queue_reverse_walk(&qdev->homa_deferred, other) { struct homa_skb_info *other_info = homa_get_skb_info(other); if (other_info->rpc == info->rpc) { @@ -332,27 +334,29 @@ void homa_qdisc_srpt_enqueue(struct sk_buff_head *list, struct sk_buff *skb) } if (other_info->bytes_left <= info->bytes_left) { - __skb_queue_after(list, other, skb); + __skb_queue_after(&qdev->homa_deferred, other, skb); break; } - if (skb_queue_is_first(list, other)) { - __skb_queue_head(list, skb); + if (skb_queue_is_first(&qdev->homa_deferred, other)) { + __skb_queue_head(&qdev->homa_deferred, skb); break; } } done: - spin_unlock_irqrestore(&list->lock, flags); + qdev->last_defer = now; + spin_unlock_irqrestore(&qdev->homa_deferred.lock, flags); } /** - * homa_qdisc_srpt_dequeue() - Remove the frontmost packet from a list that - * is managed with SRPT priority. - * @list: List from which to remove packet. + * homa_qdisc_dequeue_homa() - Remove the frontmost packet from the list + * of deferred Homa packets for a qdev. + * @qdev: The homa_deferred element is the list from which a packet + * will be dequeued. * Return: The frontmost packet from the list, or NULL if the list was empty. */ -struct sk_buff *homa_qdisc_srpt_dequeue(struct sk_buff_head *list) +struct sk_buff *homa_qdisc_dequeue_homa(struct homa_qdisc_dev *qdev) { struct homa_skb_info *sibling_info; struct sk_buff *skb, *sibling; @@ -363,13 +367,13 @@ struct sk_buff *homa_qdisc_srpt_dequeue(struct sk_buff_head *list) * have a sibling list. If so, we need to enqueue the next * sibling. */ - spin_lock_irqsave(&list->lock, flags); - if (skb_queue_empty(list)) { - spin_unlock_irqrestore(&list->lock, flags); + spin_lock_irqsave(&qdev->homa_deferred.lock, flags); + if (skb_queue_empty(&qdev->homa_deferred)) { + spin_unlock_irqrestore(&qdev->homa_deferred.lock, flags); return NULL; } - skb = list->next; - __skb_unlink(skb, list); + skb = qdev->homa_deferred.next; + __skb_unlink(skb, &qdev->homa_deferred); info = homa_get_skb_info(skb); if (info->next_sibling) { /* This is a "compound" packet, containing multiple @@ -381,26 +385,26 @@ struct sk_buff *homa_qdisc_srpt_dequeue(struct sk_buff_head *list) sibling = info->next_sibling; sibling_info = homa_get_skb_info(sibling); sibling_info->last_sibling = info->last_sibling; - __skb_queue_head(list, sibling); + __skb_queue_head(&qdev->homa_deferred, sibling); } - spin_unlock_irqrestore(&list->lock, flags); + if (skb_queue_empty(&qdev->homa_deferred)) + INC_METRIC(throttled_cycles, homa_clock() - qdev->last_defer); + spin_unlock_irqrestore(&qdev->homa_deferred.lock, flags); return skb; } /** - * homa_qdisc_srpt_free() - Free all of the packets on @list, - * including siblings that are nested inside packets on the list. 
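homa_qdisc_defer_homa and homa_qdisc_dequeue_homa above maintain an SRPT queue with per-RPC chaining: one entry per RPC, ordered by fewest bytes_left, with additional packets for an already-queued RPC chained off that entry as siblings; dequeue promotes the next sibling straight to the head, which is safe because it cannot have more bytes left than the entry it replaces. A single-threaded sketch of the discipline (struct pkt and the helpers are illustrative; the real code chains homa_skb_info fields under the sk_buff_head lock):

struct pkt {
        struct pkt *next;      /* SRPT queue order */
        struct pkt *sibling;   /* later packets of the same RPC */
        void *rpc;
        int bytes_left;
};

static struct pkt *queue;      /* head: fewest bytes_left */

void srpt_enqueue(struct pkt *p)
{
        struct pkt *q, **pp;

        p->sibling = NULL;

        /* One queue entry per RPC: chain behind an existing entry. */
        for (q = queue; q; q = q->next) {
                if (q->rpc == p->rpc) {
                        while (q->sibling)
                                q = q->sibling;
                        q->sibling = p;
                        return;
                }
        }

        /* Otherwise insert in SRPT order: fewest bytes_left first. */
        for (pp = &queue; *pp && (*pp)->bytes_left <= p->bytes_left;
             pp = &(*pp)->next)
                ;
        p->next = *pp;
        *pp = p;
}

struct pkt *srpt_dequeue(void)
{
        struct pkt *p = queue;

        if (!p)
                return NULL;
        queue = p->next;
        if (p->sibling) {      /* promote the next same-RPC packet */
                p->sibling->next = queue;
                queue = p->sibling;
                p->sibling = NULL;
        }
        return p;
}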
- * @list: List containing packets to free, which is managed using - * by homa_qdisc_srpt_enqueue and homa_qdisc_srpt_dequeue; - * it will be empty on return. + * homa_qdisc_free_homa() - Free all of the Homa packets that have been + * deferred for @qdev. + * @qdev: Object whose @homa_deferred list should be emptied. */ -void homa_qdisc_srpt_free(struct sk_buff_head *list) +void homa_qdisc_free_homa(struct homa_qdisc_dev *qdev) { struct sk_buff *skb; while (1) { - skb = homa_qdisc_srpt_dequeue(list); + skb = homa_qdisc_dequeue_homa(qdev); if (!skb) break; kfree_skb(skb); @@ -440,6 +444,13 @@ int homa_qdisc_update_link_idle(struct homa_qdisc_dev *qdev, int bytes, clock = homa_clock(); idle = atomic64_read(&qdev->link_idle_time); if (idle < clock) { + if (qdev->pacer_wake_time) { + u64 lost = (qdev->pacer_wake_time > idle) + ? clock - qdev->pacer_wake_time + : clock - idle; + INC_METRIC(pacer_lost_cycles, lost); + tt_record1("pacer lost %d cycles", lost); + } new_idle = clock + cycles_for_packet; } else { if (max_queue_cycles >= 0 && (idle - clock) > @@ -453,6 +464,8 @@ int homa_qdisc_update_link_idle(struct homa_qdisc_dev *qdev, int bytes, break; INC_METRIC(idle_time_conflicts, 1); } + if (!skb_queue_empty(&qdev->homa_deferred)) + INC_METRIC(pacer_bytes, bytes); return 1; } @@ -473,7 +486,9 @@ int homa_qdisc_pacer_main(void *device) if (kthread_should_stop()) break; start = homa_clock(); + qdev->pacer_wake_time = start; homa_qdisc_pacer(qdev); + qdev->pacer_wake_time = 0; INC_METRIC(pacer_cycles, homa_clock() - start); if (!skb_queue_empty(&qdev->homa_deferred) || @@ -554,7 +569,7 @@ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev) * still too long because other threads have queued packets, * but we transmit anyway so the pacer thread doesn't starve. */ - skb = homa_qdisc_srpt_dequeue(&qdev->homa_deferred); + skb = homa_qdisc_dequeue_homa(qdev); if (!skb) break; diff --git a/homa_qdisc.h b/homa_qdisc.h index 8302bc41..b68e03c6 100644 --- a/homa_qdisc.h +++ b/homa_qdisc.h @@ -117,6 +117,18 @@ struct homa_qdisc_dev { */ struct sk_buff_head tcp_deferred; + /** + * @last_defer: The most recent homa_clock() time when a packet was + * added to homa_deferred or tcp_deferred. + */ + u64 last_defer; + + /** + * @pacer_wake_time: homa_clock() time when the pacer woke up (if + * the pacer is running) or 0 if the pacer is sleeping. + */ + u64 pacer_wake_time; + /** * @pacer_kthread: Kernel thread that eventually transmits packets * on homa_deferred and tcp_deferred. 
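homa_qdisc_update_link_idle, shown above with its new metrics, paces against a single shared timestamp: link_idle_time is the predicted moment the NIC finishes everything charged so far, each packet advances it by its serialization time, and a packet is deferred once the backlog exceeds the permitted queue length. A condensed sketch of that loop (simplified scaling; charge_packet and pkt_cycles are illustrative names):

bool charge_packet(atomic64_t *link_idle, u64 now, u64 pkt_cycles,
                   s64 max_queue_cycles)
{
        u64 idle, new_idle;

        while (1) {
                idle = atomic64_read(link_idle);
                if (idle < now)                 /* link has gone idle */
                        new_idle = now + pkt_cycles;
                else if (max_queue_cycles >= 0 &&
                         (s64)(idle - now) > max_queue_cycles)
                        return false;           /* NIC backlogged: defer */
                else
                        new_idle = idle + pkt_cycles;
                if (atomic64_cmpxchg(link_idle, idle, new_idle) == idle)
                        return true;
                /* Lost a race with a concurrent sender; retry. */
        }
}

The two new metrics hang off this state: pacer_lost_cycles charges the pacer for time the link sat idle while the pacer was awake, and pacer_bytes counts bytes charged to the link while deferred packets were waiting.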
@@ -137,12 +149,14 @@ struct homa_qdisc_dev { spinlock_t pacer_mutex __aligned(L1_CACHE_BYTES); }; +void homa_qdisc_defer_homa(struct homa_qdisc_dev *qdev, + struct sk_buff *skb); +struct sk_buff * + homa_qdisc_dequeue_homa(struct homa_qdisc_dev *qdev); void homa_qdisc_destroy(struct Qdisc *sch); int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free); -int homa_qdisc_redirect_skb(struct sk_buff *skb, - struct homa_qdisc_dev *qdev, - bool pacer); +void homa_qdisc_free_homa(struct homa_qdisc_dev *qdev); int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack); int homa_qdisc_pacer_main(void *device); @@ -150,13 +164,11 @@ struct homa_qdisc_dev * homa_qdisc_qdev_get(struct homa_net *hnet, struct net_device *dev); void homa_qdisc_qdev_put(struct homa_qdisc_dev *qdev); +int homa_qdisc_redirect_skb(struct sk_buff *skb, + struct homa_qdisc_dev *qdev, + bool pacer); int homa_qdisc_register(void); void homa_qdisc_set_qixs(struct homa_qdisc_dev *qdev); -void homa_qdisc_srpt_enqueue(struct sk_buff_head *list, - struct sk_buff *skb); -struct sk_buff * - homa_qdisc_srpt_dequeue(struct sk_buff_head *list); -void homa_qdisc_srpt_free(struct sk_buff_head *list); void homa_qdisc_unregister(void); void homa_qdisc_update_all_sysctl(struct homa_net *hnet); int homa_qdisc_update_link_idle(struct homa_qdisc_dev *qdev, diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c index 91c22877..32ab3088 100644 --- a/test/unit_homa_qdisc.c +++ b/test/unit_homa_qdisc.c @@ -430,108 +430,162 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__use_special_queue) homa_qdisc_destroy(self->qdiscs[3]); } -TEST_F(homa_qdisc, homa_qdisc_srpt_enqueue__basics) +TEST_F(homa_qdisc, homa_qdisc_defer_homa__basics) { - struct sk_buff_head list; + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); - skb_queue_head_init(&list); - homa_qdisc_srpt_enqueue(&list, new_test_skb("msg1", 1000)); - homa_qdisc_srpt_enqueue(&list, new_test_skb("msg2", 2000)); - homa_qdisc_srpt_enqueue(&list, new_test_skb("msg3", 500)); - homa_qdisc_srpt_enqueue(&list, new_test_skb("msg4", 1000)); - log_skb_list(&list); + homa_qdisc_defer_homa(qdev, new_test_skb("msg1", 1000)); + homa_qdisc_defer_homa(qdev, new_test_skb("msg2", 2000)); + homa_qdisc_defer_homa(qdev, new_test_skb("msg3", 500)); + homa_qdisc_defer_homa(qdev, new_test_skb("msg4", 1000)); + unit_log_clear(); + log_skb_list(&qdev->homa_deferred); EXPECT_STREQ("msg3:500; msg1:1000; msg4:1000; msg2:2000", unit_log_get()); - homa_qdisc_srpt_free(&list); + homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_srpt_enqueue__multiple_pkts_for_rpc) +TEST_F(homa_qdisc, homa_qdisc_defer_homa__throttled_cycles_metric) { - struct sk_buff_head list; + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + + mock_clock = 5000; + homa_qdisc_defer_homa(qdev, new_test_skb("msg1", 1000)); + EXPECT_EQ(5000, qdev->last_defer); + EXPECT_EQ(0, homa_metrics_per_cpu()->throttled_cycles); + + mock_clock = 12000; + homa_qdisc_defer_homa(qdev, new_test_skb("msg2", 2000)); + EXPECT_EQ(12000, qdev->last_defer); + EXPECT_EQ(7000, homa_metrics_per_cpu()->throttled_cycles); + + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_defer_homa__multiple_pkts_for_rpc) +{ + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); - skb_queue_head_init(&list); - homa_qdisc_srpt_enqueue(&list, new_test_skb("msg1", 1000)); - homa_qdisc_srpt_enqueue(&list, 
new_test_skb("msg2", 2000)); - homa_qdisc_srpt_enqueue(&list, new_test_skb("msg1", 800)); - homa_qdisc_srpt_enqueue(&list, new_test_skb("msg1", 600)); - homa_qdisc_srpt_enqueue(&list, new_test_skb("msg1", 400)); - log_skb_list(&list); + homa_qdisc_defer_homa(qdev, new_test_skb("msg1", 1000)); + homa_qdisc_defer_homa(qdev, new_test_skb("msg2", 2000)); + homa_qdisc_defer_homa(qdev, new_test_skb("msg1", 800)); + homa_qdisc_defer_homa(qdev, new_test_skb("msg1", 600)); + homa_qdisc_defer_homa(qdev, new_test_skb("msg1", 400)); + unit_log_clear(); + log_skb_list(&qdev->homa_deferred); EXPECT_STREQ("msg1:1000 [msg1:800 msg1:600 msg1:400]; msg2:2000", unit_log_get()); - homa_qdisc_srpt_free(&list); + homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_srpt_dequeue__list_empty) +TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__list_empty) { - struct sk_buff_head list; + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + EXPECT_EQ(1, skb_queue_empty(&qdev->homa_deferred)); - skb_queue_head_init(&list); - EXPECT_EQ(NULL, homa_qdisc_srpt_dequeue(&list)); + EXPECT_EQ(NULL, homa_qdisc_dequeue_homa(qdev)); + homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_srpt_dequeue__no_siblings) +TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__no_siblings) { + struct homa_qdisc_dev *qdev; struct sk_buff *skb; - struct sk_buff_head list; - skb_queue_head_init(&list); + qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + skb = new_test_skb("msg1", 1000); - homa_qdisc_srpt_enqueue(&list, skb); - homa_qdisc_srpt_enqueue(&list, new_test_skb("msg2", 2000)); - homa_qdisc_srpt_enqueue(&list, new_test_skb("msg3", 3000)); - log_skb_list(&list); + homa_qdisc_defer_homa(qdev, skb); + homa_qdisc_defer_homa(qdev, new_test_skb("msg2", 2000)); + homa_qdisc_defer_homa(qdev, new_test_skb("msg3", 3000)); + unit_log_clear(); + log_skb_list(&qdev->homa_deferred); EXPECT_STREQ("msg1:1000; msg2:2000; msg3:3000", unit_log_get()); - EXPECT_EQ(skb, homa_qdisc_srpt_dequeue(&list)); + EXPECT_EQ(skb, homa_qdisc_dequeue_homa(qdev)); unit_log_clear(); - log_skb_list(&list); + log_skb_list(&qdev->homa_deferred); EXPECT_STREQ("msg2:2000; msg3:3000", unit_log_get()); kfree_skb(skb); - homa_qdisc_srpt_free(&list); + homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_srpt_dequeue__siblings) +TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__siblings) { + struct homa_qdisc_dev *qdev; struct sk_buff *skb1, *skb2; - struct sk_buff_head list; - skb_queue_head_init(&list); + qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + skb1 = new_test_skb("msg1", 1000); - homa_qdisc_srpt_enqueue(&list, skb1); + homa_qdisc_defer_homa(qdev, skb1); skb2 = new_test_skb("msg2", 2000); - homa_qdisc_srpt_enqueue(&list, skb2); - homa_qdisc_srpt_enqueue(&list, new_test_skb("msg3", 3000)); - log_skb_list(&list); + homa_qdisc_defer_homa(qdev, skb2); + homa_qdisc_defer_homa(qdev, new_test_skb("msg3", 3000)); + unit_log_clear(); + log_skb_list(&qdev->homa_deferred); EXPECT_STREQ("msg1:1000; msg2:2000; msg3:3000", unit_log_get()); - EXPECT_EQ(skb1, homa_qdisc_srpt_dequeue(&list)); + EXPECT_EQ(skb1, homa_qdisc_dequeue_homa(qdev)); unit_log_clear(); - log_skb_list(&list); + log_skb_list(&qdev->homa_deferred); EXPECT_STREQ("msg2:2000; msg3:3000", unit_log_get()); kfree_skb(skb1); - EXPECT_EQ(skb2, homa_qdisc_srpt_dequeue(&list)); + EXPECT_EQ(skb2, homa_qdisc_dequeue_homa(qdev)); unit_log_clear(); - log_skb_list(&list); + log_skb_list(&qdev->homa_deferred); EXPECT_STREQ("msg3:3000", unit_log_get()); kfree_skb(skb2); - 
homa_qdisc_srpt_free(&list); + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__throttled_cycles_metric) +{ + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + + mock_clock = 5000; + homa_qdisc_defer_homa(qdev, new_test_skb("msg2", 2000)); + homa_qdisc_defer_homa(qdev, new_test_skb("msg3", 3000)); + EXPECT_EQ(0, homa_metrics_per_cpu()->throttled_cycles); + EXPECT_EQ(5000, qdev->last_defer); + + mock_clock = 12000; + kfree_skb(homa_qdisc_dequeue_homa(qdev)); + EXPECT_EQ(0, homa_metrics_per_cpu()->throttled_cycles); + EXPECT_EQ(0, skb_queue_empty(&qdev->homa_deferred)); + + kfree_skb(homa_qdisc_dequeue_homa(qdev)); + EXPECT_EQ(7000, homa_metrics_per_cpu()->throttled_cycles); + EXPECT_EQ(1, skb_queue_empty(&qdev->homa_deferred)); + homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_srpt_free) +TEST_F(homa_qdisc, homa_qdisc_free_homa) { - struct sk_buff_head list; + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); - skb_queue_head_init(&list); - homa_qdisc_srpt_enqueue(&list, new_test_skb("msg1", 500)); - homa_qdisc_srpt_enqueue(&list, new_test_skb("msg2", 1000)); - homa_qdisc_srpt_enqueue(&list, new_test_skb("msg2", 600)); - homa_qdisc_srpt_enqueue(&list, new_test_skb("msg2", 400)); - homa_qdisc_srpt_enqueue(&list, new_test_skb("msg3", 2000)); - log_skb_list(&list); + homa_qdisc_defer_homa(qdev, new_test_skb("msg1", 500)); + homa_qdisc_defer_homa(qdev, new_test_skb("msg2", 1000)); + homa_qdisc_defer_homa(qdev, new_test_skb("msg2", 600)); + homa_qdisc_defer_homa(qdev, new_test_skb("msg2", 400)); + homa_qdisc_defer_homa(qdev, new_test_skb("msg3", 2000)); + unit_log_clear(); + log_skb_list(&qdev->homa_deferred); EXPECT_STREQ("msg1:500; msg2:1000 [msg2:600 msg2:400]; msg3:2000", unit_log_get()); - homa_qdisc_srpt_free(&list); + homa_qdisc_free_homa(qdev); unit_log_clear(); - log_skb_list(&list); + log_skb_list(&qdev->homa_deferred); EXPECT_STREQ("", unit_log_get()); + homa_qdisc_qdev_put(qdev); } TEST_F(homa_qdisc, homa_qdisc_update_link_idle__nic_idle) @@ -546,6 +600,34 @@ TEST_F(homa_qdisc, homa_qdisc_update_link_idle__nic_idle) EXPECT_EQ(1200 + HOMA_ETH_FRAME_OVERHEAD, atomic64_read(&qdev.link_idle_time)); } +TEST_F(homa_qdisc, homa_qdisc_update_link_idle__pacer_lost_cycles_metric) +{ + struct homa_qdisc_dev qdev; + + /* qdev->pacer_wake_time < idle */ + mock_clock = 10000; + memset(&qdev, 0, sizeof(qdev)); + qdev.cycles_per_mibyte = 1 << 20; /* 1 cycle per byte. 
*/ + atomic64_set(&qdev.link_idle_time, 4000); + qdev.pacer_wake_time = 2000; + + homa_qdisc_update_link_idle(&qdev, 200, 0); + EXPECT_EQ(6000, homa_metrics_per_cpu()->pacer_lost_cycles); + + /* qdev->pacer_wake_time > idle */ + atomic64_set(&qdev.link_idle_time, 4000); + qdev.pacer_wake_time = 8000; + + homa_qdisc_update_link_idle(&qdev, 200, 0); + EXPECT_EQ(8000, homa_metrics_per_cpu()->pacer_lost_cycles); + + /* pacer_inactive */ + atomic64_set(&qdev.link_idle_time, 4000); + qdev.pacer_wake_time = 0; + + homa_qdisc_update_link_idle(&qdev, 200, 0); + EXPECT_EQ(8000, homa_metrics_per_cpu()->pacer_lost_cycles); +} TEST_F(homa_qdisc, homa_qdisc_update_link_idle__queue_too_long) { struct homa_qdisc_dev qdev; @@ -591,6 +673,26 @@ TEST_F(homa_qdisc, homa_qdisc_update_link_idle__cmpxchg_conflicts) atomic64_read(&qdev.link_idle_time)); EXPECT_EQ(4, homa_metrics_per_cpu()->idle_time_conflicts); } +TEST_F(homa_qdisc, homa_qdisc_update_link_idle__pacer_bytes_metric) +{ + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device); + ASSERT_FALSE(IS_ERR(qdev)); + + /* No deferred packets. */ + homa_qdisc_update_link_idle(qdev, 200, -1); + EXPECT_EQ(0, homa_metrics_per_cpu()->pacer_bytes); + + /* Deferred packets. */ + homa_qdisc_defer_homa(qdev, + mock_skb_alloc(&self->addr, &self->data.common, + 1500, 0)); + homa_qdisc_update_link_idle(qdev, 500, -1); + EXPECT_EQ(500, homa_metrics_per_cpu()->pacer_bytes); + + homa_qdisc_qdev_put(qdev); +} TEST_F(homa_qdisc, homa_qdisc_pacer_main__basics) { @@ -630,8 +732,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__enqueue_packet) qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); link_idle = atomic64_read(&qdev->link_idle_time); - homa_qdisc_srpt_enqueue(&qdev->homa_deferred, - new_test_skb("msg1", 1000)); + homa_qdisc_defer_homa(qdev, new_test_skb("msg1", 1000)); EXPECT_EQ(1, qdev->homa_deferred.qlen); EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); qdev->pacer_qix = 3; @@ -653,8 +754,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__pacer_lock_unavailable) qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); link_idle = atomic64_read(&qdev->link_idle_time); - homa_qdisc_srpt_enqueue(&qdev->homa_deferred, - new_test_skb("msg1", 1000)); + homa_qdisc_defer_homa(qdev, new_test_skb("msg1", 1000)); EXPECT_EQ(1, qdev->homa_deferred.qlen); EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); qdev->pacer_qix = 3; @@ -678,8 +778,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__spin_until_link_idle) EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); qdev->pacer_qix = 3; EXPECT_EQ(0, self->qdiscs[3]->q.qlen); - homa_qdisc_srpt_enqueue(&qdev->homa_deferred, - new_test_skb("msg1", 1000)); + homa_qdisc_defer_homa(qdev, new_test_skb("msg1", 1000)); mock_clock = 0; mock_clock_tick = 1000; @@ -693,9 +792,10 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__spin_until_link_idle) /* Packet will get transmitted when mock_clock ticks to 7000, but * clock ticks once more in homa_qdisc_update_link_idle, then once - * in homa_qdisc_pacer before it returns. + * in homa_qdisc_dequeue_homa (to update metrics when the queue + * empties) and once more in homa_qdisc_pacer before it returns. 
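The revised expectation just below (10000 rather than 9000) falls out of how the test clock behaves: each read advances time by a fixed step, so a test can count exactly how many clock reads a code path performs. A sketch of that fixture (mock_clock and mock_clock_tick are the real test variables; the exact mock implementation is assumed here, not copied from test/mock.c):

u64 mock_clock;        /* current test time; tests may also set it */
u64 mock_clock_tick;   /* added on every read; 0 freezes the clock */

u64 homa_clock(void)   /* as mocked for unit tests */
{
        mock_clock += mock_clock_tick;
        return mock_clock;
}

The extra homa_clock() read that the throttled_cycles bookkeeping adds in homa_qdisc_dequeue_homa therefore shifts the test's final time by exactly one 1000-cycle tick.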
*/ - EXPECT_EQ(9000, mock_clock); + EXPECT_EQ(10000, mock_clock); homa_qdisc_destroy(self->qdiscs[3]); homa_qdisc_qdev_put(qdev); @@ -712,10 +812,10 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__return_after_one_packet) skb = new_test_skb("msg1", 1000); qdisc_skb_cb(skb)->pkt_len = 1500; - homa_qdisc_srpt_enqueue(&qdev->homa_deferred, skb); + homa_qdisc_defer_homa(qdev, skb); skb = new_test_skb("msg2", 1000); qdisc_skb_cb(skb)->pkt_len = 1500; - homa_qdisc_srpt_enqueue(&qdev->homa_deferred, skb); + homa_qdisc_defer_homa(qdev, skb); EXPECT_EQ(2, qdev->homa_deferred.qlen); mock_clock = atomic64_read(&qdev->link_idle_time); diff --git a/util/metrics.py b/util/metrics.py index 7299e66b..96d8db20 100755 --- a/util/metrics.py +++ b/util/metrics.py @@ -420,18 +420,18 @@ def scale_number(number): /(total_cores_used * elapsed_secs))) if deltas["pacer_cycles"] != 0: pacer_secs = float(deltas["pacer_cycles"])/(cpu_khz * 1000.0) - print("Pacer throughput: %6.2f Gbps (pacer output when pacer running)" % ( + print("Pacer throughput: %6.2f Gbps (pacer output when pacer active)" % ( deltas["pacer_bytes"]*8e-09/pacer_secs)) if deltas["throttled_cycles"] != 0: throttled_secs = float(deltas["throttled_cycles"])/(cpu_khz * 1000.0) - print("Throttled throughput: %5.2f Gbps (pacer output when throttled)" % ( + print("Throttled throughput: %5.2f Gbps (pacer output when NIC backlogged)" % ( deltas["pacer_bytes"]*8e-09/throttled_secs)) if deltas["skb_allocs"] != 0: print("Skb alloc time: %4.2f usec/skb" % ( float(deltas["skb_alloc_cycles"]) / (cpu_khz / 1000.0) / deltas["skb_allocs"])) if deltas["skb_page_allocs"] != 0: - print("Skb page alloc time: %5.2f usec/skb" % ( + print("Skb page alloc time: %5.2f usec/page" % ( float(deltas["skb_page_alloc_cycles"]) / (cpu_khz / 1000.0) / deltas["skb_page_allocs"])) From 814da12fb8fdf6a8384ba451f71552959c36df80 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 30 Jul 2025 14:06:46 -0700 Subject: [PATCH 419/625] Change all license clauses to add GPL-2.0+ as an option --- Kconfig | 2 +- Makefile.upstream | 2 +- homa.h | 2 +- homa_devel.c | 2 +- homa_devel.h | 2 +- homa_grant.c | 2 +- homa_grant.h | 2 +- homa_impl.h | 2 +- homa_incoming.c | 2 +- homa_interest.c | 2 +- homa_interest.h | 2 +- homa_metrics.c | 2 +- homa_metrics.h | 2 +- homa_offload.c | 2 +- homa_offload.h | 2 +- homa_outgoing.c | 2 +- homa_pacer.c | 2 +- homa_pacer.h | 2 +- homa_peer.c | 2 +- homa_peer.h | 2 +- homa_plumbing.c | 2 +- homa_pool.c | 2 +- homa_pool.h | 2 +- homa_qdisc.c | 2 +- homa_qdisc.h | 2 +- homa_receiver.h | 2 +- homa_rpc.c | 2 +- homa_rpc.h | 2 +- homa_skb.c | 2 +- homa_skb.h | 2 +- homa_sock.c | 2 +- homa_sock.h | 2 +- homa_stub.h | 2 +- homa_timer.c | 2 +- homa_utils.c | 2 +- homa_wire.h | 2 +- test/main.c | 2 +- test/mock.c | 2 +- test/unit_homa_grant.c | 2 +- test/unit_homa_incoming.c | 2 +- test/unit_homa_interest.c | 2 +- test/unit_homa_metrics.c | 2 +- test/unit_homa_offload.c | 2 +- test/unit_homa_outgoing.c | 2 +- test/unit_homa_pacer.c | 2 +- test/unit_homa_peer.c | 2 +- test/unit_homa_plumbing.c | 2 +- test/unit_homa_pool.c | 2 +- test/unit_homa_qdisc.c | 2 +- test/unit_homa_rpc.c | 2 +- test/unit_homa_skb.c | 2 +- test/unit_homa_sock.c | 2 +- test/unit_homa_timer.c | 2 +- test/unit_homa_utils.c | 2 +- test/unit_timetrace.c | 2 +- test/utils.c | 2 +- test/utils.h | 2 +- timetrace.c | 2 +- timetrace.h | 2 +- util/buffer_client.c | 2 +- util/buffer_server.c | 2 +- util/cp_basic | 2 +- util/cp_both | 2 +- util/cp_buffers | 2 +- util/cp_client_threads | 2 +- util/cp_config | 2 +- 
util/cp_config_buf | 2 +- util/cp_load | 2 +- util/cp_mtu | 2 +- util/cp_node.cc | 2 +- util/cp_server_ports | 2 +- util/cp_tcp | 2 +- util/cp_tcp_config | 2 +- util/cp_vs_tcp | 2 +- util/cperf.py | 2 +- util/diff_metrics.py | 2 +- util/diff_rtts.py | 2 +- util/dist.cc | 2 +- util/dist.h | 2 +- util/dist_test.cc | 2 +- util/dist_to_proto.cc | 2 +- util/get_time_trace.c | 2 +- util/get_traces | 2 +- util/homa_prio.cc | 2 +- util/homa_test.cc | 2 +- util/inc_tput.cc | 2 +- util/metrics.py | 2 +- util/plot.py | 2 +- util/plot_tthoma.py | 2 +- util/receive_raw.c | 2 +- util/rpcid.py | 2 +- util/send_raw.c | 2 +- util/server.cc | 2 +- util/service.py | 2 +- util/smi.cc | 2 +- util/strip.py | 2 +- util/strip_decl.py | 2 +- util/test_time_trace.c | 2 +- util/test_utils.cc | 2 +- util/test_utils.h | 2 +- util/time_trace.cc | 2 +- util/time_trace.h | 2 +- util/ttgrep.py | 2 +- util/tthoma.py | 2 +- util/ttmerge.py | 2 +- util/ttoffset.py | 2 +- util/ttprint.py | 2 +- util/ttrange.py | 2 +- util/ttsum.py | 2 +- util/ttsync.py | 2 +- util/ttsyslog.py | 2 +- util/use_memory.c | 2 +- 112 files changed, 112 insertions(+), 112 deletions(-) diff --git a/Kconfig b/Kconfig index 8ce5fbf0..16fec3fd 100644 --- a/Kconfig +++ b/Kconfig @@ -1,4 +1,4 @@ -# SPDX-License-Identifier: BSD-2-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # # Homa transport protocol # diff --git a/Makefile.upstream b/Makefile.upstream index ed894eba..a7ebccd4 100644 --- a/Makefile.upstream +++ b/Makefile.upstream @@ -1,4 +1,4 @@ -# SPDX-License-Identifier: BSD-2-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # # Makefile for the Linux implementation of the Homa transport protocol. diff --git a/homa.h b/homa.h index d7603488..e44754df 100644 --- a/homa.h +++ b/homa.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: BSD-2-Clause */ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This file defines the kernel call interface for the Homa * transport protocol. diff --git a/homa_devel.c b/homa_devel.c index be4886a0..c82022c6 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* This file contains functions that are useful to have in Homa during * development, but aren't needed in production versions. diff --git a/homa_devel.h b/homa_devel.h index 40f4e65c..8caf164a 100644 --- a/homa_devel.h +++ b/homa_devel.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: BSD-2-Clause */ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This file defines functions that are useful during Homa development; * they are not present in the upstreamed version of Homa in Linux. diff --git a/homa_grant.c b/homa_grant.c index 44aab6b7..fedcc899 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* This file contains functions related to issuing grants for incoming * messages. diff --git a/homa_grant.h b/homa_grant.h index d64a4753..3fa7a8f8 100644 --- a/homa_grant.h +++ b/homa_grant.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: BSD-2-Clause */ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This file contains definitions that related to generating grants. 
*/ diff --git a/homa_impl.h b/homa_impl.h index 62be3634..d3054d06 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: BSD-2-Clause */ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This file contains definitions that are shared across the files * that implement Homa for Linux. diff --git a/homa_incoming.c b/homa_incoming.c index 377bebe8..f14d9a45 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #ifndef __STRIP__ /* See strip.py */ /* This file contains functions that handle incoming Homa messages, including diff --git a/homa_interest.c b/homa_interest.c index a3e43573..6d36d6da 100644 --- a/homa_interest.c +++ b/homa_interest.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* This file contains functions for managing homa_interest structs. */ diff --git a/homa_interest.h b/homa_interest.h index 645d25d5..a50e54dd 100644 --- a/homa_interest.h +++ b/homa_interest.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: BSD-2-Clause */ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This file defines struct homa_interest and related functions. */ diff --git a/homa_metrics.c b/homa_metrics.c index 379dbbf5..74f6ed25 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* This file contains various functions for managing Homa's performance * counters. diff --git a/homa_metrics.h b/homa_metrics.h index 664915b7..cefa3400 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: BSD-2-Clause */ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This file contains declarations related to Homa's performance metrics. */ diff --git a/homa_offload.c b/homa_offload.c index e5bf9aea..cb4ff8b6 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* This file implements GSO (Generic Segmentation Offload) and GRO (Generic * Receive Offload) for Homa. diff --git a/homa_offload.h b/homa_offload.h index cf9904b1..936230e2 100644 --- a/homa_offload.h +++ b/homa_offload.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: BSD-2-Clause */ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This file contains definitions related to homa_offload.c. */ diff --git a/homa_outgoing.c b/homa_outgoing.c index e92cc546..587344e2 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* This file contains functions related to the sender side of message * transmission. It also contains utility functions for sending packets. diff --git a/homa_pacer.c b/homa_pacer.c index c14dde0d..8a8c9ddd 100644 --- a/homa_pacer.c +++ b/homa_pacer.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* This file implements the Homa pacer, which implements SRPT for packet * output. 
In order to do that, it throttles packet transmission to prevent diff --git a/homa_pacer.h b/homa_pacer.h index d19e25da..d3fab9db 100644 --- a/homa_pacer.h +++ b/homa_pacer.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: BSD-2-Clause */ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This file defines structs and functions related to the Homa pacer, * which implements SRPT for packet output. In order to do that, it diff --git a/homa_peer.c b/homa_peer.c index 08616ac5..203f25f3 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* This file provides functions related to homa_peer and homa_peertab * objects. diff --git a/homa_peer.h b/homa_peer.h index df39db05..82d68bc8 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: BSD-2-Clause */ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This file contains definitions related to managing peers (homa_peer * and homa_peertab). diff --git a/homa_plumbing.c b/homa_plumbing.c index f2ee9ea9..b0c27a7c 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* This file consists mostly of "glue" that hooks Homa into the rest of * the Linux kernel. The guts of the protocol are in other files. diff --git a/homa_pool.c b/homa_pool.c index 8651c98e..b1d76a78 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" #ifndef __STRIP__ /* See strip.py */ diff --git a/homa_pool.h b/homa_pool.h index 708183e2..15ba5c5d 100644 --- a/homa_pool.h +++ b/homa_pool.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: BSD-2-Clause */ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This file contains definitions used to manage user-space buffer pools. */ diff --git a/homa_qdisc.c b/homa_qdisc.c index 0a9ad37a..a91ec27f 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* This file implements a special-purpose queuing discipline for Homa. * This queuing discipline serves the following purposes: diff --git a/homa_qdisc.h b/homa_qdisc.h index b68e03c6..56bd2c69 100644 --- a/homa_qdisc.h +++ b/homa_qdisc.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: BSD-2-Clause */ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This file contains definitions related to Homa's special-purpose * queuing discipline diff --git a/homa_receiver.h b/homa_receiver.h index b8639643..ca367831 100644 --- a/homa_receiver.h +++ b/homa_receiver.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: BSD-2-Clause */ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ #pragma once diff --git a/homa_rpc.c b/homa_rpc.c index 0d7edf19..ed71ee6d 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* This file contains functions for managing homa_rpc structs. */ diff --git a/homa_rpc.h b/homa_rpc.h index cc30a8b4..767ef9cd 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: BSD-2-Clause */ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This file defines homa_rpc and related structs. 
*/ diff --git a/homa_skb.c b/homa_skb.c index c37be6da..36aa438f 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* This file contains functions for allocating and freeing sk_buffs for * outbound packets. In particular, this file implements efficient management diff --git a/homa_skb.h b/homa_skb.h index 45428ef5..f484fa23 100644 --- a/homa_skb.h +++ b/homa_skb.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: BSD-2-Clause */ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This file contains definitions related to efficient management of * memory associated with transmit sk_buffs. diff --git a/homa_sock.c b/homa_sock.c index 606b506a..daf2113a 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* This file manages homa_sock and homa_socktab objects. */ diff --git a/homa_sock.h b/homa_sock.h index 3c8830e7..22967f11 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: BSD-2-Clause */ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This file defines structs and other things related to Homa sockets. */ diff --git a/homa_stub.h b/homa_stub.h index aefe816d..875d3bfe 100644 --- a/homa_stub.h +++ b/homa_stub.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: BSD-2-Clause */ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This file contains stripped-down replacements that have been * temporarily removed from Homa during the Linux upstreaming diff --git a/homa_timer.c b/homa_timer.c index 4f561f73..76fdb6e6 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* This file handles timing-related functions for Homa, such as retries * and timeouts. diff --git a/homa_utils.c b/homa_utils.c index bce7dce1..8ec3963a 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* This file contains miscellaneous utility functions for Homa, such * as initializing and destroying homa structs. diff --git a/homa_wire.h b/homa_wire.h index f4296c15..871b740e 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: BSD-2-Clause */ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This file defines the on-the-wire format of Homa packets. */ diff --git a/test/main.c b/test/main.c index 1196faae..d760827f 100644 --- a/test/main.c +++ b/test/main.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* Main program for running Homa unit tests. 
*/ diff --git a/test/mock.c b/test/mock.c index 84e14c32..dc82e27f 100644 --- a/test/mock.c +++ b/test/mock.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* This file provides simplified substitutes for many Linux variables and * functions in order to allow Homa unit tests to be run outside a Linux diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 14df6514..c2eb73a8 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" #include "homa_grant.h" diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index a2c6f276..5f198f59 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" #include "homa_grant.h" diff --git a/test/unit_homa_interest.c b/test/unit_homa_interest.c index 9fb49e0c..761dcc46 100644 --- a/test/unit_homa_interest.c +++ b/test/unit_homa_interest.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" #include "homa_interest.h" diff --git a/test/unit_homa_metrics.c b/test/unit_homa_metrics.c index 523f1e8a..c048759a 100644 --- a/test/unit_homa_metrics.c +++ b/test/unit_homa_metrics.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" #define KSELFTEST_NOT_MAIN 1 diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index 820d9d57..7e106333 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" #include "homa_offload.h" diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 3d1e16f8..5b01babd 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" #include "homa_grant.h" diff --git a/test/unit_homa_pacer.c b/test/unit_homa_pacer.c index bd69d504..afa1290b 100644 --- a/test/unit_homa_pacer.c +++ b/test/unit_homa_pacer.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" #include "homa_pacer.h" diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index 467c9fc5..80338db6 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" #include "homa_peer.h" diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index bbc8a4f6..83c43613 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" #include "homa_peer.h" diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index 40dbdced..e01b6716 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" #include "homa_grant.h" diff --git 
a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c index 32ab3088..d0249482 100644 --- a/test/unit_homa_qdisc.c +++ b/test/unit_homa_qdisc.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" #include "homa_pacer.h" diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index fb29230f..3ac6f3b1 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" #include "homa_grant.h" diff --git a/test/unit_homa_skb.c b/test/unit_homa_skb.c index 55702ddc..f4a9dfc2 100644 --- a/test/unit_homa_skb.c +++ b/test/unit_homa_skb.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" #include "homa_skb.h" diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c index dded29c3..d44e9e76 100644 --- a/test/unit_homa_sock.c +++ b/test/unit_homa_sock.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" #include "homa_interest.h" diff --git a/test/unit_homa_timer.c b/test/unit_homa_timer.c index 10ebab85..02d86c73 100644 --- a/test/unit_homa_timer.c +++ b/test/unit_homa_timer.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" #include "homa_grant.h" diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index 0fb7be60..ed94377c 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" #include "homa_peer.h" diff --git a/test/unit_timetrace.c b/test/unit_timetrace.c index 99693220..33db5d90 100644 --- a/test/unit_timetrace.c +++ b/test/unit_timetrace.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" #define KSELFTEST_NOT_MAIN 1 diff --git a/test/utils.c b/test/utils.c index 6abfbd29..7c0d7b62 100644 --- a/test/utils.c +++ b/test/utils.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ /* This file various utility functions for unit testing; this file * is implemented entirely in C, and accesses Homa and kernel internals. diff --git a/test/utils.h b/test/utils.h index 05b3f437..9a2390a9 100644 --- a/test/utils.h +++ b/test/utils.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: BSD-2-Clause */ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* Utility functions for unit tests, implemented in C. 
*/ diff --git a/timetrace.c b/timetrace.c index c7ea7377..bc86ba1c 100644 --- a/timetrace.c +++ b/timetrace.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: BSD-2-Clause +// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ #include "homa_impl.h" diff --git a/timetrace.h b/timetrace.h index 08468ad1..60e6270c 100644 --- a/timetrace.h +++ b/timetrace.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: BSD-2-Clause */ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ #ifndef HOMA_TIMETRACE_H #define HOMA_TIMETRACE_H diff --git a/util/buffer_client.c b/util/buffer_client.c index 4cd81f07..6b90ca25 100644 --- a/util/buffer_client.c +++ b/util/buffer_client.c @@ -1,5 +1,5 @@ /* Copyright (c) 2019-2022 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This is a test program used together with buffer_server.c to learn about diff --git a/util/buffer_server.c b/util/buffer_server.c index 142ba4ae..e5bd4029 100644 --- a/util/buffer_server.c +++ b/util/buffer_server.c @@ -1,5 +1,5 @@ /* Copyright (c) 2019-2022 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This is a test program used together with buffer_client.c to learn about diff --git a/util/cp_basic b/util/cp_basic index 755df89c..ae41548e 100755 --- a/util/cp_basic +++ b/util/cp_basic @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c) 2020-2023 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This cperf benchmark computes basic latency and throughput numbers # for Homa and TCP. diff --git a/util/cp_both b/util/cp_both index dfe8e5fc..63264502 100755 --- a/util/cp_both +++ b/util/cp_both @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c) 2024 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This cperf benchmark runs both TCP and Homa on each client and server # node in order to measure interference between the protocols. diff --git a/util/cp_buffers b/util/cp_buffers index cced4e33..7d8fc883 100755 --- a/util/cp_buffers +++ b/util/cp_buffers @@ -1,7 +1,7 @@ #!/usr/bin/python3 -u # Copyright (c) 2023 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This cperf benchmark varies one or more aspects of Homa's configuration, # similar and measures Homa slowdown, but it also measures switch buffer diff --git a/util/cp_client_threads b/util/cp_client_threads index 7814757d..15291bf3 100755 --- a/util/cp_client_threads +++ b/util/cp_client_threads @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c) 2020-2023 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This cperf benchmark measures the throughput of a single client as a # function of the number of sending threads diff --git a/util/cp_config b/util/cp_config index 2aabc125..55bd88c2 100755 --- a/util/cp_config +++ b/util/cp_config @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c) 2020-2023 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This cperf benchmark measures Homa slowdown while varying one or more # aspects of Homa's configuration (such as duty cycle). 
diff --git a/util/cp_config_buf b/util/cp_config_buf index 8933e719..865143b3 100755 --- a/util/cp_config_buf +++ b/util/cp_config_buf @@ -1,7 +1,7 @@ #!/usr/bin/python3 -u # Copyright (c) 2023 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This cperf benchmark varies one or more aspects of Homa's configuration, # similar and measures Homa slowdown, but it also measures switch buffer diff --git a/util/cp_load b/util/cp_load index f19fa99c..96f379b5 100755 --- a/util/cp_load +++ b/util/cp_load @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c) 2020-2022 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This cperf benchmark generates CDFs of short-message latency for Homa # and TCP under different loads. diff --git a/util/cp_mtu b/util/cp_mtu index 6b0141a4..e523c646 100755 --- a/util/cp_mtu +++ b/util/cp_mtu @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c) 2020-2022 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This cperf benchmark generates CDFs of short-message latency for Homa # and TCP under different values for MTU (maximum packet size). diff --git a/util/cp_node.cc b/util/cp_node.cc index 1209d842..c9857ac9 100644 --- a/util/cp_node.cc +++ b/util/cp_node.cc @@ -1,5 +1,5 @@ /* Copyright (c) 2019-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This file contains a program that runs on one node, as part of diff --git a/util/cp_server_ports b/util/cp_server_ports index 24d85896..dfc3842d 100755 --- a/util/cp_server_ports +++ b/util/cp_server_ports @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c) 2020-2022 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This cperf benchmark measures the throughput of a single server as a # function of the number of receiving ports diff --git a/util/cp_tcp b/util/cp_tcp index c8b4cd3a..7b124c7b 100755 --- a/util/cp_tcp +++ b/util/cp_tcp @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c) 2020-2022 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This cperf benchmark measures the performance of TCP by itself, with # no message truncation. diff --git a/util/cp_tcp_config b/util/cp_tcp_config index 185909eb..cb8de9b8 100755 --- a/util/cp_tcp_config +++ b/util/cp_tcp_config @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c) 2020-2022 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This cperf benchmark measures TCP and DCTCP while varying one or more # aspects of Homa's configuration (such as duty cycle). diff --git a/util/cp_vs_tcp b/util/cp_vs_tcp index 650fe516..2edc9933 100755 --- a/util/cp_vs_tcp +++ b/util/cp_vs_tcp @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c) 2020-2023 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This cperf benchmark compares the performance of Homa with TCP. # Type "cp_vs_tcp --help" for documentation. 
diff --git a/util/cperf.py b/util/cperf.py index 24993cc3..59446001 100644 --- a/util/cperf.py +++ b/util/cperf.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c) 2020-2023 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This file contains library functions used to run cluster performance # tests for the Linux kernel implementation of Homa. diff --git a/util/diff_metrics.py b/util/diff_metrics.py index c90c53d1..9e39a043 100755 --- a/util/diff_metrics.py +++ b/util/diff_metrics.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c) 2018-2022 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ """ This program reads 2 Homa metrics files (/proc/net/homa_metrics) diff --git a/util/diff_rtts.py b/util/diff_rtts.py index 0cd42e8d..21dd0f25 100755 --- a/util/diff_rtts.py +++ b/util/diff_rtts.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c) 2023 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ """ Compare two .rtts files to identify differences between them. diff --git a/util/dist.cc b/util/dist.cc index e6dfca3c..416eaa58 100644 --- a/util/dist.cc +++ b/util/dist.cc @@ -1,5 +1,5 @@ /* Copyright (c) 2019-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This file contains the workload distributions from the Homa paper, plus diff --git a/util/dist.h b/util/dist.h index 15e8c7dd..840a50cf 100644 --- a/util/dist.h +++ b/util/dist.h @@ -1,5 +1,5 @@ /* Copyright (c) 2019-2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This file defines the kernel contains information and supporting diff --git a/util/dist_test.cc b/util/dist_test.cc index 50137b74..df839513 100644 --- a/util/dist_test.cc +++ b/util/dist_test.cc @@ -1,5 +1,5 @@ /* Copyright (c) 2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ #include diff --git a/util/dist_to_proto.cc b/util/dist_to_proto.cc index 88f4a454..ccf35a53 100644 --- a/util/dist_to_proto.cc +++ b/util/dist_to_proto.cc @@ -1,5 +1,5 @@ /* Copyright (c) 2023 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ #include "dist.h" diff --git a/util/get_time_trace.c b/util/get_time_trace.c index 84393bd4..9cb11717 100644 --- a/util/get_time_trace.c +++ b/util/get_time_trace.c @@ -1,5 +1,5 @@ /* Copyright (c) 2019-2022 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /** diff --git a/util/get_traces b/util/get_traces index f200af61..f166adab 100755 --- a/util/get_traces +++ b/util/get_traces @@ -1,7 +1,7 @@ #!/bin/bash # Copyright (c) 2023 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # Usage: # get_traces first last dst diff --git a/util/homa_prio.cc b/util/homa_prio.cc index ba1d829f..b33a6010 100644 --- a/util/homa_prio.cc +++ b/util/homa_prio.cc @@ -1,5 +1,5 @@ /* Copyright (c) 2020-2022 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This file contains a program that dynamically adjusts Homa's allocation diff --git a/util/homa_test.cc b/util/homa_test.cc index c316967d..f1df4257 100644 --- a/util/homa_test.cc +++ 
b/util/homa_test.cc @@ -1,5 +1,5 @@ /* Copyright (c) 2019-2022 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ // This file contains a collection of tests for the Linux implementation diff --git a/util/inc_tput.cc b/util/inc_tput.cc index bfd2c3da..2bf31bd3 100644 --- a/util/inc_tput.cc +++ b/util/inc_tput.cc @@ -1,5 +1,5 @@ /* Copyright (c) 2024 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This program measures the throughput of atomic increments in the face diff --git a/util/metrics.py b/util/metrics.py index 96d8db20..efa55386 100755 --- a/util/metrics.py +++ b/util/metrics.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c) 2019-2023 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ """ Reads Homa metrics from the kernel and prints out anything that is changed diff --git a/util/plot.py b/util/plot.py index 06aca513..1af0516a 100755 --- a/util/plot.py +++ b/util/plot.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c) 2023 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This file provides a library of functions for generating plots. diff --git a/util/plot_tthoma.py b/util/plot_tthoma.py index bfbbe075..27aee7e6 100755 --- a/util/plot_tthoma.py +++ b/util/plot_tthoma.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c) 2023 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This file provides a collection of functions that plot data generated # by tthoma.py. Invoke with the --help option for more information. diff --git a/util/receive_raw.c b/util/receive_raw.c index 133be3b6..fb37eb4f 100644 --- a/util/receive_raw.c +++ b/util/receive_raw.c @@ -1,5 +1,5 @@ /* Copyright (c) 2019-2022 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This is a test program that uses a raw socket to receive packets diff --git a/util/rpcid.py b/util/rpcid.py index ebb3acf4..62a7e949 100755 --- a/util/rpcid.py +++ b/util/rpcid.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c)2023 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ """ Analyzes Homa timetraces on two different machines to extract a diff --git a/util/send_raw.c b/util/send_raw.c index 1ebfb8e2..f6a25fcf 100644 --- a/util/send_raw.c +++ b/util/send_raw.c @@ -1,5 +1,5 @@ /* Copyright (c) 2019-2022 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This is a test program that will send a packet to a given diff --git a/util/server.cc b/util/server.cc index fa7b076e..0754ebe1 100644 --- a/util/server.cc +++ b/util/server.cc @@ -1,5 +1,5 @@ /* Copyright (c) 2019-2022 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This is a test program that acts as a server for testing either diff --git a/util/service.py b/util/service.py index 756d0239..0fd54cbd 100755 --- a/util/service.py +++ b/util/service.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c) 2019-2022 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ """ Compute service times for RPCs from a server-side trace. 
diff --git a/util/smi.cc b/util/smi.cc
index ea9f0f3f..4770f417 100644
--- a/util/smi.cc
+++ b/util/smi.cc
@@ -1,5 +1,5 @@
 /* Copyright (c) 2022 Homa Developers
- * SPDX-License-Identifier: BSD-1-Clause
+ * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
  */

 /* This program spawns a collection of threads on different cores to
diff --git a/util/strip.py b/util/strip.py
index c57a14cc..b47eacbc 100755
--- a/util/strip.py
+++ b/util/strip.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python3

-# SPDX-License-Identifier: BSD-2-Clause
+# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+

 """
 This script is used to copy information from the Homa GitHub repo to
diff --git a/util/strip_decl.py b/util/strip_decl.py
index e9361dfb..1e3b83d1 100755
--- a/util/strip_decl.py
+++ b/util/strip_decl.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python3

-# SPDX-License-Identifier: BSD-2-Clause
+# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+

 """
 This script is used to make a copy of homa_impl.h that seletively omits
diff --git a/util/test_time_trace.c b/util/test_time_trace.c
index 31ae3efe..33be02b9 100644
--- a/util/test_time_trace.c
+++ b/util/test_time_trace.c
@@ -1,5 +1,5 @@
 /* Copyright (c) 2019-2022 Homa Developers
- * SPDX-License-Identifier: BSD-1-Clause
+ * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
  */

 /* This program exercises the Linux kernel time trace mechanism
diff --git a/util/test_utils.cc b/util/test_utils.cc
index 3a7415a9..eec95ee0 100644
--- a/util/test_utils.cc
+++ b/util/test_utils.cc
@@ -1,5 +1,5 @@
 /* Copyright (c) 2019-2023 Homa Developers
- * SPDX-License-Identifier: BSD-1-Clause
+ * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
  */

 /* This file contains a collection of functions that are useful in
diff --git a/util/test_utils.h b/util/test_utils.h
index 53faa404..aeef2c1d 100644
--- a/util/test_utils.h
+++ b/util/test_utils.h
@@ -1,5 +1,5 @@
 /* Copyright (c) 2019-2022 Homa Developers
- * SPDX-License-Identifier: BSD-1-Clause
+ * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
  */

 #ifndef _TEST_UTILS_H
diff --git a/util/time_trace.cc b/util/time_trace.cc
index 35a1bb34..9d815cff 100644
--- a/util/time_trace.cc
+++ b/util/time_trace.cc
@@ -1,5 +1,5 @@
 /* Copyright (c) 2014-2022 Homa Developers
- * SPDX-License-Identifier: BSD-1-Clause
+ * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
  */

 #include
diff --git a/util/time_trace.h b/util/time_trace.h
index 5e3e989e..cb06236d 100644
--- a/util/time_trace.h
+++ b/util/time_trace.h
@@ -1,5 +1,5 @@
 /* Copyright (c) 2020-2022 Homa Developers
- * SPDX-License-Identifier: BSD-1-Clause
+ * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
  */

 #ifndef TIMETRACE_H
diff --git a/util/ttgrep.py b/util/ttgrep.py
index 10a63461..def0c833 100755
--- a/util/ttgrep.py
+++ b/util/ttgrep.py
@@ -1,7 +1,7 @@
 #!/usr/bin/python3

 # Copyright (c) 2019-2022 Homa Developers
-# SPDX-License-Identifier: BSD-1-Clause
+# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+

 """
 Scan the time trace data in a log file; find all records whose events
diff --git a/util/tthoma.py b/util/tthoma.py
index 528b7f3e..bbd897f7 100755
--- a/util/tthoma.py
+++ b/util/tthoma.py
@@ -1,7 +1,7 @@
 #!/usr/bin/python3

 # Copyright (c)2023 Homa Developers
-# SPDX-License-Identifier: BSD-1-Clause
+# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+

 """
 This script analyzes time traces gathered from Homa in a variety of ways.
diff --git a/util/ttmerge.py b/util/ttmerge.py index e2ffed21..8e233aff 100755 --- a/util/ttmerge.py +++ b/util/ttmerge.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c) 2019-2022 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ """ Merge two or more timetraces into a single trace. All of the traces diff --git a/util/ttoffset.py b/util/ttoffset.py index a9591835..ea2b51aa 100755 --- a/util/ttoffset.py +++ b/util/ttoffset.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c) 2019-2022 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ """ Rewrite a time trace with all of the times offset by a fixed amount diff --git a/util/ttprint.py b/util/ttprint.py index bdb3b384..47d7240b 100755 --- a/util/ttprint.py +++ b/util/ttprint.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c) 2019-2022 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ """ This program reads timetrace information from /proc/timetrace (or from diff --git a/util/ttrange.py b/util/ttrange.py index dfcd6eec..139f0ea5 100755 --- a/util/ttrange.py +++ b/util/ttrange.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c) 2019-2022 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ """ Extract entries from a timetrace that For any particular time range. diff --git a/util/ttsum.py b/util/ttsum.py index 10a0a9ed..3794d116 100755 --- a/util/ttsum.py +++ b/util/ttsum.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c) 2019-2022 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ """ This program reads one or more timetrace logs and generates summary diff --git a/util/ttsync.py b/util/ttsync.py index 6ad17f85..e4733fa1 100755 --- a/util/ttsync.py +++ b/util/ttsync.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c)2023 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ """ Scans two or more timetraces covering the same time interval, determines the diff --git a/util/ttsyslog.py b/util/ttsyslog.py index 16194ded..e6e1260f 100755 --- a/util/ttsyslog.py +++ b/util/ttsyslog.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 # Copyright (c) 2019-2022 Homa Developers -# SPDX-License-Identifier: BSD-1-Clause +# SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ """ This program reads timetrace information that was printk-ed to the diff --git a/util/use_memory.c b/util/use_memory.c index ee72dd84..d6c82c98 100644 --- a/util/use_memory.c +++ b/util/use_memory.c @@ -1,5 +1,5 @@ /* Copyright (c) 2019-2022 Homa Developers - * SPDX-License-Identifier: BSD-1-Clause + * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ /* This program allocates a given amount of memory and then sleeps From 2e5cd78b8708feefe08522a38cb1afd76089e538 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 30 Jul 2025 15:54:42 -0700 Subject: [PATCH 420/625] Fix various issues from checkpatch.pl, kernel-doc, xmastree, etc. 
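
As an illustration of the xmastree class of fix (a sketch that simply
restates the homa_devel.c hunk below; nothing here is new code): local
variable declarations are expected in "reverse Christmas tree" order,
longest line first:

    /* Locals ordered longest-to-shortest, as xmastree expects. */
    u64 creq_base, creq_bbase, cresp_base, cresp_bbase;
    u64 sreq_base, sreq_bbase, sresp_base, sresp_bbase;
    struct homa_rpc_snapshot *snap;
    u64 now = homa_clock();
    u64 usecs;
    int i;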
--- Makefile | 3 ++- homa_devel.c | 4 ++-- homa_grant.c | 3 +-- homa_impl.h | 3 +++ homa_metrics.c | 6 +++--- homa_plumbing.c | 2 +- homa_qdisc.c | 22 +++++++++++----------- homa_qdisc.h | 4 ++-- homa_rpc.c | 2 +- test/Makefile | 2 +- test/mock.c | 2 ++ 11 files changed, 29 insertions(+), 24 deletions(-) diff --git a/Makefile b/Makefile index 0e460059..8113bf41 100644 --- a/Makefile +++ b/Makefile @@ -72,7 +72,8 @@ CP_HDRS := homa_impl.h \ homa_stub.h \ homa_wire.h CP_SRCS := $(patsubst %.o,%.c,$(filter-out homa_devel.o homa_grant.o \ - homa_metrics.o homa_offload.o homa_skb.o timetrace.o, $(HOMA_OBJS))) + homa_metrics.o homa_offload.o homa_qdisc.o \ + homa_skb.o timetrace.o, $(HOMA_OBJS))) CP_EXTRAS := Kconfig \ Makefile CP_TARGETS := $(patsubst %,$(HOMA_TARGET)/%,$(CP_HDRS) $(CP_SRCS) $(CP_EXTRAS)) diff --git a/homa_devel.c b/homa_devel.c index c82022c6..2b7db06d 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -999,10 +999,10 @@ void homa_snapshot_rpcs(void) */ void homa_rpc_snapshot_log_tt(void) { - struct homa_rpc_snapshot *snap; - u64 now = homa_clock(); u64 creq_base, creq_bbase, cresp_base, cresp_bbase; u64 sreq_base, sreq_bbase, sresp_base, sresp_bbase; + struct homa_rpc_snapshot *snap; + u64 now = homa_clock(); u64 usecs; int i; diff --git a/homa_grant.c b/homa_grant.c index fedcc899..7c661bd9 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -629,8 +629,7 @@ int homa_grant_update_granted(struct homa_rpc *rpc, struct homa_grant *grant) incoming_delta = new_grant_offset - received - rpc->msgin.rec_incoming; avl_incoming = grant->max_incoming - atomic_read(&grant->total_incoming); if (avl_incoming < incoming_delta) { - tt_record4("insufficient headroom for grant for RPC id %d " - "(rank %d): desired incoming %d, shortfall %d", + tt_record4("insufficient headroom for grant for RPC id %d (rank %d): desired incoming %d, shortfall %d", rpc->id, rank, new_grant_offset - received, incoming_delta - avl_incoming); prev_stalled = atomic_read(&grant->stalled_rank); diff --git a/homa_impl.h b/homa_impl.h index d3054d06..0f914ae6 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -42,6 +42,9 @@ #include #include #include +#ifdef CONFIG_X86_TSC +#include +#endif #ifndef __UPSTREAM__ /* See strip.py */ #include "homa.h" diff --git a/homa_metrics.c b/homa_metrics.c index 74f6ed25..65b1dc1e 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -91,7 +91,7 @@ void homa_metric_append(const char *name, u64 value, const char *format, ...) } new_chars = snprintf(homa_mout.output + homa_mout.length, 60, - "%-30s %20llu ", name, value); + "%-30s %20llu ", name, value); homa_mout.length += (new_chars > 60) ? 
60 : new_chars; va_start(ap, format); new_chars = vsnprintf(homa_mout.output + homa_mout.length, 120, @@ -174,7 +174,7 @@ char *homa_metrics_print(void) "Server RPC requests fully received\n"); M("server_responses_started", m->server_responses_started, "Server RPCs for which response was initiated\n"); - M("server_response_bytes_started",\ + M("server_response_bytes_started", m->server_response_bytes_started, "Message bytes in all initiated server responses\n"); M("server_response_bytes_done", m->server_response_bytes_done, @@ -247,7 +247,7 @@ char *homa_metrics_print(void) M("send_cycles", m->send_cycles, "Time spent in homa_sendmsg for requests\n"); M("send_calls", m->send_calls, - "Total invocations of homa_sendmsg for equests\n"); + "Total invocations of homa_sendmsg for requests\n"); // It is possible for us to get here at a time when a // thread has been blocked for a long time and has // recorded blocked_cycles, but hasn't finished the diff --git a/homa_plumbing.c b/homa_plumbing.c index b0c27a7c..07913e2e 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -482,13 +482,13 @@ int __init homa_load(void) bool init_net_ops = false; bool init_proto6 = false; bool init_proto = false; - bool init_qdisc = false; bool init_homa = false; int status; IF_NO_STRIP(bool init_metrics = false); IF_NO_STRIP(bool init_offload = false); IF_NO_STRIP(bool init_sysctl = false); + IF_NO_STRIP(bool init_qdisc = false); /* Compile-time validations that no packet header is longer * than HOMA_MAX_HEADER. diff --git a/homa_qdisc.c b/homa_qdisc.c index a91ec27f..abe6ef53 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -165,7 +165,7 @@ int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt, /** * homa_qdisc_destroy() - This function is invoked to perform final cleanup * before a qdisc is deleted. - * @sch: Qdisc that is being deleted. + * @qdisc: Qdisc that is being deleted. */ void homa_qdisc_destroy(struct Qdisc *qdisc) { @@ -249,18 +249,18 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* This packet needs to be deferred until the NIC queue has * been drained a bit. */ - h = (struct homa_data_hdr *) skb_transport_header(skb); + h = (struct homa_data_hdr *)skb_transport_header(skb); tt_record4("homa_qdisc_enqueue deferring homa data packet for id %d, offset %d, bytes_left %d on qid %d", - be64_to_cpu(h->common.sender_id), - ntohl(h->seg.offset), - homa_get_skb_info(skb)->bytes_left, qdev->pacer_qix); + be64_to_cpu(h->common.sender_id), + ntohl(h->seg.offset), + homa_get_skb_info(skb)->bytes_left, qdev->pacer_qix); homa_qdisc_defer_homa(qdev, skb); wake_up(&qdev->pacer_sleep); return NET_XMIT_SUCCESS; enqueue: if (is_homa_pkt(skb)) { - h = (struct homa_data_hdr *) skb_transport_header(skb); + h = (struct homa_data_hdr *)skb_transport_header(skb); tt_record4("homa_qdisc_enqueue queuing homa data packet for id %d, offset %d, bytes_left %d on qid %d", be64_to_cpu(h->common.sender_id), ntohl(h->seg.offset), @@ -277,7 +277,7 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* homa_enqueue_special is going to lock a different qdisc, * so in order to avoid deadlocks we have to release the * lock for this qdisc. 
- * */ + */ spin_unlock(qdisc_lock(sch)); result = homa_qdisc_redirect_skb(skb, qdev, false); spin_lock(qdisc_lock(sch)); @@ -575,7 +575,7 @@ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev) homa_qdisc_update_link_idle(qdev, qdisc_skb_cb(skb)->pkt_len, -1); - h = (struct homa_data_hdr *) skb_transport_header(skb); + h = (struct homa_data_hdr *)skb_transport_header(skb); tt_record4("homa_qdisc_pacer queuing homa data packet for id %d, offset %d, bytes_left %d on qid %d", be64_to_cpu(h->common.sender_id), ntohl(h->seg.offset), @@ -600,7 +600,7 @@ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev) * Return: Standard enqueue return code (usually NET_XMIT_SUCCESS). */ int homa_qdisc_redirect_skb(struct sk_buff *skb, - struct homa_qdisc_dev *qdev, bool pacer) + struct homa_qdisc_dev *qdev, bool pacer) { struct netdev_queue *txq; struct Qdisc *qdisc; @@ -646,7 +646,6 @@ int homa_qdisc_redirect_skb(struct sk_buff *skb, /** * homa_qdisc_update_sysctl() - Recompute information in a homa_qdisc_dev * that depends on sysctl parameters. - * @homa: Used to fetch current sysctl parameter values. * @qdev: Update information here that depends on sysctl values. */ void homa_qdisc_update_sysctl(struct homa_qdisc_dev *qdev) @@ -688,7 +687,8 @@ void homa_qdisc_update_sysctl(struct homa_qdisc_dev *qdev) /** * homa_qdisc_update_all_sysctl() - Invoked whenever a sysctl value is changed; * updates all qdisc structures to reflect new values. - * @homa: Overall data about the Homa protocol implementation. + * @hnet: Homa's information about a network namespace: changes will apply + * to qdiscs in this namespace. */ void homa_qdisc_update_all_sysctl(struct homa_net *hnet) { diff --git a/homa_qdisc.h b/homa_qdisc.h index 56bd2c69..b79f1ea5 100644 --- a/homa_qdisc.h +++ b/homa_qdisc.h @@ -21,7 +21,7 @@ * the homa queuing discipline */ struct homa_qdisc { - /** @dev: Info shared among all qdiscs for a net_device. */ + /** @qdev: Info shared among all qdiscs for a net_device. */ struct homa_qdisc_dev *qdev; /** @@ -40,7 +40,7 @@ struct homa_qdisc_dev { struct net_device *dev; /** - * @homa_net: Homa's information about the network namespace + * @hnet: Homa's information about the network namespace * this object belongs to. */ struct homa_net *hnet; diff --git a/homa_rpc.c b/homa_rpc.c index ed71ee6d..2be4e996 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -616,7 +616,7 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) } } else { INC_METRIC(server_request_bytes_done, - rpc->msgin.bytes_remaining); + rpc->msgin.bytes_remaining); INC_METRIC(server_requests_done, rpc->msgin.bytes_remaining != 0); if (tx_left > 0) { diff --git a/test/Makefile b/test/Makefile index cafa34ae..2fe253b9 100644 --- a/test/Makefile +++ b/test/Makefile @@ -69,7 +69,6 @@ HOMA_SRCS := homa_devel.c \ homa_peer.c \ homa_pool.c \ homa_plumbing.c \ - homa_qdisc.c \ homa_rpc.c \ homa_sock.c \ homa_timer.c \ @@ -79,6 +78,7 @@ ifeq ($(__STRIP__),) HOMA_SRCS += homa_grant.c \ homa_metrics.c \ homa_offload.c \ + homa_qdisc.c \ homa_skb.c endif HOMA_OBJS := $(patsubst %.c,%.o,$(HOMA_SRCS)) rhashtable.o diff --git a/test/mock.c b/test/mock.c index dc82e27f..a2b25ecb 100644 --- a/test/mock.c +++ b/test/mock.c @@ -1885,6 +1885,7 @@ void mock_put_page(struct page *page) } } +#ifndef __STRIP__ /* See strip.py */ /** * mock_alloc_qdisc() - Allocate and initialize a new Qdisc suitable for * use in unit tests as a homa qdisc. 
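
A hypothetical caller of the now-guarded helper (sketch only: the txq
fixture and surrounding test are made up; mock_alloc_qdisc and
homa_qdisc_enqueue come from this patch series):

    #ifndef __STRIP__ /* See strip.py */
    struct Qdisc *qdisc = mock_alloc_qdisc(&self->txq);

    /* ... feed packets through homa_qdisc_enqueue() against qdisc ... */
    #endif /* See strip.py */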
@@ -1903,6 +1904,7 @@ struct Qdisc *mock_alloc_qdisc(struct netdev_queue *dev_queue)
 	spin_lock_init(&qdisc->q.lock);
 	return qdisc;
 }
+#endif /* See strip.py */

 /**
  * mock_rcu_read_lock() - Called instead of rcu_read_lock when Homa is compiled

From 2308d197117a6627f32c164f929122f643b025c5 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Thu, 31 Jul 2025 11:08:26 -0700
Subject: [PATCH 421/625] Remove obsolete flags field in homa_recvmsg_args in man page

---
 man/recvmsg.2 | 1 -
 1 file changed, 1 deletion(-)

diff --git a/man/recvmsg.2 b/man/recvmsg.2
index 7eb70a40..5d094483 100644
--- a/man/recvmsg.2
+++ b/man/recvmsg.2
@@ -61,7 +61,6 @@ struct homa_recvmsg_args {
                                     * desired RPC, which must be
                                     * private. */
    uint64_t completion_cookie;     /* Value from sendmsg for request. */
-   int flags;                      /* OR-ed combination of bits. */
    uint32_t num_bpages;            /* Number of valid entries in
                                     * bpage_offsets. */
    uint32_t bpage_offsets[HOMA_MAX_BPAGES]; /* Tokens for buffer pages. */

From a27550a759be8fbdf076a9d1b3bd60c9e5a62433 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 4 Aug 2025 11:36:12 -0700
Subject: [PATCH 422/625] Don't invoke homa_rpc_snapshot_log_tt anymore

---
 homa_plumbing.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/homa_plumbing.c b/homa_plumbing.c
index 07913e2e..f2985b83 100644
--- a/homa_plumbing.c
+++ b/homa_plumbing.c
@@ -1465,7 +1465,6 @@ int homa_softirq(struct sk_buff *skb)
 		if (unlikely(h->type == FREEZE)) {
 			if (!atomic_read(&tt_frozen)) {
 				homa_rpc_log_active_tt(homa_from_skb(skb), 0);
-				homa_rpc_snapshot_log_tt();
 				tt_record4("Freezing because of request on port %d from 0x%x:%d, id %d",
 					   ntohs(h->dport),
 					   tt_addr(skb_canonical_ipv6_saddr(skb)),
@@ -1732,7 +1731,6 @@ int homa_dointvec(const struct ctl_table *table, int write,
 			tt_freeze();
 		} else if (homa->sysctl_action == 7) {
 			homa_rpc_log_active_tt(homa, 0);
-			homa_rpc_snapshot_log_tt();
 			tt_record("Freezing cluster because of action 7");
 			homa_freeze_peers();
 			tt_record("Finished freezing cluster");

From f3395a27213fc3702acb57e70ca9f5d3154fea03 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 4 Aug 2025 11:37:25 -0700
Subject: [PATCH 423/625] Don't output stats in homa_timer when there is no activity

---
 homa_timer.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/homa_timer.c b/homa_timer.c
index 76fdb6e6..6b61af1f 100644
--- a/homa_timer.c
+++ b/homa_timer.c
@@ -150,11 +150,15 @@ void homa_timer(struct homa *homa)
 		total_grants += m->packets_sent[GRANT - DATA];
 	}

-	tt_record4("homa_timer found total_incoming %d, num_grantable_rpcs %d, num_active_rpcs %d, new grants %d",
-		   atomic_read(&homa->grant->total_incoming),
-		   homa->grant->num_grantable_rpcs,
-		   homa->grant->num_active_rpcs,
-		   total_grants - prev_grant_count);
+	if (atomic_read(&homa->grant->total_incoming) != 0 ||
+	    homa->grant->num_grantable_rpcs != 0 ||
+	    homa->grant->num_active_rpcs != 0 ||
+	    total_grants - prev_grant_count != 0)
+		tt_record4("homa_timer found total_incoming %d, num_grantable_rpcs %d, num_active_rpcs %d, new grants %d",
+			   atomic_read(&homa->grant->total_incoming),
+			   homa->grant->num_grantable_rpcs,
+			   homa->grant->num_active_rpcs,
+			   total_grants - prev_grant_count);
 	if (total_grants == prev_grant_count &&
 	    homa->grant->num_grantable_rpcs > 20) {
 		zero_count++;
@@ -232,9 +236,10 @@ void homa_timer(struct homa *homa)
 	}
 	homa_socktab_end_scan(&scan);
 #ifndef __STRIP__ /* See strip.py */
-	tt_record4("homa_timer found %d incoming RPCs, incoming sum %d, rec_sum %d, homa->total_incoming %d",
-		   total_incoming_rpcs, sum_incoming, sum_incoming_rec,
-		   atomic_read(&homa->grant->total_incoming));
+	if (total_incoming_rpcs > 0)
+		tt_record4("homa_timer found %d incoming RPCs, incoming sum %d, rec_sum %d, homa->total_incoming %d",
+			   total_incoming_rpcs, sum_incoming, sum_incoming_rec,
+			   atomic_read(&homa->grant->total_incoming));
 #endif /* See strip.py */
 	homa_skb_release_pages(homa);
 	homa_peer_gc(homa->peertab);

From 76b44c66e74b2a562c6b593e38202f33530b5995 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 4 Aug 2025 11:59:45 -0700
Subject: [PATCH 424/625] Fix race in homa_wait_shared

* An RPC could get lost if its interest became ready at the same time
  that homa_interest_wait returned with an error.

* Also, removed nonblocking parameter from homa_interest_wait: it makes
  more sense to handle this in homa_wait_shared and homa_wait_private.

* Also cleaned up metrics (wait_block, etc.) so they are only
  incremented when an RPC is returned.
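
The essence of the fix, distilled from the homa_incoming.c hunk below
(a sketch, not the complete function): the socket lock must cover both
the unlink and the recheck of the ready flag, so that a concurrent
homa_rpc_handoff cannot slip in between them:

    homa_sock_lock(hsk);
    homa_interest_unlink_shared(&interest);
    ready = atomic_read(&interest.ready);
    homa_sock_unlock(hsk);
    if (ready == 0) {
        /* No handoff raced with the error; report it. */
        rpc = ERR_PTR(result);
        goto done;
    }
    /* A handoff arrived anyway: ignore the error and return the RPC. */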
---
 homa_incoming.c           | 67 ++++++++++++++++++++++++++++-----------
 homa_interest.c           | 13 ++------
 homa_interest.h           |  9 ++----
 homa_metrics.h            | 10 +++---
 test/unit_homa_incoming.c | 58 ++++++++++++++++++++++++---------
 test/unit_homa_interest.c | 31 ++++++++----------
 6 files changed, 115 insertions(+), 73 deletions(-)

diff --git a/homa_incoming.c b/homa_incoming.c
index f14d9a45..6e137608 100644
--- a/homa_incoming.c
+++ b/homa_incoming.c
@@ -1094,6 +1094,7 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking)
 			rpc->error = homa_copy_to_user(rpc);
 		if (rpc->error) {
 			result = rpc->error;
+			IF_NO_STRIP(avail_immediately = 0);
 			break;
 		}
 		if (rpc->msgin.length >= 0 &&
@@ -1104,12 +1105,18 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking)
 			break;
 		}

+		if (nonblocking) {
+			result = -EAGAIN;
+			IF_NO_STRIP(avail_immediately = 0);
+			break;
+		}
+
 		result = homa_interest_init_private(&interest, rpc);
 		if (result != 0)
 			break;

 		homa_rpc_unlock(rpc);
-		result = homa_interest_wait(&interest, nonblocking);
+		result = homa_interest_wait(&interest);
 #ifndef __STRIP__ /* See strip.py */
 		avail_immediately = 0;
 		blocked |= interest.blocked;
@@ -1122,8 +1129,9 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking)
 		tt_record3("homa_wait_private found rpc id %d, pid %d via handoff, blocked %d",
 			   rpc->id, current->pid, interest.blocked);

-		/* If homa_interest_wait returned an error but the interest
-		 * actually got ready, then ignore the error.
+		/* Abort on error, but if the interest actually got ready
+		 * in the meantime then ignore the error (loop back around
+		 * to process the RPC).
 		 */
 		if (result != 0 && atomic_read(&interest.ready) == 0)
 			break;
@@ -1132,10 +1140,12 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking)
 #ifndef __STRIP__ /* See strip.py */
 	if (avail_immediately)
 		INC_METRIC(wait_none, 1);
-	else if (blocked)
-		INC_METRIC(wait_block, 1);
-	else
-		INC_METRIC(wait_fast, 1);
+	else if (result == 0) {
+		if (blocked)
+			INC_METRIC(wait_block, 1);
+		else
+			INC_METRIC(wait_fast, 1);
+	}
 #endif /* See strip.py */
 	return result;
 }
@@ -1190,23 +1200,40 @@ struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking)
 				hsk->sock.sk_data_ready(&hsk->sock);
 		}
 		homa_sock_unlock(hsk);
+	} else if (nonblocking) {
+		rpc = ERR_PTR(-EAGAIN);
+		homa_sock_unlock(hsk);
+		IF_NO_STRIP(avail_immediately = 0);
+
+		/* This is a good time to clean up dead RPCs.
*/ + homa_rpc_reap(hsk, false); + goto done; } else { homa_interest_init_shared(&interest, hsk); homa_sock_unlock(hsk); - result = homa_interest_wait(&interest, nonblocking); + result = homa_interest_wait(&interest); #ifndef __STRIP__ /* See strip.py */ avail_immediately = 0; blocked |= interest.blocked; #endif /* See strip.py */ - homa_interest_unlink_shared(&interest); if (result != 0) { - /* If homa_interest_wait returned an error - * (e.g. -EAGAIN) but in the meantime the - * interest received a handoff, ignore the - * error. + int ready; + + /* homa_interest_wait returned an error, so we + * have to do two things. First, unlink the + * interest from the socket. Second, check to + * see if in the meantime the interest received + * a handoff. If so, ignore the error. Very + * important to hold the socket lock while + * checking, in order to eliminate races with + * homa_rpc_handoff. */ - if (atomic_read(&interest.ready) == 0) { + homa_sock_lock(hsk); + homa_interest_unlink_shared(&interest); + ready = atomic_read(&interest.ready); + homa_sock_unlock(hsk); + if (ready == 0) { rpc = ERR_PTR(result); goto done; } @@ -1238,12 +1265,14 @@ struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking) done: #ifndef __STRIP__ /* See strip.py */ - if (avail_immediately) + if (avail_immediately) { INC_METRIC(wait_none, 1); - else if (blocked) - INC_METRIC(wait_block, 1); - else - INC_METRIC(wait_fast, 1); + } else if (!IS_ERR(rpc)) { + if (blocked) + INC_METRIC(wait_block, 1); + else + INC_METRIC(wait_fast, 1); + } #endif /* See strip.py */ return rpc; } diff --git a/homa_interest.c b/homa_interest.c index 6d36d6da..40f96c0f 100644 --- a/homa_interest.c +++ b/homa_interest.c @@ -64,13 +64,11 @@ int homa_interest_init_private(struct homa_interest *interest, * and linked to a socket or RPC. On return, the interest * will have been unlinked if its ready flag is set; otherwise * it may still be linked. - * @nonblocking: Nonzero means return without blocking if the interest - * doesn't become ready immediately. * - * Return: 0 for success (there is an actionable RPC in the interest), or - * a negative errno. + * Return: 0 for success (the ready flag is set in the interest), or -EINTR + * if the thread received an interrupt. */ -int homa_interest_wait(struct homa_interest *interest, int nonblocking) +int homa_interest_wait(struct homa_interest *interest) { struct homa_sock *hsk = interest->hsk; int result = 0; @@ -98,11 +96,6 @@ int homa_interest_wait(struct homa_interest *interest, int nonblocking) if (homa_rpc_reap(hsk, false) != 0) continue; - if (nonblocking) { - result = -EAGAIN; - goto done; - } - #ifndef __STRIP__ /* See strip.py */ now = homa_clock(); per_cpu(homa_offload_core, diff --git a/homa_interest.h b/homa_interest.h index a50e54dd..6a1e3c27 100644 --- a/homa_interest.h +++ b/homa_interest.h @@ -71,12 +71,9 @@ struct homa_interest { * homa_interest_init_shared. 
*/ static inline void homa_interest_unlink_shared(struct homa_interest *interest) + __must_hold(hsk->lock) { - if (!list_empty(&interest->links)) { - homa_sock_lock(interest->hsk); - list_del_init(&interest->links); - homa_sock_unlock(interest->hsk); - } + list_del_init(&interest->links); } /** @@ -99,7 +96,7 @@ void homa_interest_init_shared(struct homa_interest *interest, int homa_interest_init_private(struct homa_interest *interest, struct homa_rpc *rpc); void homa_interest_notify_private(struct homa_rpc *rpc); -int homa_interest_wait(struct homa_interest *interest, int nonblocking); +int homa_interest_wait(struct homa_interest *interest); #ifndef __STRIP__ /* See strip.py */ struct homa_interest diff --git a/homa_metrics.h b/homa_metrics.h index cefa3400..31a8850a 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -222,15 +222,15 @@ struct homa_metrics { u64 wait_none; /** - * @wait_fast: total number of times that a message arrived for - * a receiving thread while it was polling (i.e. the message - * wasn't immediately available, but the thread never blocked). + * @wait_fast: total number of times that a thread received an + * incoming message while polling (i.e. the message wasn't + * immediately available, but the thread never blocked). */ u64 wait_fast; /** - * @wait_block: total number of times that a thread blocked at - * least once while waiting for an incoming message. + * @wait_block: total number of times that a thread received an + * incoming message after blocking at least once. */ u64 wait_block; diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 5f198f59..99e5e1f1 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -2363,7 +2363,7 @@ TEST_F(homa_incoming, homa_wait_private__rpc_not_private) ASSERT_NE(NULL, crpc); EXPECT_EQ(EINVAL, -homa_wait_private(crpc, 0)); } -TEST_F(homa_incoming, homa_wait_private__available_immediately) +TEST_F(homa_incoming, homa_wait_private__rpc_has_error) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, @@ -2372,13 +2372,16 @@ TEST_F(homa_incoming, homa_wait_private__available_immediately) ASSERT_NE(NULL, crpc); ASSERT_EQ(RPC_PKTS_READY, atomic_read(&crpc->flags)); atomic_or(RPC_PRIVATE, &crpc->flags); + crpc->error = -ENOENT; homa_rpc_lock(crpc); - EXPECT_EQ(0, homa_wait_private(crpc, 0)); + EXPECT_EQ(ENOENT, -homa_wait_private(crpc, 0)); homa_rpc_unlock(crpc); - ASSERT_EQ(RPC_PRIVATE, atomic_read(&crpc->flags)); - IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_none)); + EXPECT_EQ(RPC_PKTS_READY, atomic_read(&crpc->flags) & RPC_PKTS_READY); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_none)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_block)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_fast)); } -TEST_F(homa_incoming, homa_wait_private__rpc_has_error) +TEST_F(homa_incoming, homa_wait_private__copy_to_user_fails) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, @@ -2387,13 +2390,12 @@ TEST_F(homa_incoming, homa_wait_private__rpc_has_error) ASSERT_NE(NULL, crpc); ASSERT_EQ(RPC_PKTS_READY, atomic_read(&crpc->flags)); atomic_or(RPC_PRIVATE, &crpc->flags); - crpc->error = -ENOENT; + mock_copy_data_errors = 1; homa_rpc_lock(crpc); - EXPECT_EQ(ENOENT, -homa_wait_private(crpc, 0)); + EXPECT_EQ(EFAULT, -homa_wait_private(crpc, 0)); homa_rpc_unlock(crpc); - EXPECT_EQ(RPC_PKTS_READY, atomic_read(&crpc->flags) & RPC_PKTS_READY); } -TEST_F(homa_incoming, 
homa_wait_private__copy_to_user_fails) +TEST_F(homa_incoming, homa_wait_private__available_immediately) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, @@ -2402,10 +2404,11 @@ TEST_F(homa_incoming, homa_wait_private__copy_to_user_fails) ASSERT_NE(NULL, crpc); ASSERT_EQ(RPC_PKTS_READY, atomic_read(&crpc->flags)); atomic_or(RPC_PRIVATE, &crpc->flags); - mock_copy_data_errors = 1; homa_rpc_lock(crpc); - EXPECT_EQ(EFAULT, -homa_wait_private(crpc, 0)); + EXPECT_EQ(0, homa_wait_private(crpc, 0)); homa_rpc_unlock(crpc); + ASSERT_EQ(RPC_PRIVATE, atomic_read(&crpc->flags)); + IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_none)); } TEST_F(homa_incoming, homa_wait_private__nonblocking) { @@ -2419,7 +2422,9 @@ TEST_F(homa_incoming, homa_wait_private__nonblocking) homa_rpc_lock(crpc); EXPECT_EQ(EAGAIN, -homa_wait_private(crpc, 1)); homa_rpc_unlock(crpc); - IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_fast)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_none)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_block)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_fast)); } TEST_F(homa_incoming, homa_wait_private__signal_notify_race) { @@ -2438,7 +2443,9 @@ TEST_F(homa_incoming, homa_wait_private__signal_notify_race) homa_rpc_lock(crpc); EXPECT_EQ(ENOENT, -homa_wait_private(crpc, 0)); homa_rpc_unlock(crpc); - IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_block)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_none)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_block)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_fast)); EXPECT_EQ(0, mock_prepare_to_wait_errors); } @@ -2499,7 +2506,26 @@ TEST_F(homa_incoming, homa_wait_shared__nonblocking) rpc = homa_wait_shared(&self->hsk, 1); EXPECT_TRUE(IS_ERR(rpc)); EXPECT_EQ(EAGAIN, -PTR_ERR(rpc)); - IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_fast)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_none)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_block)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_fast)); +} +TEST_F(homa_incoming, homa_wait_shared__reap_when_nonblocking) +{ + struct homa_rpc *crpc; + struct homa_rpc *rpc; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 14 * 1400 + 1, 1600); + ASSERT_NE(NULL, crpc); + homa_rpc_end(crpc); + EXPECT_EQ(15, self->hsk.dead_skbs); + + rpc = homa_wait_shared(&self->hsk, 1); + EXPECT_TRUE(IS_ERR(rpc)); + EXPECT_EQ(EAGAIN, -PTR_ERR(rpc)); + EXPECT_EQ(5, self->hsk.dead_skbs); } TEST_F(homa_incoming, homa_wait_shared__signal_race_with_handoff) { @@ -2535,7 +2561,9 @@ TEST_F(homa_incoming, homa_wait_shared__socket_shutdown_while_blocked) EXPECT_EQ(ESHUTDOWN, -PTR_ERR(rpc)); EXPECT_EQ(1, self->hsk.shutdown); self->hsk.shutdown = 0; - IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_block)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_none)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_block)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_fast)); } TEST_F(homa_incoming, homa_wait_shared__copy_to_user_fails) { diff --git a/test/unit_homa_interest.c b/test/unit_homa_interest.c index 761dcc46..af6a8f8e 100644 --- a/test/unit_homa_interest.c +++ b/test/unit_homa_interest.c @@ -144,7 +144,7 @@ TEST_F(homa_interest, homa_interest_wait__already_ready) homa_interest_init_shared(&interest, &self->hsk); atomic_set(&interest.ready, 1); - 
EXPECT_EQ(0, homa_interest_wait(&interest, 0)); + EXPECT_EQ(0, homa_interest_wait(&interest)); EXPECT_EQ(0, interest.blocked); homa_interest_unlink_shared(&interest); @@ -163,7 +163,7 @@ TEST_F(homa_interest, homa_interest_wait__call_schedule) hook_count = 2; unit_log_clear(); - EXPECT_EQ(0, homa_interest_wait(&interest, 0)); + EXPECT_EQ(0, homa_interest_wait(&interest)); EXPECT_STREQ("schedule; schedule", unit_log_get()); homa_interest_unlink_shared(&interest); } @@ -179,22 +179,17 @@ TEST_F(homa_interest, homa_interest_wait__call_homa_rpc_reap) ASSERT_NE(NULL, crpc); homa_rpc_end(crpc); EXPECT_EQ(15, self->hsk.dead_skbs); - homa_interest_init_shared(&interest, &self->hsk); - IF_NO_STRIP(self->homa.poll_cycles = 0); - - EXPECT_EQ(EAGAIN, -homa_interest_wait(&interest, 1)); - EXPECT_EQ(0, self->hsk.dead_skbs); - homa_interest_unlink_shared(&interest); -} -TEST_F(homa_interest, homa_interest_wait__nonblocking) -{ - struct homa_interest interest; homa_interest_init_shared(&interest, &self->hsk); - IF_NO_STRIP(self->homa.poll_cycles = 100000); - EXPECT_EQ(EAGAIN, -homa_interest_wait(&interest, 1)); - EXPECT_EQ(0, interest.blocked); + IF_NO_STRIP(self->homa.poll_cycles = 0); + unit_hook_register(notify_hook); + hook_interest = &interest; + hook_count = 1; + unit_log_clear(); + + EXPECT_EQ(0, homa_interest_wait(&interest)); + EXPECT_EQ(5, self->hsk.dead_skbs); homa_interest_unlink_shared(&interest); } TEST_F(homa_interest, homa_interest_wait__poll_then_block) @@ -209,7 +204,7 @@ TEST_F(homa_interest, homa_interest_wait__poll_then_block) hook_interest = &interest; hook_count = 4; - EXPECT_EQ(0, -homa_interest_wait(&interest, 0)); + EXPECT_EQ(0, -homa_interest_wait(&interest)); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(3000, homa_metrics_per_cpu()->poll_cycles); EXPECT_EQ(0, homa_metrics_per_cpu()->blocked_cycles); @@ -225,7 +220,7 @@ TEST_F(homa_interest, homa_interest_wait__interrupted_by_signal) mock_prepare_to_wait_errors = 1; IF_NO_STRIP(self->homa.poll_cycles = 0); - EXPECT_EQ(EINTR, -homa_interest_wait(&interest, 0)); + EXPECT_EQ(EINTR, -homa_interest_wait(&interest)); EXPECT_EQ(1, interest.blocked); homa_interest_unlink_shared(&interest); } @@ -241,7 +236,7 @@ TEST_F(homa_interest, homa_interest_wait__time_metrics) hook_interest = &interest; hook_count = 4; - EXPECT_EQ(0, -homa_interest_wait(&interest, 0)); + EXPECT_EQ(0, -homa_interest_wait(&interest)); IF_NO_STRIP(EXPECT_EQ(700, homa_metrics_per_cpu()->poll_cycles)); IF_NO_STRIP(EXPECT_EQ(1500, homa_metrics_per_cpu()->blocked_cycles)); homa_interest_unlink_shared(&interest); From 76b44c66e74b2a562c6b593e38202f33530b5995 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 4 Aug 2025 13:17:18 -0700 Subject: [PATCH 425/625] Change API for homa_wait_private An error value is only returned if the error prevented the RPC from becoming ready for attention. If the RPC is ready for attention but has failed because of an error, 0 is returned. --- homa_incoming.c | 12 ++++++++---- test/unit_homa_incoming.c | 11 ++++++----- test/unit_homa_plumbing.c | 18 ++++++++++++++++++ 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index 6e137608..9a0dd577 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -1068,8 +1068,12 @@ void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, * @rpc: RPC to wait for; an error will be returned if the RPC is * not a client RPC or not private. Must be locked by caller. * @nonblocking: Nonzero means return immediately if @rpc not ready. 
- * Return: 0 if the response has been successfully received, otherwise - * a negative errno. + * Return: 0 means that @rpc is ready for attention: either its response + * has been received or it has an unrecoverable error such as + * ETIMEDOUT (in rpc->error). Nonzero means some other error + * (such as EINTR or EINVAL) occurred before @rpc became ready + * for attention; in this case the return value is a negative + * errno. */ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) __must_hold(rpc->bucket->lock) @@ -1079,7 +1083,7 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) int avail_immediately = 1; int blocked = 0; #endif /* See strip.py */ - int result = 0; + int result; if (!(atomic_read(&rpc->flags) & RPC_PRIVATE)) return -EINVAL; @@ -1090,10 +1094,10 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) * RPC is ready for the application. */ while (1) { + result = 0; if (!rpc->error) rpc->error = homa_copy_to_user(rpc); if (rpc->error) { - result = rpc->error; IF_NO_STRIP(avail_immediately = 0); break; } diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 99e5e1f1..b0908751 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -2374,12 +2374,12 @@ TEST_F(homa_incoming, homa_wait_private__rpc_has_error) atomic_or(RPC_PRIVATE, &crpc->flags); crpc->error = -ENOENT; homa_rpc_lock(crpc); - EXPECT_EQ(ENOENT, -homa_wait_private(crpc, 0)); + EXPECT_EQ(0, -homa_wait_private(crpc, 0)); homa_rpc_unlock(crpc); EXPECT_EQ(RPC_PKTS_READY, atomic_read(&crpc->flags) & RPC_PKTS_READY); IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_none)); IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_block)); - IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_fast)); + IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_fast)); } TEST_F(homa_incoming, homa_wait_private__copy_to_user_fails) { @@ -2392,7 +2392,8 @@ TEST_F(homa_incoming, homa_wait_private__copy_to_user_fails) atomic_or(RPC_PRIVATE, &crpc->flags); mock_copy_data_errors = 1; homa_rpc_lock(crpc); - EXPECT_EQ(EFAULT, -homa_wait_private(crpc, 0)); + EXPECT_EQ(0, -homa_wait_private(crpc, 0)); + EXPECT_EQ(-EFAULT, crpc->error); homa_rpc_unlock(crpc); } TEST_F(homa_incoming, homa_wait_private__available_immediately) @@ -2441,10 +2442,10 @@ TEST_F(homa_incoming, homa_wait_private__signal_notify_race) mock_prepare_to_wait_errors = 1; homa_rpc_lock(crpc); - EXPECT_EQ(ENOENT, -homa_wait_private(crpc, 0)); + EXPECT_EQ(0, -homa_wait_private(crpc, 0)); homa_rpc_unlock(crpc); IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_none)); - IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_block)); + IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_block)); IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_fast)); EXPECT_EQ(0, mock_prepare_to_wait_errors); } diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 83c43613..64b2ae76 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -773,6 +773,24 @@ TEST_F(homa_plumbing, homa_recvmsg__error_from_homa_wait_private) EXPECT_EQ(EAGAIN, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, 0, MSG_DONTWAIT, &self->recvmsg_hdr.msg_namelen)); EXPECT_EQ(0, self->recvmsg_args.id); + EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); +} +TEST_F(homa_plumbing, homa_recvmsg__private_rpc_has_error) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, + self->client_ip, self->server_ip, self->server_port, + self->client_id, 100, 2000); + + EXPECT_NE(NULL, 
crpc); + atomic_or(RPC_PRIVATE, &crpc->flags); + crpc->error = -ETIMEDOUT; + + self->recvmsg_args.id = crpc->id; + + EXPECT_EQ(ETIMEDOUT, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, + 0, MSG_DONTWAIT, &self->recvmsg_hdr.msg_namelen)); + EXPECT_EQ(self->client_id, self->recvmsg_args.id); + EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } TEST_F(homa_plumbing, homa_recvmsg__error_from_homa_wait_shared) { From 4ba0a66114d0862d8941dc3e42442fadeff214ec Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 4 Aug 2025 15:36:50 -0700 Subject: [PATCH 426/625] Improve delay analyzer in tthoma.py * Only count a long packet's Xmit delay if it is the TSO packet, not a derived packet. * For Xmit delay, use the delay from homa_qdisc requeue to nic, if available; otherwise use ip_queue_xmit to nic. --- util/tthoma.py | 145 ++++++++++++++++++++++++++++--------------------- 1 file changed, 83 insertions(+), 62 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index bbd897f7..52f585bd 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -149,6 +149,9 @@ def __missing__(self, id): # value is a dictionary containing the following fields (some may not # be present, depending on which events were present in the traces): # xmit: Time when ip*xmit was invoked +# qdisc_xmit: Time when homa_qdisc requeued a packet that was deferred +# because of NIC queue length (only present for deferred +# packets) # nic: Time when packet was handed off to the NIC (if available) # gro: Time when GRO received the packet # softirq: Time when homa_softirq processed the packet @@ -1588,6 +1591,19 @@ def __pacer_xmit(self, trace, time, core, match, interests): '([0-9]+), offset ([0-9]+), bytes_left ([0-9]+)' }) + def __qdisc_xmit(self, trace, time, core, match, interests): + id = int(match.group(1)) + offset = int(match.group(2)) + bytes_left = int(match.group(3)) + for interest in interests: + interest.tt_qdisc_xmit(trace, time, core, id, offset, bytes_left) + + patterns.append({ + 'name': 'qdisc_xmit', + 'regexp': 'homa_qdisc_pacer queuing homa data packet for id ([0-9]+), ' + 'offset ([0-9]+), bytes_left ([0-9]+)' + }) + def __tcp_xmit(self, trace, time, core, match, interests): length = int(match.group(1)) for interest in interests: @@ -2594,10 +2610,15 @@ def print_pkt_delays(self): if delay > 0: short_total.append([delay, p, pkt['softirq']]) else: - if ('xmit' in pkt) and ('nic' in pkt): - delay = pkt['nic'] - pkt['xmit'] - if delay > 0: - long_to_nic.append([delay, p, pkt['nic']]) + if 'tso_length' in pkt: + if 'nic' in pkt: + delay = -1 + if 'qdisc_xmit' in pkt: + delay = pkt['nic'] - pkt['qdisc_xmit'] + elif 'xmit' in pkt: + delay = pkt['nic'] - pkt['xmit'] + if delay > 0: + long_to_nic.append([delay, p, pkt['nic']]) if ('nic' in pkt) and ('gro' in pkt): delay = pkt['gro'] - pkt['nic'] if delay > 0: @@ -2780,10 +2801,14 @@ def print_worst(data, label): [pkt['softirq'] - pkt['gro'], p, pkt['softirq']]) else: if (total < min_long) or (total > max_long): - continue; - if ('xmit' in pkt) and ('nic' in pkt): - long_to_nic.append( - [pkt['nic'] - pkt['xmit'], p, pkt['nic']]) + continue + if 'tso_length' in pkt: + if ('qdisc_xmit' in pkt) and ('nic' in pkt): + long_to_nic.append( + [pkt['nic'] - pkt['qdisc_xmit'], p, pkt['nic']]) + elif ('xmit' in pkt) and ('nic' in pkt): + long_to_nic.append( + [pkt['nic'] - pkt['xmit'], p, pkt['nic']]) if ('nic' in pkt) and ('gro' in pkt): long_to_gro.append( [pkt['gro'] - pkt['nic'], p, pkt['gro']]) @@ -6082,6 +6107,10 @@ def tt_pacer_xmit(self, trace, t, core, id, 
offset, port, bytes_left): global packets packets[pkt_id(id, offset)]['pacer'] = True + def tt_qdisc_xmit(self, trace, t, core, id, offset, bytes_left): + global packets + packets[pkt_id(id, offset)]['qdisc_xmit'] = t + def tt_retransmit(self, trace, t, core, id, offset, length): global packets p = packets[pkt_id(id, offset)] @@ -7603,65 +7632,57 @@ def output(self): global traces, options, packets, rpcs print('\n-------------------') print('Analyzer: temp') - print('-------------------') + print('-------------------\n') - print('Peer nodes: %s\n' % (peer_nodes)) + mtu = get_mtu() delays = [] - pkts = [] - node3pkts = 0 - long = 50 - node = options.node + ip_delays = [] + slow_pkts = [] + qdisc_pkts = [] for pkt in packets.values(): - if pkt['id'] == 500018274: - print(pkt) - if not 'nic' in pkt: - continue - if not ('tx_node' in pkt) or (pkt['tx_node'] != 'node4'): - continue - if not ('rx_node' in pkt) or (pkt['rx_node'] != 'node1'): - continue - pkts.append(pkt) - if not pkts: - print('No data packets made it from node1 to node4 in the traces') - return - pkts.sort(key=lambda d : d['nic']) - print('RpcId Offset NIC GRO Delay') - for pkt in pkts: - if 'gro' in pkt: - delay = pkt['gro'] - pkt['nic'] - delays.append(delay) - print('%9d %6d %9.3f %9.3f %7.1f' % (pkt['id'], pkt['offset'], - pkt['nic'], pkt['gro'], pkt['gro'] - pkt['nic'])) - else: - print('%9d %6d %9.3f N/A' % (pkt['id'], pkt['offset'], - pkt['nic'])) - delays.sort() - print('\nDelays: average %.1f us, P50 %.1f us, P90 %.1f us, P99 %.1f us' % - (sum(delays)/len(delays), delays[50*len(delays)//100], - delays[90*len(delays)//100], delays[99*len(delays)//100])) - - def output_long_qdisc(self): - global traces, options, packets - print('\n-------------------') - print('Analyzer: temp') - print('-------------------') + if 'nic' in pkt and 'qdisc_xmit' in pkt: + qdisc_pkts.append(pkt) + delays.append(pkt['nic'] - pkt['qdisc_xmit']) + elif ('nic' in pkt and 'xmit' in pkt and 'tso_length' in pkt + and pkt['msg_length'] != None and pkt['msg_length'] > mtu): + delay = pkt['nic'] - pkt['xmit'] + ip_delays.append(delay) + if delay > 50: + slow_pkts.append(pkt) - pkts = [] - for pkt_id in self.qdisc_ids: - if not pkt_id in packets: - continue - pkt = packets[pkt_id] - if (not 'xmit' in pkt) or (not 'gro' in pkt): - continue - if not 'nic' in pkt: - print('Queued packet has no mlx send record: %s' % (pkt)) - continue - pkts.append([pkt['nic'] - pkt['xmit'], pkt]) + if not delays: + print('Couldn\'t find any packets that were deferred by homa_qdisc'); + else: + delays.sort() + print('%d delays from qdisc_xmit to nic:' % (len(delays))) + print('Average: %6.1f' % (sum(delays) / len(delays))) + print('Min: %6.1f' % (delays[0])) + print('P50: %6.1f' % (delays[(50 * len(delays) // 100)])) + print('P90: %6.1f' % (delays[(90 * len(delays) // 100)])) + print('P99: %6.1f' % (delays[(99 * len(delays) // 100)])) + print('Max: %6.1f' % (delays[-1])) + + print(''); + if not ip_delays: + print('Couldn\'t find any ip_queue_xmit packets'); + else: + ip_delays.sort() + print('%d delays from ip_queue_xmit to nic:' % (len(ip_delays))) + print('Average: %6.1f' % (sum(ip_delays) / len(ip_delays))) + print('Min: %6.1f' % (ip_delays[0])) + print('P50: %6.1f' % (ip_delays[(50 * len(ip_delays) // 100)])) + print('P90: %6.1f' % (ip_delays[(90 * len(ip_delays) // 100)])) + print('P99: %6.1f' % (ip_delays[(99 * len(ip_delays) // 100)])) + print('Max: %6.1f' % (ip_delays[-1])) + + slow_pkts.sort(key=lambda d : d['nic'] - d['xmit'], reverse=True) + print('Packets that 
took a long time from ip_queue_xmit to nic:')
+ print(' Id Offset Node Core Xmit Nic Delay')
+ for pkt in slow_pkts:
+ print('%10d %10d %8s %3d %9.3f %9.3f %6.1f' % (
+ pkt['id'], pkt['offset'], pkt['tx_node'], pkt['tx_core'],
+ pkt['xmit'], pkt['nic'], pkt['nic'] - pkt['xmit']))

 def output_snapshot(self):
 global packets, rpcs

From c6706cfb1bb71191f27cc1ad507aee1a7c6ccc62 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 4 Aug 2025 17:02:53 -0700
Subject: [PATCH 427/625] Update txqueues analyzer in tthoma.py to understand
 homa_qdisc

* Use qdisc_xmit times instead of xmit, if available.
* Change analyzer name from txqueues to nicqueues.
---
 util/tthoma.py | 288 ++++++++++++++++++++++++++-----------------------
 1 file changed, 151 insertions(+), 137 deletions(-)

diff --git a/util/tthoma.py b/util/tthoma.py
index 52f585bd..97ca8b17 100755
--- a/util/tthoma.py
+++ b/util/tthoma.py
@@ -5596,6 +5596,154 @@ def output(self):
 core_data['max_backlog'] * 1e-3,
 core_data['max_backlog_time']))

+#------------------------------------------------
+# Analyzer: nicqueues
+#------------------------------------------------
+class AnalyzeNicqueues:
+ """
+ Prints estimates of the amount of outbound packet data queued in the
+ NIC of each node, assuming that the NIC transmits at full link speed.
+ The --gbps option specifies the rate at which packets are transmitted.
+ With --data option, generates detailed timelines of NIC queue lengths.
+ """
+
+ def __init__(self, dispatcher):
+ dispatcher.interest('AnalyzePackets')
+
+ # Node name -> list of <time, length, queue_length, type> tuples for
+ # all transmitted packets. Length is the packet length including
+ # Homa/TCP header but not IP or Ethernet overheads. Queue_length is
+ # the # bytes in the NIC queue as of time (includes this packet).
+ # Queue_length starts off zero and is updated later. Type indicates
+ # the kind of packet: "homa_data", "homa_grant", or "tcp"
+ self.nodes = defaultdict(list)
+
+ def tt_send_grant(self, trace, t, core, id, offset, priority, increment):
+ self.nodes[trace['node']].append([t, 34, 0, "homa_grant"])
+
+ def tt_tcp_xmit(self, trace, t, core, length):
+ self.nodes[trace['node']].append([t, length, 0, "tcp"])
+
+ def output(self):
+ global options, traces, packets
+
+ for pkt in packets.values():
+ if not 'tso_length' in pkt:
+ continue
+ if 'nic' in pkt:
+ t = pkt['nic']
+ elif 'qdisc_xmit' in pkt:
+ t = pkt['qdisc_xmit']
+ elif 'xmit' in pkt:
+ t = pkt['xmit']
+ else:
+ continue
+ self.nodes[pkt['tx_node']].append([t, pkt['tso_length'] + 60, 0,
+ "homa_data"])
+
+ print('\n-------------------')
+ print('Analyzer: nicqueues')
+ print('-------------------')
+
+ # Compute queue lengths, find maximum for each node.
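+ # (Illustrative arithmetic for the model below, not taken from any
+ # trace: the queue drains at 1000*options.gbps/8 bytes per usec, so
+ # with --gbps 25 a 2 usec gap between packets drains 6250 bytes
+ # before the next packet's length, plus 62 bytes of IP and Ethernet
+ # overhead, is added back to the queue.)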
+ print('Worst-case length of NIC tx queue for each node, assuming a link') + print('speed of %.1f Gbps (change with --gbps):' % (options.gbps)) + print('Node: Name of node') + print('MaxLength: Highest estimated output queue length for NIC (bytes)') + print('Time: Time when worst-case queue length occurred') + print('Delay: Delay (usec until fully transmitted) experienced by packet ') + print(' transmitted at Time') + print('P50: Median delay experienced by Homa data packets') + print('P90: 90th percentile delay experienced by Homa data packets') + print('P99: 99th percentile delay experienced by Homa data packets') + print('') + print('Node MaxLength Time Delay P50 P90 P99') + + for node in get_sorted_nodes(): + pkts = self.nodes[node] + if not pkts: + continue + pkts.sort() + max_queue = 0 + max_time = 0 + cur_queue = 0 + prev_time = traces[node]['first_time'] + for i in range(len(pkts)): + time, length, ignore, ignore2 = pkts[i] + + # 20 bytes for IPv4 header, 42 bytes for Ethernet overhead (CRC, + # preamble, interpacket gap) + total_length = length + 62 + + xmit_bytes = ((time - prev_time) * (1000.0*options.gbps/8)) + if xmit_bytes < cur_queue: + cur_queue -= xmit_bytes + else: + cur_queue = 0 + if 0 and (node == 'node6'): + if cur_queue == 0: + print('%9.3f (+%4.1f): length %6d, queue empty' % + (time, time - prev_time, total_length)) + else: + print('%9.3f (+%4.1f): length %6d, xmit %5d, queue %6d -> %6d' % + (time, time - prev_time, total_length, + xmit_bytes, cur_queue, cur_queue + total_length)) + cur_queue += total_length + if cur_queue > max_queue: + max_queue = cur_queue + max_time = time + prev_time = time + pkts[i][2] = cur_queue + data_pkts = sorted(filter(lambda t: t[3] == 'homa_data', pkts), + key=lambda t: t[2]) + print('%-10s %9d %9.3f %7.1f %7.1f %7.1f %7.1f' % ( + node, max_queue, max_time, + (max_queue*8)/(options.gbps*1000), + data_pkts[50*len(data_pkts)//100][2]*8/(options.gbps*1000), + data_pkts[90*len(data_pkts)//100][2]*8/(options.gbps*1000), + data_pkts[99*len(data_pkts)//100][2]*8/(options.gbps*1000))) + + if options.data: + # Print stats for each node at regular intervals + file = open('%s/txqueues.dat' % (options.data), 'w') + line = 'Interval' + for node in get_sorted_nodes(): + line += ' %10s' % (node) + print(line, file=file) + + interval = options.interval + interval_end = get_first_interval_end() + end = get_last_time() + + # Node name -> current index in that node's packets + cur = {} + for node in get_sorted_nodes(): + cur[node] = 0 + + while True: + line = '%8.1f' % (interval_end) + for node in get_sorted_nodes(): + max = -1 + i = cur[node] + xmits = self.nodes[node] + while i < len(xmits): + time, ignore, queue_length, type = xmits[i] + if time > interval_end: + break + if queue_length > max: + max = queue_length + i += 1 + cur[node] = i + if max == -1: + line += ' ' * 11 + else: + line += ' %8d' % (max) + print(line, file=file) + if interval_end > end: + break + interval_end += interval + file.close() + #------------------------------------------------ # Analyzer: nictx #------------------------------------------------ @@ -6109,7 +6257,9 @@ def tt_pacer_xmit(self, trace, t, core, id, offset, port, bytes_left): def tt_qdisc_xmit(self, trace, t, core, id, offset, bytes_left): global packets - packets[pkt_id(id, offset)]['qdisc_xmit'] = t + p = packets[pkt_id(id, offset)] + p['qdisc_xmit'] = t + p['tx_node'] = trace['node'] def tt_retransmit(self, trace, t, core, id, offset, length): global packets @@ -8347,142 +8497,6 @@ def print_type(delays): 
print(node_info)
 print(q_details, end='')

-#------------------------------------------------
-# Analyzer: txqueues
-#------------------------------------------------
-class AnalyzeTxqueues:
- """
- Prints estimates of the amount of outbound packet data queued in the
- NIC of each node, assuming that the NIC transmits at full link speed.
- The --gbps option specifies the rate at which packets are transmitted.
- With --data option, generates detailed timelines of NIC queue lengths.
- """
-
- def __init__(self, dispatcher):
- # Node name -> list of <time, length, queue_length, type> tuples for
- # all transmitted packets. Length is the packet length including
- # Homa/TCP header but not IP or Ethernet overheads. Queue_length is
- # the # bytes in the NIC queue as of time (includes this packet).
- # Queue_length starts off zero and is updated later. Type indicates
- # the kind of packet: "homa_data", "homa_grant", or "tcp"
- self.nodes = defaultdict(list)
-
- def tt_send_data(self, trace, t, core, id, offset, length):
- self.nodes[trace['node']].append([t, length + 60, 0, "homa_data"])
-
- def tt_send_grant(self, trace, t, core, id, offset, priority, increment):
- self.nodes[trace['node']].append([t, 34, 0, "homa_grant"])
-
- def tt_tcp_xmit(self, trace, t, core, length):
- self.nodes[trace['node']].append([t, length, 0, "tcp"])
-
- def output(self):
- global options, traces
-
- print('\n-------------------')
- print('Analyzer: txqueues')
- print('-------------------')
-
- # Compute queue lengths, find maximum for each node.
- print('Worst-case length of NIC tx queue for each node, assuming a link')
- print('speed of %.1f Gbps (change with --gbps):' % (options.gbps))
- print('Node: Name of node')
- print('MaxLength: Highest estimated output queue length for NIC (bytes)')
- print('Time: Time when worst-case queue length occurred')
- print('Delay: Delay (usec until fully transmitted) experienced by packet ')
- print(' transmitted at Time')
- print('P50: Median delay experienced by Homa data packets')
- print('P90: 90th percentile delay experienced by Homa data packets')
- print('P99: 99th percentile delay experienced by Homa data packets')
- print('')
- print('Node MaxLength Time Delay P50 P90 P99')
-
- for node in get_sorted_nodes():
- pkts = self.nodes[node]
- if not pkts:
- continue
- pkts.sort()
- max_queue = 0
- max_time = 0
- cur_queue = 0
- prev_time = traces[node]['first_time']
- for i in range(len(pkts)):
- time, length, ignore, ignore2 = pkts[i]
-
- # 20 bytes for IPv4 header, 42 bytes for Ethernet overhead (CRC,
- # preamble, interpacket gap)
- total_length = length + 62
-
- xmit_bytes = ((time - prev_time) * (1000.0*options.gbps/8))
- if xmit_bytes < cur_queue:
- cur_queue -= xmit_bytes
- else:
- cur_queue = 0
- if 0 and (node == 'node6'):
- if cur_queue == 0:
- print('%9.3f (+%4.1f): length %6d, queue empty' %
- (time, time - prev_time, total_length))
- else:
- print('%9.3f (+%4.1f): length %6d, xmit %5d, queue %6d -> %6d' %
- (time, time - prev_time, total_length,
- xmit_bytes, cur_queue, cur_queue + total_length))
- cur_queue += total_length
- if cur_queue > max_queue:
- max_queue = cur_queue
- max_time = time
- prev_time = time
- pkts[i][2] = cur_queue
- data_pkts = sorted(filter(lambda t: t[3] == 'homa_data', pkts),
- key=lambda t: t[2])
- print('%-10s %9d %9.3f %7.1f %7.1f %7.1f %7.1f' % (
- node, max_queue, max_time,
- (max_queue*8)/(options.gbps*1000),
- data_pkts[50*len(data_pkts)//100][2]*8/(options.gbps*1000),
- data_pkts[90*len(data_pkts)//100][2]*8/(options.gbps*1000),
- 
data_pkts[99*len(data_pkts)//100][2]*8/(options.gbps*1000))) - - if options.data: - # Print stats for each node at regular intervals - file = open('%s/txqueues.dat' % (options.data), 'w') - line = 'Interval' - for node in get_sorted_nodes(): - line += ' %10s' % (node) - print(line, file=file) - - interval = options.interval - interval_end = get_first_interval_end() - end = get_last_time() - - # Node name -> current index in that node's packets - cur = {} - for node in get_sorted_nodes(): - cur[node] = 0 - - while True: - line = '%8.1f' % (interval_end) - for node in get_sorted_nodes(): - max = -1 - i = cur[node] - xmits = self.nodes[node] - while i < len(xmits): - time, ignore, queue_length, type = xmits[i] - if time > interval_end: - break - if queue_length > max: - max = queue_length - i += 1 - cur[node] = i - if max == -1: - line += ' ' * 11 - else: - line += ' %8d' % (max) - print(line, file=file) - if interval_end > end: - break - interval_end += interval - file.close() - - #------------------------------------------------ # Analyzer: txsnapshot #------------------------------------------------ From d3650a14bd6513adcfe47c07ee0090af1a07553d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 5 Aug 2025 08:28:05 -0700 Subject: [PATCH 428/625] Improve handling of 'retransmits' info for packets in tthoma.py Change from list of times to list of dicts with more info. Also, improved nicqueues analyzer to use qdisc and retransmit info. --- util/tthoma.py | 175 ++++++++++++++++++++++++------------------------- 1 file changed, 85 insertions(+), 90 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index 97ca8b17..ca776e10 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -183,12 +183,17 @@ def __missing__(self, id): # softirq_core: Core on which SoftIRQ processed the packet # free_tx_skb: Time when NAPI released the skb on the sender, which can't # happen until the packet has been fully transmitted. -# retransmits: If the packet was retransmitted, this will be a list of all -# the times when the packet was retransmiteed. +# retransmits: A list with one entry for each time the packet was +# retransmitted. The entry is a dictionary with the same +# fields as a packet (though many may be omitted). There will +# be an entry "retrans" that gives the time of the trace +# record declaring retransmission. If there are no retransmits, +# this will be an empty list. 
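+# As a purely hypothetical illustration (every value below is invented),
+# a packet that was retransmitted once might look like:
+#   {'id': 1234, 'offset': 2800, 'tso_length': 9000,
+#    'xmit': 1016.250, 'nic': 1016.312, 'tx_node': 'node1',
+#    'retransmits': [{'retrans': 1520.100, 'xmit': 1520.140,
+#                     'nic': 1520.185}]}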
class PacketDict(dict): def __missing__(self, key): id_str, offset_str = key.split(':') - self[key] = {'id': int(id_str), 'offset': int(offset_str)} + self[key] = {'id': int(id_str), 'offset': int(offset_str), + 'retransmits': []} return self[key] packets = PacketDict() @@ -5628,18 +5633,34 @@ def output(self): global options, traces, packets for pkt in packets.values(): - if not 'tso_length' in pkt: - continue - if 'nic' in pkt: - t = pkt['nic'] - elif 'qdisc_xmit' in pkt: - t = pkt['qdisc_xmit'] - elif 'xmit' in pkt: - t = pkt['xmit'] - else: - continue - self.nodes[pkt['tx_node']].append([t, pkt['tso_length'] + 60, 0, - "homa_data"]) + if 'tso_length' in pkt: + if 'nic' in pkt: + t = pkt['nic'] + elif 'qdisc_xmit' in pkt: + t = pkt['qdisc_xmit'] + elif 'xmit' in pkt: + t = pkt['xmit'] + else: + continue + self.nodes[pkt['tx_node']].append([t, pkt['tso_length'] + 60, 0, + "homa_data"]) + for retrans in pkt['retransmits']: + if 'nic' in retrans: + t = retrans['nic'] + elif 'qdisc_xmit' in retrans: + t = retrans['qdisc_xmit'] + elif 'xmit' in retrans: + t = retrans['xmit'] + else: + continue + if 'tso_length' in retrans: + length = retrans['tso_length'] + elif 'length' in pkt: + length = pkt['length'] + else: + continue + self.nodes[pkt['tx_node']].append([t, length + 60, 0, + "homa_data"]) print('\n-------------------') print('Analyzer: nicqueues') @@ -5705,7 +5726,7 @@ def output(self): if options.data: # Print stats for each node at regular intervals - file = open('%s/txqueues.dat' % (options.data), 'w') + file = open('%s/nicqueues.dat' % (options.data), 'w') line = 'Interval' for node in get_sorted_nodes(): line += ' %10s' % (node) @@ -6181,28 +6202,35 @@ def init_trace(self, trace): def tt_ip_xmit(self, trace, t, core, id, offset): global packets, rpcs p = packets[pkt_id(id, offset)] - # Only record first transmission (packet might be retransmitted) - if not 'xmit' in p: + p['tx_node'] = trace['node'] + if not p['retransmits']: p['xmit'] = t - p['tx_node'] = trace['node'] p['tx_core'] = core rpcs[id]['send_data_pkts'].append(p) + else: + p['retransmits'][-1]['xmit'] = t def tt_mlx_data(self, trace, t, core, peer, id, offset, tx_queue): global packets p = packets[pkt_id(id, offset)] - if not 'retransmits' in p: + p['tx_node'] = trace['node'] + if not p['retransmits']: p['nic'] = t - p['tx_node'] = trace['node'] p['tx_queue'] = tx_queue + else: + p['retransmits'][-1]['nic'] = t def tt_free_tx_skb(self, trace, t, core, id, offset, qid, msg_length): global packets p = packets[pkt_id(id, offset)] - p['free_tx_skb'] = t - p['tx_qid'] = qid p['tx_node'] = trace['node'] - p['msg_length'] = msg_length + if not p['retransmits']: + p['free_tx_skb'] = t + p['tx_qid'] = qid + p['msg_length'] = msg_length + else: + p = p['retransmits'][-1] + p['free_tx_skb'] = t def tt_gro_data(self, trace, t, core, peer, id, offset, prio): global packets, recv_offsets, rpcs @@ -6243,31 +6271,32 @@ def tt_free_skbs(self, trace, t, core, num_skbs): def tt_send_data(self, trace, t, core, id, offset, length): global packets p = packets[pkt_id(id, offset)] - if (not 'retransmits' in p) and (length > self.tso_lengths[offset]): - self.tso_lengths[offset] = length - p['id'] = id - # If packet has been retransmitted, don't record tso_length, since - # that could make a TSO segment appear to be the main TSO packet. 
- if not 'retransmits' in p: + if not p['retransmits']: + if length > self.tso_lengths[offset]: + self.tso_lengths[offset] = length p['tso_length'] = length + else: + p['retransmits'][-1]['tso_length'] = length def tt_pacer_xmit(self, trace, t, core, id, offset, port, bytes_left): global packets - packets[pkt_id(id, offset)]['pacer'] = True + p = packets[pkt_id(id, offset)] + if p['retransmits']: + p = p['retransmits'][-1] + p['pacer'] = True def tt_qdisc_xmit(self, trace, t, core, id, offset, bytes_left): global packets p = packets[pkt_id(id, offset)] - p['qdisc_xmit'] = t p['tx_node'] = trace['node'] + if p['retransmits']: + p = p['retransmits'][-1] + p['qdisc_xmit'] = t def tt_retransmit(self, trace, t, core, id, offset, length): global packets p = packets[pkt_id(id, offset)] - if not 'retransmits' in p: - p['retransmits'] = [t] - else: - p['retransmits'].append(t) + p['retransmits'].append({'retrans': t}) def tt_send_grant(self, trace, t, core, id, offset, priority, increment): global grants, rpcs @@ -6370,7 +6399,8 @@ def analyze(self): if pid in packets: pkt2 = packets[pid] else: - pkt2 = {'offset': offset, 'length': length} + pkt2 = {'offset': offset, 'length': length, + 'retransmits': []} new_pkts.append([pid, pkt2]) for key in ['xmit', 'nic', 'id', 'msg_length', 'priority', 'tx_node', 'tx_core', @@ -7784,55 +7814,23 @@ def output(self): print('Analyzer: temp') print('-------------------\n') - mtu = get_mtu() - delays = [] - ip_delays = [] - slow_pkts = [] - qdisc_pkts = [] + bytes = 0 for pkt in packets.values(): - if 'nic' in pkt and 'qdisc_xmit' in pkt: - qdisc_pkts.append(pkt) - delays.append(pkt['nic'] - pkt['qdisc_xmit']) - elif ('nic' in pkt and 'xmit' in pkt and 'tso_length' in pkt - and pkt['msg_length'] != None and pkt['msg_length'] > mtu): - delay = pkt['nic'] - pkt['xmit'] - ip_delays.append(delay) - if delay > 50: - slow_pkts.append(pkt) - - if not delays: - print('Couldn\'t find any packets that were deferred by homa_qdisc'); - else: - delays.sort() - print('%d delays from qdisc_xmit to nic:' % (len(delays))) - print('Average: %6.1f' % (sum(delays) / len(delays))) - print('Min: %6.1f' % (delays[0])) - print('P50: %6.1f' % (delays[(50 * len(delays) // 100)])) - print('P90: %6.1f' % (delays[(90 * len(delays) // 100)])) - print('P99: %6.1f' % (delays[(99 * len(delays) // 100)])) - print('Max: %6.1f' % (delays[-1])) - - print(''); - if not ip_delays: - print('Couldn\'t find any ip_queue_xmit packets'); - else: - ip_delays.sort() - print('%d delays from ip_queue_xmit to nic:' % (len(ip_delays))) - print('Average: %6.1f' % (sum(ip_delays) / len(ip_delays))) - print('Min: %6.1f' % (ip_delays[0])) - print('P50: %6.1f' % (ip_delays[(50 * len(ip_delays) // 100)])) - print('P90: %6.1f' % (ip_delays[(90 * len(ip_delays) // 100)])) - print('P99: %6.1f' % (ip_delays[(99 * len(ip_delays) // 100)])) - print('Max: %6.1f' % (ip_delays[-1])) - - slow_pkts.sort(key=lambda d : d['nic'] - d['xmit'], reverse=True) - print('Packets that took a long time from ip_queue_xmit to nic:') - print(' Id Offset Node Core Xmit Nic Delay') - for pkt in slow_pkts: - print('%10d %10d %8s %3d %9.3f %9.3f %6.1f' % ( - pkt['id'], pkt['offset'], pkt['tx_node'], pkt['tx_core'], - pkt['xmit'], pkt['nic'], pkt['nic'] - pkt['xmit'])) - + if pkt['retransmits']: + print('Packet with %d retransmissions: %s\n' % ( + len(pkt['retransmits']), pkt)) + for r in pkt['retransmits']: + if 'tso_length' in r: + bytes += r['tso_length'] + elif 'length' in pkt: + bytes += pkt['length'] + else: + print('Can\'t find length for 
preceding packet')
+ elapsed = 0
+ for trace in traces.values():
+ elapsed += trace['elapsed_time']
+ print('Total elapsed time %.1f ms, retransmitted bytes %d (%.3f MB/sec)'
+ % (elapsed * 1e-3, bytes, bytes / elapsed))

 def output_snapshot(self):
 global packets, rpcs
@@ -8404,13 +8402,10 @@ def output(self):
 qid_string = ''
 total_pkts += 1
- rx = 0
- if 'retransmits' in pkt:
- rx += len(pkt['retransmits'])
+ rx = len(pkt['retransmits'])
 if 'segments' in pkt:
 for seg in pkt['segments']:
- if 'retransmits' in seg:
- rx += len(seg['retransmits'])
+ rx += len(seg['retransmits'])
 rx_msg = str(rx) if rx > 0 else ""
 gro_string = ""

From 62bbde9eb2328b16f5b418aa3feb1379eb180dee Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Tue, 5 Aug 2025 09:27:29 -0700
Subject: [PATCH 429/625] Improve txpkts analyzer for tthoma.py

Incorporate info about homa_qdisc, add Gbps column to per-queue
information
---
 util/tthoma.py | 82 ++++++++++++++++++++++++++++++++++--------------
 1 file changed, 59 insertions(+), 23 deletions(-)

diff --git a/util/tthoma.py b/util/tthoma.py
index ca776e10..a8563069 100755
--- a/util/tthoma.py
+++ b/util/tthoma.py
@@ -8267,9 +8267,13 @@ def output(self):
 print('Segs: Total number of segments (packets received by GRO) '
 'transmitted by')
 print(' node or queue')
- print('PSegs: Total number of segments that were transmitted '
+ print('Gbps: Throughput of that queue')
+ print('PTsos: Total number of TSO frames that were transmitted '
 'by the pacer')
- print('Backlog: Average KB of tx data that have been in the '
+ print('QTsos: Total number of TSO frames that were deferred by '
+ 'homa_qdisc to')
+ print(' limit NIC queue length')
+ print('Backlog: Average KB of tx data that were in the '
 'possession of the NIC')
 print(' (presumably without being transmitted) longer than '
 '%d usec' % (options.threshold))
@@ -8328,9 +8332,17 @@ def output(self):
 # on that queue
 qid_segs = defaultdict(lambda: 0)

- # Tx queue number -> total number of packets (segments) transmitted
+ # Tx queue number -> total number of bytes transmitted on that
+ # queue
+ qid_bytes = defaultdict(lambda: 0)
+
+ # Tx queue number -> total number of TSO frames transmitted
 # by the pacer on that queue
- qid_pacer_segs = defaultdict(lambda: 0)
+ qid_pacer_tsos = defaultdict(lambda: 0)
+
+ # Tx queue number -> total number of TSO frames on that queue
+ # that were deferred by homa_qdisc because of NIC queue overload
+ qid_qdisc_tsos = defaultdict(lambda: 0)

 # Tx queue number -> integral of (excess time * KB) for TSO packets
 # that have spent "too much time" in the NIC. 
Excess time is @@ -8357,12 +8369,14 @@ def output(self): (time.strftime('%I:%M %p on %m/%d/%Y'))) f.write('# Data packets transmitted from %s:\n' % (node)) f.write('# Xmit: Time when packet was passed to ip*xmit\n') + f.write('# Qdisc: Time when homa_qdisc requeued packet after ' + 'deferral, if any\n') f.write('# RpcId: Identifier of packet\'s RPC\n') f.write('# Offset: Offset of packet within message\n') f.write('# Length: Size of packet (before segmentation)\n') f.write('# Qid: Transmit queue on which packet was sent\n') f.write('# Nic: Time when packet was queued for NIC\n') - f.write('# NDelay: Nic - Xmit\n') + f.write('# NDelay: Nic - (later of Xmit and Qdisc)\n') f.write('# MaxGro: Time when last fragment of packet was ' 'received by GRO\n') f.write('# GDelay: MaxGro - Nic\n') @@ -8371,12 +8385,26 @@ def output(self): f.write('# Rx: Number of times segments in the packet were ' 'retransmitted\n\n') - f.write('# Xmit RpcId Offset Length Qid') + f.write('# Xmit Qdisc RpcId Offset Length Qid') f.write(' Nic NDelay MaxGro GDelay') f.write(' Free FDelay Rx\n') for pkt in pkts: xmit = pkt['xmit'] - nic = pkt['nic'] if 'nic' in pkt else None + if 'qdisc_xmit' in pkt: + qdisc = pkt['qdisc_xmit'] + qdisc_string = '%10.3f' % (qdisc) + else: + qdisc = None + qdisc_string = '' + nic_delay = None + if 'nic' in pkt: + nic = pkt['nic'] + if qdisc != None: + nic_delay = nic - qdisc + elif xmit != None: + nic_delay = nic - xmit + else: + nic = None max_gro = get_max_gro(pkt) free = pkt['free_tx_skb'] if 'free_tx_skb' in pkt else None length = pkt['tso_length'] @@ -8388,8 +8416,12 @@ def output(self): if 'segments' in pkt: segs += len(pkt['segments']) qid_segs[qid] += segs + if 'tso_length' in pkt: + qid_bytes[qid] += length if 'pacer' in pkt: - qid_pacer_segs[qid] += segs + qid_pacer_tsos[qid] += 1 + if 'qdisc_xmit' in pkt: + qid_qdisc_tsos[qid] += 1 if 'tx_queue' in pkt: qid_tx_queue[qid] = pkt['tx_queue'] qid_string = str(qid) @@ -8409,8 +8441,8 @@ def output(self): rx_msg = str(rx) if rx > 0 else "" gro_string = "" - if rx == 0 and qid != None and nic != None: - delays[qid]['nic'].append(nic - xmit) + if rx == 0 and qid != None and nic_delay != None: + delays[qid]['nic'].append(nic_delay) if max_gro != None: delays[qid]['gro'].append(max_gro - nic) gro_string = '%.1f' % (max_gro - nic) @@ -8426,19 +8458,21 @@ def output(self): qid_total_bytes[qid] += length - f.write('%10.3f %10d %6d %6d %3s' % (xmit, pkt['id'], - pkt['offset'], pkt['tso_length'], qid_string)) + line = '%10.3f %10s %10d %6d %6d %3s' % (xmit, qdisc_string, + pkt['id'], pkt['offset'], pkt['tso_length'], + qid_string) nic_delay_string = '' - if (nic != None) and (xmit != None): - nic_delay_string = '%.1f' % (nic - xmit) - f.write(' %10s %7s %10s %7s' % (print_if(nic, '%.3f'), + if (nic_delay != None): + nic_delay_string = '%.1f' % (nic_delay) + line += ' %10s %7s %10s %7s' % (print_if(nic, '%.3f'), nic_delay_string, print_if(max_gro, '%.3f'), - gro_string)) + gro_string) free_delay_string = '' if (nic != None) and (free != None): free_delay_string = '%.1f' % (free - nic) - f.write(' %10s %7s %2s\n' % (print_if(free, '%.3f'), - free_delay_string, rx_msg)) + line += ' %10s %7s %2s' % (print_if(free, '%.3f'), + free_delay_string, rx_msg) + f.write(line.rstrip() + '\n') f.close() def print_type(delays): @@ -8453,11 +8487,11 @@ def print_type(delays): if not first_node: q_details += '\n' q_details += 'Transmit queues for %s\n' % (node) - q_details += 'Qid TxQueue Tsos Segs PSegs Backlog BFrac ' - q_details += 'NicP10 NicP50 NicP90 ' + 
q_details += 'Qid TxQueue Tsos Segs Gbps ' + q_details += 'PTsos QTsos Backlog BFrac NicP10 NicP50 NicP90 ' q_details += 'GroP10 GroP50 GroP90 FreP10 FreP50 FreP90\n' + q_details += '-----------------------------------' q_details += '-------------------------------------------------' - q_details += '----------------------' q_details += '------------------------------------------\n' first_node = False totals = defaultdict(list) @@ -8466,9 +8500,11 @@ def print_type(delays): q_delays = delays[qid] for type, d in q_delays.items(): totals[type].extend(d) - q_details += '%4d %10s %5d %5d %5d %6.1f %5.2f %s %s %s\n' % ( + q_details += '%4d %10s %5d %5d %6.2f ' % ( qid, qid_tx_queue[qid], qid_tsos[qid], qid_segs[qid], - qid_pacer_segs[qid], + 8e-3 * qid_bytes[qid] / traces[node]['elapsed_time']) + q_details += '%5d %5d %6.1f %5.2f %s %s %s\n' % ( + qid_pacer_tsos[qid], qid_qdisc_tsos[qid], 1e-3*qid_backlog[qid]/total_time, qid_slow_bytes[qid]/qid_total_bytes[qid], print_type(q_delays['nic']), From 65ef681c0101d56a09b186170d0a71de7da4a146 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 5 Aug 2025 11:21:05 -0700 Subject: [PATCH 430/625] Fix bugs in nicqueues analyzer for tthoma.py Also added new Dispatcher method pattern_matched. --- util/tthoma.py | 65 ++++++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index a8563069..8a62863f 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -308,6 +308,9 @@ def __missing__(self, key): # as of the end of the interval intervals = None +# Dispatcher used to parse the traces. +dispatcher = None + def add_to_intervals(node, start, end, key, delta): """ Find all of the intervals for node whose end times overlap the range @@ -807,6 +810,9 @@ def __init__(self): # a trace file is read self.parse_table = None + # Pattern name -> pattern + self.pattern_dict = {} + # The number of initial characters of the message portion of a # trace record that are used to lookup in parse_table. 
This is # the largest number such that each pattern has at least this many @@ -825,6 +831,7 @@ def __init__(self): for pattern in self.patterns: pattern['matches'] = 0 + self.pattern_dict[pattern['name']] = pattern def get_analyzer(self, name): """ @@ -847,6 +854,13 @@ def get_analyzers(self): return self.objs + def pattern_matched(self, name): + """ + Return True if the pattern with the given name matched at least + one event in the traces, False if it never matched + """ + return self.pattern_dict[name]['matches'] > 0 + def interest(self, analyzer): """ If analyzer hasn't already been registered with this dispatcher, @@ -990,6 +1004,8 @@ def __build_parse_table(self): """ if self.parse_table != None: return + + # Pattern prefix -> list of patterns with that prefix self.parse_table = defaultdict(list) # Pass 1: first compute self.prefix_length and set the 'parser' @@ -5630,28 +5646,14 @@ def tt_tcp_xmit(self, trace, t, core, length): self.nodes[trace['node']].append([t, length, 0, "tcp"]) def output(self): - global options, traces, packets + global options, traces, packets, dispatcher for pkt in packets.values(): - if 'tso_length' in pkt: - if 'nic' in pkt: - t = pkt['nic'] - elif 'qdisc_xmit' in pkt: - t = pkt['qdisc_xmit'] - elif 'xmit' in pkt: - t = pkt['xmit'] - else: - continue - self.nodes[pkt['tx_node']].append([t, pkt['tso_length'] + 60, 0, - "homa_data"]) + if 'tso_length' in pkt and 'nic' in pkt: + self.nodes[pkt['tx_node']].append([pkt['nic'], + pkt['tso_length'] + 60, 0, "homa_data"]) for retrans in pkt['retransmits']: - if 'nic' in retrans: - t = retrans['nic'] - elif 'qdisc_xmit' in retrans: - t = retrans['qdisc_xmit'] - elif 'xmit' in retrans: - t = retrans['xmit'] - else: + if not 'nic' in retrans: continue if 'tso_length' in retrans: length = retrans['tso_length'] @@ -5659,8 +5661,8 @@ def output(self): length = pkt['length'] else: continue - self.nodes[pkt['tx_node']].append([t, length + 60, 0, - "homa_data"]) + self.nodes[pkt['tx_node']].append([retrans['nic'], + length + 60, 0, "homa_data"]) print('\n-------------------') print('Analyzer: nicqueues') @@ -5672,7 +5674,7 @@ def output(self): print('Node: Name of node') print('MaxLength: Highest estimated output queue length for NIC (bytes)') print('Time: Time when worst-case queue length occurred') - print('Delay: Delay (usec until fully transmitted) experienced by packet ') + print('Delay: Delay (usec until fully transmitted) experienced by packet') print(' transmitted at Time') print('P50: Median delay experienced by Homa data packets') print('P90: 90th percentile delay experienced by Homa data packets') @@ -5759,7 +5761,7 @@ def output(self): line += ' ' * 11 else: line += ' %8d' % (max) - print(line, file=file) + print(line.rstrip(), file=file) if interval_end > end: break interval_end += interval @@ -8312,7 +8314,8 @@ def output(self): if sort_key == 'gro': pkts = sorted(pkts, key = lambda pkt : get_max_gro(pkt)) elif sort_key != 'xmit': - pkts = sorted(pkts, key = lambda pkt : pkt[sort_key]) + pkts = sorted(pkts, key = lambda pkt : + pkt[sort_key] if sort_key in pkt else 1e20) if len(pkts) == 0: continue @@ -8721,28 +8724,28 @@ def output(self): exit(1) options.pkt_id = int(match.group(1)) options.pkt_offset = int(match.group(2)) -d = Dispatcher() +dispatcher = Dispatcher() analyzer_classes = [] for name in options.analyzers.split(): class_name = 'Analyze' + name[0].capitalize() + name[1:] if not hasattr(sys.modules[__name__], class_name): print('No analyzer named "%s"' % (name), file=sys.stderr) exit(1) - 
d.interest(class_name) + dispatcher.interest(class_name) analyzer_classes.append(class_name) # Parse the timetrace files; this will invoke handlers in the analyzers. for file in tt_files: - d.parse(file) + dispatcher.parse(file) -d.print_no_matches() +dispatcher.print_no_matches() if options.verbose: - d.print_stats() + dispatcher.print_stats() # Invoke 'analyze' methods in each analyzer, if present, to perform # postprocessing now that all the trace data has been read. -for analyzer in d.get_analyzers(): +for analyzer in dispatcher.get_analyzers(): if hasattr(analyzer, 'analyze'): # print('Calling %s.analyze' % (type(analyzer).__name__), file=sys.stderr) analyzer.analyze() @@ -8750,6 +8753,6 @@ def output(self): # Give each analyzer a chance to output its findings (includes # printing output and generating data files). for name in analyzer_classes: - analyzer = d.get_analyzer(name) + analyzer = dispatcher.get_analyzer(name) if hasattr(analyzer, 'output'): analyzer.output() \ No newline at end of file From 93661cc47289568c0e7efc567eaac3203751a92c Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 6 Aug 2025 16:27:03 -0700 Subject: [PATCH 431/625] Implement and use homa_rpc_tx_end function * Need to use its output instead of rpc->msgout->next_xmit_offset in many situations to account for packet queuing in homa_qdisc. * Also, in homa_qdisc_enqueue, automatically defer new packets if there are existing deferred packets. --- homa_impl.h | 5 ++-- homa_incoming.c | 36 +++++++++-------------- homa_outgoing.c | 40 +++++++++++++++++++++++++ homa_qdisc.c | 7 +++-- homa_rpc.h | 11 ++++++- homa_timer.c | 7 +++-- test/unit_homa_incoming.c | 11 ++----- test/unit_homa_outgoing.c | 39 ++++++++++++++++++++++++ test/unit_homa_qdisc.c | 62 ++++++++++++++++++++++++--------------- test/unit_homa_timer.c | 4 ++- util/strip_decl.py | 1 + 11 files changed, 160 insertions(+), 63 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 0f914ae6..d88299d9 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -548,8 +548,8 @@ struct homa_net { */ struct homa_skb_info { /** - * @next_skb: used to link together all of the skb's for a Homa - * message (in order of offset). + * @next_skb: used to link together all of the skb's for an + * outgoing Homa message (in order of offset). 
*/ struct sk_buff *next_skb; @@ -784,6 +784,7 @@ void homa_request_retrans(struct homa_rpc *rpc); void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, struct homa_sock *hsk); void homa_rpc_handoff(struct homa_rpc *rpc); +int homa_rpc_tx_end(struct homa_rpc *rpc); int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); int homa_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); diff --git a/homa_incoming.c b/homa_incoming.c index 9a0dd577..93db90d2 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -802,6 +802,7 @@ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, int length = ntohl(h->length); int end = offset + length; struct homa_busy_hdr busy; + int tx_end; if (!rpc) { tt_record4("resend request for unknown id %d, peer 0x%x:%d, offset %d; responding with RPC_UNKNOWN", @@ -819,6 +820,7 @@ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, rpc->id, offset, length); #endif /* See strip.py */ + tx_end = homa_rpc_tx_end(rpc); if (!homa_is_client(rpc->id) && rpc->state != RPC_OUTGOING) { /* We are the server for this RPC and don't yet have a * response message, so send BUSY to keep the client @@ -830,16 +832,12 @@ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, goto done; } - /* First, retransmit bytes that were already sent once. */ if (length == -1) - end = rpc->msgout.next_xmit_offset; + end = tx_end; #ifndef __STRIP__ /* See strip.py */ - if (end > rpc->msgout.next_xmit_offset) - homa_resend_data(rpc, offset, rpc->msgout.next_xmit_offset, - h->priority); - else - homa_resend_data(rpc, offset, end, h->priority); + homa_resend_data(rpc, offset, (end > tx_end) ? tx_end : end, + h->priority); if (end > rpc->msgout.granted) { /* It appears that a grant packet was lost; assume that @@ -852,23 +850,15 @@ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc, homa_xmit_data(rpc, false); } #else /* See strip.py */ - if (end > rpc->msgout.next_xmit_offset) - homa_resend_data(rpc, offset, rpc->msgout.next_xmit_offset); - else - homa_resend_data(rpc, offset, end); + homa_resend_data(rpc, offset, (end > tx_end) ? tx_end : end); #endif /* See strip.py */ - if (offset >= rpc->msgout.next_xmit_offset) { + if (offset >= tx_end) { /* We have chosen not to transmit any of the requested data; * send BUSY so the receiver knows we are alive. */ - tt_record3("sending BUSY from resend, id %d, offset %d, granted %d", - rpc->id, rpc->msgout.next_xmit_offset, -#ifndef __STRIP__ /* See strip.py */ - rpc->msgout.granted); -#else /* See strip.py */ - rpc->msgout.length); -#endif /* See strip.py */ + tt_record3("sending BUSY from resend, id %d, offset %d, tx_end %d", + rpc->id, offset, tx_end); homa_xmit_control(BUSY, &busy, sizeof(busy), rpc); goto done; } @@ -891,21 +881,23 @@ void homa_rpc_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc) rpc->id, tt_addr(rpc->peer->addr), rpc->dport); if (homa_is_client(rpc->id)) { if (rpc->state == RPC_OUTGOING) { + int tx_end = homa_rpc_tx_end(rpc); + /* It appears that everything we've already transmitted * has been lost; retransmit it. 
*/ tt_record4("Restarting id %d to server 0x%x:%d, lost %d bytes", rpc->id, tt_addr(rpc->peer->addr), - rpc->dport, rpc->msgout.next_xmit_offset); + rpc->dport, tx_end); #ifndef __STRIP__ /* See strip.py */ homa_freeze(rpc, RESTART_RPC, "Freezing because of RPC restart, id %d, peer 0x%x"); - homa_resend_data(rpc, 0, rpc->msgout.next_xmit_offset, + homa_resend_data(rpc, 0, tx_end, homa_unsched_priority(rpc->hsk->homa, rpc->peer, rpc->msgout.length)); #else /* See strip.py */ - homa_resend_data(rpc, 0, rpc->msgout.next_xmit_offset); + homa_resend_data(rpc, 0, tx_end); #endif /* See strip.py */ goto done; } diff --git a/homa_outgoing.c b/homa_outgoing.c index 587344e2..143d79d4 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -381,6 +381,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) rpc->msgout.num_skbs++; rpc->msgout.skb_memory += skb->truesize; rpc->msgout.copied_from_user = rpc->msgout.length - bytes_left; + rpc->msgout.first_not_tx = rpc->msgout.packets; if (overlap_xmit && list_empty(&rpc->throttled_links) && #ifndef __STRIP__ /* See strip.py */ xmit && offset < rpc->msgout.granted) { @@ -870,3 +871,42 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end) resend_done: return; } + +/** + * homa_rpc_tx_end() - Return the offset of the first byte in an + * RPC's outgoing message that has not yet been fully transmitted. + * "Fully transmitted" means the message has been transmitted by the + * NIC and the skb has been released by the driver. This is different from + * rpc->msgout.next_xmit_offset, which computes the first offset that + * hasn't yet been passed to the IP stack. + * @rpc: RPC to check + * Return: See above. If the message has been fully transmitted then + * rpc->msgout.length is returned. + */ +int homa_rpc_tx_end(struct homa_rpc *rpc) +{ + struct sk_buff *skb = rpc->msgout.first_not_tx; + + while (skb) { + struct homa_skb_info *homa_info = homa_get_skb_info(skb); + + /* next_xmit_offset tells us whether the packet has been + * passed to the IP stack. Checking the reference count tells + * us whether the packet has been released by the driver + * (which only happens after notification from the NIC that + * transmission is complete). 
+ */
+ if (homa_info->offset >= rpc->msgout.next_xmit_offset ||
+ refcount_read(&skb->users) > 1) {
+ tt_record3("homa_rpc_tx_complete id %d tx up to %d/%d",
+ rpc->id, homa_info->offset,
+ rpc->msgout.length);
+ return homa_info->offset;
+ }
+ skb = homa_info->next_skb;
+ rpc->msgout.first_not_tx = skb;
+ }
+ tt_record2("homa_rpc_tx_complete id %d fully transmitted (%d bytes)",
+ rpc->id, rpc->msgout.length);
+ return rpc->msgout.length;
+}
diff --git a/homa_qdisc.c b/homa_qdisc.c
index abe6ef53..32583304 100755
--- a/homa_qdisc.c
+++ b/homa_qdisc.c
@@ -233,16 +233,17 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 pkt_len = qdisc_skb_cb(skb)->pkt_len;
 if (pkt_len < homa->pacer->throttle_min_bytes) {
- homa_qdisc_update_link_idle(q->qdev, pkt_len, -1);
+ homa_qdisc_update_link_idle(qdev, pkt_len, -1);
 goto enqueue;
 }

 if (!is_homa_pkt(skb)) {
- homa_qdisc_update_link_idle(q->qdev, pkt_len, -1);
+ homa_qdisc_update_link_idle(qdev, pkt_len, -1);
 goto enqueue;
 }

- if (homa_qdisc_update_link_idle(q->qdev, pkt_len,
+ if (skb_queue_empty(&qdev->homa_deferred) &&
+ homa_qdisc_update_link_idle(qdev, pkt_len,
 homa->pacer->max_nic_queue_cycles))
 goto enqueue;
diff --git a/homa_rpc.h b/homa_rpc.h
index 767ef9cd..f77de25a 100644
--- a/homa_rpc.h
+++ b/homa_rpc.h
@@ -61,10 +61,19 @@ struct homa_message_out {
 /**
 * @next_xmit_offset: All bytes in the message, up to but not
- * including this one, have been transmitted.
+ * including this one, have been passed to ip_queue_xmit or
+ * ip6_xmit.
 */
 int next_xmit_offset;

+ /**
+ * @first_not_tx: All packets in @packets preceding this one have
+ * been confirmed to have been transmitted by the NIC (the driver
+ * has released its reference). NULL means all packets are known to
+ * have been transmitted. Used by homa_rpc_tx_end.
+ */
+ struct sk_buff *first_not_tx;
+
 #ifndef __STRIP__ /* See strip.py */
 /**
 * @unscheduled: Initial bytes of message that we'll send
diff --git a/homa_timer.c b/homa_timer.c
index 6b61af1f..3b13fe17 100644
--- a/homa_timer.c
+++ b/homa_timer.c
@@ -28,10 +28,11 @@ void homa_timer_check_rpc(struct homa_rpc *rpc)
 __must_hold(rpc->bucket->lock)
 {
 struct homa *homa = rpc->hsk->homa;
+ int tx_end = homa_rpc_tx_end(rpc);

 /* See if we need to request an ack for this RPC. */
 if (!homa_is_client(rpc->id) && rpc->state == RPC_OUTGOING &&
- rpc->msgout.next_xmit_offset >= rpc->msgout.length) {
+ tx_end == rpc->msgout.length) {
 if (rpc->done_timer_ticks == 0) {
 rpc->done_timer_ticks = homa->timer_ticks;
 } else {
@@ -76,9 +77,9 @@ void homa_timer_check_rpc(struct homa_rpc *rpc)
 if (rpc->state == RPC_OUTGOING) {
 #ifndef __STRIP__ /* See strip.py */
- if (rpc->msgout.next_xmit_offset < rpc->msgout.granted) {
+ if (tx_end < rpc->msgout.granted) {
 #else /* See strip.py */
- if (rpc->msgout.next_xmit_offset < rpc->msgout.length) {
+ if (tx_end < rpc->msgout.length) {
 #endif /* See strip.py */
 /* There are granted bytes that we haven't transmitted,
 * so no need to be concerned; the ball is in our court. 
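To make the new calling convention concrete, here is a rough sketch (not
code from this patch) of the substitution pattern described in the commit
message; the homa_timer.c hunk above uses exactly this comparison:

	/* Sketch only: tx_end reflects NIC completion (the driver has
	 * released its skb reference), so it can lag next_xmit_offset
	 * when homa_qdisc has deferred packets or the NIC is backed up.
	 */
	int tx_end = homa_rpc_tx_end(rpc);

	if (tx_end == rpc->msgout.length) {
		/* Every byte has left the NIC; the message is done. */
	} else {
		/* Bytes at offsets >= tx_end may still be queued in
		 * homa_qdisc or sitting in the NIC tx ring.
		 */
	}
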
diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index b0908751..181005eb 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -1696,11 +1696,8 @@ TEST_F(homa_incoming, homa_resend_pkt__client_not_outgoing) homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); EXPECT_STREQ("xmit DATA retrans 1400@0", unit_log_get()); } -TEST_F(homa_incoming, homa_resend_pkt__negative_length) +TEST_F(homa_incoming, homa_resend_pkt__negative_length_in_resend) { - /* Entire msgin has not been received yet. But we have received - * everything we have granted so far. - */ struct homa_resend_hdr h = {{.sport = htons(self->client_port), .dport = htons(self->server_port), .sender_id = cpu_to_be64(self->client_id), @@ -1716,11 +1713,10 @@ TEST_F(homa_incoming, homa_resend_pkt__negative_length) srpc->msgout.next_xmit_offset = 2000; homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); - // The server might send a GRANT right after BUSY so just check substr EXPECT_STREQ("xmit DATA retrans 1400@0; " "xmit DATA retrans 1400@1400", unit_log_get()); } -TEST_F(homa_incoming, homa_resend_pkt__clip_range_to_next_xmit_offset) +TEST_F(homa_incoming, homa_resend_pkt__clip_range_to_tx_end) { struct homa_resend_hdr h = {{.sport = htons(self->server_port), .dport = htons(self->hsk.port), @@ -1783,8 +1779,7 @@ TEST_F(homa_incoming, homa_resend_pkt__update_granted_and_xmit) homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); EXPECT_EQ(3400, crpc->msgout.granted); - EXPECT_STREQ("xmit DATA 1400@1400; " - "xmit DATA 1400@2800", unit_log_get()); + EXPECT_EQ(4200, crpc->msgout.next_xmit_offset); } TEST_F(homa_incoming, homa_resend_pkt__clip_granted_to_message_length) { diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 5b01babd..2ffd3045 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -1184,3 +1184,42 @@ TEST_F(homa_outgoing, homa_resend_data__set_homa_info) "homa_info: wire_bytes 1538, data_bytes 1400, seg_length 1400, offset 8400", unit_log_get()); } + +TEST_F(homa_outgoing, homa_rpc_tx_end) +{ + struct homa_rpc *crpc; + struct sk_buff *skbs[5]; + struct sk_buff *skb; + int i; + + crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 6000, 1000); + ASSERT_EQ(5, crpc->msgout.num_skbs); + + /* First call: no packets passed to IP stack. */ + EXPECT_EQ(0, homa_rpc_tx_end(crpc)); + + /* Second call: all packets passed to IP, but no packets complete. */ + for (skb = crpc->msgout.packets, i = 0; skb != NULL; + skb = homa_get_skb_info(skb)->next_skb, i++) { + skbs[i] = skb; + skb_get(skb); + EXPECT_EQ(2, refcount_read(&skbs[i]->users)); + } + crpc->msgout.next_xmit_offset = 6000; + EXPECT_EQ(0, homa_rpc_tx_end(crpc)); + + /* Third call: packets 0 and 3 transmitted. */ + kfree_skb(skbs[0]); + kfree_skb(skbs[3]); + EXPECT_EQ(1400, homa_rpc_tx_end(crpc)); + EXPECT_EQ(skbs[1], crpc->msgout.first_not_tx); + + /* Fourth call: all packets transmitted. 
*/ + kfree_skb(skbs[1]); + kfree_skb(skbs[2]); + kfree_skb(skbs[4]); + EXPECT_EQ(6000, homa_rpc_tx_end(crpc)); + EXPECT_EQ(NULL, crpc->msgout.first_not_tx); +} \ No newline at end of file diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c index d0249482..a9ce123a 100644 --- a/test/unit_homa_qdisc.c +++ b/test/unit_homa_qdisc.c @@ -303,29 +303,6 @@ TEST_F(homa_qdisc, _homa_qdisc_homa_qdisc_set_qixs_object) homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_homa_packet) -{ - struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); - struct sk_buff *skb, *to_free; - struct homa_qdisc *q; - u64 idle; - - EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); - q = qdisc_priv(qdisc); - idle = mock_clock + 1 + self->homa.pacer->max_nic_queue_cycles + 1; - atomic64_set(&q->qdev->link_idle_time, idle); - skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); - qdisc_skb_cb(skb)->pkt_len = 1500; - to_free = NULL; - - EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, qdisc, &to_free)); - EXPECT_EQ(NULL, to_free); - EXPECT_EQ(1, q->qdev->homa_deferred.qlen); - EXPECT_STREQ("wake_up_process pid 0", unit_log_get()); - - homa_qdisc_destroy(qdisc); - kfree(qdisc); -} TEST_F(homa_qdisc, homa_qdisc_enqueue__short_packet) { struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); @@ -380,6 +357,45 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__packet_not_homa) homa_qdisc_destroy(qdisc); kfree(qdisc); } +TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_homa_packet) +{ + struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct sk_buff *skb, *to_free; + struct homa_qdisc *q; + u64 idle; + + /* First packet is deferred because the NIC queue is full. */ + EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); + q = qdisc_priv(qdisc); + idle = mock_clock + 1 + self->homa.pacer->max_nic_queue_cycles + 1; + atomic64_set(&q->qdev->link_idle_time, idle); + skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); + qdisc_skb_cb(skb)->pkt_len = 1500; + to_free = NULL; + + EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, qdisc, &to_free)); + EXPECT_EQ(NULL, to_free); + EXPECT_EQ(1, q->qdev->homa_deferred.qlen); + EXPECT_STREQ("wake_up_process pid 0", unit_log_get()); + + /* Second packet is deferred even though NIC not busy, because + * there are other packets waiting. + */ + atomic64_set(&q->qdev->link_idle_time, 0); + self->data.common.sender_id = cpu_to_be64(101); + skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); + qdisc_skb_cb(skb)->pkt_len = 1500; + to_free = NULL; + + unit_log_clear(); + EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, qdisc, &to_free)); + EXPECT_EQ(NULL, to_free); + EXPECT_EQ(1, q->qdev->homa_deferred.qlen); + EXPECT_STREQ("", unit_log_get()); + + homa_qdisc_destroy(qdisc); + kfree(qdisc); +} TEST_F(homa_qdisc, homa_qdisc_enqueue__drop_packet_queue_over_limit) { struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); diff --git a/test/unit_homa_timer.c b/test/unit_homa_timer.c index 02d86c73..4ce452db 100644 --- a/test/unit_homa_timer.c +++ b/test/unit_homa_timer.c @@ -62,11 +62,13 @@ TEST_F(homa_timer, homa_timer_check_rpc__request_ack) /* First call: do nothing (response not fully transmitted). */ homa_rpc_lock(srpc); + homa_xmit_data(srpc, false); + skb_get(srpc->msgout.packets); homa_timer_check_rpc(srpc); EXPECT_EQ(0, srpc->done_timer_ticks); + kfree_skb(srpc->msgout.packets); /* Second call: set done_timer_ticks. 
*/ - homa_xmit_data(srpc, false); unit_log_clear(); homa_timer_check_rpc(srpc); EXPECT_EQ(100, srpc->done_timer_ticks); diff --git a/util/strip_decl.py b/util/strip_decl.py index 1e3b83d1..22edd6bf 100755 --- a/util/strip_decl.py +++ b/util/strip_decl.py @@ -44,6 +44,7 @@ 'int homa_message_out_fill(', 'void homa_message_out_init(', 'void homa_resend_data(', + 'int homa_rpc_tx_end(', 'struct sk_buff *homa_tx_data_pkt_alloc(', 'int __homa_xmit_control(', 'void __homa_xmit_data(', From 06a440ecd89b035728731851e039808c11a2d9d3 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 8 Aug 2025 14:19:19 -0700 Subject: [PATCH 432/625] Add new note to perf.txt --- perf.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/perf.txt b/perf.txt index b06af455..b4ecb49b 100644 --- a/perf.txt +++ b/perf.txt @@ -2,6 +2,14 @@ This file contains various notes and lessons learned concerning performance of the Homa Linux kernel module. The notes are in reverse chronological order. +61. (July 2025) Client responses could starve server requests. This came +about because a server request that wakes up after waiting for buffer space +has 0 received bytes. In contrast, a new client response will have received +unscheduled bytes. As a result, the client responses always got priority for +new grants and server requests could starve. The solution was to grant server +requests an amount equal to the unscheduled bytes when they wake up after +waiting for buffer space. + 60. (July 2025) Measured impact of new FIFO grant mechanism on xl170 cluster using "-w starve -b 40 -s 30 -n 6" (priorities were not enabled). Slowdowns as a function of message length: From 33ed99b78f962cec06e1c5bd1295322ebfc76d94 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 8 Aug 2025 18:33:17 -0700 Subject: [PATCH 433/625] Fix various issues in tthoma.py * Several of these were latent bugs. * Other changes were required by the presence of homa_qdisc data. --- util/tthoma.py | 285 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 190 insertions(+), 95 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index 8a62863f..7824b963 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -149,9 +149,14 @@ def __missing__(self, id): # value is a dictionary containing the following fields (some may not # be present, depending on which events were present in the traces): # xmit: Time when ip*xmit was invoked +# qdisc_defer: If the packet was deferred by homa_qdisc, gives the +# time when the deferral decision was made. This field +# exists only for packets that were deferred. # qdisc_xmit: Time when homa_qdisc requeued a packet that was deferred # because of NIC queue length (only present for deferred # packets) +# xmit2: qdisc_xmit if it exists, otherwise xmit: a time when Homa +# has decided to transmit the packet (after any Homa queuing). 
# nic: Time when packet was handed off to the NIC (if available) # gro: Time when GRO received the packet # softirq: Time when homa_softirq processed the packet @@ -239,7 +244,9 @@ def __missing__(self, key): # of the end of the interval # tx_starts: Number of new outgoing messages that started in the interval # tx_pkts: Number of data packets passed to ip*xmit during the interval +# (or requeued by homa_qdisc after deferral) # tx_bytes: Number of bytes of data passed to ip*xmit during the interval +# (or requeued by homa_qdisc after deferral) # tx_nic_pkts: Number of data packets passed to the NIC during the interval # tx_nic_bytes: Number of bytes of data passed to the NIC during the interval # tx_in_nic: Number of bytes of data that have been passed to the NIC @@ -1612,6 +1619,18 @@ def __pacer_xmit(self, trace, time, core, match, interests): '([0-9]+), offset ([0-9]+), bytes_left ([0-9]+)' }) + def __qdisc_defer(self, trace, time, core, match, interests): + id = int(match.group(1)) + offset = int(match.group(2)) + for interest in interests: + interest.tt_qdisc_defer(trace, time, core, id, offset) + + patterns.append({ + 'name': 'qdisc_defer', + 'regexp': 'homa_qdisc_enqueue deferring homa data packet for ' + 'id ([0-9]+), offset ([0-9]+)' + }) + def __qdisc_xmit(self, trace, time, core, match, interests): id = int(match.group(1)) offset = int(match.group(2)) @@ -1913,10 +1932,10 @@ def output(self): elapsed = traces[node]['elapsed_time'] msgs, liveFrac, avgLive = self.sum_list(events) rate = 1e3 * self.node_in_starts[node] / elapsed - gbps = total_bytes*8e-3 / elapsed + avg_gbps = total_bytes*8e-3 / elapsed print('%-10s %6d %7.2f %9.3f %8.2f %7.2f %7.2f' % ( - node, msgs, rate, liveFrac, avgLive, gbps, - gbps/liveFrac), end='') + node, msgs, rate, liveFrac, avgLive, avg_gbps, + avg_gbps/liveFrac), end='') print(' %5.2f (C%02d) %6.3f (C%02d) %6.3f (C%02d)' % ( max_gbps, max_core, max_rpcs/total_rpcs if total_rpcs != 0 else 0, @@ -1934,10 +1953,10 @@ def output(self): events = sorted(self.node_out_msgs[node]) msgs, liveFrac, avgLive = self.sum_list(events) rate = 1e3 * self.node_out_starts[node] / elapsed - gbps = bytes*8e-3 / elapsed + avg_gbps = bytes*8e-3 / elapsed print('%-10s %6d %7.2f %9.3f %8.2f %7.2f %7.2f' % ( - node, msgs, rate, liveFrac, avgLive, gbps, - gbps/liveFrac)) + node, msgs, rate, liveFrac, avgLive, avg_gbps, + avg_gbps/liveFrac)) if options.data: for node in get_sorted_nodes(): @@ -4184,8 +4203,8 @@ def add_grant_info(self, rpc): """ # List of tuples, where event is one of: - # grant_xmit: time and offset describe a grant passed to ip*xmit - # by receiver + # grant_xmit: time and offset describe a grant passed to ip*xmit + # by receiver # grant_softirq: time and offset describe a grant processed by # SoftIRQ on sender. 
# data: time and offset describe a data packet passed to @@ -4195,33 +4214,37 @@ def add_grant_info(self, rpc): events = [] id = rpc['id'] - if id^1 in rpcs: - peer = rpcs[id^1] - peer_node = peer['node'] - for t, offset, prio, increment in peer['send_grant']: - events.append([t, 'grant_xmit', offset]) - grant_xmit_offset = 0 grant_softirq_offset = 0 data_offset = 1e20 prev_time = 0 + if id^1 in rpcs: + tx_rpc = rpcs[id^1] + tx_node = tx_rpc['node'] + for t, offset in tx_rpc['softirq_grant']: + events.append([t, 'grant_softirq', offset]) + for pkt in tx_rpc['send_data_pkts']: + if 'xmit2' in pkt and 'tso_length' in pkt: + offset = pkt['offset'] + events.append([pkt['xmit2'], 'data', + offset + pkt['tso_length']]) + if offset < data_offset: + # Computes initial data_offset, for cases where initial + # data packets aren't in the trace + data_offset = offset + node = rpc['node'] - for t, offset in rpc['softirq_grant']: - events.append([t, 'grant_softirq', offset]) - for t, offset, length in rpc['send_data']: - events.append([t, 'data', offset+length]) - if offset < data_offset: - # Computes initial data_offset, for cases where initial - # data packets aren't in the trace - data_offset = offset + for t, offset, prio, increment in rpc['send_grant']: + events.append([t, 'grant_xmit', offset]) if not events: return + for t, event, offset in sorted(events, key=lambda t : t[0]): if grant_xmit_offset > data_offset: - add_to_intervals(peer_node, prev_time, t, 'rx_granted', + add_to_intervals(node, prev_time, t, 'rx_granted', grant_xmit_offset - data_offset) if grant_softirq_offset > data_offset: - add_to_intervals(node, prev_time, t, 'tx_grant_avl', + add_to_intervals(tx_node, prev_time, t, 'tx_grant_avl', grant_softirq_offset - data_offset) if event == 'grant_xmit': if offset > grant_xmit_offset: @@ -4295,7 +4318,7 @@ def analyze(self): print('Packet with no length: %s' % (pkt)) continue length = pkt['length'] - txmit = pkt['xmit'] if 'xmit' in pkt else None + txmit = pkt['xmit2'] if 'xmit2' in pkt else None if 'nic' in pkt: tnic = pkt['nic'] nic_interval = get_interval(tx_node, tnic) @@ -4481,7 +4504,7 @@ def analyze(self): if offset == 0: get_interval(node, t)['rx_starts'] += 1 - # tx_grant_avl + # tx_grant_avl and rx_granted self.add_grant_info(rpc) # rx_grantable @@ -6287,6 +6310,14 @@ def tt_pacer_xmit(self, trace, t, core, id, offset, port, bytes_left): p = p['retransmits'][-1] p['pacer'] = True + def tt_qdisc_defer(self, trace, t, core, id, offset): + global packets + p = packets[pkt_id(id, offset)] + p['tx_node'] = trace['node'] + if p['retransmits']: + p = p['retransmits'][-1] + p['qdisc_defer'] = t + def tt_qdisc_xmit(self, trace, t, core, id, offset, bytes_left): global packets p = packets[pkt_id(id, offset)] @@ -6380,6 +6411,11 @@ def analyze(self): if 'peer' in tx_rpc: pkt['rx_node'] = peer_nodes[tx_rpc['peer']] + if 'qdisc_xmit' in pkt: + pkt['xmit2'] = pkt['qdisc_xmit'] + elif ('xmit' in pkt) and (not 'qdisc_defer' in pkt): + pkt['xmit2'] = pkt['xmit'] + # Make sure that all of the smaller packets deriving from each # TSO packet are represented and properly populated (if one of # these packets is lost it won't be represented yet). 
@@ -6404,8 +6440,8 @@ def analyze(self): pkt2 = {'offset': offset, 'length': length, 'retransmits': []} new_pkts.append([pid, pkt2]) - for key in ['xmit', 'nic', 'id', 'msg_length', - 'priority', 'tx_node', 'tx_core', + for key in ['xmit', 'qdisc_xmit', 'xmit2', 'nic', 'id', + 'msg_length', 'priority', 'tx_node', 'tx_core', 'free_tx_skb']: if key in pkt: pkt2[key] = pkt[key] @@ -7109,10 +7145,11 @@ def output(self): f.write('# data packets have not been transmitted by ' 'the peer\n') f.write('# IP: KB of data that have been passed to ip*xmit ' - 'on sender but not\n') - f.write('# yet transmitted by NIC (large numbers ' - 'probably indicate qdisc\n') - f.write('# backup)\n') + 'on sender (or\n') + f.write(' requeued by homa_qdisc after being ' + 'deferred) but not yet\n') + f.write(' transmitted by NIC; large numbers probably ' + 'indicate qdisc backup\n') f.write('# Net: KB of data that have been passed to the ' 'NIC but not\n') f.write('# yet received by GRO\n') @@ -7278,16 +7315,16 @@ def collect_live_rpcs(node, t, receive): following values: pkts: List of all the data packets in this RPC grants: List of all the grant packets in this RPC - pre_xmit: Offset just after highest byte sent in a data - packet with 'xmit' < target time - post_xmit: Lowest offset contained in a data packet with - 'xmit' >= target time + pre_xmit2: Offset just after highest byte sent in a data + packet with 'xmit2' < target time + post_xmit2: Lowest offset contained in a data packet with + 'xmit2' >= target time pre_gro and post_gro: - Same, except measured with 'gro' instead of 'xmit' + Same, except measured with 'gro' instead of 'xmit2' pre_softirq and post_softirq: - Same, except measured with 'softirq' instead of 'xmit' + Same, except measured with 'softirq' instead of 'xmit2' pre_copied and post_copied: - Same, except measured with 'copied' instead of 'xmit' + Same, except measured with 'copied' instead of 'xmit2' The following offsets record things that happened either before or after the target time. @@ -7309,12 +7346,12 @@ def collect_live_rpcs(node, t, receive): sorting the RPCs in "closest to completion" order. sort_grant_xmit pre_grant_xmit (if nonzero) else sort_grant_gro sort_grant_gro pre_grant_gro (if nonzero) else sort_grant_softirq - sort_grant_softirq pre_grant_softirq (if nonzero) else pre_xmit + sort_grant_softirq pre_grant_softirq (if nonzero) else pre_xmit2 """ global packets, grants, rpcs, options, traces live_rpcs = defaultdict(lambda : {'pkts': [], 'grants': [], - 'pre_xmit': 0, 'post_xmit': 1e20, + 'pre_xmit2': 0, 'post_xmit2': 1e20, 'pre_gro': 0, 'post_gro': 1e20, 'pre_softirq': 0, 'post_softirq': 1e20, 'pre_copied': 0, 'post_copied': 1e20, @@ -7326,9 +7363,10 @@ def collect_live_rpcs(node, t, receive): def check_live(tx_id, node, t, receive): """ - If receive is True, returns whether tx_id is live for receiving - on node at t. Otherwise returns whether tx_id is live for sending - on node at t. In either case, tx_id is the RPC id on the sender. + If receive is True, returns whether the RPC given by tx_id is live + for receiving on node at t. Otherwise returns whether tx_id is live + for sending on node at t. In either case, tx_id is the RPC id on + the sender. 
""" if receive: if not tx_id^1 in rpcs: @@ -7361,7 +7399,7 @@ def check_live(tx_id, node, t, receive): offset = pkt['offset'] end_offset = offset + pkt['length'] - for type in ['xmit', 'gro', 'softirq', 'copied']: + for type in ['xmit2', 'gro', 'softirq', 'copied']: if (type in pkt): pkt_time = pkt[type] if pkt_time < t: @@ -7400,64 +7438,71 @@ def check_live(tx_id, node, t, receive): if not check_live(id, node, t, receive): continue if 'sent' in tx_rpc: - live_rpcs[id]['pre_xmit'] = tx_rpc['sent'] + live_rpcs[id]['pre_xmit2'] = tx_rpc['sent'] else: - live_rpcs[id]['pre_xmit'] = 0 + live_rpcs[id]['pre_xmit2'] = 0 - # Deduce missing fields in RPCs where possible + # Deduce missing fields (or improve estimates) in RPCs where possible for id, live_rpc in live_rpcs.items(): next_stage = 0 if id^1 in rpcs: rx_rpc = rpcs[id^1] else: rx_rpc = {} - if 'remaining' in rx_rpc and live_rpc['pre_softirq'] == 0: - live_rpc['pre_softirq'] = rx_rpc['in_length'] - rx_rpc['remaining'] - for type in ['copied', 'softirq', 'gro', 'xmit']: + if 'remaining' in rx_rpc: + rcvd = rx_rpc['in_length'] - rx_rpc['remaining'] + if ((rcvd > live_rpc['post_copied']) and + (live_rpc['post_softirq'] == 0) and + (live_rpc['post_gro'] == 0) and + (live_rpc['post_xmit2'] == 0)): + live_rpc['post_copied'] = rcvd + for type in ['copied', 'softirq', 'gro', 'xmit2']: pre_field = 'pre_' + type post_field = 'post_' + type pre = live_rpc[pre_field] post = live_rpc[post_field] - if pre == 0: - # There were no packets with times before the target; - # However, we can infer the field value from the first - # packet after the target time, if any, or from information - # from the next later stage (e.g. if a byte got to SoftIRQ - # it must have been received by GRO). - if post < 1e20: - live_rpc[pre_field] = post - elif next_stage != 0: - live_rpc[pre_field] = next_stage + + # We can correct for missing information by using packets + # after the target time, or packets from the next stage: + # (e.g. if a byte got to SoftIRQ it must have been received + # by GRO). + if post < 1e20 and post > pre: + pre = post + if next_stage > pre: + pre = next_stage + live_rpc[pre_field] = pre next_stage = pre + # Deduce missing grant fields where possible. next_stage = 0 unsched = 0 if 'unsched' in rx_rpc: unsched = rx_rpc['unsched'] - if 'granted' in rx_rpc and live_rpc['pre_grant_xmit'] == 0: - live_rpc['pre_grant_xmit'] = rx_rpc['granted'] + if 'granted' in rx_rpc and live_rpc['post_grant_softirq'] == 0: + live_rpc['post_grant_softirq'] = rx_rpc['granted'] + if (unsched > 0 and live_rpc['pre_xmit2'] > unsched and + live_rpc['pre_xmit2'] > live_rpc['pre_grant_softirq']): + # We sent unscheduled packets: they must have been granted. + live_rpc['pre_grant_softirq'] = live_rpc['pre_xmit2'] for type in ['softirq', 'gro', 'xmit']: pre_field = 'pre_grant_' + type post_field = 'post_grant_' + type pre = live_rpc[pre_field] post = live_rpc[post_field] - if pre == 0: - if post < 1e20: - live_rpc[pre_field] = post - elif next_stage != 0: - live_rpc[pre_field] = next_stage - elif (type == 'softirq') and (unsched > 0) and ( - live_rpc['pre_xmit'] > unsched): - live_rpc[pre_field] = live_rpc['pre_xmit'] + if post < 1e20 and post > pre: + pre = post + if next_stage > pre: + pre = next_stage + if pre <= unsched: + pre= 0 + live_rpc[pre_field] = pre next_stage = pre - if live_rpc[pre_field] <= unsched: - live_rpc[pre_field] = 0 # Fields for sorting. 
if live_rpc['pre_grant_softirq']: live_rpc['sort_grant_softirq'] = live_rpc['pre_grant_softirq'] else: - live_rpc['sort_grant_softirq'] = live_rpc['pre_xmit'] + live_rpc['sort_grant_softirq'] = live_rpc['pre_xmit2'] if live_rpc['pre_grant_gro']: live_rpc['sort_grant_gro'] = live_rpc['pre_grant_gro'] else: @@ -7469,9 +7514,12 @@ def check_live(tx_id, node, t, receive): # Count lost packets in the RPC. for pkt in live_rpc['pkts']: - if (('xmit' in pkt) and (not 'gro' in pkt) - and ((options.time - pkt['xmit']) > 200)): + if (('xmit2' in pkt) and (not 'gro' in pkt) + and (pkt['xmit2'] >= traces[node]['first_time']) + and ((options.time - pkt['xmit2']) > 200) + and (options.time < traces[node]['last_time'])): live_rpc['lost'] += 1 + print('Lost packet: %s' % (pkt)) return live_rpcs def get_sorted_ids(live_rpcs): @@ -7482,8 +7530,11 @@ def get_sorted_ids(live_rpcs): """ def sort_key(live_rpcs, id, field): - length = rpcs[id]['out_length'] - if length == None: + if id in rpcs: + length = rpcs[id]['out_length'] + if length == None: + length = 0 + else: length = 0 if not field in live_rpcs[id]: print('Missing field %s in id %d: %s' % (field, id, live_rpcs[id])) @@ -7498,7 +7549,7 @@ def sort_key(live_rpcs, id, field): sorted_ids = sorted(sorted_ids, key = lambda id : sort_key(live_rpcs, id, 'pre_gro')) sorted_ids = sorted(sorted_ids, - key = lambda id : sort_key(live_rpcs, id, 'pre_xmit')) + key = lambda id : sort_key(live_rpcs, id, 'pre_xmit2')) sorted_ids = sorted(sorted_ids, key = lambda id : sort_key(live_rpcs, id, 'sort_grant_softirq')) sorted_ids = sorted(sorted_ids, @@ -7542,8 +7593,9 @@ def output(self): print('GGro: Highest offset in grant that has been received by GRO') print('GSoft: Highest offset in grant that has been processed ' 'by SoftIRQ') - print('Xmit: Offset just after last data byte that has been ' + print('Xmit: Offset just after last byte that has been ' 'passed to ip*xmit') + print(' or requeued by homa_qdisc after deferral)') print('Gro: Offset just after last data byte that has been ' 'processed by GRO') print('SoftIrq: Offset just after last data byte that has been ' @@ -7580,7 +7632,7 @@ def output(self): if live_rpc['pre_grant_gro'] > 0 else "", str(live_rpc['pre_grant_softirq']) if live_rpc['pre_grant_softirq'] > 0 else ""), end='') - print('%7d %7d %7d %7d %7s %4s %4d' % (live_rpc['pre_xmit'], + print('%7d %7d %7d %7d %7s %4s %4d' % (live_rpc['pre_xmit2'], live_rpc['pre_gro'], live_rpc['pre_softirq'], live_rpc['pre_copied'], incoming, rank, live_rpc['lost'])) @@ -7590,7 +7642,9 @@ def output(self): print('TxCore: Core where sender passed packet to ip*xmit') print('GCore: Core where receiver GRO processed packet') print('SCore: Core where receiver SoftIRQ processed packet') - print('Xmit: Time when sender passed packet to ip*xmit') + print('Xmit: Time when sender passed packet to ip*xmit or when ' + 'sender qdisc') + print(' requeued packet after deferral, whichever is later') print('Nic: Time when sender handed off packet to NIC') print('Free: Time when packet buffer freed after tx') print('Gro: Time when receiver GRO processed packet') @@ -7599,6 +7653,8 @@ def output(self): 'preceding value') print('and the reference time') + # Generate a line with overall info about the state of incoming + # data for an RPC. 
for tx_id in sorted_ids: live_rpc = live_rpcs[tx_id] rx_rpc = rpcs[tx_id^1] @@ -7626,10 +7682,10 @@ def output(self): for pkt in live_rpc['pkts']: offset = pkt['offset'] keep = True - if 'xmit' in pkt: - if pkt['xmit'] >= options.time: + if 'xmit2' in pkt: + if pkt['xmit2'] >= options.time: keep = False - elif offset >= live_rpc['pre_xmit']: + elif offset >= live_rpc['pre_xmit2']: keep = False if 'gro' in pkt: if pkt['gro'] < options.time: @@ -7639,7 +7695,7 @@ def output(self): if keep: net_pkts.append(pkt) - keep = False + keep = True if 'gro' in pkt: if pkt['gro'] >= options.time: keep = False @@ -7662,7 +7718,7 @@ def output(self): if 'xmit' in pkt: if pkt['xmit'] > options.time: keep = False - elif offset > live_rpc['pre_xmit']: + elif offset > live_rpc['pre_xmit2']: keep = False if 'gro' in pkt: if pkt['gro'] <= options.time: @@ -7672,7 +7728,7 @@ def output(self): if keep: net_grants.append(pkt) - keep = False + keep = True if 'gro' in pkt: if pkt['gro'] > options.time: keep = False @@ -7698,8 +7754,8 @@ def output(self): 'Gro Free GCore') for pkt in net_pkts: print('%6d %7s %-10s %4s %7s %8s %7s %8s %7s %8s %5s' % ( - pkt['offset'], print_field_if(pkt, 'xmit', '%7.1f'), - print_field_if(pkt, 'xmit', '(%7.1f)', + pkt['offset'], print_field_if(pkt, 'xmit2', '%7.1f'), + print_field_if(pkt, 'xmit2', '(%7.1f)', lambda t : t - options.time ), print_field_if(pkt, 'tx_core', '%4d'), print_field_if(pkt, 'nic', '%7.1f'), @@ -8546,12 +8602,40 @@ def __init__(self, dispatcher): dispatcher.interest('AnalyzeRpcs') dispatcher.interest('AnalyzePackets') + def get_sorted_ids(self, live_rpcs): + """ + Given the results from collect_live_rpcs, return a list of the + ids in live_rpcs, sorted based on transmission priority (how close + each message is to fully transmitted). 
+ """ + + def sort_key(live_rpcs, id, field): + if id in rpcs: + length = rpcs[id]['out_length'] + if length == None: + length = 0 + else: + length = 0 + if not field in live_rpcs[id]: + print('Missing field %s in id %d: %s' % (field, id, live_rpcs[id])) + return length - live_rpcs[id][field] + + sorted_ids = sorted(live_rpcs.keys(), + key = lambda id : live_rpcs[id]['pre_xmit2'], + reverse = True) + sorted_ids = sorted(sorted_ids, + key = lambda id : rpcs[id]['sendmsg'] + if (id in rpcs) and ('sendmsg' in rpcs[id]) else 0) + sorted_ids = sorted(sorted_ids, + key = lambda id : sort_key(live_rpcs, id, 'pre_xmit2')) + return sorted_ids + def output(self): global packets, rpcs, options, traces live_rpcs = AnalyzeRxsnapshot.collect_live_rpcs(options.node, options.time, False) - sorted_ids = AnalyzeRxsnapshot.get_sorted_ids(live_rpcs) + sorted_ids = self.get_sorted_ids(live_rpcs) for id, rpc in rpcs.items(): if rpc['node'] != options.node: @@ -8575,6 +8659,8 @@ def output(self): print('Id: RPC identifier on the sender side') print('Peer: Receiving node') print('Length: Length of outgoing message, if known') + print('Window: Bytes that have been granted but not transmitted ' + '(Gsoft - Xmit)') print('Gxmit: Highest offset for which grant has been passed ' 'to ip_*xmit') print('GGro: Highest offset in grant that has been received by GRO') @@ -8582,6 +8668,7 @@ def output(self): 'by SoftIRQ') print('Xmit: Offset just after last data byte that has been ' 'passed to ip*xmit') + print(' or requeued by homa_qdisc after deferral') print('Gro: Offset just after last data byte that has been ' 'processed by GRO') print('SoftIrq: Offset just after last data byte that has been ' @@ -8590,9 +8677,11 @@ def output(self): 'copied to user space') print('Incoming: Gxmit - SoftIrq') print('Lost: Packets that appear to have been dropped in the network') - print(' Id Peer Length GXmit GGro GSoft ', end='') + print(' Id Peer Length Window GXmit GGro GSoft ', + end='') print(' Xmit Gro SoftIrq Copied Incoming Lost') - print('-------------------------------------------', end='') + print('--------------------------------------------------------------', + end='') print('---------------------------------------------') for id in sorted_ids: @@ -8600,15 +8689,21 @@ def output(self): live_rpc = live_rpcs[id] incoming = (live_rpc['pre_grant_xmit'] - live_rpc['pre_softirq'] if live_rpc['pre_grant_xmit'] > 0 else 0) - print('%10d %-10s %7s %7s %7s %7s ' % (id, get_rpc_node(id^1), + window = live_rpc['pre_grant_softirq'] - live_rpc['pre_xmit2'] + if window > 0: + window = str(window) + else: + window = "" + print('%10d %-10s %7s %7s %7s %7s %7s ' % (id, get_rpc_node(id^1), tx_rpc['out_length'] if tx_rpc['out_length'] != None else "", + window, str(live_rpc['pre_grant_xmit']) if live_rpc['pre_grant_xmit'] > 0 else "", str(live_rpc['pre_grant_gro']) if live_rpc['pre_grant_gro'] > 0 else "", str(live_rpc['pre_grant_softirq']) if live_rpc['pre_grant_softirq'] > 0 else ""), end='') - print('%7d %7d %7d %7d %7d %4d' % (live_rpc['pre_xmit'], + print('%7d %7d %7d %7d %7d %4d' % (live_rpc['pre_xmit2'], live_rpc['pre_gro'], live_rpc['pre_softirq'], live_rpc['pre_copied'], incoming, live_rpc['lost'])) From 651d80c2d5c64f6608337ef32ca6f08dca141bdb Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 12 Aug 2025 14:22:12 -0700 Subject: [PATCH 434/625] Fix various issues in rxsnapshot analyzer for tthoma.py --- util/tthoma.py | 75 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 55 insertions(+), 20 deletions(-) diff 
--git a/util/tthoma.py b/util/tthoma.py index 7824b963..55d0c3c3 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -91,6 +91,7 @@ # The following fields will be present if homa_rpc_log_active_tt was invoked # when the timetraces were frozen; they reflect the RPC's state at the end # of the trace. +# stats_time: Time when the information below was recorded # remaining: # of bytes in the incoming message still to be received # granted: # of bytes granted for the incoming message # sent: # of bytes that have been sent for the outgoing message @@ -6767,7 +6768,9 @@ def tt_rpc_incoming(self, trace, t, core, id, peer, received, length): def tt_rpc_incoming2(self, trace, t, core, id, incoming, granted): global rpcs - rpcs[id]['granted'] = granted + rpc = rpcs[id] + rpc['granted'] = granted + rpc['stats_time'] = t def tt_rpc_incoming3(self, trace, t, core, id, length, remaining, rank): global rpcs @@ -7315,6 +7318,13 @@ def collect_live_rpcs(node, t, receive): following values: pkts: List of all the data packets in this RPC grants: List of all the grant packets in this RPC + unsched: Number of bytes of unscheduled incoming data, + or 0 if unknown + min_time: Lowest "interesting" time seen in any packet + for this RPC + lost: Number of packets that appear to have been lost + (transmitted but not received after long delay) + pre_xmit2: Offset just after highest byte sent in a data packet with 'xmit2' < target time post_xmit2: Lowest offset contained in a data packet with @@ -7337,10 +7347,6 @@ def collect_live_rpcs(node, t, receive): pre_grant_softirq and post_grant_softirq: Same, except measured with 'softirq' instead of 'xmit' - min_time: Lowest "interesting" time seen in any packet - for this RPC - lost: Number of packets that appear to have been lost - (transmitted but not received after long delay) The following offsets are derived from those above and used for sorting the RPCs in "closest to completion" order. 
@@ -7348,7 +7354,7 @@ def collect_live_rpcs(node, t, receive): sort_grant_gro pre_grant_gro (if nonzero) else sort_grant_softirq sort_grant_softirq pre_grant_softirq (if nonzero) else pre_xmit2 """ - global packets, grants, rpcs, options, traces + global packets, grants, rpcs, options, traces, max_unsched live_rpcs = defaultdict(lambda : {'pkts': [], 'grants': [], 'pre_xmit2': 0, 'post_xmit2': 1e20, @@ -7358,7 +7364,7 @@ def collect_live_rpcs(node, t, receive): 'pre_grant_xmit': 0, 'post_grant_xmit': 1e20, 'pre_grant_gro': 0, 'post_grant_gro': 1e20, 'pre_grant_softirq': 0, 'post_grant_softirq': 1e20, - 'lost': 0, 'min_time': 1e20 + 'lost': 0, 'min_time': 1e20, 'unsched': max_unsched }) def check_live(tx_id, node, t, receive): @@ -7451,10 +7457,7 @@ def check_live(tx_id, node, t, receive): rx_rpc = {} if 'remaining' in rx_rpc: rcvd = rx_rpc['in_length'] - rx_rpc['remaining'] - if ((rcvd > live_rpc['post_copied']) and - (live_rpc['post_softirq'] == 0) and - (live_rpc['post_gro'] == 0) and - (live_rpc['post_xmit2'] == 0)): + if live_rpc['post_copied'] > 1e19: live_rpc['post_copied'] = rcvd for type in ['copied', 'softirq', 'gro', 'xmit2']: pre_field = 'pre_' + type @@ -7478,7 +7481,8 @@ def check_live(tx_id, node, t, receive): unsched = 0 if 'unsched' in rx_rpc: unsched = rx_rpc['unsched'] - if 'granted' in rx_rpc and live_rpc['post_grant_softirq'] == 0: + live_rpc['unsched'] = unsched + if 'granted' in rx_rpc and live_rpc['post_grant_softirq'] >= 1e19: live_rpc['post_grant_softirq'] = rx_rpc['granted'] if (unsched > 0 and live_rpc['pre_xmit2'] > unsched and live_rpc['pre_xmit2'] > live_rpc['pre_grant_softirq']): @@ -7489,14 +7493,20 @@ def check_live(tx_id, node, t, receive): post_field = 'post_grant_' + type pre = live_rpc[pre_field] post = live_rpc[post_field] + if id == 999999: + print('Id %d before inference post_grant_%s %d, ' + 'pre_grant_%s %d, next_stage %d' % (id, type, + post, type, pre, next_stage)) if post < 1e20 and post > pre: pre = post if next_stage > pre: pre = next_stage - if pre <= unsched: - pre= 0 live_rpc[pre_field] = pre next_stage = pre + if id == 999999: + print('Id %d after inference post_grant_%s %d, ' + 'pre_grant_%s %d, next_stage %d' % (id, type, + live_rpc[post_field], type, pre, next_stage)) # Fields for sorting. if live_rpc['pre_grant_softirq']: @@ -7570,6 +7580,19 @@ def sort_key(live_rpcs, id, field): return sorted_ids + def count_data(self, rpc, start_time, end_time): + """ + Return a count of the number of message bytes present in all + data packets received for @rpc between @start_time and @end_time. 
+ """ + + result = 0 + for pkt in rpc['softirq_data_pkts']: + softirq = pkt['softirq'] + if (softirq >= start_time) and (softirq < end_time): + result += pkt['length'] + return result + def output(self): global packets, rpcs, options, traces @@ -7595,7 +7618,7 @@ def output(self): 'by SoftIRQ') print('Xmit: Offset just after last byte that has been ' 'passed to ip*xmit') - print(' or requeued by homa_qdisc after deferral)') + print(' or requeued by homa_qdisc after deferral') print('Gro: Offset just after last data byte that has been ' 'processed by GRO') print('SoftIrq: Offset just after last data byte that has been ' @@ -7615,7 +7638,13 @@ def output(self): for id in sorted_ids: rx_rpc = rpcs[id^1] live_rpc = live_rpcs[id] - incoming = live_rpc['pre_grant_xmit'] - live_rpc['pre_softirq'] + post_data = self.count_data(rx_rpc, options.time, + rx_rpc['stats_time'] if 'stats_time' in rx_rpc else 1e20) + if 'remaining' in rx_rpc: + received = rx_rpc['in_length'] - rx_rpc['remaining'] - post_data + else: + received = rx_rpc['in_length'] - post_data + incoming = live_rpc['pre_grant_xmit'] - received if incoming <= 0: incoming = '' rank = '' @@ -7627,11 +7656,12 @@ def output(self): rpcs[id]['node'] if id in rpcs else "", rx_rpc['in_length'] if rx_rpc['in_length'] != None else "", str(live_rpc['pre_grant_xmit']) - if live_rpc['pre_grant_xmit'] > 0 else "", + if live_rpc['pre_grant_xmit'] > live_rpc['unsched'] else "", str(live_rpc['pre_grant_gro']) - if live_rpc['pre_grant_gro'] > 0 else "", + if live_rpc['pre_grant_gro'] > live_rpc['unsched'] else "", str(live_rpc['pre_grant_softirq']) - if live_rpc['pre_grant_softirq'] > 0 else ""), end='') + if live_rpc['pre_grant_softirq'] > live_rpc['unsched'] + else ""), end='') print('%7d %7d %7d %7d %7s %4s %4d' % (live_rpc['pre_xmit2'], live_rpc['pre_gro'], live_rpc['pre_softirq'], live_rpc['pre_copied'], incoming, rank, live_rpc['lost'])) @@ -7655,6 +7685,7 @@ def output(self): # Generate a line with overall info about the state of incoming # data for an RPC. 
+ trace_start = traces[options.node]['first_time'] for tx_id in sorted_ids: live_rpc = live_rpcs[tx_id] rx_rpc = rpcs[tx_id^1] @@ -7683,7 +7714,11 @@ def output(self): offset = pkt['offset'] keep = True if 'xmit2' in pkt: - if pkt['xmit2'] >= options.time: + xmit2 = pkt['xmit2'] + if xmit2 >= options.time: + keep = False + if ((xmit2 < trace_start) and (not 'gro' in pkt) and + (not 'copied' in pkt)): keep = False elif offset >= live_rpc['pre_xmit2']: keep = False From be1f19df3b2b1239b77deafa10f06413f11aa261 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 12 Aug 2025 14:26:29 -0700 Subject: [PATCH 435/625] Miscellaneous improvements in tt_record calls --- homa_grant.c | 18 +++++++++++------- homa_outgoing.c | 8 +------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 7c661bd9..c730628f 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -629,9 +629,8 @@ int homa_grant_update_granted(struct homa_rpc *rpc, struct homa_grant *grant) incoming_delta = new_grant_offset - received - rpc->msgin.rec_incoming; avl_incoming = grant->max_incoming - atomic_read(&grant->total_incoming); if (avl_incoming < incoming_delta) { - tt_record4("insufficient headroom for grant for RPC id %d (rank %d): desired incoming %d, shortfall %d", - rpc->id, rank, new_grant_offset - received, - incoming_delta - avl_incoming); + tt_record4("insufficient headroom for grant for RPC id %d (rank %d): desired increment %d, available %d", + rpc->id, rank, incoming_delta, avl_incoming); prev_stalled = atomic_read(&grant->stalled_rank); while (prev_stalled > rank) prev_stalled = atomic_cmpxchg(&grant->stalled_rank, @@ -740,8 +739,8 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) grant->next_recalc = now + grant->recalc_cycles; needy_rank = homa_grant_fix_order(grant); homa_grant_unlock(grant); - tt_record1("homa_grant_check_rpc released grant lock (id %d)", - rpc->id); + tt_record2("homa_grant_check_rpc released grant lock (id %d, needy_rank %d)", + rpc->id, needy_rank); INC_METRIC(grant_check_recalcs, 1); } @@ -804,7 +803,8 @@ void homa_grant_check_rpc(struct homa_rpc *rpc) } INC_METRIC(grant_check_locked, locked); - tt_record1("homa_grant_check_rpc finished with id %d", rpc->id); + tt_record2("homa_grant_check_rpc finished with id %d, total_incoming %d", + rpc->id, atomic_read(&grant->total_incoming)); } /** @@ -889,8 +889,10 @@ void homa_grant_find_oldest(struct homa_grant *grant) oldest_birth = rpc->msgin.birth; } - if (oldest) + if (oldest) { homa_rpc_hold(oldest); + tt_record1("homa_grant_find_oldest chose id %d", oldest); + } grant->oldest_rpc = oldest; } @@ -1009,6 +1011,8 @@ void homa_grant_check_fifo(struct homa_grant *grant) } homa_grant_cand_init(&cand); rpc->msgin.granted += grant->fifo_grant_increment; + tt_record2("homa_grant_check_fifo granted %d more bytes to id %d", + grant->fifo_grant_increment, rpc->id); if (rpc->msgin.granted >= rpc->msgin.length) { INC_METRIC(fifo_grant_bytes, grant->fifo_grant_increment + rpc->msgin.length - diff --git a/homa_outgoing.c b/homa_outgoing.c index 143d79d4..391724f7 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -897,16 +897,10 @@ int homa_rpc_tx_end(struct homa_rpc *rpc) * transmission is complete). 
*/ if (homa_info->offset >= rpc->msgout.next_xmit_offset || - refcount_read(&skb->users) > 1) { - tt_record3("homa_rpc_tx_complete id %d tx up to %d/%d", - rpc->id, homa_info->offset, - rpc->msgout.length); + refcount_read(&skb->users) > 1) return homa_info->offset; - } skb = homa_info->next_skb; rpc->msgout.first_not_tx = skb; } - tt_record2("homa_rpc_tx_complete id %d fully transmitted (%d bytes)", - rpc->id, rpc->msgout.length); return rpc->msgout.length; } From 4fa23cd3cc9e24a8a90b8a0bf2b31b8104e2e995 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 18 Aug 2025 11:09:52 -0700 Subject: [PATCH 436/625] Fix compilation error in tt_record statement in homa_grant.c --- homa_grant.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/homa_grant.c b/homa_grant.c index c730628f..207ec941 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -891,7 +891,7 @@ void homa_grant_find_oldest(struct homa_grant *grant) if (oldest) { homa_rpc_hold(oldest); - tt_record1("homa_grant_find_oldest chose id %d", oldest); + tt_record1("homa_grant_find_oldest chose id %d", oldest->id); } grant->oldest_rpc = oldest; } From 5b6fd3aff8e9fe43a636d01da0d20979a9e737a6 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 18 Aug 2025 11:32:11 -0700 Subject: [PATCH 437/625] Add "WITH Linux-syscall-note" to license in homa.h --- homa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/homa.h b/homa.h index e44754df..fffdfb69 100644 --- a/homa.h +++ b/homa.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ WITH Linux-syscall-note */ /* This file defines the kernel call interface for the Homa * transport protocol. From 96eb443221fc64ef074e4ca2ae13971eda54a851 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 18 Aug 2025 12:11:17 -0700 Subject: [PATCH 438/625] Fix checkpatch.pl issues --- homa_qdisc.c | 6 +++--- homa_timer.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/homa_qdisc.c b/homa_qdisc.c index 32583304..1f106776 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -263,9 +263,9 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (is_homa_pkt(skb)) { h = (struct homa_data_hdr *)skb_transport_header(skb); tt_record4("homa_qdisc_enqueue queuing homa data packet for id %d, offset %d, bytes_left %d on qid %d", - be64_to_cpu(h->common.sender_id), - ntohl(h->seg.offset), - homa_get_skb_info(skb)->bytes_left, q->ix); + be64_to_cpu(h->common.sender_id), + ntohl(h->seg.offset), + homa_get_skb_info(skb)->bytes_left, q->ix); } else { tt_record2("homa_qdisc_enqueue queuing non-homa packet, qix %d, pacer_qix %d", q->ix, qdev->pacer_qix); diff --git a/homa_timer.c b/homa_timer.c index 3b13fe17..39fb0b19 100644 --- a/homa_timer.c +++ b/homa_timer.c @@ -32,7 +32,7 @@ void homa_timer_check_rpc(struct homa_rpc *rpc) /* See if we need to request an ack for this RPC. */ if (!homa_is_client(rpc->id) && rpc->state == RPC_OUTGOING && - tx_end == rpc->msgout.length) { + tx_end == rpc->msgout.length) { if (rpc->done_timer_ticks == 0) { rpc->done_timer_ticks = homa->timer_ticks; } else { From 624cd2fd83af2fa86e8e089fd28a45410702e3b5 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 19 Aug 2025 22:01:25 -0700 Subject: [PATCH 439/625] Fix bug in computing num_grantable_rpcs in homa_grant.c Accidentally decremented it when unmanaging an RPC that was already unmanaged. 
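In essence, the fix records whether the RPC was actually removed from one of the grant lists and only updates the accounting in that case. A minimal sketch of the pattern, distilled from the diff below (locking, the candidate-list handling, and the window recalculation are elided):

	bool removed = false;

	if (rpc->msgin.rank >= 0) {
		homa_grant_remove_active(rpc, cand);
		removed = true;
	}
	if (!list_empty(&rpc->grantable_links)) {
		homa_grant_remove_grantable(rpc);
		removed = true;
	}
	/* Only adjust the statistics when the RPC was actually managed;
	 * unmanaging an already-unmanaged RPC must leave
	 * num_grantable_rpcs unchanged.
	 */
	if (removed)
		grant->num_grantable_rpcs--;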
--- homa_grant.c | 26 ++++++++++++++++---------- test/unit_homa_grant.c | 17 +++++++++++++++++ 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/homa_grant.c b/homa_grant.c index 207ec941..f96e276f 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -548,22 +548,28 @@ void homa_grant_unmanage_rpc(struct homa_rpc *rpc, __must_hold(rpc->bucket->lock) { struct homa_grant *grant = rpc->hsk->homa->grant; + bool removed = false; u64 time = homa_clock(); homa_grant_lock(grant); - INC_METRIC(grantable_rpcs_integral, grant->num_grantable_rpcs - * (time - grant->last_grantable_change)); - grant->last_grantable_change = time; - grant->num_grantable_rpcs--; - tt_record2("Decremented num_grantable_rpcs to %d, id %d", - grant->num_grantable_rpcs, rpc->id); - - if (rpc->msgin.rank >= 0) + if (rpc->msgin.rank >= 0) { homa_grant_remove_active(rpc, cand); - if (!list_empty(&rpc->grantable_links)) + removed = true; + } + if (!list_empty(&rpc->grantable_links)) { homa_grant_remove_grantable(rpc); - grant->window = homa_grant_window(grant); + removed = true; + } + if (removed) { + INC_METRIC(grantable_rpcs_integral, grant->num_grantable_rpcs + * (time - grant->last_grantable_change)); + grant->last_grantable_change = time; + grant->num_grantable_rpcs--; + tt_record2("Decremented num_grantable_rpcs to %d, id %d", + grant->num_grantable_rpcs, rpc->id); + grant->window = homa_grant_window(grant); + } if (rpc == grant->oldest_rpc) { homa_rpc_put(rpc); grant->oldest_rpc = NULL; diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index c2eb73a8..a52761a1 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -884,6 +884,23 @@ TEST_F(homa_grant, homa_grant_unmanage_rpc__basics) EXPECT_EQ(0, self->homa.grant->num_grantable_rpcs); EXPECT_EQ(60000, self->homa.grant->window); } +TEST_F(homa_grant, homa_grant_unmanage_rpc__rpc_not_managed) +{ + struct homa_rpc *rpc; + + self->homa.grant->max_rpcs_per_peer = 1; + self->homa.grant->window_param = 0; + self->homa.grant->max_incoming = 60000; + self->homa.grant->last_grantable_change = 100; + mock_clock = 250; + rpc = test_rpc(self, 200, self->server_ip, 30000); + EXPECT_EQ(0, self->homa.grant->num_grantable_rpcs); + + homa_grant_unmanage_rpc(rpc, &self->cand); + EXPECT_EQ(0, self->homa.grant->num_grantable_rpcs); + EXPECT_EQ(0, homa_metrics_per_cpu()->grantable_rpcs_integral); + EXPECT_EQ(100, self->homa.grant->last_grantable_change); +} TEST_F(homa_grant, homa_grant_unmanage_rpc__remove_from_oldest_rpc) { struct homa_rpc *rpc; From 1bd9336e3dae7c17a19d54fcad6f531bdb3787a3 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 21 Aug 2025 09:50:05 -0700 Subject: [PATCH 440/625] Trivial change in tt_record message --- homa_rpc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/homa_rpc.c b/homa_rpc.c index 2be4e996..c8a82e59 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -598,7 +598,7 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) homa_peer_release(rpc->peer); rpc->peer = NULL; } - tt_record2("homa_rpc_reap finished reaping id %d, socket %d", + tt_record2("homa_rpc_reap finished reaping id %d, port %d", rpc->id, rpc->hsk->port); #ifndef __STRIP__ /* See strip.py */ From 7ca8d54198cd879fc9d1183572062a10df81f68b Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 21 Aug 2025 10:33:53 -0700 Subject: [PATCH 441/625] Invoke homa_rpc_reap from homa_rpc_acked if wmem exhausted This fixes a deadlock that could occur where wmem was exhausted for a server socket, RPCs completed, but no-one called homa_rpc_reap to release 
the tx buffer space. There was already a similar check for client RPCs in homa_plumbing.c --- homa_rpc.c | 7 +++++++ test/unit_homa_rpc.c | 23 +++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/homa_rpc.c b/homa_rpc.c index c8a82e59..43dcb64a 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -241,6 +241,13 @@ void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, tt_record1("homa_rpc_acked freeing id %d", rpc->id); homa_rpc_end(rpc); homa_rpc_unlock(rpc); /* Locked by homa_rpc_find_server. */ + + if (test_bit(SOCK_NOSPACE, &hsk2->sock.sk_socket->flags)) { + /* There are tasks waiting for tx memory, so reap + * immediately. + */ + homa_rpc_reap(hsk, false); + } } if (hsk->port != server_port) sock_put(&hsk2->sock); diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 3ac6f3b1..dbab1cb0 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -362,6 +362,7 @@ TEST_F(homa_rpc, homa_rpc_acked__basics) ack.client_id = cpu_to_be64(self->client_id); homa_rpc_acked(&hsk, self->client_ip, &ack); EXPECT_EQ(0, unit_list_length(&hsk.active_rpcs)); + EXPECT_EQ(1, unit_list_length(&hsk.dead_rpcs)); EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc)); unit_sock_destroy(&hsk); } @@ -419,6 +420,28 @@ TEST_F(homa_rpc, homa_rpc_acked__no_such_rpc) EXPECT_STREQ("OUTGOING", homa_symbol_for_state(srpc)); unit_sock_destroy(&hsk); } +TEST_F(homa_rpc, homa_rpc_acked__call_homa_rpc_reap) +{ + struct homa_rpc *srpc; + struct homa_sock hsk; + struct homa_ack ack = {}; + + mock_sock_init(&hsk, self->hnet, self->server_port); + set_bit(SOCK_NOSPACE, &hsk.sock.sk_socket->flags); + srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->client_port, self->server_id, + 100, 3000); + ASSERT_NE(NULL, srpc); + ack.server_port = htons(self->server_port); + ack.client_id = cpu_to_be64(self->client_id); + unit_log_clear(); + homa_rpc_acked(&hsk, self->client_ip, &ack); + EXPECT_EQ(0, unit_list_length(&hsk.active_rpcs)); + EXPECT_EQ(0, unit_list_length(&hsk.dead_rpcs)); + EXPECT_STREQ("ack 1235; homa_rpc_end invoked; reaped 1235", + unit_log_get()); + unit_sock_destroy(&hsk); +} TEST_F(homa_rpc, homa_rpc_end__basics) { From 75e5f615cc686e1164c59cf56259fca398e1c2fe Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 21 Aug 2025 10:46:33 -0700 Subject: [PATCH 442/625] Add port info to tt_record calls in homa_rpc.c --- homa_rpc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/homa_rpc.c b/homa_rpc.c index 43dcb64a..1b259d9e 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -281,7 +281,8 @@ void homa_rpc_end(struct homa_rpc *rpc) if (!rpc || rpc->state == RPC_DEAD) return; UNIT_LOG("; ", "homa_rpc_end invoked"); - tt_record1("homa_rpc_end invoked for id %d", rpc->id); + tt_record2("homa_rpc_end invoked for id %d, port %d", rpc->id, + rpc->hsk->port); rpc->state = RPC_DEAD; rpc->error = -EINVAL; @@ -498,8 +499,8 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) homa_sock_lock(hsk); if (atomic_read(&hsk->protect_count)) { INC_METRIC(disabled_reaps, 1); - tt_record2("homa_rpc_reap returning: protect_count %d, dead_skbs %d", - atomic_read(&hsk->protect_count), + tt_record3("homa_rpc_reap returning for port %d: protect_count %d, dead_skbs %d", + hsk->port, atomic_read(&hsk->protect_count), hsk->dead_skbs); homa_sock_unlock(hsk); if (reap_all) From 6926d45b54813bf1623f698601cf20f5929be232 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 21 Aug 2025 10:48:11 -0700 Subject: [PATCH 443/625] Change deferral test in 
homa_qdisc_enqueue Don't transmit short Homa data packets immediately unless the entire message is short (i.e. don't transmit a short packet at the end of a long message). --- homa_qdisc.c | 50 ++++++++++++++++++++++++++++++++---------- test/unit_homa_qdisc.c | 50 ++++++++++++++++++++++++++++++++---------- 2 files changed, 76 insertions(+), 24 deletions(-) diff --git a/homa_qdisc.c b/homa_qdisc.c index 1f106776..82eff6ce 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -230,14 +230,41 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct homa_data_hdr *h; int pkt_len; int result; + int offset; pkt_len = qdisc_skb_cb(skb)->pkt_len; - if (pkt_len < homa->pacer->throttle_min_bytes) { + if (!is_homa_pkt(skb)) { homa_qdisc_update_link_idle(qdev, pkt_len, -1); goto enqueue; } - if (!is_homa_pkt(skb)) { + /* For Homa packets, transmit control packets and short messages + * immediately, bypassing the pacer mechanism completely. We do + * this because (a) we don't want to delay control packets, (b) the + * pacer's single thread doesn't have enough throughput to handle + * all the short packets (whereas processing here happens concurrently + * on multiple cores), and (c) there is no way to generate enough + * short packets to cause NIC queue buildup, so bypassing the pacer + * won't impact the SRPT mechanism significantly. + * + * Note: it's very important to use message length, not packet + * length when deciding whether to bypass the pacer. If packet + * length were used, then the short packet at the end of a long + * message might be transmitted when all the earlier packets in the + * message have been deferred, and the deferred packets might not be + * transmitted for a long time due to SRPT. In the meantime, the + * receiver will have reserved incoming for those packets. These + * reservations can pile up to the point where the receiver can't + * issue any grants, even though the "incoming" data isn't going to + * be transmitted anytime soon. + */ + + h = (struct homa_data_hdr *)skb_transport_header(skb); + offset = ntohl(h->seg.offset); + if (offset == -1) + offset = ntohl(h->common.sequence); + if (h->common.type != DATA || ntohl(h->message_length) < + homa->pacer->throttle_min_bytes) { homa_qdisc_update_link_idle(qdev, pkt_len, -1); goto enqueue; } @@ -250,22 +277,20 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* This packet needs to be deferred until the NIC queue has * been drained a bit. 
*/ - h = (struct homa_data_hdr *)skb_transport_header(skb); tt_record4("homa_qdisc_enqueue deferring homa data packet for id %d, offset %d, bytes_left %d on qid %d", - be64_to_cpu(h->common.sender_id), - ntohl(h->seg.offset), + be64_to_cpu(h->common.sender_id), offset, homa_get_skb_info(skb)->bytes_left, qdev->pacer_qix); homa_qdisc_defer_homa(qdev, skb); - wake_up(&qdev->pacer_sleep); return NET_XMIT_SUCCESS; enqueue: if (is_homa_pkt(skb)) { - h = (struct homa_data_hdr *)skb_transport_header(skb); - tt_record4("homa_qdisc_enqueue queuing homa data packet for id %d, offset %d, bytes_left %d on qid %d", - be64_to_cpu(h->common.sender_id), - ntohl(h->seg.offset), - homa_get_skb_info(skb)->bytes_left, q->ix); + if (h->common.type == DATA) { + h = (struct homa_data_hdr *)skb_transport_header(skb); + tt_record4("homa_qdisc_enqueue queuing homa data packet for id %d, offset %d, bytes_left %d on qid %d", + be64_to_cpu(h->common.sender_id), offset, + homa_get_skb_info(skb)->bytes_left, q->ix); + } } else { tt_record2("homa_qdisc_enqueue queuing non-homa packet, qix %d, pacer_qix %d", q->ix, qdev->pacer_qix); @@ -318,6 +343,7 @@ void homa_qdisc_defer_homa(struct homa_qdisc_dev *qdev, struct sk_buff *skb) spin_lock_irqsave(&qdev->homa_deferred.lock, flags); if (skb_queue_empty(&qdev->homa_deferred)) { __skb_queue_head(&qdev->homa_deferred, skb); + wake_up(&qdev->pacer_sleep); goto done; } INC_METRIC(throttled_cycles, now - qdev->last_defer); @@ -450,7 +476,7 @@ int homa_qdisc_update_link_idle(struct homa_qdisc_dev *qdev, int bytes, ? clock - qdev->pacer_wake_time : clock - idle; INC_METRIC(pacer_lost_cycles, lost); - tt_record1("pacer lost %d cycles", lost); + tt_record1("homa_qdisc pacer lost %d cycles", lost); } new_idle = clock + cycles_for_packet; } else { diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c index a9ce123a..217b0081 100644 --- a/test/unit_homa_qdisc.c +++ b/test/unit_homa_qdisc.c @@ -303,7 +303,7 @@ TEST_F(homa_qdisc, _homa_qdisc_homa_qdisc_set_qixs_object) homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_enqueue__short_packet) +TEST_F(homa_qdisc, homa_qdisc_enqueue__packet_not_homa) { struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); struct sk_buff *skb, *to_free; @@ -313,12 +313,16 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__short_packet) q = qdisc_priv(qdisc); atomic64_set(&q->qdev->link_idle_time, 1000000); q->ix = 3; - skb = mock_skb_alloc(&self->addr, &self->data.common, 100, 0); - qdisc_skb_cb(skb)->pkt_len = 100; + skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); + qdisc_skb_cb(skb)->pkt_len = 1500; + if (skb_is_ipv6(skb)) + ipv6_hdr(skb)->nexthdr = IPPROTO_TCP; + else + ip_hdr(skb)->protocol = IPPROTO_TCP; to_free = NULL; unit_log_clear(); - EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, qdisc, &to_free)); + homa_qdisc_enqueue(skb, qdisc, &to_free); EXPECT_EQ(NULL, to_free); EXPECT_EQ(0, q->qdev->homa_deferred.qlen); EXPECT_EQ(1, qdisc->q.qlen); @@ -328,7 +332,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__short_packet) homa_qdisc_destroy(qdisc); kfree(qdisc); } -TEST_F(homa_qdisc, homa_qdisc_enqueue__packet_not_homa) +TEST_F(homa_qdisc, homa_qdisc_enqueue__short_message) { struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); struct sk_buff *skb, *to_free; @@ -338,16 +342,13 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__packet_not_homa) q = qdisc_priv(qdisc); atomic64_set(&q->qdev->link_idle_time, 1000000); q->ix = 3; - skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); - qdisc_skb_cb(skb)->pkt_len = 1500; - if 
(skb_is_ipv6(skb)) - ipv6_hdr(skb)->nexthdr = IPPROTO_TCP; - else - ip_hdr(skb)->protocol = IPPROTO_TCP; + self->data.message_length = htonl(100); + skb = mock_skb_alloc(&self->addr, &self->data.common, 100, 0); + qdisc_skb_cb(skb)->pkt_len = 100; to_free = NULL; unit_log_clear(); - homa_qdisc_enqueue(skb, qdisc, &to_free); + EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, qdisc, &to_free)); EXPECT_EQ(NULL, to_free); EXPECT_EQ(0, q->qdev->homa_deferred.qlen); EXPECT_EQ(1, qdisc->q.qlen); @@ -357,6 +358,31 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__packet_not_homa) homa_qdisc_destroy(qdisc); kfree(qdisc); } +TEST_F(homa_qdisc, homa_qdisc_enqueue__short_final_packet_in_long_message) +{ + struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct sk_buff *skb, *to_free; + struct homa_qdisc *q; + + EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); + q = qdisc_priv(qdisc); + atomic64_set(&q->qdev->link_idle_time, 1000000); + q->ix = 3; + self->data.message_length = htonl(3000); + self->data.seg.offset = htonl(2800); + skb = mock_skb_alloc(&self->addr, &self->data.common, 200, 0); + qdisc_skb_cb(skb)->pkt_len = 100; + to_free = NULL; + unit_log_clear(); + + EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, qdisc, &to_free)); + EXPECT_EQ(NULL, to_free); + EXPECT_EQ(1, q->qdev->homa_deferred.qlen); + EXPECT_EQ(0, qdisc->q.qlen); + + homa_qdisc_destroy(qdisc); + kfree(qdisc); +} TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_homa_packet) { struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); From 2c7588603531bf6bea4fc26787ff1557773ed6b8 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 25 Aug 2025 13:36:04 -0700 Subject: [PATCH 444/625] Rework deadlock fix in commit 7ca8d54198 Instead of checking wmem in callers of homa_rpc_end, do it directly in homa_rpc_end (this way the check only occurs in one place). --- homa_rpc.c | 14 +++++++------- test/unit_homa_rpc.c | 41 +++++++++++++++++++---------------------- 2 files changed, 26 insertions(+), 29 deletions(-) diff --git a/homa_rpc.c b/homa_rpc.c index 1b259d9e..c0ac66f4 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -241,13 +241,6 @@ void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr, tt_record1("homa_rpc_acked freeing id %d", rpc->id); homa_rpc_end(rpc); homa_rpc_unlock(rpc); /* Locked by homa_rpc_find_server. */ - - if (test_bit(SOCK_NOSPACE, &hsk2->sock.sk_socket->flags)) { - /* There are tasks waiting for tx memory, so reap - * immediately. - */ - homa_rpc_reap(hsk, false); - } } if (hsk->port != server_port) sock_put(&hsk2->sock); @@ -330,6 +323,13 @@ void homa_rpc_end(struct homa_rpc *rpc) homa_sock_unlock(rpc->hsk); homa_pacer_unmanage_rpc(rpc); + + if (test_bit(SOCK_NOSPACE, &rpc->hsk->sock.sk_socket->flags)) { + /* There are tasks waiting for tx memory so reap immediately. 
*/ + homa_rpc_unlock(rpc); + homa_rpc_reap(rpc->hsk, false); + homa_rpc_lock(rpc); + } } /** diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index dbab1cb0..ec224596 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -420,28 +420,6 @@ TEST_F(homa_rpc, homa_rpc_acked__no_such_rpc) EXPECT_STREQ("OUTGOING", homa_symbol_for_state(srpc)); unit_sock_destroy(&hsk); } -TEST_F(homa_rpc, homa_rpc_acked__call_homa_rpc_reap) -{ - struct homa_rpc *srpc; - struct homa_sock hsk; - struct homa_ack ack = {}; - - mock_sock_init(&hsk, self->hnet, self->server_port); - set_bit(SOCK_NOSPACE, &hsk.sock.sk_socket->flags); - srpc = unit_server_rpc(&hsk, UNIT_OUTGOING, self->client_ip, - self->server_ip, self->client_port, self->server_id, - 100, 3000); - ASSERT_NE(NULL, srpc); - ack.server_port = htons(self->server_port); - ack.client_id = cpu_to_be64(self->client_id); - unit_log_clear(); - homa_rpc_acked(&hsk, self->client_ip, &ack); - EXPECT_EQ(0, unit_list_length(&hsk.active_rpcs)); - EXPECT_EQ(0, unit_list_length(&hsk.dead_rpcs)); - EXPECT_STREQ("ack 1235; homa_rpc_end invoked; reaped 1235", - unit_log_get()); - unit_sock_destroy(&hsk); -} TEST_F(homa_rpc, homa_rpc_end__basics) { @@ -555,6 +533,25 @@ TEST_F(homa_rpc, homa_rpc_end__remove_from_throttled_list) homa_rpc_end(crpc); EXPECT_EQ(0, unit_list_length(&self->homa.pacer->throttled_rpcs)); } +TEST_F(homa_rpc, homa_rpc_end__call_homa_rpc_reap) +{ + struct homa_rpc *srpc; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, + self->server_ip, self->client_port, self->server_id, + 100, 3000); + ASSERT_NE(NULL, srpc); + homa_rpc_lock(srpc); + set_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags); + unit_log_clear(); + + homa_rpc_end(srpc); + EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); + EXPECT_EQ(0, unit_list_length(&self->hsk.dead_rpcs)); + EXPECT_STREQ("homa_rpc_end invoked; reaped 1235", + unit_log_get()); + homa_rpc_unlock(srpc); +} TEST_F(homa_rpc, homa_rpc_reap__basics) { From 3c9835ba6ca46df8f16ac4cd7a7852b4c9f7c690 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 25 Aug 2025 21:05:39 -0700 Subject: [PATCH 445/625] Don't reap RPCs until all tx packets are out of the tx pipeline This allows homa_qdisc to store RPC pointers in packets safely. Also, added new metric reaper_active_skbs. Also, changed slightly the meaning of the return value from homa_rpc_reap: if there were unreaped RPCs that couldn't be reaped (e.g. because they were locked), the return value is now 0, whereas it used to be 1. 
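The heart of the change is a per-skb liveness check in homa_rpc_reap. A minimal sketch, distilled from the diff below (the batching loop and locking are elided):

	struct sk_buff *skb = rpc->msgout.packets;

	/* A refcount greater than 1 means the skb is still referenced by
	 * the transmit path (e.g. queued in a qdisc), so homa_qdisc could
	 * still reach the RPC through the skb's homa_skb_info; skip this
	 * RPC and let a later reaper pass retry it.
	 */
	if (refcount_read(&skb->users) > 1) {
		INC_METRIC(reaper_active_skbs, 1);
		goto next_rpc;
	}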
--- homa_metrics.c | 2 ++ homa_metrics.h | 7 +++++++ homa_rpc.c | 42 +++++++++++++++++++++++++++++++----------- test/unit_homa_rpc.c | 41 +++++++++++++++++++++++++++++++++++------ 4 files changed, 75 insertions(+), 17 deletions(-) diff --git a/homa_metrics.c b/homa_metrics.c index 65b1dc1e..6c85a944 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -381,6 +381,8 @@ char *homa_metrics_print(void) "Reaper invocations that were not disabled\n"); M("reaper_dead_skbs", m->reaper_dead_skbs, "Sum of hsk->dead_skbs across all reaper calls\n"); + M("reaper_active_skbs", m->reaper_active_skbs, + "RPCs skipped by reaper because of active tx skbs\n"); M("throttle_list_adds", m->throttle_list_adds, "Calls to homa_add_to_throttled\n"); M("throttle_list_checks", m->throttle_list_checks, diff --git a/homa_metrics.h b/homa_metrics.h index 31a8850a..2ef5d701 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -652,6 +652,13 @@ struct homa_metrics { */ u64 reaper_dead_skbs; + /** + * @reaper_active_skbs: total number of times homa_rpc_reap had to skip + * an RPC because one of its tx skb's was still in the transmit + * pipeline. + */ + u64 reaper_active_skbs; + /** * @throttle_list_adds: total number of calls to homa_add_to_throttled. */ diff --git a/homa_rpc.c b/homa_rpc.c index c0ac66f4..7e09af38 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -471,11 +471,11 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) struct homa_rpc *rpcs[BATCH_MAX]; struct sk_buff *skbs[BATCH_MAX]; int num_skbs, num_rpcs; + bool checked_all_rpcs; struct homa_rpc *rpc; struct homa_rpc *tmp; int i, batch_size; int skbs_to_reap; - int result = 0; int rx_frees; INC_METRIC(reaper_calls, 1); @@ -485,9 +485,15 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) * BATCH_MAX skbs. */ skbs_to_reap = hsk->homa->reap_limit; - while (skbs_to_reap > 0 && !list_empty(&hsk->dead_rpcs)) { + checked_all_rpcs = list_empty(&hsk->dead_rpcs); + while (1) { batch_size = BATCH_MAX; - if (!reap_all) { + if (reap_all) { + if (list_empty(&hsk->dead_rpcs)) + break; + } else { + if (skbs_to_reap <= 0 || checked_all_rpcs) + break; if (batch_size > skbs_to_reap) batch_size = skbs_to_reap; skbs_to_reap -= batch_size; @@ -536,9 +542,23 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) */ if (rpc->msgout.length >= 0) { while (rpc->msgout.packets) { - skbs[num_skbs] = rpc->msgout.packets; - rpc->msgout.packets = homa_get_skb_info( - rpc->msgout.packets)->next_skb; + struct sk_buff *skb = + rpc->msgout.packets; + + /* This tests whether skb is still in a + * transmit queue somewhere; if so, + * can't reap the RPC since homa_qdisc + * may try to access it via the skb's + * homa_skb_info. + */ + if (refcount_read(&skb->users) > 1) { + INC_METRIC(reaper_active_skbs, + 1); + goto next_rpc; + } + skbs[num_skbs] = skb; + rpc->msgout.packets = + homa_get_skb_info(skb)->next_skb; num_skbs++; rpc->msgout.num_skbs--; if (num_skbs >= batch_size) @@ -567,15 +587,17 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) &hsk->sock.sk_wmem_alloc)); if (num_rpcs >= batch_size) goto release; + +next_rpc: + continue; } + checked_all_rpcs = true; /* Free all of the collected resources; release the socket * lock while doing this. 
*/ release: hsk->dead_skbs -= num_skbs + rx_frees; - result = !list_empty(&hsk->dead_rpcs) && - (num_skbs + num_rpcs) != 0; homa_sock_unlock(hsk); homa_skb_free_many_tx(hsk->homa, skbs, num_skbs); for (i = 0; i < num_rpcs; i++) { @@ -641,11 +663,9 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) tt_record4("reaped %d skbs, %d rpcs; %d skbs remain for port %d", num_skbs + rx_frees, num_rpcs, hsk->dead_skbs, hsk->port); - if (!result && !reap_all) - break; } homa_pool_check_waiting(hsk->buffer_pool); - return result; + return !checked_all_rpcs; } /** diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index ec224596..c4840d05 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -553,6 +553,10 @@ TEST_F(homa_rpc, homa_rpc_end__call_homa_rpc_reap) homa_rpc_unlock(srpc); } +TEST_F(homa_rpc, homa_rpc_reap__nothing_to_reap) +{ + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); +} TEST_F(homa_rpc, homa_rpc_reap__basics) { struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, @@ -655,7 +659,7 @@ TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_locked) #else /* See strip.py */ mock_trylock_errors = 1; #endif /* See strip.py */ - EXPECT_EQ(1, homa_rpc_reap(&self->hsk, false)); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); EXPECT_STREQ("reaped 1236", unit_log_get()); IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->deferred_rpc_reaps)); unit_log_clear(); @@ -679,7 +683,7 @@ TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_refs) unit_log_clear(); homa_rpc_hold(crpc1); self->homa.reap_limit = 3; - EXPECT_EQ(1, homa_rpc_reap(&self->hsk, false)); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); EXPECT_STREQ("reaped 1236", unit_log_get()); IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->deferred_rpc_reaps)); unit_log_clear(); @@ -691,6 +695,35 @@ TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_refs) EXPECT_STREQ("reaped 1234", unit_log_get()); IF_NO_STRIP(EXPECT_EQ(2, homa_metrics_per_cpu()->deferred_rpc_reaps)); } +TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_skb_refcount) +{ + struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 5000, 2000); + struct homa_rpc *crpc2 = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id+2, 1000, 2000); + + ASSERT_NE(NULL, crpc1); + ASSERT_NE(NULL, crpc2); + homa_rpc_end(crpc1); + homa_rpc_end(crpc2); + skb_get(crpc1->msgout.packets); + EXPECT_EQ(5, self->hsk.dead_skbs); + unit_log_clear(); + + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); + EXPECT_STREQ("reaped 1236", unit_log_get()); + IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->reaper_active_skbs)); + EXPECT_EQ(4, self->hsk.dead_skbs); + + kfree_skb(crpc1->msgout.packets); + unit_log_clear(); + EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); + EXPECT_STREQ("reaped 1234", unit_log_get()); + IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->reaper_active_skbs)); + EXPECT_EQ(0, self->hsk.dead_skbs); +} TEST_F(homa_rpc, homa_rpc_reap__hit_limit_in_msgout_packets) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -882,10 +915,6 @@ TEST_F(homa_rpc, homa_rpc_reap__call_homa_sock_wakeup_wmem) homa_rpc_reap(&self->hsk, false); EXPECT_EQ(0, test_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags)); } -TEST_F(homa_rpc, homa_rpc_reap__nothing_to_reap) -{ - EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); -} TEST_F(homa_rpc, homa_rpc_find_client) { From 4b49e2279b7a563fbb1097f402b73de35856f8b8 Mon Sep 17 00:00:00 2001 From: 
John Ousterhout Date: Tue, 26 Aug 2025 09:48:20 -0700 Subject: [PATCH 446/625] Remove the 'net' field from struct hnet --- homa_impl.h | 3 --- homa_utils.c | 1 - test/mock.c | 30 ++++++++++++++++++++++++------ test/mock.h | 1 + test/unit_homa_plumbing.c | 6 +++--- test/unit_homa_qdisc.c | 2 +- 6 files changed, 29 insertions(+), 14 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index d88299d9..e79bc010 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -506,9 +506,6 @@ struct homa { * particular network namespace. */ struct homa_net { - /** @net: Network namespace corresponding to this structure. */ - struct net *net; - /** @homa: Global Homa information. */ struct homa *homa; diff --git a/homa_utils.c b/homa_utils.c index 8ec3963a..2d2b1f93 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -162,7 +162,6 @@ void homa_destroy(struct homa *homa) int homa_net_init(struct homa_net *hnet, struct net *net, struct homa *homa) { memset(hnet, 0, sizeof(*hnet)); - hnet->net = net; hnet->homa = homa; hnet->prev_default_port = HOMA_MIN_DEFAULT_PORT - 1; #ifndef __STRIP__ /* See strip.py */ diff --git a/test/mock.c b/test/mock.c index a2b25ecb..4df5249b 100644 --- a/test/mock.c +++ b/test/mock.c @@ -249,6 +249,9 @@ __u16 mock_min_default_port = 0x8000; /* Used as sk_socket for all sockets created by mock_sock_init. */ static struct socket mock_socket; +/* Each of the entries in mock_hnets below is associated with the + * corresponding entry in mock_nets. + */ #define MOCK_MAX_NETS 10 struct net mock_nets[MOCK_MAX_NETS]; struct homa_net mock_hnets[MOCK_MAX_NETS]; @@ -1812,17 +1815,32 @@ int mock_get_link_ksettings(struct net_device *dev, return 0; } +/** + * mock_net_for_hnet() - Return the struct net associated with a struct + * homa_net, or NULL if the struct net can't be identified. + * @hnet: Find the struct net associated with this. + * Return: See above. + */ +struct net *mock_net_for_hnet(struct homa_net *hnet) +{ + int i; + + for (i = 0; i < mock_num_hnets; i++) { + if (hnet == &mock_hnets[i]) + return &mock_nets[i]; + } + return NULL; +} + void *mock_net_generic(const struct net *net, unsigned int id) { - struct homa_net *hnet; int i; if (id != homa_net_id) return NULL; - for (i = 0; i < MOCK_MAX_NETS; i++) { - hnet = &mock_hnets[i]; - if (hnet->net == net) - return hnet; + for (i = 0; i < mock_num_hnets; i++) { + if (net == &mock_nets[i]) + return &mock_hnets[i]; } return NULL; } @@ -2215,7 +2233,7 @@ int mock_sock_init(struct homa_sock *hsk, struct homa_net *hnet, int port) sk->sk_data_ready = mock_data_ready; sk->sk_family = mock_ipv6 ?
AF_INET6 : AF_INET; sk->sk_socket = &mock_socket; - sk->sk_net.net = hnet->net; + sk->sk_net.net = mock_net_for_hnet(hnet); memset(&mock_socket, 0, sizeof(mock_socket)); refcount_set(&sk->sk_wmem_alloc, 1); init_waitqueue_head(&mock_socket.wq.wait); diff --git a/test/mock.h b/test/mock.h index a32a06b6..a6a5f081 100644 --- a/test/mock.h +++ b/test/mock.h @@ -200,6 +200,7 @@ unsigned int mock_get_mtu(const struct dst_entry *dst); void mock_get_page(struct page *page); void *mock_kmalloc(size_t size, gfp_t flags); +struct net *mock_net_for_hnet(struct homa_net *hnet); void *mock_net_generic(const struct net *net, unsigned int id); int mock_page_refs(struct page *page); int mock_page_refs(struct page *page); diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 64b2ae76..4067ad2b 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -138,7 +138,7 @@ TEST_F(homa_plumbing, homa_net_exit__free_peers) homa_peer_release(homa_peer_get(&self->hsk, &addr3)); EXPECT_EQ(3, unit_count_peers(&self->homa)); - homa_net_exit(self->hsk.hnet->net); + homa_net_exit(mock_net_for_hnet(self->hsk.hnet)); EXPECT_EQ(0, unit_count_peers(&self->homa)); } @@ -280,7 +280,7 @@ TEST_F(homa_plumbing, homa_socket__success) struct homa_sock hsk; memset(&hsk, 0, sizeof(hsk)); - hsk.sock.sk_net.net = self->hnet->net; + hsk.sock.sk_net.net = mock_net_for_hnet(self->hnet); refcount_set(&hsk.sock.sk_wmem_alloc, 1); EXPECT_EQ(0, homa_socket(&hsk.sock)); unit_sock_destroy(&hsk); @@ -290,7 +290,7 @@ TEST_F(homa_plumbing, homa_socket__homa_sock_init_failure) struct homa_sock hsk; memset(&hsk, 0, sizeof(hsk)); - hsk.sock.sk_net.net = self->hnet->net; + hsk.sock.sk_net.net = mock_net_for_hnet(self->hnet); refcount_set(&hsk.sock.sk_wmem_alloc, 1); mock_kmalloc_errors = 1; EXPECT_EQ(ENOMEM, -homa_socket(&hsk.sock)); diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c index 217b0081..fde3ee7b 100644 --- a/test/unit_homa_qdisc.c +++ b/test/unit_homa_qdisc.c @@ -97,7 +97,7 @@ FIXTURE_SETUP(homa_qdisc) memset(&self->dev, 0, sizeof(self->dev)); self->dev._tx = self->txqs; self->dev.num_tx_queues = NUM_TXQS; - self->dev.nd_net.net = self->hnet->net; + self->dev.nd_net.net = mock_net_for_hnet(self->hnet); self->dev.ethtool_ops = &self->ethtool_ops; memset(&self->ethtool_ops, 0, sizeof(self->ethtool_ops)); self->ethtool_ops.get_link_ksettings = mock_get_link_ksettings; From c2e0e5daaa662c2410f5a63661031614b3d57aad Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 26 Aug 2025 10:36:18 -0700 Subject: [PATCH 447/625] Remove unneeded definitions from stripped version --- homa_impl.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/homa_impl.h b/homa_impl.h index e79bc010..09759cd1 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -684,6 +684,7 @@ static inline struct in6_addr skb_canonical_ipv6_saddr(struct sk_buff *skb) return mapped; } +#ifndef __STRIP__ /* See strip.py */ /** * is_homa_pkt() - Return true if @skb is a Homa packet, false otherwise. * @skb: Packet buffer to check. @@ -708,6 +709,7 @@ static inline bool is_homa_pkt(struct sk_buff *skb) return protocol == IPPROTO_HOMA; #endif /* See strip.py */ } +#endif /* See strip.py */ /** * homa_make_header_avl() - Invokes pskb_may_pull to make sure that all the @@ -726,6 +728,7 @@ static inline bool homa_make_header_avl(struct sk_buff *skb) return pskb_may_pull(skb, pull_length); } +#ifndef __UPSTREAM__ /* See strip.py */ #ifdef __UNIT_TEST__ void unit_log_printf(const char *separator, const char *format, ...) 
__printf(2, 3); @@ -736,6 +739,7 @@ void unit_hook(char *id); #define UNIT_LOG(...) #define UNIT_HOOK(...) #endif /* __UNIT_TEST__ */ +#endif /* See strip.py */ extern unsigned int homa_net_id; @@ -935,6 +939,7 @@ static inline u64 homa_ns_to_cycles(u64 ns) #endif /* __UNIT_TEST__ */ } +#ifndef __STRIP__ /* See strip.py */ /** * homa_usecs_to_cycles() - Convert from units of microseconds to units of * homa_clock(). @@ -953,6 +958,7 @@ static inline u64 homa_usecs_to_cycles(u64 usecs) return tmp; #endif /* __UNIT_TEST__ */ } +#endif /* See strip.py */ #ifndef __STRIP__ /* See strip.py */ /** From 72460d616484b39964431e4162abf5888e2e01f2 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 26 Aug 2025 10:43:47 -0700 Subject: [PATCH 448/625] Rename homa_net_from_net to homa_net, eliminate "from" functions homa_net can be used instead of homa_net_from_skb and homa_from_skb. --- homa_devel.c | 2 +- homa_grant.c | 2 +- homa_impl.h | 32 ++------------------------------ homa_incoming.c | 2 +- homa_offload.c | 4 ++-- homa_pacer.c | 2 +- homa_peer.c | 2 +- homa_plumbing.c | 15 ++++++++------- homa_qdisc.c | 2 +- 9 files changed, 18 insertions(+), 45 deletions(-) diff --git a/homa_devel.c b/homa_devel.c index 2b7db06d..b9d5e8a3 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -410,7 +410,7 @@ void homa_freeze_peers(void) int err; /* Find a socket to use (any socket for the namespace will do). */ - hnet = homa_net_from_net(&init_net); + hnet = homa_net(&init_net); rcu_read_lock(); hsk = homa_socktab_start_scan(hnet->homa->socktab, &scan); while (hsk && hsk->hnet != hnet) diff --git a/homa_grant.c b/homa_grant.c index f96e276f..a7daa7a4 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -1165,7 +1165,7 @@ int homa_grant_dointvec(const struct ctl_table *table, int write, struct homa_grant *grant; int result; - grant = homa_net_from_net(current->nsproxy->net_ns)->homa->grant; + grant = homa_net(current->nsproxy->net_ns)->homa->grant; /* Generate a new ctl_table that refers to a field in the * net-specific struct homa. diff --git a/homa_impl.h b/homa_impl.h index 09759cd1..ed7f63be 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -836,44 +836,16 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc); #endif /* See strip.py */ /** - * homa_net_from_net() - Return the struct homa_net associated with a particular + * homa_net() - Return the struct homa_net associated with a particular * struct net. * @net: Get the Homa data for this net namespace. * Return: see above. */ -static inline struct homa_net *homa_net_from_net(struct net *net) +static inline struct homa_net *homa_net(struct net *net) { return (struct homa_net *)net_generic(net, homa_net_id); } -/** - * homa_from_skb() - Return the struct homa associated with a particular - * sk_buff. - * @skb: Get the struct homa for this packet buffer. - * Return: see above. - */ -static inline struct homa *homa_from_skb(struct sk_buff *skb) -{ - struct homa_net *hnet; - - hnet = net_generic(dev_net(skb->dev), homa_net_id); - return hnet->homa; -} - -/** - * homa_net_from_skb() - Return the struct homa_net associated with a particular - * sk_buff. - * @skb: Get the struct homa for this packet buffer. - * Return: see above. - */ -static inline struct homa_net *homa_net_from_skb(struct sk_buff *skb) -{ - struct homa_net *hnet; - - hnet = net_generic(dev_net(skb->dev), homa_net_id); - return hnet; -} - /** * homa_clock() - Return a fine-grain clock value that is monotonic and * consistent across cores. 
diff --git a/homa_incoming.c b/homa_incoming.c index 93db90d2..4e6d66b0 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -457,7 +457,7 @@ void homa_dispatch_pkts(struct sk_buff *skb) int num_acks = 0; /* Find the appropriate socket.*/ - hnet = homa_net_from_skb(skb); + hnet = homa_net(dev_net(skb->dev)); hsk = homa_sock_find(hnet, dport); if (!hsk || (!homa_is_client(id) && !hsk->is_server)) { if (skb_is_ipv6(skb)) diff --git a/homa_offload.c b/homa_offload.c index cb4ff8b6..56d3dbd0 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -289,7 +289,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, */ u64 saved_softirq_metric, softirq_cycles; struct homa_offload_core *offload_core; - struct homa *homa = homa_from_skb(skb); + struct homa *homa = homa_net(dev_net(skb->dev))->homa; struct sk_buff *result = NULL; struct homa_data_hdr *h_new; u64 *softirq_cycles_metric; @@ -609,7 +609,7 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset) { struct homa_data_hdr *h = (struct homa_data_hdr *)skb_transport_header(skb); - struct homa *homa = homa_from_skb(skb); + struct homa *homa = homa_net(dev_net(skb->dev))->homa; // tt_record4("homa_gro_complete type %d, id %d, offset %d, count %d", // h->common.type, homa_local_id(h->common.sender_id), diff --git a/homa_pacer.c b/homa_pacer.c index 8a8c9ddd..28bde94d 100644 --- a/homa_pacer.c +++ b/homa_pacer.c @@ -418,7 +418,7 @@ int homa_pacer_dointvec(const struct ctl_table *table, int write, struct homa_pacer *pacer; int result; - pacer = homa_net_from_net(current->nsproxy->net_ns)->homa->pacer; + pacer = homa_net(current->nsproxy->net_ns)->homa->pacer; /* Generate a new ctl_table that refers to a field in the * net-specific struct homa. diff --git a/homa_peer.c b/homa_peer.c index 203f25f3..6a5c127b 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -790,7 +790,7 @@ int homa_peer_dointvec(const struct ctl_table *table, int write, struct ctl_table table_copy; int result; - peertab = homa_net_from_net(current->nsproxy->net_ns)->homa->peertab; + peertab = homa_net(current->nsproxy->net_ns)->homa->peertab; /* Generate a new ctl_table that refers to a field in the * net-specific struct homa. 
diff --git a/homa_plumbing.c b/homa_plumbing.c index f2985b83..f865eb73 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -736,7 +736,7 @@ module_exit(homa_unload); int homa_net_start(struct net *net) { pr_notice("Homa attaching to net namespace\n"); - return homa_net_init(homa_net_from_net(net), net, global_homa); + return homa_net_init(homa_net(net), net, global_homa); } /** @@ -747,7 +747,7 @@ int homa_net_start(struct net *net) void homa_net_exit(struct net *net) { pr_notice("Homa detaching from net namespace\n"); - homa_net_destroy(homa_net_from_net(net)); + homa_net_destroy(homa_net(net)); } /** @@ -1400,7 +1400,7 @@ int homa_softirq(struct sk_buff *skb) struct homa_common_hdr *h; int header_offset; - IF_NO_STRIP(struct homa *homa = homa_from_skb(skb)); + IF_NO_STRIP(struct homa *homa = homa_net(dev_net(skb->dev))->homa); IF_NO_STRIP(u64 start); #ifndef __STRIP__ /* See strip.py */ @@ -1464,7 +1464,8 @@ int homa_softirq(struct sk_buff *skb) */ if (unlikely(h->type == FREEZE)) { if (!atomic_read(&tt_frozen)) { - homa_rpc_log_active_tt(homa_from_skb(skb), 0); + homa_rpc_log_active_tt(homa_net( + dev_net(skb->dev))->homa, 0); tt_record4("Freezing because of request on port %d from 0x%x:%d, id %d", ntohs(h->dport), tt_addr(skb_canonical_ipv6_saddr(skb)), @@ -1560,7 +1561,7 @@ int homa_softirq(struct sk_buff *skb) int homa_err_handler_v4(struct sk_buff *skb, u32 info) { const struct icmphdr *icmp = icmp_hdr(skb); - struct homa *homa = homa_from_skb(skb); + struct homa *homa = homa_net(dev_net(skb->dev))->homa; struct in6_addr daddr; int type = icmp->type; int code = icmp->code; @@ -1607,7 +1608,7 @@ int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; - struct homa *homa = homa_from_skb(skb); + struct homa *homa = homa_net(dev_net(skb->dev))->homa; int error = 0; int port = 0; @@ -1679,7 +1680,7 @@ __poll_t homa_poll(struct file *file, struct socket *sock, int homa_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct homa *homa = homa_net_from_net(current->nsproxy->net_ns)->homa; + struct homa *homa = homa_net(current->nsproxy->net_ns)->homa; struct ctl_table table_copy; int result; diff --git a/homa_qdisc.c b/homa_qdisc.c index 82eff6ce..f5164de6 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -144,7 +144,7 @@ int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt, struct homa_net *hnet; int i; - hnet = homa_net_from_net(dev_net(sch->dev_queue->dev)); + hnet = homa_net(dev_net(sch->dev_queue->dev)); qdev = homa_qdisc_qdev_get(hnet, sch->dev_queue->dev); if (IS_ERR(qdev)) return PTR_ERR(qdev); From 74fac889b840e1d5666963669eaa6cbef4db1f13 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 26 Aug 2025 10:51:09 -0700 Subject: [PATCH 449/625] Minor editorial cleanup of comments on locking strategy --- homa_impl.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index ed7f63be..8de96ee8 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -981,9 +981,6 @@ static inline int homa_high_priority(struct homa *homa) * * RPC lock * * Socket lock * * Other lock - * 3. It is not safe to wait on an RPC lock while holding any other lock. - * 4. It is safe to wait on a socket lock while holding an RPC lock, but - * not while holding any other lock. * * It may seem surprising that RPC locks are acquired *before* socket locks, * but this is essential for high performance. 
Homa has been designed so that From ae75d427e4c50f5ee4628b1ebfe4312964ed0ba3 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 26 Aug 2025 11:20:55 -0700 Subject: [PATCH 450/625] Remove homa_peer_wait_dead: not used --- homa_peer.c | 19 ------------------- homa_peer.h | 1 - test/unit_homa_peer.c | 34 ---------------------------------- 3 files changed, 54 deletions(-) diff --git a/homa_peer.c b/homa_peer.c index 6a5c127b..4f4ba5c6 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -242,25 +242,6 @@ void homa_peer_free_dead(struct homa_peertab *peertab) } } -/** - * homa_peer_wait_dead() - Don't return until all of the dead peers have - * been freed. - * @peertab: Overall information about peers, which includes a dead list. - * - */ -void homa_peer_wait_dead(struct homa_peertab *peertab) -{ - while (1) { - spin_lock_bh(&peertab->lock); - homa_peer_free_dead(peertab); - if (list_empty(&peertab->dead_peers)) { - spin_unlock_bh(&peertab->lock); - return; - } - spin_unlock_bh(&peertab->lock); - } -} - /** * homa_peer_prefer_evict() - Given two peers, determine which one is * a better candidate for eviction. diff --git a/homa_peer.h b/homa_peer.h index 82d68bc8..3c48f36d 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -306,7 +306,6 @@ int homa_peer_prefer_evict(struct homa_peertab *peertab, struct homa_peer *peer1, struct homa_peer *peer2); void homa_peer_rcu_callback(struct rcu_head *head); -void homa_peer_wait_dead(struct homa_peertab *peertab); void homa_peer_update_sysctl_deps(struct homa_peertab *peertab); #ifndef __STRIP__ /* See strip.py */ void homa_peer_lock_slow(struct homa_peer *peer); diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index 80338db6..10acee25 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -81,17 +81,6 @@ static void stop_gc_hook(char *id) unit_log_printf("; ", "gc_stop_count %d", hook_peertab->gc_stop_count); } -static int hook_free_count; -static void complete_rcu_hook(char *id) { - if (strcmp(id, "unlock") != 0) - return; - if (hook_free_count == 0) - return; - hook_free_count--; - if (hook_free_count == 0) - homa_peer_rcu_callback(&hook_peertab->rcu_head); -} - TEST_F(homa_peer, homa_peer_alloc_peertab__success) { struct homa_peertab *peertab; @@ -278,29 +267,6 @@ TEST_F(homa_peer, homa_peer_free_dead) { EXPECT_STREQ("", unit_log_get()); } -TEST_F(homa_peer, homa_peer_wait_dead) { - struct homa_peertab *peertab = self->homa.peertab; - struct homa_peer *peer; - - peer = homa_peer_alloc(&self->hsk, ip1111); - homa_peer_release(peer); - list_add_tail(&peer->dead_links, &peertab->dead_peers); - unit_log_clear(); - unit_log_dead_peers(&self->homa); - EXPECT_STREQ("[1::1:1:1]", unit_log_get()); - atomic_set(&peertab->call_rcu_pending, 1); - - unit_hook_register(complete_rcu_hook); - hook_peertab = self->homa.peertab; - hook_free_count = 5; - - homa_peer_wait_dead(peertab); - unit_log_clear(); - unit_log_dead_peers(&self->homa); - EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(0, hook_free_count); -} - TEST_F(homa_peer, homa_peer_prefer_evict) { struct homa_peertab *peertab = self->homa.peertab; From ac73265c4add851e29fe8faca4cbc3683de03b9f Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 26 Aug 2025 14:37:05 -0700 Subject: [PATCH 451/625] Fix bug in ttsum.py: printed usecs instead of ns if not -f --- util/ttsum.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/util/ttsum.py b/util/ttsum.py index 3794d116..a6822f75 100755 --- a/util/ttsum.py +++ b/util/ttsum.py @@ -186,8 +186,8 @@ def scan(f, 
startingEvent): intervals.sort() medianTime = intervals[len(intervals)//2] message = '%-*s %6.0f %6.0f %6.0f %6.0f %7d' % (nameLength, - event, medianTime, intervals[0], intervals[-1], - sum(intervals)/len(intervals), len(intervals)) + event, medianTime * 1e03, intervals[0] * 1e03, intervals[-1] * 1e03, + 1e03 * sum(intervals)/len(intervals), len(intervals)) outputInfo.append([medianTime, message]) # Pass 2: sort in order of median interval length, then print. @@ -234,13 +234,14 @@ def scan(f, startingEvent): if options.altFormat: message = '%-*s %6.0f %6.0f %6.0f %6.0f %6.0f %7d' % ( nameLength, eventName, medianTime*1e03, times[0]*1e03, - times[-1]*1e03, sum(times)*1e03/len(times), - intervals[len(intervals)//2]*1e03, len(times)) + times[-1] * 1e03, sum(times) * 1e03/len(times), + intervals[len(intervals)//2] * 1e03, len(times)) else: message = '%-*s %6.0f %6.0f %6.0f %6.0f %6.0f %7d' % ( - nameLength, eventName, medianTime*1e03, medianInterval*1e03, - intervals[0]*1e03, intervals[-1]*1e03, - sum(intervals)/len(intervals)*1e03, len(intervals)) + nameLength, eventName, medianTime * 1e03, + medianInterval * 1e03, + intervals[0] * 1e03, intervals[-1] * 1e03, + 1e03 * sum(intervals)/len(intervals), len(intervals)) outputInfo.append([medianTime, message]) outputInfo.sort(key=lambda item: item[0]) From 500f07322dd245f1952a8afc9656c211c289cd6b Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 26 Aug 2025 14:43:00 -0700 Subject: [PATCH 452/625] Pull murmurhash hash function out into a separate header file --- Makefile | 3 ++- homa_peer.c | 3 ++- homa_peer.h | 42 ------------------------------------------ murmurhash3.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 44 deletions(-) create mode 100644 murmurhash3.h diff --git a/Makefile b/Makefile index 8113bf41..39336e13 100644 --- a/Makefile +++ b/Makefile @@ -70,7 +70,8 @@ CP_HDRS := homa_impl.h \ homa_rpc.h \ homa_sock.h \ homa_stub.h \ - homa_wire.h + homa_wire.h \ + murmurhash3.h CP_SRCS := $(patsubst %.o,%.c,$(filter-out homa_devel.o homa_grant.o \ homa_metrics.o homa_offload.o homa_qdisc.o \ homa_skb.o timetrace.o, $(HOMA_OBJS))) diff --git a/homa_peer.c b/homa_peer.c index 4f4ba5c6..783cfb86 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -7,6 +7,7 @@ #include "homa_impl.h" #include "homa_peer.h" #include "homa_rpc.h" +#include "murmurhash3.h" #ifdef __UNIT_TEST__ #undef rhashtable_init @@ -24,7 +25,7 @@ static const struct rhashtable_params ht_params = { .key_offset = offsetof(struct homa_peer, ht_key), .head_offset = offsetof(struct homa_peer, ht_linkage), .nelem_hint = 10000, - .hashfn = homa_peer_hash, + .hashfn = murmurhash3, .obj_cmpfn = homa_peer_compare }; diff --git a/homa_peer.h b/homa_peer.h index 3c48f36d..30a82d7a 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -387,48 +387,6 @@ static inline void homa_peer_release(struct homa_peer *peer) } /** - * homa_peer_hash() - Hash function used for @peertab->ht. - * @data: Pointer to key for which a hash is desired. Must actually - * be a struct homa_peer_key. - * @dummy: Not used - * @seed: Seed for the hash. - * Return: A 32-bit hash value for the given key. - */ -static inline u32 homa_peer_hash(const void *data, u32 dummy, u32 seed) -{ - /* This is MurmurHash3, used instead of the jhash default because it - * is faster (25 ns vs. 40 ns as of May 2025). 
- */ - BUILD_BUG_ON(sizeof(struct homa_peer_key) & 0x3); - const u32 len = sizeof(struct homa_peer_key) >> 2; - const u32 c1 = 0xcc9e2d51; - const u32 c2 = 0x1b873593; - const u32 *key = data; - u32 h = seed; - - for (size_t i = 0; i < len; i++) { - u32 k = key[i]; - - k *= c1; - k = (k << 15) | (k >> (32 - 15)); - k *= c2; - - h ^= k; - h = (h << 13) | (h >> (32 - 13)); - h = h * 5 + 0xe6546b64; - } - - h ^= len * 4; // Total number of input bytes - - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - return h; -} - -/**q * homa_peer_compare() - Comparison function for entries in @peertab->ht. * @arg: Contains one of the keys to compare. * @obj: homa_peer object containing the other key to compare. diff --git a/murmurhash3.h b/murmurhash3.h new file mode 100644 index 00000000..1ed1f0b6 --- /dev/null +++ b/murmurhash3.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */ + +/* This file contains a limited implementation of MurmurHash3; it is + * used for rhashtables instead of the default jhash because it is + * faster (25 ns vs. 40 ns as of May 2025) + */ + +/** + * murmurhash3() - Hash function. + * @data: Pointer to key for which a hash is desired. + * @len: Length of the key; must be a multiple of 4. + * @seed: Seed for the hash. + * Return: A 32-bit hash value for the given key. + */ +static inline u32 murmurhash3(const void *data, u32 len, u32 seed) +{ + const u32 c1 = 0xcc9e2d51; + const u32 c2 = 0x1b873593; + const u32 *key = data; + u32 h = seed; + + len = len >> 2; + for (size_t i = 0; i < len; i++) { + u32 k = key[i]; + + k *= c1; + k = (k << 15) | (k >> (32 - 15)); + k *= c2; + + h ^= k; + h = (h << 13) | (h >> (32 - 13)); + h = h * 5 + 0xe6546b64; + } + + /* Total number of input bytes */ + h ^= len * 4; + + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + return h; +} From e881a9871b3fb58fa93a958f40125f13b9eba71a Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 27 Aug 2025 10:50:48 -0700 Subject: [PATCH 453/625] Trivial fix for peer_dst_refreshes comment in homa_metrics.h --- homa_metrics.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/homa_metrics.h b/homa_metrics.h index 2ef5d701..2b9e12d0 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -433,8 +433,9 @@ struct homa_metrics { u64 peer_route_errors; /** - * @peer_dst_refreshes: total number of times that homa_dst_refresh - * was called to update an obsolete dst for a peer. + * @peer_dst_refreshes: total number of times that the dst for a + * peer had to be regenerated because the existing one had become + * obsolete. */ u64 peer_dst_refreshes; From 69fb32dac20f39bb039662ea66c244c01a9eef11 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 27 Aug 2025 14:18:24 -0700 Subject: [PATCH 454/625] Refactor homa_peer.c to fix lack of sync. in homa_dst_refresh * Eliminate homa_dst_refresh. * Change homa_peer_get_dst to homa_peer_reset_dst, which now replaces homa_dst_refresh. * Use the peer lock in homa_peer_reset_dst. * Use RCU properly in homa_get_dst, and add other checks.
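As a usage sketch (the caller below is hypothetical; peer, hsk, and skb are assumed to be in scope): after this refactor, transmit paths obtain a route via homa_get_dst, which revalidates the cached dst under RCU and returns a referenced entry that the caller must hand off or release:

	struct dst_entry *dst;

	dst = homa_get_dst(peer, hsk);	/* Takes a reference; refreshes if stale. */
	skb_dst_set(skb, dst);		/* The skb now owns that reference. */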
--- homa_peer.c | 212 +++++++++++++++++++++++++----------------- homa_peer.h | 116 +++++++++++------------ test/mock.c | 14 +++ test/unit_homa_peer.c | 173 +++++++++++++++++++--------------- 4 files changed, 292 insertions(+), 223 deletions(-) diff --git a/homa_peer.c b/homa_peer.c index 783cfb86..cc9adc62 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -410,26 +410,20 @@ struct homa_peer *homa_peer_alloc(struct homa_sock *hsk, const struct in6_addr *addr) { struct homa_peer *peer; - struct dst_entry *dst; + int status; peer = kzalloc(sizeof(*peer), GFP_ATOMIC); if (!peer) { INC_METRIC(peer_kmalloc_errors, 1); return (struct homa_peer *)ERR_PTR(-ENOMEM); } + peer->addr = *addr; peer->ht_key.addr = *addr; peer->ht_key.hnet = hsk->hnet; - INIT_LIST_HEAD(&peer->dead_links); atomic_set(&peer->refs, 1); peer->access_jiffies = jiffies; - peer->addr = *addr; - dst = homa_peer_get_dst(peer, hsk); - if (IS_ERR(dst)) { - INC_METRIC(peer_route_errors, 1); - kfree(peer); - return (struct homa_peer *)dst; - } - peer->dst = dst; + INIT_LIST_HEAD(&peer->dead_links); + spin_lock_init(&peer->lock); #ifndef __STRIP__ /* See strip.py */ peer->unsched_cutoffs[HOMA_MAX_PRIORITIES - 1] = 0; peer->unsched_cutoffs[HOMA_MAX_PRIORITIES - 2] = INT_MAX; @@ -437,10 +431,15 @@ struct homa_peer *homa_peer_alloc(struct homa_sock *hsk, INIT_LIST_HEAD(&peer->grantable_links); #endif /* See strip.py */ peer->current_ticks = -1; - spin_lock_init(&peer->ack_lock); - INC_METRIC(peer_allocs, 1); + + status = homa_peer_reset_dst(peer, hsk); + if (status != 0) { + kfree(peer); + return ERR_PTR(status); + } tt_record1("Allocated new homa_peer for node 0x%x", tt_addr(peer->addr)); + INC_METRIC(peer_allocs, 1); return peer; } @@ -536,67 +535,59 @@ struct homa_peer *homa_peer_get(struct homa_sock *hsk, } /** - * homa_dst_refresh() - This method is called when the dst for a peer is - * obsolete; it releases that dst and creates a new one. - * @peertab: Table containing the peer. - * @peer: Peer whose dst is obsolete. - * @hsk: Socket that will be used to transmit data to the peer. + * homa_get_dst() - Returns destination information associated with a peer, + * updating it if the cached information is stale. + * @peer: Peer whose destination information is desired. + * @hsk: Homa socket with which the dst will be used; needed by lower-level + * code to recreate the dst. + * Return: Up-to-date destination for peer; a reference has been taken + * on this dst_entry, which the caller must eventually release. */ -void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, - struct homa_sock *hsk) +struct dst_entry *homa_get_dst(struct homa_peer *peer, struct homa_sock *hsk) { struct dst_entry *dst; + int pass; - INC_METRIC(peer_dst_refreshes, 1); - dst = homa_peer_get_dst(peer, hsk); - if (IS_ERR(dst)) { -#ifndef __STRIP__ /* See strip.py */ - /* Retain the existing dst if we can't create a new one. */ - if (hsk->homa->verbose) - pr_notice("%s couldn't recreate dst: error %ld", - __func__, PTR_ERR(dst)); - INC_METRIC(peer_route_errors, 1); -#endif /* See strip.py */ - return; - } - dst_release(peer->dst); - peer->dst = dst; -} - -#ifndef __STRIP__ /* See strip.py */ -/** - * homa_unsched_priority() - Returns the priority level to use for - * unscheduled packets of a message. - * @homa: Overall data about the Homa protocol implementation. - * @peer: The destination of the message. - * @length: Number of bytes in the message. - * - * Return: A priority level. 
- */ -int homa_unsched_priority(struct homa *homa, struct homa_peer *peer, - int length) -{ - int i; + rcu_read_lock(); + for (pass = 0; ; pass++) { + do { + /* This loop repeats only if we happen to fetch + * the dst right when it is being reset. + */ + dst = rcu_dereference(peer->dst); + } while (!dst_hold_safe(dst)); - for (i = homa->num_priorities - 1; ; i--) { - if (peer->unsched_cutoffs[i] >= length) - return i; + /* After the first pass it's OK to return an obsolete dst + * (we're basically giving up; continuing could result in + * an infinite loop if homa_peer_reset_dst can't create a new dst). + */ + if (dst_check(dst, peer->dst_cookie) || pass > 0) + break; + dst_release(dst); + INC_METRIC(peer_dst_refreshes, 1); + homa_peer_reset_dst(peer, hsk); } - /* Can't ever get here */ + rcu_read_unlock(); + return dst; } -#endif /* See strip.py */ /** - * homa_peer_get_dst() - Find an appropriate dst structure (either IPv4 - * or IPv6) for a peer. - * @peer: The peer for which a dst is needed. Note: this peer's flow - * struct will be overwritten. + * homa_peer_reset_dst() - Find an appropriate dst_entry for a peer and + * store it in the peer's dst field. If the field is already set, the + * current value is assumed to be stale and will be discarded if a new + * dst_entry can be created. + * @peer: The peer whose dst field should be reset. * @hsk: Socket that will be used for sending packets. - * Return: The dst structure (or an ERR_PTR); a reference has been taken. + * Return: Zero for success, or a negative errno if there was an error + * (in which case the existing value for the dst field is left + * in place). */ -struct dst_entry *homa_peer_get_dst(struct homa_peer *peer, - struct homa_sock *hsk) +int homa_peer_reset_dst(struct homa_peer *peer, struct homa_sock *hsk) { + struct dst_entry *dst; + int result = 0; + + homa_peer_lock(peer); memset(&peer->flow, 0, sizeof(peer->flow)); if (hsk->sock.sk_family == AF_INET) { struct rtable *rt; @@ -604,39 +595,86 @@ struct dst_entry *homa_peer_get_dst(struct homa_peer *peer, flowi4_init_output(&peer->flow.u.ip4, hsk->sock.sk_bound_dev_if, hsk->sock.sk_mark, hsk->inet.tos, RT_SCOPE_UNIVERSE, hsk->sock.sk_protocol, 0, - peer->addr.in6_u.u6_addr32[3], + ipv6_to_ipv4(peer->addr), hsk->inet.inet_saddr, 0, 0, hsk->sock.sk_uid); security_sk_classify_flow(&hsk->sock, &peer->flow.u.__fl_common); rt = ip_route_output_flow(sock_net(&hsk->sock), &peer->flow.u.ip4, &hsk->sock); - if (IS_ERR(rt)) - return (struct dst_entry *)(PTR_ERR(rt)); - return &rt->dst; + if (IS_ERR(rt)) { + result = PTR_ERR(rt); + INC_METRIC(peer_route_errors, 1); + goto done; + } + dst = &rt->dst; + peer->dst_cookie = 0; + } else { + peer->flow.u.ip6.flowi6_oif = hsk->sock.sk_bound_dev_if; + peer->flow.u.ip6.flowi6_iif = LOOPBACK_IFINDEX; + peer->flow.u.ip6.flowi6_mark = hsk->sock.sk_mark; + peer->flow.u.ip6.flowi6_scope = RT_SCOPE_UNIVERSE; + peer->flow.u.ip6.flowi6_proto = hsk->sock.sk_protocol; + peer->flow.u.ip6.flowi6_flags = 0; + peer->flow.u.ip6.flowi6_secid = 0; + peer->flow.u.ip6.flowi6_tun_key.tun_id = 0; + peer->flow.u.ip6.flowi6_uid = hsk->sock.sk_uid; + peer->flow.u.ip6.daddr = peer->addr; + peer->flow.u.ip6.saddr = hsk->inet.pinet6->saddr; + peer->flow.u.ip6.fl6_dport = 0; + peer->flow.u.ip6.fl6_sport = 0; + peer->flow.u.ip6.mp_hash = 0; + peer->flow.u.ip6.__fl_common.flowic_tos = hsk->inet.tos; + peer->flow.u.ip6.flowlabel = ip6_make_flowinfo(hsk->inet.tos, + 0); + security_sk_classify_flow(&hsk->sock, + &peer->flow.u.__fl_common); + dst =
ip6_dst_lookup_flow(sock_net(&hsk->sock), &hsk->sock, + &peer->flow.u.ip6, NULL); + if (IS_ERR(dst)) { + result = PTR_ERR(dst); + INC_METRIC(peer_route_errors, 1); + goto done; + } + peer->dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); } - peer->flow.u.ip6.flowi6_oif = hsk->sock.sk_bound_dev_if; - peer->flow.u.ip6.flowi6_iif = LOOPBACK_IFINDEX; - peer->flow.u.ip6.flowi6_mark = hsk->sock.sk_mark; - peer->flow.u.ip6.flowi6_scope = RT_SCOPE_UNIVERSE; - peer->flow.u.ip6.flowi6_proto = hsk->sock.sk_protocol; - peer->flow.u.ip6.flowi6_flags = 0; - peer->flow.u.ip6.flowi6_secid = 0; - peer->flow.u.ip6.flowi6_tun_key.tun_id = 0; - peer->flow.u.ip6.flowi6_uid = hsk->sock.sk_uid; - peer->flow.u.ip6.daddr = peer->addr; - peer->flow.u.ip6.saddr = hsk->inet.pinet6->saddr; - peer->flow.u.ip6.fl6_dport = 0; - peer->flow.u.ip6.fl6_sport = 0; - peer->flow.u.ip6.mp_hash = 0; - peer->flow.u.ip6.__fl_common.flowic_tos = hsk->inet.tos; - peer->flow.u.ip6.flowlabel = ip6_make_flowinfo(hsk->inet.tos, 0); - security_sk_classify_flow(&hsk->sock, &peer->flow.u.__fl_common); - return ip6_dst_lookup_flow(sock_net(&hsk->sock), &hsk->sock, - &peer->flow.u.ip6, NULL); + + /* From the standpoint of homa_get_dst, peer->dst is not updated + * atomically with peer->dst_cookie, which means homa_get_dst could + * use a new cookie with an old dest. Fortunately, this is benign; at + * worst, it might cause an obsolete dst to be reused (resulting in + * a lost packet) or a valid dst to be replaced (resulting in + * unnecessary work). + */ + dst_release(rcu_replace_pointer(peer->dst, dst, true)); + +done: + homa_peer_unlock(peer); + return result; } #ifndef __STRIP__ /* See strip.py */ +/** + * homa_unsched_priority() - Returns the priority level to use for + * unscheduled packets of a message. + * @homa: Overall data about the Homa protocol implementation. + * @peer: The destination of the message. + * @length: Number of bytes in the message. + * + * Return: A priority level. + */ +int homa_unsched_priority(struct homa *homa, struct homa_peer *peer, + int length) +{ + int i; + + for (i = homa->num_priorities - 1; ; i--) { + if (peer->unsched_cutoffs[i] >= length) + return i; + } + /* Can't ever get here */ +} + /** * homa_peer_set_cutoffs() - Set the cutoffs for unscheduled priorities in * a peer object. This is a convenience function used primarily by unit tests. @@ -665,18 +703,18 @@ void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, int c2, /** * homa_peer_lock_slow() - This function implements the slow path for - * acquiring a peer's @ack_lock. It is invoked when the lock isn't + * acquiring a peer's @lock. It is invoked when the lock isn't * immediately available. It waits for the lock, but also records statistics * about the waiting time. * @peer: Peer to lock. */ void homa_peer_lock_slow(struct homa_peer *peer) - __acquires(peer->ack_lock) + __acquires(peer->lock) { u64 start = homa_clock(); tt_record("beginning wait for peer lock"); - spin_lock_bh(&peer->ack_lock); + spin_lock_bh(&peer->lock); tt_record("ending wait for peer lock"); INC_METRIC(peer_ack_lock_misses, 1); INC_METRIC(peer_ack_lock_miss_cycles, homa_clock() - start); diff --git a/homa_peer.h b/homa_peer.h index 30a82d7a..d168b2eb 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -133,9 +133,27 @@ struct homa_peer_key { * have communicated with (either as client or server). */ struct homa_peer { + /** + * @addr: IPv6 address for the machine (IPv4 addresses are stored + * as IPv4-mapped IPv6 addresses). 
+ */ + struct in6_addr addr; + /** @ht_key: The hash table key for this peer in peertab->ht. */ struct homa_peer_key ht_key; + /** + * @refs: Number of unmatched calls to homa_peer_hold; it's not safe + * to free this object until the reference count is zero. + */ + atomic_t refs; + + /** + * @access_jiffies: Time in jiffies of most recent access to this + * peer. + */ + unsigned long access_jiffies; + /** * @ht_linkage: Used by rashtable implement to link this peer into * peertab->ht. @@ -146,31 +164,41 @@ struct homa_peer { struct list_head dead_links; /** - * @refs: Number of unmatched calls to homa_peer_hold; it's not safe - * to free this object until the reference count is zero. + * @lock: used to synchronize access to fields in this struct, such + * as @num_acks, @acks, @dst, and @dst_cookie. */ - atomic_t refs ____cacheline_aligned_in_smp; + spinlock_t lock ____cacheline_aligned_in_smp; /** - * @access_jiffies: Time in jiffies of most recent access to this - * peer. + * @num_acks: the number of (initial) entries in @acks that + * currently hold valid information. */ - unsigned long access_jiffies; + int num_acks; /** - * @addr: IPv6 address for the machine (IPv4 addresses are stored - * as IPv4-mapped IPv6 addresses). + * @acks: info about client RPCs whose results have been completely + * received. */ - struct in6_addr addr ____cacheline_aligned_in_smp; + struct homa_ack acks[HOMA_MAX_ACKS_PER_PKT]; - /** @flow: Addressing info needed to send packets. */ - struct flowi flow; + /** + * @dst: Used to route packets to this peer; this object owns a + * reference that must eventually be released. + */ + struct dst_entry __rcu *dst; /** - * @dst: Used to route packets to this peer; we own a reference - * to this, which we must eventually release. + * @dst_cookie: Used to check whether dst is still valid. This is + * accessed without synchronization, which is racy, but the worst + * that can happen is using an obsolete dst. */ - struct dst_entry *dst; + u32 dst_cookie; + + /** + * @flow: Addressing info used to create @dst and also required + * when transmitting packets. + */ + struct flowi flow; #ifndef __STRIP__ /* See strip.py */ /** @@ -260,27 +288,12 @@ struct homa_peer { * in the current pass, if it still needs one. */ struct homa_rpc *resend_rpc; - - /** - * @num_acks: the number of (initial) entries in @acks that - * currently hold valid information. - */ - int num_acks; - - /** - * @acks: info about client RPCs whose results have been completely - * received. - */ - struct homa_ack acks[HOMA_MAX_ACKS_PER_PKT]; - - /** - * @ack_lock: used to synchronize access to @num_acks and @acks.
- */ - spinlock_t ack_lock; }; void homa_dst_refresh(struct homa_peertab *peertab, struct homa_peer *peer, struct homa_sock *hsk); +struct dst_entry + *homa_get_dst(struct homa_peer *peer, struct homa_sock *hsk); void homa_peer_add_ack(struct homa_rpc *rpc); struct homa_peer *homa_peer_alloc(struct homa_sock *hsk, const struct in6_addr *addr); @@ -298,14 +311,13 @@ struct homa_peer *homa_peer_get(struct homa_sock *hsk, const struct in6_addr *addr); int homa_peer_get_acks(struct homa_peer *peer, int count, struct homa_ack *dst); -struct dst_entry - *homa_peer_get_dst(struct homa_peer *peer, struct homa_sock *hsk); int homa_peer_pick_victims(struct homa_peertab *peertab, struct homa_peer *victims[], int max_victims); int homa_peer_prefer_evict(struct homa_peertab *peertab, struct homa_peer *peer1, struct homa_peer *peer2); void homa_peer_rcu_callback(struct rcu_head *head); +int homa_peer_reset_dst(struct homa_peer *peer, struct homa_sock *hsk); void homa_peer_update_sysctl_deps(struct homa_peertab *peertab); #ifndef __STRIP__ /* See strip.py */ void homa_peer_lock_slow(struct homa_peer *peer); @@ -315,54 +327,36 @@ void homa_peer_set_cutoffs(struct homa_peer *peer, int c0, int c1, #ifndef __STRIP__ /* See strip.py */ /** - * homa_peer_lock() - Acquire the lock for a peer's @unacked_lock. If the lock - * isn't immediately available, record stats on the waiting time. + * homa_peer_lock() - Acquire the lock for a peer. If the lock isn't + * immediately available, record stats on the waiting time. * @peer: Peer to lock. */ static inline void homa_peer_lock(struct homa_peer *peer) - __acquires(peer->ack_lock) + __acquires(peer->lock) { - if (!spin_trylock_bh(&peer->ack_lock)) + if (!spin_trylock_bh(&peer->lock)) homa_peer_lock_slow(peer); } #else /* See strip.py */ /** - * homa_peer_lock() - Acquire the lock for a peer's @ack_lock. + * homa_peer_lock() - Acquire the lock for a peer. * @peer: Peer to lock. */ static inline void homa_peer_lock(struct homa_peer *peer) - __acquires(peer->ack_lock) + __acquires(peer->lock) { - spin_lock_bh(&peer->ack_lock); + spin_lock_bh(&peer->lock); } #endif /* See strip.py */ /** - * homa_peer_unlock() - Release the lock for a peer's @unacked_lock. + * homa_peer_unlock() - Release the lock for a peer. * @peer: Peer to lock. */ static inline void homa_peer_unlock(struct homa_peer *peer) - __releases(peer->ack_lock) -{ - spin_unlock_bh(&peer->ack_lock); -} - -/** - * homa_get_dst() - Returns destination information associated with a peer, - * updating it if the cached information is stale. - * @peer: Peer whose destination information is desired. - * @hsk: Homa socket; needed by lower-level code to recreate the dst. - * Return: Up-to-date destination for peer; a reference has been taken - * on this dst_entry, which the caller must eventually release. 
- */ -static inline struct dst_entry *homa_get_dst(struct homa_peer *peer, - struct homa_sock *hsk) + __releases(peer->lock) { - if (unlikely(peer->dst->obsolete && - !peer->dst->ops->check(peer->dst, 0))) - homa_dst_refresh(hsk->homa->peertab, peer, hsk); - dst_hold(peer->dst); - return peer->dst; + spin_unlock_bh(&peer->lock); } /** diff --git a/test/mock.c b/test/mock.c index 4df5249b..60d5bfa3 100644 --- a/test/mock.c +++ b/test/mock.c @@ -721,6 +721,13 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len) return 0; } +struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) +{ + if (mock_check_error(&mock_dst_check_errors)) + return NULL; + return dst; +} + struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst) { @@ -815,6 +822,13 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) return 0; } +struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) +{ + if (mock_check_error(&mock_dst_check_errors)) + return NULL; + return dst; +} + unsigned int ipv4_mtu(const struct dst_entry *dst) { return mock_mtu; diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index 10acee25..ded8f3ec 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -602,6 +602,7 @@ TEST_F(homa_peer, homa_peer_alloc__route_error) #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(1, homa_metrics_per_cpu()->peer_route_errors); + EXPECT_EQ(0, homa_metrics_per_cpu()->peer_allocs); #endif /* See strip.py */ } @@ -635,7 +636,7 @@ TEST_F(homa_peer, homa_peer_free__nonzero_ref_count) kfree(peer); } -TEST_F(homa_peer, homa_peer_find__basics) +TEST_F(homa_peer, homa_peer_get__basics) { struct homa_peer *peer, *peer2; @@ -674,7 +675,7 @@ TEST_F(homa_peer, homa_peer_find__basics) homa_peer_release(peer); homa_peer_release(peer2); } -TEST_F(homa_peer, homa_peer_find__error_in_homa_peer_alloc) +TEST_F(homa_peer, homa_peer_get__error_in_homa_peer_alloc) { struct homa_peer *peer; @@ -686,7 +687,7 @@ TEST_F(homa_peer, homa_peer_find__error_in_homa_peer_alloc) EXPECT_EQ(1, homa_metrics_per_cpu()->peer_route_errors); #endif /* See strip.py */ } -TEST_F(homa_peer, homa_peer_find__insert_error) +TEST_F(homa_peer, homa_peer_get__insert_error) { struct homa_peer *peer; @@ -695,7 +696,7 @@ TEST_F(homa_peer, homa_peer_find__insert_error) EXPECT_TRUE(IS_ERR(peer)); EXPECT_EQ(EINVAL, -PTR_ERR(peer)); } -TEST_F(homa_peer, homa_peer_find__conflicting_create) +TEST_F(homa_peer, homa_peer_get__conflicting_create) { struct homa_peer *peer; @@ -713,55 +714,53 @@ TEST_F(homa_peer, homa_peer_find__conflicting_create) EXPECT_EQ(1, self->hnet->num_peers); } -TEST_F(homa_peer, homa_dst_refresh__basics) +TEST_F(homa_peer, homa_get_dst__normal) { - struct dst_entry *old_dst; - struct homa_peer *peer; - - peer = homa_peer_get(&self->hsk, ip1111); - ASSERT_NE(NULL, peer); - EXPECT_EQ_IP(*ip1111, peer->addr); + struct homa_peer *peer = homa_peer_get(&self->hsk, &ip1111[0]); + struct dst_entry *dst; - old_dst = peer->dst; - homa_dst_refresh(self->homa.peertab, peer, &self->hsk); - EXPECT_NE(old_dst, peer->dst); + dst = homa_get_dst(peer, &self->hsk); + EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); + IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->peer_dst_refreshes)); + dst_release(dst); homa_peer_release(peer); } -TEST_F(homa_peer, homa_dst_refresh__routing_error) +TEST_F(homa_peer, homa_get_dst__must_refresh_obsolete) { - struct dst_entry *old_dst; - struct homa_peer *peer; - - peer = 
homa_peer_get(&self->hsk, ip1111); - ASSERT_NE(NULL, peer); - EXPECT_EQ_IP(*ip1111, peer->addr); + struct homa_peer *peer = homa_peer_get(&self->hsk, &ip1111[0]); + struct dst_entry *old, *dst; - old_dst = peer->dst; - mock_route_errors = 1; - homa_dst_refresh(self->homa.peertab, peer, &self->hsk); - EXPECT_EQ(old_dst, peer->dst); -#ifndef __STRIP__ /* See strip.py */ - EXPECT_EQ(1, homa_metrics_per_cpu()->peer_route_errors); -#endif /* See strip.py */ + old = peer->dst; + peer->dst->obsolete = 1; + mock_dst_check_errors = 1; + dst = homa_get_dst(peer, &self->hsk); + EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); + IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->peer_dst_refreshes)); + EXPECT_NE(old, dst); + dst_release(dst); homa_peer_release(peer); } - -#ifndef __STRIP__ /* See strip.py */ -TEST_F(homa_peer, homa_unsched_priority) +TEST_F(homa_peer, homa_get_dst__multiple_refresh_failures) { - struct homa_peer peer; - - homa_peer_set_cutoffs(&peer, INT_MAX, 0, 0, INT_MAX, 200, 100, 0, 0); + struct homa_peer *peer = homa_peer_get(&self->hsk, &ip1111[0]); + struct dst_entry *old, *dst; - EXPECT_EQ(5, homa_unsched_priority(&self->homa, &peer, 10)); - EXPECT_EQ(4, homa_unsched_priority(&self->homa, &peer, 200)); - EXPECT_EQ(3, homa_unsched_priority(&self->homa, &peer, 201)); + old = peer->dst; + peer->dst->obsolete = 1; + mock_dst_check_errors = 0xf; + mock_route_errors = 0xf; + dst = homa_get_dst(peer, &self->hsk); + EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); + IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->peer_dst_refreshes)); + EXPECT_EQ(old, dst); + EXPECT_EQ(3, mock_dst_check_errors); + dst_release(dst); + homa_peer_release(peer); } -#endif /* See strip.py */ -TEST_F(homa_peer, homa_peer_get_dst__ipv4) +TEST_F(homa_peer, homa_peer_reset_dst__ipv4) { - struct dst_entry *dst; + int status; // Make sure the test uses IPv4. mock_ipv6 = false; @@ -772,17 +771,38 @@ TEST_F(homa_peer, homa_peer_get_dst__ipv4) &self->client_ip[0]); ASSERT_NE(NULL, peer); - dst = homa_peer_get_dst(peer, &self->hsk); - ASSERT_NE(NULL, dst); - dst_release(dst); + status = homa_peer_reset_dst(peer, &self->hsk); + ASSERT_EQ(0, -status); + ASSERT_NE(NULL, peer->dst); EXPECT_STREQ("196.168.0.1", homa_print_ipv4_addr(peer->flow.u.ip4.daddr)); homa_peer_release(peer); } -TEST_F(homa_peer, homa_peer_get_dst__ipv6) +TEST_F(homa_peer, homa_peer_reset_dst__ipv4_route_error) +{ + struct dst_entry *old; + int status; + + // Make sure the test uses IPv4. + mock_ipv6 = false; + unit_sock_destroy(&self->hsk); + mock_sock_init(&self->hsk, self->hnet, 0); + + struct homa_peer *peer = homa_peer_get(&self->hsk, + &self->client_ip[0]); + ASSERT_NE(NULL, peer); + old = peer->dst; + + mock_route_errors = 1; + status = homa_peer_reset_dst(peer, &self->hsk); + EXPECT_EQ(EHOSTUNREACH, -status); + EXPECT_EQ(old, peer->dst); + homa_peer_release(peer); +} +TEST_F(homa_peer, homa_peer_reset_dst__ipv6) { - struct dst_entry *dst; char buffer[30]; + int status; u32 addr; // Make sure the test uses IPv6. 
@@ -793,9 +813,8 @@ TEST_F(homa_peer, homa_peer_get_dst__ipv6) struct homa_peer *peer = homa_peer_get(&self->hsk, &ip1111[0]); ASSERT_NE(NULL, peer); - dst = homa_peer_get_dst(peer, &self->hsk); - ASSERT_NE(NULL, dst); - dst_release(dst); + status = homa_peer_reset_dst(peer, &self->hsk); + ASSERT_EQ(0, -status); addr = ntohl(peer->flow.u.ip4.daddr); snprintf(buffer, sizeof(buffer), "%u.%u.%u.%u", (addr >> 24) & 0xff, (addr >> 16) & 0xff, (addr >> 8) & 0xff, addr & 0xff); @@ -803,8 +822,39 @@ TEST_F(homa_peer, homa_peer_get_dst__ipv6) homa_print_ipv6_addr(&peer->flow.u.ip6.daddr)); homa_peer_release(peer); } +TEST_F(homa_peer, homa_peer_reset_dst__ipv6_route_error) +{ + struct dst_entry *old; + int status; + + // Make sure the test uses IPv6. + mock_ipv6 = true; + unit_sock_destroy(&self->hsk); + mock_sock_init(&self->hsk, self->hnet, 0); + + struct homa_peer *peer = homa_peer_get(&self->hsk, &ip1111[0]); + ASSERT_NE(NULL, peer); + old = peer->dst; + + mock_route_errors = 1; + status = homa_peer_reset_dst(peer, &self->hsk); + EXPECT_EQ(EHOSTUNREACH, -status); + EXPECT_EQ(old, peer->dst); + homa_peer_release(peer); +} #ifndef __STRIP__ /* See strip.py */ +TEST_F(homa_peer, homa_unsched_priority) +{ + struct homa_peer peer; + + homa_peer_set_cutoffs(&peer, INT_MAX, 0, 0, INT_MAX, 200, 100, 0, 0); + + EXPECT_EQ(5, homa_unsched_priority(&self->homa, &peer, 10)); + EXPECT_EQ(4, homa_unsched_priority(&self->homa, &peer, 200)); + EXPECT_EQ(3, homa_unsched_priority(&self->homa, &peer, 201)); +} + TEST_F(homa_peer, homa_peer_lock_slow) { struct homa_peer *peer = homa_peer_get(&self->hsk, ip3333); @@ -920,30 +970,3 @@ TEST_F(homa_peer, homa_peer_update_sysctl_deps) EXPECT_EQ(10*HZ, peertab->idle_jiffies_min); EXPECT_EQ(100*HZ, peertab->idle_jiffies_max); } - -/* Functions in homa_peer.h: */ - -TEST_F(homa_peer, homa_get_dst__normal) -{ - struct homa_peer *peer = homa_peer_get(&self->hsk, &ip1111[0]); - struct dst_entry *dst; - - dst = homa_get_dst(peer, &self->hsk); - EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); - IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->peer_dst_refreshes)); - dst_release(dst); - homa_peer_release(peer); -} -TEST_F(homa_peer, homa_get_dst__must_refresh) -{ - struct homa_peer *peer = homa_peer_get(&self->hsk, &ip1111[0]); - struct dst_entry *dst; - - peer->dst->obsolete = 1; - mock_dst_check_errors = 1; - dst = homa_get_dst(peer, &self->hsk); - EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); - IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->peer_dst_refreshes)); - dst_release(dst); - homa_peer_release(peer); -} From b8256c77ac22aa587070ddc73b5f5cd661f97f9b Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 27 Aug 2025 14:26:09 -0700 Subject: [PATCH 455/625] Eliminate homa_peer->addr Use the address in the ht_key field instead. --- homa_peer.c | 1 - homa_peer.h | 21 ++++++++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/homa_peer.c b/homa_peer.c index cc9adc62..7e5b5ac9 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -417,7 +417,6 @@ struct homa_peer *homa_peer_alloc(struct homa_sock *hsk, INC_METRIC(peer_kmalloc_errors, 1); return (struct homa_peer *)ERR_PTR(-ENOMEM); } - peer->addr = *addr; peer->ht_key.addr = *addr; peer->ht_key.hnet = hsk->hnet; atomic_set(&peer->refs, 1); diff --git a/homa_peer.h b/homa_peer.h index d168b2eb..bee107a0 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -120,7 +120,8 @@ struct homa_peertab { struct homa_peer_key { /** * @addr: Address of the desired host. 
IPv4 addresses are represented - * with IPv4-mapped IPv6 addresses. + * with IPv4-mapped IPv6 addresses. Must be the first variable in + * the struct, because of union in homa_peer. */ struct in6_addr addr; @@ -133,14 +134,16 @@ struct homa_peer_key { * have communicated with (either as client or server). */ struct homa_peer { - /** - * @addr: IPv6 address for the machine (IPv4 addresses are stored - * as IPv4-mapped IPv6 addresses). - */ - struct in6_addr addr; - - /** @ht_key: The hash table key for this peer in peertab->ht. */ - struct homa_peer_key ht_key; + union { + /** + * @addr: IPv6 address for the machine (IPv4 addresses are + * stored as IPv4-mapped IPv6 addresses). + */ + struct in6_addr addr; + + /** @ht_key: The hash table key for this peer in peertab->ht. */ + struct homa_peer_key ht_key; + }; /** * @refs: Number of unmatched calls to homa_peer_hold; it's not safe From bb792b77443a6ed3cc1c84d38956913d604a18ca Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 27 Aug 2025 16:15:52 -0700 Subject: [PATCH 456/625] Use reference counts more cleanly to manage homa_peers * Switch from atomic_t to refcount_t for homa_peer->refs. * Keep a reference on a peer for the entry in peertab->ht. * This eliminates the need for the dead_peers mechanism; just call_rcu in homa_peer_release when the reference count becomes 0. --- homa_peer.c | 108 ++++++-------------------------- homa_peer.h | 41 ++++--------- test/mock.c | 1 + test/unit_homa_peer.c | 140 +++++------------------------------------- test/unit_homa_rpc.c | 4 +- test/utils.c | 15 ----- test/utils.h | 1 - 7 files changed, 51 insertions(+), 259 deletions(-) diff --git a/homa_peer.c b/homa_peer.c index 7e5b5ac9..625b552d 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -90,7 +90,6 @@ struct homa_peertab *homa_peer_alloc_peertab(void) } peertab->ht_valid = true; rhashtable_walk_enter(&peertab->ht, &peertab->ht_iter); - INIT_LIST_HEAD(&peertab->dead_peers); peertab->gc_threshold = 5000; peertab->net_max = 10000; peertab->idle_secs_min = 10; @@ -143,7 +142,7 @@ void homa_peer_free_net(struct homa_net *hnet) continue; if (rhashtable_remove_fast(&peertab->ht, &peer->ht_linkage, ht_params) == 0) { - homa_peer_free(peer); + homa_peer_release(peer); hnet->num_peers--; peertab->num_peers--; } @@ -159,24 +158,21 @@ void homa_peer_free_net(struct homa_net *hnet) } /** - * homa_peer_free_fn() - This function is invoked for each entry in + * homa_peer_release_fn() - This function is invoked for each entry in * the peer hash table by the rhashtable code when the table is being * deleted. It frees its argument. - * @object: struct homa_peer to free. + * @object: homa_peer to free. * @dummy: Not used. */ -void homa_peer_free_fn(void *object, void *dummy) +void homa_peer_release_fn(void *object, void *dummy) { struct homa_peer *peer = object; - homa_peer_free(peer); + homa_peer_release(peer); } /** - * homa_peer_free_peertab() - Destructor for homa_peertabs. After this - * function returns, it is unsafe to use any results from previous calls - * to homa_peer_get, since all existing homa_peer objects will have been - * destroyed. + * homa_peer_free_peertab() - Destructor for homa_peertabs. * @peertab: The table to destroy. 
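The release path implied by the last bullet might look like the following sketch (the authoritative definition is in homa_peer.h as modified by this patch; this just shows the shape of the mechanism): dropping the final reference defers the actual free to an RCU grace period, so concurrent readers scanning peertab->ht can safely finish with the peer:

static inline void homa_peer_release(struct homa_peer *peer)
{
	if (refcount_dec_and_test(&peer->refs))
		call_rcu(&peer->rcu_head, homa_peer_free);
}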
*/ void homa_peer_free_peertab(struct homa_peertab *peertab) @@ -187,11 +183,9 @@ void homa_peer_free_peertab(struct homa_peertab *peertab) if (peertab->ht_valid) { rhashtable_walk_exit(&peertab->ht_iter); - rhashtable_free_and_destroy(&peertab->ht, homa_peer_free_fn, + rhashtable_free_and_destroy(&peertab->ht, homa_peer_release_fn, NULL); } - while (!list_empty(&peertab->dead_peers)) - homa_peer_free_dead(peertab); #ifndef __STRIP__ /* See strip.py */ if (peertab->sysctl_header) { unregister_net_sysctl_table(peertab->sysctl_header); @@ -201,48 +195,6 @@ void homa_peer_free_peertab(struct homa_peertab *peertab) kfree(peertab); } -/** - * homa_peer_rcu_callback() - This function is invoked as the callback - * for an invocation of call_rcu. It just marks a peertab to indicate that - * it was invoked. - * @head: Contains information used to locate the peertab. - */ -void homa_peer_rcu_callback(struct rcu_head *head) -{ - struct homa_peertab *peertab; - - peertab = container_of(head, struct homa_peertab, rcu_head); - atomic_set(&peertab->call_rcu_pending, 0); -} - -/** - * homa_peer_free_dead() - Release peers on peertab->dead_peers - * if possible. - * @peertab: Check the dead peers here. - */ -void homa_peer_free_dead(struct homa_peertab *peertab) - __must_hold(peertab->lock) -{ - struct homa_peer *peer, *tmp; - - /* A dead peer can be freed only if: - * (a) there are no call_rcu calls pending (if there are, it's - * possible that a new reference might get created for the - * peer) - * (b) the peer's reference count is zero. - */ - if (atomic_read(&peertab->call_rcu_pending)) - return; - list_for_each_entry_safe(peer, tmp, &peertab->dead_peers, dead_links) { - if (atomic_read(&peer->refs) == 0) { - tt_record1("homa_peer_free_dead freeing homa_peer 0x%x", - tt_addr(peer->addr)); - list_del_init(&peer->dead_links); - homa_peer_free(peer); - } - } -} - /** * homa_peer_prefer_evict() - Given two peers, determine which one is * a better candidate for eviction. @@ -369,10 +321,7 @@ void homa_peer_gc(struct homa_peertab *peertab) spin_lock_bh(&peertab->lock); if (peertab->gc_stop_count != 0) goto done; - if (!list_empty(&peertab->dead_peers)) - homa_peer_free_dead(peertab); - if (atomic_read(&peertab->call_rcu_pending) || - peertab->num_peers < peertab->gc_threshold) + if (peertab->num_peers < peertab->gc_threshold) goto done; num_victims = homa_peer_pick_victims(peertab, victims, EVICT_BATCH_SIZE); @@ -384,15 +333,13 @@ void homa_peer_gc(struct homa_peertab *peertab) if (rhashtable_remove_fast(&peertab->ht, &peer->ht_linkage, ht_params) == 0) { - list_add_tail(&peer->dead_links, &peertab->dead_peers); + homa_peer_release(peer); peertab->num_peers--; peer->ht_key.hnet->num_peers--; tt_record1("homa_peer_gc removed homa_peer 0x%x", tt_addr(peer->addr)); } } - atomic_set(&peertab->call_rcu_pending, 1); - call_rcu(&peertab->rcu_head, homa_peer_rcu_callback); done: spin_unlock_bh(&peertab->lock); } @@ -419,9 +366,8 @@ struct homa_peer *homa_peer_alloc(struct homa_sock *hsk, } peer->ht_key.addr = *addr; peer->ht_key.hnet = hsk->hnet; - atomic_set(&peer->refs, 1); + refcount_set(&peer->refs, 1); peer->access_jiffies = jiffies; - INIT_LIST_HEAD(&peer->dead_links); spin_lock_init(&peer->lock); #ifndef __STRIP__ /* See strip.py */ peer->unsched_cutoffs[HOMA_MAX_PRIORITIES - 1] = 0; @@ -444,31 +390,16 @@ struct homa_peer *homa_peer_alloc(struct homa_sock *hsk, /** * homa_peer_free() - Release any resources in a peer and free the homa_peer - * struct. - * @peer: Structure to free. 
Must not currently be linked into - * peertab->ht. + * struct. Invoked by the RCU mechanism via homa_peer_release. + * @head: Pointer to the rcu_head field of the peer to free. */ -void homa_peer_free(struct homa_peer *peer) +void homa_peer_free(struct rcu_head *head) { - dst_release(peer->dst); + struct homa_peer *peer; - if (atomic_read(&peer->refs) == 0) - kfree(peer); - else { -#ifdef __UNIT_TEST__ - if (!mock_peer_free_no_fail) - FAIL(" %s found peer %s with reference count %d", - __func__, homa_print_ipv6_addr(&peer->addr), - atomic_read(&peer->refs)); - else - UNIT_LOG("; ", "peer %s has reference count %d", - homa_print_ipv6_addr(&peer->addr), - atomic_read(&peer->refs)); -#else /* __UNIT_TEST__ */ - WARN(1, "%s found peer with reference count %d", - __func__, atomic_read(&peer->refs)); -#endif /* __UNIT_TEST__ */ - } + peer = container_of(head, struct homa_peer, rcu_head); + dst_release(peer->dst); + kfree(peer); } /** @@ -513,18 +444,17 @@ struct homa_peer *homa_peer_get(struct homa_sock *hsk, if (IS_ERR(other)) { /* Couldn't insert; return the error info. */ homa_peer_release(peer); - homa_peer_free(peer); peer = other; } else if (other) { /* Someone else already created the desired peer; use that * one instead of ours. */ homa_peer_release(peer); - homa_peer_free(peer); + homa_peer_hold(other); peer = other; - homa_peer_hold(peer); peer->access_jiffies = jiffies; } else { + homa_peer_hold(peer); peertab->num_peers++; key.hnet->num_peers++; } diff --git a/homa_peer.h b/homa_peer.h index bee107a0..3d50dd98 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -40,25 +40,9 @@ struct homa_peertab { */ bool ht_valid; - /** - * @dead_peers: List of peers that have been removed from ht - * but can't yet be freed (because they have nonzero reference - * counts or an rcu sync point hasn't been reached). - */ - struct list_head dead_peers; - /** @rcu_head: Holds state of a pending call_rcu invocation. */ struct rcu_head rcu_head; - /** - * @call_rcu_pending: Nonzero means that call_rcu has been - * invoked but it has not invoked the callback function; until the - * callback has been invoked we can't free peers on dead_peers or - * invoke call_rcu again (which means we can't add more peers to - * dead_peers). - */ - atomic_t call_rcu_pending; - /** * @gc_stop_count: Nonzero means that peer garbage collection * should not be performed (conflicting state changes are underway). @@ -146,10 +130,12 @@ struct homa_peer { }; /** - * @refs: Number of unmatched calls to homa_peer_hold; it's not safe - * to free this object until the reference count is zero. + * @refs: Number of outstanding references to this peer. Includes + * one reference for the entry in peertab->ht, plus one for each + * unmatched call to homa_peer_hold; the peer gets freed when + * this value becomes zero. */ - atomic_t refs; + refcount_t refs; /** * @access_jiffies: Time in jiffies of most recent access to this @@ -163,9 +149,6 @@ struct homa_peer { */ struct rhash_head ht_linkage; - /** @dead_links: Used to link this peer into peertab->dead_peers. */ - struct list_head dead_links; - /** * @lock: used to synchronize access to fields in this struct, such * as @num_acks, @acks, @dst, and @dst_cookie. @@ -291,6 +274,9 @@ struct homa_peer { * in the current pass, if it still needs one. */ struct homa_rpc *resend_rpc; + + /** @rcu_head: Holds state of a pending call_rcu invocation. 
*/ + struct rcu_head rcu_head; }; void homa_dst_refresh(struct homa_peertab *peertab, @@ -304,9 +290,7 @@ struct homa_peertab *homa_peer_alloc_peertab(void); int homa_peer_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -void homa_peer_free(struct homa_peer *peer); -void homa_peer_free_dead(struct homa_peertab *peertab); -void homa_peer_free_fn(void *object, void *dummy); +void homa_peer_free(struct rcu_head *head); void homa_peer_free_net(struct homa_net *hnet); void homa_peer_free_peertab(struct homa_peertab *peertab); void homa_peer_gc(struct homa_peertab *peertab); @@ -319,7 +303,7 @@ int homa_peer_pick_victims(struct homa_peertab *peertab, int homa_peer_prefer_evict(struct homa_peertab *peertab, struct homa_peer *peer1, struct homa_peer *peer2); -void homa_peer_rcu_callback(struct rcu_head *head); +void homa_peer_release_fn(void *object, void *dummy); int homa_peer_reset_dst(struct homa_peer *peer, struct homa_sock *hsk); void homa_peer_update_sysctl_deps(struct homa_peertab *peertab); #ifndef __STRIP__ /* See strip.py */ @@ -369,7 +353,7 @@ static inline void homa_peer_unlock(struct homa_peer *peer) */ static inline void homa_peer_hold(struct homa_peer *peer) { - atomic_inc(&peer->refs); + refcount_inc(&peer->refs); } /** @@ -380,7 +364,8 @@ static inline void homa_peer_hold(struct homa_peer *peer) */ static inline void homa_peer_release(struct homa_peer *peer) { - atomic_dec(&peer->refs); + if (refcount_dec_and_test(&peer->refs)) + call_rcu(&peer->rcu_head, homa_peer_free); } /** diff --git a/test/mock.c b/test/mock.c index 60d5bfa3..f80a64f2 100644 --- a/test/mock.c +++ b/test/mock.c @@ -368,6 +368,7 @@ void BUG_func(void) void call_rcu(struct rcu_head *head, void free_func(struct rcu_head *head)) { unit_log_printf("; ", "call_rcu invoked"); + free_func(head); } bool cancel_work_sync(struct work_struct *work) diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index ded8f3ec..6890dc3f 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -161,11 +161,11 @@ TEST_F(homa_peer, homa_peer_free_net__set_gc_stop_count) homa_peer_free_net(self->hnet); EXPECT_EQ(0, unit_count_peers(&self->homa)); - EXPECT_STREQ("gc_stop_count 4", unit_log_get()); + EXPECT_SUBSTR("gc_stop_count 4", unit_log_get()); EXPECT_EQ(3, self->homa.peertab->gc_stop_count); } -TEST_F(homa_peer, homa_peer_free_fn) +TEST_F(homa_peer, homa_peer_release_fn) { struct homa_peer *peer; struct dst_entry *dst; @@ -176,97 +176,31 @@ TEST_F(homa_peer, homa_peer_free_fn) EXPECT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); homa_peer_release(peer); - homa_peer_free_fn(peer, NULL); + homa_peer_release_fn(peer, NULL); EXPECT_EQ(1, atomic_read(&dst->__rcuref.refcnt)); dst_release(dst); } -TEST_F(homa_peer, homa_peer_free_peertab__basics) { +TEST_F(homa_peer, homa_peer_free_peertab) { struct homa_peer *peer; + /* Create two peers, release one before destroying the table, the + * other after (test infrastructure will detect improper freeing). 
+ */ peer = homa_peer_get(&self->hsk, ip1111); homa_peer_release(peer); peer = homa_peer_get(&self->hsk, ip2222); - mock_peer_free_no_fail = 1; unit_log_clear(); homa_peer_free_peertab(self->homa.peertab); #ifndef __STRIP__ /* See strip.py */ - EXPECT_STREQ("peer [2::2:2:2] has reference count 1; " - "unregister_net_sysctl_table", unit_log_get()); -#else /* See strip.py */ - EXPECT_STREQ("peer [2::2:2:2] has reference count 1", unit_log_get()); + EXPECT_SUBSTR("unregister_net_sysctl_table", unit_log_get()); #endif /* See strip.py */ - kfree(peer); - self->homa.peertab = homa_peer_alloc_peertab(); -} -TEST_F(homa_peer, homa_peer_free_peertab__free_dead_peers) { - struct homa_peertab *peertab = self->homa.peertab; - struct homa_peer *peer; - - jiffies = 100; - peer = homa_peer_get(&self->hsk, ip1111); homa_peer_release(peer); - peer = homa_peer_get(&self->hsk, ip2222); - homa_peer_release(peer); - - jiffies = peertab->idle_jiffies_max + 1000; - peertab->num_peers = peertab->gc_threshold + 100; - homa_peer_gc(peertab); - EXPECT_EQ(2, unit_list_length(&peertab->dead_peers)); - - homa_peer_rcu_callback(&peertab->rcu_head); - homa_peer_free_peertab(self->homa.peertab); - - /* Can't check explicitly for problems (peertab is gone now), but - * end-of-test checks will complain if the peers weren't freed. - */ self->homa.peertab = homa_peer_alloc_peertab(); } -TEST_F(homa_peer, homa_peer_rcu_callback) { - atomic_set(&self->homa.peertab->call_rcu_pending, 4); - homa_peer_rcu_callback(&self->homa.peertab->rcu_head); - EXPECT_EQ(0, atomic_read(&self->homa.peertab->call_rcu_pending)); -} - -TEST_F(homa_peer, homa_peer_free_dead) { - struct homa_peertab *peertab = self->homa.peertab; - struct homa_peer *peer1, *peer2; - - peer1 = homa_peer_alloc(&self->hsk, ip1111); - peer2 = homa_peer_alloc(&self->hsk, ip2222); - - list_add_tail(&peer1->dead_links, &peertab->dead_peers); - list_add_tail(&peer2->dead_links, &peertab->dead_peers); - unit_log_clear(); - unit_log_dead_peers(&self->homa); - EXPECT_STREQ("[1::1:1:1]; [2::2:2:2]", unit_log_get()); - - /* First call: RCU pending. */ - atomic_set(&peertab->call_rcu_pending, 1); - homa_peer_free_dead(peertab); - unit_log_clear(); - unit_log_dead_peers(&self->homa); - EXPECT_STREQ("[1::1:1:1]; [2::2:2:2]", unit_log_get()); - - /* Second call: peers have nonzero reference counts. */ - atomic_set(&peertab->call_rcu_pending, 0); - homa_peer_free_dead(peertab); - unit_log_clear(); - unit_log_dead_peers(&self->homa); - EXPECT_STREQ("[1::1:1:1]; [2::2:2:2]", unit_log_get()); - - /* Third call: all reference counts zero. 
*/ - homa_peer_release(peer1); - homa_peer_release(peer2); - homa_peer_free_dead(peertab); - unit_log_clear(); - unit_log_dead_peers(&self->homa); - EXPECT_STREQ("", unit_log_get()); -} - TEST_F(homa_peer, homa_peer_prefer_evict) { struct homa_peertab *peertab = self->homa.peertab; @@ -478,18 +412,9 @@ TEST_F(homa_peer, homa_peer_gc__basics) unit_log_clear(); homa_peer_gc(peertab); - unit_log_dead_peers(&self->homa); - EXPECT_STREQ("call_rcu invoked; [1::1:1:1]", unit_log_get()); - EXPECT_EQ(1, atomic_read(&peertab->call_rcu_pending)); + EXPECT_STREQ("call_rcu invoked", unit_log_get()); EXPECT_EQ(0, self->hnet->num_peers); EXPECT_EQ(peertab->gc_threshold - 1, peertab->num_peers); - - homa_peer_rcu_callback(&peertab->rcu_head); - unit_log_clear(); - homa_peer_gc(peertab); - unit_log_dead_peers(&self->homa); - EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(0, atomic_read(&peertab->call_rcu_pending)); } TEST_F(homa_peer, homa_peer_gc__gc_stop_count) { @@ -499,6 +424,7 @@ TEST_F(homa_peer, homa_peer_gc__gc_stop_count) jiffies = 300; peer = homa_peer_get(&self->hsk, ip1111); homa_peer_release(peer); + EXPECT_EQ(1, self->hnet->num_peers); jiffies = peertab->idle_jiffies_max + 1000; peertab->num_peers = peertab->gc_threshold; @@ -506,26 +432,8 @@ TEST_F(homa_peer, homa_peer_gc__gc_stop_count) unit_log_clear(); homa_peer_gc(peertab); - unit_log_dead_peers(&self->homa); - EXPECT_STREQ("", unit_log_get()); -} -TEST_F(homa_peer, homa_peer_gc__call_rcu_pending) -{ - struct homa_peertab *peertab = self->homa.peertab; - struct homa_peer *peer; - - jiffies = 300; - peer = homa_peer_get(&self->hsk, ip1111); - homa_peer_release(peer); - - jiffies = peertab->idle_jiffies_max + 1000; - peertab->num_peers = peertab->gc_threshold; - atomic_set(&peertab->call_rcu_pending, 1); - - unit_log_clear(); - homa_peer_gc(peertab); - unit_log_dead_peers(&self->homa); EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(1, self->hnet->num_peers); } TEST_F(homa_peer, homa_peer_gc__peers_below_gc_threshold) { @@ -541,7 +449,6 @@ TEST_F(homa_peer, homa_peer_gc__peers_below_gc_threshold) unit_log_clear(); homa_peer_gc(peertab); - unit_log_dead_peers(&self->homa); EXPECT_STREQ("", unit_log_get()); } TEST_F(homa_peer, homa_peer_gc__no_suitable_candidates) @@ -558,7 +465,6 @@ TEST_F(homa_peer, homa_peer_gc__no_suitable_candidates) unit_log_clear(); homa_peer_gc(peertab); - unit_log_dead_peers(&self->homa); EXPECT_STREQ("", unit_log_get()); } @@ -578,7 +484,6 @@ TEST_F(homa_peer, homa_peer_alloc__success) #endif /* See strip.py */ EXPECT_EQ(1, atomic_read(&peer->dst->__rcuref.refcnt)); homa_peer_release(peer); - homa_peer_free(peer); } TEST_F(homa_peer, homa_peer_alloc__kmalloc_error) { @@ -606,7 +511,7 @@ TEST_F(homa_peer, homa_peer_alloc__route_error) #endif /* See strip.py */ } -TEST_F(homa_peer, homa_peer_free__normal) +TEST_F(homa_peer, homa_peer_free) { struct homa_peer *peer; struct dst_entry *dst; @@ -618,23 +523,9 @@ TEST_F(homa_peer, homa_peer_free__normal) ASSERT_EQ(2, atomic_read(&dst->__rcuref.refcnt)); homa_peer_release(peer); - homa_peer_free(peer); ASSERT_EQ(1, atomic_read(&dst->__rcuref.refcnt)); dst_release(dst); } -TEST_F(homa_peer, homa_peer_free__nonzero_ref_count) -{ - struct homa_peer *peer; - - peer = homa_peer_alloc(&self->hsk, ip2222); - ASSERT_FALSE(IS_ERR(peer)); - mock_peer_free_no_fail = 1; - - unit_log_clear(); - homa_peer_free(peer); - EXPECT_STREQ("peer [2::2:2:2] has reference count 1", unit_log_get()); - kfree(peer); -} TEST_F(homa_peer, homa_peer_get__basics) { @@ -646,6 +537,7 @@ TEST_F(homa_peer, 
homa_peer_get__basics) ASSERT_FALSE(IS_ERR(peer)); EXPECT_EQ_IP(*ip1111, peer->addr); EXPECT_EQ(456, peer->access_jiffies); + EXPECT_EQ(2, refcount_read(&peer->refs)); #ifndef __STRIP__ /* See strip.py */ EXPECT_EQ(INT_MAX, peer->unsched_cutoffs[HOMA_MAX_PRIORITIES-2]); EXPECT_EQ(0, peer->cutoff_version); @@ -656,7 +548,7 @@ TEST_F(homa_peer, homa_peer_get__basics) /* Second call: lookup existing peer. */ peer2 = homa_peer_get(&self->hsk, ip1111); EXPECT_EQ(peer, peer2); - EXPECT_EQ(2, atomic_read(&peer->refs)); + EXPECT_EQ(3, refcount_read(&peer->refs)); EXPECT_EQ(1, self->homa.peertab->num_peers); EXPECT_EQ(1, self->hnet->num_peers); @@ -664,7 +556,7 @@ TEST_F(homa_peer, homa_peer_get__basics) peer2 = homa_peer_get(&self->hsk, ip2222); EXPECT_NE(peer, peer2); ASSERT_FALSE(IS_ERR(peer2)); - EXPECT_EQ(1, atomic_read(&peer2->refs)); + EXPECT_EQ(2, refcount_read(&peer2->refs)); EXPECT_EQ(2, self->homa.peertab->num_peers); EXPECT_EQ(2, self->hnet->num_peers); @@ -707,7 +599,7 @@ TEST_F(homa_peer, homa_peer_get__conflicting_create) peer = homa_peer_get(&self->hsk, ip3333); EXPECT_FALSE(IS_ERR(conflicting_peer)); EXPECT_EQ(conflicting_peer, peer); - EXPECT_EQ(1, atomic_read(&peer->refs)); + EXPECT_EQ(2, refcount_read(&peer->refs)); EXPECT_EQ(110, peer->access_jiffies); homa_peer_release(peer); EXPECT_EQ(1, self->homa.peertab->num_peers); diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index c4840d05..866228f5 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -811,11 +811,11 @@ TEST_F(homa_rpc, homa_rpc_reap__release_peer_ref) ASSERT_NE(NULL, crpc); peer = crpc->peer; - EXPECT_EQ(1, atomic_read(&peer->refs)); + EXPECT_EQ(2, refcount_read(&peer->refs)); homa_rpc_end(crpc); homa_rpc_reap(&self->hsk, false); - EXPECT_EQ(0, atomic_read(&peer->refs)); + EXPECT_EQ(1, refcount_read(&peer->refs)); EXPECT_EQ(NULL, crpc->peer); } #ifndef __STRIP__ /* See strip.py */ diff --git a/test/utils.c b/test/utils.c index 7c0d7b62..a5cab395 100644 --- a/test/utils.c +++ b/test/utils.c @@ -318,21 +318,6 @@ void unit_log_throttled(struct homa *homa) } } -/** - * unit_log_dead_peers() - Append to the test log the addresses of all - * peers in peertab->dead_peers for @homa. - * @homa: Homa's overall state. - */ -void unit_log_dead_peers(struct homa *homa) -{ - struct homa_peer *peer; - - list_for_each_entry(peer, &homa->peertab->dead_peers, dead_links) { - unit_log_printf("; ", "%s", - homa_print_ipv6_addr(&peer->ht_key.addr)); - } -} - /** * unit_print_gaps() - Returns a static string describing the gaps in an RPC. * @rpc: Log the gaps in this RPC. 
diff --git a/test/utils.h b/test/utils.h index 9a2390a9..16aa3e94 100644 --- a/test/utils.h +++ b/test/utils.h @@ -44,7 +44,6 @@ struct iov_iter *unit_iov_iter(void *buffer, size_t length); int unit_list_length(struct list_head *head); void unit_log_active_ids(struct homa_sock *hsk); -void unit_log_dead_peers(struct homa *homa); void unit_log_filled_skbs(struct sk_buff *skb, int verbose); void unit_log_frag_list(struct sk_buff *skb, int verbose); #ifndef __STRIP__ /* See strip.py */ From a85d27d2e10fbfcf3a14fa3cdd7badc5253b471c Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 28 Aug 2025 10:25:32 -0700 Subject: [PATCH 457/625] Fix bugs in metrics.py related to calculating usecs/op --- util/metrics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/util/metrics.py b/util/metrics.py index efa55386..b3b5aa37 100755 --- a/util/metrics.py +++ b/util/metrics.py @@ -332,7 +332,7 @@ def scale_number(number): if calls == 0: us_per = 0 else: - us_per = (time/calls)/1000 + us_per = (time/calls)/(cpu_khz*1e-3) print("") print("Polling in recv %6.2f %7.2f us/syscall" % (cores, us_per)) @@ -340,7 +340,7 @@ def scale_number(number): if calls == 0: us_per = 0 else: - us_per = (deltas["skb_alloc_cycles"]/calls)/1000 + us_per = (deltas["skb_alloc_cycles"]/calls)/(cpu_khz*1e-3) print("Skb allocation %6.2f %7.2f us/skb" % ( deltas["skb_alloc_cycles"]/time_delta, us_per)) @@ -348,7 +348,7 @@ def scale_number(number): if calls == 0: us_per = 0 else: - us_per = (deltas["skb_free_cycles"]/calls)/1000 + us_per = (deltas["skb_free_cycles"]/calls)/(cpu_khz*1e-3) print("Skb freeing %6.2f %7.2f us/skb" % ( deltas["skb_free_cycles"]/time_delta, us_per)) From 22cf40bddccbf8b63143ec5ccc436836151387ab Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 28 Aug 2025 11:00:40 -0700 Subject: [PATCH 458/625] Use ktime_get_ns in homa_clock in upstreamed version --- homa_impl.h | 30 +++++++++++++++--------------- homa_metrics.c | 8 ++++++-- perf.txt | 17 +++++++++++++++++ 3 files changed, 38 insertions(+), 17 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 8de96ee8..49c46e68 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -853,24 +853,24 @@ static inline struct homa_net *homa_net(struct net *net) */ static inline u64 homa_clock(void) { - /* As of May 2025 there does not appear to be a portable API that - * meets Homa's needs: - * - The Intel X86 TSC works well but is not portable. - * - sched_clock() does not guarantee monotonicity or consistency. - * - ktime_get_mono_fast_ns and ktime_get_raw_fast_ns are very slow - * (27 ns to read, vs 8 ns for TSC) - * Thus we use a hybrid approach that uses TSC (via get_cycles) where - * available (which should be just about everywhere Homa runs). + /* This function exists to make it easy to switch time sources + * if/when new or better sources become available. */ #ifdef __UNIT_TEST__ u64 mock_get_clock(void); return mock_get_clock(); #else /* __UNIT_TEST__ */ -#ifdef CONFIG_X86_TSC +#ifndef __UPSTREAM__ /* See strip.py */ + /* As of August 2025, get_cycles takes only about 8 ns/call, vs. + * 14 ns/call for ktime_get_ns. This saves about .04 core when + * driving a 25 Gbps network at high load (see perf.txt for details). + * Unfortunately, Linux reviewers will not allow get_cycles in the + * upstreamed version. 
+	 */
 	return get_cycles();
-#else
-	return ktime_get_mono_fast_ns();
-#endif /* CONFIG_X86_TSC */
+#else /* See strip.py */
+	return ktime_get_ns();
+#endif /* See strip.py */
 #endif /* __UNIT_TEST__ */
 }
 
@@ -884,11 +884,11 @@ static inline u64 homa_clock_khz(void)
 #ifdef __UNIT_TEST__
 	return 1000000;
 #else /* __UNIT_TEST__ */
-#ifdef CONFIG_X86_TSC
+#ifndef __UPSTREAM__ /* See strip.py */
 	return tsc_khz;
-#else
+#else /* See strip.py */
 	return 1000000;
-#endif /* CONFIG_X86_TSC */
+#endif /* See strip.py */
 #endif /* __UNIT_TEST__ */
 }
 
diff --git a/homa_metrics.c b/homa_metrics.c
index 6c85a944..22e31aeb 100644
--- a/homa_metrics.c
+++ b/homa_metrics.c
@@ -240,9 +240,13 @@ char *homa_metrics_print(void)
 	  "Time spent in homa_softirq during SoftIRQ\n");
 	M("bypass_softirq_cycles", m->bypass_softirq_cycles,
 	  "Time spent in homa_softirq during bypass from GRO\n");
-	M("linux_softirq_cycles", m->linux_softirq_cycles,
+
+	/* Adjust stats gathered in Linux: they always use rdtsc. */
+	M("linux_softirq_cycles", m->linux_softirq_cycles *
+	  (homa_clock_khz() / 1000) / (tsc_khz / 1000),
 	  "Time spent in all Linux SoftIRQ\n");
-	M("napi_cycles", m->napi_cycles,
+	M("napi_cycles", m->napi_cycles * (homa_clock_khz() / 1000) /
+	  (tsc_khz / 1000),
 	  "Time spent in NAPI-level packet handling\n");
 	M("send_cycles", m->send_cycles,
 	  "Time spent in homa_sendmsg for requests\n");
diff --git a/perf.txt b/perf.txt
index b4ecb49b..0512079e 100644
--- a/perf.txt
+++ b/perf.txt
@@ -2,6 +2,22 @@ This file contains various notes and lessons learned concerning performance
 of the Homa Linux kernel module. The notes are in reverse chronological
 order.
 
+62. (August 2025) Measured the cost of using ktime_get_ns (rdtscp) instead of
+get_cycles (rdtsc) in homa_clock (Linux reviewers won't allow get_cycles when
+upstreaming). rdtscp takes about 14 ns per call, vs. 8 ns for rdtsc. Running
+"w4 -b20" on xl170s, homa_clock is invoked about 21 M times/sec, so expect
+about .12 additional core to be used. Measurements on xl170 cluster (25 Gbps
+network) using "w4 -b20" (average across 6 nodes, then average over 5 runs):
+                        rdtsc    rdtscp    Ratio
+Gbps/core:               6.46      6.22    0.954
+Total core utilization:  6.20      6.44    1.038
+
+Same experiment but in overload ("w4 -b40"):
+                        rdtsc    rdtscp    Ratio
+Gbps/core:               5.44      5.32    0.980
+Total core utilization:  8.08      8.05    0.997
+Maximum throughput (Gbps): 21.95   21.42   0.976
+
 61. (July 2025) Client responses could starve server requests. This came
 about because a server request that wakes up after waiting for buffer
 space has 0 received bytes. In contrast, a new client response will have
 received
@@ -57,6 +73,7 @@ mode):
     Function                Units    Overhead
     -----------------------------------------------
     rdtsc                   cycles    8 ns
+    rdtscp                  cycles   14 ns
     sched_clock             ns        9 ns
     ktime_get_mono_fast_ns  ns       24 ns
     ktime_get_raw_fast_ns   ns       24 ns

From 13b9db5207fb315cef478a5b148238aa3d202d14 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Thu, 28 Aug 2025 20:59:08 -0700
Subject: [PATCH 459/625] Refactor homa_sock_init to minimize time when
 socktab lock is held

Also, release lock and call cond_resched when looping over default
port number.
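In outline, the port-probing loop now drops the socktab lock and calls
cond_resched() after each failed probe instead of spinning with the lock
held. A condensed sketch of the pattern (see the homa_sock.c diff below
for the full code):

	spin_lock_bh(&socktab->write_lock);
	while (1) {
		hnet->prev_default_port++;
		...
		other = homa_sock_find(hnet, hnet->prev_default_port);
		if (!other)
			break;		/* Free port found; lock still held. */
		sock_put(&other->sock);
		...
		spin_unlock_bh(&socktab->write_lock);
		cond_resched();		/* Let other work run on this core. */
		spin_lock_bh(&socktab->write_lock);
	}

Dropping the lock mid-loop is safe because the lock is never released
between a successful homa_sock_find check and the insertion of the new
socket into the table, so two sockets can't claim the same port.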
--- homa_sock.c | 58 +++++++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/homa_sock.c b/homa_sock.c index daf2113a..2e61c5ad 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -115,7 +115,7 @@ struct homa_sock *homa_socktab_next(struct homa_socktab_scan *scan) return NULL; success: - scan->hsk = hlist_entry(next, struct homa_sock, socktab_links); + scan->hsk = hlist_entry(next, struct homa_sock, socktab_links); sock_hold(&scan->hsk->sock); rcu_read_unlock(); return scan->hsk; @@ -173,34 +173,13 @@ int homa_sock_init(struct homa_sock *hsk) if (IS_ERR(buffer_pool)) return PTR_ERR(buffer_pool); - /* Initialize Homa-specific fields. */ + /* Initialize Homa-specific fields. We can initialize everything + * except the port and hash table links without acquiring the + * socket lock. + */ hsk->homa = homa; hsk->hnet = hnet; hsk->buffer_pool = buffer_pool; - - /* Pick a default port. Must keep the socktab locked from now - * until the new socket is added to the socktab, to ensure that - * no other socket chooses the same port. - */ - spin_lock_bh(&socktab->write_lock); - starting_port = hnet->prev_default_port; - while (1) { - hnet->prev_default_port++; - if (hnet->prev_default_port < HOMA_MIN_DEFAULT_PORT) - hnet->prev_default_port = HOMA_MIN_DEFAULT_PORT; - other = homa_sock_find(hnet, hnet->prev_default_port); - if (!other) - break; - sock_put(&other->sock); - if (hnet->prev_default_port == starting_port) { - spin_unlock_bh(&socktab->write_lock); - hsk->shutdown = true; - hsk->homa = NULL; - result = -EADDRNOTAVAIL; - goto error; - } - } - hsk->port = hnet->prev_default_port; hsk->inet.inet_num = hsk->port; hsk->inet.inet_sport = htons(hsk->port); @@ -230,6 +209,33 @@ int homa_sock_init(struct homa_sock *hsk) bucket->id = i + 1000000; INIT_HLIST_HEAD(&bucket->rpcs); } + + /* Pick a default port. Must keep the socktab locked from now + * until the new socket is added to the socktab, to ensure that + * no other socket chooses the same port. + */ + spin_lock_bh(&socktab->write_lock); + starting_port = hnet->prev_default_port; + while (1) { + hnet->prev_default_port++; + if (hnet->prev_default_port < HOMA_MIN_DEFAULT_PORT) + hnet->prev_default_port = HOMA_MIN_DEFAULT_PORT; + other = homa_sock_find(hnet, hnet->prev_default_port); + if (!other) + break; + sock_put(&other->sock); + if (hnet->prev_default_port == starting_port) { + spin_unlock_bh(&socktab->write_lock); + hsk->shutdown = true; + hsk->homa = NULL; + result = -EADDRNOTAVAIL; + goto error; + } + spin_unlock_bh(&socktab->write_lock); + cond_resched(); + spin_lock_bh(&socktab->write_lock); + } + hsk->port = hnet->prev_default_port; hlist_add_head_rcu(&hsk->socktab_links, &socktab->buckets[homa_socktab_bucket(hnet, hsk->port)]); From 95ef4b2d4015036e46255f3fb938c763113189ea Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 29 Aug 2025 11:21:07 -0700 Subject: [PATCH 460/625] Remove obsolete file util/test_time_trace.c --- util/Makefile | 2 +- util/test_time_trace.c | 29 ----------------------------- 2 files changed, 1 insertion(+), 30 deletions(-) delete mode 100644 util/test_time_trace.c diff --git a/util/Makefile b/util/Makefile index 9e60b497..5d79282e 100644 --- a/util/Makefile +++ b/util/Makefile @@ -4,7 +4,7 @@ CFLAGS := -Wall -Werror -fno-strict-aliasing -O3 -I.. 
BINS := buffer_client buffer_server cp_node dist_test dist_to_proto \ get_time_trace homa_prio homa_test inc_tput receive_raw scratch \ - send_raw server smi test_time_trace use_memory + send_raw server smi use_memory OBJS := $(patsubst %,%.o,$(BINS)) diff --git a/util/test_time_trace.c b/util/test_time_trace.c deleted file mode 100644 index 33be02b9..00000000 --- a/util/test_time_trace.c +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2019-2022 Homa Developers - * SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ - */ - -/* This program exercises the Linux kernel time trace mechanism - * by calling a new system call that creates time traces. - */ - -#include -#include -#include -#include -#include -#include -#include - -int main(int argc, char** argv) { - int i; - printf("Invoking new 'test_timetrace' syscall.\n"); - for (i = 0; i < 100; i++) { - int status = syscall(334); - if (status < 0) { - printf(" Error in test_timetrace: %s (%d)", - strerror(errno), errno); - } - } - return 0; -} - From 7b444af0d6672b2b5ed043c634c28c0cd3bac0bc Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 29 Aug 2025 11:23:39 -0700 Subject: [PATCH 461/625] Trivial improvements in comments --- homa_sock.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/homa_sock.h b/homa_sock.h index 22967f11..cd7fd2e5 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -76,7 +76,7 @@ struct homa_rpc_bucket { * it has been looked up and before it has been locked. * 3. The lookup mechanism does not use RCU. This is important because * RPCs are created rapidly and typically live only a few tens of - * microseconds. As of May 2027 RCU introduces a lag of about + * microseconds. As of May 2025 RCU introduces a lag of about * 25 ms before objects can be deleted; for RPCs this would result * in hundreds or thousands of RPCs accumulating before RCU allows * them to be deleted. @@ -165,7 +165,9 @@ struct homa_sock { /** * @shutdown: True means the socket is no longer usable (either * shutdown has already been invoked, or the socket was never - * properly initialized). + * properly initialized). Note: can't use the SOCK_DEAD flag for + * this because that flag doesn't get set until much later in the + * process of closing a socket. */ bool shutdown; @@ -350,8 +352,8 @@ static inline struct homa_rpc_bucket /* We can use a really simple hash function here because RPC ids * are allocated sequentially. */ - return &hsk->client_rpc_buckets[(id >> 1) - & (HOMA_CLIENT_RPC_BUCKETS - 1)]; + return &hsk->client_rpc_buckets[(id >> 1) & + (HOMA_CLIENT_RPC_BUCKETS - 1)]; } /** From 053ba5388f5b97dc5d0198f2ef711f2d525aa8bb Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 29 Aug 2025 14:57:26 -0700 Subject: [PATCH 462/625] Fix bug with reap_all in homa_rpc_reap Previously, an infinite loop would occur if homa_rpc_reap was invoked with reap_all and a dead RPC had a nonzero reference count. 
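For reference, the old loop had this structure (sketch):

	while (1) {
		if (reap_all) {
			if (list_empty(&hsk->dead_rpcs))
				break;
		} else {
			if (skbs_to_reap <= 0 || checked_all_rpcs)
				break;
		}
		...
	}

An RPC with a nonzero reference count can't be reaped, so with reap_all
set, hsk->dead_rpcs never became empty and the loop never exited. The new
structure uses checked_all_rpcs to bound the loop in all cases, so it
terminates once every dead RPC has been examined, whether or not it could
actually be freed.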
--- homa_rpc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/homa_rpc.c b/homa_rpc.c index 7e09af38..522fa597 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -486,13 +486,10 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) */ skbs_to_reap = hsk->homa->reap_limit; checked_all_rpcs = list_empty(&hsk->dead_rpcs); - while (1) { + while (!checked_all_rpcs) { batch_size = BATCH_MAX; - if (reap_all) { - if (list_empty(&hsk->dead_rpcs)) - break; - } else { - if (skbs_to_reap <= 0 || checked_all_rpcs) + if (!reap_all) { + if (skbs_to_reap <= 0) break; if (batch_size > skbs_to_reap) batch_size = skbs_to_reap; From dfcf34c46c2d1c8b1ec643bff551f1a351c7c93e Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 29 Aug 2025 15:17:58 -0700 Subject: [PATCH 463/625] Fix bugs in wmem management * Specify reap_all when invoking homa_rpc_reap to free wmem (want to get as much memory as possible) * Don't check for wmem in homa_rpc_end (caller will likely have a reference that prevents reaping) * Instead, check for wmem in homa_dispatch_pkts (combine with existing check for slow reaping) --- homa_incoming.c | 31 ++++++++++++++++++------------ homa_plumbing.c | 2 +- homa_rpc.c | 7 ------- test/unit_homa_incoming.c | 40 ++++++++++++++++++--------------------- test/unit_homa_plumbing.c | 8 +++++++- test/unit_homa_rpc.c | 19 ------------------- 6 files changed, 45 insertions(+), 62 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index 4e6d66b0..ec15a147 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -637,18 +637,25 @@ void homa_dispatch_pkts(struct sk_buff *skb) homa_rpc_acked(hsk, &saddr, &acks[num_acks]); } - if (hsk->dead_skbs >= 2 * hsk->homa->dead_buffs_limit) { - /* We get here if other approaches are not keeping up with - * reaping dead RPCs. See "RPC Reaping Strategy" in - * homa_rpc_reap code for details. - */ -#ifndef __STRIP__ /* See strip.py */ - u64 start = homa_clock(); -#endif /* See strip.py */ - - tt_record("homa_data_pkt calling homa_rpc_reap"); - homa_rpc_reap(hsk, false); - INC_METRIC(data_pkt_reap_cycles, homa_clock() - start); + /* We need to reap dead RPCs here under two conditions: + * 1. The socket has hit its limit on tx buffer space and threads are + * blocked waiting for skbs to be released. + * 2. A large number of dead RPCs have accumulated, and it seems + * that the reaper isn't keeping up when invoked only at + * "convenient" times (see "RPC Reaping Strategy" in homa_rpc_reap + * code for details). + */ + if (hsk->dead_skbs > 0) { + int waiting_for_wmem = test_bit(SOCK_NOSPACE, + &hsk->sock.sk_socket->flags); + if (waiting_for_wmem || + hsk->dead_skbs >= 2 * hsk->homa->dead_buffs_limit) { + IF_NO_STRIP(u64 start = homa_clock()); + + tt_record("homa_dispatch_pkts calling homa_rpc_reap"); + homa_rpc_reap(hsk, waiting_for_wmem); + INC_METRIC(data_pkt_reap_cycles, homa_clock() - start); + } } sock_put(&hsk->sock); } diff --git a/homa_plumbing.c b/homa_plumbing.c index f865eb73..8e70f0ed 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1347,7 +1347,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, /* There are tasks waiting for tx memory, so reap * immediately. 
*/
-		homa_rpc_reap(hsk, false);
+		homa_rpc_reap(hsk, true);
 	}
 
 	if (unlikely(copy_to_user((__force void __user *)msg->msg_control,
diff --git a/homa_rpc.c b/homa_rpc.c
index 522fa597..b70aa5d9 100644
--- a/homa_rpc.c
+++ b/homa_rpc.c
@@ -323,13 +323,6 @@ void homa_rpc_end(struct homa_rpc *rpc)
 	homa_sock_unlock(rpc->hsk);
 
 	homa_pacer_unmanage_rpc(rpc);
-
-	if (test_bit(SOCK_NOSPACE, &rpc->hsk->sock.sk_socket->flags)) {
-		/* There are tasks waiting for tx memory so reap immediately. */
-		homa_rpc_unlock(rpc);
-		homa_rpc_reap(rpc->hsk, false);
-		homa_rpc_lock(rpc);
-	}
 }
 
 /**
diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c
index 181005eb..1e02082e 100644
--- a/test/unit_homa_incoming.c
+++ b/test/unit_homa_incoming.c
@@ -1307,66 +1307,62 @@ TEST_F(homa_incoming, homa_dispatch_pkts__too_many_acks)
 	EXPECT_STREQ("sk->sk_data_ready invoked; ack 1237; ack 1235",
 		     unit_log_get());
 }
-#if 0
 #ifndef __STRIP__ /* See strip.py */
 TEST_F(homa_incoming, homa_dispatch_pkts__invoke_homa_grant_check_rpc)
 {
 	self->data.incoming = htonl(1000);
 	self->data.message_length = htonl(20000);
-	homa_dispatch_pkts(mock_skb_new(self->server_ip, &self->data.common,
+	homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &self->data.common,
 			0, 0));
 	unit_log_clear();
 	unit_log_grantables(&self->homa);
 	EXPECT_SUBSTR("id 1235", unit_log_get());
 }
 #endif /* See strip.py */
-#endif
 TEST_F(homa_incoming, homa_dispatch_pkts__forced_reap)
 {
 	struct homa_rpc *dead = unit_client_rpc(&self->hsk,
 			UNIT_RCVD_MSG, self->client_ip, self->server_ip,
 			self->server_port, self->client_id, 20000, 20000);
 	struct homa_rpc *srpc;
-	mock_clock_tick = 10;
+	int dead_skbs;
 
+	mock_clock_tick = 10;
 	homa_rpc_end(dead);
-#ifndef __STRIP__ /* See strip.py */
-	EXPECT_EQ(31, self->hsk.dead_skbs);
-#else /* See strip.py */
-	EXPECT_EQ(30, self->hsk.dead_skbs);
-#endif /* See strip.py */
+	dead_skbs = self->hsk.dead_skbs;
+	EXPECT_TRUE(dead_skbs >= 30);
 	srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip,
 			self->server_ip, self->client_port, self->server_id,
 			10000, 5000);
 	ASSERT_NE(NULL, srpc);
 	self->homa.dead_buffs_limit = 16;
 
-	/* First packet: below the threshold for reaps. */
+	/* First packet: criteria for reaping not met. */
 	self->data.common.dport = htons(self->hsk.port);
 	homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common,
 			1400, 0));
-#ifndef __STRIP__ /* See strip.py */
-	EXPECT_EQ(31, self->hsk.dead_skbs);
-#else /* See strip.py */
-	EXPECT_EQ(30, self->hsk.dead_skbs);
-#endif /* See strip.py */
+	EXPECT_EQ(dead_skbs, self->hsk.dead_skbs);
 #ifndef __STRIP__ /* See strip.py */
 	EXPECT_EQ(0, homa_metrics_per_cpu()->data_pkt_reap_cycles);
 #endif /* See strip.py */
 
-	/* Second packet: must reap. */
+	/* Second packet: must reap because of dead_buffs_limit (should
+	 * reap only a few skbs).
+	 */
 	self->homa.dead_buffs_limit = 15;
-	self->homa.reap_limit = 10;
+	self->homa.reap_limit = 5;
 	homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common,
 			1400, 0));
-#ifndef __STRIP__ /* See strip.py */
-	EXPECT_EQ(21, self->hsk.dead_skbs);
-#else /* See strip.py */
-	EXPECT_EQ(20, self->hsk.dead_skbs);
-#endif /* See strip.py */
+	EXPECT_EQ(dead_skbs - 5, self->hsk.dead_skbs);
 #ifndef __STRIP__ /* See strip.py */
 	EXPECT_NE(0, homa_metrics_per_cpu()->data_pkt_reap_cycles);
 #endif /* See strip.py */
+
+	/* Third packet: must reap all dead skbs (SOCK_NOSPACE).
*/ + set_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, + 1400, 0)); + EXPECT_EQ(0, self->hsk.dead_skbs); } TEST_F(homa_incoming, homa_data_pkt__basics) diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 4067ad2b..e0075fe1 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -939,16 +939,22 @@ TEST_F(homa_plumbing, homa_recvmsg__delete_server_rpc_after_error) } TEST_F(homa_plumbing, homa_recvmsg__reap_because_of_SOCK_NOSPACE) { + /* Make the tx message long enough that it takes multiple reap + * passes (to ensure homa_rpc_reap was called with reap_all==true). + */ struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_MSG, self->client_ip, self->server_ip, self->server_port, - self->client_id, 100, 2000); + self->client_id, 20000, 2000); EXPECT_NE(NULL, crpc); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); + EXPECT_TRUE(refcount_read(&self->hsk.sock.sk_wmem_alloc) > 20000); set_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags); EXPECT_EQ(2000, homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, 0, 0, &self->recvmsg_hdr.msg_namelen)); + EXPECT_EQ(1, refcount_read(&self->hsk.sock.sk_wmem_alloc)); + EXPECT_EQ(0, self->hsk.dead_skbs); IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->reaper_calls)); } TEST_F(homa_plumbing, homa_recvmsg__error_copying_out_args) diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 866228f5..ae108ad9 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -533,25 +533,6 @@ TEST_F(homa_rpc, homa_rpc_end__remove_from_throttled_list) homa_rpc_end(crpc); EXPECT_EQ(0, unit_list_length(&self->homa.pacer->throttled_rpcs)); } -TEST_F(homa_rpc, homa_rpc_end__call_homa_rpc_reap) -{ - struct homa_rpc *srpc; - - srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, - self->server_ip, self->client_port, self->server_id, - 100, 3000); - ASSERT_NE(NULL, srpc); - homa_rpc_lock(srpc); - set_bit(SOCK_NOSPACE, &self->hsk.sock.sk_socket->flags); - unit_log_clear(); - - homa_rpc_end(srpc); - EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); - EXPECT_EQ(0, unit_list_length(&self->hsk.dead_rpcs)); - EXPECT_STREQ("homa_rpc_end invoked; reaped 1235", - unit_log_get()); - homa_rpc_unlock(srpc); -} TEST_F(homa_rpc, homa_rpc_reap__nothing_to_reap) { From dc6cefb7e0b9765e79447c04e352f87c326624a6 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 29 Aug 2025 17:29:19 -0700 Subject: [PATCH 464/625] Print skb_shared_info size when loading Homa --- homa_plumbing.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index 8e70f0ed..f57e4dd0 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -533,7 +533,7 @@ int __init homa_load(void) pr_err("Homa module loading\n"); #ifndef __UPSTREAM__ /* See strip.py */ - pr_notice("Homa structure sizes: homa_data_hdr %lu, homa_seg_hdr %lu, ack %lu, peer %lu, ip_hdr %lu flowi %lu ipv6_hdr %lu, flowi6 %lu tcp_sock %lu homa_rpc %lu sk_buff %lu rcvmsg_control %lu union sockaddr_in_union %lu HOMA_MAX_BPAGES %u NR_CPUS %u nr_cpu_ids %u, MAX_NUMNODES %d\n", + pr_notice("Homa structure sizes: homa_data_hdr %lu, homa_seg_hdr %lu, ack %lu, peer %lu, ip_hdr %lu flowi %lu ipv6_hdr %lu, flowi6 %lu tcp_sock %lu homa_rpc %lu sk_buff %lu skb_shared_info %lu rcvmsg_control %lu union sockaddr_in_union %lu HOMA_MAX_BPAGES %u NR_CPUS %u nr_cpu_ids %u, MAX_NUMNODES %d\n", sizeof(struct homa_data_hdr), sizeof(struct homa_seg_hdr), sizeof(struct 
homa_ack),
@@ -545,6 +545,7 @@ int __init homa_load(void)
 		sizeof(struct tcp_sock),
 		sizeof(struct homa_rpc),
 		sizeof(struct sk_buff),
+		sizeof(struct skb_shared_info),
 		sizeof(struct homa_recvmsg_args),
 		sizeof(union sockaddr_in_union),
 		HOMA_MAX_BPAGES,

From d8f7f1ef7433c1354844bd8df5bcb75a33aee93e Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 29 Aug 2025 17:29:53 -0700
Subject: [PATCH 465/625] Add comment to explain why sock_wait_for_wmem can't
 be used

---
 homa_sock.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/homa_sock.c b/homa_sock.c
index 2e61c5ad..859d1d2a 100644
--- a/homa_sock.c
+++ b/homa_sock.c
@@ -504,6 +504,10 @@ int homa_sock_wait_wmem(struct homa_sock *hsk, int nonblocking)
 	long timeo = hsk->sock.sk_sndtimeo;
 	int result;
 
+	/* Note: we can't use sock_wait_for_wmem because that function
+	 * is not available to modules (as of August 2025 it's static).
+	 */
+
 	if (nonblocking)
 		timeo = 0;
 	set_bit(SOCK_NOSPACE, &hsk->sock.sk_socket->flags);

From f420f1a4f3ffe476b52363c79c1227bc9df5b1b9 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 29 Aug 2025 17:30:27 -0700
Subject: [PATCH 466/625] Various small improvements to the wmem test in
 homa_test.c

---
 util/homa_test.cc | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/util/homa_test.cc b/util/homa_test.cc
index f1df4257..13e8ab78 100644
--- a/util/homa_test.cc
+++ b/util/homa_test.cc
@@ -895,12 +895,15 @@ void test_udpclose()
 	}
 }
 
-/* Receive one message every second. */
+/* Receive one message every second. After a few messages have been
+ * received, shut down the socket to make sure that the wmem waiting
+ * mechanism aborts properly.
+ */
 void recv_slow(int fd)
 {
 	int status;
 
-	while (1) {
+	for (int i = 0; i < 15; i++) {
 		sleep(1);
 		recv_args.id = 0;
 		recv_hdr.msg_controllen = sizeof(recv_args);
 		status = recvmsg(fd, &recv_hdr, 0);
 		if (status < 0) {
 			printf("Receiver exiting: %s\n", strerror(errno));
 			return;
 		}
-		printf("Received response with %d bytes\n", status);
+		printf("Received response %d with %d bytes\n", i, status);
 	}
+	printf("Receiver shutting down socket\n");
+	shutdown(fd, 0);
 }
 
 /**
@@ -932,7 +937,7 @@ void test_wmem(int fd, const sockaddr_in_union *dest, char *request)
 	iov.iov_base = request;
 	iov.iov_len = length;
 
-	for ( ; count > 0; count--) {
+	for (int i = 0; i < count; i++) {
 		init_sendmsg_hdrs(&msghdr, &homa_args, &iov, 1, &dest->sa,
 				sockaddr_size(&dest->sa));
 		status = sendmsg(fd, &msghdr, 0);
 		if (status < 0) {
 			printf("Error in sendmsg: %s\n", strerror(errno));
 			break;
 		}
-		printf("Sent request with %d bytes\n", length);
+		printf("Sent request %d with %d bytes\n", i, length);
 	}
+	printf("Sender shutting down socket\n");
 	shutdown(fd, 0);
 	thread.join();
 }
 
 /**
- * test_wmem() - Use two threads, a sender and a receiver, and make the
+ * test_wmem_poll() - Use two threads, a sender and a receiver, and make the
  * receiver go so slowly that the sender uses up all available tx packet
  * memory and blocks. On the sender, use poll to wait for tx packet memory.
  * @fd:       Homa socket.
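As background for the test above: from user space, the poll-based wait
for tx packet memory that test_wmem_poll exercises looks roughly like the
sketch below. The sketch is hypothetical (fd and msghdr stand in for the
test's real variables) and assumes that a nonblocking sendmsg fails with
EAGAIN when no tx buffer space is available:

	struct pollfd pfd = {.fd = fd, .events = POLLOUT};

	while (sendmsg(fd, &msghdr, MSG_DONTWAIT) < 0) {
		if (errno != EAGAIN)
			break;		/* Real error; give up. */
		/* All tx buffer space is in use; Homa will signal
		 * POLLOUT once enough packet buffers have been freed
		 * (homa_sock_wakeup_wmem wakes waiters with EPOLLOUT).
		 */
		poll(&pfd, 1, -1);
	}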
From b4bb1fef55d06326fe22ae91202610c7cef7713f Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Sat, 30 Aug 2025 18:20:24 -0700 Subject: [PATCH 467/625] Acquire RCU read lock in homa_sock_wakeup_wmem for safety --- homa_sock.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/homa_sock.h b/homa_sock.h index cd7fd2e5..af367160 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -443,7 +443,9 @@ static inline void homa_sock_wakeup_wmem(struct homa_sock *hsk) tt_record2("homa_sock_wakeup_wmem waking up port %d, wmem %d", hsk->port, refcount_read(&hsk->sock.sk_wmem_alloc)); clear_bit(SOCK_NOSPACE, &hsk->sock.sk_socket->flags); + rcu_read_lock(); wake_up_interruptible_poll(sk_sleep(&hsk->sock), EPOLLOUT); + rcu_read_unlock(); } } From f41038720a5169f75614a911ed29508e1314a135 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Sun, 31 Aug 2025 16:21:52 -0700 Subject: [PATCH 468/625] Add comment about why sk_stream_write_space can't be used. --- homa_sock.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/homa_sock.h b/homa_sock.h index af367160..a4f09173 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -438,6 +438,10 @@ static inline bool homa_sock_wmem_avl(struct homa_sock *hsk) */ static inline void homa_sock_wakeup_wmem(struct homa_sock *hsk) { + /* Note: can't use sk_stream_write_space for this functionality + * because it uses a different test to determine whether enough + * memory is available. + */ if (test_bit(SOCK_NOSPACE, &hsk->sock.sk_socket->flags) && homa_sock_wmem_avl(hsk)) { tt_record2("homa_sock_wakeup_wmem waking up port %d, wmem %d", From 2267fe902c81f3c48d89a2e4c52eec805195dd51 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 1 Sep 2025 09:33:30 -0700 Subject: [PATCH 469/625] Remove pacer from the stripped version for upstreaming --- Makefile | 5 ++-- Makefile.upstream | 1 - homa_impl.h | 5 ++-- homa_outgoing.c | 53 ++++++++++++++++++++--------------- homa_plumbing.c | 7 +++-- homa_rpc.c | 9 +++--- homa_utils.c | 11 ++++---- test/Makefile | 4 +-- test/unit_homa_incoming.c | 57 ++++++++++++++++++++----------------- test/unit_homa_outgoing.c | 59 ++++++++++++++++++++++++--------------- test/unit_homa_rpc.c | 2 ++ test/unit_homa_timer.c | 5 ++++ test/unit_homa_utils.c | 6 ++-- test/utils.c | 17 +++++++++++ test/utils.h | 10 ++++--- util/strip_decl.py | 4 +-- 16 files changed, 152 insertions(+), 103 deletions(-) diff --git a/Makefile b/Makefile index 39336e13..d68e49b8 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,6 @@ HOMA_OBJS := homa_devel.o \ homa_incoming.o \ homa_interest.o \ homa_outgoing.o \ - homa_pacer.o \ homa_peer.o \ homa_pool.o \ homa_plumbing.o \ @@ -20,6 +19,7 @@ else HOMA_OBJS += homa_grant.o \ homa_metrics.o \ homa_offload.o \ + homa_pacer.o \ homa_qdisc.o \ homa_skb.o endif @@ -64,7 +64,6 @@ checkpatch: HOMA_TARGET ?= $(LINUX_SRC_DIR)/net/homa CP_HDRS := homa_impl.h \ homa_interest.h \ - homa_pacer.h \ homa_peer.h \ homa_pool.h \ homa_rpc.h \ @@ -73,7 +72,7 @@ CP_HDRS := homa_impl.h \ homa_wire.h \ murmurhash3.h CP_SRCS := $(patsubst %.o,%.c,$(filter-out homa_devel.o homa_grant.o \ - homa_metrics.o homa_offload.o homa_qdisc.o \ + homa_metrics.o homa_offload.o homa_pacer.o homa_qdisc.o \ homa_skb.o timetrace.o, $(HOMA_OBJS))) CP_EXTRAS := Kconfig \ Makefile diff --git a/Makefile.upstream b/Makefile.upstream index a7ebccd4..1e02be7f 100644 --- a/Makefile.upstream +++ b/Makefile.upstream @@ -6,7 +6,6 @@ obj-$(CONFIG_HOMA) := homa.o homa-y:= homa_incoming.o \ homa_interest.o \ homa_outgoing.o \ - homa_pacer.o \ homa_peer.o \ homa_plumbing.o \ 
homa_pool.o \ diff --git a/homa_impl.h b/homa_impl.h index 49c46e68..4894b345 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -111,12 +111,12 @@ struct homa { */ atomic64_t next_outgoing_id; +#ifndef __STRIP__ /* See strip.py */ /** * @pacer: Information related to the pacer; managed by homa_pacer.c. */ struct homa_pacer *pacer; -#ifndef __STRIP__ /* See strip.py */ /** * @grant: Contains information used by homa_grant.c to manage * grants for incoming messages. @@ -808,7 +808,6 @@ int homa_xmit_control(enum homa_packet_type type, void *contents, size_t length, struct homa_rpc *rpc); int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, struct homa_sock *hsk); -void homa_xmit_data(struct homa_rpc *rpc, bool force); void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk); #ifndef __STRIP__ /* See strip.py */ @@ -827,11 +826,13 @@ int homa_sysctl_softirq_cores(const struct ctl_table *table, loff_t *ppos); int homa_unsched_priority(struct homa *homa, struct homa_peer *peer, int length); +void homa_xmit_data(struct homa_rpc *rpc, bool force); void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc, int priority); #else /* See strip.py */ int homa_message_in_init(struct homa_rpc *rpc, int unsched); void homa_resend_data(struct homa_rpc *rpc, int start, int end); +void homa_xmit_data(struct homa_rpc *rpc); void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc); #endif /* See strip.py */ diff --git a/homa_outgoing.c b/homa_outgoing.c index 391724f7..1eb025ae 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -5,16 +5,15 @@ */ #include "homa_impl.h" -#include "homa_pacer.h" #include "homa_peer.h" #include "homa_rpc.h" +#include "homa_wire.h" + #ifndef __STRIP__ /* See strip.py */ +#include "homa_pacer.h" #include "homa_qdisc.h" #include "homa_skb.h" -#endif /* See strip.py */ -#include "homa_wire.h" - -#ifdef __STRIP__ /* See strip.py */ +#else /* See strip.py */ #include "homa_stub.h" #endif /* See strip.py */ @@ -265,7 +264,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) struct sk_buff **last_link; struct dst_entry *dst; u64 segs_per_gso; - int overlap_xmit; + IF_NO_STRIP(int overlap_xmit); /* Bytes of the message that haven't yet been copied into skbs. 
*/ int bytes_left; @@ -324,8 +323,8 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) UNIT_LOG("; ", "mtu %d, max_seg_data %d, max_gso_data %d", mtu, max_seg_data, max_gso_data); - overlap_xmit = rpc->msgout.length > 2 * max_gso_data; #ifndef __STRIP__ /* See strip.py */ + overlap_xmit = rpc->msgout.length > 2 * max_gso_data; if (homa_qdisc_active(rpc->hsk->hnet)) overlap_xmit = 0; rpc->msgout.granted = rpc->msgout.unscheduled; @@ -382,22 +381,24 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) rpc->msgout.skb_memory += skb->truesize; rpc->msgout.copied_from_user = rpc->msgout.length - bytes_left; rpc->msgout.first_not_tx = rpc->msgout.packets; - if (overlap_xmit && list_empty(&rpc->throttled_links) && #ifndef __STRIP__ /* See strip.py */ + if (overlap_xmit && list_empty(&rpc->throttled_links) && xmit && offset < rpc->msgout.granted) { -#else /* See strip.py */ - xmit) { -#endif /* See strip.py */ tt_record1("waking up pacer for id %d", rpc->id); homa_pacer_manage_rpc(rpc); } +#endif /* See strip.py */ } tt_record2("finished copy from user space for id %d, length %d", rpc->id, rpc->msgout.length); INC_METRIC(sent_msg_bytes, rpc->msgout.length); refcount_add(rpc->msgout.skb_memory, &rpc->hsk->sock.sk_wmem_alloc); +#ifndef __STRIP__ /* See strip.py */ if (!overlap_xmit && xmit) homa_xmit_data(rpc, false); +#else /* See strip.py */ + homa_xmit_data(rpc); +#endif /* See strip.py */ return 0; error: @@ -577,6 +578,7 @@ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk) homa_peer_release(peer); } +#ifndef __STRIP__ /* See strip.py */ /** * homa_xmit_data() - If an RPC has outbound data packets that are permitted * to be transmitted according to the scheduling mechanism, arrange for @@ -585,7 +587,7 @@ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk) * @rpc: RPC to check for transmittable packets. Must be locked by * caller. Note: this function will release the RPC lock while * passing packets through the RPC stack, then reacquire it - * before returning. It is possible that the RPC gets freed + * before returning. It is possible that the RPC gets terminated * when the lock isn't held, in which case the state will * be RPC_DEAD on return. * @force: True means send at least one packet, even if the NIC queue @@ -593,11 +595,25 @@ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk) * the NIC queue is sufficiently long. */ void homa_xmit_data(struct homa_rpc *rpc, bool force) +#else /* See strip.py */ +/** + * homa_xmit_data() - If an RPC has outbound data packets that are permitted + * to be transmitted according to the scheduling mechanism, arrange for + * them to be sent. + * @rpc: RPC to check for transmittable packets. Must be locked by + * caller. Note: this function will release the RPC lock while + * passing packets through the RPC stack, then reacquire it + * before returning. It is possible that the RPC gets terminated + * when the lock isn't held, in which case the state will + * be RPC_DEAD on return. 
+ */ +void homa_xmit_data(struct homa_rpc *rpc) +#endif /* See strip.py */ __must_hold(rpc->bucket->lock) { - struct homa *homa = rpc->hsk->homa; int length; + IF_NO_STRIP(struct homa *homa = rpc->hsk->homa); IF_NO_STRIP(struct netdev_queue *txq); while (*rpc->msgout.next_xmit && rpc->state != RPC_DEAD) { @@ -612,16 +628,10 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) rpc->msgout.granted); break; } -#endif /* See strip.py */ -#ifndef __STRIP__ /* See strip.py */ if (rpc->msgout.length - rpc->msgout.next_xmit_offset > homa->pacer->throttle_min_bytes && !homa_qdisc_active(rpc->hsk->hnet)) { -#else /* See strip.py */ - if (rpc->msgout.length - rpc->msgout.next_xmit_offset > - homa->pacer->throttle_min_bytes) { -#endif /* See strip.py */ if (!homa_pacer_check_nic_q(homa->pacer, skb, force)) { tt_record1("homa_xmit_data adding id %u to throttle queue", rpc->id); @@ -630,7 +640,6 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) } } -#ifndef __STRIP__ /* See strip.py */ if (rpc->msgout.next_xmit_offset < rpc->msgout.unscheduled) priority = homa_unsched_priority(homa, rpc->peer, rpc->msgout.length); @@ -663,10 +672,10 @@ void homa_xmit_data(struct homa_rpc *rpc, bool force) tt_record4("homa_xmit_data found stopped txq for id %d, qid %d, num_queued %d, limit %d", rpc->id, skb->queue_mapping, txq->dql.num_queued, txq->dql.adj_limit); + force = false; #else /* See strip.py */ __homa_xmit_data(skb, rpc); #endif /* See strip.py */ - force = false; homa_rpc_lock(rpc); } } @@ -857,9 +866,9 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end) new_homa_info->offset = offset; tt_record3("retransmitting offset %d, length %d, id %d", offset, seg_length, rpc->id); +#ifndef __STRIP__ /* See strip.py */ homa_pacer_check_nic_q(rpc->hsk->homa->pacer, new_skb, true); -#ifndef __STRIP__ /* See strip.py */ __homa_xmit_data(new_skb, rpc, priority); #else /* See strip.py */ __homa_xmit_data(new_skb, rpc); diff --git a/homa_plumbing.c b/homa_plumbing.c index f57e4dd0..845c9270 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -5,14 +5,15 @@ */ #include "homa_impl.h" +#include "homa_peer.h" +#include "homa_pool.h" + #ifndef __STRIP__ /* See strip.py */ #include "homa_grant.h" #include "homa_offload.h" +#include "homa_pacer.h" #include "homa_qdisc.h" #endif /* See strip.py */ -#include "homa_pacer.h" -#include "homa_peer.h" -#include "homa_pool.h" /* Identifier for retrieving Homa-specific data for a struct net. 
*/ unsigned int homa_net_id; diff --git a/homa_rpc.c b/homa_rpc.c index b70aa5d9..24bbdb2f 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -4,15 +4,14 @@ #include "homa_impl.h" #include "homa_interest.h" -#include "homa_pacer.h" #include "homa_peer.h" #include "homa_pool.h" + #ifndef __STRIP__ /* See strip.py */ #include "homa_grant.h" +#include "homa_pacer.h" #include "homa_skb.h" -#endif /* See strip.py */ - -#ifdef __STRIP__ /* See strip.py */ +#else /* See strip.py */ #include "homa_stub.h" #endif /* See strip.py */ @@ -322,7 +321,7 @@ void homa_rpc_end(struct homa_rpc *rpc) rpc->hsk->homa->max_dead_buffs = rpc->hsk->dead_skbs; homa_sock_unlock(rpc->hsk); - homa_pacer_unmanage_rpc(rpc); + IF_NO_STRIP(homa_pacer_unmanage_rpc(rpc)); } /** diff --git a/homa_utils.c b/homa_utils.c index 2d2b1f93..37646165 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -5,15 +5,14 @@ */ #include "homa_impl.h" -#include "homa_pacer.h" #include "homa_peer.h" #include "homa_rpc.h" + #ifndef __STRIP__ /* See strip.py */ #include "homa_grant.h" +#include "homa_pacer.h" #include "homa_skb.h" -#endif /* See strip.py */ - -#ifdef __STRIP__ /* See strip.py */ +#else /* See strip.py */ #include "homa_stub.h" #endif /* See strip.py */ @@ -35,13 +34,13 @@ int homa_init(struct homa *homa) atomic64_set(&homa->next_outgoing_id, 2); homa->link_mbps = 25000; +#ifndef __STRIP__ /* See strip.py */ homa->pacer = homa_pacer_alloc(homa); if (IS_ERR(homa->pacer)) { err = PTR_ERR(homa->pacer); homa->pacer = NULL; return err; } -#ifndef __STRIP__ /* See strip.py */ homa->grant = homa_grant_alloc(homa); if (IS_ERR(homa->grant)) { err = PTR_ERR(homa->grant); @@ -137,11 +136,11 @@ void homa_destroy(struct homa *homa) homa_grant_free(homa->grant); homa->grant = NULL; } -#endif /* See strip.py */ if (homa->pacer) { homa_pacer_free(homa->pacer); homa->pacer = NULL; } +#endif /* See strip.py */ if (homa->peertab) { homa_peer_free_peertab(homa->peertab); homa->peertab = NULL; diff --git a/test/Makefile b/test/Makefile index 2fe253b9..2603bdfa 100644 --- a/test/Makefile +++ b/test/Makefile @@ -43,7 +43,6 @@ CCFLAGS := -std=c++11 $(WARNS) -MD -g $(CCINCLUDES) $(DEFS) -fsanitize=address TEST_SRCS := unit_homa_incoming.c \ unit_homa_interest.c \ unit_homa_outgoing.c \ - unit_homa_pacer.c \ unit_homa_peer.c \ unit_homa_pool.c \ unit_homa_plumbing.c \ @@ -56,6 +55,7 @@ ifeq ($(__STRIP__),) TEST_SRCS += unit_homa_grant.c \ unit_homa_metrics.c \ unit_homa_offload.c \ + unit_homa_pacer.c \ unit_homa_qdisc.c \ unit_homa_skb.c endif @@ -65,7 +65,6 @@ HOMA_SRCS := homa_devel.c \ homa_interest.c \ homa_incoming.c \ homa_outgoing.c \ - homa_pacer.c \ homa_peer.c \ homa_pool.c \ homa_plumbing.c \ @@ -78,6 +77,7 @@ ifeq ($(__STRIP__),) HOMA_SRCS += homa_grant.c \ homa_metrics.c \ homa_offload.c \ + homa_pacer.c \ homa_qdisc.c \ homa_skb.c endif diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 1e02082e..6704fdc9 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -16,6 +16,12 @@ #include "homa_offload.h" #endif /* See strip.py */ +#ifndef __STRIP__ /* See strip.py */ +#define XMIT_DATA(rpc, force) homa_xmit_data(rpc, force) +#else /* See strip.py */ +#define XMIT_DATA(rpc, force) homa_xmit_data(rpc) +#endif /* See strip.py */ + static struct homa_rpc *hook_rpc; static int delete_count; static int lock_delete_count; @@ -94,10 +100,8 @@ FIXTURE_SETUP(homa_incoming) #ifndef __STRIP__ /* See strip.py */ self->homa.num_priorities = 1; self->homa.poll_cycles = 0; -#endif /* See strip.py */ self->homa.flags |= 
HOMA_FLAG_DONT_THROTTLE; self->homa.pacer->fifo_fraction = 0; -#ifndef __STRIP__ /* See strip.py */ self->homa.unsched_bytes = 10000; self->homa.grant->window = 10000; #endif /* See strip.py */ @@ -1571,7 +1575,7 @@ TEST_F(homa_incoming, homa_grant_pkt__basics) ASSERT_NE(NULL, srpc); homa_rpc_lock(srpc); - homa_xmit_data(srpc, false); + XMIT_DATA(srpc, false); homa_rpc_unlock(srpc); unit_log_clear(); @@ -1671,6 +1675,26 @@ TEST_F(homa_incoming, homa_resend_pkt__rpc_incoming_server_sends_busy) // The server might send a GRANT right after BUSY so just check substr EXPECT_SUBSTR("xmit BUSY", unit_log_get()); } +TEST_F(homa_incoming, homa_resend_pkt__negative_length_in_resend) +{ + struct homa_resend_hdr h = {{.sport = htons(self->client_port), + .dport = htons(self->server_port), + .sender_id = cpu_to_be64(self->client_id), + .type = RESEND}, + .offset = htonl(0), + .length = htonl(-1)}; + struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_OUTGOING, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 2000, 20000); + + ASSERT_NE(NULL, srpc); + unit_log_clear(); + srpc->msgout.next_xmit_offset = 2000; + + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); + EXPECT_STREQ("xmit DATA retrans 1400@0; " + "xmit DATA retrans 1400@1400", unit_log_get()); +} TEST_F(homa_incoming, homa_resend_pkt__client_not_outgoing) { /* Important to respond to resends even if client thinks the @@ -1692,26 +1716,6 @@ TEST_F(homa_incoming, homa_resend_pkt__client_not_outgoing) homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); EXPECT_STREQ("xmit DATA retrans 1400@0", unit_log_get()); } -TEST_F(homa_incoming, homa_resend_pkt__negative_length_in_resend) -{ - struct homa_resend_hdr h = {{.sport = htons(self->client_port), - .dport = htons(self->server_port), - .sender_id = cpu_to_be64(self->client_id), - .type = RESEND}, - .offset = htonl(0), - .length = htonl(-1)}; - struct homa_rpc *srpc = unit_server_rpc(&self->hsk2, UNIT_OUTGOING, - self->client_ip, self->server_ip, self->client_port, - self->server_id, 2000, 20000); - - ASSERT_NE(NULL, srpc); - unit_log_clear(); - srpc->msgout.next_xmit_offset = 2000; - - homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &h.common, 0, 0)); - EXPECT_STREQ("xmit DATA retrans 1400@0; " - "xmit DATA retrans 1400@1400", unit_log_get()); -} TEST_F(homa_incoming, homa_resend_pkt__clip_range_to_tx_end) { struct homa_resend_hdr h = {{.sport = htons(self->server_port), @@ -1768,7 +1772,7 @@ TEST_F(homa_incoming, homa_resend_pkt__update_granted_and_xmit) ASSERT_NE(NULL, crpc); crpc->msgout.granted = 1400; homa_rpc_lock(crpc); - homa_xmit_data(crpc, false); + XMIT_DATA(crpc, false); homa_rpc_unlock(crpc); unit_log_clear(); EXPECT_EQ(1400, crpc->msgout.next_xmit_offset); @@ -1808,6 +1812,7 @@ TEST_F(homa_incoming, homa_resend_pkt__requested_data_hasnt_been_sent_yet) self->server_port, self->client_id, 2000, 100); ASSERT_NE(NULL, crpc); + unit_reset_tx(crpc); unit_log_clear(); homa_dispatch_pkts(mock_skb_alloc(self->server_ip, &h.common, 0, 0)); @@ -1826,7 +1831,7 @@ TEST_F(homa_incoming, homa_unknown_pkt__client_resend_all) ASSERT_NE(NULL, crpc); homa_rpc_lock(crpc); - homa_xmit_data(crpc, false); + XMIT_DATA(crpc, false); homa_rpc_unlock(crpc); unit_log_clear(); @@ -1858,7 +1863,7 @@ TEST_F(homa_incoming, homa_unknown_pkt__client_resend_part) crpc->msgout.granted = 1400; #endif /* See strip.py */ homa_rpc_lock(crpc); - homa_xmit_data(crpc, false); + XMIT_DATA(crpc, false); homa_rpc_unlock(crpc); unit_log_clear(); diff --git 
a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 2ffd3045..c6bd50c7 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -17,6 +17,12 @@ #include "mock.h" #include "utils.h" +#ifndef __STRIP__ /* See strip.py */ +#define XMIT_DATA(rpc, force) homa_xmit_data(rpc, force) +#else /* See strip.py */ +#define XMIT_DATA(rpc, force) homa_xmit_data(rpc) +#endif /* See strip.py */ + /* The following hook function frees hook_rpc. */ static struct homa_rpc *hook_rpc; static void unlock_hook(char *id) @@ -83,9 +89,9 @@ FIXTURE_SETUP(homa_outgoing) homa_init(&self->homa); self->hnet = mock_alloc_hnet(&self->homa); mock_clock = 10000; +#ifndef __STRIP__ /* See strip.py */ self->homa.pacer->cycles_per_mbyte = 1000000; self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; -#ifndef __STRIP__ /* See strip.py */ self->homa.unsched_bytes = 10000; self->homa.grant->window = 10000; self->homa.pacer->fifo_fraction = 0; @@ -375,7 +381,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__basics) EXPECT_EQ(3000, crpc->msgout.granted); #endif /* See strip.py */ EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); - EXPECT_STREQ("mtu 1496, max_seg_data 1400, max_gso_data 1400; " + EXPECT_SUBSTR("mtu 1496, max_seg_data 1400, max_gso_data 1400; " "_copy_from_iter 1400 bytes at 1000; " "_copy_from_iter 1400 bytes at 2400; " "_copy_from_iter 200 bytes at 3800", unit_log_get()); @@ -575,6 +581,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__rpc_freed_during_copy) EXPECT_EQ(1, refcount_read(&self->hsk.sock.sk_wmem_alloc)); homa_rpc_unlock(crpc); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_outgoing, homa_message_out_fill__add_to_throttled) { struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, @@ -608,6 +615,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__too_short_for_pipelining) unit_log_throttled(&self->homa); EXPECT_STREQ("", unit_log_get()); } +#endif /* See strip.py */ TEST_F(homa_outgoing, homa_message_out_fill__packet_memory_accounting) { struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, @@ -776,11 +784,14 @@ TEST_F(homa_outgoing, homa_xmit_data__basics) crpc->msgout.granted = 5000; homa_peer_set_cutoffs(crpc->peer, INT_MAX, 0, 0, 0, 0, INT_MAX, 7000, 0); +#else /* See strip.py */ + unit_reset_tx(crpc); #endif /* See strip.py */ + unit_log_clear(); mock_clear_xmit_prios(); homa_rpc_lock(crpc); - homa_xmit_data(crpc, false); + XMIT_DATA(crpc, false); homa_rpc_unlock(crpc); #ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("xmit DATA 1400@0; " @@ -789,6 +800,9 @@ TEST_F(homa_outgoing, homa_xmit_data__basics) "xmit DATA 1400@4200", unit_log_get()); EXPECT_STREQ("6 6 2 2", mock_xmit_prios); EXPECT_EQ(5600, crpc->msgout.next_xmit_offset); + unit_log_clear(); + unit_log_throttled(&self->homa); + EXPECT_STREQ("", unit_log_get()); #else /* See strip.py */ EXPECT_STREQ("xmit DATA 1400@0; " "xmit DATA 1400@1400; " @@ -797,9 +811,6 @@ TEST_F(homa_outgoing, homa_xmit_data__basics) "xmit DATA 400@5600", unit_log_get()); EXPECT_EQ(6000, crpc->msgout.next_xmit_offset); #endif /* See strip.py */ - unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("", unit_log_get()); } #ifndef __STRIP__ /* See strip.py */ TEST_F(homa_outgoing, homa_xmit_data__stop_because_no_more_granted) @@ -811,14 +822,13 @@ TEST_F(homa_outgoing, homa_xmit_data__stop_because_no_more_granted) unit_log_clear(); crpc->msgout.granted = 1000; homa_rpc_lock(crpc); - homa_xmit_data(crpc, false); + XMIT_DATA(crpc, false); homa_rpc_unlock(crpc); EXPECT_STREQ("xmit DATA 1400@0", unit_log_get()); 
unit_log_clear(); unit_log_throttled(&self->homa); EXPECT_STREQ("", unit_log_get()); } -#endif /* See strip.py */ TEST_F(homa_outgoing, homa_xmit_data__below_throttle_min) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -831,7 +841,7 @@ TEST_F(homa_outgoing, homa_xmit_data__below_throttle_min) self->homa.pacer->throttle_min_bytes = 250; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; homa_rpc_lock(crpc); - homa_xmit_data(crpc, false); + XMIT_DATA(crpc, false); homa_rpc_unlock(crpc); EXPECT_STREQ("xmit DATA 200@0", unit_log_get()); unit_log_clear(); @@ -852,7 +862,7 @@ TEST_F(homa_outgoing, homa_xmit_data__force) self->homa.pacer->max_nic_queue_cycles = 3000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; homa_rpc_lock(crpc1); - homa_xmit_data(crpc1, false); + XMIT_DATA(crpc1, false); homa_rpc_unlock(crpc1); unit_log_clear(); unit_log_throttled(&self->homa); @@ -861,7 +871,7 @@ TEST_F(homa_outgoing, homa_xmit_data__force) /* Now force transmission. */ unit_log_clear(); homa_rpc_lock(crpc2); - homa_xmit_data(crpc2, true); + XMIT_DATA(crpc2, true); homa_rpc_unlock(crpc2); EXPECT_STREQ("xmit DATA 1400@0", unit_log_get()); unit_log_clear(); @@ -869,7 +879,6 @@ TEST_F(homa_outgoing, homa_xmit_data__force) EXPECT_STREQ("request id 1234, next_offset 2800; " "request id 1236, next_offset 1400", unit_log_get()); } -#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_outgoing, homa_xmit_data__dont_throttle_because_homa_qdisc_in_use) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -884,7 +893,7 @@ TEST_F(homa_outgoing, homa_xmit_data__dont_throttle_because_homa_qdisc_in_use) self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; homa_rpc_lock(crpc); - homa_xmit_data(crpc, false); + XMIT_DATA(crpc, false); homa_rpc_unlock(crpc); EXPECT_STREQ("xmit DATA 1400@0; xmit DATA 600@1400", unit_log_get()); unit_log_clear(); @@ -892,7 +901,6 @@ TEST_F(homa_outgoing, homa_xmit_data__dont_throttle_because_homa_qdisc_in_use) EXPECT_STREQ("", unit_log_get()); homa_qdisc_qdev_put(qdev); } -#endif /* See strip.py */ TEST_F(homa_outgoing, homa_xmit_data__throttle) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -905,7 +913,7 @@ TEST_F(homa_outgoing, homa_xmit_data__throttle) self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; homa_rpc_lock(crpc); - homa_xmit_data(crpc, false); + XMIT_DATA(crpc, false); homa_rpc_unlock(crpc); EXPECT_STREQ("xmit DATA 1400@0; " "xmit DATA 1400@1400", unit_log_get()); @@ -913,7 +921,6 @@ TEST_F(homa_outgoing, homa_xmit_data__throttle) unit_log_throttled(&self->homa); EXPECT_STREQ("request id 1234, next_offset 2800", unit_log_get()); } -#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_outgoing, homa_xmit_data__metrics_for_client_rpc) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -922,12 +929,12 @@ TEST_F(homa_outgoing, homa_xmit_data__metrics_for_client_rpc) crpc->msgout.granted = 4000; homa_rpc_lock(crpc); - homa_xmit_data(crpc, false); + XMIT_DATA(crpc, false); EXPECT_EQ(4200, homa_metrics_per_cpu()->client_request_bytes_done); EXPECT_EQ(0, homa_metrics_per_cpu()->client_requests_done); crpc->msgout.granted = 6000; - homa_xmit_data(crpc, false); + XMIT_DATA(crpc, false); EXPECT_EQ(6000, homa_metrics_per_cpu()->client_request_bytes_done); EXPECT_EQ(1, homa_metrics_per_cpu()->client_requests_done); homa_rpc_unlock(crpc); @@ -942,12 +949,12 @@ TEST_F(homa_outgoing, homa_xmit_data__metrics_for_server_rpc) srpc->msgout.granted = 4000; homa_rpc_lock(srpc); - homa_xmit_data(srpc, false); + XMIT_DATA(srpc, false); EXPECT_EQ(4200, 
homa_metrics_per_cpu()->server_response_bytes_done); EXPECT_EQ(0, homa_metrics_per_cpu()->server_responses_done); srpc->msgout.granted = 9900; - homa_xmit_data(srpc, false); + XMIT_DATA(srpc, false); EXPECT_EQ(10000, homa_metrics_per_cpu()->server_response_bytes_done); EXPECT_EQ(1, homa_metrics_per_cpu()->server_responses_done); homa_rpc_unlock(srpc); @@ -962,13 +969,15 @@ TEST_F(homa_outgoing, homa_xmit_data__rpc_freed) #ifndef __STRIP__ /* See strip.py */ crpc->msgout.unscheduled = 2000; crpc->msgout.granted = 5000; +#else /* See strip.py */ + unit_reset_tx(crpc); #endif /* See strip.py */ unit_log_clear(); homa_rpc_lock(crpc); unit_hook_register(lock_free_hook); hook_rpc = crpc; - homa_xmit_data(crpc, false); + XMIT_DATA(crpc, false); homa_rpc_unlock(crpc); EXPECT_STREQ("xmit DATA 1400@0; homa_rpc_end invoked", unit_log_get()); @@ -998,6 +1007,9 @@ TEST_F(homa_outgoing, __homa_xmit_data__fill_dst) struct dst_entry *dst; int old_refcount; +#ifdef __STRIP__ /* See strip.py */ + unit_reset_tx(crpc); +#endif /* See strip.py */ unit_log_clear(); dst = crpc->peer->dst; old_refcount = atomic_read(&dst->__rcuref.refcnt); @@ -1198,15 +1210,18 @@ TEST_F(homa_outgoing, homa_rpc_tx_end) ASSERT_EQ(5, crpc->msgout.num_skbs); /* First call: no packets passed to IP stack. */ + crpc->msgout.next_xmit_offset = 0; EXPECT_EQ(0, homa_rpc_tx_end(crpc)); - /* Second call: all packets passed to IP, but no packets complete. */ for (skb = crpc->msgout.packets, i = 0; skb != NULL; skb = homa_get_skb_info(skb)->next_skb, i++) { skbs[i] = skb; skb_get(skb); EXPECT_EQ(2, refcount_read(&skbs[i]->users)); } + EXPECT_EQ(5, i); + + /* Second call: all packets passed to IP, but no packets complete. */ crpc->msgout.next_xmit_offset = 6000; EXPECT_EQ(0, homa_rpc_tx_end(crpc)); diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index ae108ad9..3db56ebc 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -521,6 +521,7 @@ TEST_F(homa_rpc, homa_rpc_end__dead_buffs) EXPECT_EQ(14, self->homa.max_dead_buffs); EXPECT_EQ(14, self->hsk.dead_skbs); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_rpc, homa_rpc_end__remove_from_throttled_list) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -533,6 +534,7 @@ TEST_F(homa_rpc, homa_rpc_end__remove_from_throttled_list) homa_rpc_end(crpc); EXPECT_EQ(0, unit_list_length(&self->homa.pacer->throttled_rpcs)); } +#endif /* See strip.py */ TEST_F(homa_rpc, homa_rpc_reap__nothing_to_reap) { diff --git a/test/unit_homa_timer.c b/test/unit_homa_timer.c index 4ce452db..443ccafe 100644 --- a/test/unit_homa_timer.c +++ b/test/unit_homa_timer.c @@ -62,7 +62,11 @@ TEST_F(homa_timer, homa_timer_check_rpc__request_ack) /* First call: do nothing (response not fully transmitted). 
*/ homa_rpc_lock(srpc); +#ifndef __STRIP__ /* See strip.py */ homa_xmit_data(srpc, false); +#else /* See strip.py */ + homa_xmit_data(srpc); +#endif /* See strip.py */ skb_get(srpc->msgout.packets); homa_timer_check_rpc(srpc); EXPECT_EQ(0, srpc->done_timer_ticks); @@ -145,6 +149,7 @@ TEST_F(homa_timer, homa_timer_check_rpc__granted_bytes_not_sent) self->server_port, self->client_id, 5000, 200); ASSERT_NE(NULL, crpc); + crpc->msgout.next_xmit_offset = 0; unit_log_clear(); crpc->silent_ticks = 10; homa_rpc_lock(crpc); diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index ed94377c..3a3cc6d3 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -56,7 +56,6 @@ static void set_cutoffs(struct homa *homa, int c0, int c1, int c2, homa->unsched_cutoffs[6] = c6; homa->unsched_cutoffs[7] = c7; } -#endif /* See strip.py */ TEST_F(homa_utils, homa_init__pacer_alloc_failure) { @@ -68,7 +67,6 @@ TEST_F(homa_utils, homa_init__pacer_alloc_failure) EXPECT_EQ(NULL, homa2.pacer); homa_destroy(&homa2); } -#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_utils, homa_init__grant_alloc_failure) { struct homa homa2; @@ -87,7 +85,7 @@ TEST_F(homa_utils, homa_init__peertab_alloc_failure) #ifndef __STRIP__ /* See strip.py */ mock_kmalloc_errors = 4; #else /* See strip.py */ - mock_kmalloc_errors = 2; + mock_kmalloc_errors = 1; #endif/* See strip.py */ unit_log_clear(); EXPECT_EQ(ENOMEM, -homa_init(&homa2)); @@ -101,7 +99,7 @@ TEST_F(homa_utils, homa_init__cant_allocate_port_map) #ifndef __STRIP__ /* See strip.py */ mock_kmalloc_errors = 0x10; #else /* See strip.py */ - mock_kmalloc_errors = 8; + mock_kmalloc_errors = 4; #endif/* See strip.py */ unit_log_clear(); EXPECT_EQ(ENOMEM, -homa_init(&homa2)); diff --git a/test/utils.c b/test/utils.c index a5cab395..c8055ea0 100644 --- a/test/utils.c +++ b/test/utils.c @@ -301,6 +301,7 @@ void unit_log_skb_list(struct sk_buff_head *packets, int verbose) } } +#ifndef __STRIP__ /* See strip.py */ /** * unit_log_throttled() - Append to the test log information about all of * the messages in homa->throttle_rpcs. @@ -317,6 +318,7 @@ void unit_log_throttled(struct homa *homa) rpc->msgout.next_xmit_offset); } } +#endif /* See strip.py */ /** * unit_print_gaps() - Returns a static string describing the gaps in an RPC. @@ -342,6 +344,21 @@ const char *unit_print_gaps(struct homa_rpc *rpc) return buffer; } +/** + * unit_reset_tx() - Reset the state of an RPC so that it appears no packets + * have been transmitted. + */ +void unit_reset_tx(struct homa_rpc *rpc) +{ + struct sk_buff *skb; + + for (skb = rpc->msgout.packets; skb != NULL; + skb = homa_get_skb_info(skb)->next_skb) + skb_dst_drop(skb); + rpc->msgout.next_xmit = &rpc->msgout.packets; + rpc->msgout.next_xmit_offset = 0; +} + /** * unit_server_rpc() - Create a homa_server_rpc and arrange for it to be * in a given state. 
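A usage sketch for the new unit_reset_tx() helper (not part of the patch; it reuses identifiers from the surrounding tests and assumes the unstripped two-argument homa_xmit_data()): transmit a message, rewind it, and transmit the same skbs again from offset 0:

        struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING,
                        self->client_ip, self->server_ip, self->server_port,
                        self->client_id, 3000, 100);

        homa_rpc_lock(crpc);
        homa_xmit_data(crpc, false);    /* First transmission. */
        unit_reset_tx(crpc);            /* Drop cached dsts, rewind next_xmit
                                         * and next_xmit_offset to 0. */
        homa_xmit_data(crpc, false);    /* Same packets go out again. */
        homa_rpc_unlock(crpc);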
diff --git a/test/utils.h b/test/utils.h index 16aa3e94..5c739872 100644 --- a/test/utils.h +++ b/test/utils.h @@ -46,16 +46,13 @@ int unit_list_length(struct list_head *head); void unit_log_active_ids(struct homa_sock *hsk); void unit_log_filled_skbs(struct sk_buff *skb, int verbose); void unit_log_frag_list(struct sk_buff *skb, int verbose); -#ifndef __STRIP__ /* See strip.py */ -void unit_log_grantables(struct homa *homa); -#endif /* See strip.py */ void unit_log_hashed_rpcs(struct homa_sock *hsk); void unit_log_message_out_packets(struct homa_message_out *message, int verbose); void unit_log_skb_list(struct sk_buff_head *packets, int verbose); -void unit_log_throttled(struct homa *homa); const char *unit_print_gaps(struct homa_rpc *rpc); +void unit_reset_tx(struct homa_rpc *rpc); struct homa_rpc *unit_server_rpc(struct homa_sock *hsk, enum unit_rpc_state state, @@ -66,6 +63,11 @@ struct homa_rpc void unit_sock_destroy(struct homa_sock *hsk); void unit_teardown(void); +#ifndef __STRIP__ /* See strip.py */ +void unit_log_grantables(struct homa *homa); +void unit_log_throttled(struct homa *homa); +#endif /* See strip.py */ + /* Kludge to avoid including arpa/inet.h, which causes definition * conflicts with kernel header files. */ diff --git a/util/strip_decl.py b/util/strip_decl.py index 22edd6bf..65c23d07 100755 --- a/util/strip_decl.py +++ b/util/strip_decl.py @@ -32,12 +32,10 @@ ['peer', 'int homa_xmit_control(' ], - ['pacer', - 'void homa_xmit_data(' - ], ['rpc', 'int homa_message_in_init(', 'void homa_rpc_handoff(', + 'void homa_xmit_data(' ], ['outgoing', 'int homa_fill_data_interleaved(', From e9a118fac57d1e7c2358562601fa85e418128cd3 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 1 Sep 2025 10:14:07 -0700 Subject: [PATCH 470/625] Reduce batch size in homa_rpc_reap from 20 to 10 This is to reduce the amount of stack space consumed. --- homa_rpc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/homa_rpc.c b/homa_rpc.c index 24bbdb2f..ce4c9d45 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -458,7 +458,7 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) #ifdef __UNIT_TEST__ #define BATCH_MAX 3 #else /* __UNIT_TEST__ */ -#define BATCH_MAX 20 +#define BATCH_MAX 10 #endif /* __UNIT_TEST__ */ struct homa_rpc *rpcs[BATCH_MAX]; struct sk_buff *skbs[BATCH_MAX]; From eab5d498e7ce605897c797201a3682464b7d64c0 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 1 Sep 2025 11:22:42 -0700 Subject: [PATCH 471/625] Fix compilation problem with stripped version --- homa_impl.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index 4894b345..ec84f5ba 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -701,13 +701,10 @@ static inline bool is_homa_pkt(struct sk_buff *skb) return true; protocol = (skb_is_ipv6(skb)) ? 
ipv6_hdr(skb)->nexthdr : ip_hdr(skb)->protocol; -#ifndef __STRIP__ /* See strip.py */ return (protocol == IPPROTO_HOMA || (protocol == IPPROTO_TCP && tcp_hdr(skb)->urg_ptr == htons(HOMA_TCP_URGENT))); -#else /* See strip.py */ return protocol == IPPROTO_HOMA; -#endif /* See strip.py */ } #endif /* See strip.py */ From d660afdefd5ea51b4c2997796f05e628e2df9ddf Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 1 Sep 2025 11:45:19 -0700 Subject: [PATCH 472/625] Update notes.txt --- notes.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/notes.txt b/notes.txt index 6acc2c90..2f01b12e 100755 --- a/notes.txt +++ b/notes.txt @@ -4,6 +4,8 @@ Notes for Homa implementation in Linux: * Move interest cleanup code from homa_sock to a new function in homa_interest. Also move wakeup code from homa_rpc_handoff. +* Use skb_attempt_defer_free once it has been properly exported. + * Thoughts on making TCP and Homa play better together: * Goals: * Keep the NIC tx queue from growing long. From f883d53351699d1593b884b60e08497a166a8518 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 1 Sep 2025 11:49:00 -0700 Subject: [PATCH 473/625] Change type of homa_rpc->refs from atomic_t to refcount_t --- homa_rpc.c | 16 +++++++++------- homa_rpc.h | 11 ++++++----- homa_sock.h | 2 +- test/mock.c | 6 +++--- test/unit_homa_grant.c | 20 ++++++++++---------- 5 files changed, 29 insertions(+), 26 deletions(-) diff --git a/homa_rpc.c b/homa_rpc.c index ce4c9d45..dad4e1df 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -45,6 +45,7 @@ struct homa_rpc *homa_rpc_alloc_client(struct homa_sock *hsk, bucket = homa_client_rpc_bucket(hsk, crpc->id); crpc->bucket = bucket; crpc->state = RPC_OUTGOING; + refcount_set(&crpc->refs, 1); crpc->peer = homa_peer_get(hsk, &dest_addr_as_ipv6); if (IS_ERR(crpc->peer)) { tt_record("error in homa_peer_get"); @@ -149,6 +150,7 @@ struct homa_rpc *homa_rpc_alloc_server(struct homa_sock *hsk, srpc->hsk = hsk; srpc->bucket = bucket; srpc->state = RPC_INCOMING; + refcount_set(&srpc->refs, 1); srpc->peer = homa_peer_get(hsk, source); if (IS_ERR(srpc->peer)) { err = PTR_ERR(srpc->peer); @@ -509,18 +511,18 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) int refs; /* Make sure that all outstanding uses of the RPC have - * completed. We can only be sure if the reference - * count is zero when we're holding the lock. Note: - * it isn't safe to block while locking the RPC here, - * since we hold the socket lock. + * completed. We can read the reference count safely + * only when we're holding the lock. Note: it isn't + * safe to block while locking the RPC here, since we + * hold the socket lock. */ if (homa_rpc_try_lock(rpc)) { - refs = atomic_read(&rpc->refs); + refs = refcount_read(&rpc->refs); homa_rpc_unlock(rpc); } else { - refs = 1; + refs = 2; } - if (refs != 0) { + if (refs > 1) { INC_METRIC(deferred_rpc_reaps, 1); continue; } diff --git a/homa_rpc.h b/homa_rpc.h index f77de25a..aa862de8 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -280,10 +280,11 @@ struct homa_rpc { #define RPC_PRIVATE 8 /** - * @refs: Number of unmatched calls to homa_rpc_hold; it's not safe - * to free the RPC until this is zero. + * @refs: Number of references to this RPC, including one for each + * unmatched call to homa_rpc_hold plus one for the socket's reference + * in either active_rpcs or dead_rpcs. 
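An illustrative sketch of the lifetime rule this invariant creates (not part of the patch; it uses only functions that appear in the diff, and the counts shown assume no concurrent holders): membership in active_rpcs or dead_rpcs owns one reference, so a quiescent RPC reads 1 rather than 0:

        refcount_set(&rpc->refs, 1);    /* homa_rpc_alloc_*: the socket's
                                         * list reference. */
        homa_rpc_hold(rpc);             /* refcount_inc: count is now 2. */
        /* ... use rpc without holding its lock ... */
        homa_rpc_put(rpc);              /* refcount_dec: back to 1. */

        /* homa_rpc_reap may free the RPC only if, while holding the RPC
         * lock, it observes refcount_read(&rpc->refs) == 1 (no
         * outstanding holds).
         */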
*/ - atomic_t refs; + refcount_t refs; /** * @peer: Information about the other machine (the server, if @@ -528,7 +529,7 @@ static inline void homa_unprotect_rpcs(struct homa_sock *hsk) */ static inline void homa_rpc_hold(struct homa_rpc *rpc) { - atomic_inc(&rpc->refs); + refcount_inc(&rpc->refs); } /** @@ -538,7 +539,7 @@ static inline void homa_rpc_hold(struct homa_rpc *rpc) */ static inline void homa_rpc_put(struct homa_rpc *rpc) { - atomic_dec(&rpc->refs); + refcount_dec(&rpc->refs); } #endif /* __UNIT_TEST__ */ diff --git a/homa_sock.h b/homa_sock.h index a4f09173..e2d904fa 100644 --- a/homa_sock.h +++ b/homa_sock.h @@ -212,7 +212,7 @@ struct homa_sock { /** * @dead_rpcs: Contains RPCs for which homa_rpc_end has been - * called, but their packet buffers haven't yet been freed. + * called, but which have not yet been reaped by homa_rpc_reap. */ struct list_head dead_rpcs; diff --git a/test/mock.c b/test/mock.c index f80a64f2..01ad29c1 100644 --- a/test/mock.c +++ b/test/mock.c @@ -2024,15 +2024,15 @@ void *mock_rht_walk_next(struct rhashtable_iter *iter) void mock_rpc_hold(struct homa_rpc *rpc) { mock_rpc_holds++; - atomic_inc(&rpc->refs); + refcount_inc(&rpc->refs); } void mock_rpc_put(struct homa_rpc *rpc) { - if (atomic_read(&rpc->refs) == 0) + if (refcount_read(&rpc->refs) < 2) FAIL("homa_rpc_put invoked when RPC has no active holds"); mock_rpc_holds--; - atomic_dec(&rpc->refs); + refcount_dec(&rpc->refs); } /** diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index a52761a1..6b59dd7b 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -909,11 +909,11 @@ TEST_F(homa_grant, homa_grant_unmanage_rpc__remove_from_oldest_rpc) homa_grant_manage_rpc(rpc); self->homa.grant->oldest_rpc = rpc; homa_rpc_hold(rpc); - EXPECT_EQ(1, rpc->refs.counter); + EXPECT_EQ(2, refcount_read(&rpc->refs)); homa_grant_unmanage_rpc(rpc, &self->cand); EXPECT_EQ(NULL, self->homa.grant->oldest_rpc); - EXPECT_EQ(0, rpc->refs.counter); + EXPECT_EQ(1, refcount_read(&rpc->refs)); } TEST_F(homa_grant, homa_grant_update_incoming) @@ -1471,11 +1471,11 @@ TEST_F(homa_grant, homa_grant_find_oldest__take_reference) rpc = test_rpc(self, 100, self->server_ip, 40000); homa_grant_insert_grantable(rpc); - ASSERT_EQ(0, rpc->refs.counter); + EXPECT_EQ(1, refcount_read(&rpc->refs)); homa_grant_find_oldest(self->homa.grant); - ASSERT_EQ(rpc, self->homa.grant->oldest_rpc); - ASSERT_EQ(1, rpc->refs.counter); + EXPECT_EQ(rpc, self->homa.grant->oldest_rpc); + EXPECT_EQ(2, refcount_read(&rpc->refs)); } TEST_F(homa_grant, homa_grant_promote_rpc__rpc_is_active) @@ -1767,7 +1767,7 @@ TEST_F(homa_grant, homa_grant_cand_add__basics) EXPECT_EQ(0, cand.removes); EXPECT_EQ(rpc2, cand.rpcs[0]); EXPECT_EQ(rpc1, cand.rpcs[1]); - EXPECT_EQ(1, atomic_read(&rpc1->refs)); + EXPECT_EQ(2, refcount_read(&rpc1->refs)); homa_grant_cand_check(&cand, self->homa.grant); } TEST_F(homa_grant, homa_grant_cand_add__wrap_around) @@ -1818,9 +1818,9 @@ TEST_F(homa_grant, homa_grant_cand_check__basics) unit_log_clear(); homa_grant_cand_check(&cand, self->homa.grant); EXPECT_STREQ("xmit GRANT 10000@2; xmit GRANT 10000@0", unit_log_get()); - EXPECT_EQ(0, atomic_read(&rpc1->refs)); - EXPECT_EQ(0, atomic_read(&rpc2->refs)); - EXPECT_EQ(0, atomic_read(&rpc3->refs)); + EXPECT_EQ(1, refcount_read(&rpc1->refs)); + EXPECT_EQ(1, refcount_read(&rpc2->refs)); + EXPECT_EQ(1, refcount_read(&rpc3->refs)); } TEST_F(homa_grant, homa_grant_cand_check__rpc_dead) { @@ -1838,7 +1838,7 @@ TEST_F(homa_grant, homa_grant_cand_check__rpc_dead) unit_log_clear(); 
homa_grant_cand_check(&cand, self->homa.grant); EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(0, atomic_read(&rpc->refs)); + EXPECT_EQ(1, refcount_read(&rpc->refs)); rpc->state = saved_state; } TEST_F(homa_grant, homa_grant_cand_check__rpc_becomes_fully_granted) From c0e601301fd7bd319b84adf339cc5b44c18ae0af Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 1 Sep 2025 13:20:16 -0700 Subject: [PATCH 474/625] Fix various minor issues from comments on upstream patch series --- homa_outgoing.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/homa_outgoing.c b/homa_outgoing.c index 1eb025ae..f5516008 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -260,15 +260,12 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) * in a GSO packet (before segmentation). */ int mtu, max_seg_data, max_gso_data; - struct sk_buff **last_link; struct dst_entry *dst; u64 segs_per_gso; IF_NO_STRIP(int overlap_xmit); - /* Bytes of the message that haven't yet been copied into skbs. */ int bytes_left; - int gso_size; int err; @@ -831,8 +828,8 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end) #ifndef __STRIP__ /* See strip.py */ new_skb = homa_skb_alloc_tx(sizeof(struct homa_data_hdr)); #else /* See strip.py */ - new_skb = homa_skb_alloc_tx(sizeof(struct homa_data_hdr) - + seg_length); + new_skb = homa_skb_alloc_tx(sizeof(struct homa_data_hdr) + + seg_length); #endif /* See strip.py */ if (unlikely(!new_skb)) { UNIT_LOG("; ", "skb allocation error"); From 885ef30a953c37e56a6ac8eb1cde837d8538b776 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 1 Sep 2025 13:29:14 -0700 Subject: [PATCH 475/625] Use cpu_relax where appropriate --- homa_qdisc.c | 1 + homa_utils.c | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/homa_qdisc.c b/homa_qdisc.c index f5164de6..68161744 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -589,6 +589,7 @@ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev) */ if (i != 0) goto done; + cpu_relax(); now = homa_clock(); } diff --git a/homa_utils.c b/homa_utils.c index 37646165..5ccff868 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -231,6 +231,5 @@ void homa_spin(int ns) end = homa_clock() + homa_ns_to_cycles(ns); while (homa_clock() < end) - /* Empty loop body.*/ - ; + cpu_relax(); } From 860f155010af94df6767869c5774d9757d55b80c Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 1 Sep 2025 15:06:07 -0700 Subject: [PATCH 476/625] Eliminate acks and num_acks variable in homa_dispatch_pkts Acks can be handled more cleanly now that RPCs have refcounts, and this reduces the stack footprint. --- homa_incoming.c | 33 ++++++----------- homa_rpc.c | 2 +- test/unit_homa_incoming.c | 75 ++++++++++++++++++++++++++++----------- 3 files changed, 66 insertions(+), 44 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index ec15a147..bf70ddbf 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -442,19 +442,10 @@ void homa_dispatch_pkts(struct sk_buff *skb) struct homa_data_hdr *h = (struct homa_data_hdr *)skb->data; u64 id = homa_local_id(h->common.sender_id); int dport = ntohs(h->common.dport); - - /* Used to collect acks from data packets so we can process them - * all at the end (can't process them inline because that may - * require locking conflicting RPCs). If we run out of space just - * ignore the extra acks; they'll be regenerated later through the - * explicit mechanism. 
- */ - struct homa_ack acks[MAX_ACKS]; struct homa_rpc *rpc = NULL; struct homa_sock *hsk; struct homa_net *hnet; struct sk_buff *next; - int num_acks = 0; /* Find the appropriate socket.*/ hnet = homa_net(dev_net(skb->dev)); @@ -569,15 +560,6 @@ void homa_dispatch_pkts(struct sk_buff *skb) switch (h->common.type) { case DATA: - if (h->ack.client_id) { - /* Save the ack for processing later, when we - * have released the RPC lock. - */ - if (num_acks < MAX_ACKS) { - acks[num_acks] = h->ack; - num_acks++; - } - } homa_data_pkt(skb, rpc); INC_METRIC(packets_received[DATA - DATA], 1); break; @@ -632,11 +614,6 @@ void homa_dispatch_pkts(struct sk_buff *skb) homa_rpc_unlock(rpc); } - while (num_acks > 0) { - num_acks--; - homa_rpc_acked(hsk, &saddr, &acks[num_acks]); - } - /* We need to reap dead RPCs here under two conditions: * 1. The socket has hit its limit on tx buffer space and threads are * blocked waiting for skbs to be released. @@ -680,6 +657,16 @@ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) tt_addr(rpc->peer->addr), ntohl(h->seg.offset), ntohl(h->message_length)); + if (h->ack.client_id) { + const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb); + + homa_rpc_unlock(rpc); + homa_rpc_acked(rpc->hsk, &saddr, &h->ack); + homa_rpc_lock(rpc); + if (rpc->state == RPC_DEAD) + goto discard; + } + if (rpc->state != RPC_INCOMING && homa_is_client(rpc->id)) { if (unlikely(rpc->state != RPC_OUTGOING)) goto discard; diff --git a/homa_rpc.c b/homa_rpc.c index dad4e1df..3d415ae9 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -216,7 +216,7 @@ struct homa_rpc *homa_rpc_alloc_server(struct homa_sock *hsk, * @hsk: Socket on which the ack was received. May or may not correspond * to the RPC, but can sometimes be used to avoid a socket lookup. * @saddr: Source address from which the act was received (the client - * note for the RPC) + * node for the RPC) * @ack: Information about an RPC from @saddr that may now be deleted * safely. 
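The reasoning behind the unlock/relock in the new homa_data_pkt ack path deserves spelling out (an annotated restatement of code from this commit, not new mechanism; all names are from the patch):

        if (h->ack.client_id) {
                /* homa_rpc_acked may need to lock the acked RPC, which
                 * can live in the same lock bucket as rpc, so drop rpc's
                 * lock first to avoid deadlock.
                 */
                homa_rpc_unlock(rpc);
                homa_rpc_acked(rpc->hsk, &saddr, &h->ack);
                homa_rpc_lock(rpc);

                /* While unlocked, rpc itself may have been ended (a unit
                 * test later in this commit exercises exactly this), so
                 * recheck before touching it.
                 */
                if (rpc->state == RPC_DEAD)
                        goto discard;
        }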
*/ diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 6704fdc9..ba314981 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -1291,26 +1291,6 @@ TEST_F(homa_incoming, homa_dispatch_pkts__handle_ack) EXPECT_STREQ("DEAD", homa_symbol_for_state(srpc)); EXPECT_SUBSTR("ack 1235", unit_log_get()); } -TEST_F(homa_incoming, homa_dispatch_pkts__too_many_acks) -{ - struct sk_buff *skb, *skb2, *skb3; - - self->data.ack = (struct homa_ack) { - .server_port = htons(self->server_port), - .client_id = cpu_to_be64(self->client_id)}; - self->data.common.sender_id = cpu_to_be64(self->client_id+10); - unit_log_clear(); - skb = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); - self->data.ack.client_id = cpu_to_be64(self->client_id+2); - skb2 = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); - self->data.ack.client_id = cpu_to_be64(self->client_id+4); - skb3 = mock_skb_alloc(self->client_ip, &self->data.common, 1400, 0); - skb->next = skb2; - skb2->next = skb3; - homa_dispatch_pkts(skb); - EXPECT_STREQ("sk->sk_data_ready invoked; ack 1237; ack 1235", - unit_log_get()); -} #ifndef __STRIP__ /* See strip.py */ TEST_F(homa_incoming, homa_dispatch_pkts__invoke_homa_grant_check_rpc) { @@ -1390,6 +1370,61 @@ TEST_F(homa_incoming, homa_data_pkt__basics) EXPECT_EQ(1, homa_metrics_per_cpu()->responses_received); #endif /* See strip.py */ } +TEST_F(homa_incoming, homa_data_pkt__handle_ack) +{ + struct homa_rpc *srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 100, 3000); + struct homa_rpc *srpc2 = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, + self->client_ip, self->server_ip, self->client_port, + self->server_id+2, 10000, 1000); + + ASSERT_NE(NULL, srpc1); + ASSERT_NE(NULL, srpc2); + EXPECT_EQ(8600, srpc2->msgin.bytes_remaining); + + self->data.ack = (struct homa_ack) { + .server_port = htons(self->hsk.port), + .client_id = cpu_to_be64(self->client_id)}; + self->data.common.sender_id = cpu_to_be64(self->client_id+2); + self->data.seg.offset = htonl(1400); + unit_log_clear(); + homa_rpc_lock(srpc2); + homa_data_pkt(mock_skb_alloc(self->client_ip, &self->data.common, + 1400, 0), srpc2); + homa_rpc_unlock(srpc2); + EXPECT_EQ(RPC_DEAD, srpc1->state); + EXPECT_SUBSTR("ack 1235; homa_rpc_end invoked", unit_log_get()); + EXPECT_EQ(7200, srpc2->msgin.bytes_remaining); +} +TEST_F(homa_incoming, homa_data_pkt__handle_ack_rpc_now_dead) +{ + struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 10000, 1000); + + ASSERT_NE(NULL, srpc); + EXPECT_EQ(8600, srpc->msgin.bytes_remaining); + + /* This is a bit contrived, but the ack terminates the RPC for which + * the data packet was intended. 
+ */ + self->data.ack = (struct homa_ack) { + .server_port = htons(self->hsk.port), + .client_id = cpu_to_be64(self->client_id)}; + self->data.common.sender_id = cpu_to_be64(self->client_id); + self->data.seg.offset = htonl(1400); + unit_log_clear(); + homa_rpc_lock(srpc); + homa_data_pkt(mock_skb_alloc(self->client_ip, &self->data.common, + 1400, 0), srpc); + homa_rpc_unlock(srpc); + EXPECT_EQ(RPC_DEAD, srpc->state); + EXPECT_SUBSTR("ack 1235; " + "homa_rpc_end invoked; " + "homa_data_pkt discarded packet", unit_log_get()); + EXPECT_EQ(8600, srpc->msgin.bytes_remaining); +} TEST_F(homa_incoming, homa_data_pkt__wrong_client_rpc_state) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, From e7946a9d3730605cd3bca816e4a10562fb733b17 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 1 Sep 2025 15:21:09 -0700 Subject: [PATCH 477/625] Remove global data variable in homa_plumbing.c --- homa_plumbing.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index 845c9270..98d28f1e 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -28,21 +28,12 @@ static struct pernet_operations homa_net_ops = { .size = sizeof(struct homa_net) }; -/* Global data for Homa. Never reference homa_data directly. Always use - * the global_homa variable instead (or, even better, a homa pointer - * stored in a struct or passed via a parameter); this allows overriding - * during unit tests. +/* Global data for Homa. Avoid referencing directly except when there is + * no alternative (instead, use a homa pointer stored in a struct or + * passed via a parameter). This allows overriding during unit tests. */ static struct homa homa_data; -/* This variable contains the address of the statically-allocated struct homa - * used throughout Homa. This variable should almost never be used directly: - * it should be passed as a parameter to functions that need it. This - * variable is used only by a few functions called from Linux where there - * is no struct homa* available. - */ -static struct homa *global_homa = &homa_data; - /* This structure defines functions that handle various operations on * Homa sockets. These functions are relatively generic: they are called * to implement top-level system calls. 
Many of these operations can @@ -475,7 +466,7 @@ static int timer_thread_exit; */ int __init homa_load(void) { - struct homa *homa = global_homa; + struct homa *homa = &homa_data; bool init_protocol6 = false; bool init_protosw6 = false; bool init_protocol = false; @@ -697,7 +688,7 @@ int __init homa_load(void) */ void __exit homa_unload(void) { - struct homa *homa = global_homa; + struct homa *homa = &homa_data; pr_notice("Homa module unloading\n"); @@ -738,7 +729,7 @@ module_exit(homa_unload); int homa_net_start(struct net *net) { pr_notice("Homa attaching to net namespace\n"); - return homa_net_init(homa_net(net), net, global_homa); + return homa_net_init(homa_net(net), net, &homa_data); } /** From 6cd5210ab1d1f6fa8068e9d022289cb8e68a3c28 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 1 Sep 2025 15:53:51 -0700 Subject: [PATCH 478/625] A few minor cleanups in homa_plumbing.c from patch comments --- homa_plumbing.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index 98d28f1e..42b3b50d 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -122,7 +122,6 @@ static struct proto homav6_prot = { .unhash = homa_unhash, .obj_size = sizeof(struct homa_v6_sock), .ipv6_pinfo_offset = offsetof(struct homa_v6_sock, inet6), - .no_autobind = 1, }; @@ -523,8 +522,8 @@ int __init homa_load(void) BUILD_BUG_ON(sizeof(struct homa_abort_args) != 32); #endif /* See strip.py */ - pr_err("Homa module loading\n"); #ifndef __UPSTREAM__ /* See strip.py */ + pr_err("Homa module loading\n"); pr_notice("Homa structure sizes: homa_data_hdr %lu, homa_seg_hdr %lu, ack %lu, peer %lu, ip_hdr %lu flowi %lu ipv6_hdr %lu, flowi6 %lu tcp_sock %lu homa_rpc %lu sk_buff %lu skb_shared_info %lu rcvmsg_control %lu union sockaddr_in_union %lu HOMA_MAX_BPAGES %u NR_CPUS %u nr_cpu_ids %u, MAX_NUMNODES %d\n", sizeof(struct homa_data_hdr), sizeof(struct homa_seg_hdr), @@ -545,6 +544,12 @@ int __init homa_load(void) nr_cpu_ids, MAX_NUMNODES); #endif /* See strip.py */ + + status = homa_init(homa); + if (status) + goto error; + init_homa = true; + status = proto_register(&homa_prot, 1); if (status != 0) { pr_err("proto_register failed for homa_prot: %d\n", status); @@ -586,11 +591,6 @@ int __init homa_load(void) } init_protocol6 = true; - status = homa_init(homa); - if (status) - goto error; - init_homa = true; - #ifndef __STRIP__ /* See strip.py */ status = homa_metrics_init(); if (status != 0) @@ -706,13 +706,13 @@ void __exit homa_unload(void) homa_metrics_end(); #endif /* See strip.py */ unregister_pernet_subsys(&homa_net_ops); - homa_destroy(homa); inet_del_protocol(&homa_protocol, IPPROTO_HOMA); inet_unregister_protosw(&homa_protosw); inet6_del_protocol(&homav6_protocol, IPPROTO_HOMA); inet6_unregister_protosw(&homav6_protosw); proto_unregister(&homa_prot); proto_unregister(&homav6_prot); + homa_destroy(homa); #ifndef __UPSTREAM__ /* See strip.py */ tt_destroy(); #endif /* See strip.py */ From 1f2d5ede00d698d1d5a4642cbeb41524d26eba58 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 1 Sep 2025 15:59:07 -0700 Subject: [PATCH 479/625] Remove vestiges of tsc.h --- homa_impl.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index ec84f5ba..d39a7dfd 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -42,9 +42,6 @@ #include #include #include -#ifdef CONFIG_X86_TSC -#include -#endif #ifndef __UPSTREAM__ /* See strip.py */ #include "homa.h" @@ -883,7 +880,7 @@ static inline u64 homa_clock_khz(void) return 
1000000; #else /* __UNIT_TEST__ */ #ifndef __UPSTREAM__ /* See strip.py */ - return tsc_khz; + return cpu_khz; #else /* See strip.py */ return 1000000; #endif /* See strip.py */ From b15d52791181095eaf128a14a5c9809ee6872cd5 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 2 Sep 2025 16:30:37 -0700 Subject: [PATCH 480/625] Change initializer for homa_rpc->msgin.packets Use __skb_queue_head_init instead of skb_queue_head_init to indicate that the lock field isn't used. --- homa_incoming.c | 2 +- homa_rpc.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index bf70ddbf..2f6f934d 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -48,7 +48,7 @@ int homa_message_in_init(struct homa_rpc *rpc, int length) return -EINVAL; rpc->msgin.length = length; - skb_queue_head_init(&rpc->msgin.packets); + __skb_queue_head_init(&rpc->msgin.packets); INIT_LIST_HEAD(&rpc->msgin.gaps); rpc->msgin.bytes_remaining = length; IF_NO_STRIP(rpc->msgin.birth = homa_clock()); diff --git a/homa_rpc.h b/homa_rpc.h index aa862de8..694f1455 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -137,7 +137,8 @@ struct homa_message_in { /** * @packets: DATA packets for this message that have been received but - * not yet copied to user space (no particular order). + * not yet copied to user space (ordered by increasing offset). The + * lock in this structure is not used (the RPC lock is used instead). */ struct sk_buff_head packets; From dda2331c978bb5d89578ee7cb810b515a30c6127 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 4 Sep 2025 11:25:29 -0700 Subject: [PATCH 481/625] Use set_bit, clear_bit, etc. for flag bits --- homa_incoming.c | 29 ++++++++++++++--------------- homa_plumbing.c | 2 +- homa_rpc.c | 2 +- homa_rpc.h | 12 ++++++------ test/unit_homa_incoming.c | 32 ++++++++++++++++---------------- test/unit_homa_plumbing.c | 4 ++-- 6 files changed, 40 insertions(+), 41 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index 2f6f934d..b9d5bac2 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -330,7 +330,7 @@ int homa_copy_to_user(struct homa_rpc *rpc) continue; } if (n == 0) { - atomic_andnot(RPC_PKTS_READY, &rpc->flags); + clear_bit(RPC_PKTS_READY, &rpc->flags); break; } @@ -412,9 +412,9 @@ int homa_copy_to_user(struct homa_rpc *rpc) tt_record2("finished freeing %d skbs for id %d", n, rpc->id); n = 0; - atomic_or(APP_NEEDS_LOCK, &rpc->flags); + set_bit(APP_NEEDS_LOCK, &rpc->flags); homa_rpc_lock(rpc); - atomic_andnot(APP_NEEDS_LOCK, &rpc->flags); + clear_bit(APP_NEEDS_LOCK, &rpc->flags); if (error) break; } @@ -479,11 +479,10 @@ void homa_dispatch_pkts(struct sk_buff *skb) * elsewhere. 
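For reference, a minimal sketch of the conversion pattern applied throughout this commit (not part of the patch; set_bit/clear_bit/test_bit are the standard Linux bitops, and the flag names come from homa_rpc.h): the flag values become bit numbers instead of masks, and each atomic read-modify-write maps to a single bitop:

        /* Before: flags is atomic_t and RPC_PKTS_READY is a mask. */
        atomic_or(RPC_PKTS_READY, &rpc->flags);
        if (atomic_read(&rpc->flags) & RPC_PKTS_READY)
                atomic_andnot(RPC_PKTS_READY, &rpc->flags);

        /* After: flags is unsigned long and RPC_PKTS_READY is a bit
         * number; each operation below is atomic on the whole word.
         */
        set_bit(RPC_PKTS_READY, &rpc->flags);
        if (test_bit(RPC_PKTS_READY, &rpc->flags))
                clear_bit(RPC_PKTS_READY, &rpc->flags);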
*/ if (rpc) { - int flags = atomic_read(&rpc->flags); - - if (flags & APP_NEEDS_LOCK) { + if (test_bit(APP_NEEDS_LOCK, &rpc->flags)) { homa_rpc_unlock(rpc); - tt_record2("softirq released lock for id %d, flags 0x%x", rpc->id, flags); + tt_record2("softirq released lock for id %d, flags 0x%x", + rpc->id, rpc->flags); /* This short spin is needed to ensure that the * other thread gets the lock before this thread @@ -712,8 +711,8 @@ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc) homa_add_packet(rpc, skb); if (skb_queue_len(&rpc->msgin.packets) != 0 && - !(atomic_read(&rpc->flags) & RPC_PKTS_READY)) { - atomic_or(RPC_PKTS_READY, &rpc->flags); + !test_bit(RPC_PKTS_READY, &rpc->flags)) { + set_bit(RPC_PKTS_READY, &rpc->flags); homa_rpc_handoff(rpc); } @@ -1071,7 +1070,7 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) #endif /* See strip.py */ int result; - if (!(atomic_read(&rpc->flags) & RPC_PRIVATE)) + if (!test_bit(RPC_PRIVATE, &rpc->flags)) return -EINVAL; /* Each iteration through this loop waits until rpc needs attention @@ -1112,9 +1111,9 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) blocked |= interest.blocked; #endif /* See strip.py */ - atomic_or(APP_NEEDS_LOCK, &rpc->flags); + set_bit(APP_NEEDS_LOCK, &rpc->flags); homa_rpc_lock(rpc); - atomic_andnot(APP_NEEDS_LOCK, &rpc->flags); + clear_bit(APP_NEEDS_LOCK, &rpc->flags); homa_interest_unlink_private(&interest); tt_record3("homa_wait_private found rpc id %d, pid %d via handoff, blocked %d", rpc->id, current->pid, interest.blocked); @@ -1238,9 +1237,9 @@ struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking) rpc->id, current->pid, interest.blocked); } - atomic_or(APP_NEEDS_LOCK, &rpc->flags); + set_bit(APP_NEEDS_LOCK, &rpc->flags); homa_rpc_lock(rpc); - atomic_andnot(APP_NEEDS_LOCK, &rpc->flags); + clear_bit(APP_NEEDS_LOCK, &rpc->flags); if (!rpc->error) rpc->error = homa_copy_to_user(rpc); if (rpc->error) { @@ -1279,7 +1278,7 @@ void homa_rpc_handoff(struct homa_rpc *rpc) struct homa_sock *hsk = rpc->hsk; struct homa_interest *interest; - if (atomic_read(&rpc->flags) & RPC_PRIVATE) { + if (test_bit(RPC_PRIVATE, &rpc->flags)) { homa_interest_notify_private(rpc); return; } diff --git a/homa_plumbing.c b/homa_plumbing.c index 42b3b50d..5db698e2 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1088,7 +1088,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) } homa_rpc_hold(rpc); if (args.flags & HOMA_SENDMSG_PRIVATE) - atomic_or(RPC_PRIVATE, &rpc->flags); + set_bit(RPC_PRIVATE, &rpc->flags); INC_METRIC(send_calls, 1); tt_record4("homa_sendmsg request, target 0x%x:%d, id %u, length %d", (addr->in6.sin6_family == AF_INET) diff --git a/homa_rpc.c b/homa_rpc.c index 3d415ae9..6b83c3dc 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -195,7 +195,7 @@ struct homa_rpc *homa_rpc_alloc_server(struct homa_sock *hsk, list_add_tail_rcu(&srpc->active_links, &hsk->active_rpcs); homa_sock_unlock(hsk); if (ntohl(h->seg.offset) == 0 && srpc->msgin.num_bpages > 0) { - atomic_or(RPC_PKTS_READY, &srpc->flags); + set_bit(RPC_PKTS_READY, &srpc->flags); homa_rpc_handoff(srpc); } INC_METRIC(requests_received, 1); diff --git a/homa_rpc.h b/homa_rpc.h index 694f1455..db07f658 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -258,9 +258,9 @@ struct homa_rpc { * manipulated with atomic operations because some of the manipulations * occur without holding the RPC lock. 
*/ - atomic_t flags; + unsigned long flags; - /* Valid bits for @flags: + /* Valid bit numbers for @flags: * RPC_PKTS_READY - The RPC has input packets ready to be * copied to user space. * APP_NEEDS_LOCK - Means that code in the application thread @@ -276,9 +276,9 @@ struct homa_rpc { * where the app explicitly requests the * response from this particular RPC. */ -#define RPC_PKTS_READY 1 -#define APP_NEEDS_LOCK 4 -#define RPC_PRIVATE 8 +#define RPC_PKTS_READY 0 +#define APP_NEEDS_LOCK 1 +#define RPC_PRIVATE 2 /** * @refs: Number of references to this RPC, including one for each @@ -564,7 +564,7 @@ static inline bool homa_is_client(u64 id) */ static inline bool homa_rpc_needs_attention(struct homa_rpc *rpc) { - return (rpc->error != 0 || atomic_read(&rpc->flags) & RPC_PKTS_READY); + return (rpc->error != 0 || test_bit(RPC_PKTS_READY, &rpc->flags)); } #endif /* _HOMA_RPC_H */ diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index ba314981..40a8794a 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -754,7 +754,7 @@ TEST_F(homa_incoming, homa_copy_to_user__basics) self->data.seg.offset = htonl(2800); homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, 1200, 201800), crpc); - EXPECT_NE(0, atomic_read(&crpc->flags) & RPC_PKTS_READY); + EXPECT_NE(0, test_bit(RPC_PKTS_READY, &crpc->flags)); unit_log_clear(); mock_copy_to_user_dont_copy = -1; @@ -767,7 +767,7 @@ TEST_F(homa_incoming, homa_copy_to_user__basics) "skb_copy_datagram_iter: 1200 bytes to 0x1000af0: 201800-202999", unit_log_get()); EXPECT_EQ(0, skb_queue_len(&crpc->msgin.packets)); - EXPECT_EQ(0, atomic_read(&crpc->flags) & RPC_PKTS_READY); + EXPECT_EQ(0, test_bit(RPC_PKTS_READY, &crpc->flags)); } TEST_F(homa_incoming, homa_copy_to_user__rpc_freed) { @@ -1543,7 +1543,7 @@ TEST_F(homa_incoming, homa_data_pkt__handoff) homa_data_pkt(mock_skb_alloc(self->server_ip, &self->data.common, 1400, 0), crpc); EXPECT_EQ(1, unit_list_length(&self->hsk.ready_rpcs)); - EXPECT_TRUE(atomic_read(&crpc->flags) & RPC_PKTS_READY); + EXPECT_TRUE(test_bit(RPC_PKTS_READY, &crpc->flags)); EXPECT_EQ(1600, crpc->msgin.bytes_remaining); EXPECT_EQ(1, skb_queue_len(&crpc->msgin.packets)); EXPECT_STREQ("sk->sk_data_ready invoked", unit_log_get()); @@ -2401,13 +2401,13 @@ TEST_F(homa_incoming, homa_wait_private__rpc_has_error) self->server_port, self->client_id, 20000, 1600); ASSERT_NE(NULL, crpc); - ASSERT_EQ(RPC_PKTS_READY, atomic_read(&crpc->flags)); - atomic_or(RPC_PRIVATE, &crpc->flags); + ASSERT_EQ(1, test_bit(RPC_PKTS_READY, &crpc->flags)); + set_bit(RPC_PRIVATE, &crpc->flags); crpc->error = -ENOENT; homa_rpc_lock(crpc); EXPECT_EQ(0, -homa_wait_private(crpc, 0)); homa_rpc_unlock(crpc); - EXPECT_EQ(RPC_PKTS_READY, atomic_read(&crpc->flags) & RPC_PKTS_READY); + EXPECT_EQ(1, test_bit(RPC_PKTS_READY, &crpc->flags)); IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_none)); IF_NO_STRIP(EXPECT_EQ(0, homa_metrics_per_cpu()->wait_block)); IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_fast)); @@ -2419,8 +2419,8 @@ TEST_F(homa_incoming, homa_wait_private__copy_to_user_fails) self->server_port, self->client_id, 20000, 1600); ASSERT_NE(NULL, crpc); - ASSERT_EQ(RPC_PKTS_READY, atomic_read(&crpc->flags)); - atomic_or(RPC_PRIVATE, &crpc->flags); + ASSERT_EQ(1, test_bit(RPC_PKTS_READY, &crpc->flags)); + set_bit(RPC_PRIVATE, &crpc->flags); mock_copy_data_errors = 1; homa_rpc_lock(crpc); EXPECT_EQ(0, -homa_wait_private(crpc, 0)); @@ -2434,12 +2434,12 @@ TEST_F(homa_incoming, 
homa_wait_private__available_immediately) self->server_port, self->client_id, 20000, 1600); ASSERT_NE(NULL, crpc); - ASSERT_EQ(RPC_PKTS_READY, atomic_read(&crpc->flags)); - atomic_or(RPC_PRIVATE, &crpc->flags); + ASSERT_EQ(1, test_bit(RPC_PKTS_READY, &crpc->flags)); + set_bit(RPC_PRIVATE, &crpc->flags); homa_rpc_lock(crpc); EXPECT_EQ(0, homa_wait_private(crpc, 0)); homa_rpc_unlock(crpc); - ASSERT_EQ(RPC_PRIVATE, atomic_read(&crpc->flags)); + EXPECT_EQ(1, test_bit(RPC_PRIVATE, &crpc->flags)); IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->wait_none)); } TEST_F(homa_incoming, homa_wait_private__nonblocking) @@ -2449,7 +2449,7 @@ TEST_F(homa_incoming, homa_wait_private__nonblocking) self->server_port, self->client_id, 20000, 1600); ASSERT_NE(NULL, crpc); - atomic_or(RPC_PRIVATE, &crpc->flags); + set_bit(RPC_PRIVATE, &crpc->flags); homa_rpc_lock(crpc); EXPECT_EQ(EAGAIN, -homa_wait_private(crpc, 1)); @@ -2465,7 +2465,7 @@ TEST_F(homa_incoming, homa_wait_private__signal_notify_race) self->server_port, self->client_id, 20000, 1000); ASSERT_NE(NULL, crpc); - atomic_or(RPC_PRIVATE, &crpc->flags); + set_bit(RPC_PRIVATE, &crpc->flags); IF_NO_STRIP(self->homa.poll_cycles = 0); unit_hook_register(handoff_hook); hook_rpc = crpc; @@ -2500,7 +2500,7 @@ TEST_F(homa_incoming, homa_wait_shared__rpc_already_ready) struct homa_rpc *rpc; ASSERT_NE(NULL, crpc); - ASSERT_EQ(RPC_PKTS_READY, atomic_read(&crpc->flags)); + ASSERT_EQ(1, test_bit(RPC_PKTS_READY, &crpc->flags)); rpc = homa_wait_shared(&self->hsk, 0); ASSERT_FALSE(IS_ERR(rpc)); @@ -2605,7 +2605,7 @@ TEST_F(homa_incoming, homa_wait_shared__copy_to_user_fails) struct homa_rpc *rpc; ASSERT_NE(NULL, crpc); - ASSERT_EQ(RPC_PKTS_READY, atomic_read(&crpc->flags)); + ASSERT_EQ(1, test_bit(RPC_PKTS_READY, &crpc->flags)); mock_copy_data_errors = 1; rpc = homa_wait_shared(&self->hsk, 0); @@ -2659,7 +2659,7 @@ TEST_F(homa_incoming, homa_rpc_handoff__private_rpc) self->server_port, self->client_id, 20000, 1600); ASSERT_NE(NULL, crpc); - atomic_or(RPC_PRIVATE, &crpc->flags); + set_bit(RPC_PRIVATE, &crpc->flags); homa_interest_init_private(&interest, crpc); mock_log_wakeups = 1; unit_log_clear(); diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index e0075fe1..cf996b7b 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -766,7 +766,7 @@ TEST_F(homa_plumbing, homa_recvmsg__error_from_homa_wait_private) self->client_id, 100, 2000); EXPECT_NE(NULL, crpc); - atomic_or(RPC_PRIVATE, &crpc->flags); + set_bit(RPC_PRIVATE, &crpc->flags); self->recvmsg_args.id = crpc->id; @@ -782,7 +782,7 @@ TEST_F(homa_plumbing, homa_recvmsg__private_rpc_has_error) self->client_id, 100, 2000); EXPECT_NE(NULL, crpc); - atomic_or(RPC_PRIVATE, &crpc->flags); + set_bit(RPC_PRIVATE, &crpc->flags); crpc->error = -ETIMEDOUT; self->recvmsg_args.id = crpc->id; From 9311cd35bfec1fdaf1b3f3c52f7d979be492bf3f Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 5 Sep 2025 11:30:18 -0700 Subject: [PATCH 482/625] Implement homa_rpc_lock_preempt helper function --- homa_incoming.c | 12 +++--------- homa_rpc.h | 13 +++++++++++++ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/homa_incoming.c b/homa_incoming.c index b9d5bac2..bbe82a23 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -412,9 +412,7 @@ int homa_copy_to_user(struct homa_rpc *rpc) tt_record2("finished freeing %d skbs for id %d", n, rpc->id); n = 0; - set_bit(APP_NEEDS_LOCK, &rpc->flags); - homa_rpc_lock(rpc); - clear_bit(APP_NEEDS_LOCK, &rpc->flags); + homa_rpc_lock_preempt(rpc); if 
(error) break; } @@ -1111,9 +1109,7 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) blocked |= interest.blocked; #endif /* See strip.py */ - set_bit(APP_NEEDS_LOCK, &rpc->flags); - homa_rpc_lock(rpc); - clear_bit(APP_NEEDS_LOCK, &rpc->flags); + homa_rpc_lock_preempt(rpc); homa_interest_unlink_private(&interest); tt_record3("homa_wait_private found rpc id %d, pid %d via handoff, blocked %d", rpc->id, current->pid, interest.blocked); @@ -1237,9 +1233,7 @@ struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking) rpc->id, current->pid, interest.blocked); } - set_bit(APP_NEEDS_LOCK, &rpc->flags); - homa_rpc_lock(rpc); - clear_bit(APP_NEEDS_LOCK, &rpc->flags); + homa_rpc_lock_preempt(rpc); if (!rpc->error) rpc->error = homa_copy_to_user(rpc); if (rpc->error) { diff --git a/homa_rpc.h b/homa_rpc.h index db07f658..557df449 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -468,6 +468,19 @@ static inline int homa_rpc_try_lock(struct homa_rpc *rpc) return 1; } +/** + * homa_rpc_lock_preempt() - Same as homa_rpc_lock, except sets the + * APP_NEEDS_LOCK flag while waiting to encourage the existing lock + * owner to relinquish the lock. + */ +static inline void homa_rpc_lock_preempt(struct homa_rpc *rpc) + __acquires(rpc->bucket->lock) +{ + set_bit(APP_NEEDS_LOCK, &rpc->flags); + homa_bucket_lock(rpc->bucket, rpc->id); + clear_bit(APP_NEEDS_LOCK, &rpc->flags); +} + /** * homa_rpc_unlock() - Release the lock for an RPC. * @rpc: RPC to unlock. From c441b36545991ca0f6316323e2b724546bac25c0 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 5 Sep 2025 12:02:59 -0700 Subject: [PATCH 483/625] Remove extraneous refcount check in __homa_xmit_control --- homa_outgoing.c | 34 +--------------------------------- 1 file changed, 1 insertion(+), 33 deletions(-) diff --git a/homa_outgoing.c b/homa_outgoing.c index f5516008..0a5c6d9a 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -475,7 +475,6 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, priority = hsk->homa->num_priorities - 1; #endif /* See strip.py */ skb->ooo_okay = 1; - skb_get(skb); #ifndef __STRIP__ /* See strip.py */ if (hsk->inet.sk.sk_family == AF_INET6) { result = ip6_xmit(&hsk->inet.sk, skb, &peer->flow.u.ip6, 0, @@ -493,38 +492,8 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, else result = ip_queue_xmit(&hsk->inet.sk, skb, &peer->flow); #endif /* See strip.py */ - if (unlikely(result != 0)) { + if (unlikely(result != 0)) INC_METRIC(control_xmit_errors, 1); - - /* It appears that ip*_xmit frees skbuffs after - * errors; the following code is to raise an alert if - * this isn't actually the case. The extra skb_get above - * and kfree_skb call below are needed to do the check - * accurately (otherwise the buffer could be freed and - * its memory used for some other purpose, resulting in - * a bogus "reference count".
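The rationale for this removal, restated with the reference counting made explicit (illustrative, not part of the patch; ip_queue_xmit and ip6_xmit are the standard kernel entry points used above, and the premise of the commit is that they consume the skb's reference even on error):

        /* One reference exists on skb from its allocation. */
        result = ip_queue_xmit(&hsk->inet.sk, skb, &peer->flow);
        /* That reference has now been consumed whether result is 0 or
         * not, so the caller must not touch skb again.  The old
         * skb_get()/kfree_skb() bracketing existed only to verify this
         * assumption at runtime, and is no longer needed.
         */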
-		 */
-		if (refcount_read(&skb->users) > 1) {
-#ifndef __STRIP__ /* See strip.py */
-			if (hsk->inet.sk.sk_family == AF_INET6) {
-				pr_notice("ip6_xmit didn't free Homa control packet (type %d) after error %d\n",
-					  h->type, result);
-			} else {
-				pr_notice("ip_queue_xmit didn't free Homa control packet (type %d) after error %d\n",
-					  h->type, result);
-				tt_record2("ip_queue_xmit didn't free Homa control packet (type %d) after error %d\n",
-					   h->type, result);
-			}
-#else /* See strip.py */
-			if (hsk->inet.sk.sk_family == AF_INET6)
-				pr_notice("ip6_xmit didn't free Homa control packet (type %d) after error %d\n",
-					  h->type, result);
-			else
-				pr_notice("ip_queue_xmit didn't free Homa control packet (type %d) after error %d\n",
-					  h->type, result);
-#endif /* See strip.py */
-		}
-	}
 #ifndef __STRIP__ /* See strip.py */
 	txq = netdev_get_tx_queue(skb->dev, skb->queue_mapping);
 	if (netif_tx_queue_stopped(txq))
@@ -535,7 +504,6 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer,
 	INC_METRIC(packets_sent[h->type - DATA], 1);
 	INC_METRIC(priority_bytes[priority], skb->len);
 	INC_METRIC(priority_packets[priority], 1);
-	kfree_skb(skb);
 	return result;
 }
 

From e27fdcb5ba71480b076b6a48e5b3f3205364c488 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 5 Sep 2025 14:26:34 -0700
Subject: [PATCH 484/625] Use consume_skb and kfree_skb_reason instead of kfree_skb

---
 homa_incoming.c | 23 +++++++++++++++--------
 homa_plumbing.c |  6 +++++-
 homa_qdisc.c    |  4 ++--
 homa_rpc.c      |  3 ++-
 homa_skb.c      |  4 ++--
 homa_stub.h     |  4 ++--
 test/Makefile   |  2 +-
 test/mock.c     |  4 ++++
 8 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/homa_incoming.c b/homa_incoming.c
index bbe82a23..fee19b88 100644
--- a/homa_incoming.c
+++ b/homa_incoming.c
@@ -167,11 +167,13 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb)
 	struct homa_gap *gap, *dummy, *gap2;
 	int start = ntohl(h->seg.offset);
 	int length = homa_data_len(skb);
+	enum skb_drop_reason reason;
 	int end = start + length;
 
 	if ((start + length) > rpc->msgin.length) {
 		tt_record3("Packet extended past message end; id %d, offset %d, length %d",
 			   rpc->id, start, length);
+		reason = SKB_DROP_REASON_PKT_TOO_BIG;
 		goto discard;
 	}
 
@@ -187,6 +189,7 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb)
 				  rpc->msgin.recv_end, start)) {
 			tt_record2("Couldn't allocate gap for id %d (start %d): no memory",
 				   rpc->id, start);
+			reason = SKB_DROP_REASON_NOMEM;
 			goto discard;
 		}
 		rpc->msgin.recv_end = end;
@@ -204,11 +207,13 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb)
 		if (start < gap->start) {
 			tt_record4("Packet overlaps gap start: id %d, start %d, end %d, gap_start %d",
 				   rpc->id, start, end, gap->start);
+			reason = SKB_DROP_REASON_DUP_FRAG;
 			goto discard;
 		}
 		if (end > gap->end) {
 			tt_record4("Packet overlaps gap end: id %d, start %d, end %d, gap_end %d",
 				   rpc->id, start, end, gap->start);
+			reason = SKB_DROP_REASON_DUP_FRAG;
 			goto discard;
 		}
 		gap->start = end;
@@ -228,6 +233,7 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb)
 		if (end > gap->end) {
 			tt_record4("Packet overlaps gap end: id %d, start %d, end %d, gap_end %d",
 				   rpc->id, start, end, gap->start);
+			reason = SKB_DROP_REASON_DUP_FRAG;
 			goto discard;
 		}
 		gap->end = start;
@@ -239,6 +245,7 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb)
 		if (!gap2) {
 			tt_record2("Couldn't allocate gap for split for id %d (start %d): no memory",
 				   rpc->id, end);
+			reason = SKB_DROP_REASON_NOMEM;
 			goto discard;
 		}
 		gap2->time = gap->time;
@@ -255,7 +262,7 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb)
 #endif /* See strip.py */
 	tt_record4("homa_add_packet discarding packet for id %d, offset %d, length %d, retransmit %d",
 		   rpc->id, start, length, h->retransmit);
-	kfree_skb(skb);
+	kfree_skb_reason(skb, reason);
 	return;
 
 keep:
@@ -406,7 +413,7 @@ int homa_copy_to_user(struct homa_rpc *rpc)
 		start = homa_clock();
 #endif /* See strip.py */
 		for (i = 0; i < n; i++)
-			kfree_skb(skbs[i]);
+			consume_skb(skbs[i]);
 		INC_METRIC(skb_free_cycles, homa_clock() - start);
 		INC_METRIC(skb_frees, n);
 		tt_record2("finished freeing %d skbs for id %d",
@@ -771,7 +778,7 @@ void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc)
 			rpc->msgout.sched_priority = h->priority;
 		homa_xmit_data(rpc, false);
 	}
-	kfree_skb(skb);
+	consume_skb(skb);
 }
 #endif /* See strip.py */
 
@@ -855,7 +862,7 @@ void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc,
 	}
 
 done:
-	kfree_skb(skb);
+	consume_skb(skb);
 }
 
 /**
@@ -915,7 +922,7 @@ void homa_rpc_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc)
 #endif /* See strip.py */
 	}
 done:
-	kfree_skb(skb);
+	consume_skb(skb);
 }
 
 #ifndef __STRIP__ /* See strip.py */
@@ -940,7 +947,7 @@ void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk)
 		peer->cutoff_version = h->cutoff_version;
 		homa_peer_release(peer);
 	}
-	kfree_skb(skb);
+	consume_skb(skb);
 }
 #endif /* See strip.py */
 
@@ -1001,7 +1008,7 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk,
 	homa_peer_release(peer);
 
 done:
-	kfree_skb(skb);
+	consume_skb(skb);
 }
 
 /**
@@ -1042,7 +1049,7 @@ void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk,
 	}
 	tt_record3("ACK received for id %d, peer 0x%x, with %d other acks",
 		   homa_local_id(h->common.sender_id), tt_addr(saddr), count);
-	kfree_skb(skb);
+	consume_skb(skb);
 }
 
 /**
diff --git a/homa_plumbing.c b/homa_plumbing.c
index 5db698e2..560cbe01 100644
--- a/homa_plumbing.c
+++ b/homa_plumbing.c
@@ -1390,6 +1390,7 @@ int homa_softirq(struct sk_buff *skb)
 {
 	struct sk_buff *packets, *other_pkts, *next;
 	struct sk_buff **prev_link, **other_link;
+	enum skb_drop_reason reason;
 	struct homa_common_hdr *h;
 	int header_offset;
 
@@ -1427,6 +1428,7 @@ int homa_softirq(struct sk_buff *skb)
 			pr_notice("Homa can't handle fragmented packet (no space for header); discarding\n");
 #endif /* See strip.py */
 			UNIT_LOG("", "pskb discard");
+			reason = SKB_DROP_REASON_HDR_TRUNC;
 			goto discard;
 		}
 		header_offset = skb_transport_header(skb) - skb->data;
@@ -1448,6 +1450,7 @@ int homa_softirq(struct sk_buff *skb)
 				  skb->len - header_offset);
 #endif /* See strip.py */
 			INC_METRIC(short_packets, 1);
+			reason = SKB_DROP_REASON_PKT_TOO_SMALL;
 			goto discard;
 		}
 
@@ -1466,6 +1469,7 @@ int homa_softirq(struct sk_buff *skb)
 					   homa_local_id(h->sender_id));
 				tt_freeze();
 			}
+			reason = SKB_CONSUMED;
 			goto discard;
 		}
 #endif /* See strip.py */
@@ -1487,7 +1491,7 @@ int homa_softirq(struct sk_buff *skb)
 
 discard:
 		*prev_link = skb->next;
-		kfree_skb(skb);
+		kfree_skb_reason(skb, reason);
 	}
 
 	/* Now process the longer packets. Each iteration of this loop
diff --git a/homa_qdisc.c b/homa_qdisc.c
index 68161744..538c5aca 100755
--- a/homa_qdisc.c
+++ b/homa_qdisc.c
@@ -434,7 +434,7 @@ void homa_qdisc_free_homa(struct homa_qdisc_dev *qdev)
 		skb = homa_qdisc_dequeue_homa(qdev);
 		if (!skb)
 			break;
-		kfree_skb(skb);
+		kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE);
 	}
 }
 
@@ -653,7 +653,7 @@ int homa_qdisc_redirect_skb(struct sk_buff *skb,
 		/* Couldn't find a Homa qdisc to use; drop the skb.
 		 * Shouldn't ever happen?
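		 * If it somehow does, record a distinct drop reason so
		 * these drops are visible to drop-monitoring tools.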
 		 */
-		kfree_skb(skb);
+		kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP);
 		result = NET_XMIT_DROP;
 		goto done;
 	}
diff --git a/homa_rpc.c b/homa_rpc.c
index 6b83c3dc..64c16b8f 100644
--- a/homa_rpc.c
+++ b/homa_rpc.c
@@ -565,7 +565,8 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all)
 			if (rpc->msgin.length >= 0 &&
 			    !skb_queue_empty_lockless(&rpc->msgin.packets)) {
 				rx_frees += skb_queue_len(&rpc->msgin.packets);
-				__skb_queue_purge(&rpc->msgin.packets);
+				__skb_queue_purge_reason(&rpc->msgin.packets,
+							 SKB_CONSUMED);
 			}
 
 			/* If we get here, it means all packets have been
diff --git a/homa_skb.c b/homa_skb.c
index 36aa438f..7df69482 100644
--- a/homa_skb.c
+++ b/homa_skb.c
@@ -470,7 +470,7 @@ void homa_skb_free_many_tx(struct homa *homa, struct sk_buff **skbs, int count)
 			/* This sk_buff is still in use somewhere, so can't
 			 * reclaim its pages.
 			 */
-			kfree_skb(skb);
+			consume_skb(skb);
 			continue;
 		}
 
@@ -492,7 +492,7 @@ void homa_skb_free_many_tx(struct homa *homa, struct sk_buff **skbs, int count)
 			}
 		}
 		shinfo->nr_frags = 0;
-		kfree_skb(skb);
+		consume_skb(skb);
 	}
 	if (num_pages > 0)
 		homa_skb_cache_pages(homa, pages_to_cache, num_pages);
diff --git a/homa_stub.h b/homa_stub.h
index 875d3bfe..502cd93d 100644
--- a/homa_stub.h
+++ b/homa_stub.h
@@ -54,7 +54,7 @@ static inline int homa_skb_append_from_skb(struct homa *homa,
 
 static inline void homa_skb_free_tx(struct homa *homa, struct sk_buff *skb)
 {
-	kfree_skb(skb);
+	consume_skb(skb);
 }
 
 static inline void homa_skb_free_many_tx(struct homa *homa,
@@ -63,7 +63,7 @@ static inline void homa_skb_free_many_tx(struct homa *homa,
 	int i;
 
 	for (i = 0; i < count; i++)
-		kfree_skb(skbs[i]);
+		consume_skb(skbs[i]);
 }
 
 static inline void homa_skb_get(struct sk_buff *skb, void *dest, int offset,
diff --git a/test/Makefile b/test/Makefile
index 2603bdfa..c586f06f 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -35,7 +35,7 @@ ifneq ($(__STRIP__),)
 DEFS += -D__STRIP__
 endif
 
-WARNS := -Wall -Wundef -Wno-trigraphs -Wno-sign-compare \
+WARNS := -Wall -Wundef -Wno-trigraphs -Wno-sign-compare -Wuninitialized \
 	-Wno-strict-aliasing -Wunused-but-set-variable -Werror
 CFLAGS := $(WARNS) -Wstrict-prototypes -MD -no-pie -g $(CINCLUDES) $(DEFS)
 CCFLAGS := -std=c++11 $(WARNS) -MD -g $(CCINCLUDES) $(DEFS) -fsanitize=address
diff --git a/test/mock.c b/test/mock.c
index 01ad29c1..1437c4f3 100644
--- a/test/mock.c
+++ b/test/mock.c
@@ -378,6 +378,10 @@ bool cancel_work_sync(struct work_struct *work)
 
 void __check_object_size(const void *ptr, unsigned long n, bool to_user) {}
 
+void consume_skb(struct sk_buff *skb)
+{
+	kfree_skb(skb);
+}
+
 size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *iter)
 {
 	size_t bytes_left = bytes;

From 54e8390c311d7016f7fad30b5910adce584730b5 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Tue, 9 Sep 2025 14:27:20 -0700
Subject: [PATCH 485/625] Refactor mechanism for deferring Homa packets in
 homa_qdisc

* Previous approach could result in repeated scans of a very long
  packet list
* New approach keeps list of deferred packets in the RPC struct (so no
  need to scan deferred list to find previous info for RPC)
* New approach also uses red-black tree to reduce list scan time

---
 homa_impl.h            |  24 +-
 homa_outgoing.c        |   2 +-
 homa_qdisc.c           | 198 ++++------
 homa_qdisc.h           |  66 +++-
 homa_rpc.c             |   2 +
 homa_rpc.h             |   9 +
 test/Makefile          |   2 +-
 test/unit_homa_qdisc.c | 685 ++++++++++++++++++++++++++++++++---------
 8 files changed, 730 insertions(+), 258 deletions(-)

diff --git a/homa_impl.h b/homa_impl.h
index d39a7dfd..ff2fad8b 100644
--- a/homa_impl.h
+++ b/homa_impl.h
@@ -570,30 +570,8 @@ struct homa_skb_info {
 	 */
 	int offset;
 
-	/**
-	 * @bytes_left: number of bytes in this packet and all later packets
-	 * in the same message. Used to priroitize packets for SRPT.
-	 */
-	int bytes_left;
-
-	/**
-	 * @rpc: RPC that this packet came from. Used only as a unique
-	 * identifier: it is not safe to dereference this pointer (the RPC
-	 * may no longer exist).
-	 */
+	/** @rpc: RPC that this packet belongs to. */
 	void *rpc;
-
-	/**
-	 * @next_sibling: next packet in @rpc that has been deferred in
-	 * homa_qdisc because the NIC queue was too long, or NULL if none.
-	 */
-	struct sk_buff *next_sibling;
-
-	/**
-	 * @last_sibling: last packet in @next_sibling list. Only valid
-	 * for the "head" packet (which is in qdev->homa_deferred).
-	 */
-	struct sk_buff *last_sibling;
 };
 
 /**
diff --git a/homa_outgoing.c b/homa_outgoing.c
index 0a5c6d9a..a9cc0190 100644
--- a/homa_outgoing.c
+++ b/homa_outgoing.c
@@ -188,7 +188,6 @@ struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc,
 	homa_info->data_bytes = length;
 	homa_info->seg_length = max_seg_data;
 	homa_info->offset = offset;
-	homa_info->bytes_left = rpc->msgout.length - offset;
 	homa_info->rpc = rpc;
 
 #ifndef __STRIP__ /* See strip.py */
@@ -829,6 +828,7 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end)
 		new_homa_info->data_bytes = seg_length;
 		new_homa_info->seg_length = seg_length;
 		new_homa_info->offset = offset;
+		new_homa_info->rpc = rpc;
 		tt_record3("retransmitting offset %d, length %d, id %d",
 			   offset, seg_length, rpc->id);
 #ifndef __STRIP__ /* See strip.py */
diff --git a/homa_qdisc.c b/homa_qdisc.c
index 538c5aca..45b6dd93 100755
--- a/homa_qdisc.c
+++ b/homa_qdisc.c
@@ -14,6 +14,7 @@
 #include "homa_impl.h"
 #include "homa_pacer.h"
 #include "homa_qdisc.h"
+#include "homa_rpc.h"
 #include "timetrace.h"
 
 #include
@@ -83,8 +84,9 @@ struct homa_qdisc_dev *homa_qdisc_qdev_get(struct homa_net *hnet,
 	qdev->redirect_qix = -1;
 	homa_qdisc_update_sysctl(qdev);
 	INIT_LIST_HEAD(&qdev->links);
-	skb_queue_head_init(&qdev->homa_deferred);
+	qdev->deferred_rpcs = RB_ROOT_CACHED;
 	skb_queue_head_init(&qdev->tcp_deferred);
+	spin_lock_init(&qdev->defer_lock);
 	init_waitqueue_head(&qdev->pacer_sleep);
 	spin_lock_init(&qdev->pacer_mutex);
 
@@ -269,7 +271,7 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		goto enqueue;
 	}
 
-	if (skb_queue_empty(&qdev->homa_deferred) &&
+	if (!homa_qdisc_any_deferred(qdev) &&
 	    homa_qdisc_update_link_idle(qdev, pkt_len,
 					homa->pacer->max_nic_queue_cycles))
 		goto enqueue;
 
@@ -277,9 +279,8 @@
 	/* This packet needs to be deferred until the NIC queue has
 	 * been drained a bit.
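	 * The skb is queued on its RPC (rpc->qrpc.packets) and the RPC
	 * is tracked in qdev->deferred_rpcs, from which the pacer will
	 * transmit deferred packets in SRPT order.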
 	 */
-	tt_record4("homa_qdisc_enqueue deferring homa data packet for id %d, offset %d, bytes_left %d on qid %d",
-		   be64_to_cpu(h->common.sender_id), offset,
-		   homa_get_skb_info(skb)->bytes_left, qdev->pacer_qix);
+	tt_record3("homa_qdisc_enqueue deferring homa data packet for id %d, offset %d on qid %d",
+		   be64_to_cpu(h->common.sender_id), offset, qdev->pacer_qix);
 	homa_qdisc_defer_homa(qdev, skb);
 	return NET_XMIT_SUCCESS;
 
@@ -287,9 +288,9 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	if (is_homa_pkt(skb)) {
 		if (h->common.type == DATA) {
 			h = (struct homa_data_hdr *)skb_transport_header(skb);
-			tt_record4("homa_qdisc_enqueue queuing homa data packet for id %d, offset %d, bytes_left %d on qid %d",
+			tt_record3("homa_qdisc_enqueue queuing homa data packet for id %d, offset %d on qid %d",
 				   be64_to_cpu(h->common.sender_id), offset,
-				   homa_get_skb_info(skb)->bytes_left, q->ix);
+				   q->ix);
 		}
 	} else {
 		tt_record2("homa_qdisc_enqueue queuing non-homa packet, qix %d, pacer_qix %d",
@@ -320,104 +321,122 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 void homa_qdisc_defer_homa(struct homa_qdisc_dev *qdev, struct sk_buff *skb)
 {
 	struct homa_skb_info *info = homa_get_skb_info(skb);
+	struct homa_rpc *rpc = info->rpc;
+	struct rb_node *prev_deferred;
 	u64 now = homa_clock();
-	struct sk_buff *other;
 	unsigned long flags;
 
-	/* Tricky point: only one packet from an RPC may appear in
-	 * qdev->homa_deferred at once (the earliest one in the message).
-	 * If later packets from the same message were also in the queue,
-	 * they would have higher priorities and would get transmitted
-	 * first, which we don't want. So, if more than one packet from
-	 * a message is waiting, only the first appears in qdev->homa_deferred;
-	 * the others are queued up using links in the homa_skb_info of
-	 * the first packet.
-	 *
-	 * This also means that we must scan the list starting at the
-	 * low-priority end, so we'll notice if there is an earlier
-	 * (lower priority) packet for the same RPC already in the list.
-	 */
+	spin_lock_irqsave(&qdev->defer_lock, flags);
+	prev_deferred = rb_first_cached(&qdev->deferred_rpcs);
+	__skb_queue_tail(&rpc->qrpc.packets, skb);
+	if (skb_queue_len(&rpc->qrpc.packets) == 1) {
+		int bytes_left;
 
-	info->next_sibling = NULL;
-	info->last_sibling = NULL;
-	spin_lock_irqsave(&qdev->homa_deferred.lock, flags);
-	if (skb_queue_empty(&qdev->homa_deferred)) {
-		__skb_queue_head(&qdev->homa_deferred, skb);
-		wake_up(&qdev->pacer_sleep);
-		goto done;
+		bytes_left = rpc->msgout.length - info->offset;
+		if (bytes_left < rpc->qrpc.tx_left)
+			rpc->qrpc.tx_left = bytes_left;
+		homa_qdisc_insert_rb(qdev, rpc);
 	}
-	INC_METRIC(throttled_cycles, now - qdev->last_defer);
-	skb_queue_reverse_walk(&qdev->homa_deferred, other) {
-		struct homa_skb_info *other_info = homa_get_skb_info(other);
-
-		if (other_info->rpc == info->rpc) {
-			if (!other_info->last_sibling)
-				other_info->next_sibling = skb;
-			else
-				homa_get_skb_info(other_info->last_sibling)->
-						next_sibling = skb;
-			other_info->last_sibling = skb;
-			break;
-		}
+	if (prev_deferred)
+		INC_METRIC(throttled_cycles, now - qdev->last_defer);
+	else
+		wake_up(&qdev->pacer_sleep);
+	qdev->last_defer = now;
+	spin_unlock_irqrestore(&qdev->defer_lock, flags);
+}
 
-		if (other_info->bytes_left <= info->bytes_left) {
-			__skb_queue_after(&qdev->homa_deferred, other, skb);
-			break;
-		}
+/**
+ * homa_qdisc_insert_rb() - Insert an RPC into the deferred_rpcs red-black
+ * tree.
+ * @qdev: Network device for the RPC.
+ * @rpc:  RPC to insert.
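+ *
+ * The caller must hold @qdev->defer_lock. RPCs are ordered first by
+ * qrpc.tx_left (SRPT), then by msgout.init_time (oldest first), with
+ * the RPC address as a final deterministic tiebreaker.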
+ */
+void homa_qdisc_insert_rb(struct homa_qdisc_dev *qdev, struct homa_rpc *rpc)
+{
+	struct rb_node **new = &(qdev->deferred_rpcs.rb_root.rb_node);
+	struct rb_node *parent = NULL;
+	bool leftmost = true;
 
-		if (skb_queue_is_first(&qdev->homa_deferred, other)) {
-			__skb_queue_head(&qdev->homa_deferred, skb);
-			break;
+	while (*new) {
+		struct homa_qdisc_rpc *qrpc;
+		struct homa_rpc *rpc2;
+
+		qrpc = container_of(*new, struct homa_qdisc_rpc, rb_node);
+		rpc2 = container_of(qrpc, struct homa_rpc, qrpc);
+
+		parent = *new;
+		/* To sort RPCs, first use bytes left to transmit; settle
+		 * ties in favor of oldest RPC. If still tied (highly unlikely),
+		 * use RPC address to provide deterministic ordering.
+		 */
+		if (rpc->qrpc.tx_left < rpc2->qrpc.tx_left) {
+			new = &((*new)->rb_left);
+		} else if (rpc->qrpc.tx_left > rpc2->qrpc.tx_left) {
+			new = &((*new)->rb_right);
+			leftmost = false;
+		} else if (rpc->msgout.init_time < rpc2->msgout.init_time) {
+			new = &((*new)->rb_left);
+		} else if (rpc->msgout.init_time > rpc2->msgout.init_time) {
+			new = &((*new)->rb_right);
+			leftmost = false;
+		} else if (rpc < rpc2) {
+			new = &((*new)->rb_left);
+		} else {
+			new = &((*new)->rb_right);
+			leftmost = false;
 		}
 	}
 
-done:
-	qdev->last_defer = now;
-	spin_unlock_irqrestore(&qdev->homa_deferred.lock, flags);
+	/* Add new node and rebalance tree. */
+	rb_link_node(&rpc->qrpc.rb_node, parent, new);
+	rb_insert_color_cached(&rpc->qrpc.rb_node, &qdev->deferred_rpcs,
+			       leftmost);
 }
 
 /**
- * homa_qdisc_dequeue_homa() - Remove the frontmost packet from the list
- * of deferred Homa packets for a qdev.
- * @qdev:  The homa_deferred element is the list from which a packet
- *         will be dequeued.
- * Return: The frontmost packet from the list, or NULL if the list was empty.
+ * homa_qdisc_dequeue_homa() - Return the highest-priority deferred Homa packet
+ * and dequeue it from the structures that manage deferred packets.
+ * @qdev:  Info about deferred packets is stored here.
+ * Return: The next packet to transmit, or NULL if there are no deferred
+ *         Homa packets.
  */
 struct sk_buff *homa_qdisc_dequeue_homa(struct homa_qdisc_dev *qdev)
 {
-	struct homa_skb_info *sibling_info;
-	struct sk_buff *skb, *sibling;
+	struct homa_qdisc_rpc *qrpc;
 	struct homa_skb_info *info;
+	struct homa_rpc *rpc;
+	struct rb_node *node;
+	struct sk_buff *skb;
 	unsigned long flags;
+	int bytes_left;
 
-	/* The only tricky element about this function is that skb may
-	 * have a sibling list. If so, we need to enqueue the next
-	 * sibling.
-	 */
-	spin_lock_irqsave(&qdev->homa_deferred.lock, flags);
-	if (skb_queue_empty(&qdev->homa_deferred)) {
-		spin_unlock_irqrestore(&qdev->homa_deferred.lock, flags);
+	spin_lock_irqsave(&qdev->defer_lock, flags);
+	node = rb_first_cached(&qdev->deferred_rpcs);
+	if (!node) {
+		spin_unlock_irqrestore(&qdev->defer_lock, flags);
 		return NULL;
 	}
-	skb = qdev->homa_deferred.next;
-	__skb_unlink(skb, &qdev->homa_deferred);
-	info = homa_get_skb_info(skb);
-	if (info->next_sibling) {
-		/* This is a "compound" packet, containing multiple
-		 * packets from the same RPC. Put the next packet
-		 * back on the list at the front (it should have even
-		 * higher priority than skb, since it is later in the
-		 * message.
-		 */
-		sibling = info->next_sibling;
-		sibling_info = homa_get_skb_info(sibling);
-		sibling_info->last_sibling = info->last_sibling;
-		__skb_queue_head(&qdev->homa_deferred, sibling);
+	qrpc = container_of(node, struct homa_qdisc_rpc, rb_node);
+	skb = skb_dequeue(&qrpc->packets);
+	if (skb_queue_len(&qrpc->packets) == 0)
+		rb_erase_cached(node, &qdev->deferred_rpcs);
-	}
-	if (skb_queue_empty(&qdev->homa_deferred))
+
+	/* Update qrpc->tx_left. This can change the priority of the RPC
+	 * in qdev->deferred_rpcs, but the RPC was already the highest-
+	 * priority one and its priority only gets higher, so its position
+	 * in the rbtree won't change (thus we don't need to remove and
+	 * reinsert it).
+	 */
+	rpc = container_of(qrpc, struct homa_rpc, qrpc);
+	info = homa_get_skb_info(skb);
+	bytes_left = rpc->msgout.length - (info->offset + info->data_bytes);
+	if (bytes_left < qrpc->tx_left)
+		qrpc->tx_left = bytes_left;
+
+	if (!rb_first_cached(&qdev->deferred_rpcs))
 		INC_METRIC(throttled_cycles, homa_clock() - qdev->last_defer);
-	spin_unlock_irqrestore(&qdev->homa_deferred.lock, flags);
+	spin_unlock_irqrestore(&qdev->defer_lock, flags);
 	return skb;
 }
 
@@ -491,7 +510,7 @@ int homa_qdisc_update_link_idle(struct homa_qdisc_dev *qdev, int bytes,
 			break;
 		INC_METRIC(idle_time_conflicts, 1);
 	}
-	if (!skb_queue_empty(&qdev->homa_deferred))
+	if (rb_first_cached(&qdev->deferred_rpcs))
 		INC_METRIC(pacer_bytes, bytes);
 	return 1;
 }
@@ -518,8 +537,7 @@ int homa_qdisc_pacer_main(void *device)
 		qdev->pacer_wake_time = 0;
 		INC_METRIC(pacer_cycles, homa_clock() - start);
 
-		if (!skb_queue_empty(&qdev->homa_deferred) ||
-		    !skb_queue_empty(&qdev->tcp_deferred)) {
+		if (homa_qdisc_any_deferred(qdev)) {
 			/* There are more packets to transmit (the NIC queue
 			 * must be full); call the pacer again, but first
 			 * give other threads a chance to run (otherwise
@@ -532,9 +550,7 @@ int homa_qdisc_pacer_main(void *device)
 
 		tt_record("homa_qdisc pacer sleeping");
 		status = wait_event_interruptible(qdev->pacer_sleep,
-				kthread_should_stop() ||
-				!skb_queue_empty(&qdev->homa_deferred) ||
-				!skb_queue_empty(&qdev->tcp_deferred));
+				kthread_should_stop() || homa_qdisc_any_deferred(qdev));
 		tt_record1("homa_qdisc pacer woke up with status %d", status);
 		if (status != 0 && status != -ERESTARTSYS)
 			break;
@@ -595,7 +611,8 @@ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev)
 
 		/* Note: when we get here, it's possible that the NIC queue is
 		 * still too long because other threads have queued packets,
-		 * but we transmit anyway so the pacer thread doesn't starve.
+		 * but we transmit anyway (don't want this thread to get
+		 * starved by others).
 		 */
 		skb = homa_qdisc_dequeue_homa(qdev);
 		if (!skb)
@@ -604,10 +621,9 @@ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev)
 		homa_qdisc_update_link_idle(qdev, qdisc_skb_cb(skb)->pkt_len,
 					    -1);
 		h = (struct homa_data_hdr *)skb_transport_header(skb);
-		tt_record4("homa_qdisc_pacer queuing homa data packet for id %d, offset %d, bytes_left %d on qid %d",
+		tt_record3("homa_qdisc_pacer queuing homa data packet for id %d, offset %d on qid %d",
 			   be64_to_cpu(h->common.sender_id),
-			   ntohl(h->seg.offset),
-			   homa_get_skb_info(skb)->bytes_left, qdev->pacer_qix);
+			   ntohl(h->seg.offset), qdev->pacer_qix);
 		homa_qdisc_redirect_skb(skb, qdev, true);
 	}
 done:
diff --git a/homa_qdisc.h b/homa_qdisc.h
index b79f1ea5..2bc797f1 100644
--- a/homa_qdisc.h
+++ b/homa_qdisc.h
@@ -13,6 +13,8 @@
 #pragma GCC diagnostic pop
 #endif /* __UNIT_TEST__*/
 
+#include
+
 #ifndef _HOMA_QDISC_H
 #define _HOMA_QDISC_H
 
@@ -105,10 +107,10 @@ struct homa_qdisc_dev {
 	struct list_head links;
 
 	/**
-	 * @homa_deferred: Homa packets whose transmission was deferred
-	 * because the NIC queue was too long. The queue is in SRPT order.
+	 * @deferred_rpcs: Contains all homa_rpc's with deferred packets, in
+	 * SRPT order.
 	 */
-	struct sk_buff_head homa_deferred;
+	struct rb_root_cached deferred_rpcs;
 
 	/**
 	 * @tcp_deferred: TCP packets whose transmission was deferred
@@ -123,6 +125,12 @@ struct homa_qdisc_dev {
 	 */
 	u64 last_defer;
 
+	/**
+	 * @defer_lock: Synchronizes access to information about deferred
+	 * packets, including deferred_rpcs, tcp_deferred, and last_defer.
+	 */
+	spinlock_t defer_lock;
+
 	/**
	 * @pacer_wake_time: homa_clock() time when the pacer woke up (if
	 * the pacer is running) or 0 if the pacer is sleeping.
@@ -149,6 +157,34 @@ struct homa_qdisc_dev {
 	spinlock_t pacer_mutex __aligned(L1_CACHE_BYTES);
 };
 
+/**
+ * struct homa_qdisc_rpc - One of these structs exists in each homa_rpc, with
+ * information needed by homa_qdisc.
+ */
+struct homa_qdisc_rpc {
+	/**
+	 * @packets: List of tx skbs from this RPC that have been deferred
+	 * by homa_qdisc. Non-empty means this RPC is currently linked into
+	 * homa_qdisc_dev->deferred_rpcs.
+	 */
+	struct sk_buff_head packets;
+
+	/**
+	 * @rb_node: Used to link this struct into
+	 * homa_qdisc_dev->deferred_rpcs.
+	 */
+	struct rb_node rb_node;
+
+	/**
+	 * @tx_left: The number of (trailing) bytes of the tx message
+	 * that have not been transmitted by homa_qdisc yet. Only updated
+	 * when packets are added to or removed from the deferred list;
+	 * may be out of date (too high) if packets have been transmitted
+	 * without being deferred.
+	 */
+	int tx_left;
+};
+
 void homa_qdisc_defer_homa(struct homa_qdisc_dev *qdev, struct sk_buff *skb);
 struct sk_buff *
@@ -159,6 +195,8 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 void homa_qdisc_free_homa(struct homa_qdisc_dev *qdev);
 int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt,
 		    struct netlink_ext_ack *extack);
+void homa_qdisc_insert_rb(struct homa_qdisc_dev *qdev,
+			  struct homa_rpc *rpc);
 int homa_qdisc_pacer_main(void *device);
 struct homa_qdisc_dev *
 homa_qdisc_qdev_get(struct homa_net *hnet,
@@ -187,4 +225,26 @@ static inline bool homa_qdisc_active(struct homa_net *hnet)
 	return !list_empty(&hnet->qdisc_devs);
 }
 
+/**
+ * homa_qdisc_rpc_init() - Initialize a homa_qdisc_rpc struct.
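+ * Invoked when the enclosing homa_rpc is created; tx_left starts at
+ * HOMA_MAX_MESSAGE_LENGTH so the first deferred packet is guaranteed
+ * to lower it.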
+ * @qrpc: Struct to initialize.
+ */
+static inline void homa_qdisc_rpc_init(struct homa_qdisc_rpc *qrpc)
+{
+	skb_queue_head_init(&qrpc->packets);
+	qrpc->tx_left = HOMA_MAX_MESSAGE_LENGTH;
+}
+
+/**
+ * homa_qdisc_any_deferred() - Returns true if there are currently any
+ * deferred packets in a homa_qdisc_dev, false if there are none.
+ * @qdev:  Holds info about deferred packets.
+ * Return: See above.
+ */
+static inline bool homa_qdisc_any_deferred(struct homa_qdisc_dev *qdev)
+{
+	return rb_first_cached(&qdev->deferred_rpcs) ||
+	       !skb_queue_empty(&qdev->tcp_deferred);
+}
+
 #endif /* _HOMA_QDISC_H */
diff --git a/homa_rpc.c b/homa_rpc.c
index 64c16b8f..9d401b2f 100644
--- a/homa_rpc.c
+++ b/homa_rpc.c
@@ -56,6 +56,7 @@ struct homa_rpc *homa_rpc_alloc_client(struct homa_sock *hsk,
 	crpc->dport = ntohs(dest->in6.sin6_port);
 	crpc->msgin.length = -1;
 	crpc->msgout.length = -1;
+	homa_qdisc_rpc_init(&crpc->qrpc);
 	INIT_LIST_HEAD(&crpc->ready_links);
 	INIT_LIST_HEAD(&crpc->buf_links);
 	INIT_LIST_HEAD(&crpc->dead_links);
@@ -161,6 +162,7 @@ struct homa_rpc *homa_rpc_alloc_server(struct homa_sock *hsk,
 	srpc->id = id;
 	srpc->msgin.length = -1;
 	srpc->msgout.length = -1;
+	homa_qdisc_rpc_init(&srpc->qrpc);
 	INIT_LIST_HEAD(&srpc->ready_links);
 	INIT_LIST_HEAD(&srpc->buf_links);
 	INIT_LIST_HEAD(&srpc->dead_links);
diff --git a/homa_rpc.h b/homa_rpc.h
index 557df449..afb57d5f 100644
--- a/homa_rpc.h
+++ b/homa_rpc.h
@@ -12,6 +12,10 @@
 #include "homa_sock.h"
 #include "homa_wire.h"
 
+#ifndef __STRIP__ /* See strip.py */
+#include "homa_qdisc.h"
+#endif /* See strip.py */
+
 /* Forward references. */
 struct homa_ack;
 
@@ -332,6 +336,11 @@ struct homa_rpc {
 	 */
 	struct homa_message_out msgout;
 
+#ifndef __STRIP__ /* See strip.py */
+	/** @qrpc: Information managed by homa_qdisc for this RPC. */
+	struct homa_qdisc_rpc qrpc;
+#endif /* See strip.py */
+
 	/**
 	 * @hash_links: Used to link this object into a hash bucket for
 	 * either @hsk->client_rpc_buckets (for a client RPC), or
diff --git a/test/Makefile b/test/Makefile
index c586f06f..eaef3ef1 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -81,7 +81,7 @@ HOMA_SRCS += homa_grant.c \
 	     homa_qdisc.c \
 	     homa_skb.c
 endif
-HOMA_OBJS := $(patsubst %.c,%.o,$(HOMA_SRCS)) rhashtable.o
+HOMA_OBJS := $(patsubst %.c,%.o,$(HOMA_SRCS)) rbtree.o rhashtable.o
 
 OTHER_SRCS := ccutils.cc \
 	      main.c \
diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c
index fde3ee7b..fd2d6238 100644
--- a/test/unit_homa_qdisc.c
+++ b/test/unit_homa_qdisc.c
@@ -3,6 +3,7 @@
 #include "homa_impl.h"
 #include "homa_pacer.h"
 #include "homa_qdisc.h"
+#include "homa_rpc.h"
 #define KSELFTEST_NOT_MAIN 1
 #include "kselftest_harness.h"
 #include "ccutils.h"
@@ -13,53 +14,56 @@
 
 /**
  * new_test_skb() - Create a new skb for use in testing qdisc stuff.
- * The skb will have a small data area plus homa_skb_info and
- * @rpc_name: Store this as the rpc field in homa_skb_info. This string
- * will be included in messages generated about the skb.
- * @bytes_left: Store this as the @bytes_left field in homa_skb_info.
+ * The skb will have a small data area plus homa_skb_info.
+ * @rpc:    RPC that the packet belongs to (stored in the homa_skb_info
+ *          for the packet).
+ * @saddr:  Source address for packet.
+ * @offset: Offset of packet data within output message.
+ * @length: Number of bytes of message data in packet; also used as
+ *          qdisc_skb_cb(skb)->pkt_len.
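+ * Return: The new skb; the caller is responsible for freeing it.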
  */
-static struct sk_buff *new_test_skb(char *rpc_name, int bytes_left)
+static struct sk_buff *new_test_skb(struct homa_rpc *rpc,
+				    struct in6_addr *saddr, int offset,
+				    int length)
 {
 	struct homa_skb_info *info;
+	struct homa_data_hdr data;
 	struct sk_buff *skb;
 
-	skb = alloc_skb(100 + sizeof(struct homa_skb_info), GFP_ATOMIC);
+	data.common = (struct homa_common_hdr){
+		.sport = htons(rpc->hsk->port),
+		.dport = htons(rpc->dport),
+		.type = DATA,
+		.sender_id = cpu_to_be64(rpc->id)
+	};
+	data.message_length = htonl(rpc->msgout.length);
+	data.seg.offset = htonl(offset);
+	skb = mock_skb_alloc(saddr, &data.common,
+			     length + sizeof(struct homa_skb_info), 0);
 	info = homa_get_skb_info(skb);
-	info->rpc = rpc_name;
-	info->bytes_left = bytes_left;
+	info->rpc = rpc;
+	info->data_bytes = length;
+	info->offset = offset;
+	qdisc_skb_cb(skb)->pkt_len = length;
 	return skb;
 }
 
-/**
- * log_skb_list() - Print info to the unit test log describing a list of
- * skb's (including sibling sub-lists)a.
- * @list: List to print out.
- */
-void log_skb_list(struct sk_buff_head *list)
+/**
+ * log_deferred() - Print info to the unit test log describing the deferred
+ * RPCs (and their packet offsets) for a homa_qdisc_dev.
+ * @qdev: Device whose deferred_rpcs tree will be logged.
+ */
+void log_deferred(struct homa_qdisc_dev *qdev)
 {
 	struct homa_skb_info *info;
+	struct rb_node *node;
+	struct homa_rpc *rpc;
 	struct sk_buff *skb;
 
-	skb_queue_walk(list, skb) {
-		info = homa_get_skb_info(skb);
-		unit_log_printf("; ", "%s:%d", (char *)info->rpc,
-				info->bytes_left);
-		if (info->next_sibling) {
-			struct sk_buff *sibling = info->next_sibling;
-			char *separator = " [";
-
-			while (sibling) {
-				struct homa_skb_info *sibling_info =
-						homa_get_skb_info(sibling);
-
-				unit_log_printf(separator, "%s:%d",
-						(char *)sibling_info->rpc,
-						sibling_info->bytes_left);
-				separator = " ";
-				sibling = sibling_info->next_sibling;
-			}
-			unit_log_printf("", "]");
+	for (node = rb_first_cached(&qdev->deferred_rpcs); node;
+	     node = rb_next(node)) {
+		rpc = container_of(node, struct homa_rpc, qrpc.rb_node);
+		unit_log_printf("; ", "[id %llu, offsets", rpc->id);
+		skb_queue_walk(&rpc->qrpc.packets, skb) {
+			info = homa_get_skb_info(skb);
+			unit_log_printf(" ", "%d", info->offset);
 		}
+		unit_log_printf("", "]");
 	}
 }
 
@@ -84,6 +88,13 @@ FIXTURE(homa_qdisc) {
 	struct netdev_queue txqs[NUM_TXQS];
 	struct Qdisc *qdiscs[NUM_TXQS];
 	struct ethtool_ops ethtool_ops;
+	struct in6_addr client_ip;
+	struct in6_addr server_ip;
+	int client_port;
+	int server_port;
+	u64 client_id;
+	u64 server_id;
+	struct homa_sock hsk;
 	struct homa_data_hdr data;
 };
 FIXTURE_SETUP(homa_qdisc)
@@ -112,6 +123,14 @@ FIXTURE_SETUP(homa_qdisc)
 	}
 	mock_net_queue.dev = &mock_net_device;
 
+	self->client_ip = unit_get_in_addr("196.168.0.1");
+	self->server_ip = unit_get_in_addr("1.2.3.4");
+	self->client_port = 40000;
+	self->server_port = 99;
+	self->client_id = 1234;
+	self->server_id = 1235;
+	mock_sock_init(&self->hsk, self->hnet, self->client_port);
+
 	self->data.common = (struct homa_common_hdr){
 		.sport = htons(1000),
 		.dport = htons(2000),
@@ -307,14 +326,19 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__packet_not_homa)
 {
 	struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue);
 	struct sk_buff *skb, *to_free;
+	struct homa_rpc *srpc;
 	struct homa_qdisc *q;
 
+	srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+			       &self->server_ip, self->client_port,
+			       self->server_id, 100, 7100);
+	ASSERT_NE(NULL, srpc);
+
 	EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL));
 	q = qdisc_priv(qdisc);
 	atomic64_set(&q->qdev->link_idle_time, 1000000);
 	q->ix = 3;
-	skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0);
-	qdisc_skb_cb(skb)->pkt_len = 1500;
+	skb = new_test_skb(srpc, &self->addr, 0, 1500);
 	if (skb_is_ipv6(skb))
 		ipv6_hdr(skb)->nexthdr = IPPROTO_TCP;
 	else
@@ -324,7 +348,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__packet_not_homa)
 
 	homa_qdisc_enqueue(skb, qdisc, &to_free);
 	EXPECT_EQ(NULL, to_free);
-	EXPECT_EQ(0, q->qdev->homa_deferred.qlen);
+	EXPECT_FALSE(homa_qdisc_any_deferred(q->qdev));
 	EXPECT_EQ(1, qdisc->q.qlen);
 	EXPECT_STREQ("", unit_log_get());
 	EXPECT_LT(1000000, atomic64_read(&q->qdev->link_idle_time));
@@ -336,21 +360,25 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__short_message)
 {
 	struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue);
 	struct sk_buff *skb, *to_free;
+	struct homa_rpc *srpc;
 	struct homa_qdisc *q;
 
+	srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+			       &self->server_ip, self->client_port,
+			       self->server_id, 100, 200);
+	ASSERT_NE(NULL, srpc);
+
 	EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL));
 	q = qdisc_priv(qdisc);
 	atomic64_set(&q->qdev->link_idle_time, 1000000);
 	q->ix = 3;
-	self->data.message_length = htonl(100);
-	skb = mock_skb_alloc(&self->addr, &self->data.common, 100, 0);
-	qdisc_skb_cb(skb)->pkt_len = 100;
+	skb = new_test_skb(srpc, &self->addr, 0, 200);
 	to_free = NULL;
 
 	unit_log_clear();
 	EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, qdisc, &to_free));
 	EXPECT_EQ(NULL, to_free);
-	EXPECT_EQ(0, q->qdev->homa_deferred.qlen);
+	EXPECT_FALSE(homa_qdisc_any_deferred(q->qdev));
 	EXPECT_EQ(1, qdisc->q.qlen);
 	EXPECT_STREQ("", unit_log_get());
 	EXPECT_LT(1000000, atomic64_read(&q->qdev->link_idle_time));
@@ -362,22 +390,27 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__short_final_packet_in_long_message)
 {
 	struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue);
 	struct sk_buff *skb, *to_free;
+	struct homa_rpc *srpc;
 	struct homa_qdisc *q;
 
+	srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+			       &self->server_ip, self->client_port,
+			       self->server_id, 100, 7100);
+	ASSERT_NE(NULL, srpc);
+
 	EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL));
 	q = qdisc_priv(qdisc);
 	atomic64_set(&q->qdev->link_idle_time, 1000000);
 	q->ix = 3;
 	self->data.message_length = htonl(3000);
 	self->data.seg.offset = htonl(2800);
-	skb = mock_skb_alloc(&self->addr, &self->data.common, 200, 0);
-	qdisc_skb_cb(skb)->pkt_len = 100;
+	skb = new_test_skb(srpc, &self->addr, 7000, 100);
 	to_free = NULL;
 
 	unit_log_clear();
 	EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, qdisc, &to_free));
 	EXPECT_EQ(NULL, to_free);
-	EXPECT_EQ(1, q->qdev->homa_deferred.qlen);
+	EXPECT_TRUE(homa_qdisc_any_deferred(q->qdev));
 	EXPECT_EQ(0, qdisc->q.qlen);
 
 	homa_qdisc_destroy(qdisc);
@@ -387,37 +420,43 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_homa_packet)
 {
 	struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue);
 	struct sk_buff *skb, *to_free;
+	struct homa_rpc *srpc;
 	struct homa_qdisc *q;
 	u64 idle;
 
+	srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+			       &self->server_ip, self->client_port,
+			       self->server_id, 100, 7100);
+	ASSERT_NE(NULL, srpc);
+
 	/* First packet is deferred because the NIC queue is full.
 	 */
 	EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL));
 	q = qdisc_priv(qdisc);
 	idle = mock_clock + 1 + self->homa.pacer->max_nic_queue_cycles + 1;
 	atomic64_set(&q->qdev->link_idle_time, idle);
-	skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0);
-	qdisc_skb_cb(skb)->pkt_len = 1500;
+	skb = new_test_skb(srpc, &self->addr, 0, 1500);
 	to_free = NULL;
 
+	unit_log_clear();
+	mock_log_wakeups = 1;
 	EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, qdisc, &to_free));
 	EXPECT_EQ(NULL, to_free);
-	EXPECT_EQ(1, q->qdev->homa_deferred.qlen);
-	EXPECT_STREQ("wake_up_process pid 0", unit_log_get());
+	EXPECT_TRUE(homa_qdisc_any_deferred(q->qdev));
+	EXPECT_STREQ("wake_up", unit_log_get());
 
 	/* Second packet is deferred even though NIC not busy, because
 	 * there are other packets waiting.
 	 */
 	atomic64_set(&q->qdev->link_idle_time, 0);
 	self->data.common.sender_id = cpu_to_be64(101);
-	skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0);
-	qdisc_skb_cb(skb)->pkt_len = 1500;
+	skb = new_test_skb(srpc, &self->addr, 1500, 1500);
 	to_free = NULL;
 	unit_log_clear();
 	EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, qdisc, &to_free));
 	EXPECT_EQ(NULL, to_free);
-	EXPECT_EQ(1, q->qdev->homa_deferred.qlen);
-	EXPECT_STREQ("", unit_log_get());
+	log_deferred(q->qdev);
+	EXPECT_STREQ("[id 1235, offsets 0 1500]", unit_log_get());
 
 	homa_qdisc_destroy(qdisc);
 	kfree(qdisc);
@@ -426,12 +465,18 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__drop_packet_queue_over_limit)
 {
 	struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue);
 	struct sk_buff *skb, *to_free;
+	struct homa_rpc *srpc;
 	struct homa_qdisc *q;
 
+	srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+			       &self->server_ip, self->client_port,
+			       self->server_id, 100, 7100);
+	ASSERT_NE(NULL, srpc);
+
 	EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL));
 	q = qdisc_priv(qdisc);
 	q->ix = 3;
-	skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0);
+	skb = new_test_skb(srpc, &self->addr, 0, 1500);
 	qdisc->limit = 1;
 	qdisc->q.qlen = 5;
 	to_free = NULL;
@@ -439,7 +484,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__drop_packet_queue_over_limit)
 
 	EXPECT_EQ(NET_XMIT_DROP, homa_qdisc_enqueue(skb, qdisc, &to_free));
 	ASSERT_NE(NULL, to_free);
-	EXPECT_EQ(0, q->qdev->homa_deferred.qlen);
+	EXPECT_FALSE(homa_qdisc_any_deferred(q->qdev));
 	EXPECT_EQ(5, qdisc->q.qlen);
 
 	kfree_skb(to_free);
@@ -449,14 +494,20 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__drop_packet_queue_over_limit)
 TEST_F(homa_qdisc, homa_qdisc_enqueue__use_special_queue)
 {
 	struct sk_buff *skb, *to_free;
+	struct homa_rpc *srpc;
 	struct homa_qdisc *q;
 
+	srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+			       &self->server_ip, self->client_port,
+			       self->server_id, 100, 10000);
+	ASSERT_NE(NULL, srpc);
+
 	EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[1], NULL, NULL));
 	EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL));
 	q = qdisc_priv(self->qdiscs[1]);
 	q->qdev->pacer_qix = 1;
 	q->qdev->redirect_qix = 3;
-	skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0);
+	skb = new_test_skb(srpc, &self->addr, 0, 1500);
 
 	unit_log_clear();
 	spin_lock(qdisc_lock(self->qdiscs[1]));
@@ -464,7 +515,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__use_special_queue)
 					      &to_free));
 	spin_unlock(qdisc_lock(self->qdiscs[1]));
 	ASSERT_NE(NULL, to_free);
-	EXPECT_EQ(0, q->qdev->homa_deferred.qlen);
+	EXPECT_FALSE(homa_qdisc_any_deferred(q->qdev));
 	EXPECT_EQ(0, self->qdiscs[1]->q.qlen);
 	EXPECT_EQ(1, self->qdiscs[3]->q.qlen);
 
@@ -474,158 +525,479 @@
 TEST_F(homa_qdisc, homa_qdisc_defer_homa__basics)
 {
+	struct homa_rpc *srpc1, *srpc2, *srpc3, *srpc4;
 	struct homa_qdisc_dev *qdev;
 
+	srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id, 10000, 10000);
+	srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id + 2, 10000, 10000);
+	srpc3 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id + 4, 10000, 10000);
+	srpc4 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id + 6, 10000, 10000);
+
 	qdev = homa_qdisc_qdev_get(self->hnet, &self->dev);
-	homa_qdisc_defer_homa(qdev, new_test_skb("msg1", 1000));
-	homa_qdisc_defer_homa(qdev, new_test_skb("msg2", 2000));
-	homa_qdisc_defer_homa(qdev, new_test_skb("msg3", 500));
-	homa_qdisc_defer_homa(qdev, new_test_skb("msg4", 1000));
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc1, &self->addr, 5000, 1500));
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc2, &self->addr, 4000, 1500));
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc3, &self->addr, 8000, 1500));
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc4, &self->addr, 5000, 1500));
+	unit_log_clear();
+	log_deferred(qdev);
+	EXPECT_STREQ("[id 1239, offsets 8000]; "
+		     "[id 1235, offsets 5000]; "
+		     "[id 1241, offsets 5000]; "
+		     "[id 1237, offsets 4000]", unit_log_get());
+	EXPECT_EQ(5000, srpc1->qrpc.tx_left);
+	EXPECT_EQ(6000, srpc2->qrpc.tx_left);
+	EXPECT_EQ(2000, srpc3->qrpc.tx_left);
+	EXPECT_EQ(5000, srpc4->qrpc.tx_left);
+	homa_qdisc_qdev_put(qdev);
+}
+TEST_F(homa_qdisc, homa_qdisc_defer_homa__multiple_pkts_for_rpc)
+{
+	struct homa_rpc *srpc1, *srpc2;
+	struct homa_qdisc_dev *qdev;
+
+	srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id, 10000, 10000);
+	srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id + 2, 10000, 10000);
+
+	qdev = homa_qdisc_qdev_get(self->hnet, &self->dev);
+
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc1, &self->addr, 1000, 1500));
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc2, &self->addr, 2000, 1500));
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc1, &self->addr, 6000, 1500));
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc1, &self->addr, 2500, 1500));
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc1, &self->addr, 4000, 1500));
 	unit_log_clear();
-	log_skb_list(&qdev->homa_deferred);
-	EXPECT_STREQ("msg3:500; msg1:1000; msg4:1000; msg2:2000", unit_log_get());
+	log_deferred(qdev);
+	EXPECT_STREQ("[id 1237, offsets 2000]; "
+		     "[id 1235, offsets 1000 6000 2500 4000]",
+		     unit_log_get());
+	homa_qdisc_qdev_put(qdev);
+}
+TEST_F(homa_qdisc, homa_qdisc_defer_homa__dont_update_tx_left)
+{
+	struct homa_qdisc_dev *qdev;
+	struct homa_rpc *srpc;
+
+	srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+			       &self->server_ip, self->client_port,
+			       self->server_id, 10000, 10000);
+	srpc->qrpc.tx_left = 2000;
+
+	qdev = homa_qdisc_qdev_get(self->hnet, &self->dev);
+
+	homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 5000, 500));
+	EXPECT_EQ(2000, srpc->qrpc.tx_left);
 	homa_qdisc_qdev_put(qdev);
 }
 TEST_F(homa_qdisc, homa_qdisc_defer_homa__throttled_cycles_metric)
 {
+	struct homa_rpc *srpc1, *srpc2;
 	struct homa_qdisc_dev *qdev;
 
+	srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id, 10000, 10000);
+	srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id + 2, 10000, 10000);
+
 	qdev = homa_qdisc_qdev_get(self->hnet, &self->dev);
 	mock_clock = 5000;
-	homa_qdisc_defer_homa(qdev, new_test_skb("msg1", 1000));
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc1, &self->addr, 1000, 1500));
 	EXPECT_EQ(5000, qdev->last_defer);
 	EXPECT_EQ(0, homa_metrics_per_cpu()->throttled_cycles);
 
 	mock_clock = 12000;
-	homa_qdisc_defer_homa(qdev, new_test_skb("msg2", 2000));
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc2, &self->addr, 2000, 1500));
 	EXPECT_EQ(12000, qdev->last_defer);
 	EXPECT_EQ(7000, homa_metrics_per_cpu()->throttled_cycles);
 	homa_qdisc_qdev_put(qdev);
 }
-TEST_F(homa_qdisc, homa_qdisc_defer_homa__multiple_pkts_for_rpc)
+TEST_F(homa_qdisc, homa_qdisc_defer_homa__wake_up_pacer)
+{
+	struct homa_qdisc_dev *qdev;
+	struct homa_rpc *srpc;
+	struct sk_buff *skb;
+
+	srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+			       &self->server_ip, self->client_port,
+			       self->server_id, 10000, 10000);
+
+	qdev = homa_qdisc_qdev_get(self->hnet, &self->dev);
+
+	skb = new_test_skb(srpc, &self->addr, 5000, 1500);
+	unit_log_clear();
+	mock_log_wakeups = 1;
+	homa_qdisc_defer_homa(qdev, skb);
+	EXPECT_STREQ("wake_up", unit_log_get());
+	unit_log_clear();
+	log_deferred(qdev);
+	EXPECT_STREQ("[id 1235, offsets 5000]", unit_log_get());
+	homa_qdisc_qdev_put(qdev);
+}
+
+TEST_F(homa_qdisc, homa_qdisc_insert_rb__bytes_left)
+{
+	struct homa_rpc *srpc1, *srpc2, *srpc3;
+	struct homa_qdisc_dev *qdev;
+
+	srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id, 10000, 10000);
+	srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id + 2, 10000, 10000);
+	srpc3 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id + 4, 10000, 10000);
+
+	qdev = homa_qdisc_qdev_get(self->hnet, &self->dev);
+
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc1, &self->addr, 5000, 1500));
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc2, &self->addr, 7000, 1500));
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc3, &self->addr, 3000, 1500));
+	unit_log_clear();
+	log_deferred(qdev);
+	EXPECT_STREQ("[id 1237, offsets 7000]; "
+		     "[id 1235, offsets 5000]; "
+		     "[id 1239, offsets 3000]",
+		     unit_log_get());
+	homa_qdisc_qdev_put(qdev);
+}
+TEST_F(homa_qdisc, homa_qdisc_insert_rb__init_time)
+{
+	struct homa_rpc *srpc1, *srpc2, *srpc3;
+	struct homa_qdisc_dev *qdev;
+
+	srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id, 10000, 10000);
+	srpc1->msgout.init_time = 1000;
+	srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id + 2, 10000, 10000);
+	srpc2->msgout.init_time = 500;
+	srpc3 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id + 4, 10000, 10000);
+	srpc3->msgout.init_time = 2000;
+
+	qdev = homa_qdisc_qdev_get(self->hnet, &self->dev);
+
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc1, &self->addr, 5000, 1500));
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc2, &self->addr, 5000, 1500));
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc3, &self->addr, 5000, 1500));
+	unit_log_clear();
+	log_deferred(qdev);
+	EXPECT_STREQ("[id 1237, offsets 5000]; "
+		     "[id 1235, offsets 5000]; "
+		     "[id 1239, offsets 5000]",
+		     unit_log_get());
+	homa_qdisc_qdev_put(qdev);
+}
+TEST_F(homa_qdisc, homa_qdisc_insert_rb__rpc_struct_address)
 {
+	struct homa_rpc *srpc1, *srpc2, *srpc3, *tmp;
 	struct homa_qdisc_dev *qdev;
 
+	srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id, 10000, 10000);
+	srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id + 2, 10000, 10000);
+	srpc3 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id + 4, 10000, 10000);
+	/* Swap RPCs if needed to ensure a particular ordering of addresses. */
+	if (srpc1 < srpc2) {
+		tmp = srpc1;
+		srpc1 = srpc2;
+		srpc2 = tmp;
+		srpc1->id = 1235;
+		srpc2->id = 1237;
+	}
+	if (srpc1 > srpc3) {
+		tmp = srpc1;
+		srpc1 = srpc3;
+		srpc3 = tmp;
+		srpc1->id = 1235;
+		srpc3->id = 1239;
+	}
+
 	qdev = homa_qdisc_qdev_get(self->hnet, &self->dev);
-	homa_qdisc_defer_homa(qdev, new_test_skb("msg1", 1000));
-	homa_qdisc_defer_homa(qdev, new_test_skb("msg2", 2000));
-	homa_qdisc_defer_homa(qdev, new_test_skb("msg1", 800));
-	homa_qdisc_defer_homa(qdev, new_test_skb("msg1", 600));
-	homa_qdisc_defer_homa(qdev, new_test_skb("msg1", 400));
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc1, &self->addr, 5000, 1500));
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc2, &self->addr, 5000, 1500));
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc3, &self->addr, 5000, 1500));
 	unit_log_clear();
-	log_skb_list(&qdev->homa_deferred);
-	EXPECT_STREQ("msg1:1000 [msg1:800 msg1:600 msg1:400]; msg2:2000",
+	log_deferred(qdev);
+	EXPECT_STREQ("[id 1237, offsets 5000]; "
+		     "[id 1235, offsets 5000]; "
+		     "[id 1239, offsets 5000]",
 		     unit_log_get());
 	homa_qdisc_qdev_put(qdev);
 }
+TEST_F(homa_qdisc, homa_qdisc_insert_rb__long_left_chain)
+{
+	struct homa_rpc *srpc1, *srpc2, *srpc3, *srpc4;
+	struct homa_qdisc_dev *qdev;
+
+	srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id, 10000, 10000);
+	srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id + 2, 10000, 10000);
+	srpc3 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id + 4, 10000, 10000);
+	srpc4 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id + 6, 10000, 10000);
 
-TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__list_empty)
+	qdev = homa_qdisc_qdev_get(self->hnet, &self->dev);
+
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc1, &self->addr, 5000, 1500));
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc2, &self->addr, 6000, 1500));
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc3, &self->addr, 7000, 1500));
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc4, &self->addr, 8000, 1500));
+	unit_log_clear();
+	log_deferred(qdev);
+	EXPECT_STREQ("[id 1241, offsets 8000]; "
+		     "[id 1239, offsets 7000]; "
+		     "[id 1237, offsets 6000]; "
+		     "[id 1235, offsets 5000]",
+		     unit_log_get());
+	homa_qdisc_qdev_put(qdev);
+}
+TEST_F(homa_qdisc, homa_qdisc_insert_rb__long_right_chain)
 {
+	struct homa_rpc *srpc1, *srpc2, *srpc3, *srpc4;
 	struct homa_qdisc_dev *qdev;
 
+	srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id, 10000, 10000);
+	srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id + 2, 10000, 10000);
+	srpc3 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id + 4, 10000, 10000);
+	srpc4 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id + 6, 10000, 10000);
+
 	qdev = homa_qdisc_qdev_get(self->hnet, &self->dev);
-	EXPECT_EQ(1, skb_queue_empty(&qdev->homa_deferred));
+
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc1, &self->addr, 5000, 1500));
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc2, &self->addr, 4000, 1500));
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc3, &self->addr, 3000, 1500));
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc4, &self->addr, 2000, 1500));
+	unit_log_clear();
+	log_deferred(qdev);
+	EXPECT_STREQ("[id 1235, offsets 5000]; "
+		     "[id 1237, offsets 4000]; "
+		     "[id 1239, offsets 3000]; "
+		     "[id 1241, offsets 2000]",
+		     unit_log_get());
+	homa_qdisc_qdev_put(qdev);
+}
+
+TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__no_deferred_rpcs)
+{
+	struct homa_qdisc_dev *qdev;
+
+	qdev = homa_qdisc_qdev_get(self->hnet, &self->dev);
+	EXPECT_FALSE(homa_qdisc_any_deferred(qdev));
 	EXPECT_EQ(NULL, homa_qdisc_dequeue_homa(qdev));
 	homa_qdisc_qdev_put(qdev);
 }
-TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__no_siblings)
+TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__multiple_packets_for_rpc)
 {
 	struct homa_qdisc_dev *qdev;
+	struct homa_rpc *srpc;
 	struct sk_buff *skb;
 
+	srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+			       &self->server_ip, self->client_port,
+			       self->server_id, 10000, 10000);
+	ASSERT_NE(NULL, srpc);
+
 	qdev = homa_qdisc_qdev_get(self->hnet, &self->dev);
-	skb = new_test_skb("msg1", 1000);
+	skb = new_test_skb(srpc, &self->addr, 2000, 500);
 	homa_qdisc_defer_homa(qdev, skb);
-	homa_qdisc_defer_homa(qdev, new_test_skb("msg2", 2000));
-	homa_qdisc_defer_homa(qdev, new_test_skb("msg3", 3000));
+	homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 3000, 500));
+	homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 4000, 500));
 	unit_log_clear();
-	log_skb_list(&qdev->homa_deferred);
-	EXPECT_STREQ("msg1:1000; msg2:2000; msg3:3000", unit_log_get());
+	log_deferred(qdev);
+	EXPECT_STREQ("[id 1235, offsets 2000 3000 4000]", unit_log_get());
 
 	EXPECT_EQ(skb, homa_qdisc_dequeue_homa(qdev));
 	unit_log_clear();
-	log_skb_list(&qdev->homa_deferred);
-	EXPECT_STREQ("msg2:2000; msg3:3000", unit_log_get());
+	log_deferred(qdev);
+	EXPECT_STREQ("[id 1235, offsets 3000 4000]", unit_log_get());
 	kfree_skb(skb);
 	homa_qdisc_qdev_put(qdev);
 }
-TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__siblings)
+TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__last_packet_for_rpc)
 {
+	struct homa_rpc *srpc1, *srpc2;
 	struct homa_qdisc_dev *qdev;
-	struct sk_buff *skb1, *skb2;
+	struct sk_buff *skb;
+
+	srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id, 10000, 10000);
+	ASSERT_NE(NULL, srpc1);
+	srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id + 2, 10000, 10000);
+	ASSERT_NE(NULL, srpc2);
 
 	qdev = homa_qdisc_qdev_get(self->hnet, &self->dev);
-	skb1 = new_test_skb("msg1", 1000);
-	homa_qdisc_defer_homa(qdev, skb1);
-	skb2 = new_test_skb("msg2", 2000);
-	homa_qdisc_defer_homa(qdev, skb2);
-	homa_qdisc_defer_homa(qdev, new_test_skb("msg3", 3000));
+	skb = new_test_skb(srpc1, &self->addr, 5000, 500);
+	homa_qdisc_defer_homa(qdev, skb);
+	homa_qdisc_defer_homa(qdev, new_test_skb(srpc2, &self->addr, 2000, 500));
+	homa_qdisc_defer_homa(qdev, new_test_skb(srpc2, &self->addr, 3000, 500));
 	unit_log_clear();
-	log_skb_list(&qdev->homa_deferred);
-	EXPECT_STREQ("msg1:1000; msg2:2000; msg3:3000", unit_log_get());
+	log_deferred(qdev);
+	EXPECT_STREQ("[id 1235, offsets 5000]; [id 1237, offsets 2000 3000]",
+		     unit_log_get());
 
-	EXPECT_EQ(skb1, homa_qdisc_dequeue_homa(qdev));
+	EXPECT_EQ(skb, homa_qdisc_dequeue_homa(qdev));
 	unit_log_clear();
-	log_skb_list(&qdev->homa_deferred);
-	EXPECT_STREQ("msg2:2000; msg3:3000", unit_log_get());
-	kfree_skb(skb1);
+	log_deferred(qdev);
+	EXPECT_STREQ("[id 1237, offsets 2000 3000]", unit_log_get());
+	kfree_skb(skb);
+	homa_qdisc_qdev_put(qdev);
+}
+TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__update_tx_left)
+{
+	struct homa_qdisc_dev *qdev;
+	struct homa_rpc *srpc;
+
+	srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+			       &self->server_ip, self->client_port,
+			       self->server_id, 10000, 10000);
+	ASSERT_NE(NULL, srpc);
+
+	qdev = homa_qdisc_qdev_get(self->hnet, &self->dev);
+
+	homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 3000, 500));
+	homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 4000, 500));
+	srpc->qrpc.tx_left = 6000;
+
+	/* First packet doesn't update tx_left. */
+	kfree_skb(homa_qdisc_dequeue_homa(qdev));
+	EXPECT_EQ(6000, srpc->qrpc.tx_left);
+
+	/* Second packet does update tx_left. */
+	kfree_skb(homa_qdisc_dequeue_homa(qdev));
+	EXPECT_EQ(5500, srpc->qrpc.tx_left);
 
-	EXPECT_EQ(skb2, homa_qdisc_dequeue_homa(qdev));
-	unit_log_clear();
-	log_skb_list(&qdev->homa_deferred);
-	EXPECT_STREQ("msg3:3000", unit_log_get());
-	kfree_skb(skb2);
 	homa_qdisc_qdev_put(qdev);
 }
 TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__throttled_cycles_metric)
 {
 	struct homa_qdisc_dev *qdev;
+	struct homa_rpc *srpc;
+
+	srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+			       &self->server_ip, self->client_port,
+			       self->server_id, 10000, 10000);
+	ASSERT_NE(NULL, srpc);
 
 	qdev = homa_qdisc_qdev_get(self->hnet, &self->dev);
 	mock_clock = 5000;
-	homa_qdisc_defer_homa(qdev, new_test_skb("msg2", 2000));
-	homa_qdisc_defer_homa(qdev, new_test_skb("msg3", 3000));
+	homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 2000, 500));
+	homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 3000, 500));
 	EXPECT_EQ(0, homa_metrics_per_cpu()->throttled_cycles);
 	EXPECT_EQ(5000, qdev->last_defer);
 
 	mock_clock = 12000;
 	kfree_skb(homa_qdisc_dequeue_homa(qdev));
 	EXPECT_EQ(0, homa_metrics_per_cpu()->throttled_cycles);
-	EXPECT_EQ(0, skb_queue_empty(&qdev->homa_deferred));
+	EXPECT_TRUE(homa_qdisc_any_deferred(qdev));
 
 	kfree_skb(homa_qdisc_dequeue_homa(qdev));
 	EXPECT_EQ(7000, homa_metrics_per_cpu()->throttled_cycles);
-	EXPECT_EQ(1, skb_queue_empty(&qdev->homa_deferred));
+	EXPECT_FALSE(homa_qdisc_any_deferred(qdev));
 	homa_qdisc_qdev_put(qdev);
 }
 TEST_F(homa_qdisc, homa_qdisc_free_homa)
 {
 	struct homa_qdisc_dev *qdev;
+	struct homa_rpc *srpc;
+
+	srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+			       &self->server_ip, self->client_port,
+			       self->server_id, 10000, 10000);
+	ASSERT_NE(NULL, srpc);
 
 	qdev = homa_qdisc_qdev_get(self->hnet, &self->dev);
-	homa_qdisc_defer_homa(qdev, new_test_skb("msg1", 500));
-	homa_qdisc_defer_homa(qdev, new_test_skb("msg2", 1000));
-	homa_qdisc_defer_homa(qdev, new_test_skb("msg2", 600));
-	homa_qdisc_defer_homa(qdev, new_test_skb("msg2", 400));
-	homa_qdisc_defer_homa(qdev, new_test_skb("msg3", 2000));
+	homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 1000, 500));
+	homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 2000, 500));
+	homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 3000, 500));
+	homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 4000, 500));
+	homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 5000, 500));
 	unit_log_clear();
-	log_skb_list(&qdev->homa_deferred);
-	EXPECT_STREQ("msg1:500; msg2:1000 [msg2:600 msg2:400]; msg3:2000",
+	log_deferred(qdev);
+	EXPECT_STREQ("[id 1235, offsets 1000 2000 3000 4000 5000]",
 		     unit_log_get());
 	homa_qdisc_free_homa(qdev);
 	unit_log_clear();
-	log_skb_list(&qdev->homa_deferred);
+	log_deferred(qdev);
 	EXPECT_STREQ("", unit_log_get());
 	homa_qdisc_qdev_put(qdev);
 }
@@ -718,6 +1090,12 @@ TEST_F(homa_qdisc, homa_qdisc_update_link_idle__cmpxchg_conflicts)
 TEST_F(homa_qdisc, homa_qdisc_update_link_idle__pacer_bytes_metric)
 {
 	struct homa_qdisc_dev *qdev;
+	struct homa_rpc *srpc;
+
+	srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+			       &self->server_ip, self->client_port,
+			       self->server_id, 10000, 10000);
+	ASSERT_NE(NULL, srpc);
 
 	qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device);
 	ASSERT_FALSE(IS_ERR(qdev));
@@ -728,8 +1106,7 @@ TEST_F(homa_qdisc, homa_qdisc_update_link_idle__pacer_bytes_metric)
 
 	/* Deferred packets. */
 	homa_qdisc_defer_homa(qdev,
-			      mock_skb_alloc(&self->addr, &self->data.common,
-					     1500, 0));
+			      new_test_skb(srpc, &self->addr, 0, 1500));
 	homa_qdisc_update_link_idle(qdev, 500, -1);
 	EXPECT_EQ(500, homa_metrics_per_cpu()->pacer_bytes);
 
@@ -767,47 +1144,59 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__queue_empty)
 	homa_qdisc_qdev_put(qdev);
 }
 
-TEST_F(homa_qdisc, homa_qdisc_pacer__enqueue_packet)
+TEST_F(homa_qdisc, homa_qdisc_pacer__pacer_lock_unavailable)
 {
 	struct homa_qdisc_dev *qdev;
 	u64 link_idle;
+	struct homa_rpc *srpc;
+
+	srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+			       &self->server_ip, self->client_port,
+			       self->server_id, 10000, 10000);
+	ASSERT_NE(NULL, srpc);
 
 	qdev = homa_qdisc_qdev_get(self->hnet, &self->dev);
 	link_idle = atomic64_read(&qdev->link_idle_time);
-	homa_qdisc_defer_homa(qdev, new_test_skb("msg1", 1000));
-	EXPECT_EQ(1, qdev->homa_deferred.qlen);
+	homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000));
+	EXPECT_TRUE(homa_qdisc_any_deferred(qdev));
 	EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL));
 	qdev->pacer_qix = 3;
 	EXPECT_EQ(0, self->qdiscs[3]->q.qlen);
 
 	unit_log_clear();
+	mock_trylock_errors = 1;
 	homa_qdisc_pacer(qdev);
-	EXPECT_EQ(0, qdev->homa_deferred.qlen);
-	EXPECT_EQ(1, self->qdiscs[3]->q.qlen);
-	EXPECT_LT(link_idle, atomic64_read(&qdev->link_idle_time));
+	EXPECT_TRUE(homa_qdisc_any_deferred(qdev));
+	EXPECT_EQ(0, self->qdiscs[3]->q.qlen);
+	EXPECT_EQ(link_idle, atomic64_read(&qdev->link_idle_time));
 
 	homa_qdisc_destroy(self->qdiscs[3]);
 	homa_qdisc_qdev_put(qdev);
 }
-TEST_F(homa_qdisc, homa_qdisc_pacer__pacer_lock_unavailable)
+TEST_F(homa_qdisc, homa_qdisc_pacer__enqueue_packet)
 {
 	struct homa_qdisc_dev *qdev;
 	u64 link_idle;
+	struct homa_rpc *srpc;
+
+	srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+			       &self->server_ip, self->client_port,
+			       self->server_id, 10000, 10000);
+	ASSERT_NE(NULL, srpc);
 
 	qdev = homa_qdisc_qdev_get(self->hnet, &self->dev);
 	link_idle = atomic64_read(&qdev->link_idle_time);
-	homa_qdisc_defer_homa(qdev, new_test_skb("msg1", 1000));
-	EXPECT_EQ(1, qdev->homa_deferred.qlen);
+	homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000));
+	EXPECT_TRUE(homa_qdisc_any_deferred(qdev));
 	EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL));
 	qdev->pacer_qix = 3;
 	EXPECT_EQ(0, self->qdiscs[3]->q.qlen);
 
 	unit_log_clear();
-	mock_trylock_errors = 1;
 	homa_qdisc_pacer(qdev);
-	EXPECT_EQ(1, qdev->homa_deferred.qlen);
-	EXPECT_EQ(0, self->qdiscs[3]->q.qlen);
-	EXPECT_EQ(link_idle, atomic64_read(&qdev->link_idle_time));
+	EXPECT_FALSE(homa_qdisc_any_deferred(qdev));
+	EXPECT_EQ(1, self->qdiscs[3]->q.qlen);
+	EXPECT_LT(link_idle, atomic64_read(&qdev->link_idle_time));
 
 	homa_qdisc_destroy(self->qdiscs[3]);
 	homa_qdisc_qdev_put(qdev);
@@ -815,12 +1204,18 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__pacer_lock_unavailable)
 TEST_F(homa_qdisc, homa_qdisc_pacer__spin_until_link_idle)
 {
 	struct homa_qdisc_dev *qdev;
+	struct homa_rpc *srpc;
+
+	srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+			       &self->server_ip, self->client_port,
+			       self->server_id, 10000, 10000);
+	ASSERT_NE(NULL, srpc);
 
 	qdev = homa_qdisc_qdev_get(self->hnet, &self->dev);
 	EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL));
 	qdev->pacer_qix = 3;
 	EXPECT_EQ(0, self->qdiscs[3]->q.qlen);
-	homa_qdisc_defer_homa(qdev, new_test_skb("msg1", 1000));
+	homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000));
 
 	mock_clock = 0;
 	mock_clock_tick = 1000;
@@ -829,7 +1224,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__spin_until_link_idle)
 
 	unit_log_clear();
 	homa_qdisc_pacer(qdev);
-	EXPECT_EQ(0, qdev->homa_deferred.qlen);
+	EXPECT_FALSE(homa_qdisc_any_deferred(qdev));
 	EXPECT_EQ(1, self->qdiscs[3]->q.qlen);
 
 	/* Packet will get transmitted when mock_clock ticks to 7000, but
@@ -844,28 +1239,40 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__spin_until_link_idle)
 }
 TEST_F(homa_qdisc, homa_qdisc_pacer__return_after_one_packet)
 {
+	struct homa_rpc *srpc1, *srpc2;
 	struct homa_qdisc_dev *qdev;
 	struct sk_buff *skb;
 
+	srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id, 10000, 10000);
+	ASSERT_NE(NULL, srpc1);
+	srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id + 2, 10000, 10000);
+	ASSERT_NE(NULL, srpc2);
+
 	qdev = homa_qdisc_qdev_get(self->hnet, &self->dev);
 	EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL));
 	qdev->pacer_qix = 3;
 	EXPECT_EQ(0, self->qdiscs[3]->q.qlen);
-	skb = new_test_skb("msg1", 1000);
-	qdisc_skb_cb(skb)->pkt_len = 1500;
+	skb = new_test_skb(srpc1, &self->addr, 5000, 1500);
 	homa_qdisc_defer_homa(qdev, skb);
-	skb = new_test_skb("msg2", 1000);
-	qdisc_skb_cb(skb)->pkt_len = 1500;
+	skb = new_test_skb(srpc2, &self->addr, 4000, 1500);
 	homa_qdisc_defer_homa(qdev, skb);
-	EXPECT_EQ(2, qdev->homa_deferred.qlen);
+	unit_log_clear();
+	log_deferred(qdev);
+	EXPECT_STREQ("[id 1235, offsets 5000]; [id 1237, offsets 4000]",
+		     unit_log_get());
 
 	mock_clock = atomic64_read(&qdev->link_idle_time);
 	self->homa.pacer->max_nic_queue_cycles = 100;
 	unit_log_clear();
 	homa_qdisc_pacer(qdev);
-	EXPECT_EQ(1, qdev->homa_deferred.qlen);
+	unit_log_clear();
+	log_deferred(qdev);
+	EXPECT_STREQ("[id 1237, offsets 4000]", unit_log_get());
 	EXPECT_EQ(1, self->qdiscs[3]->q.qlen);
 	EXPECT_LT(mock_clock + 100, atomic64_read(&qdev->link_idle_time));
 
@@ -875,8 +1282,8 @@
 TEST_F(homa_qdisc, homa_qdisc_redirect_skb__use_pacer_qix)
 {
-	struct sk_buff *skb;
 	struct homa_qdisc_dev *qdev;
+	struct sk_buff *skb;
 	int status;
 
 	EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[1], NULL, NULL));
@@ -898,8 +1305,8 @@
 }
 TEST_F(homa_qdisc, homa_qdisc_redirect_skb__use_redirect_qix)
 {
-	struct sk_buff *skb;
 	struct homa_qdisc_dev *qdev;
+	struct sk_buff *skb;
 	int status;
 
 	EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[1], NULL, NULL));
@@ -920,8 +1327,8 @@
 }
 TEST_F(homa_qdisc, homa_qdisc_redirect_skb__redirect_qix_invalid)
 {
-	struct sk_buff *skb;
 	struct homa_qdisc_dev *qdev;
+	struct sk_buff *skb;
 	int status;
 	int i;
 
@@ -944,8 +1351,8 @@
 }
 TEST_F(homa_qdisc, homa_qdisc_redirect_skb__redirect_qix_not_a_homa_qdisc)
 {
-	struct sk_buff *skb;
 	struct homa_qdisc_dev *qdev;
+	struct sk_buff *skb;
 	int status;
 	int i;
 
@@ -969,8 +1376,8 @@
 }
 TEST_F(homa_qdisc, homa_qdisc_redirect_skb__no_suitable_qdisc)
 {
-	struct sk_buff *skb;
 	struct homa_qdisc_dev *qdev;
+	struct sk_buff *skb;
 	int status;
 	int i;

From ad2acbd7792537dd01212a5ad5cc4fee96d85a68 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Wed, 10 Sep 2025 16:27:54 -0700
Subject: [PATCH 486/625] Fix deadlock over hsk->protect_count in
 homa_rpc_reap

Must return from homa_rpc_reap if hsk->protect_count > 0, even if
reap_all is specified. Otherwise self-deadlock can occur, where
homa_timer protects a socket, then gets interrupted by homa_softirq,
which calls homa_rpc_reap with reap_all.

---
 homa_rpc.c           |  2 --
 test/unit_homa_rpc.c | 44 --------------------------------------------
 2 files changed, 46 deletions(-)

diff --git a/homa_rpc.c b/homa_rpc.c
index 9d401b2f..d83807f2 100644
--- a/homa_rpc.c
+++ b/homa_rpc.c
@@ -502,8 +502,6 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all)
 			   hsk->port, atomic_read(&hsk->protect_count),
 			   hsk->dead_skbs);
 		homa_sock_unlock(hsk);
-		if (reap_all)
-			continue;
 		return 0;
 	}
 
diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c
index 3db56ebc..87fa1441 100644
--- a/test/unit_homa_rpc.c
+++ b/test/unit_homa_rpc.c
@@ -14,34 +14,6 @@
 #define n(x) htons(x)
 #define N(x) htonl(x)
 
-static struct homa_sock *hook_hsk;
-void unprotect_hsk_hook(char *id)
-{
-	if (strcmp(id, "unlock") != 0)
-		return;
-	if (hook_hsk) {
-		homa_unprotect_rpcs(hook_hsk);
-		hook_hsk = NULL;
-	}
-}
-
-#if 0
-static struct homa_rpc *hook_rpc;
-static int hook_count;
-static void unlink_rpc_hook(char *id)
-{
-	if (strcmp(id, "spin_lock")!= 0)
-		return;
-	if (hook_count == 0)
-		return;
-	hook_count--;
-	if (hook_count == 0) {
-		list_del_init(&hook_rpc->ready_links);
-		homa_rpc_put(hook_rpc);
-	}
-}
-#endif
-
 FIXTURE(homa_rpc) {
 	struct in6_addr client_ip[1];
 	int client_port;
@@ -606,22 +578,6 @@ TEST_F(homa_rpc, homa_rpc_reap__protected)
 	homa_unprotect_rpcs(&self->hsk);
 	EXPECT_STREQ("", unit_log_get());
 }
-TEST_F(homa_rpc, homa_rpc_reap__protected_and_reap_all)
-{
-	struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk,
-			UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip,
-			self->server_port, self->client_id, 5000, 2000);
-
-	ASSERT_NE(NULL, crpc1);
-	homa_rpc_end(crpc1);
-	unit_log_clear();
-	homa_protect_rpcs(&self->hsk);
-	hook_hsk = &self->hsk;
-	unit_hook_register(unprotect_hsk_hook);
-	EXPECT_EQ(0, homa_rpc_reap(&self->hsk, true));
-	EXPECT_STREQ("reaped 1234", unit_log_get());
-
EXPECT_EQ(0, self->hsk.dead_skbs); -}
 TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_locked)
 {
 	struct homa_rpc *crpc1 = unit_client_rpc(&self->hsk,

From 3c4ff22386961d3475755bd03a208123b9d030e4 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Wed, 10 Sep 2025 16:28:35 -0700
Subject: [PATCH 487/625] Trivial edit to comment

---
 homa_impl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/homa_impl.h b/homa_impl.h
index ff2fad8b..c8e78f17 100644
--- a/homa_impl.h
+++ b/homa_impl.h
@@ -521,7 +521,7 @@ struct homa_net {
 #ifndef __STRIP__ /* See strip.py */
 	/**
 	 * @qdisc_devs: List of all homa_qdisc_dev objects that exist for
-	 * this namespace. Protected by qdisc_devs_lock.
+	 * this namespace. Protected by qdisc_devs_mutex.
 	 */
 	struct list_head qdisc_devs;

From 224d1ffa37d2407bf13f9395ee542deff2ccd319 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Wed, 10 Sep 2025 16:29:04 -0700
Subject: [PATCH 488/625] Add homa_rpcs_deferred function in homa_devel.c

---
 homa_devel.c | 23 +++++++++++++++++++++++
 homa_devel.h | 1 +
 2 files changed, 24 insertions(+)

diff --git a/homa_devel.c b/homa_devel.c
index b9d5e8a3..2bfa400b 100644
--- a/homa_devel.c
+++ b/homa_devel.c
@@ -1063,6 +1063,7 @@ void homa_rpc_snapshot_log_tt(void)
 			i = 0;
 	} while (i != next_snapshot);
 }
+
 /**
  * homa_rpc_stats_log() - Print statistics on RPC progress to the system log.
  */
@@ -1104,3 +1105,25 @@ void homa_rpc_stats_log(void)
 		  snap.server_response_bytes_started -
 		  snap.server_response_bytes_done);
 }
+
+/**
+ * homa_rpcs_deferred() - Return true if there are any RPCs with packets
+ * that have been deferred by homa_qdisc, false if there are none.
+ * @hnet:   Consider only RPCs associated with this network namespace.
+ * Return:  See above.
+ */
+bool homa_rpcs_deferred(struct homa_net *hnet)
+{
+	struct homa_qdisc_dev *qdev;
+	bool result = false;
+
+	mutex_lock(&hnet->qdisc_devs_mutex);
+	list_for_each_entry(qdev, &hnet->qdisc_devs, links) {
+		if (homa_qdisc_any_deferred(qdev)) {
+			result = true;
+			break;
+		}
+	}
+	mutex_unlock(&hnet->qdisc_devs_mutex);
+	return result;
+}
diff --git a/homa_devel.h b/homa_devel.h
index 8caf164a..07c6de53 100644
--- a/homa_devel.h
+++ b/homa_devel.h
@@ -128,6 +128,7 @@ void homa_rpc_log_tt(struct homa_rpc *rpc);
 void homa_rpc_log_active_tt(struct homa *homa, int freeze_count);
 void homa_rpc_snapshot_log_tt(void);
 void homa_rpc_stats_log(void);
+bool homa_rpcs_deferred(struct homa_net *hnet);
 void homa_snapshot_get_stats(struct homa_rpc_snapshot *snap);
 void homa_snapshot_rpcs(void);
 int homa_snprintf(char *buffer, int size, int used,

From b2428168a472945f3ef8902d152fe51721f6451e Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Wed, 10 Sep 2025 16:29:24 -0700
Subject: [PATCH 489/625] Disable timetraces sooner in tt_freeze

---
 timetrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/timetrace.c b/timetrace.c
index bc86ba1c..cafdc585 100644
--- a/timetrace.c
+++ b/timetrace.c
@@ -227,8 +227,8 @@ void tt_freeze(void)
 	 */
 	if (atomic_xchg(&tt_frozen, 1) == 0) {
 		tt_record("timetrace frozen");
-		pr_err("%s invoked\n", __func__);
 		atomic_inc(&tt_freeze_count);
+		pr_err("%s invoked\n", __func__);
 	}
 }

From 9146e13d8fc8f891e47e253ee9242124ac151aaa Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 12 Sep 2025 16:24:48 -0700
Subject: [PATCH 490/625] Handle NULL skb->dev after ip_queue_xmit in
 __homa_xmit_control

---
 homa_outgoing.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/homa_outgoing.c b/homa_outgoing.c
index
a9cc0190..c718dd3f 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -454,7 +454,6 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, int extra_bytes; int result; - IF_NO_STRIP(struct netdev_queue *txq); IF_NO_STRIP(int priority); skb = homa_skb_alloc_tx(HOMA_MAX_HEADER); @@ -494,11 +493,15 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, if (unlikely(result != 0)) INC_METRIC(control_xmit_errors, 1); #ifndef __STRIP__ /* See strip.py */ - txq = netdev_get_tx_queue(skb->dev, skb->queue_mapping); - if (netif_tx_queue_stopped(txq)) - tt_record4("__homa_xmit_control found stopped txq for id %d, qid %u, num_queued %u, limit %d", - be64_to_cpu(h->sender_id), skb->queue_mapping, - txq->dql.num_queued, txq->dql.adj_limit); + if (skb->dev) { + struct netdev_queue *txq; + + txq = netdev_get_tx_queue(skb->dev, skb->queue_mapping); + if (netif_tx_queue_stopped(txq)) + tt_record4("__homa_xmit_control found stopped txq for id %d, qid %u, num_queued %u, limit %d", + be64_to_cpu(h->sender_id), skb->queue_mapping, + txq->dql.num_queued, txq->dql.adj_limit); + } #endif /* See strip.py */ INC_METRIC(packets_sent[h->type - DATA], 1); INC_METRIC(priority_bytes[priority], skb->len); From 916a20147d829b6ed9d64eeecb2433fc9b9455e1 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 12 Sep 2025 16:28:22 -0700 Subject: [PATCH 491/625] Introduce homa_qdisc_precedes function Also rearrange headers slightly so that homa_rpc.h doesn't need to include homa_qdisc.h --- homa_qdisc.c | 28 ++------ homa_qdisc.h | 60 ++++++++--------- homa_rpc.c | 1 + homa_rpc.h | 34 ++++++++-- test/unit_homa_qdisc.c | 149 +++++++++++++++++++---------------------- 5 files changed, 133 insertions(+), 139 deletions(-) diff --git a/homa_qdisc.c b/homa_qdisc.c index 45b6dd93..dfa4b39a 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -355,31 +355,13 @@ void homa_qdisc_insert_rb(struct homa_qdisc_dev *qdev, struct homa_rpc *rpc) { struct rb_node **new = &(qdev->deferred_rpcs.rb_root.rb_node); struct rb_node *parent = NULL; + struct homa_rpc *rpc2; bool leftmost = true; while (*new) { - struct homa_qdisc_rpc *qrpc; - struct homa_rpc *rpc2; - - qrpc = container_of(*new, struct homa_qdisc_rpc, rb_node); - rpc2 = container_of(qrpc, struct homa_rpc, qrpc); - parent = *new; - /* To sort RPCs, first use bytes left to transmit; settle - * ties in favor of oldest RPC. If still tied (highly unlikely), - * use RPC address to provide deterministic ordering. 
- */
-	if (rpc->qrpc.tx_left < rpc2->qrpc.tx_left) {
-		new = &((*new)->rb_left);
-	} else if (rpc->qrpc.tx_left > rpc2->qrpc.tx_left) {
-		new = &((*new)->rb_right);
-		leftmost = false;
-	} else if (rpc->msgout.init_time < rpc2->msgout.init_time) {
-		new = &((*new)->rb_left);
-	} else if (rpc->msgout.init_time < rpc2->msgout.init_time) {
-		new = &((*new)->rb_right);
-		leftmost = false;
-	} else if (rpc < rpc2) {
+		rpc2 = container_of(*new, struct homa_rpc, qrpc.rb_node);
+		if (homa_qdisc_precedes(rpc, rpc2)) {
 			new = &((*new)->rb_left);
 		} else {
 			new = &((*new)->rb_right);
@@ -402,7 +384,7 @@ void homa_qdisc_insert_rb(struct homa_qdisc_dev *qdev, struct homa_rpc *rpc)
  */
 struct sk_buff *homa_qdisc_dequeue_homa(struct homa_qdisc_dev *qdev)
 {
-	struct homa_qdisc_rpc *qrpc;
+	struct homa_rpc_qdisc *qrpc;
 	struct homa_skb_info *info;
 	struct homa_rpc *rpc;
 	struct rb_node *node;
@@ -416,7 +398,7 @@ struct sk_buff *homa_qdisc_dequeue_homa(struct homa_qdisc_dev *qdev)
 		spin_unlock_irqrestore(&qdev->defer_lock, flags);
 		return NULL;
 	}
-	qrpc = container_of(node, struct homa_qdisc_rpc, rb_node);
+	qrpc = container_of(node, struct homa_rpc_qdisc, rb_node);
 	skb = skb_dequeue(&qrpc->packets);
 	if (skb_queue_len(&qrpc->packets) == 0) {
 		rb_erase_cached(node, &qdev->deferred_rpcs);
diff --git a/homa_qdisc.h b/homa_qdisc.h
index 2bc797f1..f723f7e4 100644
--- a/homa_qdisc.h
+++ b/homa_qdisc.h
@@ -4,6 +4,8 @@
  * queuing discipline
  */

+#include "homa_rpc.h"
+
 #ifdef __UNIT_TEST__
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-variable"
@@ -157,34 +159,6 @@ struct homa_qdisc_dev {
 	spinlock_t pacer_mutex __aligned(L1_CACHE_BYTES);
 };

-/**
- * struct homa_qdisc_rpc - One of these structs exists in each homa_rpc, with
- * information needed by homa_qidsc.
- */
-struct homa_qdisc_rpc {
-	/**
-	 * @deferred: List of tx skbs from this RPC that have been deferred
-	 * by homa_qdisc. Non-empty means this RPC is currently linked into
-	 * homa_qdisc_dev->deferred_rpcs.
-	 */
-	struct sk_buff_head packets;
-
-	/**
-	 * @rb_node: Used to link this struct into
-	 * homa_qdisc_dev->deferred_rpcs.
-	 */
-	struct rb_node rb_node;
-
-	/**
-	 * @tx_left: The number of (trailing) bytes of the tx message
-	 * that have not been transmitted by homa_qdisc yet. Only updated
-	 * when packets are added to or removed from the deferred list;
-	 * may be out of date (too high) if packets have been transmitted
-	 * without being deferred.
-	 */
-	int tx_left;
-};
-
 void homa_qdisc_defer_homa(struct homa_qdisc_dev *qdev, struct sk_buff *skb);
 struct sk_buff *
@@ -226,10 +200,10 @@ static inline bool homa_qdisc_active(struct homa_net *hnet)
 }

 /**
- * homa_qdisc_rpc_init() - Initialize a homa_qdisc_rpc struct.
+ * homa_qdisc_rpc_init() - Initialize a homa_rpc_qdisc struct.
  * @qrpc:   Struct to initialize
  */
-static inline void homa_qdisc_rpc_init(struct homa_qdisc_rpc *qrpc)
+static inline void homa_qdisc_rpc_init(struct homa_rpc_qdisc *qrpc)
 {
 	skb_queue_head_init(&qrpc->packets);
 	qrpc->tx_left = HOMA_MAX_MESSAGE_LENGTH;
@@ -247,4 +221,30 @@ static inline bool homa_qdisc_any_deferred(struct homa_qdisc_dev *qdev)
 	       !skb_queue_empty(&qdev->tcp_deferred);
 }

+/**
+ * homa_qdisc_precedes() - Return true if @rpc1 is considered "less"
+ * than @rpc2 for the purposes of qdev->deferred_rpcs, or false if @rpc1
+ * is considered "greater" (ties not allowed).
+ * @rpc1:   RPC to compare
+ * @rpc2:   RPC to compare; must be different from rpc1.
+ */
+static inline bool homa_qdisc_precedes(struct homa_rpc *rpc1,
+				       struct homa_rpc *rpc2)
+{
+	/* The primary metric for comparison is bytes left to transmit;
+	 * in case of ties, use RPC age as a secondary metric (oldest RPC
+	 * is "less"), and if still tied (highly unlikely) use the
+	 * addresses of the RPCs as a tie-breaker.
+	 */
+	if (rpc1->qrpc.tx_left < rpc2->qrpc.tx_left)
+		return true;
+	else if (rpc2->qrpc.tx_left < rpc1->qrpc.tx_left)
+		return false;
+	if (rpc1->msgout.init_time < rpc2->msgout.init_time)
+		return true;
+	else if (rpc2->msgout.init_time < rpc1->msgout.init_time)
+		return false;
+	return rpc1 < rpc2;
+}
+
 #endif /* _HOMA_QDISC_H */
diff --git a/homa_rpc.c b/homa_rpc.c
index d83807f2..2b49bea9 100644
--- a/homa_rpc.c
+++ b/homa_rpc.c
@@ -10,6 +10,7 @@
 #ifndef __STRIP__ /* See strip.py */
 #include "homa_grant.h"
 #include "homa_pacer.h"
+#include "homa_qdisc.h"
 #include "homa_skb.h"
 #else /* See strip.py */
 #include "homa_stub.h"
diff --git a/homa_rpc.h b/homa_rpc.h
index afb57d5f..be5f05b3 100644
--- a/homa_rpc.h
+++ b/homa_rpc.h
@@ -12,10 +12,6 @@
 #include "homa_sock.h"
 #include "homa_wire.h"

-#ifndef __STRIP__ /* See strip.py */
-#include "homa_qdisc.h"
-#endif /* See strip.py */
-
 /* Forward references. */
 struct homa_ack;
@@ -212,6 +208,34 @@ struct homa_message_in {
 #endif /* See strip.py */
 };

+/**
+ * struct homa_rpc_qdisc - Information that homa_qdisc needs to store in
+ * each RPC. Managed entirely by homa_qdisc.
+ */
+struct homa_rpc_qdisc {
+	/**
+	 * @packets: List of tx skbs from this RPC that have been deferred
+	 * by homa_qdisc. Non-empty means this RPC is currently linked into
+	 * homa_qdisc_dev->deferred_rpcs.
+	 */
+	struct sk_buff_head packets;
+
+	/**
+	 * @rb_node: Used to link this struct into
+	 * homa_qdisc_dev->deferred_rpcs.
+	 */
+	struct rb_node rb_node;
+
+	/**
+	 * @tx_left: The number of (trailing) bytes of the tx message
+	 * that have not been transmitted by homa_qdisc yet. Only updated
+	 * when packets are added to or removed from the deferred list;
+	 * may be out of date (too high) if packets have been transmitted
+	 * without being deferred.
+	 */
+	int tx_left;
+};
+
 /**
  * struct homa_rpc - One of these structures exists for each active
  * RPC. The same structure is used to manage both outgoing RPCs on
@@ -338,7 +362,7 @@ struct homa_rpc {
 #ifndef __STRIP__ /* See strip.py */
 	/** @qrpc: Information managed by homa_qdisc for this RPC.
*/ - struct homa_qdisc_rpc qrpc; + struct homa_rpc_qdisc qrpc; #endif /* See strip.py */ /** diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c index fd2d6238..c31a5f6b 100644 --- a/test/unit_homa_qdisc.c +++ b/test/unit_homa_qdisc.c @@ -661,7 +661,7 @@ TEST_F(homa_qdisc, homa_qdisc_defer_homa__wake_up_pacer) homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_insert_rb__bytes_left) +TEST_F(homa_qdisc, homa_qdisc_insert_rb__basics) { struct homa_rpc *srpc1, *srpc2, *srpc3; struct homa_qdisc_dev *qdev; @@ -692,86 +692,6 @@ TEST_F(homa_qdisc, homa_qdisc_insert_rb__bytes_left) unit_log_get()); homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_insert_rb__init_time) -{ - struct homa_rpc *srpc1, *srpc2, *srpc3; - struct homa_qdisc_dev *qdev; - - srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, - &self->server_ip, self->client_port, - self->server_id, 10000, 10000); - srpc1->msgout.init_time = 1000; - srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, - &self->server_ip, self->client_port, - self->server_id + 2, 10000, 10000); - srpc2->msgout.init_time = 500; - srpc3 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, - &self->server_ip, self->client_port, - self->server_id + 4, 10000, 10000); - srpc3->msgout.init_time = 2000; - - qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); - - homa_qdisc_defer_homa(qdev, - new_test_skb(srpc1, &self->addr, 5000, 1500)); - homa_qdisc_defer_homa(qdev, - new_test_skb(srpc2, &self->addr, 5000, 1500)); - homa_qdisc_defer_homa(qdev, - new_test_skb(srpc3, &self->addr, 5000, 1500)); - unit_log_clear(); - log_deferred(qdev); - EXPECT_STREQ("[id 1237, offsets 5000]; " - "[id 1235, offsets 5000]; " - "[id 1239, offsets 5000]", - unit_log_get()); - homa_qdisc_qdev_put(qdev); -} -TEST_F(homa_qdisc, homa_qdisc_insert_rb__rpc_struct_address) -{ - struct homa_rpc *srpc1, *srpc2, *srpc3, *tmp; - struct homa_qdisc_dev *qdev; - - srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, - &self->server_ip, self->client_port, - self->server_id, 10000, 10000); - srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, - &self->server_ip, self->client_port, - self->server_id + 2, 10000, 10000); - srpc3 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, - &self->server_ip, self->client_port, - self->server_id + 4, 10000, 10000); - /* Swap RPCs if needed to ensure a particular ordering of addresses. 
*/ - if (srpc1 < srpc2) { - tmp = srpc1; - srpc1 = srpc2; - srpc2 = tmp; - srpc1->id = 1235; - srpc2->id = 1237; - } - if (srpc1 > srpc3) { - tmp = srpc1; - srpc1 = srpc3; - srpc3 = tmp; - srpc1->id = 1235; - srpc3->id = 1239; - } - - qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); - - homa_qdisc_defer_homa(qdev, - new_test_skb(srpc1, &self->addr, 5000, 1500)); - homa_qdisc_defer_homa(qdev, - new_test_skb(srpc2, &self->addr, 5000, 1500)); - homa_qdisc_defer_homa(qdev, - new_test_skb(srpc3, &self->addr, 5000, 1500)); - unit_log_clear(); - log_deferred(qdev); - EXPECT_STREQ("[id 1237, offsets 5000]; " - "[id 1235, offsets 5000]; " - "[id 1239, offsets 5000]", - unit_log_get()); - homa_qdisc_qdev_put(qdev); -} TEST_F(homa_qdisc, homa_qdisc_insert_rb__long_left_chain) { struct homa_rpc *srpc1, *srpc2, *srpc3, *srpc4; @@ -1468,4 +1388,71 @@ TEST_F(homa_qdisc, homa_qdisc_update_all_sysctl) kfree(qdisc); homa_qdisc_destroy(qdisc2); kfree(qdisc2); +} + +TEST_F(homa_qdisc, homa_qdisc_precedes__bytes_left) +{ + struct homa_rpc *srpc1, *srpc2, *srpc3; + + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 2, 10000, 10000); + srpc3 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 4, 10000, 10000); + + srpc1->qrpc.tx_left = 5000; + srpc2->qrpc.tx_left = 3000; + srpc3->qrpc.tx_left = 7000; + EXPECT_EQ(0, homa_qdisc_precedes(srpc1, srpc2)); + EXPECT_EQ(1, homa_qdisc_precedes(srpc1, srpc3)); +} +TEST_F(homa_qdisc, homa_qdisc_precedes__init_time) +{ + struct homa_rpc *srpc1, *srpc2, *srpc3; + + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + srpc1->msgout.init_time = 1000; + srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 2, 10000, 10000); + srpc2->msgout.init_time = 500; + srpc3 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 4, 10000, 10000); + srpc3->msgout.init_time = 2000; + + EXPECT_EQ(0, homa_qdisc_precedes(srpc1, srpc2)); + EXPECT_EQ(1, homa_qdisc_precedes(srpc1, srpc3)); +} +TEST_F(homa_qdisc, homa_qdisc_precedes__rpc_struct_address) +{ + struct homa_rpc *srpc1, *srpc2, *srpc3; + int result; + + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 2, 10000, 10000); + srpc3 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 4, 10000, 10000); + + if (srpc1 > srpc2) + result = homa_qdisc_precedes(srpc1, srpc2); + else + result = homa_qdisc_precedes(srpc2, srpc1); + EXPECT_EQ(0, result); + if (srpc1 < srpc3) + result = homa_qdisc_precedes(srpc1, srpc3); + else + result = homa_qdisc_precedes(srpc3, srpc1); + EXPECT_EQ(1, result); } \ No newline at end of file From 810a982136fda420d4225434de38b95822a8f522 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 15 Sep 2025 10:51:26 -0700 Subject: [PATCH 492/625] Fix stale information in comments --- homa_rpc.h | 14 +++++++------- 
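homa_qdisc_precedes, introduced in patch 491 above, must define a strict total order over distinct RPCs: for any pair, exactly one direction compares "less", because rb-tree insertion has no way to represent ties. That is why pointer identity serves as the final tie-breaker after bytes-left and age. The following stand-alone sketch mirrors the same three-level comparison; fake_rpc and precedes are illustrative stand-ins, not code from the module:

	#include <stdbool.h>
	#include <stdio.h>

	struct fake_rpc {                 /* Stand-in for struct homa_rpc. */
		int tx_left;              /* Bytes still to transmit (SRPT key). */
		unsigned long init_time;  /* When the message was created. */
	};

	/* Same decision chain as homa_qdisc_precedes: fewest bytes left
	 * first, then the older message, then the lower address.
	 */
	static bool precedes(struct fake_rpc *a, struct fake_rpc *b)
	{
		if (a->tx_left != b->tx_left)
			return a->tx_left < b->tx_left;
		if (a->init_time != b->init_time)
			return a->init_time < b->init_time;
		return a < b;
	}

	int main(void)
	{
		struct fake_rpc r1 = { .tx_left = 3000, .init_time = 500 };
		struct fake_rpc r2 = { .tx_left = 3000, .init_time = 900 };

		/* tx_left ties, so the older RPC (r1) precedes. */
		printf("r1 precedes r2: %d\n", precedes(&r1, &r2));
		/* Exactly one direction holds for any distinct pair. */
		printf("total order: %d\n",
		       precedes(&r1, &r2) != precedes(&r2, &r1));
		return 0;
	}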
test/utils.c | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/homa_rpc.h b/homa_rpc.h index be5f05b3..21153b40 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -44,17 +44,17 @@ struct homa_message_out { /** * @packets: Singly-linked list of all packets in message, linked - * using homa_next_skb. The list is in order of offset in the message - * (offset 0 first); each sk_buff can potentially contain multiple - * data_segments, which will be split into separate packets by GSO. - * This list grows gradually as data is copied in from user space, - * so it may not be complete. + * using homa_skb_info->next_skb. The list is in order of offset in + * the message (offset 0 first); each sk_buff can potentially contain + * multiple data_segments, which will be split into separate packets + * by GSO. This list grows gradually as data is copied in from user\ + * space, so it may not be complete. */ struct sk_buff *packets; /** * @next_xmit: Pointer to pointer to next packet to transmit (will - * either refer to @packets or homa_next_skb(skb) for some skb + * either refer to @packets or homa_skb_info->next_skb for some skb * in @packets). */ struct sk_buff **next_xmit; @@ -70,7 +70,7 @@ struct homa_message_out { * @first_not_tx: All packets in @packets preceding this one have * been confirmed to have been transmitted by the NIC (the driver * has released its reference). NULL means all packets are known to - * have been transmitted. Used by homa_rpc_tx_complete. + * have been transmitted. Used by homa_rpc_tx_end. */ struct sk_buff *first_not_tx; diff --git a/test/utils.c b/test/utils.c index c8055ea0..187cd54b 100644 --- a/test/utils.c +++ b/test/utils.c @@ -262,7 +262,7 @@ void unit_log_message_out_packets(struct homa_message_out *message, int verbose) * unit_log_filled_skbs() - Append to the test log a human-readable description * of a list of packet buffers created by homa_fill_packets. * @skb: First in list of sk_buffs to print; the list is linked - * using homa_next_skb. + * using homa_skb_info->next_skb. * @verbose: If non-zero, use homa_print_packet for each packet; * otherwise use homa_print_packet_short. */ From 3ec1db25a49ebabbc540e62f502f79b6bcc61e1a Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 15 Sep 2025 10:51:36 -0700 Subject: [PATCH 493/625] Add missing include for homa_qdisc.h to homa_devel.c --- homa_devel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/homa_devel.c b/homa_devel.c index 2bfa400b..d3cdfc56 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -10,6 +10,7 @@ #include "homa_peer.h" #include "homa_rpc.h" #ifndef __STRIP__ /* See strip.py */ +#include "homa_qdisc.h" #include "homa_skb.h" #else /* See strip.py */ #include "homa_stub.h" From 41506939e2e72774f1a23421ab9da75ff95d7407 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 15 Sep 2025 12:10:01 -0700 Subject: [PATCH 494/625] Retain retransmitted packets until homa_rpc_reap Needed to ensure that RPCs don't get reaped until retransmitted packets have exited the tx pipeline. --- homa_impl.h | 5 +---- homa_outgoing.c | 5 ++++- homa_rpc.c | 21 +++++++++++++++------ homa_rpc.h | 17 ++++++++++++++++- test/mock.c | 21 --------------------- test/mock.h | 1 - test/unit_homa_outgoing.c | 18 +++++++++++++----- test/unit_homa_rpc.c | 2 +- 8 files changed, 50 insertions(+), 40 deletions(-) diff --git a/homa_impl.h b/homa_impl.h index c8e78f17..1920375c 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -541,10 +541,7 @@ struct homa_net { * linear part of the skb. 
*/ struct homa_skb_info { - /** - * @next_skb: used to link together all of the skb's for an - * outgoing Homa message (in order of offset). - */ + /** @next_skb: used to link together outgoing skb's for a message. */ struct sk_buff *next_skb; /** diff --git a/homa_outgoing.c b/homa_outgoing.c index c718dd3f..d69f89ab 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -824,7 +824,7 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end) } new_homa_info = homa_get_skb_info(new_skb); - new_homa_info->next_skb = NULL; + new_homa_info->next_skb = rpc->msgout.to_free; new_homa_info->wire_bytes = rpc->hsk->ip_header_length + sizeof(struct homa_data_hdr) + seg_length + HOMA_ETH_OVERHEAD; @@ -832,6 +832,9 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end) new_homa_info->seg_length = seg_length; new_homa_info->offset = offset; new_homa_info->rpc = rpc; + + rpc->msgout.to_free = new_skb; + skb_get(new_skb); tt_record3("retransmitting offset %d, length %d, id %d", offset, seg_length, rpc->id); #ifndef __STRIP__ /* See strip.py */ diff --git a/homa_rpc.c b/homa_rpc.c index 2b49bea9..a14268ca 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -533,15 +533,23 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) * freeing until after releasing the socket lock. */ if (rpc->msgout.length >= 0) { - while (rpc->msgout.packets) { - struct sk_buff *skb = - rpc->msgout.packets; + while (1) { + struct sk_buff *skb; + + skb = rpc->msgout.to_free; + if (!skb) { + skb = rpc->msgout.packets; + if (!skb) + break; + rpc->msgout.to_free = skb; + rpc->msgout.packets = NULL; + } /* This tests whether skb is still in a * transmit queue somewhere; if so, * can't reap the RPC since homa_qdisc - * may try to access it via the skb's - * homa_skb_info. + * may try to access the RPC via the + * skb's homa_skb_info. */ if (refcount_read(&skb->users) > 1) { INC_METRIC(reaper_active_skbs, @@ -549,12 +557,13 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) goto next_rpc; } skbs[num_skbs] = skb; - rpc->msgout.packets = + rpc->msgout.to_free = homa_get_skb_info(skb)->next_skb; num_skbs++; rpc->msgout.num_skbs--; if (num_skbs >= batch_size) goto release; + } } diff --git a/homa_rpc.h b/homa_rpc.h index 21153b40..5fb52f63 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -47,7 +47,7 @@ struct homa_message_out { * using homa_skb_info->next_skb. The list is in order of offset in * the message (offset 0 first); each sk_buff can potentially contain * multiple data_segments, which will be split into separate packets - * by GSO. This list grows gradually as data is copied in from user\ + * by GSO. This list grows gradually as data is copied in from user * space, so it may not be complete. */ struct sk_buff *packets; @@ -74,6 +74,21 @@ struct homa_message_out { */ struct sk_buff *first_not_tx; + /** + * @to_free: Singly-linked list of packets that must be freed by + * homa_rpc_reap. Initially holds retransmitted packets, but + * eventually includes the packets in @packets. homa_rpc_reap uses + * this list to ensure that all tx packets have been freed by the + * IP stack before it frees the homa_rpc (otherwise homa_qdisc might + * try to access the RPC via a packet's homa_skb_info). Note: I + * considered using skb->destructor to release a reference on the RPC, + * but this does not appear to be reliable because (a) skb->destructor + * may be overwritten and (b) it may be called before the skb has + * cleared the tx pipeline (via skb_orphan?). 
Also, need to retain + * @packets in case they are needed for retransmission. + */ + struct sk_buff *to_free; + #ifndef __STRIP__ /* See strip.py */ /** * @unscheduled: Initial bytes of message that we'll send diff --git a/test/mock.c b/test/mock.c index 1437c4f3..8d52108e 100644 --- a/test/mock.c +++ b/test/mock.c @@ -76,11 +76,6 @@ struct task_struct mock_task; */ int mock_xmit_log_verbose; -/* If a test sets this variable to nonzero, ip_queue_xmit will log - * the contents of the homa_info from packets. - */ -int mock_xmit_log_homa_info; - /* If a test sets this variable to nonzero, calls to wake_up and * wake_up_all will be logged. */ @@ -781,14 +776,6 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, else homa_print_packet_short(skb, buffer, sizeof(buffer)); unit_log_printf("; ", "xmit %s", buffer); - if (mock_xmit_log_homa_info) { - struct homa_skb_info *homa_info; - - homa_info = homa_get_skb_info(skb); - unit_log_printf("; ", "homa_info: wire_bytes %d, data_bytes %d, seg_length %d, offset %d", - homa_info->wire_bytes, homa_info->data_bytes, - homa_info->seg_length, homa_info->offset); - } kfree_skb(skb); return 0; } @@ -816,13 +803,6 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) else homa_print_packet_short(skb, buffer, sizeof(buffer)); unit_log_printf("; ", "xmit %s", buffer); - if (mock_xmit_log_homa_info) { - struct homa_skb_info *homa_info; - - homa_info = homa_get_skb_info(skb); - unit_log_printf("; ", "homa_info: wire_bytes %d, data_bytes %d", - homa_info->wire_bytes, homa_info->data_bytes); - } kfree_skb(skb); return 0; } @@ -2343,7 +2323,6 @@ void mock_teardown(void) mock_prepare_to_wait_status = -ERESTARTSYS; mock_signal_pending = 0; mock_xmit_log_verbose = 0; - mock_xmit_log_homa_info = 0; mock_log_wakeups = 0; mock_mtu = 0; mock_max_skb_frags = MAX_SKB_FRAGS; diff --git a/test/mock.h b/test/mock.h index a6a5f081..211c24b3 100644 --- a/test/mock.h +++ b/test/mock.h @@ -175,7 +175,6 @@ extern int mock_trylock_errors; extern u64 mock_tt_cycles; extern int mock_vmalloc_errors; extern int mock_xmit_log_verbose; -extern int mock_xmit_log_homa_info; extern struct task_struct *current_task; diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index c6bd50c7..5b4971a1 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -1180,9 +1180,11 @@ TEST_F(homa_outgoing, homa_resend_data__error_copying_data) unit_log_get()); } #endif /* See strip.py */ -TEST_F(homa_outgoing, homa_resend_data__set_homa_info) +TEST_F(homa_outgoing, homa_resend_data__add_to_to_free_and_set_homa_info) { struct homa_rpc *crpc; + struct sk_buff *skb; + struct homa_skb_info *homa_info; mock_set_ipv6(&self->hsk); mock_net_device.gso_max_size = 5000; @@ -1190,11 +1192,17 @@ TEST_F(homa_outgoing, homa_resend_data__set_homa_info) self->server_ip, self->server_port, self->client_id, 16000, 1000); unit_log_clear(); - mock_xmit_log_homa_info = 1; homa_resend_data(crpc, 8400, 8800, 2); - EXPECT_STREQ("xmit DATA retrans 1400@8400; " - "homa_info: wire_bytes 1538, data_bytes 1400, seg_length 1400, offset 8400", - unit_log_get()); + skb = crpc->msgout.to_free; + ASSERT_NE(NULL, skb); + homa_info = homa_get_skb_info(skb); + EXPECT_EQ(NULL, homa_info->next_skb); + EXPECT_EQ(1538, homa_info->wire_bytes); + EXPECT_EQ(1400, homa_info->data_bytes); + EXPECT_EQ(1400, homa_info->seg_length); + EXPECT_EQ(8400, homa_info->offset); + EXPECT_EQ(crpc, homa_info->rpc); + EXPECT_EQ(1, refcount_read(&skb->users)); } TEST_F(homa_outgoing, 
homa_rpc_tx_end) diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 87fa1441..3c3cbac0 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -656,7 +656,7 @@ TEST_F(homa_rpc, homa_rpc_reap__skip_rpc_because_of_skb_refcount) IF_NO_STRIP(EXPECT_EQ(1, homa_metrics_per_cpu()->reaper_active_skbs)); EXPECT_EQ(4, self->hsk.dead_skbs); - kfree_skb(crpc1->msgout.packets); + kfree_skb(crpc1->msgout.to_free); unit_log_clear(); EXPECT_EQ(0, homa_rpc_reap(&self->hsk, false)); EXPECT_STREQ("reaped 1234", unit_log_get()); From 303e76b40a85e50d9c94a4c8fa72286201207313 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 15 Sep 2025 13:26:13 -0700 Subject: [PATCH 495/625] Add homa_validate_rbtree function in homa_devel.c --- homa_devel.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++++++ homa_devel.h | 1 + 2 files changed, 144 insertions(+) diff --git a/homa_devel.c b/homa_devel.c index d3cdfc56..46640d15 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -17,6 +17,8 @@ #endif /* See strip.py */ #include "homa_wire.h" +#include + #ifndef __STRIP__ /* See strip.py */ /* homa_drop_packet will accept this many more packets before it drops some. */ static int accept_count; @@ -1128,3 +1130,144 @@ bool homa_rpcs_deferred(struct homa_net *hnet) mutex_unlock(&hnet->qdisc_devs_mutex); return result; } + +/** + * homa_validate_rbtree() - Scan the structure of a red-black tree and + * abort the kernel (dumping the timetrace) if the internal structure + * does not satisfy the required invariants. + * @node: Node whose subtree should be scanned. + * @depth: Depth of node (number of black nodes above this node, 0 for + * root). + * @message: Textual message identifying the point where this function + * was invoked (used when reporting errors). + */ +void homa_validate_rbtree(struct rb_node *node, int depth, char *message) +{ + struct homa_rpc *rpc, *child_rpc; + struct rb_node *child; + static int max_depth; + int black, new_depth; + + if (depth == 0) { + if (!node) + return; + if (!rb_is_black(node)) { +#ifdef __UNIT_TEST__ + FAIL("rbtree root is red"); +#else + tt_record("freezing because rbtree root is red"); +#endif /* __UNIT_TEST__ */ + goto error; + } + max_depth = -1; + } + + rpc = container_of(node, struct homa_rpc, qrpc.rb_node); + if (rpc->magic != HOMA_RPC_MAGIC) { +#ifdef __UNIT_TEST__ + FAIL("rpc id %llu (0x%px) in rbtree has bad magic 0x%x", + rpc->id, rpc, rpc->magic); +#else + tt_record4("freezing because rpc id %d (0x%x%08x) in rbtree has bad magic 0x%x", + rpc->id, tt_hi(rpc), tt_lo(rpc), rpc->magic); +#endif /* __UNIT_TEST__ */ + goto error; + + } + + black = rb_is_black(node); + new_depth = depth + black; + if (!node->rb_left || !node->rb_right) { + if (max_depth < 0) { + max_depth = new_depth; + } else if (max_depth != new_depth) { +#ifdef __UNIT_TEST__ + FAIL("inconsistent rbtree depths: %d and %d", + max_depth, new_depth); +#else + tt_record2("freezing because of inconsistent rbtree depths: %d and %d", + max_depth, depth); +#endif /* __UNIT_TEST__ */ + goto error; + } + goto done; + } + + child = node->rb_left; + if (child) { + child_rpc = container_of(child, struct homa_rpc, qrpc.rb_node); + if (__rb_parent(child->__rb_parent_color) != node) { +#ifdef __UNIT_TEST__ + FAIL("rbtree left child has bad parent, rpc id %llu", + child_rpc->id); +#else + tt_record1("freezing because rbtree left child has bad parent, rpc id %llu", + child_rpc->id); +#endif /* __UNIT_TEST__ */ + goto error; + } + if (!black && !rb_is_black(child)) { +#ifdef __UNIT_TEST__ + FAIL("rbtree red 
parent has red left child");
+#else
+			tt_record("freezing because rbtree red parent has red left child");
+#endif /* __UNIT_TEST__ */
+			goto error;
+		}
+		if (!homa_qdisc_precedes(child_rpc, rpc)) {
+#ifdef __UNIT_TEST__
+			FAIL("rbtree contained out-of-order left child");
+#else
+			tt_record("freezing because rbtree contained out-of-order left child");
+#endif /* __UNIT_TEST__ */
+			goto error;
+		}
+		homa_validate_rbtree(child, depth + black, message);
+	}
+
+	child = node->rb_right;
+	if (child) {
+		child_rpc = container_of(child, struct homa_rpc, qrpc.rb_node);
+		if (__rb_parent(child->__rb_parent_color) != node) {
+#ifdef __UNIT_TEST__
+			FAIL("rbtree right child has bad parent, rpc id %llu",
+			     child_rpc->id);
+#else
+			tt_record1("freezing because rbtree right child has bad parent, rpc id %llu",
+				   child_rpc->id);
+#endif /* __UNIT_TEST__ */
+			goto error;
+		}
+		if (!black && !rb_is_black(child)) {
+#ifdef __UNIT_TEST__
+			FAIL("rbtree red parent has red right child");
+#else
+			tt_record("freezing because rbtree red parent has red right child");
+#endif /* __UNIT_TEST__ */
+			goto error;
+		}
+		if (!homa_qdisc_precedes(rpc, child_rpc)) {
+#ifdef __UNIT_TEST__
+			FAIL("rbtree contained out-of-order right child");
+#else
+			tt_record("freezing because rbtree contained out-of-order right child");
+#endif /* __UNIT_TEST__ */
+			goto error;
+		}
+		homa_validate_rbtree(child, depth + black, message);
+	}
+
+done:
+	return;
+
+error:
+#ifndef __UNIT_TEST__
+	tt_record(message);
+	if (!atomic_read(&tt_frozen)) {
+		tt_freeze();
+		pr_err("rbtree consistency error at %s\n", message);
+		tt_printk();
+		BUG_ON(1);
+	}
+#endif /* __UNIT_TEST__ */
+}
diff --git a/homa_devel.h b/homa_devel.h
index 07c6de53..8307100f 100644
--- a/homa_devel.h
+++ b/homa_devel.h
@@ -137,5 +137,6 @@ char *homa_symbol_for_type(uint8_t type);
 char *homa_symbol_for_state(struct homa_rpc *rpc);
 int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors);
+void homa_validate_rbtree(struct rb_node *node, int depth, char *message);

 #endif /* _HOMA_DEVEL_H */

From a1a73069c866de1404686a3be389c7f99cf9dde4 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 15 Sep 2025 13:27:35 -0700
Subject: [PATCH 496/625] Move location of rpc->magic reset in homa_rpc_reap

Mustn't clear rpc->magic until after all tx skbs have been freed
(otherwise homa_qdisc could still access the RPC).
---
 homa_rpc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/homa_rpc.c b/homa_rpc.c
index a14268ca..0b482277 100644
--- a/homa_rpc.c
+++ b/homa_rpc.c
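A sketch of the lifetime rule behind this change and patch 494 (the helper rpc_tx_skbs_quiesced is hypothetical, written here only to illustrate the check that homa_rpc_reap performs inline): a tx skb whose refcount is still above one may still be referenced by the qdisc layer, which reaches the RPC through homa_skb_info->rpc, so the RPC, including its magic field, must stay intact until the last such skb has been released.

	/* Hypothetical helper: returns true once no tx skb on the RPC's
	 * to_free list is still held by the transmit path; only then is
	 * it safe to clear rpc->magic and free the RPC. (Simplified from
	 * the refcount test in homa_rpc_reap.)
	 */
	static bool rpc_tx_skbs_quiesced(struct homa_rpc *rpc)
	{
		struct sk_buff *skb;

		for (skb = rpc->msgout.to_free; skb;
		     skb = homa_get_skb_info(skb)->next_skb) {
			if (refcount_read(&skb->users) > 1)
				return false;	/* Still in the tx pipeline. */
		}
		return true;
	}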
@@ -659,6 +658,7 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) } #endif /* See strip.py */ rpc->state = 0; + rpc->magic = 0; kfree(rpc); } homa_sock_wakeup_wmem(hsk); From e9951908148045d222c78bd41ea9f434a406e3a9 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 15 Sep 2025 13:29:10 -0700 Subject: [PATCH 497/625] Various bug fixes and improvements to tthoma.py (Mostly affecting the rxsnapshot and txsnapshot analyzers) --- util/tthoma.py | 76 +++++++++++++++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 28 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index 55d0c3c3..9212ada0 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -1635,14 +1635,13 @@ def __qdisc_defer(self, trace, time, core, match, interests): def __qdisc_xmit(self, trace, time, core, match, interests): id = int(match.group(1)) offset = int(match.group(2)) - bytes_left = int(match.group(3)) for interest in interests: - interest.tt_qdisc_xmit(trace, time, core, id, offset, bytes_left) + interest.tt_qdisc_xmit(trace, time, core, id, offset) patterns.append({ 'name': 'qdisc_xmit', 'regexp': 'homa_qdisc_pacer queuing homa data packet for id ([0-9]+), ' - 'offset ([0-9]+), bytes_left ([0-9]+)' + 'offset ([0-9]+)' }) def __tcp_xmit(self, trace, time, core, match, interests): @@ -6319,7 +6318,7 @@ def tt_qdisc_defer(self, trace, t, core, id, offset): p = p['retransmits'][-1] p['qdisc_defer'] = t - def tt_qdisc_xmit(self, trace, t, core, id, offset, bytes_left): + def tt_qdisc_xmit(self, trace, t, core, id, offset): global packets p = packets[pkt_id(id, offset)] p['tx_node'] = trace['node'] @@ -6580,6 +6579,7 @@ def tx_end(self, rpc): of the trace, or None if there doesn't appear to be any tx activity for RPC during the traces. """ + global rpcs, traces if not 'sent' in rpc and (not rpc['send_data']): return None @@ -6588,10 +6588,23 @@ def tx_end(self, rpc): if 'end' in rpc: ceiling = rpc['end'] if not (rpc['id'] & 1): - if rpc['gro_data']: + rx_id = rpc['id']^1 + if rx_id in rpcs: + rx_rpc = rpcs[rx_id] + else: + rx_rpc = {} + if 'recvmsg_done' in rx_rpc: + ceiling = rx_rpc['recvmsg_done'] + elif 'sendmsg' in rx_rpc: + ceiling = rx_rpc['sendmsg'] + elif 'send_data_pkts' in rx_rpc and rx_rpc['send_data_pkts']: + ceiling = rx_rpc['send_data_pkts'][0]['xmit'] + elif rpc['gro_data']: ceiling = rpc['gro_data'][0][0] elif 'recvmsg_done' in rpc: ceiling = rpc['recvmsg_done'] + elif 'sent' in rx_rpc: + ceiling = traces[rx_rpc['node']]['first_time'] if rpc['send_data']: if ceiling != None: return rpc['send_data'][-1][0] @@ -7613,6 +7626,9 @@ def output(self): print('Length: Length of incoming message, if known') print('Gxmit: Highest offset for which grant has been passed ' 'to ip_*xmit') + print('RxRem: Bytes in message that haven\'t yet been received ' + '(Length - Gro);') + print(' smaller means higher SRPT priority for grants') print('GGro: Highest offset in grant that has been received by GRO') print('GSoft: Highest offset in grant that has been processed ' 'by SoftIRQ') @@ -7626,14 +7642,12 @@ def output(self): print('Copied: Offset just after last data byte that has been ' 'copied to user space') print('Incoming: Gxmit - SoftIrq') - print('Rank: Rank among RPCs receiving grants. 
Smaller means ' - 'higher priority,') - print(' blank means not grantable') print('Lost: Packets that appear to have been dropped in the network') - print(' Id Peer Length GXmit GGro GSoft ', end='') - print(' Xmit Gro SoftIrq Copied Incoming Rank Lost') - print('------------------------------------------------------', end='') - print('--------------------------------------------------') + print(' Id Peer Length RxRem GXmit GGro GSoft ', + end='') + print(' Xmit Gro SoftIrq Copied Incoming Lost') + print('--------------------------------------------------------------', end='') + print('---------------------------------------------') for id in sorted_ids: rx_rpc = rpcs[id^1] @@ -7647,14 +7661,14 @@ def output(self): incoming = live_rpc['pre_grant_xmit'] - received if incoming <= 0: incoming = '' - rank = '' - if 'rank' in rx_rpc: - rank = rx_rpc['rank'] - if rank < 0: - rank = '' - print('%10d %-10s %7s %7s %7s %7s ' % (id^1, + if rx_rpc['in_length']: + rx_rem = rx_rpc['in_length'] - live_rpc['pre_gro'] + else: + rx_rem = "" + print('%10d %-10s %7s %7s %7s %7s %7s ' % (id^1, rpcs[id]['node'] if id in rpcs else "", rx_rpc['in_length'] if rx_rpc['in_length'] != None else "", + rx_rem, str(live_rpc['pre_grant_xmit']) if live_rpc['pre_grant_xmit'] > live_rpc['unsched'] else "", str(live_rpc['pre_grant_gro']) @@ -7662,9 +7676,9 @@ def output(self): str(live_rpc['pre_grant_softirq']) if live_rpc['pre_grant_softirq'] > live_rpc['unsched'] else ""), end='') - print('%7d %7d %7d %7d %7s %4s %4d' % (live_rpc['pre_xmit2'], + print('%7d %7d %7d %7d %7s %4d' % (live_rpc['pre_xmit2'], live_rpc['pre_gro'], live_rpc['pre_softirq'], - live_rpc['pre_copied'], incoming, rank, live_rpc['lost'])) + live_rpc['pre_copied'], incoming, live_rpc['lost'])) print('\nFields in the tables below:') print('Id: Packet\'s RPC identifier on the receiver side') @@ -8696,6 +8710,9 @@ def output(self): print('Length: Length of outgoing message, if known') print('Window: Bytes that have been granted but not transmitted ' '(Gsoft - Xmit)') + print('TxRem: Bytes in message that have not yet been transmitted ' + '(Length - Xmit);') + print(' smaller means higher SRPT priority for transmission') print('Gxmit: Highest offset for which grant has been passed ' 'to ip_*xmit') print('GGro: Highest offset in grant that has been received by GRO') @@ -8712,12 +8729,12 @@ def output(self): 'copied to user space') print('Incoming: Gxmit - SoftIrq') print('Lost: Packets that appear to have been dropped in the network') - print(' Id Peer Length Window GXmit GGro GSoft ', + print(' Id Peer Length Window TxRem GXmit ', end='') - print(' Xmit Gro SoftIrq Copied Incoming Lost') - print('--------------------------------------------------------------', + print('GGro GSoft Xmit Gro SoftIrq Copied Incoming Lost') + print('---------------------------------------------------------', end='') - print('---------------------------------------------') + print('----------------------------------------------------------') for id in sorted_ids: tx_rpc = rpcs[id] @@ -8729,16 +8746,19 @@ def output(self): window = str(window) else: window = "" - print('%10d %-10s %7s %7s %7s %7s %7s ' % (id, get_rpc_node(id^1), + print('%10d %-10s %7s %7s %7s %7s ' % (id, get_rpc_node(id^1), tx_rpc['out_length'] if tx_rpc['out_length'] != None else "", window, + tx_rpc['out_length'] - live_rpc['pre_xmit2'] + if tx_rpc['out_length'] != None else "", str(live_rpc['pre_grant_xmit']) - if live_rpc['pre_grant_xmit'] > 0 else "", + if live_rpc['pre_grant_xmit'] > 0 else ""), end='') + 
print('%7s %7s %7d %7d %7d %7d %7d %4d' % ( str(live_rpc['pre_grant_gro']) if live_rpc['pre_grant_gro'] > 0 else "", str(live_rpc['pre_grant_softirq']) - if live_rpc['pre_grant_softirq'] > 0 else ""), end='') - print('%7d %7d %7d %7d %7d %4d' % (live_rpc['pre_xmit2'], + if live_rpc['pre_grant_softirq'] > 0 else "", + live_rpc['pre_xmit2'], live_rpc['pre_gro'], live_rpc['pre_softirq'], live_rpc['pre_copied'], incoming, live_rpc['lost'])) From 39641ebe73bb0229f0eafb320b28f0ebc6728b27 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 16 Sep 2025 14:21:19 -0700 Subject: [PATCH 498/625] Refactor management of homa_nets and net_devices in mock.c Make multiple net_devices easier, make APIs more obvious. --- test/mock.c | 122 +++++++++++++++++++++++++------------- test/mock.h | 8 +-- test/unit_homa_grant.c | 2 +- test/unit_homa_incoming.c | 2 +- test/unit_homa_interest.c | 2 +- test/unit_homa_offload.c | 2 +- test/unit_homa_outgoing.c | 30 +++++----- test/unit_homa_pacer.c | 2 +- test/unit_homa_peer.c | 8 +-- test/unit_homa_plumbing.c | 2 +- test/unit_homa_pool.c | 2 +- test/unit_homa_qdisc.c | 80 ++++++++++++------------- test/unit_homa_rpc.c | 2 +- test/unit_homa_sock.c | 6 +- test/unit_homa_timer.c | 2 +- test/unit_homa_utils.c | 6 +- 16 files changed, 159 insertions(+), 119 deletions(-) diff --git a/test/mock.c b/test/mock.c index 8d52108e..f684e601 100644 --- a/test/mock.c +++ b/test/mock.c @@ -249,8 +249,8 @@ static struct socket mock_socket; */ #define MOCK_MAX_NETS 10 struct net mock_nets[MOCK_MAX_NETS]; -struct homa_net mock_hnets[MOCK_MAX_NETS]; -int mock_num_hnets; +struct homa_net *mock_hnets[MOCK_MAX_NETS]; +struct net_device mock_devices[MOCK_MAX_NETS]; /* Nonzero means don't generate a unit test failure when freeing peers * if the reference count isn't zero (log a message instead). @@ -266,13 +266,6 @@ struct dst_ops mock_dst_ops = { .mtu = mock_get_mtu, .check = mock_dst_check}; struct netdev_queue mock_net_queue = {.state = 0}; -struct net_device mock_net_device = { - .gso_max_segs = 1000, - .gso_max_size = 0, - ._tx = &mock_net_queue, - .nd_net = {.net = &mock_nets[0]}, - .ethtool_ops = &mock_ethtool_ops -}; /* Number of invocations of netif_schedule_queue. */ int mock_netif_schedule_calls; @@ -347,7 +340,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, skb->_skb_refdst = 0; ip_hdr(skb)->saddr = 0; skb->truesize = SKB_TRUESIZE(size); - skb->dev = &mock_net_device; + skb->dev = &mock_devices[0]; return skb; } @@ -742,7 +735,7 @@ struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, } atomic_set(&route->dst.__rcuref.refcnt, 1); route->dst.ops = &mock_dst_ops; - route->dst.dev = &mock_net_device; + route->dst.dev = &mock_devices[0]; route->dst.obsolete = 0; if (!routes_in_use) routes_in_use = unit_hash_new(); @@ -833,7 +826,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, } atomic_set(&route->dst.__rcuref.refcnt, 1); route->dst.ops = &mock_dst_ops; - route->dst.dev = &mock_net_device; + route->dst.dev = &mock_devices[0]; route->dst.obsolete = 0; if (!routes_in_use) routes_in_use = unit_hash_new(); @@ -1651,26 +1644,6 @@ int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, return 0; } -/** - * mock_alloc_hnet: Allocate a new struct homa_net. - * @homa: struct homa that the homa_net will be associated with. - * Return: The new homa_net. 
- */ -struct homa_net *mock_alloc_hnet(struct homa *homa) -{ - struct homa_net *hnet; - - if (mock_num_hnets >= MOCK_MAX_NETS) { - FAIL("Max number of network namespaces (%d) exceeded", - MOCK_MAX_NETS); - return &mock_hnets[0]; - } - hnet = &mock_hnets[mock_num_hnets]; - homa_net_init(hnet, &mock_nets[mock_num_hnets], homa); - mock_num_hnets++; - return hnet; -} - /** * mock_alloc_pages() - Called instead of alloc_pages when Homa is compiled * for unit testing. @@ -1763,6 +1736,38 @@ void mock_data_ready(struct sock *sk) unit_log_printf("; ", "sk->sk_data_ready invoked"); } +/** + * mock_dev() - Return a net_device suitable for use in unit tests. + * @index: Index of the desired device among all those available; must + * be < MOCK_MAX_NETS. + * @homa: struct homa that the device will be associated with; may be + * needed for hnet initialization. + * Return: The specified net_device. If this is the first call for @index + * in this unit test, the device will be initialized. It will be + * associated with mock_hnet(index), which will also be initialized + * if it wasn't already initialized. + */ +struct net_device *mock_dev(int index, struct homa *homa) +{ + struct net_device *dev; + + if (index >= MOCK_MAX_NETS) { + FAIL("Index %d exceeds maximum number of network namespaces (%d)", + index, MOCK_MAX_NETS); + index = 0; + } + dev = &mock_devices[index]; + if (!dev->ethtool_ops) { + dev->gso_max_segs = 1000; + dev->gso_max_size = mock_mtu; + dev->_tx = &mock_net_queue; + dev->nd_net.net = &mock_nets[0]; + dev->ethtool_ops = &mock_ethtool_ops; + mock_hnet(index, homa); + } + return dev; +} + struct dst_entry *mock_dst_check(struct dst_entry *dst, __u32 cookie) { if (mock_check_error(&mock_dst_check_errors)) @@ -1814,6 +1819,35 @@ int mock_get_link_ksettings(struct net_device *dev, return 0; } +/** + * mock_hnet() - Return a struct homa_net suitable for use in tests. + * @index: Index of this homa_net among those available for unit tests (must + * be < MOCK_MAX_NETS) + * @homa: struct homa that the homa_net will be associated with. + * Return: The requested homa_net. If this is the first time that @index + * has been specified during this unit test, the hnet will be + * initialized. + */ +struct homa_net *mock_hnet(int index, struct homa *homa) +{ + struct homa_net *hnet; + + if (index >= MOCK_MAX_NETS) { + FAIL("Index %d exceeds maximum number of network namespaces (%d)", + index, MOCK_MAX_NETS); + index = 0; + } + hnet = mock_hnets[index]; + if (!hnet) { + hnet = malloc(sizeof(*hnet)); + mock_hnets[index] = hnet; + homa_net_init(hnet, &mock_nets[index], homa); + if (index == 0) + mock_dev(0, homa); + } + return hnet; +} + /** * mock_net_for_hnet() - Return the struct net associated with a struct * homa_net, or NULL if the struct net can't be identified. 
@@ -1824,8 +1858,8 @@ struct net *mock_net_for_hnet(struct homa_net *hnet) { int i; - for (i = 0; i < mock_num_hnets; i++) { - if (hnet == &mock_hnets[i]) + for (i = 0; i < MOCK_MAX_NETS; i++) { + if (hnet == mock_hnets[i]) return &mock_nets[i]; } return NULL; @@ -1837,9 +1871,9 @@ void *mock_net_generic(const struct net *net, unsigned int id) if (id != homa_net_id) return NULL; - for (i = 0; i < mock_num_hnets; i++) { + for (i = 0; i < MOCK_MAX_NETS; i++) { if (net == &mock_nets[i]) - return &mock_hnets[i]; + return mock_hnets[i]; } return NULL; } @@ -2187,7 +2221,7 @@ struct sk_buff *mock_skb_alloc(struct in6_addr *saddr, struct homa_common_hdr *h skb->_skb_refdst = 0; skb->hash = 3; skb->next = NULL; - skb->dev = &mock_net_device; + skb->dev = &mock_devices[0]; return skb; } @@ -2251,7 +2285,7 @@ int mock_sock_init(struct homa_sock *hsk, struct homa_net *hnet, int port) hsk->inet.pinet6 = &hsk_pinfo; mock_mtu = UNIT_TEST_DATA_PER_PACKET + hsk->ip_header_length + sizeof(struct homa_data_hdr); - mock_net_device.gso_max_size = mock_mtu; + mock_devices[0].gso_max_size = mock_mtu; err = homa_pool_set_region(hsk, (void *) 0x1000000, 100*HOMA_BPAGE_SIZE); return err; @@ -2276,7 +2310,7 @@ void mock_spin_unlock(spinlock_t *lock) */ void mock_teardown(void) { - int count; + int count, i; pcpu_hot.cpu_number = 1; pcpu_hot.current_task = &mock_task; @@ -2334,11 +2368,15 @@ void mock_teardown(void) mock_rht_num_walk_results = 0; mock_min_default_port = 0x8000; homa_net_id = 0; - mock_num_hnets = 0; + for (i = 0; i < MOCK_MAX_NETS; i++) { + if (mock_hnets[i]) { + free(mock_hnets[i]); + mock_hnets[i] = NULL; + } + } + memset(mock_devices, 0, sizeof(mock_devices)); mock_peer_free_no_fail = 0; mock_link_mbps = 10000; - mock_net_device.gso_max_size = 0; - mock_net_device.gso_max_segs = 1000; mock_netif_schedule_calls = 0; memset(inet_offloads, 0, sizeof(inet_offloads)); inet_offloads[IPPROTO_TCP] = (struct net_offload __rcu *) &tcp_offload; diff --git a/test/mock.h b/test/mock.h index 211c24b3..aa1b7aaa 100644 --- a/test/mock.h +++ b/test/mock.h @@ -150,8 +150,6 @@ extern int mock_max_grants; extern int mock_max_skb_frags; extern __u16 mock_min_default_port; extern int mock_mtu; -extern struct net_device - mock_net_device; extern struct netdev_queue mock_net_queue; extern struct net mock_nets[]; @@ -180,8 +178,6 @@ extern struct task_struct *current_task; struct page * mock_alloc_pages(gfp_t gfp, unsigned order); -struct homa_net - *mock_alloc_hnet(struct homa *homa); struct Qdisc *mock_alloc_qdisc(struct netdev_queue *dev_queue); int mock_check_error(int *errorMask); @@ -190,6 +186,8 @@ s64 mock_cmpxchg(atomic64_t *target, s64 old, s64 new); unsigned int mock_compound_order(struct page *page); int mock_cpu_to_node(int core); void mock_data_ready(struct sock *sk); +struct net_device + *mock_dev(int index, struct homa *homa); struct dst_entry *mock_dst_check(struct dst_entry *, __u32 cookie); cycles_t mock_get_cycles(void); @@ -198,6 +196,8 @@ int mock_get_link_ksettings(struct net_device *dev, unsigned int mock_get_mtu(const struct dst_entry *dst); void mock_get_page(struct page *page); +struct homa_net + *mock_hnet(int index, struct homa *homa); void *mock_kmalloc(size_t size, gfp_t flags); struct net *mock_net_for_hnet(struct homa_net *hnet); void *mock_net_generic(const struct net *net, unsigned int id); diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 6b59dd7b..5217e8fc 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -90,7 +90,7 @@ FIXTURE_SETUP(homa_grant) 
self->client_id = 1234; self->server_id = 1235; homa_init(&self->homa); - self->hnet = mock_alloc_hnet(&self->homa); + self->hnet = mock_hnet(0, &self->homa); self->homa.num_priorities = 1; self->homa.poll_cycles = 0; self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 40a8794a..928b2d23 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -96,7 +96,7 @@ FIXTURE_SETUP(homa_incoming) self->client_id = 1234; self->server_id = 1235; homa_init(&self->homa); - self->hnet = mock_alloc_hnet(&self->homa); + self->hnet = mock_hnet(0, &self->homa); #ifndef __STRIP__ /* See strip.py */ self->homa.num_priorities = 1; self->homa.poll_cycles = 0; diff --git a/test/unit_homa_interest.c b/test/unit_homa_interest.c index af6a8f8e..7a2ab0d1 100644 --- a/test/unit_homa_interest.c +++ b/test/unit_homa_interest.c @@ -56,7 +56,7 @@ FIXTURE(homa_interest) { FIXTURE_SETUP(homa_interest) { homa_init(&self->homa); - self->hnet = mock_alloc_hnet(&self->homa); + self->hnet = mock_hnet(0, &self->homa); mock_sock_init(&self->hsk, self->hnet, 0); self->client_ip = unit_get_in_addr("196.168.0.1"); self->client_port = 40000; diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index 7e106333..ffe8ddd6 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -42,7 +42,7 @@ FIXTURE_SETUP(homa_offload) int i; homa_init(&self->homa); - self->hnet = mock_alloc_hnet(&self->homa); + self->hnet = mock_hnet(0, &self->homa); self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; self->homa.unsched_bytes = 10000; mock_sock_init(&self->hsk, self->hnet, 99); diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 5b4971a1..22d021c6 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -74,6 +74,7 @@ FIXTURE(homa_outgoing) { u64 server_id; struct homa homa; struct homa_net *hnet; + struct net_device *dev; struct homa_sock hsk; union sockaddr_in_union server_addr; struct homa_peer *peer; @@ -87,7 +88,8 @@ FIXTURE_SETUP(homa_outgoing) self->client_id = 1234; self->server_id = 1235; homa_init(&self->homa); - self->hnet = mock_alloc_hnet(&self->homa); + self->hnet = mock_hnet(0, &self->homa); + self->dev = mock_dev(0, &self->homa); mock_clock = 10000; #ifndef __STRIP__ /* See strip.py */ self->homa.pacer->cycles_per_mbyte = 1000000; @@ -441,7 +443,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_geometry_hijacking) self->hsk.sock.sk_protocol = IPPROTO_TCP; /* First try: not quite enough space for 3 packets in GSO. */ - mock_net_device.gso_max_size = mock_mtu - 1 + + self->dev->gso_max_size = mock_mtu - 1 + 2 * UNIT_TEST_DATA_PER_PACKET; homa_rpc_lock(crpc1); ASSERT_EQ(0, -homa_message_out_fill(crpc1, @@ -450,7 +452,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_geometry_hijacking) EXPECT_SUBSTR("max_seg_data 1400, max_gso_data 2800", unit_log_get()); /* Second try: just barely enough space for 3 packets in GSO. */ - mock_net_device.gso_max_size += 1; + self->dev->gso_max_size += 1; unit_log_clear(); homa_rpc_lock(crpc2); ASSERT_EQ(0, -homa_message_out_fill(crpc2, @@ -468,7 +470,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_geometry_no_hijacking) mock_set_ipv6(&self->hsk); /* First try: not quite enough space for 3 packets in GSO. 
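 	 * (With 1400-byte segments, max_gso_data should land at 2800
 	 * here, matching the log check below.)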
*/ - mock_net_device.gso_max_size = mock_mtu - 1 + + self->dev->gso_max_size = mock_mtu - 1 + 2 * (UNIT_TEST_DATA_PER_PACKET + sizeof(struct homa_seg_hdr)); ASSERT_EQ(0, -homa_message_out_fill(crpc1, @@ -479,7 +481,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_geometry_no_hijacking) /* Second try: just barely enough space for 3 packets in GSO. */ crpc2 = homa_rpc_alloc_client(&self->hsk, &self->server_addr); ASSERT_FALSE(crpc2 == NULL); - mock_net_device.gso_max_size += 1; + self->dev->gso_max_size += 1; unit_log_clear(); ASSERT_EQ(0, -homa_message_out_fill(crpc2, unit_iov_iter((void *) 1000, 10000), 0)); @@ -493,7 +495,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_limit_less_than_mtu) ASSERT_FALSE(crpc == NULL); unit_log_clear(); - mock_net_device.gso_max_size = 10000; + self->dev->gso_max_size = 10000; self->homa.max_gso_size = 1000; ASSERT_EQ(0, -homa_message_out_fill(crpc, unit_iov_iter((void *) 1000, 5000), 0)); @@ -507,7 +509,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__disable_overlap_xmit_because_of_hom &self->server_addr); struct homa_qdisc_dev *qdev; - qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); ASSERT_FALSE(crpc == NULL); ASSERT_EQ(0, -homa_message_out_fill(crpc, @@ -525,7 +527,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__multiple_segs_per_skbuff) &self->server_addr); ASSERT_FALSE(crpc == NULL); - mock_net_device.gso_max_size = 5000; + self->dev->gso_max_size = 5000; unit_log_clear(); ASSERT_EQ(0, -homa_message_out_fill(crpc, unit_iov_iter((void *) 1000, 10000), 0)); @@ -886,7 +888,7 @@ TEST_F(homa_outgoing, homa_xmit_data__dont_throttle_because_homa_qdisc_in_use) self->server_port, self->client_id, 2000, 1000); struct homa_qdisc_dev *qdev; - qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); unit_log_clear(); atomic64_set(&self->homa.pacer->link_idle_time, 1000000); self->homa.pacer->max_nic_queue_cycles = 0; @@ -1067,7 +1069,7 @@ TEST_F(homa_outgoing, homa_resend_data__basics) { struct homa_rpc *crpc; - mock_net_device.gso_max_size = 5000; + self->dev->gso_max_size = 5000; crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 16000, 1000); @@ -1138,7 +1140,7 @@ TEST_F(homa_outgoing, homa_resend_data__cant_allocate_skb) { struct homa_rpc *crpc; - mock_net_device.gso_max_size = 5000; + self->dev->gso_max_size = 5000; crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 16000, 1000); @@ -1154,7 +1156,7 @@ TEST_F(homa_outgoing, homa_resend_data__set_incoming) { struct homa_rpc *crpc; - mock_net_device.gso_max_size = 5000; + self->dev->gso_max_size = 5000; crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 16000, 1000); @@ -1168,7 +1170,7 @@ TEST_F(homa_outgoing, homa_resend_data__error_copying_data) { struct homa_rpc *crpc; - mock_net_device.gso_max_size = 5000; + self->dev->gso_max_size = 5000; crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, 16000, 1000); @@ -1187,7 +1189,7 @@ TEST_F(homa_outgoing, homa_resend_data__add_to_to_free_and_set_homa_info) struct homa_skb_info *homa_info; mock_set_ipv6(&self->hsk); - mock_net_device.gso_max_size = 5000; + self->dev->gso_max_size = 5000; crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, 
self->server_ip, self->server_port, self->client_id, 16000, 1000); diff --git a/test/unit_homa_pacer.c b/test/unit_homa_pacer.c index afa1290b..0baeab8f 100644 --- a/test/unit_homa_pacer.c +++ b/test/unit_homa_pacer.c @@ -64,7 +64,7 @@ FIXTURE_SETUP(homa_pacer) self->client_id = 1234; self->server_id = 1235; homa_init(&self->homa); - self->hnet = mock_alloc_hnet(&self->homa); + self->hnet = mock_hnet(0, &self->homa); self->homa.pacer->cycles_per_mbyte = 1000000; self->homa.pacer->throttle_min_bytes = 0; #ifndef __STRIP__ /* See strip.py */ diff --git a/test/unit_homa_peer.c b/test/unit_homa_peer.c index 6890dc3f..da09c673 100644 --- a/test/unit_homa_peer.c +++ b/test/unit_homa_peer.c @@ -27,7 +27,7 @@ FIXTURE(homa_peer) { FIXTURE_SETUP(homa_peer) { homa_init(&self->homa); - self->hnet = mock_alloc_hnet(&self->homa); + self->hnet = mock_hnet(0, &self->homa); mock_sock_init(&self->hsk, self->hnet, 0); self->client_ip[0] = unit_get_in_addr("196.168.0.1"); self->server_ip[0] = unit_get_in_addr("1.2.3.4"); @@ -130,7 +130,7 @@ TEST_F(homa_peer, homa_peer_free_net__basics) struct homa_sock hsk2; struct homa_net *hnet2; - hnet2 = mock_alloc_hnet(&self->homa); + hnet2 = mock_hnet(1, &self->homa); mock_sock_init(&hsk2, hnet2, 44); peer = homa_peer_get(&self->hsk, ip1111); @@ -208,7 +208,7 @@ TEST_F(homa_peer, homa_peer_prefer_evict) struct homa_net *hnet2; struct homa_sock hsk2; - hnet2 = mock_alloc_hnet(&self->homa); + hnet2 = mock_hnet(1, &self->homa); mock_sock_init(&hsk2, hnet2, 44); peer1 = homa_peer_get(&self->hsk, ip1111); @@ -312,7 +312,7 @@ TEST_F(homa_peer, homa_peer_pick_victims__filter_idle_jiffies_max) struct homa_net *hnet2; struct homa_sock hsk2; - hnet2 = mock_alloc_hnet(&self->homa); + hnet2 = mock_hnet(1, &self->homa); mock_sock_init(&hsk2, hnet2, 44); hnet2->num_peers = peertab->net_max + 1; diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index cf996b7b..2eea9fe6 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -59,7 +59,7 @@ FIXTURE_SETUP(homa_plumbing) homa_init(&self->homa); if (self->homa.wmem_max == 0) printf("homa_plumbing fixture found wmem_max 0\n"); - self->hnet = mock_alloc_hnet(&self->homa); + self->hnet = mock_hnet(0, &self->homa); mock_sock_init(&self->hsk, self->hnet, 0); self->client_addr.in6.sin6_family = self->hsk.inet.sk.sk_family; self->server_addr.in6.sin6_family = self->hsk.inet.sk.sk_family; diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index e01b6716..717c6b18 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -21,7 +21,7 @@ FIXTURE(homa_pool) { FIXTURE_SETUP(homa_pool) { homa_init(&self->homa); - self->hnet = mock_alloc_hnet(&self->homa); + self->hnet = mock_hnet(0, &self->homa); #ifndef __STRIP__ /* See strip.py */ self->homa.unsched_bytes = 10000; self->homa.grant->window = 10000; diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c index c31a5f6b..680358bf 100644 --- a/test/unit_homa_qdisc.c +++ b/test/unit_homa_qdisc.c @@ -83,7 +83,7 @@ FIXTURE(homa_qdisc) { struct homa homa; struct homa_net *hnet; struct in6_addr addr; - struct net_device dev; + struct net_device *dev; #define NUM_TXQS 4 struct netdev_queue txqs[NUM_TXQS]; struct Qdisc *qdiscs[NUM_TXQS]; @@ -103,13 +103,13 @@ FIXTURE_SETUP(homa_qdisc) homa_qdisc_register(); homa_init(&self->homa); - self->hnet = mock_alloc_hnet(&self->homa); + self->hnet = mock_hnet(0, &self->homa); self->addr = unit_get_in_addr("1.2.3.4"); - memset(&self->dev, 0, sizeof(self->dev)); - self->dev._tx = self->txqs; - self->dev.num_tx_queues = 
NUM_TXQS; - self->dev.nd_net.net = mock_net_for_hnet(self->hnet); - self->dev.ethtool_ops = &self->ethtool_ops; + self->dev = mock_dev(0, &self->homa); + self->dev->_tx = self->txqs; + self->dev->num_tx_queues = NUM_TXQS; + self->dev->nd_net.net = mock_net_for_hnet(self->hnet); + self->dev->ethtool_ops = &self->ethtool_ops; memset(&self->ethtool_ops, 0, sizeof(self->ethtool_ops)); self->ethtool_ops.get_link_ksettings = mock_get_link_ksettings; @@ -117,11 +117,11 @@ FIXTURE_SETUP(homa_qdisc) memset(&self->qdiscs, 0, sizeof(self->qdiscs)); for (i = 0; i < NUM_TXQS; i++) { self->txqs[i].state = 0; - self->txqs[i].dev = &self->dev; + self->txqs[i].dev = self->dev; self->qdiscs[i] = mock_alloc_qdisc(&self->txqs[i]); self->txqs[i].qdisc = self->qdiscs[i]; } - mock_net_queue.dev = &mock_net_device; + mock_net_queue.dev = self->dev; self->client_ip = unit_get_in_addr("196.168.0.1"); self->server_ip = unit_get_in_addr("1.2.3.4"); @@ -157,7 +157,7 @@ TEST_F(homa_qdisc, homa_qdisc_qdev_get__create_new) { struct homa_qdisc_dev *qdev; - qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); EXPECT_FALSE(IS_ERR(qdev)); EXPECT_EQ(1, qdev->refs); @@ -167,11 +167,11 @@ TEST_F(homa_qdisc, homa_qdisc_get__use_existing) { struct homa_qdisc_dev *qdev; - qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); EXPECT_FALSE(IS_ERR(qdev)); EXPECT_EQ(1, qdev->refs); - EXPECT_EQ(qdev, homa_qdisc_qdev_get(self->hnet, &mock_net_device)); + EXPECT_EQ(qdev, homa_qdisc_qdev_get(self->hnet, self->dev)); EXPECT_EQ(2, qdev->refs); homa_qdisc_qdev_put(qdev); @@ -182,7 +182,7 @@ TEST_F(homa_qdisc, homa_qdisc_qdev_get__kmalloc_failure) struct homa_qdisc_dev *qdev; mock_kmalloc_errors = 1; - qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); EXPECT_TRUE(IS_ERR(qdev)); EXPECT_EQ(ENOMEM, -PTR_ERR(qdev)); } @@ -191,7 +191,7 @@ TEST_F(homa_qdisc, homa_qdisc_qdev_get__cant_create_thread) struct homa_qdisc_dev *qdev; mock_kthread_create_errors = 1; - qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); EXPECT_TRUE(IS_ERR(qdev)); EXPECT_EQ(EACCES, -PTR_ERR(qdev)); } @@ -200,9 +200,9 @@ TEST_F(homa_qdisc, homa_qdisc_qdev_put) { struct homa_qdisc_dev *qdev, *qdev2; - qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); EXPECT_FALSE(IS_ERR(qdev)); - homa_qdisc_qdev_get(self->hnet, &mock_net_device); + homa_qdisc_qdev_get(self->hnet, self->dev); EXPECT_EQ(2, qdev->refs); homa_qdisc_qdev_put(qdev); @@ -288,7 +288,7 @@ TEST_F(homa_qdisc, _homa_qdisc_homa_qdisc_set_qixs_object) { struct homa_qdisc_dev *qdev; - qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); /* Simple working case. 
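 	 * (homa_qdisc_set_qixs is expected to assign pacer_qix and
 	 * redirect_qix from the device's netdev_queues.)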
*/ homa_qdisc_set_qixs(qdev); @@ -541,7 +541,7 @@ TEST_F(homa_qdisc, homa_qdisc_defer_homa__basics) &self->server_ip, self->client_port, self->server_id + 6, 10000, 10000); - qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); homa_qdisc_defer_homa(qdev, new_test_skb(srpc1, &self->addr, 5000, 1500)); @@ -575,7 +575,7 @@ TEST_F(homa_qdisc, homa_qdisc_defer_homa__multiple_pkts_for_rpc) &self->server_ip, self->client_port, self->server_id + 2, 10000, 10000); - qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); homa_qdisc_defer_homa(qdev, new_test_skb(srpc1, &self->addr, 1000, 1500)); @@ -604,7 +604,7 @@ TEST_F(homa_qdisc, homa_qdisc_defer_homa__dont_update_tx_left) self->server_id, 10000, 10000); srpc->qrpc.tx_left = 2000; - qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 5000, 500)); EXPECT_EQ(2000, srpc->qrpc.tx_left); @@ -622,7 +622,7 @@ TEST_F(homa_qdisc, homa_qdisc_defer_homa__throttled_cycles_metric) &self->server_ip, self->client_port, self->server_id + 2, 10000, 10000); - qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); mock_clock = 5000; homa_qdisc_defer_homa(qdev, @@ -648,7 +648,7 @@ TEST_F(homa_qdisc, homa_qdisc_defer_homa__wake_up_pacer) &self->server_ip, self->client_port, self->server_id, 10000, 10000); - qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); skb = new_test_skb(srpc, &self->addr, 5000, 1500); unit_log_clear(); @@ -676,7 +676,7 @@ TEST_F(homa_qdisc, homa_qdisc_insert_rb__basics) &self->server_ip, self->client_port, self->server_id + 4, 10000, 10000); - qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); homa_qdisc_defer_homa(qdev, new_test_skb(srpc1, &self->addr, 5000, 1500)); @@ -710,7 +710,7 @@ TEST_F(homa_qdisc, homa_qdisc_insert_rb__long_left_chain) &self->server_ip, self->client_port, self->server_id + 6, 10000, 10000); - qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); homa_qdisc_defer_homa(qdev, new_test_skb(srpc1, &self->addr, 5000, 1500)); @@ -747,7 +747,7 @@ TEST_F(homa_qdisc, homa_qdisc_insert_rb__long_right_chain) &self->server_ip, self->client_port, self->server_id +6 , 10000, 10000); - qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); homa_qdisc_defer_homa(qdev, new_test_skb(srpc1, &self->addr, 5000, 1500)); @@ -771,7 +771,7 @@ TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__no_deferred_rpcs) { struct homa_qdisc_dev *qdev; - qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); EXPECT_EQ(NULL, homa_qdisc_dequeue_homa(qdev)); @@ -788,7 +788,7 @@ TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__multiple_packets_for_rpc) self->server_id, 10000, 10000); ASSERT_NE(NULL, srpc); - qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); skb = new_test_skb(srpc, &self->addr, 2000, 500); homa_qdisc_defer_homa(qdev, skb); @@ -820,7 +820,7 @@ TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__last_packet_for_rpc) self->server_id + 2, 10000, 10000); ASSERT_NE(NULL, srpc2); - qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + qdev = 
homa_qdisc_qdev_get(self->hnet, self->dev); skb = new_test_skb(srpc1, &self->addr, 5000, 500); homa_qdisc_defer_homa(qdev, skb); @@ -848,7 +848,7 @@ TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__update_tx_left) self->server_id, 10000, 10000); ASSERT_NE(NULL, srpc); - qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 3000, 500)); homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 4000, 500)); @@ -874,7 +874,7 @@ TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__throttled_cycles_metric) self->server_id, 10000, 10000); ASSERT_NE(NULL, srpc); - qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); mock_clock = 5000; homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 2000, 500)); @@ -903,7 +903,7 @@ TEST_F(homa_qdisc, homa_qdisc_free_homa) self->server_id, 10000, 10000); ASSERT_NE(NULL, srpc); - qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 1000, 500)); homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 2000, 500)); @@ -1017,7 +1017,7 @@ TEST_F(homa_qdisc, homa_qdisc_update_link_idle__pacer_bytes_metric) self->server_id, 10000, 10000); ASSERT_NE(NULL, srpc); - qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); ASSERT_FALSE(IS_ERR(qdev)); /* No deferred packets. */ @@ -1037,7 +1037,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer_main__basics) { struct homa_qdisc_dev *qdev; - qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); EXPECT_FALSE(IS_ERR(qdev)); unit_hook_register(pacer_sleep_hook); @@ -1055,7 +1055,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__queue_empty) { struct homa_qdisc_dev *qdev; - qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); unit_log_clear(); homa_qdisc_pacer(qdev); @@ -1075,7 +1075,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__pacer_lock_unavailable) self->server_id, 10000, 10000); ASSERT_NE(NULL, srpc); - qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); link_idle = atomic64_read(&qdev->link_idle_time); homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); @@ -1104,7 +1104,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__enqueue_packet) self->server_id, 10000, 10000); ASSERT_NE(NULL, srpc); - qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); link_idle = atomic64_read(&qdev->link_idle_time); homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); @@ -1131,7 +1131,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__spin_until_link_idle) self->server_id, 10000, 10000); ASSERT_NE(NULL, srpc); - qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); qdev->pacer_qix = 3; EXPECT_EQ(0, self->qdiscs[3]->q.qlen); @@ -1172,7 +1172,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__return_after_one_packet) self->server_id + 2, 10000, 10000); ASSERT_NE(NULL, srpc2); - qdev = homa_qdisc_qdev_get(self->hnet, &self->dev); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], 
NULL, NULL)); qdev->pacer_qix = 3; EXPECT_EQ(0, self->qdiscs[3]->q.qlen); @@ -1325,7 +1325,7 @@ TEST_F(homa_qdisc, homa_qdisc_update_sysctl__basics) { struct homa_qdisc_dev *qdev; - qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); EXPECT_FALSE(IS_ERR(qdev)); self->homa.link_mbps = 25000; @@ -1340,7 +1340,7 @@ TEST_F(homa_qdisc, homa_qdisc_update_sysctl__cant_get_link_speed_from_dev) { struct homa_qdisc_dev *qdev; - qdev = homa_qdisc_qdev_get(self->hnet, &mock_net_device); + qdev = homa_qdisc_qdev_get(self->hnet, self->dev); EXPECT_FALSE(IS_ERR(qdev)); self->homa.link_mbps = 16000; diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 3c3cbac0..6040bfe3 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -42,7 +42,7 @@ FIXTURE_SETUP(homa_rpc) self->server_addr.in6.sin6_addr = *self->server_ip; self->server_addr.in6.sin6_port = htons(self->server_port); homa_init(&self->homa); - self->hnet = mock_alloc_hnet(&self->homa); + self->hnet = mock_hnet(0, &self->homa); #ifndef __STRIP__ /* See strip.py */ self->homa.unsched_bytes = 10000; self->homa.grant->window = 10000; diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c index d44e9e76..f4770355 100644 --- a/test/unit_homa_sock.c +++ b/test/unit_homa_sock.c @@ -40,7 +40,7 @@ FIXTURE(homa_sock) { FIXTURE_SETUP(homa_sock) { homa_init(&self->homa); - self->hnet = mock_alloc_hnet(&self->homa); + self->hnet = mock_hnet(0, &self->homa); mock_sock_init(&self->hsk, self->hnet, 0); self->client_ip[0] = unit_get_in_addr("196.168.0.1"); self->client_port = 40000; @@ -60,7 +60,7 @@ TEST_F(homa_sock, homa_socktab_destroy) struct homa_sock hsk1, hsk2, hsk3; struct homa_net *hnet; - hnet = mock_alloc_hnet(&self->homa); + hnet = mock_hnet(1, &self->homa); mock_sock_init(&hsk1, hnet, 100); mock_sock_init(&hsk2, hnet, 101); mock_sock_init(&hsk3, self->hnet, 100); @@ -350,7 +350,7 @@ TEST_F(homa_sock, homa_sock_find__same_port_in_different_hnets) struct homa_sock *hsk; struct homa_net *hnet; - hnet = mock_alloc_hnet(&self->homa); + hnet = mock_hnet(1, &self->homa); mock_sock_init(&hsk1, self->hnet, 100); mock_sock_init(&hsk2, hnet, 100); diff --git a/test/unit_homa_timer.c b/test/unit_homa_timer.c index 443ccafe..d578c32b 100644 --- a/test/unit_homa_timer.c +++ b/test/unit_homa_timer.c @@ -34,7 +34,7 @@ FIXTURE_SETUP(homa_timer) self->server_addr.in6.sin6_addr = *self->server_ip; self->server_addr.in6.sin6_port = htons(self->server_port); homa_init(&self->homa); - self->hnet = mock_alloc_hnet(&self->homa); + self->hnet = mock_hnet(0, &self->homa); self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; self->homa.resend_ticks = 2; self->homa.timer_ticks = 100; diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index 3a3cc6d3..de3c4424 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -20,7 +20,7 @@ FIXTURE(homa_utils) { FIXTURE_SETUP(homa_utils) { homa_init(&self->homa); - self->hnet = mock_alloc_hnet(&self->homa); + self->hnet = mock_hnet(0, &self->homa); mock_sock_init(&self->hsk, self->hnet, 0); unit_log_clear(); } @@ -132,7 +132,7 @@ TEST_F(homa_utils, homa_net_destroy__delete_sockets) struct homa_sock hsk1, hsk2, hsk3; struct homa_net *hnet; - hnet = mock_alloc_hnet(&self->homa); + hnet = mock_hnet(1, &self->homa); mock_sock_init(&hsk1, hnet, 100); mock_sock_init(&hsk2, hnet, 101); mock_sock_init(&hsk3, self->hnet, 100); @@ -151,7 +151,7 @@ TEST_F(homa_utils, homa_net_destroy__delete_peers) struct homa_sock hsk2; struct in6_addr addr; - hnet = 
mock_alloc_hnet(&self->homa); + hnet = mock_hnet(1, &self->homa); mock_sock_init(&hsk2, hnet, 44); addr = unit_get_in_addr("1.2.3.4"); From fe0b430b5234a6ebd7364afbb88ca140d43d0343 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 16 Sep 2025 15:09:07 -0700 Subject: [PATCH 499/625] Refactor structures that keep track of all homa_qdisc_devs This allows RCU scanning of all homa_qdisc_devs. --- homa_devel.c | 16 +- homa_devel.h | 2 +- homa_impl.h | 22 +-- homa_outgoing.c | 4 +- homa_qdisc.c | 221 ++++++++++++++++++++++---- homa_qdisc.h | 58 ++++++- homa_utils.c | 15 +- test/mock.c | 4 + test/unit_homa_outgoing.c | 15 +- test/unit_homa_qdisc.c | 315 ++++++++++++++++++++++++++++---------- test/unit_homa_utils.c | 4 +- 11 files changed, 520 insertions(+), 156 deletions(-) diff --git a/homa_devel.c b/homa_devel.c index 46640d15..8bbd0972 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -1112,22 +1112,24 @@ void homa_rpc_stats_log(void) /** * homa_rpcs_deferred() - Return true if there are any RPCs with packets * that have been deferred by homa_qdisc, false if there are none. - * @hnet: Consider only RPCs associatged with this network namespace. + * @homa: Overall information about the Homa protocol. * Return: See above. */ -bool homa_rpcs_deferred(struct homa_net *hnet) +bool homa_rpcs_deferred(struct homa *homa) { - struct homa_qdisc_dev *qdev; + struct homa_qdisc_qdevs *qdevs = homa->qdevs; bool result = false; + int num_qdevs, i; - mutex_lock(&hnet->qdisc_devs_mutex); - list_for_each_entry(qdev, &hnet->qdisc_devs, links) { - if (homa_qdisc_any_deferred(qdev)) { + rcu_read_lock(); + num_qdevs = READ_ONCE(qdevs->num_qdevs); + for (i = 0; i < num_qdevs; i++) { + if (homa_qdisc_any_deferred(qdevs->qdevs[i])) { result = true; break; } } - mutex_unlock(&hnet->qdisc_devs_mutex); + rcu_read_unlock(); return result; } diff --git a/homa_devel.h b/homa_devel.h index 8307100f..8d091eb6 100644 --- a/homa_devel.h +++ b/homa_devel.h @@ -128,7 +128,7 @@ void homa_rpc_log_tt(struct homa_rpc *rpc); void homa_rpc_log_active_tt(struct homa *homa, int freeze_count); void homa_rpc_snapshot_log_tt(void); void homa_rpc_stats_log(void); -bool homa_rpcs_deferred(struct homa_net *hnet); +bool homa_rpcs_deferred(struct homa *homa); void homa_snapshot_get_stats(struct homa_rpc_snapshot *snap); void homa_snapshot_rpcs(void); int homa_snprintf(char *buffer, int size, int used, diff --git a/homa_impl.h b/homa_impl.h index 1920375c..93d99e51 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -119,6 +119,12 @@ struct homa { * grants for incoming messages. */ struct homa_grant *grant; + + /** + * @qdevs: Contains information used by homa_qdisc.c to manage + * homa_qdisc_qdevs for this struct homa. + */ + struct homa_qdisc_qdevs *qdevs; #endif /* See strip.py */ /** @@ -517,22 +523,6 @@ struct homa_net { * for this namespace. Managed by homa_peer.c under the peertab lock. */ int num_peers; - -#ifndef __STRIP__ /* See strip.py */ - /** - * @qdisc_devs: List of all homa_qdisc_dev objects that exist for - * this namespace. Protected by qdisc_devs_mutex. - */ - struct list_head qdisc_devs; - - /** - * @qdisc_devs_mutex: Used to synchronize operations on @qdisc_devs - * (creation and deletion of qdiscs). Must be a mutex rather than - * a spinlock because homa_qdisc_dev_get calls functions that may - * blocko. 
- */
-	struct mutex qdisc_devs_mutex;
-#endif /* See strip.py */
 };
 
 /**
diff --git a/homa_outgoing.c b/homa_outgoing.c
index d69f89ab..b53fad69 100644
--- a/homa_outgoing.c
+++ b/homa_outgoing.c
@@ -321,7 +321,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit)
 
 #ifndef __STRIP__ /* See strip.py */
 	overlap_xmit = rpc->msgout.length > 2 * max_gso_data;
-	if (homa_qdisc_active(rpc->hsk->hnet))
+	if (homa_qdisc_active(rpc->hsk->homa))
 		overlap_xmit = 0;
 	rpc->msgout.granted = rpc->msgout.unscheduled;
 #endif /* See strip.py */
@@ -598,7 +598,7 @@ void homa_xmit_data(struct homa_rpc *rpc)
 		if (rpc->msgout.length - rpc->msgout.next_xmit_offset >
 				homa->pacer->throttle_min_bytes &&
-				!homa_qdisc_active(rpc->hsk->hnet)) {
+				!homa_qdisc_active(rpc->hsk->homa)) {
 			if (!homa_pacer_check_nic_q(homa->pacer, skb, force)) {
 				tt_record1("homa_xmit_data adding id %u to throttle queue",
 					   rpc->id);
diff --git a/homa_qdisc.c b/homa_qdisc.c
index dfa4b39a..11227f0f 100755
--- a/homa_qdisc.c
+++ b/homa_qdisc.c
@@ -34,7 +34,7 @@ static struct Qdisc_ops homa_qdisc_ops __read_mostly = {
 
 /**
  * homa_qdisc_register() - Invoked when the Homa module is loaded; makes
- * the homa qdisk known to Linux.
+ * the homa qdisc known to Linux.
  * Return: 0 for success or a negative errno if an error occurred.
  */
 int homa_qdisc_register(void)
@@ -51,35 +51,146 @@ void homa_qdisc_unregister(void)
 	unregister_qdisc(&homa_qdisc_ops);
 }
 
+/**
+ * homa_rcu_kfree() - Call kfree on a block of memory when it is safe to
+ * do so from an RCU standpoint. If possible, the freeing is done
+ * asynchronously.
+ * @object:   Eventually invoke kfree on this.
+ */
+void homa_rcu_kfree(void *object)
+{
+	struct homa_rcu_kfreer *freer;
+
+	freer = kmalloc(sizeof(*freer), GFP_KERNEL);
+	if (!freer) {
+		/* Can't allocate memory needed for asynchronous freeing,
+		 * so free synchronously.
+		 */
+		UNIT_LOG("; ", "homa_rcu_kfree kmalloc failed");
+		synchronize_rcu();
+		kfree(object);
+	} else {
+		freer->object = object;
+		call_rcu(&freer->rcu_head, homa_rcu_kfree_callback);
+	}
+}
+
+/**
+ * homa_rcu_kfree_callback() - This function is invoked by the RCU subsystem
+ * when it is safe to free an object previously passed to homa_rcu_kfree.
+ * @head:   Points to the rcu_head member of a struct homa_rcu_kfreer.
+ */
+void homa_rcu_kfree_callback(struct rcu_head *head)
+{
+	struct homa_rcu_kfreer *freer;
+
+	freer = container_of(head, struct homa_rcu_kfreer, rcu_head);
+	kfree(freer->object);
+	kfree(freer);
+}
+
+/**
+ * homa_qdisc_qdevs_alloc() - Allocate and initialize a new homa_qdisc_qdevs
+ * object.
+ * Return: The new object, or an ERR_PTR if an error occurred.
+ */
+struct homa_qdisc_qdevs *homa_qdisc_qdevs_alloc(void)
+{
+	struct homa_qdisc_qdevs *qdevs;
+
+	qdevs = kzalloc(sizeof(*qdevs), GFP_KERNEL);
+	if (!qdevs)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&qdevs->mutex);
+	return qdevs;
+}
+
+/**
+ * homa_qdisc_qdevs_free() - Invoked when a struct homa is being freed;
+ * releases information related to all the associated homa_qdiscs.
+ * @qdevs:    Information about homa_qdisc_devs associated with a
+ *            particular struct homa.
+ */
+void homa_qdisc_qdevs_free(struct homa_qdisc_qdevs *qdevs)
+{
+	struct homa_qdisc_dev *qdev;
+
+	/* At this point no-one besides us should ever access this
+	 * object again, but lock it just to be safe.
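+	 * (If every homa_qdisc_qdev_get has been balanced by a call to
+	 * homa_qdisc_qdev_put, num_qdevs will already be zero here.)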
+	 */
+	mutex_lock(&qdevs->mutex);
+	if (qdevs->num_qdevs > 0) {
+		pr_err("homa_qdisc_qdevs_free found %d live qdevs (should have been none)\n",
+		       qdevs->num_qdevs);
+
+		/* We can't safely free the stranded qdevs, but at least
+		 * stop their pacer threads to reduce the likelihood
+		 * of dereferencing dangling pointers.
+		 */
+		while (qdevs->num_qdevs > 0) {
+			qdevs->num_qdevs--;
+			qdev = qdevs->qdevs[qdevs->num_qdevs];
+			qdevs->qdevs[qdevs->num_qdevs] = NULL;
+			kthread_stop(qdev->pacer_kthread);
+			qdev->pacer_kthread = NULL;
+		}
+	}
+	mutex_unlock(&qdevs->mutex);
+	homa_rcu_kfree(qdevs->qdevs);
+	homa_rcu_kfree(qdevs);
+}
+
 /**
  * homa_qdisc_qdev_get() - Find the homa_qdisc_dev to use for a particular
  * net_device and increment its reference count. Create a new one if there
- * isn't an existing one to use.
- * @hnet:    Network namespace for the homa_qdisc_dev.
+ * isn't an existing one to use. Do this in an RCU-safe fashion.
  * @dev:     NIC that the homa_qdisc_dev will manage.
- * Return:   A pointer to the new homa_qdisc_dev, or a PTR_ERR errno.
+ * Return:   A pointer to the homa_qdisc_dev, or a PTR_ERR errno.
  */
-struct homa_qdisc_dev *homa_qdisc_qdev_get(struct homa_net *hnet,
-					   struct net_device *dev)
+struct homa_qdisc_dev *homa_qdisc_qdev_get(struct net_device *dev)
 {
+	struct homa_qdisc_dev **new_qdevs, **old_qdevs;
+	struct homa_qdisc_qdevs *qdevs;
 	struct homa_qdisc_dev *qdev;
+	struct homa_net *hnet;
+	int num_qdevs, i;
+
+	rcu_read_lock();
+	hnet = homa_net(dev_net(dev));
+	qdevs = hnet->homa->qdevs;
+	num_qdevs = READ_ONCE(qdevs->num_qdevs);
+	for (i = 0; i < num_qdevs; i++) {
+		qdev = READ_ONCE(qdevs->qdevs[i]);
+		if (qdev->dev == dev && refcount_inc_not_zero(&qdev->refs)) {
+			rcu_read_unlock();
+			return qdev;
+		}
+	}
+	rcu_read_unlock();
 
-	mutex_lock(&hnet->qdisc_devs_mutex);
-	list_for_each_entry(qdev, &hnet->qdisc_devs, links) {
-		if (qdev->dev == dev) {
-			qdev->refs++;
+	/* Must allocate a new homa_qdisc_dev (but must check again,
+	 * after acquiring the mutex, in case someone else already
+	 * created it).
+	 */
+	mutex_lock(&qdevs->mutex);
+	for (i = 0; i < qdevs->num_qdevs; i++) {
+		qdev = READ_ONCE(qdevs->qdevs[i]);
+		if (qdev->dev == dev && refcount_inc_not_zero(&qdev->refs)) {
+			UNIT_LOG("; ", "race in homa_qdisc_qdev_get");
 			goto done;
 		}
 	}
-	qdev = kzalloc(sizeof(*qdev), GFP_ATOMIC);
+	qdev = kzalloc(sizeof(*qdev), GFP_KERNEL);
 	if (!qdev) {
 		qdev = ERR_PTR(-ENOMEM);
 		goto done;
 	}
 	qdev->dev = dev;
 	qdev->hnet = hnet;
-	qdev->refs = 1;
+	refcount_set(&qdev->refs, 1);
 	qdev->pacer_qix = -1;
 	qdev->redirect_qix = -1;
 	homa_qdisc_update_sysctl(qdev);
@@ -101,10 +212,21 @@ struct homa_qdisc_dev *homa_qdisc_qdev_get(struct homa_net *hnet,
 		qdev = ERR_PTR(error);
 		goto done;
 	}
-	list_add(&qdev->links, &hnet->qdisc_devs);
+
+	/* Note: the order below matters, because there could be concurrent
+	 * scans of qdevs->qdevs.
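+	 * A concurrent RCU reader does, in effect:
+	 *
+	 *	rcu_read_lock();
+	 *	n = READ_ONCE(qdevs->num_qdevs);
+	 *	for (i = 0; i < n; i++)
+	 *		qdev = READ_ONCE(qdevs->qdevs[i]);
+	 *	rcu_read_unlock();
+	 *
+	 * so the new array must be fully filled in before it is published
+	 * with smp_store_release, num_qdevs may grow only after the new
+	 * array is visible, and the old array must be released with
+	 * homa_rcu_kfree rather than kfree.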
+	 */
+	new_qdevs = kzalloc((qdevs->num_qdevs + 1) * sizeof(*new_qdevs),
+			    GFP_KERNEL);
+	old_qdevs = qdevs->qdevs;
+	memcpy(new_qdevs, old_qdevs, qdevs->num_qdevs * sizeof(*new_qdevs));
+	new_qdevs[qdevs->num_qdevs] = qdev;
+	smp_store_release(&qdevs->qdevs, new_qdevs);
+	qdevs->num_qdevs++;
+	homa_rcu_kfree(old_qdevs);
 
 done:
-	mutex_unlock(&hnet->qdisc_devs_mutex);
+	mutex_unlock(&qdevs->mutex);
 	return qdev;
 }
 
@@ -115,20 +237,48 @@ struct homa_qdisc_dev *homa_qdisc_qdev_get(struct homa_net *hnet,
  */
 void homa_qdisc_qdev_put(struct homa_qdisc_dev *qdev)
 {
-	struct homa_net *hnet = qdev->hnet;
+	struct homa_qdisc_qdevs *qdevs;
+	int i;
 
-	mutex_lock(&hnet->qdisc_devs_mutex);
-	qdev->refs--;
-	if (qdev->refs == 0) {
-		kthread_stop(qdev->pacer_kthread);
-		qdev->pacer_kthread = NULL;
+	if (!refcount_dec_and_test(&qdev->refs))
+		return;
 
-		__list_del_entry(&qdev->links);
-		homa_qdisc_free_homa(qdev);
-		skb_queue_purge(&qdev->tcp_deferred);
-		kfree(qdev);
+	/* Make this homa_qdisc_dev inaccessible, then schedule an RCU-safe
+	 * free. Think carefully before you modify this code, to ensure that
+	 * concurrent RCU scans of qdevs->qdevs are safe.
+	 */
+	qdevs = qdev->hnet->homa->qdevs;
+	mutex_lock(&qdevs->mutex);
+	for (i = 0; i < qdevs->num_qdevs; i++) {
+		if (qdevs->qdevs[i] == qdev)
+			break;
 	}
-	mutex_unlock(&hnet->qdisc_devs_mutex);
+	if (i < qdevs->num_qdevs) {
+		WRITE_ONCE(qdevs->qdevs[i], qdevs->qdevs[qdevs->num_qdevs - 1]);
+		smp_store_release(&qdevs->num_qdevs, qdevs->num_qdevs - 1);
+	} else {
+		pr_err("homa_qdisc_qdev_put couldn't find qdev to delete\n");
+	}
+
+	kthread_stop(qdev->pacer_kthread);
+	qdev->pacer_kthread = NULL;
+	call_rcu(&qdev->rcu_head, homa_qdisc_dev_callback);
+	mutex_unlock(&qdevs->mutex);
+}
+
+/**
+ * homa_qdisc_dev_callback() - Invoked by the RCU subsystem when it is
+ * safe to finish deleting a homa_qdisc_dev.
+ * @head:   Pointer to the rcu_head field in a homa_qdisc_dev.
+ */
+void homa_qdisc_dev_callback(struct rcu_head *head)
+{
+	struct homa_qdisc_dev *qdev;
+
+	qdev = container_of(head, struct homa_qdisc_dev, rcu_head);
+	homa_qdisc_free_homa(qdev);
+	skb_queue_purge(&qdev->tcp_deferred);
+	kfree(qdev);
 }
 
 /**
@@ -143,11 +293,9 @@ int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt,
 {
 	struct homa_qdisc *q = qdisc_priv(sch);
 	struct homa_qdisc_dev *qdev;
-	struct homa_net *hnet;
 	int i;
 
-	hnet = homa_net(dev_net(sch->dev_queue->dev));
-	qdev = homa_qdisc_qdev_get(hnet, sch->dev_queue->dev);
+	qdev = homa_qdisc_qdev_get(sch->dev_queue->dev);
 	if (IS_ERR(qdev))
 		return PTR_ERR(qdev);
 
@@ -400,9 +548,8 @@ struct sk_buff *homa_qdisc_dequeue_homa(struct homa_qdisc_dev *qdev)
 	}
 	qrpc = container_of(node, struct homa_rpc_qdisc, rb_node);
 	skb = skb_dequeue(&qrpc->packets);
-	if (skb_queue_len(&qrpc->packets) == 0) {
+	if (skb_queue_len(&qrpc->packets) == 0)
 		rb_erase_cached(node, &qdev->deferred_rpcs);
-	}
 
 	/* Update qrpc->bytes_left. This can change the priority of the RPC
 	 * in qdev->deferred_rpcs, but the RPC was already the highest-
@@ -718,10 +865,18 @@ void homa_qdisc_update_sysctl(struct homa_qdisc_dev *qdev)
  */
 void homa_qdisc_update_all_sysctl(struct homa_net *hnet)
 {
+	struct homa_qdisc_qdevs *qdevs;
 	struct homa_qdisc_dev *qdev;
+	int num_qdevs, i;
 
-	mutex_lock(&hnet->qdisc_devs_mutex);
-	list_for_each_entry(qdev, &hnet->qdisc_devs, links)
+	rcu_read_lock();
+	qdevs = hnet->homa->qdevs;
+	num_qdevs = READ_ONCE(qdevs->num_qdevs);
+	for (i = 0; i < num_qdevs; i++) {
+		qdev = qdevs->qdevs[i];
+		if (qdev->hnet != hnet)
+			continue;
 		homa_qdisc_update_sysctl(qdev);
-	mutex_unlock(&hnet->qdisc_devs_mutex);
+	}
+	rcu_read_unlock();
 }
diff --git a/homa_qdisc.h b/homa_qdisc.h
index f723f7e4..ead0a616 100644
--- a/homa_qdisc.h
+++ b/homa_qdisc.h
@@ -54,7 +54,7 @@ struct homa_qdisc_dev {
-	 * homa_qdisc that references this object).  Must hold
-	 * hnet->qdisc_devs_lock to access.
+	 * homa_qdisc that references this object).
 	 */
-	int refs;
+	refcount_t refs;
 
 	/**
 	 * @pacer_qix: Index of a netdev_queue within dev that is reserved
@@ -157,6 +157,47 @@ struct homa_qdisc_dev {
 	 * never block on this.
 	 */
 	spinlock_t pacer_mutex __aligned(L1_CACHE_BYTES);
+
+	/**
+	 * @rcu_head: Holds state of a pending call_rcu invocation when
+	 * this struct is deleted.
+	 */
+	struct rcu_head rcu_head;
+};
+
+/**
+ * struct homa_qdisc_qdevs - There is one of these structs for each
+ * struct homa. Used to manage all of the homa_qdisc_devs for the
+ * struct homa.
+ */
+struct homa_qdisc_qdevs {
+	/**
+	 * @mutex: Must hold when modifying qdevs. Can scan qdevs
+	 * without locking using RCU.
+	 */
+	struct mutex mutex;
+
+	/** @num_qdevs: Number of entries currently in use in @qdevs. */
+	int num_qdevs;
+
+	/**
+	 * @qdevs: Pointers to all homa_qdisc_devs that exist for this
+	 * struct homa. Scan and/or retrieve pointers using RCU. Storage
+	 * for this is dynamically allocated and must eventually be kfreed.
+	 */
+	struct homa_qdisc_dev **qdevs;
+};
+
+/**
+ * struct homa_rcu_kfreer - Used by homa_rcu_kfree to defer kfree-ing
+ * an object until it is RCU-safe.
+ */
+struct homa_rcu_kfreer {
+	/** @rcu_head: Holds state of a pending call_rcu invocation. */
+	struct rcu_head rcu_head;
+
+	/** @object: Kfree this after waiting until RCU has synced.
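+	 * (Set by homa_rcu_kfree; released, along with the homa_rcu_kfreer
+	 * itself, in homa_rcu_kfree_callback.)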
+	 */
+	void *object;
+};
 
 void homa_qdisc_defer_homa(struct homa_qdisc_dev *qdev,
@@ -164,6 +205,10 @@ void homa_qdisc_defer_homa(struct homa_qdisc_dev *qdev,
 struct sk_buff *
 	homa_qdisc_dequeue_homa(struct homa_qdisc_dev *qdev);
 void homa_qdisc_destroy(struct Qdisc *sch);
+void homa_qdisc_dev_callback(struct rcu_head *head);
+struct homa_qdisc_qdevs *
+	homa_qdisc_qdevs_alloc(void);
+void homa_qdisc_qdevs_free(struct homa_qdisc_qdevs *qdevs);
 int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		       struct sk_buff **to_free);
 void homa_qdisc_free_homa(struct homa_qdisc_dev *qdev);
@@ -173,8 +218,7 @@ void homa_qdisc_insert_rb(struct homa_qdisc_dev *qdev,
 			  struct homa_rpc *rpc);
 int homa_qdisc_pacer_main(void *device);
 struct homa_qdisc_dev *
-	homa_qdisc_qdev_get(struct homa_net *hnet,
-			    struct net_device *dev);
+	homa_qdisc_qdev_get(struct net_device *dev);
 void homa_qdisc_qdev_put(struct homa_qdisc_dev *qdev);
 int homa_qdisc_redirect_skb(struct sk_buff *skb,
 			    struct homa_qdisc_dev *qdev,
@@ -187,16 +231,18 @@ int homa_qdisc_update_link_idle(struct homa_qdisc_dev *qdev, int bytes,
 				int max_queue_ns);
 void homa_qdisc_update_sysctl(struct homa_qdisc_dev *qdev);
 void homa_qdisc_pacer(struct homa_qdisc_dev *qdev);
+void homa_rcu_kfree(void *object);
+void homa_rcu_kfree_callback(struct rcu_head *head);
 
 /**
- * homa_qdisc_active() - Return true if homa qdiscs are enabled for @hnet
+ * homa_qdisc_active() - Return true if homa qdiscs are enabled for @homa
  * (so the old pacer should not be used), false otherwise.
- * @hnet:    Homa's information about a network namespace.
+ * @homa:   Information about the Homa transport.
  * Return:  See above.
  */
-static inline bool homa_qdisc_active(struct homa_net *hnet)
+static inline bool homa_qdisc_active(struct homa *homa)
 {
-	return !list_empty(&hnet->qdisc_devs);
+	return homa->qdevs->num_qdevs > 0;
 }
 
 /**
diff --git a/homa_utils.c b/homa_utils.c
index 5ccff868..1578c493 100644
--- a/homa_utils.c
+++ b/homa_utils.c
@@ -11,6 +11,7 @@
 #ifndef __STRIP__ /* See strip.py */
 #include "homa_grant.h"
 #include "homa_pacer.h"
+#include "homa_qdisc.h"
 #include "homa_skb.h"
 #else /* See strip.py */
 #include "homa_stub.h"
@@ -47,6 +48,12 @@ int homa_init(struct homa *homa)
 		homa->grant = NULL;
 		return err;
 	}
+	homa->qdevs = homa_qdisc_qdevs_alloc();
+	if (IS_ERR(homa->qdevs)) {
+		err = PTR_ERR(homa->qdevs);
+		homa->qdevs = NULL;
+		return err;
+	}
 #endif /* See strip.py */
 	homa->peertab = homa_peer_alloc_peertab();
 	if (IS_ERR(homa->peertab)) {
@@ -132,6 +139,10 @@ void homa_destroy(struct homa *homa)
 		homa->socktab = NULL;
 	}
 #ifndef __STRIP__ /* See strip.py */
+	if (homa->qdevs) {
+		homa_qdisc_qdevs_free(homa->qdevs);
+		homa->qdevs = NULL;
+	}
 	if (homa->grant) {
 		homa_grant_free(homa->grant);
 		homa->grant = NULL;
@@ -163,10 +174,6 @@ int homa_net_init(struct homa_net *hnet, struct net *net, struct homa *homa)
 	memset(hnet, 0, sizeof(*hnet));
 	hnet->homa = homa;
 	hnet->prev_default_port = HOMA_MIN_DEFAULT_PORT - 1;
-#ifndef __STRIP__ /* See strip.py */
-	INIT_LIST_HEAD(&hnet->qdisc_devs);
-	mutex_init(&hnet->qdisc_devs_mutex);
-#endif /* See strip.py */
 	return 0;
 }
 
diff --git a/test/mock.c b/test/mock.c
index f684e601..fbcd993a 100644
--- a/test/mock.c
+++ b/test/mock.c
@@ -1075,6 +1075,7 @@ void mutex_lock_nested(struct mutex *lock, unsigned int subclass)
 void mutex_lock(struct mutex *lock)
 #endif
 {
+	UNIT_HOOK("mutex_lock");
 	mock_active_locks++;
 }
 
@@ -1563,6 +1564,9 @@ int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
 	return 0;
 }
 
+void synchronize_rcu(void)
+{}
+
 void __tasklet_hi_schedule(struct tasklet_struct *t)
 {}
 
diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c
index 22d021c6..e5ec0403 100644
--- a/test/unit_homa_outgoing.c
+++ b/test/unit_homa_outgoing.c
@@ -505,11 +505,11 @@ TEST_F(homa_outgoing, homa_message_out_fill__gso_limit_less_than_mtu)
 #ifndef __STRIP__ /* See strip.py */
 TEST_F(homa_outgoing, homa_message_out_fill__disable_overlap_xmit_because_of_homa_qdisc)
 {
-	struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk,
-			&self->server_addr);
 	struct homa_qdisc_dev *qdev;
+	struct homa_rpc *crpc;
 
-	qdev = homa_qdisc_qdev_get(self->hnet, self->dev);
+	qdev = homa_qdisc_qdev_get(self->dev);
+	crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr);
 	ASSERT_FALSE(crpc == NULL);
 
 	ASSERT_EQ(0, -homa_message_out_fill(crpc,
@@ -883,12 +883,13 @@ TEST_F(homa_outgoing, homa_xmit_data__force)
 }
 TEST_F(homa_outgoing, homa_xmit_data__dont_throttle_because_homa_qdisc_in_use)
 {
-	struct homa_rpc *crpc = unit_client_rpc(&self->hsk,
-			UNIT_OUTGOING, self->client_ip, self->server_ip,
-			self->server_port, self->client_id, 2000, 1000);
 	struct homa_qdisc_dev *qdev;
+	struct homa_rpc *crpc;
 
-	qdev = homa_qdisc_qdev_get(self->hnet, self->dev);
+	qdev = homa_qdisc_qdev_get(self->dev);
+	crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip,
+			       self->server_ip, self->server_port,
+			       self->client_id, 2000, 1000);
 	unit_log_clear();
 	atomic64_set(&self->homa.pacer->link_idle_time, 1000000);
 	self->homa.pacer->max_nic_queue_cycles = 0;
diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c
index 680358bf..70ab0804 100644
--- a/test/unit_homa_qdisc.c
+++ b/test/unit_homa_qdisc.c
@@ -79,6 +79,19 @@ static void pacer_sleep_hook(char *id) {
 	}
 }
 
+static int create_hook_count;
+static struct net_device *hook_dev;
+static void qdev_create_hook(char *id)
+{
+	if (strcmp(id, "mutex_lock") != 0)
+		return;
+	if (create_hook_count <= 0)
+		return;
+	create_hook_count--;
+	if (create_hook_count == 0)
+		homa_qdisc_qdev_get(hook_dev);
+}
+
 FIXTURE(homa_qdisc) {
 	struct homa homa;
 	struct homa_net *hnet;
@@ -153,26 +166,122 @@ FIXTURE_TEARDOWN(homa_qdisc)
 	unit_teardown();
 }
 
-TEST_F(homa_qdisc, homa_qdisc_qdev_get__create_new)
+TEST_F(homa_qdisc, homa_rcu_kfree__kmalloc_succeeds)
+{
+	/* Nothing to check in this test; if it fails, test infrastructure
+	 * will detect memory alloc-free mismatches.
+	 */
+
+	homa_rcu_kfree(kmalloc(100, GFP_KERNEL));
+}
+TEST_F(homa_qdisc, homa_rcu_kfree__kmalloc_fails)
+{
+	mock_kmalloc_errors = 2;
+	homa_rcu_kfree(kmalloc(100, GFP_KERNEL));
+	EXPECT_STREQ("homa_rcu_kfree kmalloc failed", unit_log_get());
+}
+
+TEST_F(homa_qdisc, homa_rcu_kfree_callback)
+{
+	struct homa_rcu_kfreer *freer;
+
+	/* Any errors in freeing will be detected by test infrastructure. */
+	freer = kmalloc(sizeof(*freer), GFP_KERNEL);
+	freer->object = kmalloc(200, GFP_KERNEL);
+	homa_rcu_kfree_callback(&freer->rcu_head);
+}
+
+TEST_F(homa_qdisc, homa_qdisc_qdevs_alloc__success)
+{
+	struct homa_qdisc_qdevs *qdevs;
+
+	qdevs = homa_qdisc_qdevs_alloc();
+	ASSERT_FALSE(IS_ERR(qdevs));
+	EXPECT_EQ(0, qdevs->num_qdevs);
+	EXPECT_EQ(NULL, qdevs->qdevs);
+	kfree(qdevs);
+}
+TEST_F(homa_qdisc, homa_qdisc_qdevs_alloc__kmalloc_failure)
+{
+	struct homa_qdisc_qdevs *qdevs;
+
+	mock_kmalloc_errors = 1;
+	qdevs = homa_qdisc_qdevs_alloc();
+	ASSERT_TRUE(IS_ERR(qdevs));
+	EXPECT_EQ(ENOMEM, -PTR_ERR(qdevs));
+}
+
+TEST_F(homa_qdisc, homa_qdisc_qdevs_free__basics)
+{
+	struct homa_qdisc_qdevs *qdevs;
+
+	/* Test infrastructure will report any inconsistencies in
+	 * memory allocation.
+	 */
+	qdevs = homa_qdisc_qdevs_alloc();
+	homa_qdisc_qdevs_free(qdevs);
+}
+TEST_F(homa_qdisc, homa_qdisc_qdevs_free__unfreed_qdevs)
+{
+	struct homa_qdisc_qdevs *qdevs, *saved_qdevs;
+	struct homa_qdisc_dev *qdev;
+
+	qdevs = homa_qdisc_qdevs_alloc();
+	saved_qdevs = self->homa.qdevs;
+	self->homa.qdevs = qdevs;
+	qdev = homa_qdisc_qdev_get(self->dev);
+	EXPECT_EQ(1, qdevs->num_qdevs);
+	self->homa.qdevs = saved_qdevs;
+	mock_printk_output[0] = 0;
+	homa_qdisc_qdevs_free(qdevs);
+	EXPECT_STREQ("homa_qdisc_qdevs_free found 1 live qdevs "
+		     "(should have been none)", mock_printk_output);
+	homa_qdisc_qdev_put(qdev);
+}
+
+TEST_F(homa_qdisc, homa_qdisc_qdev_get__basics)
 {
 	struct homa_qdisc_dev *qdev;
 
-	qdev = homa_qdisc_qdev_get(self->hnet, self->dev);
+	qdev = homa_qdisc_qdev_get(self->dev);
 	EXPECT_FALSE(IS_ERR(qdev));
-	EXPECT_EQ(1, qdev->refs);
+	EXPECT_EQ(1, refcount_read(&qdev->refs));
+	EXPECT_EQ(1, self->homa.qdevs->num_qdevs);
 
 	homa_qdisc_qdev_put(qdev);
 }
 TEST_F(homa_qdisc, homa_qdisc_get__use_existing)
 {
-	struct homa_qdisc_dev *qdev;
+	struct homa_qdisc_dev *qdev, *qdev2;
+
+	qdev2 = homa_qdisc_qdev_get(mock_dev(1, &self->homa));
 
-	qdev = homa_qdisc_qdev_get(self->hnet, self->dev);
+	qdev = homa_qdisc_qdev_get(self->dev);
 	EXPECT_FALSE(IS_ERR(qdev));
-	EXPECT_EQ(1, qdev->refs);
+	EXPECT_EQ(2, self->homa.qdevs->num_qdevs);
+	EXPECT_EQ(1, refcount_read(&qdev->refs));
 
-	EXPECT_EQ(qdev, homa_qdisc_qdev_get(self->hnet, self->dev));
-	EXPECT_EQ(2, qdev->refs);
+	EXPECT_EQ(qdev, homa_qdisc_qdev_get(self->dev));
+	EXPECT_EQ(2, refcount_read(&qdev->refs));
+
+	homa_qdisc_qdev_put(qdev2);
 
 	homa_qdisc_qdev_put(qdev);
 	homa_qdisc_qdev_put(qdev);
+}
+TEST_F(homa_qdisc, homa_qdisc_qdev_get__race_when_creating)
+{
+	struct homa_qdisc_dev *qdev;
+
+	unit_hook_register(qdev_create_hook);
+	hook_dev = self->dev;
+	create_hook_count = 1;
+	unit_log_clear();
+	qdev = homa_qdisc_qdev_get(self->dev);
+	EXPECT_FALSE(IS_ERR(qdev));
+	EXPECT_EQ(1, self->homa.qdevs->num_qdevs);
+	EXPECT_EQ(2, refcount_read(&qdev->refs));
+	EXPECT_SUBSTR("race in homa_qdisc_qdev_get", unit_log_get());
 
 	homa_qdisc_qdev_put(qdev);
 	homa_qdisc_qdev_put(qdev);
@@ -182,7 +291,7 @@ TEST_F(homa_qdisc, homa_qdisc_qdev_get__kmalloc_failure)
 	struct homa_qdisc_dev *qdev;
 
 	mock_kmalloc_errors = 1;
-	qdev = homa_qdisc_qdev_get(self->hnet, self->dev);
+	qdev = homa_qdisc_qdev_get(self->dev);
 	EXPECT_TRUE(IS_ERR(qdev));
 	EXPECT_EQ(ENOMEM, -PTR_ERR(qdev));
 }
@@ -191,30 +300,98 @@ TEST_F(homa_qdisc, homa_qdisc_qdev_get__cant_create_thread)
 	struct homa_qdisc_dev *qdev;
 
 	mock_kthread_create_errors = 1;
-	qdev = homa_qdisc_qdev_get(self->hnet, self->dev);
+	qdev = homa_qdisc_qdev_get(self->dev);
 	EXPECT_TRUE(IS_ERR(qdev));
 	EXPECT_EQ(EACCES, -PTR_ERR(qdev));
 }
-
-TEST_F(homa_qdisc, homa_qdisc_qdev_put)
+TEST_F(homa_qdisc, homa_qdisc_qdev_get__fill_in_qdevs_array)
 {
-	struct homa_qdisc_dev *qdev, *qdev2;
+	struct homa_qdisc_dev *qdev1, *qdev2, *qdev3;
 
-	qdev = homa_qdisc_qdev_get(self->hnet, self->dev);
-	EXPECT_FALSE(IS_ERR(qdev));
-	homa_qdisc_qdev_get(self->hnet, self->dev);
-	EXPECT_EQ(2, qdev->refs);
+	qdev1 = homa_qdisc_qdev_get(self->dev);
+	qdev2 = homa_qdisc_qdev_get(mock_dev(1, &self->homa));
+	qdev3 = homa_qdisc_qdev_get(mock_dev(2, &self->homa));
 
-	homa_qdisc_qdev_put(qdev);
-	EXPECT_EQ(1, qdev->refs);
-	qdev2 = list_first_entry_or_null(&self->hnet->qdisc_devs,
-			struct homa_qdisc_dev, links);
-	EXPECT_EQ(qdev, qdev2);
+	EXPECT_EQ(3, self->homa.qdevs->num_qdevs);
+	EXPECT_EQ(1, refcount_read(&qdev1->refs));
+	EXPECT_EQ(qdev1,
self->homa.qdevs->qdevs[0]); + EXPECT_EQ(qdev2, self->homa.qdevs->qdevs[1]); + EXPECT_EQ(qdev3, self->homa.qdevs->qdevs[2]); + + EXPECT_EQ(qdev3, homa_qdisc_qdev_get(mock_dev(2, &self->homa))); + EXPECT_EQ(2, refcount_read(&qdev3->refs)); + + homa_qdisc_qdev_put(qdev1); + homa_qdisc_qdev_put(qdev2); + homa_qdisc_qdev_put(qdev3); + homa_qdisc_qdev_put(qdev3); +} +TEST_F(homa_qdisc, homa_qdisc_qdev_put__basics) +{ + struct homa_qdisc_dev *qdev1, *qdev2, *qdev3; + + qdev1 = homa_qdisc_qdev_get(self->dev); + EXPECT_FALSE(IS_ERR(qdev1)); + qdev2 = homa_qdisc_qdev_get(mock_dev(1, &self->homa)); + EXPECT_FALSE(IS_ERR(qdev2)); + qdev3 = homa_qdisc_qdev_get(mock_dev(2, &self->homa)); + EXPECT_FALSE(IS_ERR(qdev3)); + + EXPECT_EQ(qdev2, homa_qdisc_qdev_get(mock_dev(1, &self->homa))); + EXPECT_EQ(2, refcount_read(&qdev2->refs)); + + /* First call: refcount doesn't hit zero. */ + homa_qdisc_qdev_put(qdev2); + EXPECT_EQ(1, refcount_read(&qdev2->refs)); + EXPECT_EQ(3, self->homa.qdevs->num_qdevs); + + /* Second call: refcount hits zero. */ + homa_qdisc_qdev_put(qdev2); + EXPECT_EQ(2, self->homa.qdevs->num_qdevs); + EXPECT_EQ(qdev3, self->homa.qdevs->qdevs[1]); + + homa_qdisc_qdev_put(qdev3); + homa_qdisc_qdev_put(qdev1); + EXPECT_EQ(0, self->homa.qdevs->num_qdevs); +} +TEST_F(homa_qdisc, homa_qdisc_qdev_put__cant_find_qdev_in_array) +{ + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->dev); + self->homa.qdevs->num_qdevs = 0; + mock_printk_output[0] = 0; homa_qdisc_qdev_put(qdev); - qdev2 = list_first_entry_or_null(&self->hnet->qdisc_devs, - struct homa_qdisc_dev, links); - EXPECT_EQ(NULL, qdev2); + EXPECT_STREQ("homa_qdisc_qdev_put couldn't find qdev to delete", + mock_printk_output); +} + +TEST_F(homa_qdisc, homa_qdisc_dev_callback) +{ + struct homa_rpc *srpc1, *srpc2; + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 2, 10000, 10000); + + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc1, &self->addr, 1000, 1500)); + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc2, &self->addr, 2000, 1500)); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1237, offsets 2000]; [id 1235, offsets 1000]", + unit_log_get()); + + /* If skbs aren't freed, test infrastructure will complain. 
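+	 * (homa_qdisc_dev_callback should release them via
+	 * homa_qdisc_free_homa and skb_queue_purge once the last
+	 * reference goes away.)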
*/ + homa_qdisc_qdev_put(qdev); + EXPECT_EQ(0, self->homa.qdevs->num_qdevs); } TEST_F(homa_qdisc, homa_qdisc_init__basics) @@ -224,10 +401,9 @@ TEST_F(homa_qdisc, homa_qdisc_init__basics) struct homa_qdisc *q; EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); - qdev = list_first_entry_or_null(&self->hnet->qdisc_devs, - struct homa_qdisc_dev, links); + qdev = self->homa.qdevs->qdevs[0]; ASSERT_NE(NULL, qdev); - EXPECT_EQ(1, qdev->refs); + EXPECT_EQ(1, refcount_read(&qdev->refs)); EXPECT_EQ(10000, qdev->link_mbps); EXPECT_EQ(10240, qdisc->limit); q = qdisc_priv(qdisc); @@ -238,13 +414,11 @@ TEST_F(homa_qdisc, homa_qdisc_init__basics) TEST_F(homa_qdisc, homa_qdisc_init__cant_create_new_qdisc_dev) { struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); - struct homa_qdisc_dev *qdev; mock_kmalloc_errors = 1; EXPECT_EQ(ENOMEM, -homa_qdisc_init(qdisc, NULL, NULL)); - qdev = list_first_entry_or_null(&self->hnet->qdisc_devs, - struct homa_qdisc_dev, links); - EXPECT_EQ(NULL, qdev); + EXPECT_EQ(0, self->homa.qdevs->num_qdevs); + EXPECT_EQ(NULL, self->homa.qdevs->qdevs); kfree(qdisc); } TEST_F(homa_qdisc, homa_qdisc_init__set_qix) @@ -268,18 +442,15 @@ TEST_F(homa_qdisc, homa_qdisc_destroy) EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); qdisc2 = mock_alloc_qdisc(&mock_net_queue); EXPECT_EQ(0, homa_qdisc_init(qdisc2, NULL, NULL)); - qdev = list_first_entry_or_null(&self->hnet->qdisc_devs, - struct homa_qdisc_dev, links); + qdev = self->homa.qdevs->qdevs[0]; EXPECT_NE(NULL, qdev); - EXPECT_EQ(2, qdev->refs); + EXPECT_EQ(2, refcount_read(&qdev->refs)); homa_qdisc_destroy(qdisc2); - EXPECT_EQ(1, qdev->refs); + EXPECT_EQ(1, refcount_read(&qdev->refs)); homa_qdisc_destroy(qdisc); - qdev = list_first_entry_or_null(&self->hnet->qdisc_devs, - struct homa_qdisc_dev, links); - EXPECT_EQ(NULL, qdev); + EXPECT_EQ(0, self->homa.qdevs->num_qdevs); kfree(qdisc); kfree(qdisc2); } @@ -288,7 +459,7 @@ TEST_F(homa_qdisc, _homa_qdisc_homa_qdisc_set_qixs_object) { struct homa_qdisc_dev *qdev; - qdev = homa_qdisc_qdev_get(self->hnet, self->dev); + qdev = homa_qdisc_qdev_get(self->dev); /* Simple working case. 
*/ homa_qdisc_set_qixs(qdev); @@ -528,6 +699,7 @@ TEST_F(homa_qdisc, homa_qdisc_defer_homa__basics) struct homa_rpc *srpc1, *srpc2, *srpc3, *srpc4; struct homa_qdisc_dev *qdev; + qdev = homa_qdisc_qdev_get(self->dev); srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 10000, 10000); @@ -541,8 +713,6 @@ TEST_F(homa_qdisc, homa_qdisc_defer_homa__basics) &self->server_ip, self->client_port, self->server_id + 6, 10000, 10000); - qdev = homa_qdisc_qdev_get(self->hnet, self->dev); - homa_qdisc_defer_homa(qdev, new_test_skb(srpc1, &self->addr, 5000, 1500)); homa_qdisc_defer_homa(qdev, @@ -568,6 +738,7 @@ TEST_F(homa_qdisc, homa_qdisc_defer_homa__multiple_pkts_for_rpc) struct homa_rpc *srpc1, *srpc2; struct homa_qdisc_dev *qdev; + qdev = homa_qdisc_qdev_get(self->dev); srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 10000, 10000); @@ -575,8 +746,6 @@ TEST_F(homa_qdisc, homa_qdisc_defer_homa__multiple_pkts_for_rpc) &self->server_ip, self->client_port, self->server_id + 2, 10000, 10000); - qdev = homa_qdisc_qdev_get(self->hnet, self->dev); - homa_qdisc_defer_homa(qdev, new_test_skb(srpc1, &self->addr, 1000, 1500)); homa_qdisc_defer_homa(qdev, @@ -599,13 +768,12 @@ TEST_F(homa_qdisc, homa_qdisc_defer_homa__dont_update_tx_left) struct homa_qdisc_dev *qdev; struct homa_rpc *srpc; + qdev = homa_qdisc_qdev_get(self->dev); srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 10000, 10000); srpc->qrpc.tx_left = 2000; - qdev = homa_qdisc_qdev_get(self->hnet, self->dev); - homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 5000, 500)); EXPECT_EQ(2000, srpc->qrpc.tx_left); homa_qdisc_qdev_put(qdev); @@ -615,6 +783,7 @@ TEST_F(homa_qdisc, homa_qdisc_defer_homa__throttled_cycles_metric) struct homa_rpc *srpc1, *srpc2; struct homa_qdisc_dev *qdev; + qdev = homa_qdisc_qdev_get(self->dev); srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 10000, 10000); @@ -622,8 +791,6 @@ TEST_F(homa_qdisc, homa_qdisc_defer_homa__throttled_cycles_metric) &self->server_ip, self->client_port, self->server_id + 2, 10000, 10000); - qdev = homa_qdisc_qdev_get(self->hnet, self->dev); - mock_clock = 5000; homa_qdisc_defer_homa(qdev, new_test_skb(srpc1, &self->addr, 1000, 1500)); @@ -644,12 +811,11 @@ TEST_F(homa_qdisc, homa_qdisc_defer_homa__wake_up_pacer) struct homa_rpc *srpc; struct sk_buff *skb; + qdev = homa_qdisc_qdev_get(self->dev); srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 10000, 10000); - qdev = homa_qdisc_qdev_get(self->hnet, self->dev); - skb = new_test_skb(srpc, &self->addr, 5000, 1500); unit_log_clear(); mock_log_wakeups = 1; @@ -666,6 +832,7 @@ TEST_F(homa_qdisc, homa_qdisc_insert_rb__basics) struct homa_rpc *srpc1, *srpc2, *srpc3; struct homa_qdisc_dev *qdev; + qdev = homa_qdisc_qdev_get(self->dev); srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 10000, 10000); @@ -676,8 +843,6 @@ TEST_F(homa_qdisc, homa_qdisc_insert_rb__basics) &self->server_ip, self->client_port, self->server_id + 4, 10000, 10000); - qdev = homa_qdisc_qdev_get(self->hnet, self->dev); - homa_qdisc_defer_homa(qdev, new_test_skb(srpc1, &self->addr, 5000, 1500)); homa_qdisc_defer_homa(qdev, @@ -697,6 +862,7 @@ 
TEST_F(homa_qdisc, homa_qdisc_insert_rb__long_left_chain) struct homa_rpc *srpc1, *srpc2, *srpc3, *srpc4; struct homa_qdisc_dev *qdev; + qdev = homa_qdisc_qdev_get(self->dev); srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 10000, 10000); @@ -710,8 +876,6 @@ TEST_F(homa_qdisc, homa_qdisc_insert_rb__long_left_chain) &self->server_ip, self->client_port, self->server_id + 6, 10000, 10000); - qdev = homa_qdisc_qdev_get(self->hnet, self->dev); - homa_qdisc_defer_homa(qdev, new_test_skb(srpc1, &self->addr, 5000, 1500)); homa_qdisc_defer_homa(qdev, @@ -734,6 +898,7 @@ TEST_F(homa_qdisc, homa_qdisc_insert_rb__long_right_chain) struct homa_rpc *srpc1, *srpc2, *srpc3, *srpc4; struct homa_qdisc_dev *qdev; + qdev = homa_qdisc_qdev_get(self->dev); srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 10000, 10000); @@ -747,8 +912,6 @@ TEST_F(homa_qdisc, homa_qdisc_insert_rb__long_right_chain) &self->server_ip, self->client_port, self->server_id +6 , 10000, 10000); - qdev = homa_qdisc_qdev_get(self->hnet, self->dev); - homa_qdisc_defer_homa(qdev, new_test_skb(srpc1, &self->addr, 5000, 1500)); homa_qdisc_defer_homa(qdev, @@ -771,7 +934,7 @@ TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__no_deferred_rpcs) { struct homa_qdisc_dev *qdev; - qdev = homa_qdisc_qdev_get(self->hnet, self->dev); + qdev = homa_qdisc_qdev_get(self->dev); EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); EXPECT_EQ(NULL, homa_qdisc_dequeue_homa(qdev)); @@ -783,13 +946,12 @@ TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__multiple_packets_for_rpc) struct homa_rpc *srpc; struct sk_buff *skb; + qdev = homa_qdisc_qdev_get(self->dev); srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 10000, 10000); ASSERT_NE(NULL, srpc); - qdev = homa_qdisc_qdev_get(self->hnet, self->dev); - skb = new_test_skb(srpc, &self->addr, 2000, 500); homa_qdisc_defer_homa(qdev, skb); homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 3000, 500)); @@ -811,6 +973,7 @@ TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__last_packet_for_rpc) struct homa_qdisc_dev *qdev; struct sk_buff *skb; + qdev = homa_qdisc_qdev_get(self->dev); srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 10000, 10000); @@ -820,8 +983,6 @@ TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__last_packet_for_rpc) self->server_id + 2, 10000, 10000); ASSERT_NE(NULL, srpc2); - qdev = homa_qdisc_qdev_get(self->hnet, self->dev); - skb = new_test_skb(srpc1, &self->addr, 5000, 500); homa_qdisc_defer_homa(qdev, skb); homa_qdisc_defer_homa(qdev, new_test_skb(srpc2, &self->addr, 2000, 500)); @@ -843,13 +1004,12 @@ TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__update_tx_left) struct homa_qdisc_dev *qdev; struct homa_rpc *srpc; + qdev = homa_qdisc_qdev_get(self->dev); srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 10000, 10000); ASSERT_NE(NULL, srpc); - qdev = homa_qdisc_qdev_get(self->hnet, self->dev); - homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 3000, 500)); homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 4000, 500)); srpc->qrpc.tx_left = 6000; @@ -869,13 +1029,12 @@ TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__throttled_cycles_metric) struct homa_qdisc_dev *qdev; struct homa_rpc *srpc; + qdev = homa_qdisc_qdev_get(self->dev); srpc = 
unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 10000, 10000); ASSERT_NE(NULL, srpc); - qdev = homa_qdisc_qdev_get(self->hnet, self->dev); - mock_clock = 5000; homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 2000, 500)); homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 3000, 500)); @@ -898,13 +1057,12 @@ TEST_F(homa_qdisc, homa_qdisc_free_homa) struct homa_qdisc_dev *qdev; struct homa_rpc *srpc; + qdev = homa_qdisc_qdev_get(self->dev); srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 10000, 10000); ASSERT_NE(NULL, srpc); - qdev = homa_qdisc_qdev_get(self->hnet, self->dev); - homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 1000, 500)); homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 2000, 500)); homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 3000, 500)); @@ -1012,14 +1170,13 @@ TEST_F(homa_qdisc, homa_qdisc_update_link_idle__pacer_bytes_metric) struct homa_qdisc_dev *qdev; struct homa_rpc *srpc; + qdev = homa_qdisc_qdev_get(self->dev); + ASSERT_FALSE(IS_ERR(qdev)); srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 10000, 10000); ASSERT_NE(NULL, srpc); - qdev = homa_qdisc_qdev_get(self->hnet, self->dev); - ASSERT_FALSE(IS_ERR(qdev)); - /* No deferred packets. */ homa_qdisc_update_link_idle(qdev, 200, -1); EXPECT_EQ(0, homa_metrics_per_cpu()->pacer_bytes); @@ -1037,7 +1194,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer_main__basics) { struct homa_qdisc_dev *qdev; - qdev = homa_qdisc_qdev_get(self->hnet, self->dev); + qdev = homa_qdisc_qdev_get(self->dev); EXPECT_FALSE(IS_ERR(qdev)); unit_hook_register(pacer_sleep_hook); @@ -1055,7 +1212,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__queue_empty) { struct homa_qdisc_dev *qdev; - qdev = homa_qdisc_qdev_get(self->hnet, self->dev); + qdev = homa_qdisc_qdev_get(self->dev); unit_log_clear(); homa_qdisc_pacer(qdev); @@ -1070,12 +1227,12 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__pacer_lock_unavailable) u64 link_idle; struct homa_rpc *srpc; + qdev = homa_qdisc_qdev_get(self->dev); srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 10000, 10000); ASSERT_NE(NULL, srpc); - qdev = homa_qdisc_qdev_get(self->hnet, self->dev); link_idle = atomic64_read(&qdev->link_idle_time); homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); @@ -1099,12 +1256,12 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__enqueue_packet) u64 link_idle; struct homa_rpc *srpc; + qdev = homa_qdisc_qdev_get(self->dev); srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 10000, 10000); ASSERT_NE(NULL, srpc); - qdev = homa_qdisc_qdev_get(self->hnet, self->dev); link_idle = atomic64_read(&qdev->link_idle_time); homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); @@ -1126,12 +1283,12 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__spin_until_link_idle) struct homa_qdisc_dev *qdev; struct homa_rpc *srpc; + qdev = homa_qdisc_qdev_get(self->dev); srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 10000, 10000); ASSERT_NE(NULL, srpc); - qdev = homa_qdisc_qdev_get(self->hnet, self->dev); EXPECT_EQ(0, 
homa_qdisc_init(self->qdiscs[3], NULL, NULL)); qdev->pacer_qix = 3; EXPECT_EQ(0, self->qdiscs[3]->q.qlen); @@ -1163,6 +1320,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__return_after_one_packet) struct homa_qdisc_dev *qdev; struct sk_buff *skb; + qdev = homa_qdisc_qdev_get(self->dev); srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 10000, 10000); @@ -1172,7 +1330,6 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__return_after_one_packet) self->server_id + 2, 10000, 10000); ASSERT_NE(NULL, srpc2); - qdev = homa_qdisc_qdev_get(self->hnet, self->dev); EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); qdev->pacer_qix = 3; EXPECT_EQ(0, self->qdiscs[3]->q.qlen); @@ -1325,7 +1482,7 @@ TEST_F(homa_qdisc, homa_qdisc_update_sysctl__basics) { struct homa_qdisc_dev *qdev; - qdev = homa_qdisc_qdev_get(self->hnet, self->dev); + qdev = homa_qdisc_qdev_get(self->dev); EXPECT_FALSE(IS_ERR(qdev)); self->homa.link_mbps = 25000; @@ -1340,7 +1497,7 @@ TEST_F(homa_qdisc, homa_qdisc_update_sysctl__cant_get_link_speed_from_dev) { struct homa_qdisc_dev *qdev; - qdev = homa_qdisc_qdev_get(self->hnet, self->dev); + qdev = homa_qdisc_qdev_get(self->dev); EXPECT_FALSE(IS_ERR(qdev)); self->homa.link_mbps = 16000; @@ -1390,6 +1547,8 @@ TEST_F(homa_qdisc, homa_qdisc_update_all_sysctl) kfree(qdisc2); } +/* Inline functions in homa_qdisc.h: */ + TEST_F(homa_qdisc, homa_qdisc_precedes__bytes_left) { struct homa_rpc *srpc1, *srpc2, *srpc3; diff --git a/test/unit_homa_utils.c b/test/unit_homa_utils.c index de3c4424..f0dd830a 100644 --- a/test/unit_homa_utils.c +++ b/test/unit_homa_utils.c @@ -97,7 +97,7 @@ TEST_F(homa_utils, homa_init__cant_allocate_port_map) struct homa homa2; #ifndef __STRIP__ /* See strip.py */ - mock_kmalloc_errors = 0x10; + mock_kmalloc_errors = 0x20; #else /* See strip.py */ mock_kmalloc_errors = 4; #endif/* See strip.py */ @@ -111,7 +111,7 @@ TEST_F(homa_utils, homa_init__homa_skb_init_failure) { struct homa homa2; - mock_kmalloc_errors = 0x20; + mock_kmalloc_errors = 0x40; EXPECT_EQ(ENOMEM, -homa_init(&homa2)); EXPECT_SUBSTR("Couldn't initialize skb management (errno 12)", mock_printk_output); From 2e61bca453aeed4477993a12c59a73599478cfd8 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 16 Sep 2025 16:18:30 -0700 Subject: [PATCH 500/625] Implement homa_qdisc_check_pacer This gets throughput with homa_qdisc almost (1% less than) throughput with the old pacer. --- homa_offload.c | 2 + homa_qdisc.c | 39 +++++++++++++++++++ homa_qdisc.h | 3 +- test/unit_homa_qdisc.c | 88 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 131 insertions(+), 1 deletion(-) diff --git a/homa_offload.c b/homa_offload.c index 56d3dbd0..a2b0a214 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -7,6 +7,7 @@ #include "homa_impl.h" #include "homa_offload.h" #include "homa_pacer.h" +#include "homa_qdisc.h" DEFINE_PER_CPU(struct homa_offload_core, homa_offload_core); @@ -467,6 +468,7 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, done: homa_pacer_check(homa->pacer); + homa_qdisc_pacer_check(homa); offload_core->last_gro = homa_clock(); return result; diff --git a/homa_qdisc.c b/homa_qdisc.c index 11227f0f..0fd5ebc3 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -816,6 +816,45 @@ int homa_qdisc_redirect_skb(struct sk_buff *skb, return result; } +/** + * homa_qdisc_pacer_check() - Check whether any of the homa_qdisc pacer + * threads associated with @homa have fallen behind (e.g. because they + * got descheduled by Linux). 
If so, call the pacer directly to transmit
+ * deferred packets.
+ * @homa:    Overall information about the Homa transport; used to find
+ *           homa_qdisc_devs to check.
+ */
+void homa_qdisc_pacer_check(struct homa *homa) {
+	struct homa_qdisc_qdevs *qdevs;
+	struct homa_qdisc_dev *qdev;
+	u64 now = homa_clock();
+	int num_qdevs, i;
+	int max_cycles;
+
+	max_cycles = homa->pacer->max_nic_queue_cycles;
+	qdevs = homa->qdevs;
+	rcu_read_lock();
+	num_qdevs = READ_ONCE(qdevs->num_qdevs);
+	for (i = 0; i < num_qdevs; i++) {
+		qdev = qdevs->qdevs[i];
+		if (!homa_qdisc_any_deferred(qdev))
+			continue;
+
+		/* The ">> 1" means that we only help out if the NIC queue has
+		 * dropped below half of its maximum allowed capacity. This
+		 * gives the pacer thread the first shot at queueing new
+		 * packets.
+		 */
+		if (now + (max_cycles >> 1) <
+		    atomic64_read(&qdev->link_idle_time))
+			continue;
+		tt_record("homa_qdisc_pacer_check calling homa_qdisc_pacer");
+		homa_qdisc_pacer(qdev);
+		INC_METRIC(pacer_needed_help, 1);
+	}
+	rcu_read_unlock();
+}
+
 /**
  * homa_qdisc_update_sysctl() - Recompute information in a homa_qdisc_dev
  * that depends on sysctl parameters.
diff --git a/homa_qdisc.h b/homa_qdisc.h
index ead0a616..20ab8974 100644
--- a/homa_qdisc.h
+++ b/homa_qdisc.h
@@ -216,6 +216,8 @@ int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt,
 		    struct netlink_ext_ack *extack);
 void homa_qdisc_insert_rb(struct homa_qdisc_dev *qdev, struct homa_rpc *rpc);
+void homa_qdisc_pacer(struct homa_qdisc_dev *qdev);
+void homa_qdisc_pacer_check(struct homa *homa);
 int homa_qdisc_pacer_main(void *device);
 struct homa_qdisc_dev *
 	homa_qdisc_qdev_get(struct net_device *dev);
@@ -230,7 +232,6 @@ void homa_qdisc_update_all_sysctl(struct homa_net *hnet);
 int homa_qdisc_update_link_idle(struct homa_qdisc_dev *qdev, int bytes,
 				int max_queue_ns);
 void homa_qdisc_update_sysctl(struct homa_qdisc_dev *qdev);
-void homa_qdisc_pacer(struct homa_qdisc_dev *qdev);
 void homa_rcu_kfree(void *object);
 void homa_rcu_kfree_callback(struct rcu_head *head);
diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c
index 70ab0804..46ef2369 100644
--- a/test/unit_homa_qdisc.c
+++ b/test/unit_homa_qdisc.c
@@ -1478,6 +1478,94 @@ TEST_F(homa_qdisc, homa_qdisc_redirect_skb__no_suitable_qdisc)
 		homa_qdisc_destroy(self->qdiscs[i]);
 }
 
+TEST_F(homa_qdisc, homa_qdisc_pacer_check__enqueue_packet)
+{
+	struct homa_qdisc_dev *qdev, *qdev2;
+	struct homa_rpc *srpc;
+
+	/* Create 2 qdevs to verify that homa_qdisc_pacer_check loops over
+	 * all qdevs.
+ */ + qdev2 = homa_qdisc_qdev_get(mock_dev(1, &self->homa)); + qdev = homa_qdisc_qdev_get(self->dev); + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc); + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + qdev->pacer_qix = 3; + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); + EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); + + atomic64_set(&qdev->link_idle_time, 20000); + mock_clock = 15000; + self->homa.pacer->max_nic_queue_cycles = 12000; + + homa_qdisc_pacer_check(&self->homa); + EXPECT_EQ(1, self->qdiscs[3]->q.qlen); + EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(1, homa_metrics_per_cpu()->pacer_needed_help); + + homa_qdisc_destroy(self->qdiscs[3]); + homa_qdisc_qdev_put(qdev); + homa_qdisc_qdev_put(qdev2); +} +TEST_F(homa_qdisc, homa_qdisc_pacer_check__no_deferred_rpcs) +{ + struct homa_qdisc_dev *qdev, *qdev2; + + /* Create 2 qdevs to verify that homa_qdisc_pacer_check loops over + * all qdevs. + */ + qdev2 = homa_qdisc_qdev_get(mock_dev(1, &self->homa)); + qdev = homa_qdisc_qdev_get(self->dev); + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + qdev->pacer_qix = 3; + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + + atomic64_set(&qdev->link_idle_time, 20000); + mock_clock = 15000; + self->homa.pacer->max_nic_queue_cycles = 12000; + + homa_qdisc_pacer_check(&self->homa); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(0, homa_metrics_per_cpu()->pacer_needed_help); + + homa_qdisc_destroy(self->qdiscs[3]); + homa_qdisc_qdev_put(qdev); + homa_qdisc_qdev_put(qdev2); +} +TEST_F(homa_qdisc, homa_qdisc_pacer_check__lag_not_long_enough) +{ + struct homa_qdisc_dev *qdev; + struct homa_rpc *srpc; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc); + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + qdev->pacer_qix = 3; + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); + EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); + + atomic64_set(&qdev->link_idle_time, 20000); + mock_clock = 13000; + self->homa.pacer->max_nic_queue_cycles = 12000; + + homa_qdisc_pacer_check(&self->homa); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(0, homa_metrics_per_cpu()->pacer_needed_help); + + homa_qdisc_destroy(self->qdiscs[3]); + homa_qdisc_qdev_put(qdev); +} + TEST_F(homa_qdisc, homa_qdisc_update_sysctl__basics) { struct homa_qdisc_dev *qdev; From fea64ffa4576306677e4afd252f00fc5266f5583 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 16 Sep 2025 20:59:51 -0700 Subject: [PATCH 501/625] Added latency plot to cp_vs_tcp Also renamed start_slowdown_plot to start_plot_vs_msg_length in cperf.py, since the function can be used for multiple purposes. 
--- util/cp_both | 2 +- util/cp_buffers | 2 +- util/cp_config | 2 +- util/cp_config_buf | 2 +- util/cp_load | 2 +- util/cp_tcp | 2 +- util/cp_tcp_config | 4 ++-- util/cp_vs_tcp | 22 ++++++++++++++++++++-- util/cperf.py | 39 ++++++++++++++++++++++++++++++++------- 9 files changed, 60 insertions(+), 17 deletions(-) diff --git a/util/cp_both b/util/cp_both index 63264502..d2c71477 100755 --- a/util/cp_both +++ b/util/cp_both @@ -61,7 +61,7 @@ log("Generating slowdown plot for %s" % (options.workload)) title = "TCP (%.1f Gbps) and Homa (%.1f Gbps) together, %s %d nodes" % ( options.gbps - options.homa_gbps, options.homa_gbps, options.workload.capitalize(), options.num_nodes) -ax = start_slowdown_plot(title, 1000, homa_exp) +ax = start_plot_vs_msg_length(title, 1000, homa_exp) plot_slowdown(ax, tcp_exp, "p99", "TCP P99", color=tcp_color) plot_slowdown(ax, tcp_exp, "p50", "TCP P50", color=tcp_color2) plot_slowdown(ax, homa_exp, "p99", "Homa P99", color=homa_color) diff --git a/util/cp_buffers b/util/cp_buffers index 7d8fc883..6af75e88 100755 --- a/util/cp_buffers +++ b/util/cp_buffers @@ -163,7 +163,7 @@ for workload, bw, seconds in load_info: title = "%s %d nodes, %.1f Gbps" % (workload.capitalize(), options.num_nodes, bw) exp_name = "%s_%s" % (options.protocol, workload) - ax = start_slowdown_plot(title, 1000, exp_name, y_label=" Slowdown") + ax = start_plot_vs_msg_length(title, 1000, exp_name, y_label=" Slowdown") for exp in data: plot_slowdown(ax, exp["name"], "p99", "%.1f MB P99" % exp["buf_size"]) ax.legend(loc="upper left", prop={'size': 9}) diff --git a/util/cp_config b/util/cp_config index 55bd88c2..7641a677 100755 --- a/util/cp_config +++ b/util/cp_config @@ -336,7 +336,7 @@ for workload, bw, seconds in load_info: log("Generating slowdown plot for %s" % (workload)) title = "%s %d nodes, %.1f Gbps" % (workload.capitalize(), options.num_nodes, bw) - ax = start_slowdown_plot(title, plot_max_y, "%s_%s" % ( + ax = start_plot_vs_msg_length(title, plot_max_y, "%s_%s" % ( specs[0]['exp_name'], workload), y_label=" Slowdown") for spec in specs: exp_name = "%s_%s" % (spec['exp_name'], workload) diff --git a/util/cp_config_buf b/util/cp_config_buf index 865143b3..7ca16dbe 100755 --- a/util/cp_config_buf +++ b/util/cp_config_buf @@ -338,7 +338,7 @@ for workload, bw, seconds in load_info: log("Generating slowdown plot for %s" % (workload)) title = "%s %d nodes, %.1f Gbps" % (workload.capitalize(), options.num_nodes, bw) - ax = start_slowdown_plot(title, plot_max_y, "%s_%s" % ( + ax = start_plot_vs_msg_length(title, plot_max_y, "%s_%s" % ( specs[0]['exp_name'], workload), y_label=" Slowdown") for spec in specs: exp_name = "%s_%s" % (spec['exp_name'], workload) diff --git a/util/cp_load b/util/cp_load index 96f379b5..1517be71 100755 --- a/util/cp_load +++ b/util/cp_load @@ -89,7 +89,7 @@ set_unloaded(unloaded_exp) log("Generating slowdown plot for %s" % (options.workload)) title = "%s %d nodes, %.2f GB/s" % (options.workload.capitalize(), options.num_nodes, options.gbps) -ax = start_slowdown_plot(title, 1000, "homa1.0_%s" % (options.workload)) +ax = start_plot_vs_msg_length(title, 1000, "homa1.0_%s" % (options.workload)) if options.dctcp: plot_slowdown(ax, "dctcp%3.1f_%s" % (fractions[0], options.workload), "p99", "DCTCP %4.2f Gbps" % (fractions[0] * options.gbps), diff --git a/util/cp_tcp b/util/cp_tcp index 7b124c7b..35a59f43 100755 --- a/util/cp_tcp +++ b/util/cp_tcp @@ -81,7 +81,7 @@ for workload, bw in load_info: log("Generating slowdown plot for %s" % (workload)) title = "%s %d nodes, %.1f 
Gbps" % (workload.capitalize(), options.num_nodes, bw) - ax = start_slowdown_plot(title, 1000, exp) + ax = start_plot_vs_msg_length(title, 1000, exp) plot_slowdown(ax, exp, "p99", "%s P99" % (prot)) plot_slowdown(ax, exp, "p50", "%s P50" % (prot)) ax.legend() diff --git a/util/cp_tcp_config b/util/cp_tcp_config index cb8de9b8..3cdf6e70 100755 --- a/util/cp_tcp_config +++ b/util/cp_tcp_config @@ -111,7 +111,7 @@ for workload, bw in load_info: if options.tcp: title = "%s %d nodes, %.1f Gbps" % (workload.capitalize(), options.num_nodes, bw) - ax = start_slowdown_plot(title, 1000, + ax = start_plot_vs_msg_length(title, 1000, "tcp_%s_%s" % (specs[0]['exp_name'], workload), y_label="TCP Slowdown") for spec in specs: @@ -128,7 +128,7 @@ for workload, bw in load_info: if options.dctcp: title = "%s %d nodes, %.1f Gbps" % (workload.capitalize(), options.num_nodes, bw) - ax = start_slowdown_plot(title, 10000, + ax = start_plot_vs_msg_length(title, 10000, "dctcp_%s_%s" % (specs[0]['exp_name'], workload), y_label="DCTCP Slowdown") for spec in specs: diff --git a/util/cp_vs_tcp b/util/cp_vs_tcp index 2edc9933..76b26f1d 100755 --- a/util/cp_vs_tcp +++ b/util/cp_vs_tcp @@ -116,7 +116,7 @@ for workload, bw, seconds in load_info: log("Generating slowdown plot for %s" % (workload)) title = "%s %d nodes, %.1f Gbps" % (workload.capitalize(), options.num_nodes, bw) - ax = start_slowdown_plot(title, 1000, homa_exp) + ax = start_plot_vs_msg_length(title, 1000, homa_exp) if options.tcp: plot_slowdown(ax, tcp_exp, "p99", "TCP P99", color=tcp_color) plot_slowdown(ax, tcp_exp, "p50", "TCP P50", color=tcp_color2) @@ -127,7 +127,25 @@ for workload, bw, seconds in load_info: plot_slowdown(ax, homa_exp, "p50", "Homa P50", color=homa_color2) ax.legend(loc="upper right", prop={'size': 9}) plt.tight_layout() - plt.savefig("%s/reports/vs_tcp_%s.pdf" % (options.log_dir, workload)) + plt.savefig("%s/reports/slowdown_%s.pdf" % (options.log_dir, workload)) + + # Generate latency plot. + log("Generating RTT latency plot for %s" % (workload)) + title = "%s %d nodes, %.1f Gbps" % (workload.capitalize(), + options.num_nodes, bw) + ax = start_plot_vs_msg_length(title, [30, 30000], homa_exp, + y_label=r'RTT (µsec)') + if options.tcp: + plot_histogram(ax, tcp_exp, "p99", "TCP P99", color=tcp_color) + plot_histogram(ax, tcp_exp, "p50", "TCP P50", color=tcp_color2) + if options.dctcp: + plot_histogram(ax, dctcp_exp, "p99", "DCTCP P99", color=dctcp_color) + plot_histogram(ax, dctcp_exp, "p50", "DCTCP P50", color=dctcp_color2) + plot_histogram(ax, homa_exp, "p99", "Homa P99", color=homa_color) + plot_histogram(ax, homa_exp, "p50", "Homa P50", color=homa_color2) + ax.legend(loc="upper right", prop={'size': 9}) + plt.tight_layout() + plt.savefig("%s/reports/rtt_%s.pdf" % (options.log_dir, workload)) # Generate CDF of small-message RTTs. log("Generating short message CDF for %s" % (workload)) diff --git a/util/cperf.py b/util/cperf.py index 59446001..a91bc88e 100644 --- a/util/cperf.py +++ b/util/cperf.py @@ -1447,15 +1447,18 @@ def get_digest(experiment): digests[experiment] = digest return digest -def start_slowdown_plot(title, max_y, x_experiment, size=10, +def start_plot_vs_msg_length(title, y_range, x_experiment, size=10, show_top_label=True, show_bot_label=True, figsize=[6,4], y_label="Slowdown", show_upper_x_axis=True): """ - Create a pyplot graph that will be used for slowdown data. Returns the - Axes object for the plot. 
+    Create a pyplot graph that will be used to display some value as a
+    function of message size, with the x-axis scaled so that distance
+    corresponds to cumulative number of messages.
     title:           Title for the plot; may be empty
-    max_y:           Maximum y-coordinate
+    y_range:         Either a single value giving maximum y-coordinate
+                     (min will be 1) or a list containing min and max
+                     values. The y-axis will be log-scale.
     x_experiment:    Name of experiment whose rtt distribution will be used
                      to label the x-axis of the plot. None means don't
                      label the x-axis (caller will presumably invoke
                      cdf_xaxis to do it).
@@ -1473,11 +1476,16 @@ def start_slowdown_plot(title, max_y, x_experiment, size=10,
     ax.set_title(title, size=size)
     ax.set_xlim(0, 1.0)
     ax.set_yscale("log")
-    ax.set_ylim(1, max_y)
+    if isinstance(y_range, list):
+        min_y, max_y = y_range
+    else:
+        min_y = 1
+        max_y = y_range
+    ax.set_ylim(min_y, max_y)
     ax.tick_params(right=True, which="both", direction="in", length=5)
     ticks = []
     labels = []
-    y = 1
+    y = 10 ** (math.ceil(math.log10(min_y)))
     while y <= max_y:
         ticks.append(y)
         labels.append("%d" % (y))
@@ -1607,7 +1615,7 @@ def plot_slowdown(ax, experiment, percentile, label, **kwargs):
     ax:            matplotlib Axes object: info will be plotted here.
     experiment:    Name of the experiment whose data should be graphed.
-    percentile:    While percentile of slowdown to graph: must be "p50", "p99",
+    percentile:    Which percentile of slowdown to graph: must be "p50", "p99",
                    or "p999"
     label:         Text to display in the graph legend for this curve
     kwargs:        Additional keyword arguments to pass through to plt.plot
@@ -1628,6 +1636,23 @@ def plot_slowdown(ax, experiment, percentile, label, **kwargs):
                     % (percentile))
     ax.plot(x, y, label=label, **kwargs)
 
+def plot_histogram(ax, experiment, metric, label, **kwargs):
+    """
+    Add a histogram to a plot created by start_plot_vs_msg_length().
+
+    ax:            matplotlib Axes object: info will be plotted here.
+    experiment:    Name of the experiment whose data should be graphed.
+    metric:        Metric from experiment to graph, such as "p50" for 50th
+                   percentile latency or "slow_99" for 99th percentile
+                   slowdown
+    label:         Text to display in the graph legend for this curve
+    kwargs:        Additional keyword arguments to pass through to plt.plot
+    """
+    digest = get_digest(experiment)
+    x, y = make_histogram(digest["cum_frac"], digest[metric],
+            init=[0, digest[metric][0]], after=False)
+    ax.plot(x, y, label=label, **kwargs)
+
 def start_cdf_plot(title, min_x, max_x, min_y, x_label, y_label,
         figsize=[5, 4], size=10, xscale="log", yscale="log"):
     """

From 48236a74365f4d24e52a0822348a5c4943e91e61 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Wed, 17 Sep 2025 15:02:41 -0700
Subject: [PATCH 502/625] Use rculist to keep track of homa_qdisc_devs

Previous choice of a custom RCU array was nonstandard and unnecessarily
complex.
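
For reference, the rculist pattern adopted here is the standard kernel
idiom: writers serialize on a mutex and link/unlink with list_add_rcu()
and list_del_rcu(), readers traverse under rcu_read_lock() with
list_for_each_entry_rcu(), and objects are freed only after a grace
period. The sketch below is illustrative only (the demo_* names are
hypothetical, not Homa code), but it mirrors the structure of
homa_qdisc_qdev_get and homa_qdisc_qdev_put in this patch:

    #include <linux/mutex.h>
    #include <linux/rculist.h>
    #include <linux/refcount.h>
    #include <linux/slab.h>

    struct demo_dev {
            struct list_head links;         /* Linked into demo_devs. */
            struct rcu_head rcu;
            refcount_t refs;
            int id;
    };

    static LIST_HEAD(demo_devs);
    static DEFINE_MUTEX(demo_mutex);

    /* Writers serialize on demo_mutex; readers never block. */
    static void demo_add(struct demo_dev *d)
    {
            refcount_set(&d->refs, 1);
            mutex_lock(&demo_mutex);
            list_add_rcu(&d->links, &demo_devs);
            mutex_unlock(&demo_mutex);
    }

    static void demo_put(struct demo_dev *d)
    {
            if (!refcount_dec_and_test(&d->refs))
                    return;
            mutex_lock(&demo_mutex);
            list_del_rcu(&d->links);
            mutex_unlock(&demo_mutex);
            kfree_rcu(d, rcu);              /* Freed after readers drain. */
    }

    /* Lockless lookup: pin the object with a reference before leaving
     * the RCU read-side critical section.
     */
    static struct demo_dev *demo_get(int id)
    {
            struct demo_dev *d;

            rcu_read_lock();
            list_for_each_entry_rcu(d, &demo_devs, links) {
                    if (d->id == id && refcount_inc_not_zero(&d->refs)) {
                            rcu_read_unlock();
                            return d;
                    }
            }
            rcu_read_unlock();
            return NULL;
    }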
--- homa_devel.c | 7 ++-- homa_qdisc.c | 84 +++++++++++++----------------------------- homa_qdisc.h | 22 +++++------ test/unit_homa_qdisc.c | 72 ++++++++++-------------------------- 4 files changed, 60 insertions(+), 125 deletions(-) diff --git a/homa_devel.c b/homa_devel.c index 8bbd0972..64d3abde 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -1118,13 +1118,12 @@ void homa_rpc_stats_log(void) bool homa_rpcs_deferred(struct homa *homa) { struct homa_qdisc_qdevs *qdevs = homa->qdevs; + struct homa_qdisc_dev *qdev; bool result = false; - int num_qdevs, i; rcu_read_lock(); - num_qdevs = READ_ONCE(qdevs->num_qdevs); - for (i = 0; i < num_qdevs; i++) { - if (homa_qdisc_any_deferred(qdevs->qdevs[i])) { + list_for_each_entry_rcu(qdev, &qdevs->qdevs, links) { + if (homa_qdisc_any_deferred(qdev)) { result = true; break; } diff --git a/homa_qdisc.c b/homa_qdisc.c index 0fd5ebc3..9c5f667e 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -103,6 +103,7 @@ struct homa_qdisc_qdevs *homa_qdisc_qdevs_alloc(void) return ERR_PTR(-ENOMEM); mutex_init(&qdevs->mutex); + INIT_LIST_HEAD(&qdevs->qdevs); return qdevs; } @@ -115,29 +116,34 @@ struct homa_qdisc_qdevs *homa_qdisc_qdevs_alloc(void) void homa_qdisc_qdevs_free(struct homa_qdisc_qdevs *qdevs) { struct homa_qdisc_dev *qdev; + int stranded = 0; /* At this point this object no-one else besides us should * ever access this object again, but lock it just to be safe. */ mutex_lock(&qdevs->mutex); - if (qdevs->num_qdevs > 0) { - pr_err("homa_qdisc_devs_free found %d live qdevs (should have been none)\n", - qdevs->num_qdevs); + while (1) { + qdev = list_first_or_null_rcu(&qdevs->qdevs, + struct homa_qdisc_dev, links); + if (!qdev) + break; - /* We can't safely free the stranded qdevs, but at least - * stop their pacer threads to reduce the likelihood - * of derefernceing dangling pointers. + /* This code should never execute (all the qdevs should + * already have been deleted). We can't safely free the + * stranded qdevs, but at least stop their pacer threads to + * reduce the likelihood of dereferencing dangling pointers. */ - while (qdevs->num_qdevs > 0) { - qdevs->num_qdevs--; - qdev = qdevs->qdevs[qdevs->num_qdevs]; - qdevs->qdevs[qdevs->num_qdevs] = NULL; - kthread_stop(qdev->pacer_kthread); - qdev->pacer_kthread = NULL; - } + stranded++; + list_del_rcu(&qdev->links); + INIT_LIST_HEAD(&qdev->links); + kthread_stop(qdev->pacer_kthread); + qdev->pacer_kthread = NULL; } + + if (stranded != 0) + pr_err("homa_qdisc_devs_free found %d live qdevs (should have been none)\n", + stranded); mutex_unlock(&qdevs->mutex); - homa_rcu_kfree(qdevs->qdevs); homa_rcu_kfree(qdevs); } @@ -150,19 +156,14 @@ void homa_qdisc_qdevs_free(struct homa_qdisc_qdevs *qdevs) */ struct homa_qdisc_dev *homa_qdisc_qdev_get(struct net_device *dev) { - struct homa_qdisc_dev **new_qdevs, **old_qdevs; struct homa_qdisc_qdevs *qdevs; struct homa_qdisc_dev *qdev; struct homa_net *hnet; - int num_qdevs, i; rcu_read_lock(); hnet = homa_net(dev_net(dev)); qdevs = hnet->homa->qdevs; - qdevs = homa_net(dev_net(dev))->homa->qdevs; - num_qdevs = READ_ONCE(qdevs->num_qdevs); - for (i = 0; i < num_qdevs; i++) { - qdev = READ_ONCE(qdevs->qdevs[i]); + list_for_each_entry_rcu(qdev, &qdevs->qdevs, links) { if (qdev->dev == dev && refcount_inc_not_zero(&qdev->refs)) { rcu_read_unlock(); return qdev; @@ -175,8 +176,7 @@ struct homa_qdisc_dev *homa_qdisc_qdev_get(struct net_device *dev) * created it). 
*/ mutex_lock(&qdevs->mutex); - for (i = 0; i < qdevs->num_qdevs; i++) { - qdev = READ_ONCE(qdevs->qdevs[i]); + list_for_each_entry_rcu(qdev, &qdevs->qdevs, links) { if (qdev->dev == dev && refcount_inc_not_zero(&qdev->refs)) { UNIT_LOG("; ", "race in homa_qdisc_qdev_get"); goto done; @@ -212,18 +212,7 @@ struct homa_qdisc_dev *homa_qdisc_qdev_get(struct net_device *dev) qdev = ERR_PTR(error); goto done; } - - /* Note: the order below matters, because there could be concurrent - * scans of qdevs->qdevs. - */ - new_qdevs = kzalloc((qdevs->num_qdevs + 1) * sizeof(*new_qdevs), - GFP_KERNEL); - old_qdevs = qdevs->qdevs; - memcpy(new_qdevs, old_qdevs, qdevs->num_qdevs * sizeof(*new_qdevs)); - new_qdevs[qdevs->num_qdevs] = qdev; - smp_store_release(&qdevs->qdevs, new_qdevs); - qdevs->num_qdevs++; - homa_rcu_kfree(old_qdevs); + list_add_rcu(&qdev->links, &qdevs->qdevs); done: mutex_unlock(&qdevs->mutex); @@ -238,7 +227,6 @@ struct homa_qdisc_dev *homa_qdisc_qdev_get(struct net_device *dev) void homa_qdisc_qdev_put(struct homa_qdisc_dev *qdev) { struct homa_qdisc_qdevs *qdevs; - int i; if (!refcount_dec_and_test(&qdev->refs)) return; @@ -249,17 +237,7 @@ void homa_qdisc_qdev_put(struct homa_qdisc_dev *qdev) */ qdevs = qdev->hnet->homa->qdevs; mutex_lock(&qdevs->mutex); - for (i = 0; i < qdevs->num_qdevs; i++) { - if (qdevs->qdevs[i] == qdev) - break; - } - if (i < qdevs->num_qdevs) { - WRITE_ONCE(qdevs->qdevs[i], qdevs->qdevs[qdevs->num_qdevs - 1]); - smp_store_release(&qdevs->num_qdevs, qdevs->num_qdevs - 1); - } else { - pr_err("homa_qdisc_qdev_put couldn't find qdev to delete\n"); - } - + list_del_rcu(&qdev->links); kthread_stop(qdev->pacer_kthread); qdev->pacer_kthread = NULL; call_rcu(&qdev->rcu_head, homa_qdisc_dev_callback); @@ -825,18 +803,13 @@ int homa_qdisc_redirect_skb(struct sk_buff *skb, * homa_qdisc_devs to check. */ void homa_qdisc_pacer_check(struct homa *homa) { - struct homa_qdisc_qdevs *qdevs; struct homa_qdisc_dev *qdev; u64 now = homa_clock(); - int num_qdevs, i; int max_cycles; max_cycles = homa->pacer->max_nic_queue_cycles; - qdevs = homa->qdevs; rcu_read_lock(); - num_qdevs = READ_ONCE(qdevs->num_qdevs); - for (i = 0; i < num_qdevs; i++) { - qdev = qdevs->qdevs[i]; + list_for_each_entry_rcu(qdev, &homa->qdevs->qdevs, links) { if (!homa_qdisc_any_deferred(qdev)) continue; @@ -904,15 +877,10 @@ void homa_qdisc_update_sysctl(struct homa_qdisc_dev *qdev) */ void homa_qdisc_update_all_sysctl(struct homa_net *hnet) { - struct homa_qdisc_qdevs *qdevs; struct homa_qdisc_dev *qdev; - int num_qdevs, i; rcu_read_lock(); - qdevs = hnet->homa->qdevs; - num_qdevs = READ_ONCE(qdevs->num_qdevs); - for (i = 0; i < num_qdevs; i++) { - qdev = qdevs->qdevs[i]; + list_for_each_entry_rcu(qdev, &hnet->homa->qdevs->qdevs, links) { if (qdev->hnet != hnet) continue; homa_qdisc_update_sysctl(qdev); diff --git a/homa_qdisc.h b/homa_qdisc.h index 20ab8974..2ce0e89a 100644 --- a/homa_qdisc.h +++ b/homa_qdisc.h @@ -98,6 +98,12 @@ struct homa_qdisc_dev { */ int cycles_per_mibyte; + /** + * @links: Used to link this object into the qdevs list in a + * homa_qdisc_qdevs struct. + */ + struct list_head links; + /** * @link_idle_time: The time, measured by homa_clock, at which we * estimate that all of the packets passed to @dev will have been @@ -105,9 +111,6 @@ struct homa_qdisc_dev { */ atomic64_t link_idle_time __aligned(L1_CACHE_BYTES); - /** @links: Used to link this struct into homa->qdisc_devs. 
*/ - struct list_head links; - /** * @deferred_rpcs: Contains all homa_rpc's with deferred packets, in * SRPT order. @@ -177,15 +180,11 @@ struct homa_qdisc_qdevs { */ struct mutex mutex; - /** @num_devs: Number of entries currently in use in @qdevs. */ - int num_qdevs; - /** - * @qdevs: Pointers to all homa_qdisc_devs that exist for this - * struct homa. Scan and/or retrieve pointers using RCU. Storage - * for this is dynamically allocated, must be kfreed. + * @qdevs: RCU list of all homa_qdisc_devs that currently + * exist for this struct homa. */ - struct homa_qdisc_dev **qdevs; + struct list_head qdevs; }; /** @@ -243,7 +242,8 @@ void homa_rcu_kfree_callback(struct rcu_head *head); */ static inline bool homa_qdisc_active(struct homa *homa) { - return homa->qdevs->num_qdevs > 0; + return list_first_or_null_rcu(&homa->qdevs->qdevs, + struct homa_qdisc_dev, links) != NULL; } /** diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c index 46ef2369..b0017b95 100644 --- a/test/unit_homa_qdisc.c +++ b/test/unit_homa_qdisc.c @@ -197,8 +197,7 @@ TEST_F(homa_qdisc, homa_qdisc_qdevs_alloc__success) qdevs = homa_qdisc_qdevs_alloc(); ASSERT_FALSE(IS_ERR(qdevs)); - EXPECT_EQ(0, qdevs->num_qdevs); - EXPECT_EQ(NULL, qdevs->qdevs); + EXPECT_EQ(0, unit_list_length(&qdevs->qdevs)); kfree(qdevs); } TEST_F(homa_qdisc, homa_qdisc_qdevs_alloc__kmalloc_failure) @@ -230,7 +229,7 @@ TEST_F(homa_qdisc, homa_qdisc_qdevs_free__unfreed_qdevs) saved_qdevs = self->homa.qdevs; self->homa.qdevs = qdevs; qdev = homa_qdisc_qdev_get(self->dev); - EXPECT_EQ(1, qdevs->num_qdevs); + EXPECT_EQ(1, unit_list_length(&qdevs->qdevs)); self->homa.qdevs = saved_qdevs; mock_printk_output[0] = 0; homa_qdisc_qdevs_free(qdevs); @@ -246,7 +245,7 @@ TEST_F(homa_qdisc, homa_qdisc_qdev_get__basics) qdev = homa_qdisc_qdev_get(self->dev); EXPECT_FALSE(IS_ERR(qdev)); EXPECT_EQ(1, refcount_read(&qdev->refs)); - EXPECT_EQ(1, self->homa.qdevs->num_qdevs); + EXPECT_EQ(1, unit_list_length(&self->homa.qdevs->qdevs)); homa_qdisc_qdev_put(qdev); } @@ -254,12 +253,14 @@ TEST_F(homa_qdisc, homa_qdisc_get__use_existing) { struct homa_qdisc_dev *qdev, *qdev2; - + /* Arrange for the desired qdev not to be first on this list, to + * exercise list traversal. 
+ */ + qdev = homa_qdisc_qdev_get(self->dev); qdev2 = homa_qdisc_qdev_get(mock_dev(1, &self->homa)); - qdev = homa_qdisc_qdev_get(self->dev); EXPECT_FALSE(IS_ERR(qdev)); - EXPECT_EQ(2, self->homa.qdevs->num_qdevs); + EXPECT_EQ(2, unit_list_length(&self->homa.qdevs->qdevs)); EXPECT_EQ(1, refcount_read(&qdev->refs)); EXPECT_EQ(qdev, homa_qdisc_qdev_get(self->dev)); @@ -279,7 +280,7 @@ TEST_F(homa_qdisc, homa_qdisc_qdev_get__race_when_creating) unit_log_clear(); qdev = homa_qdisc_qdev_get(self->dev); EXPECT_FALSE(IS_ERR(qdev)); - EXPECT_EQ(1, self->homa.qdevs->num_qdevs); + EXPECT_EQ(1, unit_list_length(&self->homa.qdevs->qdevs)); EXPECT_EQ(2, refcount_read(&qdev->refs)); EXPECT_SUBSTR("race in homa_qdisc_qdev_get", unit_log_get()); @@ -304,30 +305,8 @@ TEST_F(homa_qdisc, homa_qdisc_qdev_get__cant_create_thread) EXPECT_TRUE(IS_ERR(qdev)); EXPECT_EQ(EACCES, -PTR_ERR(qdev)); } -TEST_F(homa_qdisc, homa_qdisc_get__fillin_qdevs_array) -{ - struct homa_qdisc_dev *qdev1, *qdev2, *qdev3; - qdev1 = homa_qdisc_qdev_get(self->dev); - qdev2 = homa_qdisc_qdev_get(mock_dev(1, &self->homa)); - qdev3 = homa_qdisc_qdev_get(mock_dev(2, &self->homa)); - - EXPECT_EQ(3, self->homa.qdevs->num_qdevs); - EXPECT_EQ(1, refcount_read(&qdev1->refs)); - EXPECT_EQ(qdev1, self->homa.qdevs->qdevs[0]); - EXPECT_EQ(qdev2, self->homa.qdevs->qdevs[1]); - EXPECT_EQ(qdev3, self->homa.qdevs->qdevs[2]); - - EXPECT_EQ(qdev3, homa_qdisc_qdev_get(mock_dev(2, &self->homa))); - EXPECT_EQ(2, refcount_read(&qdev3->refs)); - - homa_qdisc_qdev_put(qdev1); - homa_qdisc_qdev_put(qdev2); - homa_qdisc_qdev_put(qdev3); - homa_qdisc_qdev_put(qdev3); -} - -TEST_F(homa_qdisc, homa_qdisc_qdev_put__basics) +TEST_F(homa_qdisc, homa_qdisc_qdev_put) { struct homa_qdisc_dev *qdev1, *qdev2, *qdev3; @@ -344,27 +323,15 @@ TEST_F(homa_qdisc, homa_qdisc_qdev_put__basics) /* First call: refcount doesn't hit zero. */ homa_qdisc_qdev_put(qdev2); EXPECT_EQ(1, refcount_read(&qdev2->refs)); - EXPECT_EQ(3, self->homa.qdevs->num_qdevs); + EXPECT_EQ(3, unit_list_length(&self->homa.qdevs->qdevs)); /* Second call: refcount hits zero. */ homa_qdisc_qdev_put(qdev2); - EXPECT_EQ(2, self->homa.qdevs->num_qdevs); - EXPECT_EQ(qdev3, self->homa.qdevs->qdevs[1]); + EXPECT_EQ(2, unit_list_length(&self->homa.qdevs->qdevs)); homa_qdisc_qdev_put(qdev3); homa_qdisc_qdev_put(qdev1); - EXPECT_EQ(0, self->homa.qdevs->num_qdevs); -} -TEST_F(homa_qdisc, homa_qdisc_qdev_put__cant_find_qdev_in_array) -{ - struct homa_qdisc_dev *qdev; - - qdev = homa_qdisc_qdev_get(self->dev); - self->homa.qdevs->num_qdevs = 0; - mock_printk_output[0] = 0; - homa_qdisc_qdev_put(qdev); - EXPECT_STREQ("homa_qdisc_qdev_put couldn't find qdev to delete", - mock_printk_output); + EXPECT_EQ(0, unit_list_length(&self->homa.qdevs->qdevs)); } TEST_F(homa_qdisc, homa_qdisc_dev_callback) @@ -391,7 +358,7 @@ TEST_F(homa_qdisc, homa_qdisc_dev_callback) /* If skbs aren't freed, test infrastructure will complain. 
*/ homa_qdisc_qdev_put(qdev); - EXPECT_EQ(0, self->homa.qdevs->num_qdevs); + EXPECT_EQ(0, unit_list_length(&self->homa.qdevs->qdevs)); } TEST_F(homa_qdisc, homa_qdisc_init__basics) @@ -401,7 +368,8 @@ TEST_F(homa_qdisc, homa_qdisc_init__basics) struct homa_qdisc *q; EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); - qdev = self->homa.qdevs->qdevs[0]; + qdev = list_first_or_null_rcu(&self->homa.qdevs->qdevs, + struct homa_qdisc_dev, links); ASSERT_NE(NULL, qdev); EXPECT_EQ(1, refcount_read(&qdev->refs)); EXPECT_EQ(10000, qdev->link_mbps); @@ -417,8 +385,7 @@ TEST_F(homa_qdisc, homa_qdisc_init__cant_create_new_qdisc_dev) mock_kmalloc_errors = 1; EXPECT_EQ(ENOMEM, -homa_qdisc_init(qdisc, NULL, NULL)); - EXPECT_EQ(0, self->homa.qdevs->num_qdevs); - EXPECT_EQ(NULL, self->homa.qdevs->qdevs); + EXPECT_EQ(0, unit_list_length(&self->homa.qdevs->qdevs)); kfree(qdisc); } TEST_F(homa_qdisc, homa_qdisc_init__set_qix) @@ -442,7 +409,8 @@ TEST_F(homa_qdisc, homa_qdisc_destroy) EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); qdisc2 = mock_alloc_qdisc(&mock_net_queue); EXPECT_EQ(0, homa_qdisc_init(qdisc2, NULL, NULL)); - qdev = self->homa.qdevs->qdevs[0]; + qdev = list_first_or_null_rcu(&self->homa.qdevs->qdevs, + struct homa_qdisc_dev, links); EXPECT_NE(NULL, qdev); EXPECT_EQ(2, refcount_read(&qdev->refs)); @@ -450,7 +418,7 @@ TEST_F(homa_qdisc, homa_qdisc_destroy) EXPECT_EQ(1, refcount_read(&qdev->refs)); homa_qdisc_destroy(qdisc); - EXPECT_EQ(0, self->homa.qdevs->num_qdevs); + EXPECT_EQ(0, unit_list_length(&self->homa.qdevs->qdevs)); kfree(qdisc); kfree(qdisc2); } From f878a5d43144cbf50c5193ae483c9c5e4448759a Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 18 Sep 2025 09:25:56 -0700 Subject: [PATCH 503/625] Fix bugs in ttoffset.py Didn't handle negative times properly. 
--- util/ttoffset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util/ttoffset.py b/util/ttoffset.py index ea2b51aa..f3a0fdb4 100755 --- a/util/ttoffset.py +++ b/util/ttoffset.py @@ -33,9 +33,9 @@ delta = float(sys.argv[2]) - float(sys.argv[1]) for line in f: - match = re.match(' *([0-9.]+) us (.*)', line) + match = re.match(' *(-[0-9.]+) us (.*)', line) if not match: - print(line) + print(line, end='') continue time = float(match.group(1)) print("%9.3f us %s" % (time + delta, match.group(2))) \ No newline at end of file From 3e46c502d9ed54301f90cb16cfb76feeb63acf18 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 18 Sep 2025 09:31:28 -0700 Subject: [PATCH 504/625] Fix bug in cp_vs_tcp (was using wrong nodes for configuring TCP) --- util/cp_vs_tcp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/util/cp_vs_tcp b/util/cp_vs_tcp index 76b26f1d..e9372e59 100755 --- a/util/cp_vs_tcp +++ b/util/cp_vs_tcp @@ -80,14 +80,14 @@ if not options.plot_only: if options.tcp: options.protocol = "tcp" set_sysctl_parameter("net.ipv4.tcp_congestion_control", - "cubic", range(0, options.num_nodes)) + "cubic", options.nodes) start_servers(tcp_exp, options.servers, options) run_experiment(tcp_exp, options.clients, options) if options.dctcp: options.protocol = "tcp" set_sysctl_parameter("net.ipv4.tcp_congestion_control", - "dctcp", range(0, options.num_nodes)) + "dctcp", options.nodes) start_servers(tcp_exp, options.servers, options) run_experiment(dctcp_exp, options.clients, options) except Exception as e: @@ -96,7 +96,7 @@ if not options.plot_only: if options.tcp or options.dctcp: print("Resetting TCP congestion control to %s" % (congestion)) set_sysctl_parameter("net.ipv4.tcp_congestion_control", congestion, - range(0, options.num_nodes)) + options.nodes) log("Stopping nodes") stop_nodes() scan_logs() From 0cefbdd317c5af7446001cf34da53987680158e0 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 18 Sep 2025 09:57:50 -0700 Subject: [PATCH 505/625] Update perf.txt with measurements of CPU utilization --- perf.txt | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/perf.txt b/perf.txt index 0512079e..5ffcb608 100644 --- a/perf.txt +++ b/perf.txt @@ -2,6 +2,22 @@ This file contains various notes and lessons learned concerning performance of the Homa Linux kernel module. The notes are in reverse chronological order. +63. (September 2025) Compared CPU utilization against TCP. Measured with +top, running cp_vs_tcp -w w4 -b20 on a 6-node xl170 cluster (20 cores): + + Homa TCP Homa no polling +us (user) 9.8 15.7 11.0 +sy (system) 31.5 11.8 17.3 +ni (nice) 0.0 0.0 0.0 +id (idle) 38.0 49.2 51.9 +wa (iowait) 0.0 0.0 0.0 +hi (hardware interrupts) 0.0 0.0 0.0 +si (software interrupts) 19.3 22.2 19.5 +st (hypervisor steal) 0.0 0.0 0.0 + +Without polling, Homa's CPU utilization is slightly lower than TCP's. +Polling costs an extra 2-3 cores for Homa. + 62. (August 2025) Using ktime_get_ns (rdtscp) instead of get_cycles (rdtsc) in homa_clock (Linux reviewers won't allow get_cycles for upstreaming). rdtscp takes about 14 ns per call, vs. 8 for ktime_get_ns. Running "w4 -b20" From 008bb8af7d666af1f34cdc12e5e51d3eaaa09975 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 18 Sep 2025 09:58:15 -0700 Subject: [PATCH 506/625] Revise pipelining mechanism in homa_message_out_fill New approach works with both the old pacer and homa_qdisc. 
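
The resulting control flow in homa_message_out_fill is sketched below
(an illustrative outline only, not the patch itself; fill_one_skb() is
a hypothetical stand-in for the skb-filling code in the real loop):

    /* Copy the message from user space one skb at a time. While the
     * copied prefix is still within the unscheduled region, transmit
     * immediately so that transmission overlaps copying; packets beyond
     * the unscheduled region are transmitted later by SoftIRQ as grants
     * arrive, so this core can keep copying while another core sends.
     */
    while (bytes_left > 0) {
            offset += fill_one_skb(rpc, iter, offset);      /* hypothetical */
            bytes_left = rpc->msgout.length - offset;
            if (offset < rpc->msgout.unscheduled && xmit)
                    homa_xmit_data(rpc, false);
    }
    if (xmit)
            homa_xmit_data(rpc, false);     /* Send anything still pending. */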
---
 homa_outgoing.c           | 29 +++++++++++++++--------------
 test/unit_homa_outgoing.c | 32 ++++++++------------------------
 2 files changed, 23 insertions(+), 38 deletions(-)

diff --git a/homa_outgoing.c b/homa_outgoing.c
index b53fad69..22944207 100644
--- a/homa_outgoing.c
+++ b/homa_outgoing.c
@@ -262,7 +262,6 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit)
 	struct sk_buff **last_link;
 	struct dst_entry *dst;
 	u64 segs_per_gso;
-	IF_NO_STRIP(int overlap_xmit);
 	/* Bytes of the message that haven't yet been copied into skbs. */
 	int bytes_left;
 	int gso_size;
@@ -320,9 +319,6 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit)
 		   mtu, max_seg_data, max_gso_data);
 #ifndef __STRIP__ /* See strip.py */
-	overlap_xmit = rpc->msgout.length > 2 * max_gso_data;
-	if (homa_qdisc_active(rpc->hsk->homa))
-		overlap_xmit = 0;
 	rpc->msgout.granted = rpc->msgout.unscheduled;
 #endif /* See strip.py */
 	homa_skb_stash_pages(rpc->hsk->homa, rpc->msgout.length);
@@ -378,23 +374,28 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit)
 		rpc->msgout.copied_from_user = rpc->msgout.length - bytes_left;
 		rpc->msgout.first_not_tx = rpc->msgout.packets;
 #ifndef __STRIP__ /* See strip.py */
-		if (overlap_xmit && list_empty(&rpc->throttled_links) &&
-		    xmit && offset < rpc->msgout.granted) {
-			tt_record1("waking up pacer for id %d", rpc->id);
-			homa_pacer_manage_rpc(rpc);
-		}
+		/* The code below improves pipelining for long messages
+		 * by overlapping transmission with copying from user space.
+		 * This is a bit tricky because sending the packets takes
+		 * a significant amount of time. On high-speed networks (e.g.
+		 * 100 Gbps and above), copying from user space is the
+		 * bottleneck, so transmitting the packets here will slow
+		 * that down. Thus, we only transmit the unscheduled packets
+		 * here, to fill the pipe. Packets after that can be
+		 * transmitted by SoftIRQ in response to incoming grants;
+		 * this allows us to use two cores: this core copying data
+		 * and the SoftIRQ core sending packets.
+ */ + if (offset < rpc->msgout.unscheduled && xmit) + homa_xmit_data(rpc, false); #endif /* See strip.py */ } tt_record2("finished copy from user space for id %d, length %d", rpc->id, rpc->msgout.length); INC_METRIC(sent_msg_bytes, rpc->msgout.length); refcount_add(rpc->msgout.skb_memory, &rpc->hsk->sock.sk_wmem_alloc); -#ifndef __STRIP__ /* See strip.py */ - if (!overlap_xmit && xmit) + if (xmit) homa_xmit_data(rpc, false); -#else /* See strip.py */ - homa_xmit_data(rpc); -#endif /* See strip.py */ return 0; error: diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index e5ec0403..489eb61c 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -584,38 +584,22 @@ TEST_F(homa_outgoing, homa_message_out_fill__rpc_freed_during_copy) homa_rpc_unlock(crpc); } #ifndef __STRIP__ /* See strip.py */ -TEST_F(homa_outgoing, homa_message_out_fill__add_to_throttled) +TEST_F(homa_outgoing, homa_message_out_fill__xmit_packets) { struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); ASSERT_FALSE(crpc == NULL); + self->homa.unsched_bytes = 2800; ASSERT_EQ(0, -homa_message_out_fill(crpc, unit_iov_iter((void *) 1000, 5000), 1)); homa_rpc_unlock(crpc); - unit_log_clear(); - unit_log_filled_skbs(crpc->msgout.packets, 0); - EXPECT_STREQ("DATA 1400@0; DATA 1400@1400; DATA 1400@2800; " - "DATA 800@4200", - unit_log_get()); - unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("request id 2, next_offset 0", - unit_log_get()); -} -TEST_F(homa_outgoing, homa_message_out_fill__too_short_for_pipelining) -{ - struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, - &self->server_addr); - - ASSERT_FALSE(crpc == NULL); - ASSERT_EQ(0, -homa_message_out_fill(crpc, - unit_iov_iter((void *) 1000, 1000), 1)); - homa_rpc_unlock(crpc); - EXPECT_SUBSTR("xmit DATA 1000@0", unit_log_get()); - unit_log_clear(); - unit_log_throttled(&self->homa); - EXPECT_STREQ("", unit_log_get()); + EXPECT_SUBSTR(" _copy_from_iter 1400 bytes at 1000; " + "xmit DATA 1400@0; " + "_copy_from_iter 1400 bytes at 2400; " + "xmit DATA 1400@1400; " + "_copy_from_iter 1400 bytes at 3800; " + "_copy_from_iter 800 bytes at 5200", unit_log_get()); } #endif /* See strip.py */ TEST_F(homa_outgoing, homa_message_out_fill__packet_memory_accounting) From dbe806cfda4dac711301cb12a33af1348301dfc2 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 18 Sep 2025 11:36:08 -0700 Subject: [PATCH 507/625] Fix errors in metrics.py when previous data didn't have a symbol --- util/metrics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/util/metrics.py b/util/metrics.py index b3b5aa37..ffc481e9 100755 --- a/util/metrics.py +++ b/util/metrics.py @@ -123,7 +123,8 @@ def scale_number(number): total_cur += core[symbol] total_prev = 0 for core in prev: - total_prev += core[symbol] + if symbol in core: + total_prev += core[symbol] delta = total_cur - total_prev deltas[symbol] = delta From 7ff44766da8f8a1947adb7ba4b91ef7b27c49787 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 18 Sep 2025 11:43:12 -0700 Subject: [PATCH 508/625] Update notes.txt --- notes.txt | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/notes.txt b/notes.txt index 2f01b12e..badfc361 100755 --- a/notes.txt +++ b/notes.txt @@ -4,6 +4,9 @@ Notes for Homa implementation in Linux: * Move interest cleanup code from homa_sock to a new function in homa_interest. Also move wakeup code from homa_rpc_handoff. +* Find a way to reduce pacer core utilization? 
It currently takes about 0.7
+  core when running at high load. Maybe use polling threads instead?
+
 * Use skb_attempt_defer_free once it has been properly exported.
 
 * Thoughts on making TCP and Homa play better together:
@@ -79,10 +82,6 @@ Notes for Homa implementation in Linux:
 * Use vmap to map the user-space buffer pool so that the kernel
   can use memcpy rather than copy_to_user?
 
-* SoftIRQ processing can lock out kernel-to-user copies; add a preemption
-  mechanism where the copying code can set a flag that it needs the lock,
-  then SoftIRQ releases the lock until the flag is clear?
-
 * For W3, throttle_min_bytes is a problem: a significant fraction of all
   transmitted bytes aren't being counted; as a result, the NIC queue can
   build up. Reducing throttle_min_bytes from 1000 to 200 reduced P99

From 867d4740296e11c41af1096f8b43aef2e143104a Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Wed, 1 Oct 2025 12:57:37 -0700
Subject: [PATCH 509/625] Fix issues with ethtool settings in cloudlab/bin/config

* If rx-frames wasn't supported, rx-usecs didn't get set.
* rx-usecs was only being set to 5, instead of 0.
---
 cloudlab/bin/config | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/cloudlab/bin/config b/cloudlab/bin/config
index 02eb11ca..423c7d23 100755
--- a/cloudlab/bin/config
+++ b/cloudlab/bin/config
@@ -585,10 +585,14 @@ def config_nic():
     mechanisms).
     """
     interface = get_interfaces()[0]
+
+    # Use a separate ethtool command for each parameter. Otherwise,
+    # if one parameter isn't supported the command will be aborted,
+    # so no parameters will get set.
     exec_cmd(["sudo", "ethtool", "-C", interface, "adaptive-rx", "off"],
             check=False)
-    exec_cmd(["sudo", "ethtool", "-C", interface, "rx-usecs", "5",
-            "rx-frames", "1"], check=False)
+    exec_cmd(["sudo", "ethtool", "-C", interface, "rx-usecs", "0"], check=False)
+    exec_cmd(["sudo", "ethtool", "-C", interface, "rx-frames", "1"], check=False)
 
 def config_power():
     """

From 82b6b88931b87db09d561579a086636d5b87027a Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Thu, 2 Oct 2025 11:48:50 -0700
Subject: [PATCH 510/625] Set doff field in control packets

If this field contains garbage Intel E810-C NICs will drop outgoing
packets.
---
 homa_outgoing.c |  1 +
 homa_wire.h     | 11 +++++++----
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/homa_outgoing.c b/homa_outgoing.c
index 22944207..ddb7c85a 100644
--- a/homa_outgoing.c
+++ b/homa_outgoing.c
@@ -429,6 +429,7 @@ int homa_xmit_control(enum homa_packet_type type, void *contents,
 #ifndef __STRIP__ /* See strip.py */
 	h->flags = HOMA_TCP_FLAGS;
 	h->urgent = htons(HOMA_TCP_URGENT);
+	h->doff = 0x50;
 #endif /* See strip.py */
 	h->sender_id = cpu_to_be64(rpc->id);
 	return __homa_xmit_control(contents, length, rpc->peer, rpc->hsk);
diff --git a/homa_wire.h b/homa_wire.h
index 871b740e..eeca2b3d 100644
--- a/homa_wire.h
+++ b/homa_wire.h
@@ -126,10 +126,13 @@ struct homa_common_hdr {
 	u8 type;
 
 	/**
-	 * @doff: High order 4 bits holds the number of 4-byte chunks in a
-	 * homa_data_hdr (low-order bits unused). Used only for DATA packets;
-	 * must be in the same position as the data offset in a TCP header.
-	 * Used by TSO to determine where the replicated header portion ends.
+	 * @doff: High order 4 bits correspond to the Data Offset field of a
+	 * TCP header. In DATA packets they hold the number of 4-byte chunks
+	 * in a homa_data_hdr; used by TSO to determine where the replicated
+	 * header portion ends.
For other packets the offset is always 5 + * (standard TCP header length); other values may cause some NICs + * (such as Intel E810-C) to drop outgoing packets when TCP hijacking + * is enabled. The low-order bits are always 0. */ u8 doff; From 5d07d3ea26db7c3f5127925ba20bd94674b9030f Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 3 Oct 2025 08:56:48 -0700 Subject: [PATCH 511/625] Update tthoma.py to support new traces for Intel NICs --- util/tthoma.py | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index 9212ada0..11c8d685 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -1178,31 +1178,31 @@ def __qdisc_queue_data(self, trace, time, core, match, interests): 'id ([0-9]+), offset ([0-9]+), qid ([0-9]+) \(([^)]+)\)' }) - def __mlx_data(self, trace, time, core, match, interests): - peer = match.group(1) - id = int(match.group(2)) - offset = int(match.group(3)) - tx_queue = match.group(4) + def __nic_data(self, trace, time, core, match, interests): + peer = match.group(2) + id = int(match.group(3)) + offset = int(match.group(4)) + tx_queue = match.group(5) for interest in interests: - interest.tt_mlx_data(trace, time, core, peer, id, offset, tx_queue) + interest.tt_nic_data(trace, time, core, peer, id, offset, tx_queue) patterns.append({ - 'name': 'mlx_data', - 'regexp': 'mlx sent homa data packet to ([^,]+), id ([0-9]+), ' + 'name': 'nic_data', + 'regexp': '(mlx|ice) sent homa data packet to ([^,]+), id ([0-9]+), ' 'offset ([0-9]+), queue (0x[0-9a-f]+)' }) - def __mlx_grant(self, trace, time, core, match, interests): - peer = match.group(1) - id = int(match.group(2)) - offset = int(match.group(3)) - tx_queue = match.group(4) + def __nic_grant(self, trace, time, core, match, interests): + peer = match.group(2) + id = int(match.group(3)) + offset = int(match.group(4)) + tx_queue = match.group(5) for interest in interests: - interest.tt_mlx_grant(trace, time, core, peer, id, offset, tx_queue) + interest.tt_nic_grant(trace, time, core, peer, id, offset, tx_queue) patterns.append({ - 'name': 'mlx_grant', - 'regexp': 'mlx sent homa grant to ([^,]+), id ([0-9]+), ' + 'name': 'nic_grant', + 'regexp': '(mlx|ice) sent homa grant to ([^,]+), id ([0-9]+), ' 'offset ([0-9]+), queue (0x[0-9a-f]+)' }) @@ -2949,7 +2949,8 @@ def print_worst(label, data): break time, delay, node = data[i] result += '%-26s %6.1f %9.3f %10s %5.1f\n' % ( - label, time, delay, node, 100*i/(num-1)) + label, time, delay, node, + 100*i/(num-1) if num > 1 else 100) return result verbose += print_worst('GRO to SoftIRQ', soft) @@ -6235,7 +6236,7 @@ def tt_ip_xmit(self, trace, t, core, id, offset): else: p['retransmits'][-1]['xmit'] = t - def tt_mlx_data(self, trace, t, core, peer, id, offset, tx_queue): + def tt_nic_data(self, trace, t, core, peer, id, offset, tx_queue): global packets p = packets[pkt_id(id, offset)] p['tx_node'] = trace['node'] @@ -6340,7 +6341,7 @@ def tt_send_grant(self, trace, t, core, id, offset, priority, increment): g['tx_node'] = trace['node'] g['increment'] = increment - def tt_mlx_grant(self, trace, t, core, peer, id, offset, tx_queue): + def tt_nic_grant(self, trace, t, core, peer, id, offset, tx_queue): global grants g = grants[pkt_id(id, offset)] g['nic'] = t From c51ec5cffc77691e26ccb368c819580b36e7451a Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 3 Oct 2025 08:57:51 -0700 Subject: [PATCH 512/625] Fix bug in previous commit for ttoffset.py That commit was incorrect and broke the 
application
---
 util/ttoffset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/util/ttoffset.py b/util/ttoffset.py
index f3a0fdb4..017140b4 100755
--- a/util/ttoffset.py
+++ b/util/ttoffset.py
@@ -33,7 +33,7 @@
 delta = float(sys.argv[2]) - float(sys.argv[1])
 
 for line in f:
-    match = re.match(' *(-[0-9.]+) us (.*)', line)
+    match = re.match(' *([-0-9.]+) us (.*)', line)
     if not match:
         print(line, end='')
         continue

From 41ac37625845290a53879619f87cb9994873d457 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 3 Oct 2025 09:00:49 -0700
Subject: [PATCH 513/625] Add test/rbtree.c

Needed for unit tests of modules that use rbtrees; should have been
added a while ago.
---
 test/rbtree.c | 597 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 597 insertions(+)
 create mode 100644 test/rbtree.c

diff --git a/test/rbtree.c b/test/rbtree.c
new file mode 100644
index 00000000..9e730718
--- /dev/null
+++ b/test/rbtree.c
@@ -0,0 +1,597 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+  Red Black Trees
+  (C) 1999  Andrea Arcangeli <andrea@suse.de>
+  (C) 2002  David Woodhouse <dwmw2@infradead.org>
+  (C) 2012  Michel Lespinasse <walken@google.com>
+
+
+  linux/lib/rbtree.c
+*/
+
+#include <linux/rbtree_augmented.h>
+#include <linux/export.h>
+
+/*
+ * red-black trees properties:  https://en.wikipedia.org/wiki/Rbtree
+ *
+ *  1) A node is either red or black
+ *  2) The root is black
+ *  3) All leaves (NULL) are black
+ *  4) Both children of every red node are black
+ *  5) Every simple path from root to leaves contains the same number
+ *     of black nodes.
+ *
+ * 4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two
+ * consecutive red nodes in a path and every red node is therefore followed by
+ * a black. So if B is the number of black nodes on every simple path (as per
+ * 5), then the longest possible path due to 4 is 2B.
+ *
+ * We shall indicate color with case, where black nodes are uppercase and red
+ * nodes will be lowercase. Unknown color nodes shall be drawn as red within
+ * parentheses and have some accompanying text comment.
+ */
+
+/*
+ * Notes on lockless lookups:
+ *
+ * All stores to the tree structure (rb_left and rb_right) must be done using
+ * WRITE_ONCE(). And we must not inadvertently cause (temporary) loops in the
+ * tree structure as seen in program order.
+ *
+ * These two requirements will allow lockless iteration of the tree -- not
+ * correct iteration mind you, tree rotations are not atomic so a lookup might
+ * miss entire subtrees.
+ *
+ * But they do guarantee that any such traversal will only see valid elements
+ * and that it will indeed complete -- does not get stuck in a loop.
+ *
+ * It also guarantees that if the lookup returns an element it is the 'correct'
+ * one. But not returning an element does _NOT_ mean it's not present.
+ *
+ * NOTE:
+ *
+ * Stores to __rb_parent_color are not important for simple lookups so those
+ * are left undone as of now. Nor did I check for loops involving parent
+ * pointers.
+ */
+
+static inline void rb_set_black(struct rb_node *rb)
+{
+	rb->__rb_parent_color += RB_BLACK;
+}
+
+static inline struct rb_node *rb_red_parent(struct rb_node *red)
+{
+	return (struct rb_node *)red->__rb_parent_color;
+}
+
+/*
+ * Helper function for rotations:
+ * - old's parent and color get assigned to new
+ * - old gets assigned new as a parent and 'color' as a color.
+ */ +static inline void +__rb_rotate_set_parents(struct rb_node *old, struct rb_node *new, + struct rb_root *root, int color) +{ + struct rb_node *parent = rb_parent(old); + new->__rb_parent_color = old->__rb_parent_color; + rb_set_parent_color(old, new, color); + __rb_change_child(old, new, parent, root); +} + +static __always_inline void +__rb_insert(struct rb_node *node, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) +{ + struct rb_node *parent = rb_red_parent(node), *gparent, *tmp; + + while (true) { + /* + * Loop invariant: node is red. + */ + if (unlikely(!parent)) { + /* + * The inserted node is root. Either this is the + * first node, or we recursed at Case 1 below and + * are no longer violating 4). + */ + rb_set_parent_color(node, NULL, RB_BLACK); + break; + } + + /* + * If there is a black parent, we are done. + * Otherwise, take some corrective action as, + * per 4), we don't want a red root or two + * consecutive red nodes. + */ + if(rb_is_black(parent)) + break; + + gparent = rb_red_parent(parent); + + tmp = gparent->rb_right; + if (parent != tmp) { /* parent == gparent->rb_left */ + if (tmp && rb_is_red(tmp)) { + /* + * Case 1 - node's uncle is red (color flips). + * + * G g + * / \ / \ + * p u --> P U + * / / + * n n + * + * However, since g's parent might be red, and + * 4) does not allow this, we need to recurse + * at g. + */ + rb_set_parent_color(tmp, gparent, RB_BLACK); + rb_set_parent_color(parent, gparent, RB_BLACK); + node = gparent; + parent = rb_parent(node); + rb_set_parent_color(node, parent, RB_RED); + continue; + } + + tmp = parent->rb_right; + if (node == tmp) { + /* + * Case 2 - node's uncle is black and node is + * the parent's right child (left rotate at parent). + * + * G G + * / \ / \ + * p U --> n U + * \ / + * n p + * + * This still leaves us in violation of 4), the + * continuation into Case 3 will fix that. + */ + tmp = node->rb_left; + WRITE_ONCE(parent->rb_right, tmp); + WRITE_ONCE(node->rb_left, parent); + if (tmp) + rb_set_parent_color(tmp, parent, + RB_BLACK); + rb_set_parent_color(parent, node, RB_RED); + augment_rotate(parent, node); + parent = node; + tmp = node->rb_right; + } + + /* + * Case 3 - node's uncle is black and node is + * the parent's left child (right rotate at gparent). 
+ * + * G P + * / \ / \ + * p U --> n g + * / \ + * n U + */ + WRITE_ONCE(gparent->rb_left, tmp); /* == parent->rb_right */ + WRITE_ONCE(parent->rb_right, gparent); + if (tmp) + rb_set_parent_color(tmp, gparent, RB_BLACK); + __rb_rotate_set_parents(gparent, parent, root, RB_RED); + augment_rotate(gparent, parent); + break; + } else { + tmp = gparent->rb_left; + if (tmp && rb_is_red(tmp)) { + /* Case 1 - color flips */ + rb_set_parent_color(tmp, gparent, RB_BLACK); + rb_set_parent_color(parent, gparent, RB_BLACK); + node = gparent; + parent = rb_parent(node); + rb_set_parent_color(node, parent, RB_RED); + continue; + } + + tmp = parent->rb_left; + if (node == tmp) { + /* Case 2 - right rotate at parent */ + tmp = node->rb_right; + WRITE_ONCE(parent->rb_left, tmp); + WRITE_ONCE(node->rb_right, parent); + if (tmp) + rb_set_parent_color(tmp, parent, + RB_BLACK); + rb_set_parent_color(parent, node, RB_RED); + augment_rotate(parent, node); + parent = node; + tmp = node->rb_left; + } + + /* Case 3 - left rotate at gparent */ + WRITE_ONCE(gparent->rb_right, tmp); /* == parent->rb_left */ + WRITE_ONCE(parent->rb_left, gparent); + if (tmp) + rb_set_parent_color(tmp, gparent, RB_BLACK); + __rb_rotate_set_parents(gparent, parent, root, RB_RED); + augment_rotate(gparent, parent); + break; + } + } +} + +/* + * Inline version for rb_erase() use - we want to be able to inline + * and eliminate the dummy_rotate callback there + */ +static __always_inline void +____rb_erase_color(struct rb_node *parent, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) +{ + struct rb_node *node = NULL, *sibling, *tmp1, *tmp2; + + while (true) { + /* + * Loop invariants: + * - node is black (or NULL on first iteration) + * - node is not the root (parent is not NULL) + * - All leaf paths going through parent and node have a + * black node count that is 1 lower than other leaf paths. + */ + sibling = parent->rb_right; + if (node != sibling) { /* node == parent->rb_left */ + if (rb_is_red(sibling)) { + /* + * Case 1 - left rotate at parent + * + * P S + * / \ / \ + * N s --> p Sr + * / \ / \ + * Sl Sr N Sl + */ + tmp1 = sibling->rb_left; + WRITE_ONCE(parent->rb_right, tmp1); + WRITE_ONCE(sibling->rb_left, parent); + rb_set_parent_color(tmp1, parent, RB_BLACK); + __rb_rotate_set_parents(parent, sibling, root, + RB_RED); + augment_rotate(parent, sibling); + sibling = tmp1; + } + tmp1 = sibling->rb_right; + if (!tmp1 || rb_is_black(tmp1)) { + tmp2 = sibling->rb_left; + if (!tmp2 || rb_is_black(tmp2)) { + /* + * Case 2 - sibling color flip + * (p could be either color here) + * + * (p) (p) + * / \ / \ + * N S --> N s + * / \ / \ + * Sl Sr Sl Sr + * + * This leaves us violating 5) which + * can be fixed by flipping p to black + * if it was red, or by recursing at p. + * p is red when coming from Case 1. + */ + rb_set_parent_color(sibling, parent, + RB_RED); + if (rb_is_red(parent)) + rb_set_black(parent); + else { + node = parent; + parent = rb_parent(node); + if (parent) + continue; + } + break; + } + /* + * Case 3 - right rotate at sibling + * (p could be either color here) + * + * (p) (p) + * / \ / \ + * N S --> N sl + * / \ \ + * sl Sr S + * \ + * Sr + * + * Note: p might be red, and then both + * p and sl are red after rotation(which + * breaks property 4). 
This is fixed in + * Case 4 (in __rb_rotate_set_parents() + * which set sl the color of p + * and set p RB_BLACK) + * + * (p) (sl) + * / \ / \ + * N sl --> P S + * \ / \ + * S N Sr + * \ + * Sr + */ + tmp1 = tmp2->rb_right; + WRITE_ONCE(sibling->rb_left, tmp1); + WRITE_ONCE(tmp2->rb_right, sibling); + WRITE_ONCE(parent->rb_right, tmp2); + if (tmp1) + rb_set_parent_color(tmp1, sibling, + RB_BLACK); + augment_rotate(sibling, tmp2); + tmp1 = sibling; + sibling = tmp2; + } + /* + * Case 4 - left rotate at parent + color flips + * (p and sl could be either color here. + * After rotation, p becomes black, s acquires + * p's color, and sl keeps its color) + * + * (p) (s) + * / \ / \ + * N S --> P Sr + * / \ / \ + * (sl) sr N (sl) + */ + tmp2 = sibling->rb_left; + WRITE_ONCE(parent->rb_right, tmp2); + WRITE_ONCE(sibling->rb_left, parent); + rb_set_parent_color(tmp1, sibling, RB_BLACK); + if (tmp2) + rb_set_parent(tmp2, parent); + __rb_rotate_set_parents(parent, sibling, root, + RB_BLACK); + augment_rotate(parent, sibling); + break; + } else { + sibling = parent->rb_left; + if (rb_is_red(sibling)) { + /* Case 1 - right rotate at parent */ + tmp1 = sibling->rb_right; + WRITE_ONCE(parent->rb_left, tmp1); + WRITE_ONCE(sibling->rb_right, parent); + rb_set_parent_color(tmp1, parent, RB_BLACK); + __rb_rotate_set_parents(parent, sibling, root, + RB_RED); + augment_rotate(parent, sibling); + sibling = tmp1; + } + tmp1 = sibling->rb_left; + if (!tmp1 || rb_is_black(tmp1)) { + tmp2 = sibling->rb_right; + if (!tmp2 || rb_is_black(tmp2)) { + /* Case 2 - sibling color flip */ + rb_set_parent_color(sibling, parent, + RB_RED); + if (rb_is_red(parent)) + rb_set_black(parent); + else { + node = parent; + parent = rb_parent(node); + if (parent) + continue; + } + break; + } + /* Case 3 - left rotate at sibling */ + tmp1 = tmp2->rb_left; + WRITE_ONCE(sibling->rb_right, tmp1); + WRITE_ONCE(tmp2->rb_left, sibling); + WRITE_ONCE(parent->rb_left, tmp2); + if (tmp1) + rb_set_parent_color(tmp1, sibling, + RB_BLACK); + augment_rotate(sibling, tmp2); + tmp1 = sibling; + sibling = tmp2; + } + /* Case 4 - right rotate at parent + color flips */ + tmp2 = sibling->rb_right; + WRITE_ONCE(parent->rb_left, tmp2); + WRITE_ONCE(sibling->rb_right, parent); + rb_set_parent_color(tmp1, sibling, RB_BLACK); + if (tmp2) + rb_set_parent(tmp2, parent); + __rb_rotate_set_parents(parent, sibling, root, + RB_BLACK); + augment_rotate(parent, sibling); + break; + } + } +} + +/* Non-inline version for rb_erase_augmented() use */ +void __rb_erase_color(struct rb_node *parent, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) +{ + ____rb_erase_color(parent, root, augment_rotate); +} + +/* + * Non-augmented rbtree manipulation functions. + * + * We use dummy augmented callbacks here, and have the compiler optimize them + * out of the rb_insert_color() and rb_erase() function definitions. 
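+ * (Both __rb_insert() and ____rb_erase_color() are declared
+ * __always_inline above, so with the dummy callbacks the rotate
+ * calls should compile away entirely.)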
+ */ + +static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {} +static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {} +static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {} + +static const struct rb_augment_callbacks dummy_callbacks = { + .propagate = dummy_propagate, + .copy = dummy_copy, + .rotate = dummy_rotate +}; + +void rb_insert_color(struct rb_node *node, struct rb_root *root) +{ + __rb_insert(node, root, dummy_rotate); +} + +void rb_erase(struct rb_node *node, struct rb_root *root) +{ + struct rb_node *rebalance; + rebalance = __rb_erase_augmented(node, root, &dummy_callbacks); + if (rebalance) + ____rb_erase_color(rebalance, root, dummy_rotate); +} + +/* + * Augmented rbtree manipulation functions. + * + * This instantiates the same __always_inline functions as in the non-augmented + * case, but this time with user-defined callbacks. + */ + +void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) +{ + __rb_insert(node, root, augment_rotate); +} + +/* + * This function returns the first node (in sort order) of the tree. + */ +struct rb_node *rb_first(const struct rb_root *root) +{ + struct rb_node *n; + + n = root->rb_node; + if (!n) + return NULL; + while (n->rb_left) + n = n->rb_left; + return n; +} + +struct rb_node *rb_last(const struct rb_root *root) +{ + struct rb_node *n; + + n = root->rb_node; + if (!n) + return NULL; + while (n->rb_right) + n = n->rb_right; + return n; +} + +struct rb_node *rb_next(const struct rb_node *node) +{ + struct rb_node *parent; + + if (RB_EMPTY_NODE(node)) + return NULL; + + /* + * If we have a right-hand child, go down and then left as far + * as we can. + */ + if (node->rb_right) { + node = node->rb_right; + while (node->rb_left) + node = node->rb_left; + return (struct rb_node *)node; + } + + /* + * No right-hand children. Everything down and left is smaller than us, + * so any 'next' node must be in the general direction of our parent. + * Go up the tree; any time the ancestor is a right-hand child of its + * parent, keep going up. First time it's a left-hand child of its + * parent, said parent is our 'next' node. + */ + while ((parent = rb_parent(node)) && node == parent->rb_right) + node = parent; + + return parent; +} + +struct rb_node *rb_prev(const struct rb_node *node) +{ + struct rb_node *parent; + + if (RB_EMPTY_NODE(node)) + return NULL; + + /* + * If we have a left-hand child, go down and then right as far + * as we can. + */ + if (node->rb_left) { + node = node->rb_left; + while (node->rb_right) + node = node->rb_right; + return (struct rb_node *)node; + } + + /* + * No left-hand children. Go up till we find an ancestor which + * is a right-hand child of its parent. 
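+ * This is the mirror image of the rb_next() case above: any 'prev'
+ * node must lie in the general direction of our parent.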
+ */ + while ((parent = rb_parent(node)) && node == parent->rb_left) + node = parent; + + return parent; +} + +void rb_replace_node(struct rb_node *victim, struct rb_node *new, + struct rb_root *root) +{ + struct rb_node *parent = rb_parent(victim); + + /* Copy the pointers/colour from the victim to the replacement */ + *new = *victim; + + /* Set the surrounding nodes to point to the replacement */ + if (victim->rb_left) + rb_set_parent(victim->rb_left, new); + if (victim->rb_right) + rb_set_parent(victim->rb_right, new); + __rb_change_child(victim, new, parent, root); +} + +static struct rb_node *rb_left_deepest_node(const struct rb_node *node) +{ + for (;;) { + if (node->rb_left) + node = node->rb_left; + else if (node->rb_right) + node = node->rb_right; + else + return (struct rb_node *)node; + } +} + +struct rb_node *rb_next_postorder(const struct rb_node *node) +{ + const struct rb_node *parent; + if (!node) + return NULL; + parent = rb_parent(node); + + /* If we're sitting on node, we've already seen our children */ + if (parent && node == parent->rb_left && parent->rb_right) { + /* If we are the parent's left node, go to the parent's right + * node then all the way down to the left */ + return rb_left_deepest_node(parent->rb_right); + } else + /* Otherwise we are the parent's right node, and the parent + * should be next */ + return (struct rb_node *)parent; +} + +struct rb_node *rb_first_postorder(const struct rb_root *root) +{ + if (!root->rb_node) + return NULL; + + return rb_left_deepest_node(root->rb_node); +} From b75d4f6cc6ca76185a0d4eeae80efbfb0b828efc Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 7 Oct 2025 14:28:24 -0700 Subject: [PATCH 514/625] Fix bug in homa_resend_data Wasn't incrementing num_skbs to account for resend skbs added to to_free list. --- homa_outgoing.c | 1 + homa_rpc.h | 2 +- test/unit_homa_outgoing.c | 3 ++- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/homa_outgoing.c b/homa_outgoing.c index ddb7c85a..bc9296b5 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -836,6 +836,7 @@ void homa_resend_data(struct homa_rpc *rpc, int start, int end) new_homa_info->rpc = rpc; rpc->msgout.to_free = new_skb; + rpc->msgout.num_skbs++; skb_get(new_skb); tt_record3("retransmitting offset %d, length %d, id %d", offset, seg_length, rpc->id); diff --git a/homa_rpc.h b/homa_rpc.h index 5fb52f63..0c2eac84 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -27,7 +27,7 @@ struct homa_message_out { */ int length; - /** @num_skbs: Total number of buffers currently in @packets. */ + /** @num_skbs: Total number of buffers currently in @to_free. 
*/ int num_skbs; /** diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 489eb61c..39a5f387 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -1169,9 +1169,9 @@ TEST_F(homa_outgoing, homa_resend_data__error_copying_data) #endif /* See strip.py */ TEST_F(homa_outgoing, homa_resend_data__add_to_to_free_and_set_homa_info) { + struct homa_skb_info *homa_info; struct homa_rpc *crpc; struct sk_buff *skb; - struct homa_skb_info *homa_info; mock_set_ipv6(&self->hsk); self->dev->gso_max_size = 5000; @@ -1190,6 +1190,7 @@ TEST_F(homa_outgoing, homa_resend_data__add_to_to_free_and_set_homa_info) EXPECT_EQ(8400, homa_info->offset); EXPECT_EQ(crpc, homa_info->rpc); EXPECT_EQ(1, refcount_read(&skb->users)); + EXPECT_EQ(6, crpc->msgout.num_skbs); } TEST_F(homa_outgoing, homa_rpc_tx_end) From cd7c22aa797a62b3911d69eda5d824bba074e441 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 7 Oct 2025 15:14:52 -0700 Subject: [PATCH 515/625] Rework metrics for homa_qdisc * Added some new metrics, plus new "Pacer" section in homa_metrics.py * Removed pacer_lost_cycles: didn't seem to be meaningful. * Renamed throttled_cycles to nic_backlog_cycles. * Also removed all STRIP ifdefs from homa_pacer.c --- homa_metrics.c | 22 ++-- homa_metrics.h | 41 ++++--- homa_pacer.c | 50 ++------- homa_pacer.h | 9 -- homa_qdisc.c | 47 ++++----- homa_qdisc.h | 11 +- test/mock.c | 1 + test/unit_homa_pacer.c | 78 ++++---------- test/unit_homa_qdisc.c | 235 +++++++++++++++++++++-------------------- util/metrics.py | 50 +++++---- 10 files changed, 248 insertions(+), 296 deletions(-) diff --git a/homa_metrics.c b/homa_metrics.c index 22e31aeb..5eba0607 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -289,22 +289,24 @@ char *homa_metrics_print(void) "Time in homa_data_pkt spent reaping RPCs\n"); M("idle_time_conflicts", m->idle_time_conflicts, "Cache conflicts when updating link_idle_time\n"); + M("nic_backlog_cycles", m->nic_backlog_cycles, + "Time when NIC queue was backlogged\n"); M("pacer_cycles", m->pacer_cycles, - "Time spent in homa_pacer_main\n"); + "Execution time in pacer thread\n"); + M("pacer_xmit_cycles", m->pacer_xmit_cycles, + "Time pacer spent xmitting packets (vs. 
polling NIC queue)\n");
+	M("pacer_packets", m->pacer_packets,
+	  "Packets transmitted by the pacer\n");
+	M("pacer_bytes", m->pacer_bytes,
+	  "Bytes transmitted by the pacer (including headers)\n");
+	M("pacer_help_bytes", m->pacer_help_bytes,
+	  "Bytes transmitted via homa_qdisc_pacer_check\n");
 	M("homa_cycles", m->softirq_cycles + m->napi_cycles +
 	  m->send_cycles + m->recv_cycles + m->reply_cycles -
 	  m->blocked_cycles +
-	  m->timer_cycles + m->pacer_cycles,
+	  m->timer_cycles + m->nic_backlog_cycles,
 	  "Total time in all Homa-related functions\n");
-	M("pacer_lost_cycles", m->pacer_lost_cycles,
-	  "Lost transmission time because pacer was slow\n");
-	M("pacer_bytes", m->pacer_bytes,
-	  "Bytes transmitted when the pacer was active\n");
-	M("pacer_needed_help", m->pacer_needed_help,
-	  "homa_pacer_xmit invocations from homa_check_pacer\n");
-	M("throttled_cycles", m->throttled_cycles,
-	  "Time when output was throttled because NIC was backlogged\n");
 	M("resent_packets", m->resent_packets,
 	  "DATA packets sent in response to RESENDs\n");
 	M("peer_allocs", m->peer_allocs,
diff --git a/homa_metrics.h b/homa_metrics.h
index 2b9e12d0..f2d722e5 100644
--- a/homa_metrics.h
+++ b/homa_metrics.h
@@ -375,36 +375,45 @@ struct homa_metrics {
 	__u64 idle_time_conflicts;
 
 	/**
-	 * @pacer_cycles: total time spent executing in homa_pacer_main
-	 * (not including blocked time).
+	 * @nic_backlog_cycles: total amount of time when there were packets
+	 * waiting to be transmitted in homa_qdisc because the NIC queue was
+	 * too long.
+	 */
+	u64 nic_backlog_cycles;
+
+	/**
+	 * @pacer_cycles: total execution time in the pacer thread (excluding
+	 * blocked time).
 	 */
 	u64 pacer_cycles;
 
 	/**
-	 * @pacer_lost_cycles: unnecessary delays in transmitting packets
-	 * (i.e. wasted output bandwidth) because the pacer was slow or got
-	 * descheduled.
+	 * @pacer_xmit_cycles: total time spent by the pacer actually
+	 * transmitting packets (as opposed to polling waiting for the
+	 * NIC queue to subside).
 	 */
-	u64 pacer_lost_cycles;
+	u64 pacer_xmit_cycles;
 
 	/**
-	 * @pacer_bytes: total number of bytes transmitted when
-	 * @homa->throttled_rpcs is nonempty.
+	 * @pacer_packets: total number of Homa packets that were transmitted
+	 * by homa_qdisc_pacer (they were deferred because of NIC queue
+	 * overload).
 	 */
-	u64 pacer_bytes;
+	u64 pacer_packets;
 
 	/**
-	 * @pacer_needed_help: total number of times that homa_check_pacer
-	 * found that the pacer was running behind, so it actually invoked
-	 * homa_pacer_xmit.
+	 * @pacer_bytes: total number of bytes in packets that were
+	 * transmitted by homa_qdisc_pacer (they were deferred because of
+	 * NIC queue overload).
 	 */
-	u64 pacer_needed_help;
+	u64 pacer_bytes;
 
 	/**
-	 * @throttled_cycles: total amount of time that @homa->throttled_rpcs
-	 * is nonempty.
+	 * @pacer_help_bytes: bytes in @pacer_bytes that were transmitted via
+	 * calls to homa_qdisc_pacer_check (presumably because the pacer thread
+	 * wasn't keeping up). Includes header bytes.
 	 */
-	u64 throttled_cycles;
+	u64 pacer_help_bytes;
 
 	/**
 	 * @resent_packets: total number of data packets issued in response to
diff --git a/homa_pacer.c b/homa_pacer.c
index 28bde94d..ea1239fe 100644
--- a/homa_pacer.c
+++ b/homa_pacer.c
@@ -9,7 +9,6 @@
 #include "homa_pacer.h"
 #include "homa_rpc.h"
 
-#ifndef __STRIP__ /* See strip.py */
 /* Used to enable sysctl access to pacer-specific configuration parameters. The
  * @data fields are actually offsets within a struct homa_pacer; these are
  * converted to pointers into a net-specific struct homa later.
@@ -38,7 +37,6 @@ static struct ctl_table pacer_ctl_table[] = { .proc_handler = homa_pacer_dointvec }, }; -#endif /* See strip.py */ /** * homa_pacer_alloc() - Allocate and initialize a new pacer object, which @@ -71,7 +69,6 @@ struct homa_pacer *homa_pacer_alloc(struct homa *homa) } atomic64_set(&pacer->link_idle_time, homa_clock()); -#ifndef __STRIP__ /* See strip.py */ pacer->sysctl_header = register_net_sysctl(&init_net, "net/homa", pacer_ctl_table); if (!pacer->sysctl_header) { @@ -79,7 +76,6 @@ struct homa_pacer *homa_pacer_alloc(struct homa *homa) pr_err("couldn't register sysctl parameters for Homa pacer\n"); goto error; } -#endif /* See strip.py */ homa_pacer_update_sysctl_deps(pacer); return pacer; @@ -96,12 +92,10 @@ struct homa_pacer *homa_pacer_alloc(struct homa *homa) */ void homa_pacer_free(struct homa_pacer *pacer) { -#ifndef __STRIP__ /* See strip.py */ if (pacer->sysctl_header) { unregister_net_sysctl_table(pacer->sysctl_header); pacer->sysctl_header = NULL; } -#endif /* See strip.py */ if (pacer->kthread) { kthread_stop(pacer->kthread); pacer->kthread = NULL; @@ -141,27 +135,12 @@ int homa_pacer_check_nic_q(struct homa_pacer *pacer, struct sk_buff *skb, if ((clock + pacer->max_nic_queue_cycles) < idle && !force && !(pacer->homa->flags & HOMA_FLAG_DONT_THROTTLE)) return 0; -#ifndef __STRIP__ /* See strip.py */ if (!list_empty(&pacer->throttled_rpcs)) INC_METRIC(pacer_bytes, bytes); - if (idle < clock) { - if (pacer->wake_time) { - u64 lost = (pacer->wake_time > idle) - ? clock - pacer->wake_time - : clock - idle; - INC_METRIC(pacer_lost_cycles, lost); - tt_record1("pacer lost %d cycles", lost); - } - new_idle = clock + cycles_for_packet; - } else { - new_idle = idle + cycles_for_packet; - } -#else /* See strip.py */ if (idle < clock) new_idle = clock + cycles_for_packet; else new_idle = idle + cycles_for_packet; -#endif /* See strip.py */ /* This method must be thread-safe. */ if (atomic64_cmpxchg_relaxed(&pacer->link_idle_time, idle, @@ -180,15 +159,15 @@ int homa_pacer_check_nic_q(struct homa_pacer *pacer, struct sk_buff *skb, int homa_pacer_main(void *arg) { struct homa_pacer *pacer = arg; + u64 wake_time; int status; while (1) { if (kthread_should_stop()) break; - pacer->wake_time = homa_clock(); + wake_time = homa_clock(); homa_pacer_xmit(pacer); - INC_METRIC(pacer_cycles, homa_clock() - pacer->wake_time); - pacer->wake_time = 0; + INC_METRIC(pacer_cycles, homa_clock() - wake_time); if (!list_empty(&pacer->throttled_rpcs)) { /* NIC queue is full; before calling pacer again, * give other threads a chance to run (otherwise @@ -285,12 +264,8 @@ void homa_pacer_xmit(struct homa_pacer *pacer) /* Note: rpc->state could be RPC_DEAD here, but the code * below should work anyway. */ -#ifndef __STRIP__ /* See strip.py */ if (!*rpc->msgout.next_xmit || rpc->msgout.next_xmit_offset >= rpc->msgout.granted) { -#else /* See strip.py */ - if (!*rpc->msgout.next_xmit) { -#endif /* See strip.py */ /* No more data can be transmitted from this message * (right now), so remove it from the throttled list. 
*/ @@ -317,25 +292,22 @@ void homa_pacer_manage_rpc(struct homa_rpc *rpc) struct homa_pacer *pacer = rpc->hsk->homa->pacer; struct homa_rpc *candidate; int bytes_left; - - IF_NO_STRIP(int checks = 0); - IF_NO_STRIP(u64 now); + int checks = 0; + u64 now; if (!list_empty(&rpc->throttled_links)) return; - IF_NO_STRIP(now = homa_clock()); -#ifndef __STRIP__ /* See strip.py */ + now = homa_clock(); if (!list_empty(&pacer->throttled_rpcs)) - INC_METRIC(throttled_cycles, now - pacer->throttle_add); + INC_METRIC(nic_backlog_cycles, now - pacer->throttle_add); pacer->throttle_add = now; -#endif /* See strip.py */ bytes_left = rpc->msgout.length - rpc->msgout.next_xmit_offset; homa_pacer_throttle_lock(pacer); list_for_each_entry(candidate, &pacer->throttled_rpcs, throttled_links) { int bytes_left_cand; - IF_NO_STRIP(checks++); + checks++; /* Watch out: the pacer might have just transmitted the last * packet from candidate. @@ -371,11 +343,9 @@ void homa_pacer_unmanage_rpc(struct homa_rpc *rpc) UNIT_LOG("; ", "removing id %llu from throttled list", rpc->id); homa_pacer_throttle_lock(pacer); list_del_init(&rpc->throttled_links); -#ifndef __STRIP__ /* See strip.py */ if (list_empty(&pacer->throttled_rpcs)) - INC_METRIC(throttled_cycles, homa_clock() + INC_METRIC(nic_backlog_cycles, homa_clock() - pacer->throttle_add); -#endif /* See strip.py */ homa_pacer_throttle_unlock(pacer); } } @@ -399,7 +369,6 @@ void homa_pacer_update_sysctl_deps(struct homa_pacer *pacer) pacer->cycles_per_mbyte = tmp; } -#ifndef __STRIP__ /* See strip.py */ /** * homa_pacer_dointvec() - This function is a wrapper around proc_dointvec. It * is invoked to read and write pacer-related sysctl values. @@ -481,4 +450,3 @@ void homa_pacer_throttle_lock_slow(struct homa_pacer *pacer) INC_METRIC(throttle_lock_misses, 1); INC_METRIC(throttle_lock_miss_cycles, homa_clock() - start); } -#endif /* See strip.py */ diff --git a/homa_pacer.h b/homa_pacer.h index d3fab9db..08975e5b 100644 --- a/homa_pacer.h +++ b/homa_pacer.h @@ -35,12 +35,6 @@ struct homa_pacer { */ int fifo_count; - /** - * @wake_time: homa_clock() time when the pacer woke up (if the pacer - * is running) or 0 if the pacer is sleeping. - */ - u64 wake_time; - /** * @throttle_lock: Used to synchronize access to @throttled_rpcs. Must * hold when inserting or removing an RPC from throttled_rpcs. @@ -54,13 +48,11 @@ struct homa_pacer { */ struct list_head throttled_rpcs; -#ifndef __STRIP__ /* See strip.py */ /** * @throttle_add: The most recent homa_clock() time when an RPC was * added to @throttled_rpcs. 
*/ u64 throttle_add; -#endif /* See strip.py */ /** * @fifo_fraction: Out of every 1000 packets transmitted by the @@ -168,7 +160,6 @@ static inline void homa_pacer_check(struct homa_pacer *pacer) return; tt_record("homa_check_pacer calling homa_pacer_xmit"); homa_pacer_xmit(pacer); - INC_METRIC(pacer_needed_help, 1); } #ifndef __STRIP__ /* See strip.py */ diff --git a/homa_qdisc.c b/homa_qdisc.c index 9c5f667e..542870bd 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -448,12 +448,10 @@ void homa_qdisc_defer_homa(struct homa_qdisc_dev *qdev, struct sk_buff *skb) { struct homa_skb_info *info = homa_get_skb_info(skb); struct homa_rpc *rpc = info->rpc; - struct rb_node *prev_deferred; u64 now = homa_clock(); unsigned long flags; spin_lock_irqsave(&qdev->defer_lock, flags); - prev_deferred = rb_first_cached(&qdev->deferred_rpcs); __skb_queue_tail(&rpc->qrpc.packets, skb); if (skb_queue_len(&rpc->qrpc.packets) == 1) { int bytes_left; @@ -463,8 +461,8 @@ void homa_qdisc_defer_homa(struct homa_qdisc_dev *qdev, struct sk_buff *skb) rpc->qrpc.tx_left = bytes_left; homa_qdisc_insert_rb(qdev, rpc); } - if (prev_deferred) - INC_METRIC(throttled_cycles, now - qdev->last_defer); + if (qdev->last_defer) + INC_METRIC(nic_backlog_cycles, now - qdev->last_defer); else wake_up(&qdev->pacer_sleep); qdev->last_defer = now; @@ -541,8 +539,10 @@ struct sk_buff *homa_qdisc_dequeue_homa(struct homa_qdisc_dev *qdev) if (bytes_left < qrpc->tx_left) qrpc->tx_left = bytes_left; - if (!rb_first_cached(&qdev->deferred_rpcs)) - INC_METRIC(throttled_cycles, homa_clock() - qdev->last_defer); + if (!homa_qdisc_any_deferred(qdev)) { + INC_METRIC(nic_backlog_cycles, homa_clock() - qdev->last_defer); + qdev->last_defer = 0; + } spin_unlock_irqrestore(&qdev->defer_lock, flags); return skb; } @@ -597,13 +597,6 @@ int homa_qdisc_update_link_idle(struct homa_qdisc_dev *qdev, int bytes, clock = homa_clock(); idle = atomic64_read(&qdev->link_idle_time); if (idle < clock) { - if (qdev->pacer_wake_time) { - u64 lost = (qdev->pacer_wake_time > idle) - ? 
clock - qdev->pacer_wake_time - : clock - idle; - INC_METRIC(pacer_lost_cycles, lost); - tt_record1("homa_qdisc pacer lost %d cycles", lost); - } new_idle = clock + cycles_for_packet; } else { if (max_queue_cycles >= 0 && (idle - clock) > @@ -617,8 +610,6 @@ int homa_qdisc_update_link_idle(struct homa_qdisc_dev *qdev, int bytes, break; INC_METRIC(idle_time_conflicts, 1); } - if (rb_first_cached(&qdev->deferred_rpcs)) - INC_METRIC(pacer_bytes, bytes); return 1; } @@ -632,17 +623,14 @@ int homa_qdisc_update_link_idle(struct homa_qdisc_dev *qdev, int bytes, int homa_qdisc_pacer_main(void *device) { struct homa_qdisc_dev *qdev = device; + u64 wake_time; int status; - u64 start; + wake_time = homa_clock(); while (1) { if (kthread_should_stop()) break; - start = homa_clock(); - qdev->pacer_wake_time = start; - homa_qdisc_pacer(qdev); - qdev->pacer_wake_time = 0; - INC_METRIC(pacer_cycles, homa_clock() - start); + homa_qdisc_pacer(qdev, false); if (homa_qdisc_any_deferred(qdev)) { /* There are more packets to transmit (the NIC queue @@ -656,8 +644,10 @@ int homa_qdisc_pacer_main(void *device) } tt_record("homa_qdisc pacer sleeping"); + INC_METRIC(pacer_cycles, homa_clock() - wake_time); status = wait_event_interruptible(qdev->pacer_sleep, kthread_should_stop() || homa_qdisc_any_deferred(qdev)); + wake_time = homa_clock(); tt_record1("homa_qdisc pacer woke up with status %d", status); if (status != 0 && status != -ERESTARTSYS) break; @@ -680,8 +670,11 @@ int homa_qdisc_pacer_main(void *device) * invocations are not guaranteed to happen, so the pacer thread provides a * backstop. * @qdev: The device on which to transmit. + * @help: True means this function was invoked from homa_qdisc_pacer_check + * rather than homa_qdisc_pacer_main (indicating that the pacer + * thread wasn't keeping up and needs help). */ -void homa_qdisc_pacer(struct homa_qdisc_dev *qdev) +void homa_qdisc_pacer(struct homa_qdisc_dev *qdev, bool help) { int i; @@ -721,10 +714,16 @@ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev) * but we transmit anyway (don't want this thread to get * starved by others). */ + UNIT_HOOK("pacer_xmit"); skb = homa_qdisc_dequeue_homa(qdev); if (!skb) break; + INC_METRIC(pacer_packets, 1); + INC_METRIC(pacer_bytes, qdisc_skb_cb(skb)->pkt_len); + if (help) + INC_METRIC(pacer_help_bytes, + qdisc_skb_cb(skb)->pkt_len); homa_qdisc_update_link_idle(qdev, qdisc_skb_cb(skb)->pkt_len, -1); h = (struct homa_data_hdr *)skb_transport_header(skb); @@ -732,6 +731,7 @@ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev) be64_to_cpu(h->common.sender_id), ntohl(h->seg.offset), qdev->pacer_qix); homa_qdisc_redirect_skb(skb, qdev, true); + INC_METRIC(pacer_xmit_cycles, homa_clock() - now); } done: spin_unlock_bh(&qdev->pacer_mutex); @@ -822,8 +822,7 @@ void homa_qdisc_pacer_check(struct homa *homa) { atomic64_read(&qdev->link_idle_time)) continue; tt_record("homa_qdisc_pacer_check calling homa_qdisc_pacer"); - homa_qdisc_pacer(qdev); - INC_METRIC(pacer_needed_help, 1); + homa_qdisc_pacer(qdev, true); } rcu_read_unlock(); } diff --git a/homa_qdisc.h b/homa_qdisc.h index 2ce0e89a..cb8d1be4 100644 --- a/homa_qdisc.h +++ b/homa_qdisc.h @@ -126,7 +126,8 @@ struct homa_qdisc_dev { /** * @last_defer: The most recent homa_clock() time when a packet was - * added to homa_deferred or tcp_deferred. + * added to homa_deferred or tcp_deferred, or 0 if there are currently + * no deferred packets. 
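+	 * (As the accounting in homa_qdisc_defer_homa and
+	 * homa_qdisc_dequeue_homa suggests, whenever this field is
+	 * nonzero the time elapsed since its value is eventually charged
+	 * to the nic_backlog_cycles metric.)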
*/ u64 last_defer; @@ -136,12 +137,6 @@ struct homa_qdisc_dev { */ spinlock_t defer_lock; - /** - * @pacer_wake_time: homa_clock() time when the pacer woke up (if - * the pacer is running) or 0 if the pacer is sleeping. - */ - u64 pacer_wake_time; - /** * @pacer_kthread: Kernel thread that eventually transmits packets * on homa_deferred and tcp_deferred. @@ -215,7 +210,7 @@ int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack); void homa_qdisc_insert_rb(struct homa_qdisc_dev *qdev, struct homa_rpc *rpc); -void homa_qdisc_pacer(struct homa_qdisc_dev *qdev); +void homa_qdisc_pacer(struct homa_qdisc_dev *qdev, bool help); void homa_qdisc_pacer_check(struct homa *homa); int homa_qdisc_pacer_main(void *device); struct homa_qdisc_dev * diff --git a/test/mock.c b/test/mock.c index fbcd993a..ff900cf6 100644 --- a/test/mock.c +++ b/test/mock.c @@ -1239,6 +1239,7 @@ void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) unsigned long _raw_spin_lock_irqsave(raw_spinlock_t *lock) { + UNIT_HOOK("spin_lock_irqsave"); mock_record_locked(lock); return 1234; } diff --git a/test/unit_homa_pacer.c b/test/unit_homa_pacer.c index 0baeab8f..0a0ab54c 100644 --- a/test/unit_homa_pacer.c +++ b/test/unit_homa_pacer.c @@ -21,14 +21,18 @@ static void unmanage_hook(char *id) { homa_pacer_unmanage_rpc(hook_rpc); } -static u64 hook_exit_cycles; -static struct homa_pacer *hook_pacer; +static u64 hook_exit_count; static void exit_hook(char *id) { - mock_clock += mock_clock_tick; - if (mock_clock >= hook_exit_cycles) - mock_exit_thread = true; + if (strcmp(id, "prepare_to_wait") != 0) + return; + if (hook_exit_count > 0) { + hook_exit_count--; + if (hook_exit_count == 0) + mock_exit_thread = true; + } } +static struct homa_pacer *hook_pacer; static void exit_idle_hook(char *id) { if (strcmp(id, "schedule") == 0) unit_log_printf("; ", "time %llu", mock_clock); @@ -67,9 +71,7 @@ FIXTURE_SETUP(homa_pacer) self->hnet = mock_hnet(0, &self->homa); self->homa.pacer->cycles_per_mbyte = 1000000; self->homa.pacer->throttle_min_bytes = 0; -#ifndef __STRIP__ /* See strip.py */ self->homa.pacer->fifo_fraction = 0; -#endif /* See strip.py */ mock_sock_init(&self->hsk, self->hnet, self->client_port); } FIXTURE_TEARDOWN(homa_pacer) @@ -105,7 +107,6 @@ TEST_F(homa_pacer, homa_pacer_new__cant_create_pacer_thread) EXPECT_TRUE(IS_ERR(pacer)); EXPECT_EQ(EACCES, -PTR_ERR(pacer)); } -#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_pacer, homa_pacer_new__cant_register_sysctls) { struct homa_pacer *pacer; @@ -115,7 +116,6 @@ TEST_F(homa_pacer, homa_pacer_new__cant_register_sysctls) EXPECT_TRUE(IS_ERR(pacer)); EXPECT_EQ(ENOMEM, -PTR_ERR(pacer)); } -#endif /* See strip.py */ TEST_F(homa_pacer, homa_pacer_free__basics) { @@ -125,13 +125,8 @@ TEST_F(homa_pacer, homa_pacer_free__basics) EXPECT_FALSE(IS_ERR(pacer)); unit_log_clear(); homa_pacer_free(pacer); -#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("unregister_net_sysctl_table; kthread_stop", unit_log_get()); -#else /* See strip.py */ - EXPECT_STREQ("kthread_stop", - unit_log_get()); -#endif /* See strip.py */ } TEST_F(homa_pacer, homa_pacer_free__no_thread) { @@ -142,9 +137,7 @@ TEST_F(homa_pacer, homa_pacer_free__no_thread) pacer->kthread = NULL; unit_log_clear(); homa_pacer_free(pacer); -#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("unregister_net_sysctl_table", unit_log_get()); -#endif /* See strip.py */ } TEST_F(homa_pacer, homa_pacer_check_nic_q__success) @@ -198,29 +191,6 @@ TEST_F(homa_pacer, homa_pacer_check_nic_q__queue_full_but_force) 
crpc->msgout.packets, true)); EXPECT_EQ(9500, atomic64_read(&self->homa.pacer->link_idle_time)); } -TEST_F(homa_pacer, homa_pacer_check_nic_q__pacer_metrics) -{ - struct homa_rpc *crpc; - - crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, - self->server_ip, self->server_port, - self->client_id, 500, 1000); - - homa_get_skb_info(crpc->msgout.packets)->wire_bytes = 500; - homa_pacer_manage_rpc(crpc); - unit_log_clear(); - atomic64_set(&self->homa.pacer->link_idle_time, 9000); - self->homa.pacer->wake_time = 9800; - mock_clock = 10000; - self->homa.pacer->max_nic_queue_cycles = 1000; - EXPECT_EQ(1, homa_pacer_check_nic_q(self->homa.pacer, - crpc->msgout.packets, true)); - EXPECT_EQ(10500, atomic64_read(&self->homa.pacer->link_idle_time)); -#ifndef __STRIP__ /* See strip.py */ - EXPECT_EQ(500, homa_metrics_per_cpu()->pacer_bytes); - EXPECT_EQ(200, homa_metrics_per_cpu()->pacer_lost_cycles); -#endif /* See strip.py */ -} TEST_F(homa_pacer, homa_pacer_check_nic_q__queue_empty) { struct homa_rpc *crpc; @@ -242,11 +212,9 @@ TEST_F(homa_pacer, homa_pacer_check_nic_q__queue_empty) TEST_F(homa_pacer, homa_pacer_main__exit) { unit_hook_register(exit_hook); - hook_pacer = self->homa.pacer; - hook_exit_cycles = 5000; - mock_clock_tick = 200; + hook_exit_count = 10; homa_pacer_main(self->homa.pacer); - EXPECT_TRUE(mock_clock >= 5000); + EXPECT_EQ(0, hook_exit_count); } TEST_F(homa_pacer, homa_pacer_main__xmit_data) { @@ -267,7 +235,6 @@ TEST_F(homa_pacer, homa_pacer_main__xmit_data) hook_pacer = self->homa.pacer; unit_log_clear(); homa_pacer_main(self->homa.pacer); -#ifndef __STRIP__ /* See strip.py */ EXPECT_STREQ("xmit DATA 1400@0; " "xmit DATA 1400@1400; " "xmit DATA 1400@2800; time 1600; time 2200; " @@ -283,18 +250,16 @@ TEST_F(homa_pacer, homa_pacer_main__xmit_data) "xmit DATA 200@9800; " "removing id 1236 from throttled list", unit_log_get()); -#endif /* See strip.py */ } TEST_F(homa_pacer, homa_pacer_main__rpc_arrives_while_sleeping) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, UNIT_OUTGOING, self->client_ip, self->server_ip, self->server_port, self->client_id, - 5000, 1000); + 3000, 1000); unit_hook_register(exit_hook); - hook_pacer = self->homa.pacer; - hook_exit_cycles = 5000; + hook_exit_count = 5; mock_clock_tick = 200; unit_hook_register(manage_hook); hook_rpc = crpc; @@ -302,7 +267,10 @@ TEST_F(homa_pacer, homa_pacer_main__rpc_arrives_while_sleeping) unit_log_clear(); homa_pacer_main(self->homa.pacer); - EXPECT_STREQ("xmit DATA 1400@0; xmit DATA 1400@1400; xmit DATA 1400@2800", + EXPECT_STREQ("xmit DATA 1400@0; " + "xmit DATA 1400@1400; " + "xmit DATA 200@2800; " + "removing id 1234 from throttled list", unit_log_get()); } TEST_F(homa_pacer, homa_pacer_main__exit_on_signal) @@ -551,7 +519,6 @@ TEST_F(homa_pacer, homa_pacer_manage_rpc__basics) "request id 8, next_offset 0; " "request id 6, next_offset 0", unit_log_get()); } -#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_pacer, homa_pacer_manage_rpc__inc_metrics) { struct homa_rpc *crpc1, *crpc2, *crpc3; @@ -578,7 +545,6 @@ TEST_F(homa_pacer, homa_pacer_manage_rpc__inc_metrics) EXPECT_EQ(3, homa_metrics_per_cpu()->throttle_list_adds); EXPECT_EQ(3, homa_metrics_per_cpu()->throttle_list_checks); } -#endif /* See strip.py */ TEST_F(homa_pacer, homa_pacer_unmanage_rpc__basics) { @@ -603,7 +569,6 @@ TEST_F(homa_pacer, homa_pacer_unmanage_rpc__basics) EXPECT_TRUE(list_empty(&self->homa.pacer->throttled_rpcs)); EXPECT_STREQ("", unit_log_get()); } -#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_pacer, 
homa_pacer_unmanage_rpc__metrics) { struct homa_rpc *crpc1, *crpc2; @@ -618,24 +583,23 @@ TEST_F(homa_pacer, homa_pacer_unmanage_rpc__metrics) mock_clock = 1000; homa_pacer_manage_rpc(crpc1); EXPECT_EQ(1000, self->homa.pacer->throttle_add); - EXPECT_EQ(0, homa_metrics_per_cpu()->throttled_cycles); + EXPECT_EQ(0, homa_metrics_per_cpu()->nic_backlog_cycles); mock_clock = 3000; homa_pacer_manage_rpc(crpc2); EXPECT_EQ(3000, self->homa.pacer->throttle_add); - EXPECT_EQ(2000, homa_metrics_per_cpu()->throttled_cycles); + EXPECT_EQ(2000, homa_metrics_per_cpu()->nic_backlog_cycles); mock_clock = 7000; homa_pacer_unmanage_rpc(crpc1); EXPECT_EQ(3000, self->homa.pacer->throttle_add); - EXPECT_EQ(2000, homa_metrics_per_cpu()->throttled_cycles); + EXPECT_EQ(2000, homa_metrics_per_cpu()->nic_backlog_cycles); mock_clock = 8000; homa_pacer_unmanage_rpc(crpc2); EXPECT_EQ(3000, self->homa.pacer->throttle_add); - EXPECT_EQ(7000, homa_metrics_per_cpu()->throttled_cycles); + EXPECT_EQ(7000, homa_metrics_per_cpu()->nic_backlog_cycles); } -#endif /* See strip.py */ TEST_F(homa_pacer, homa_pacer_update_sysctl_deps) { diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c index b0017b95..a6a07309 100644 --- a/test/unit_homa_qdisc.c +++ b/test/unit_homa_qdisc.c @@ -44,7 +44,7 @@ static struct sk_buff *new_test_skb(struct homa_rpc *rpc, info->rpc = rpc; info->data_bytes = length; info->offset = offset; - qdisc_skb_cb(skb)->pkt_len = length; + qdisc_skb_cb(skb)->pkt_len = length + 100; return skb; } @@ -67,18 +67,28 @@ void log_deferred(struct homa_qdisc_dev *qdev) } } -static struct homa_qdisc_dev *hook_qdev; -static int hook_sleep_count; -static void pacer_sleep_hook(char *id) { +static struct homa_qdisc_dev *exit_hook_qdev; +static int exit_hook_count; +static void exit_hook(char *id) { if (strcmp(id, "prepare_to_wait") != 0) return; - if (hook_sleep_count > 0) { - hook_sleep_count--; - if (hook_sleep_count == 0) + if (exit_hook_count > 0) { + exit_hook_count--; + if (exit_hook_count == 0) mock_exit_thread = true; } } +static struct homa_qdisc_dev *defer_hook_qdev; +static struct sk_buff *defer_hook_skb; +static void defer_hook(char *id) +{ + if (strcmp(id, "prepare_to_wait") == 0 && defer_hook_qdev) { + homa_qdisc_defer_homa(defer_hook_qdev, defer_hook_skb); + defer_hook_qdev = NULL; + } +} + static int create_hook_count; static struct net_device *hook_dev; static void qdev_create_hook(char *id) @@ -92,6 +102,15 @@ static void qdev_create_hook(char *id) homa_qdisc_qdev_get(hook_dev); } +static u64 xmit_clock; +static void xmit_hook(char *id) +{ + if (strcmp(id, "pacer_xmit") != 0) + return; + if (xmit_clock == 0) + xmit_clock = mock_clock; +} + FIXTURE(homa_qdisc) { struct homa homa; struct homa_net *hnet; @@ -746,7 +765,7 @@ TEST_F(homa_qdisc, homa_qdisc_defer_homa__dont_update_tx_left) EXPECT_EQ(2000, srpc->qrpc.tx_left); homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_defer_homa__throttled_cycles_metric) +TEST_F(homa_qdisc, homa_qdisc_defer_homa__nic_backlog_cycles_metric) { struct homa_rpc *srpc1, *srpc2; struct homa_qdisc_dev *qdev; @@ -763,13 +782,13 @@ TEST_F(homa_qdisc, homa_qdisc_defer_homa__throttled_cycles_metric) homa_qdisc_defer_homa(qdev, new_test_skb(srpc1, &self->addr, 1000, 1500)); EXPECT_EQ(5000, qdev->last_defer); - EXPECT_EQ(0, homa_metrics_per_cpu()->throttled_cycles); + EXPECT_EQ(0, homa_metrics_per_cpu()->nic_backlog_cycles); mock_clock = 12000; homa_qdisc_defer_homa(qdev, new_test_skb(srpc2, &self->addr, 2000, 1500)); EXPECT_EQ(12000, qdev->last_defer); - EXPECT_EQ(7000, 
homa_metrics_per_cpu()->throttled_cycles); + EXPECT_EQ(7000, homa_metrics_per_cpu()->nic_backlog_cycles); homa_qdisc_qdev_put(qdev); } @@ -992,7 +1011,7 @@ TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__update_tx_left) homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__throttled_cycles_metric) +TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__nic_backlog_cycles_metric) { struct homa_qdisc_dev *qdev; struct homa_rpc *srpc; @@ -1006,17 +1025,20 @@ TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__throttled_cycles_metric) mock_clock = 5000; homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 2000, 500)); homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 3000, 500)); - EXPECT_EQ(0, homa_metrics_per_cpu()->throttled_cycles); + EXPECT_EQ(0, homa_metrics_per_cpu()->nic_backlog_cycles); EXPECT_EQ(5000, qdev->last_defer); mock_clock = 12000; kfree_skb(homa_qdisc_dequeue_homa(qdev)); - EXPECT_EQ(0, homa_metrics_per_cpu()->throttled_cycles); + EXPECT_EQ(0, homa_metrics_per_cpu()->nic_backlog_cycles); + EXPECT_EQ(5000, qdev->last_defer); EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); + mock_clock = 14000; kfree_skb(homa_qdisc_dequeue_homa(qdev)); - EXPECT_EQ(7000, homa_metrics_per_cpu()->throttled_cycles); + EXPECT_EQ(9000, homa_metrics_per_cpu()->nic_backlog_cycles); EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(0, qdev->last_defer); homa_qdisc_qdev_put(qdev); } @@ -1060,34 +1082,6 @@ TEST_F(homa_qdisc, homa_qdisc_update_link_idle__nic_idle) EXPECT_EQ(1200 + HOMA_ETH_FRAME_OVERHEAD, atomic64_read(&qdev.link_idle_time)); } -TEST_F(homa_qdisc, homa_qdisc_update_link_idle__pacer_lost_cycles_metric) -{ - struct homa_qdisc_dev qdev; - - /* qdev->pacer_wake_time < idle */ - mock_clock = 10000; - memset(&qdev, 0, sizeof(qdev)); - qdev.cycles_per_mibyte = 1 << 20; /* 1 cycle per byte. */ - atomic64_set(&qdev.link_idle_time, 4000); - qdev.pacer_wake_time = 2000; - - homa_qdisc_update_link_idle(&qdev, 200, 0); - EXPECT_EQ(6000, homa_metrics_per_cpu()->pacer_lost_cycles); - - /* qdev->pacer_wake_time > idle */ - atomic64_set(&qdev.link_idle_time, 4000); - qdev.pacer_wake_time = 8000; - - homa_qdisc_update_link_idle(&qdev, 200, 0); - EXPECT_EQ(8000, homa_metrics_per_cpu()->pacer_lost_cycles); - - /* pacer_inactive */ - atomic64_set(&qdev.link_idle_time, 4000); - qdev.pacer_wake_time = 0; - - homa_qdisc_update_link_idle(&qdev, 200, 0); - EXPECT_EQ(8000, homa_metrics_per_cpu()->pacer_lost_cycles); -} TEST_F(homa_qdisc, homa_qdisc_update_link_idle__queue_too_long) { struct homa_qdisc_dev qdev; @@ -1133,60 +1127,71 @@ TEST_F(homa_qdisc, homa_qdisc_update_link_idle__cmpxchg_conflicts) atomic64_read(&qdev.link_idle_time)); EXPECT_EQ(4, homa_metrics_per_cpu()->idle_time_conflicts); } -TEST_F(homa_qdisc, homa_qdisc_update_link_idle__pacer_bytes_metric) + +TEST_F(homa_qdisc, homa_qdisc_pacer_main) { struct homa_qdisc_dev *qdev; struct homa_rpc *srpc; + /* This test checks for two things: + * (a) proper handling of deferred packets that arrive while sleeping + * (b) proper thread exit + */ qdev = homa_qdisc_qdev_get(self->dev); - ASSERT_FALSE(IS_ERR(qdev)); + qdev->pacer_qix = 3; + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, - &self->server_ip, self->client_port, - self->server_id, 10000, 10000); + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); ASSERT_NE(NULL, srpc); - /* No deferred packets. 
*/ - homa_qdisc_update_link_idle(qdev, 200, -1); - EXPECT_EQ(0, homa_metrics_per_cpu()->pacer_bytes); + exit_hook_qdev = qdev; + exit_hook_count = 10; + unit_hook_register(exit_hook); + defer_hook_qdev = qdev; + defer_hook_skb = new_test_skb(srpc, &self->addr, 1000, 500); + unit_hook_register(defer_hook); - /* Deferred packets. */ - homa_qdisc_defer_homa(qdev, - new_test_skb(srpc, &self->addr, 0, 1500)); - homa_qdisc_update_link_idle(qdev, 500, -1); - EXPECT_EQ(500, homa_metrics_per_cpu()->pacer_bytes); + homa_qdisc_pacer_main(qdev); + EXPECT_EQ(1, self->qdiscs[3]->q.qlen); + EXPECT_EQ(1, homa_metrics_per_cpu()->pacer_packets); + EXPECT_EQ(0, exit_hook_count); + homa_qdisc_destroy(self->qdiscs[3]); homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_pacer_main__basics) +TEST_F(homa_qdisc, homa_qdisc_pacer__spin_until_link_idle) { struct homa_qdisc_dev *qdev; + struct homa_rpc *srpc; qdev = homa_qdisc_qdev_get(self->dev); - EXPECT_FALSE(IS_ERR(qdev)); - - unit_hook_register(pacer_sleep_hook); - hook_qdev = qdev; - hook_sleep_count = 3; - mock_clock_tick = 200; - - homa_qdisc_pacer_main(qdev); - EXPECT_EQ(400, homa_metrics_per_cpu()->pacer_cycles); - - homa_qdisc_qdev_put(qdev); -} + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc); -TEST_F(homa_qdisc, homa_qdisc_pacer__queue_empty) -{ - struct homa_qdisc_dev *qdev; + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + qdev->pacer_qix = 3; + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); - qdev = homa_qdisc_qdev_get(self->dev); + mock_clock = 0; + mock_clock_tick = 1000; + atomic64_set(&qdev->link_idle_time, 10000); + self->homa.pacer->max_nic_queue_cycles = 3500; unit_log_clear(); + unit_hook_register(xmit_hook); + xmit_clock = 0; - homa_qdisc_pacer(qdev); - EXPECT_STREQ("", unit_log_get()); - EXPECT_EQ(0, atomic64_read(&qdev->link_idle_time)); + homa_qdisc_pacer(qdev, false); + EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(1, self->qdiscs[3]->q.qlen); + EXPECT_EQ(7000, xmit_clock); + homa_qdisc_destroy(self->qdiscs[3]); homa_qdisc_qdev_put(qdev); } TEST_F(homa_qdisc, homa_qdisc_pacer__pacer_lock_unavailable) @@ -1210,7 +1215,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__pacer_lock_unavailable) unit_log_clear(); mock_trylock_errors = 1; - homa_qdisc_pacer(qdev); + homa_qdisc_pacer(qdev, false); EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); EXPECT_EQ(0, self->qdiscs[3]->q.qlen); EXPECT_EQ(link_idle, atomic64_read(&qdev->link_idle_time)); @@ -1218,38 +1223,24 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__pacer_lock_unavailable) homa_qdisc_destroy(self->qdiscs[3]); homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_pacer__enqueue_packet) +TEST_F(homa_qdisc, homa_qdisc_pacer__queue_empty) { struct homa_qdisc_dev *qdev; - u64 link_idle; - struct homa_rpc *srpc; qdev = homa_qdisc_qdev_get(self->dev); - srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, - &self->server_ip, self->client_port, - self->server_id, 10000, 10000); - ASSERT_NE(NULL, srpc); - - link_idle = atomic64_read(&qdev->link_idle_time); - homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); - EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); - EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); - qdev->pacer_qix = 3; - EXPECT_EQ(0, self->qdiscs[3]->q.qlen); unit_log_clear(); - homa_qdisc_pacer(qdev); - 
EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); - EXPECT_EQ(1, self->qdiscs[3]->q.qlen); - EXPECT_LT(link_idle, atomic64_read(&qdev->link_idle_time)); + homa_qdisc_pacer(qdev, false); + EXPECT_STREQ("", unit_log_get()); + EXPECT_EQ(0, atomic64_read(&qdev->link_idle_time)); - homa_qdisc_destroy(self->qdiscs[3]); homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_pacer__spin_until_link_idle) +TEST_F(homa_qdisc, homa_qdisc_pacer__enqueue_packet) { struct homa_qdisc_dev *qdev; struct homa_rpc *srpc; + u64 link_idle; qdev = homa_qdisc_qdev_get(self->dev); srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, @@ -1257,27 +1248,23 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__spin_until_link_idle) self->server_id, 10000, 10000); ASSERT_NE(NULL, srpc); + link_idle = atomic64_read(&qdev->link_idle_time); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); + EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); qdev->pacer_qix = 3; EXPECT_EQ(0, self->qdiscs[3]->q.qlen); - homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); - - mock_clock = 0; mock_clock_tick = 1000; - atomic64_set(&qdev->link_idle_time, 10000); - self->homa.pacer->max_nic_queue_cycles = 3500; unit_log_clear(); - homa_qdisc_pacer(qdev); + homa_qdisc_pacer(qdev, false); EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); EXPECT_EQ(1, self->qdiscs[3]->q.qlen); - - /* Packet will get transmitted when mock_clock ticks to 7000, but - * clock ticks once more in homa_qdisc_update_link_idle, then once - * in homa_qdisc_dequeue_homa (to update metrics when the queue - * empties) and once more in homa_qdisc_pacer before it returns. - */ - EXPECT_EQ(10000, mock_clock); + EXPECT_LT(link_idle, atomic64_read(&qdev->link_idle_time)); + EXPECT_EQ(1, homa_metrics_per_cpu()->pacer_packets); + EXPECT_EQ(1100, homa_metrics_per_cpu()->pacer_bytes); + EXPECT_EQ(0, homa_metrics_per_cpu()->pacer_help_bytes); + EXPECT_NE(0, homa_metrics_per_cpu()->pacer_xmit_cycles); homa_qdisc_destroy(self->qdiscs[3]); homa_qdisc_qdev_put(qdev); @@ -1314,7 +1301,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__return_after_one_packet) self->homa.pacer->max_nic_queue_cycles = 100; unit_log_clear(); - homa_qdisc_pacer(qdev); + homa_qdisc_pacer(qdev, false); unit_log_clear(); log_deferred(qdev); EXPECT_STREQ("[id 1237, offsets 4000]", unit_log_get()); @@ -1324,6 +1311,33 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__return_after_one_packet) homa_qdisc_destroy(self->qdiscs[3]); homa_qdisc_qdev_put(qdev); } +TEST_F(homa_qdisc, homa_qdisc_pacer__pacer_help_bytes_metric) +{ + struct homa_qdisc_dev *qdev; + struct homa_rpc *srpc; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc); + + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 800)); + EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + qdev->pacer_qix = 3; + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + unit_log_clear(); + + homa_qdisc_pacer(qdev, true); + EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(1, homa_metrics_per_cpu()->pacer_packets); + EXPECT_EQ(900, homa_metrics_per_cpu()->pacer_bytes); + EXPECT_EQ(900, homa_metrics_per_cpu()->pacer_help_bytes); + + homa_qdisc_destroy(self->qdiscs[3]); + homa_qdisc_qdev_put(qdev); +} TEST_F(homa_qdisc, homa_qdisc_redirect_skb__use_pacer_qix) { @@ -1473,7 +1487,6 @@ 
TEST_F(homa_qdisc, homa_qdisc_pacer_check__enqueue_packet) homa_qdisc_pacer_check(&self->homa); EXPECT_EQ(1, self->qdiscs[3]->q.qlen); EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); - EXPECT_EQ(1, homa_metrics_per_cpu()->pacer_needed_help); homa_qdisc_destroy(self->qdiscs[3]); homa_qdisc_qdev_put(qdev); @@ -1499,7 +1512,6 @@ TEST_F(homa_qdisc, homa_qdisc_pacer_check__no_deferred_rpcs) homa_qdisc_pacer_check(&self->homa); EXPECT_EQ(0, self->qdiscs[3]->q.qlen); EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); - EXPECT_EQ(0, homa_metrics_per_cpu()->pacer_needed_help); homa_qdisc_destroy(self->qdiscs[3]); homa_qdisc_qdev_put(qdev); @@ -1528,7 +1540,6 @@ TEST_F(homa_qdisc, homa_qdisc_pacer_check__lag_not_long_enough) homa_qdisc_pacer_check(&self->homa); EXPECT_EQ(0, self->qdiscs[3]->q.qlen); EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); - EXPECT_EQ(0, homa_metrics_per_cpu()->pacer_needed_help); homa_qdisc_destroy(self->qdiscs[3]); homa_qdisc_qdev_put(qdev); diff --git a/util/metrics.py b/util/metrics.py index ffc481e9..678df556 100755 --- a/util/metrics.py +++ b/util/metrics.py @@ -222,8 +222,7 @@ def scale_number(number): for core in range(first_core, end_core): line += " Core%-2d" % (core) print(line) - for where in ["napi", "softirq", "send", "recv", "reply", - "timer", "pacer"]: + for where in ["napi", "softirq", "send", "recv", "reply", "timer"]: if where == "softirq": symbol = "linux_softirq_cycles" else: @@ -403,36 +402,49 @@ def scale_number(number): print("GRO bypass for data packets: %5.1f%%" % (data_bypass_percent)) print("GRO bypass for grant packets: %5.1f%%" % (grant_bypass_percent)) + if deltas["pacer_packets"] != 0: + print("\nPacer:") + print("--------") + print("Packets sent: %5.3f M/sec (%.1f %% of all packets)" % + (1e-6*deltas["pacer_packets"]/elapsed_secs, + 100*deltas["pacer_packets"]/packets_sent)) + print("Throughput (including headers): %5.2f Gbps" % + (8e-9*deltas["pacer_bytes"]/elapsed_secs)) + print("Helper throughput: %5.2f Gbps (%.1f%% of all pacer bytes)" % + (8e-9*deltas["pacer_help_bytes"]/elapsed_secs, + 100*deltas["pacer_help_bytes"]/deltas["pacer_bytes"])) + backlog_secs = float(deltas["nic_backlog_cycles"])/(cpu_khz * 1000.0) + print("Active throughput: %5.2f Gbps (NIC backlogged %.1f%% of time)" % ( + deltas["pacer_bytes"]*8e-09/backlog_secs, + 100*backlog_secs/elapsed_secs)) + xmit_secs = float(deltas["pacer_xmit_cycles"])/(cpu_khz * 1000.0) + print("Pacer thread duty cycle: %5.1f %%" % + (100*deltas["pacer_cycles"]/time_delta)) + print("Time xmitting packets: %5.1f %% (%.2f usecs/packet)" % + (100*xmit_secs/elapsed_secs, 1e6*xmit_secs/deltas["pacer_packets"])) + print("\nMiscellaneous:") print("--------------") if packets_received > 0: - print("Bytes/packet rcvd: %6.0f" % ( + print("Bytes/packet rcvd: %6.0f" % ( total_received_bytes/packets_received)) - print("Packets received: %5.3f M/sec" % ( + print("Packets received: %5.3f M/sec" % ( 1e-6*packets_received/elapsed_secs)) - print("Packets sent: %5.3f M/sec" % ( + print("Packets sent: %5.3f M/sec" % ( 1e-6*packets_sent/elapsed_secs)) - print("Core efficiency: %5.3f M packets/sec/core " + print("Core efficiency: %5.3f M packets/sec/core " "(sent & received combined)" % ( 1e-6*(packets_sent + packets_received)/elapsed_secs /total_cores_used)) - print(" %5.2f Gbps/core (goodput)" % ( + print(" %5.2f Gbps/core (goodput)" % ( 8e-9*(total_received_bytes + float(deltas["sent_msg_bytes"])) /(total_cores_used * elapsed_secs))) - if deltas["pacer_cycles"] != 0: - pacer_secs = float(deltas["pacer_cycles"])/(cpu_khz * 
1000.0) - print("Pacer throughput: %6.2f Gbps (pacer output when pacer active)" % ( - deltas["pacer_bytes"]*8e-09/pacer_secs)) - if deltas["throttled_cycles"] != 0: - throttled_secs = float(deltas["throttled_cycles"])/(cpu_khz * 1000.0) - print("Throttled throughput: %5.2f Gbps (pacer output when NIC backlogged)" % ( - deltas["pacer_bytes"]*8e-09/throttled_secs)) if deltas["skb_allocs"] != 0: - print("Skb alloc time: %4.2f usec/skb" % ( + print("Skb alloc time: %4.2f usec/skb" % ( float(deltas["skb_alloc_cycles"]) / (cpu_khz / 1000.0) / deltas["skb_allocs"])) if deltas["skb_page_allocs"] != 0: - print("Skb page alloc time: %5.2f usec/page" % ( + print("Skb page alloc time: %5.2f usec/page" % ( float(deltas["skb_page_alloc_cycles"]) / (cpu_khz / 1000.0) / deltas["skb_page_allocs"])) @@ -452,8 +464,8 @@ def scale_number(number): rate_info = ("(%s/s) " % (scale_number(rate))).ljust(13) print("%-30s %15d %s%s" % (symbol, deltas[symbol], rate_info, docs[symbol])) - for symbol in ["pacer_lost_cycles", "timer_reap_cycles", - "data_pkt_reap_cycles", "grant_lock_cycles"]: + for symbol in ["timer_reap_cycles", "data_pkt_reap_cycles", + "grant_lock_cycles"]: delta = deltas[symbol] if delta == 0 or time_delta == 0: continue From 3679931946560d659b1e788f7a527aa4a58b39d6 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 7 Oct 2025 15:29:10 -0700 Subject: [PATCH 516/625] Change calculation of pacer_cycles metric The previous approach was including time spent in calls to schedule(). This reverts to the approach used by homa_pacer.c until recently (both homa_pacer and homa_qdisc use the same approach). --- homa_pacer.c | 6 +++--- homa_qdisc.c | 7 +++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/homa_pacer.c b/homa_pacer.c index ea1239fe..a92dd140 100644 --- a/homa_pacer.c +++ b/homa_pacer.c @@ -159,15 +159,15 @@ int homa_pacer_check_nic_q(struct homa_pacer *pacer, struct sk_buff *skb, int homa_pacer_main(void *arg) { struct homa_pacer *pacer = arg; - u64 wake_time; + u64 start; int status; while (1) { if (kthread_should_stop()) break; - wake_time = homa_clock(); + start = homa_clock(); homa_pacer_xmit(pacer); - INC_METRIC(pacer_cycles, homa_clock() - wake_time); + INC_METRIC(pacer_cycles, homa_clock() - start); if (!list_empty(&pacer->throttled_rpcs)) { /* NIC queue is full; before calling pacer again, * give other threads a chance to run (otherwise diff --git a/homa_qdisc.c b/homa_qdisc.c index 542870bd..7485a0a5 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -623,14 +623,15 @@ int homa_qdisc_update_link_idle(struct homa_qdisc_dev *qdev, int bytes, int homa_qdisc_pacer_main(void *device) { struct homa_qdisc_dev *qdev = device; - u64 wake_time; int status; + u64 start; - wake_time = homa_clock(); while (1) { if (kthread_should_stop()) break; + start = homa_clock(); homa_qdisc_pacer(qdev, false); + INC_METRIC(pacer_cycles, homa_clock() - start); if (homa_qdisc_any_deferred(qdev)) { /* There are more packets to transmit (the NIC queue @@ -644,10 +645,8 @@ int homa_qdisc_pacer_main(void *device) } tt_record("homa_qdisc pacer sleeping"); - INC_METRIC(pacer_cycles, homa_clock() - wake_time); status = wait_event_interruptible(qdev->pacer_sleep, kthread_should_stop() || homa_qdisc_any_deferred(qdev)); - wake_time = homa_clock(); tt_record1("homa_qdisc pacer woke up with status %d", status); if (status != 0 && status != -ERESTARTSYS) break; From abc4e442e500f36bd8fdd24d4f03bd0fcdd53170 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 10 Oct 2025 16:58:30 -0700 Subject: [PATCH 
517/625] Document how buffers are returned in server.cc and homa_test.cc --- util/homa_test.cc | 6 +++++- util/server.cc | 4 ++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/util/homa_test.cc b/util/homa_test.cc index 13e8ab78..5546089f 100644 --- a/util/homa_test.cc +++ b/util/homa_test.cc @@ -48,7 +48,11 @@ char *buf_region; /* Either AF_INET or AF_INET6: indicates whether to use IPv6 instead of IPv4. */ int inet_family = AF_INET; -/* Control blocks for receiving messages. */ +/* Control blocks for receiving messages. Reusing the same + * homa_recvmsg_args causes receive buffers to be returned to Homa + * automatically. Each call to recvmsg returns the buffers from the + * previous call. + */ struct homa_recvmsg_args recv_args; struct msghdr recv_hdr; diff --git a/util/server.cc b/util/server.cc index 0754ebe1..a87d753d 100644 --- a/util/server.cc +++ b/util/server.cc @@ -116,6 +116,10 @@ void homa_server(int port) int seed; int result; + /* Note: by reusing recv_args for successive calls to + * recvmsg we automatically return to Homa the buffers + * left in recv_args by the previous call to recvmsg. + */ recv_args.id = 0; hdr.msg_controllen = sizeof(recv_args); length = recvmsg(fd, &hdr, 0); From 235915fdd5ca1b426e401333b5438972c193c309 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 10 Oct 2025 17:04:18 -0700 Subject: [PATCH 518/625] Fix typo in comment --- test/mock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/mock.c b/test/mock.c index ff900cf6..b3a3d220 100644 --- a/test/mock.c +++ b/test/mock.c @@ -196,7 +196,7 @@ unsigned int tsc_khz = 1000000; */ bool mock_exit_thread; -/* Indicates whether we should be simulation IPv6 or IPv4 in the +/* Indicates whether we should be simulating IPv6 or IPv4 in the * current test. Can be overridden by a test. */ bool mock_ipv6 = true; From 35df7f27b36a0c70ba4fab5f0342b784f639e2ec Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 10 Oct 2025 17:08:44 -0700 Subject: [PATCH 519/625] Implement HOMAIOCINFO ioctl * Add error_msg field to homa_sock along with code to set it in Homa system calls. * Add homa_pool_avail_bytes function. * Add log_homa_info method in cp_node.cc for testing. * Fix various bugs in the ioctl mechanism (it was using the struct proto entry point instead of the struct proto_ops entry, which broke it in a couple of ways). --- README.md | 2 + homa.h | 154 +++++++++++++++++++- homa_impl.h | 5 +- homa_outgoing.c | 17 ++- homa_peer.c | 5 +- homa_plumbing.c | 265 +++++++++++++++++++++++++--------- homa_pool.c | 24 ++++ homa_pool.h | 1 + homa_rpc.c | 61 +++++++- homa_rpc.h | 10 +- homa_sock.c | 14 +- homa_sock.h | 8 ++ man/homa.7 | 189 ++++++++++++++++++++++++ test/unit_homa_outgoing.c | 7 + test/unit_homa_plumbing.c | 292 ++++++++++++++++++++++++++++++++++++-- test/unit_homa_pool.c | 32 +++++ test/unit_homa_rpc.c | 168 +++++++++++++++++++++- test/unit_homa_sock.c | 5 + util/cp_node.cc | 90 +++++++++++- 19 files changed, 1243 insertions(+), 106 deletions(-) diff --git a/README.md b/README.md index e7ccff55..5c03c4cc 100644 --- a/README.md +++ b/README.md @@ -123,6 +123,8 @@ This repo contains an implementation of the Homa transport protocol as a Linux k sysctl mechanism. For details, see the man page `homa.7`. ## Significant changes +- October 2025: added the HOMAIOCINFO ioctl for retrieving status + information about a Homa socket. See man/homa.7 for details. 
 - May 2025: `homa_api.c` has been removed, so the functions
   `homa_abort`, `homa_reply`, `homa_replyv`, `homa_send`, and
   `homa_sendv` no longer exist.
diff --git a/homa.h b/homa.h
index fffdfb69..ceeeaafc 100644
--- a/homa.h
+++ b/homa.h
@@ -172,13 +172,159 @@ struct homa_rcvbuf_args {
  */
 #define HOMA_FLAG_DONT_THROTTLE 2
 
-/* I/O control calls on Homa sockets. These are mapped into the
- * SIOCPROTOPRIVATE range of 0x89e0 through 0x89ef.
+/**
+ * struct homa_rpc_info - Used by HOMAIOCINFO to return information about
+ * a specific RPC.
  */
+struct homa_rpc_info {
+	/**
+	 * @id: Identifier for the RPC, unique among all RPCs sent by the
+	 * client node. If the low-order bit is 1, this node is the server
+	 * for the RPC; 0 means we are the client.
+	 */
+	__u64 id;
+
+	/** @peer: Address of the peer socket for this RPC. */
+	union {
+		struct sockaddr_storage storage;
+		struct sockaddr_in in4;
+		struct sockaddr_in6 in6;
+	} peer;
+
+	/**
+	 * @completion_cookie: For client-side RPCs this gives the completion
+	 * cookie specified when the RPC was initiated. For server-side RPCs
+	 * this is zero.
+	 */
+	__u64 completion_cookie;
+
+	/**
+	 * @tx_length: Length of the outgoing message in bytes, or -1 if
+	 * sendmsg hasn't yet been called.
+	 */
+	__s32 tx_length;
+
+	/**
+	 * @tx_sent: Number of bytes of the outgoing message that have been
+	 * transmitted at least once.
+	 */
+	__u32 tx_sent;
+
+	/**
+	 * @tx_granted: Number of bytes of the outgoing message that the
+	 * receiver has authorized us to transmit (includes unscheduled
+	 * bytes).
+	 */
+	__u32 tx_granted;
+
+	/**
+	 * @tx_prio: Current priority level that the receiver has specified
+	 * for transmitting packets.
+	 */
+	__u32 tx_prio;
+
+	/**
+	 * @rx_length: Length of the incoming message, in bytes. -1 means
+	 * the length is not yet known (this is a client-side RPC and
+	 * no packets have been received).
+	 */
+	__s32 rx_length;
+
+	/**
+	 * @rx_remaining: Number of bytes in the incoming message that have
+	 * not yet been received.
+	 */
+	__u32 rx_remaining;
+
+	/**
+	 * @rx_gaps: The number of gaps in the incoming message. A gap is
+	 * a range of bytes that have not been received yet, but bytes after
+	 * the gap have been received.
+	 */
+	__u32 rx_gaps;
+
+	/**
+	 * @rx_gap_bytes: The total number of bytes in gaps in the incoming
+	 * message.
+	 */
+	__u32 rx_gap_bytes;
+
+	/**
+	 * @rx_granted: The number of bytes in the message that the sender
+	 * is authorized to transmit (includes unscheduled bytes).
+	 */
+	__u32 rx_granted;
+
+	/**
+	 * @flags: Various single-bit values associated with the RPC:
+	 * HOMA_RPC_BUF_STALL: The incoming message is currently stalled
+	 *                     because there is insufficient receiver buffer
+	 *                     space.
+	 * HOMA_RPC_PRIVATE:   The RPC has been created as "private"; set
+	 *                     only on the client side.
+	 * HOMA_RPC_RX_READY:  The incoming message is complete and has
+	 *                     been queued waiting for a thread to call
+	 *                     recvmsg.
+	 * HOMA_RPC_RX_COPY:   There are packets that have been received
+	 *                     whose data has not yet been copied from
+	 *                     packet buffers to user space.
+	 */
+	__u16 flags;
+#define HOMA_RPC_BUF_STALL 1
+#define HOMA_RPC_PRIVATE 2
+#define HOMA_RPC_RX_READY 4
+#define HOMA_RPC_RX_COPY 8
+};
+
+/**
+ * struct homa_info - In/out argument passed to HOMAIOCINFO. Fields labeled
+ * as "in" must be set by the application; other fields are returned to the
+ * application from the kernel.
+ */
+struct homa_info {
+	/**
+	 * @rpc_info: (in) Address of memory region in which to store
+	 * information about individual RPCs.
+	 */
+	struct homa_rpc_info *rpc_info;
+
+	/**
+	 * @rpc_info_length: (in) Number of bytes of storage available at
+	 * rpc_info.
+	 */
+	size_t rpc_info_length;
+
+	/**
+	 * @bpool_avail_bytes: Number of bytes in the buffer pool for incoming
+	 * messages that are currently available for new messages.
+	 */
+	__u64 bpool_avail_bytes;
+
+	/** @port: Port number handled by this socket. */
+	__u32 port;
+
+	/**
+	 * @num_rpcs: Total number of active RPCs (both server and client) for
+	 * this socket. The number stored at @rpc_info will be less than this
+	 * if @rpc_info_length is too small.
+	 */
+	__u32 num_rpcs;
+
+	/**
+	 * @error_msg: Provides additional information about the last error
+	 * returned by a Homa-related kernel call such as sendmsg, recvmsg,
+	 * or ioctl. Not updated for some obvious return values such as EINTR
+	 * or EWOULDBLOCK.
+	 */
+#define HOMA_ERROR_MSG_SIZE 100
+	char error_msg[HOMA_ERROR_MSG_SIZE];
+};
 
+/* I/O control calls on Homa sockets. */
+#define HOMAIOCINFO _IOWR('h', 1, struct homa_info)
 #ifndef __STRIP__ /* See strip.py */
-#define HOMAIOCABORT _IOWR(0x89, 0xe3, struct homa_abort_args)
+#define HOMAIOCABORT _IOWR('h', 2, struct homa_abort_args)
+#define HOMAIOCFREEZE _IO('h', 3)
 #endif /* See strip.py */
 
-#define HOMAIOCFREEZE _IO(0x89, 0xef)
-
 #endif /* _UAPI_LINUX_HOMA_H */
diff --git a/homa_impl.h b/homa_impl.h
index 93d99e51..4d1a7761 100644
--- a/homa_impl.h
+++ b/homa_impl.h
@@ -724,7 +724,8 @@ int homa_getsockopt(struct sock *sk, int level, int optname,
 int homa_hash(struct sock *sk);
 enum hrtimer_restart homa_hrtimer(struct hrtimer *timer);
 int homa_init(struct homa *homa);
-int homa_ioctl(struct sock *sk, int cmd, int *karg);
+int homa_ioc_info(struct socket *sock, unsigned long arg);
+int homa_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 int homa_load(void);
 int homa_message_out_fill(struct homa_rpc *rpc,
			   struct iov_iter *iter, int xmit);
@@ -774,7 +775,7 @@ void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk);
 int homa_dointvec(const struct ctl_table *table, int write,
		   void *buffer, size_t *lenp, loff_t *ppos);
 void homa_incoming_sysctl_changed(struct homa *homa);
-int homa_ioc_abort(struct sock *sk, int *karg);
+int homa_ioc_abort(struct socket *sock, unsigned long arg);
 int homa_message_in_init(struct homa_rpc *rpc, int length,
			  int unsched);
 void homa_prios_changed(struct homa *homa);
diff --git a/homa_outgoing.c b/homa_outgoing.c
index bc9296b5..b4b112b8 100644
--- a/homa_outgoing.c
+++ b/homa_outgoing.c
@@ -122,7 +122,8 @@ int homa_fill_data_interleaved(struct homa_rpc *rpc, struct sk_buff *skb,
  *                much data.
  * @max_seg_data: Maximum number of bytes of message data that can go in
  *                a single segment of the GSO packet.
- * Return: A pointer to the new packet, or a negative errno.
+ * Return: A pointer to the new packet, or a negative errno. Sets
+ *         rpc->hsk->error_msg on errors.
  */
 struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc,
				        struct iov_iter *iter, int offset,
@@ -145,8 +146,10 @@ struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc,
	skb = homa_skb_alloc_tx(sizeof(struct homa_data_hdr) + length +
			(segs - 1) * sizeof(struct homa_seg_hdr));
 #endif /* See strip.py */
-	if (!skb)
+	if (!skb) {
+		rpc->hsk->error_msg = "couldn't allocate sk_buff for outgoing message";
 		return ERR_PTR(-ENOMEM);
+	}
 
	/* Fill in the Homa header (which will be replicated in every
	 * network packet by GSO).
@@ -207,8 +210,10 @@ struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc, err = homa_skb_append_from_iter(rpc->hsk->homa, skb, iter, length); } - if (err) + if (err) { + rpc->hsk->error_msg = "couldn't copy message body into packet buffers"; goto error; + } if (segs > 1) { skb_shinfo(skb)->gso_segs = segs; @@ -244,7 +249,7 @@ struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc, * Return: 0 for success, or a negative errno for failure. It is possible * for the RPC to be freed while this function is active. If that * happens, copying will cease, -EINVAL will be returned, and - * rpc->state will be RPC_DEAD. + * rpc->state will be RPC_DEAD. Sets rpc->hsk->error_msg on errors. */ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) __must_hold(rpc->bucket->lock) @@ -269,8 +274,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) if (unlikely(iter->count > HOMA_MAX_MESSAGE_LENGTH || iter->count == 0)) { - tt_record2("homa_message_out_fill found bad length %d for id %d", - iter->count, rpc->id); + rpc->hsk->error_msg = "message length exceeded HOMA_MAX_MESSAGE_LENGTH"; err = -EINVAL; goto error; } @@ -362,6 +366,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) homa_rpc_lock(rpc); if (rpc->state == RPC_DEAD) { /* RPC was freed while we were copying. */ + rpc->hsk->error_msg = "rpc deleted while creating outgoing message"; err = -EINVAL; homa_skb_free_tx(rpc->hsk->homa, skb); goto error; diff --git a/homa_peer.c b/homa_peer.c index 625b552d..2cfaf357 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -351,7 +351,8 @@ void homa_peer_gc(struct homa_peertab *peertab) * as IPv4-mapped IPv6 addresses. * Return: The peer associated with @addr, or a negative errno if an * error occurred. On a successful return the reference count - * will be incremented for the returned peer. + * will be incremented for the returned peer. Sets hsk->error_msg + * on errors. 
  */
 struct homa_peer *homa_peer_alloc(struct homa_sock *hsk,
				   const struct in6_addr *addr)
 {
@@ -362,6 +363,7 @@ struct homa_peer *homa_peer_alloc(struct homa_sock *hsk,
	peer = kzalloc(sizeof(*peer), GFP_ATOMIC);
	if (!peer) {
		INC_METRIC(peer_kmalloc_errors, 1);
+		hsk->error_msg = "couldn't allocate memory for homa_peer";
		return (struct homa_peer *)ERR_PTR(-ENOMEM);
	}
	peer->ht_key.addr = *addr;
@@ -379,6 +381,7 @@ struct homa_peer *homa_peer_alloc(struct homa_sock *hsk,
 
	status = homa_peer_reset_dst(peer, hsk);
	if (status != 0) {
+		hsk->error_msg = "couldn't find route for peer";
		kfree(peer);
		return ERR_PTR(status);
	}
diff --git a/homa_plumbing.c b/homa_plumbing.c
index 560cbe01..563e3f59 100644
--- a/homa_plumbing.c
+++ b/homa_plumbing.c
@@ -50,7 +50,7 @@ static const struct proto_ops homa_proto_ops = {
	.accept		   = sock_no_accept,
	.getname	   = inet_getname,
	.poll		   = homa_poll,
-	.ioctl		   = inet_ioctl,
+	.ioctl		   = homa_ioctl,
	.listen		   = sock_no_listen,
	.shutdown	   = homa_shutdown,
	.setsockopt	   = sock_common_setsockopt,
@@ -71,7 +71,7 @@ static const struct proto_ops homav6_proto_ops = {
	.accept		   = sock_no_accept,
	.getname	   = inet6_getname,
	.poll		   = homa_poll,
-	.ioctl		   = inet6_ioctl,
+	.ioctl		   = homa_ioctl,
	.listen		   = sock_no_listen,
	.shutdown	   = homa_shutdown,
	.setsockopt	   = sock_common_setsockopt,
@@ -93,7 +93,6 @@ static struct proto homa_prot = {
	.owner		= THIS_MODULE,
	.close		= homa_close,
	.connect	= ip4_datagram_connect,
-	.ioctl		= homa_ioctl,
	.init		= homa_socket,
	.destroy	= homa_sock_destroy,
	.setsockopt	= homa_setsockopt,
@@ -111,7 +110,6 @@ static struct proto homav6_prot = {
	.owner		= THIS_MODULE,
	.close		= homa_close,
	.connect	= ip6_datagram_connect,
-	.ioctl		= homa_ioctl,
	.init		= homa_socket,
	.destroy	= homa_sock_destroy,
	.setsockopt	= homa_setsockopt,
@@ -749,9 +747,10 @@ void homa_net_exit(struct net *net)
  * there is no need to invoke this system call for sockets that are only
  * used as clients.
  * @sock:     Socket on which the system call was invoked.
- * @addr:     Contains the desired port number.
+ * @addr:     Contains the desired port number.
  * @addr_len: Number of bytes in uaddr.
- * Return: 0 on success, otherwise a negative errno.
+ * Return: 0 on success, otherwise a negative errno. Sets hsk->error_msg
+ *         on errors.
  */
 int homa_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 {
@@ -759,15 +758,21 @@ int homa_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
	struct homa_sock *hsk = homa_sk(sock->sk);
	int port = 0;
 
-	if (unlikely(addr->sa_family != sock->sk->sk_family))
+	if (unlikely(addr->sa_family != sock->sk->sk_family)) {
+		hsk->error_msg = "address family in bind address didn't match socket";
		return -EAFNOSUPPORT;
+	}
	if (addr_in->in6.sin6_family == AF_INET6) {
-		if (addr_len < sizeof(struct sockaddr_in6))
+		if (addr_len < sizeof(struct sockaddr_in6)) {
+			hsk->error_msg = "ipv6 address too short";
			return -EINVAL;
+		}
		port = ntohs(addr_in->in6.sin6_port);
	} else if (addr_in->in4.sin_family == AF_INET) {
-		if (addr_len < sizeof(struct sockaddr_in))
+		if (addr_len < sizeof(struct sockaddr_in)) {
+			hsk->error_msg = "ipv4 address too short";
			return -EINVAL;
+		}
		port = ntohs(addr_in->in4.sin_port);
	}
	return homa_sock_bind(hsk->hnet, hsk, port);
 }
@@ -809,31 +814,38 @@ int homa_shutdown(struct socket *sock, int how)
 /**
  * homa_ioc_abort() - The top-level function for the ioctl that implements
  * the homa_abort user-level API.
- * @sk:   Socket for this request.
- * @karg: Used to pass information from user space.
+ * @sock: Socket for this request.
+ * @arg: User-space address of a homa_abort_args struct. * - * Return: 0 on success, otherwise a negative errno. + * Return: 0 on success, otherwise a negative errno. Sets hsk->error_msg + * on errors. */ -int homa_ioc_abort(struct sock *sk, int *karg) +int homa_ioc_abort(struct socket *sock, unsigned long arg) { - struct homa_sock *hsk = homa_sk(sk); + struct homa_sock *hsk = homa_sk(sock->sk); struct homa_abort_args args; struct homa_rpc *rpc; int ret = 0; - if (unlikely(copy_from_user(&args, (void __user *)karg, sizeof(args)))) + if (unlikely(copy_from_user(&args, (void __user *)arg, sizeof(args)))) { + hsk->error_msg = "invalid address for homa_abort_args"; return -EFAULT; + } - if (args._pad1 || args._pad2[0] || args._pad2[1]) + if (args._pad1 || args._pad2[0] || args._pad2[1]) { + hsk->error_msg = "reserved fields in homa_abort_args must be zero"; return -EINVAL; + } if (args.id == 0) { homa_abort_sock_rpcs(hsk, -args.error); return 0; } rpc = homa_rpc_find_client(hsk, args.id); - if (!rpc) + if (!rpc) { + hsk->error_msg = "RPC identifier did not match any existing RPC"; return -EINVAL; + } if (args.error == 0) homa_rpc_end(rpc); else @@ -843,38 +855,108 @@ int homa_ioc_abort(struct sock *sk, int *karg) } #endif /* See strip.py */ +/** + * homa_ioc_info() - The top-level function that implements the + * HOMAIOCINFO ioctl for Homa sockets. + * @sock: Socket for this request + * @arg: The address in user space of the argument to ioctl, which + * is a homa_info struct. + * + * Return: 0 on success, otherwise a negative errno. Sets hsk->error_msg + * on errors. + */ +int homa_ioc_info(struct socket *sock, unsigned long arg) +{ + struct homa_sock *hsk = homa_sk(sock->sk); + struct homa_rpc_info rinfo; + struct homa_info hinfo; + struct homa_rpc *rpc; + int bytes_avl; + char *dst; + + if (unlikely(copy_from_user(&hinfo, (void __user *)arg, + sizeof(hinfo)))) { + hsk->error_msg = "invalid address for homa_info"; + return -EFAULT; + } + + if (!homa_protect_rpcs(hsk)) { + hsk->error_msg = "socket has been shut down"; + return -ESHUTDOWN; + } + hinfo.bpool_avail_bytes = homa_pool_avail_bytes(hsk->buffer_pool); + hinfo.port = hsk->port; + dst = (char *)hinfo.rpc_info; + bytes_avl = hinfo.rpc_info_length; + hinfo.num_rpcs = 0; + list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) { + homa_rpc_lock(rpc); + if (rpc->state == RPC_DEAD) { + homa_rpc_unlock(rpc); + continue; + } + homa_rpc_get_info(rpc, &rinfo); + homa_rpc_unlock(rpc); + if (dst && bytes_avl >= sizeof(rinfo)) { + if (copy_to_user((void __user *)dst, &rinfo, + sizeof(rinfo))) { + homa_unprotect_rpcs(hsk); + hsk->error_msg = "couldn't copy homa_rpc_info to user space: invalid or read-only address?"; + return -EFAULT; + } + dst += sizeof(rinfo); + bytes_avl -= sizeof(rinfo); + } + hinfo.num_rpcs++; + } + homa_unprotect_rpcs(hsk); + + if (hsk->error_msg) + snprintf(hinfo.error_msg, HOMA_ERROR_MSG_SIZE, "%s", + hsk->error_msg); + else + hinfo.error_msg[0] = 0; + + if (copy_to_user((void __user *)arg, &hinfo, sizeof(hinfo))) { + hsk->error_msg = "couldn't copy homa_info to user space: read-only address?"; + return -EFAULT; + } + return 0; +} + /** * homa_ioctl() - Implements the ioctl system call for Homa sockets. - * @sk: Socket on which the system call was invoked. + * @sock: Socket on which the system call was invoked. * @cmd: Identifier for a particular ioctl operation. 
- * @karg: Operation-specific argument; typically the address of a block
+ * @arg:  Operation-specific argument; typically the address of a block
  *        of data in user address space.
  *
- * Return: 0 on success, otherwise a negative errno.
+ * Return: 0 on success, otherwise a negative errno. Sets hsk->error_msg
+ *         on errors.
  */
-int homa_ioctl(struct sock *sk, int cmd, int *karg)
+int homa_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 {
 #ifndef __STRIP__ /* See strip.py */
-	u64 start = homa_clock();
-	int result;
-
	if (cmd == HOMAIOCABORT) {
-		result = homa_ioc_abort(sk, karg);
+		u64 start = homa_clock();
+		int result;
+
+		result = homa_ioc_abort(sock, arg);
		INC_METRIC(abort_calls, 1);
		INC_METRIC(abort_cycles, homa_clock() - start);
-	} else if (cmd == HOMAIOCFREEZE) {
+		return result;
+	}
+	if (cmd == HOMAIOCFREEZE) {
		tt_record1("Freezing timetrace because of HOMAIOCFREEZE ioctl, pid %d",
			   current->pid);
		tt_freeze();
-		result = 0;
-	} else {
-		pr_notice("Unknown Homa ioctl: %d\n", cmd);
-		result = -EINVAL;
+		return 0;
	}
-	return result;
-#else /* See strip.py */
-	return -EINVAL;
 #endif /* See strip.py */
+	if (cmd == HOMAIOCINFO)
+		return homa_ioc_info(sock, arg);
+	homa_sk(sock->sk)->error_msg = "ioctl opcode isn't supported by Homa";
+	return -EINVAL;
 }
 
 /**
@@ -905,7 +987,8 @@ int homa_socket(struct sock *sk)
  * @optname: Identifies a particular setsockopt operation.
  * @optval:  Address in user space of information about the option.
  * @optlen:  Number of bytes of data at @optval.
- * Return: 0 on success, otherwise a negative errno.
+ * Return: 0 on success, otherwise a negative errno. Sets hsk->error_msg
+ *         on errors.
  */
 int homa_setsockopt(struct sock *sk, int level, int optname,
		     sockptr_t optval, unsigned int optlen)
@@ -913,8 +996,10 @@ int homa_setsockopt(struct sock *sk, int level, int optname,
	struct homa_sock *hsk = homa_sk(sk);
	int ret;
 
-	if (level != IPPROTO_HOMA)
+	if (level != IPPROTO_HOMA) {
+		hsk->error_msg = "homa_setsockopt invoked with level not IPPROTO_HOMA";
		return -ENOPROTOOPT;
+	}
 
	if (optname == SO_HOMA_RCVBUF) {
		struct homa_rcvbuf_args args;
 #ifndef __STRIP__ /* See strip.py */
		u64 start = homa_clock();
 #endif /* See strip.py */
 
-		if (optlen != sizeof(struct homa_rcvbuf_args))
+		if (optlen != sizeof(struct homa_rcvbuf_args)) {
+			hsk->error_msg = "invalid optlen argument: must be sizeof(struct homa_rcvbuf_args)";
			return -EINVAL;
+		}
 
-		if (copy_from_sockptr(&args, optval, optlen))
+		if (copy_from_sockptr(&args, optval, optlen)) {
+			hsk->error_msg = "invalid address for homa_rcvbuf_args";
			return -EFAULT;
+		}
 
		/* Do a trivial test to make sure we can at least write the
		 * first page of the region.
		 */
		if (copy_to_user(u64_to_user_ptr(args.start), &args,
-				 sizeof(args)))
+				 sizeof(args))) {
+			hsk->error_msg = "receive buffer region is not writable";
			return -EFAULT;
+		}
 
		ret = homa_pool_set_region(hsk, u64_to_user_ptr(args.start),
					   args.length);
@@ -942,11 +1033,15 @@ int homa_setsockopt(struct sock *sk, int level, int optname,
	} else if (optname == SO_HOMA_SERVER) {
		int arg;
 
-		if (optlen != sizeof(arg))
+		if (optlen != sizeof(arg)) {
+			hsk->error_msg = "invalid optlen argument: must be sizeof(int)";
			return -EINVAL;
+		}
 
-		if (copy_from_sockptr(&arg, optval, optlen))
+		if (copy_from_sockptr(&arg, optval, optlen)) {
+			hsk->error_msg = "invalid address for SO_HOMA_SERVER value";
			return -EFAULT;
+		}
 
		if (arg)
			hsk->is_server = true;
@@ -954,6 +1049,7 @@ int homa_setsockopt(struct sock *sk, int level, int optname,
			hsk->is_server = false;
		ret = 0;
	} else {
+		hsk->error_msg = "setsockopt option not supported by Homa";
		ret = -ENOPROTOOPT;
	}
	return ret;
@@ -968,7 +1064,8 @@ int homa_setsockopt(struct sock *sk, int level, int optname,
  * @optval:  Address in user space where the option's value should be stored.
  * @optlen:  Number of bytes available at optval; will be overwritten with
  *           actual number of bytes stored.
- * Return: 0 on success, otherwise a negative errno.
+ * Return: 0 on success, otherwise a negative errno. Sets hsk->error_msg
+ *         on errors.
  */
 int homa_getsockopt(struct sock *sk, int level, int optname,
		     char __user *optval, int __user *optlen)
@@ -979,14 +1076,20 @@ int homa_getsockopt(struct sock *sk, int level, int optname,
	void *result;
	int len;
 
-	if (copy_from_sockptr(&len, USER_SOCKPTR(optlen), sizeof(int)))
+	if (copy_from_sockptr(&len, USER_SOCKPTR(optlen), sizeof(int))) {
+		hsk->error_msg = "invalid address for optlen argument to getsockopt";
		return -EFAULT;
+	}
 
-	if (level != IPPROTO_HOMA)
+	if (level != IPPROTO_HOMA) {
+		hsk->error_msg = "homa_getsockopt invoked with level not IPPROTO_HOMA";
		return -ENOPROTOOPT;
+	}
 
	if (optname == SO_HOMA_RCVBUF) {
-		if (len < sizeof(rcvbuf_args))
+		if (len < sizeof(rcvbuf_args)) {
+			hsk->error_msg = "invalid optlen argument: must be sizeof(struct homa_rcvbuf_args)";
			return -EINVAL;
+		}
 
		homa_sock_lock(hsk);
		homa_pool_get_rcvbuf(hsk->buffer_pool, &rcvbuf_args);
@@ -994,21 +1097,28 @@ int homa_getsockopt(struct sock *sk, int level, int optname,
		len = sizeof(rcvbuf_args);
		result = &rcvbuf_args;
	} else if (optname == SO_HOMA_SERVER) {
-		if (len < sizeof(is_server))
+		if (len < sizeof(is_server)) {
+			hsk->error_msg = "invalid optlen argument: must be sizeof(int)";
			return -EINVAL;
+		}
 
		is_server = hsk->is_server;
		len = sizeof(is_server);
		result = &is_server;
	} else {
+		hsk->error_msg = "getsockopt option not supported by Homa";
		return -ENOPROTOOPT;
	}
 
-	if (copy_to_sockptr(USER_SOCKPTR(optlen), &len, sizeof(int)))
+	if (copy_to_sockptr(USER_SOCKPTR(optlen), &len, sizeof(int))) {
+		hsk->error_msg = "couldn't update optlen argument to getsockopt: read-only?";
		return -EFAULT;
+	}
 
-	if (copy_to_sockptr(USER_SOCKPTR(optval), result, len))
+	if (copy_to_sockptr(USER_SOCKPTR(optval), result, len)) {
+		hsk->error_msg = "couldn't update optval argument to getsockopt: read-only?";
		return -EFAULT;
+	}
	return 0;
 }
 
@@ -1019,7 +1129,8 @@ int homa_getsockopt(struct sock *sk, int level, int optname,
  * @msg:     Structure describing the message to send; the msg_control
  *           field points to additional information.
  * @length:  Number of bytes of the message.
- * Return: 0 on success, otherwise a negative errno.
+ * Return: 0 on success, otherwise a negative errno.
Sets hsk->error_msg + * on errors. */ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) { @@ -1039,22 +1150,26 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) addr = (union sockaddr_in_union *)msg->msg_name; if (!addr) { + hsk->error_msg = "no msg_name passed to sendmsg"; result = -EINVAL; goto error; } if (unlikely(!msg->msg_control_is_user)) { tt_record("homa_sendmsg error: !msg->msg_control_is_user"); + hsk->error_msg = "msg_control argument for sendmsg isn't in user space"; result = -EINVAL; goto error; } if (unlikely(copy_from_user(&args, (void __user *)msg->msg_control, sizeof(args)))) { + hsk->error_msg = "invalid address for msg_control argument to sendmsg"; result = -EFAULT; goto error; } if (args.flags & ~HOMA_SENDMSG_VALID_FLAGS || args.reserved != 0) { + hsk->error_msg = "reserved fields in homa_sendmsg_args must be zero"; result = -EINVAL; goto error; } @@ -1062,18 +1177,20 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) if (!homa_sock_wmem_avl(hsk)) { result = homa_sock_wait_wmem(hsk, msg->msg_flags & MSG_DONTWAIT); - if (result != 0) + if (result != 0) { goto error; + } } if (addr->sa.sa_family != sk->sk_family) { + hsk->error_msg = "address family in sendmsg address must match the socket"; result = -EAFNOSUPPORT; goto error; } if (msg->msg_namelen < sizeof(struct sockaddr_in) || (msg->msg_namelen < sizeof(struct sockaddr_in6) && addr->in6.sin6_family == AF_INET6)) { - tt_record("homa_sendmsg error: msg_namelen too short"); + hsk->error_msg = "msg_namelen too short"; result = -EINVAL; goto error; } @@ -1105,6 +1222,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) if (unlikely(copy_to_user((void __user *)msg->msg_control, &args, sizeof(args)))) { homa_rpc_lock(rpc); + hsk->error_msg = "couldn't update homa_sendmsg_args argument to sendmsg: read-only?"; result = -EFAULT; goto error; } @@ -1123,7 +1241,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) tt_record4("homa_sendmsg response, id %llu, port %d, pid %d, length %d", args.id, hsk->port, current->pid, length); if (args.completion_cookie != 0) { - tt_record("homa_sendmsg error: nonzero cookie"); + hsk->error_msg = "completion_cookie must be zero when sending responses"; result = -EINVAL; goto error; } @@ -1135,18 +1253,16 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) * this could be totally valid (e.g. client is * no longer interested in it). */ - tt_record2("homa_sendmsg error: RPC id %d, peer 0x%x, doesn't exist", - args.id, tt_addr(canonical_dest)); return 0; } homa_rpc_hold(rpc); if (rpc->error) { + hsk->error_msg = "RPC has failed, so can't send response"; result = rpc->error; goto error; } if (rpc->state != RPC_IN_SERVICE) { - tt_record2("homa_sendmsg error: RPC id %d in bad state %d", - rpc->id, rpc->state); + hsk->error_msg = "RPC is not in a state where a response can be sent"; result = -EINVAL; goto error_dont_end_rpc; } @@ -1191,7 +1307,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) * @flags: Flags from system call; only MSG_DONTWAIT is used. * @addr_len: Store the length of the sender address here * Return: The length of the message on success, otherwise a negative - * errno. + * errno. Sets hsk->error_msg on errors. 
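+ *	   On success, the homa_recvmsg_args struct named by msg_control is
+ *	   updated in place; passing the same struct unmodified to the next
+ *	   recvmsg call automatically returns its buffers to Homa.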
  */
 int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
		  int *addr_len)
 {
@@ -1213,41 +1329,57 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
		/* This test isn't strictly necessary, but it provides a
		 * hook for testing kernel call times.
		 */
+		hsk->error_msg = "no msg_control passed to recvmsg";
		return -EINVAL;
	}
-	if (msg->msg_controllen != sizeof(control))
+	if (msg->msg_controllen != sizeof(control)) {
+		hsk->error_msg = "invalid msg_controllen in recvmsg";
		return -EINVAL;
+	}
	if (unlikely(copy_from_user(&control, (void __user *)msg->msg_control,
-				    sizeof(control))))
+				    sizeof(control)))) {
+		hsk->error_msg = "invalid address for msg_control argument to recvmsg";
		return -EFAULT;
+	}
	control.completion_cookie = 0;
	tt_record2("homa_recvmsg starting, port %d, pid %d",
		   hsk->port, current->pid);
 
-	if (control.num_bpages > HOMA_MAX_BPAGES || control.reserved != 0) {
+	if (control.num_bpages > HOMA_MAX_BPAGES) {
+		hsk->error_msg = "num_bpages exceeds HOMA_MAX_BPAGES";
+		result = -EINVAL;
+		goto done;
+	}
+	if (control.reserved != 0) {
+		hsk->error_msg = "reserved fields in homa_recvmsg_args must be zero";
		result = -EINVAL;
		goto done;
	}
	if (!hsk->buffer_pool) {
+		hsk->error_msg = "SO_HOMA_RCVBUF socket option has not been set";
		result = -EINVAL;
		goto done;
	}
	result = homa_pool_release_buffers(hsk->buffer_pool, control.num_bpages,
					   control.bpage_offsets);
	control.num_bpages = 0;
-	if (result != 0)
+	if (result != 0) {
+		hsk->error_msg = "error while releasing buffer pages";
		goto done;
+	}
 
	nonblocking = flags & MSG_DONTWAIT;
	if (control.id != 0) {
		rpc = homa_rpc_find_client(hsk, control.id); /* Locks RPC. */
		if (!rpc) {
+			hsk->error_msg = "invalid RPC id passed to recvmsg";
			result = -EINVAL;
			goto done;
		}
		homa_rpc_hold(rpc);
		result = homa_wait_private(rpc, nonblocking);
		if (result != 0) {
+			hsk->error_msg = "error while waiting for private RPC to complete";
			control.id = 0;
			goto done;
		}
@@ -1258,11 +1390,17 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
			 * prevented us from finding an RPC to return. Errors
			 * in the RPC itself are handled below.
			 */
+			hsk->error_msg = "error while waiting for shared RPC to complete";
			result = PTR_ERR(rpc);
			rpc = NULL;
			goto done;
		}
	}
-	result = rpc->error ? rpc->error : rpc->msgin.length;
+	if (rpc->error) {
+		hsk->error_msg = "RPC failed";
+		result = rpc->error;
+	} else {
+		result = rpc->msgin.length;
+	}
 
 #ifndef __STRIP__ /* See strip.py */
@@ -1345,11 +1484,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
	if (unlikely(copy_to_user((__force void __user *)msg->msg_control,
				  &control, sizeof(control)))) {
-#ifndef __UPSTREAM__ /* See strip.py */
-		/* Note: in this case the message's buffers will be leaked. */
-		pr_notice("%s couldn't copy back args to 0x%px\n",
-			  __func__, msg->msg_control);
-#endif /* See strip.py */
+		hsk->error_msg = "couldn't update homa_recvmsg_args argument to recvmsg: read-only?";
		result = -EFAULT;
	}
 
diff --git a/homa_pool.c b/homa_pool.c
index b1d76a78..a2684cad 100644
--- a/homa_pool.c
+++ b/homa_pool.c
@@ -546,3 +546,27 @@ void homa_pool_check_waiting(struct homa_pool *pool)
		homa_rpc_unlock(rpc);
	}
 }
+
+/**
+ * homa_pool_avail_bytes() - Return a count of the number of bytes currently
+ * unused and available for allocation in a pool.
+ * @pool: Pool of interest.
+ * Return: See above.
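+ *
+ * The result is approximate: it counts HOMA_BPAGE_SIZE bytes for each
+ * completely free bpage, plus the unallocated tail of any bpage that is
+ * currently owned by a core, and it may change concurrently with the
+ * caller.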
+ */
+u64 homa_pool_avail_bytes(struct homa_pool *pool)
+{
+	struct homa_pool_core *core;
+	u64 avail;
+	int cpu;
+
+	if (!pool->region)
+		return 0;
+	avail = atomic_read(&pool->free_bpages);
+	avail *= HOMA_BPAGE_SIZE;
+	for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+		core = per_cpu_ptr(pool->cores, cpu);
+		if (pool->descriptors[core->page_hint].owner == cpu)
+			avail += HOMA_BPAGE_SIZE - core->allocated;
+	}
+	return avail;
+}
diff --git a/homa_pool.h b/homa_pool.h
index 15ba5c5d..1f545566 100644
--- a/homa_pool.h
+++ b/homa_pool.h
@@ -120,6 +120,7 @@ struct homa_pool {
 bool homa_bpage_available(struct homa_bpage *bpage, u64 now);
 struct homa_pool *homa_pool_alloc(struct homa_sock *hsk);
 int homa_pool_alloc_msg(struct homa_rpc *rpc);
+u64 homa_pool_avail_bytes(struct homa_pool *pool);
 void homa_pool_check_waiting(struct homa_pool *pool);
 void homa_pool_free(struct homa_pool *pool);
 void __user *homa_pool_get_buffer(struct homa_rpc *rpc, int offset,
diff --git a/homa_rpc.c b/homa_rpc.c
index 0b482277..014dbbb8 100644
--- a/homa_rpc.c
+++ b/homa_rpc.c
@@ -25,7 +25,7 @@
  *
  * Return: A pointer to the newly allocated object, or a negative
  *         errno if an error occurred. The RPC will be locked; the
- *         caller must eventually unlock it.
+ *         caller must eventually unlock it. Sets hsk->error_msg on errors.
  */
 struct homa_rpc *homa_rpc_alloc_client(struct homa_sock *hsk,
				        const union sockaddr_in_union *dest)
@@ -37,8 +37,10 @@ struct homa_rpc *homa_rpc_alloc_client(struct homa_sock *hsk,
	int err;
 
	crpc = kzalloc(sizeof(*crpc), GFP_KERNEL);
-	if (unlikely(!crpc))
+	if (unlikely(!crpc)) {
+		hsk->error_msg = "couldn't allocate memory for client RPC";
		return ERR_PTR(-ENOMEM);
+	}
 
	/* Initialize fields that don't require the socket lock. */
	crpc->hsk = hsk;
@@ -49,7 +51,6 @@ struct homa_rpc *homa_rpc_alloc_client(struct homa_sock *hsk,
	refcount_set(&crpc->refs, 1);
	crpc->peer = homa_peer_get(hsk, &dest_addr_as_ipv6);
	if (IS_ERR(crpc->peer)) {
-		tt_record("error in homa_peer_get");
		err = PTR_ERR(crpc->peer);
		crpc->peer = NULL;
		goto error;
@@ -79,6 +80,7 @@ struct homa_rpc *homa_rpc_alloc_client(struct homa_sock *hsk,
	if (hsk->shutdown) {
		homa_sock_unlock(hsk);
		homa_rpc_unlock(crpc);
+		hsk->error_msg = "socket has been shut down";
		err = -ESHUTDOWN;
		goto error;
	}
@@ -761,3 +763,56 @@ struct homa_rpc *homa_rpc_find_server(struct homa_sock *hsk,
	homa_bucket_unlock(bucket, id);
	return NULL;
 }
+
+/**
+ * homa_rpc_get_info() - Extract information from an RPC for returning to
+ * an application via the HOMAIOCINFO ioctl.
+ * @rpc:  RPC for which information is desired.
+ * @info: Structure in which to store the information.
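+ *
+ * The caller must hold @rpc's lock, as homa_ioc_info() does.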
+ */
+void homa_rpc_get_info(struct homa_rpc *rpc, struct homa_rpc_info *info)
+{
+	struct homa_gap *gap;
+
+	memset(info, 0, sizeof(*info));
+	info->id = rpc->id;
+	if (rpc->hsk->inet.sk.sk_family == AF_INET6) {
+		info->peer.in6.sin6_family = AF_INET6;
+		info->peer.in6.sin6_addr = rpc->peer->addr;
+		info->peer.in6.sin6_port = htons(rpc->dport);
+	} else {
+		info->peer.in4.sin_family = AF_INET;
+		info->peer.in4.sin_addr.s_addr = ipv6_to_ipv4(rpc->peer->addr);
+		info->peer.in4.sin_port = htons(rpc->dport);
+	}
+	info->completion_cookie = rpc->completion_cookie;
+	if (rpc->msgout.length >= 0) {
+		info->tx_length = rpc->msgout.length;
+		info->tx_sent = rpc->msgout.next_xmit_offset;
+		info->tx_granted = rpc->msgout.granted;
+		info->tx_prio = rpc->msgout.sched_priority;
+	} else {
+		info->tx_length = -1;
+	}
+	if (rpc->msgin.length >= 0) {
+		info->rx_length = rpc->msgin.length;
+		info->rx_remaining = rpc->msgin.bytes_remaining;
+		list_for_each_entry(gap, &rpc->msgin.gaps, links) {
+			info->rx_gaps++;
+			info->rx_gap_bytes += gap->end - gap->start;
+		}
+		info->rx_granted = rpc->msgin.granted;
+		if (skb_queue_len(&rpc->msgin.packets) > 0)
+			info->flags |= HOMA_RPC_RX_COPY;
+	} else {
+		info->rx_length = -1;
+	}
+	if (!list_empty(&rpc->buf_links))
+		info->flags |= HOMA_RPC_BUF_STALL;
+	if (!list_empty(&rpc->ready_links) &&
+	    rpc->msgin.bytes_remaining == 0 &&
+	    skb_queue_len(&rpc->msgin.packets) == 0)
+		info->flags |= HOMA_RPC_RX_READY;
+	if (rpc->flags & RPC_PRIVATE)
+		info->flags |= HOMA_RPC_PRIVATE;
+}
diff --git a/homa_rpc.h b/homa_rpc.h
index 0c2eac84..269d747f 100644
--- a/homa_rpc.h
+++ b/homa_rpc.h
@@ -145,8 +145,8 @@ struct homa_gap {
  */
 struct homa_message_in {
	/**
-	 * @length: Payload size in bytes. A value less than 0 means this
-	 * structure is uninitialized and therefore not in use.
+	 * @length: Payload size in bytes. -1 means this structure is
+	 * uninitialized and therefore not in use.
	 */
	int length;
@@ -474,6 +474,8 @@ void homa_abort_rpcs(struct homa *homa, const struct in6_addr *addr,
		     int port, int error);
 void homa_abort_sock_rpcs(struct homa_sock *hsk, int error);
 void homa_rpc_abort(struct homa_rpc *crpc, int error);
+void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr,
+		    struct homa_ack *ack);
 struct homa_rpc
	*homa_rpc_alloc_client(struct homa_sock *hsk,
			       const union sockaddr_in_union *dest);
@@ -487,9 +489,8 @@ struct homa_rpc
 struct homa_rpc *homa_rpc_find_server(struct homa_sock *hsk,
				       const struct in6_addr *saddr, u64 id);
-void homa_rpc_acked(struct homa_sock *hsk, const struct in6_addr *saddr,
-		    struct homa_ack *ack);
 void homa_rpc_end(struct homa_rpc *rpc);
+void homa_rpc_get_info(struct homa_rpc *rpc, struct homa_rpc_info *info);
 int homa_rpc_reap(struct homa_sock *hsk, bool reap_all);
 
 /**
diff --git a/homa_sock.c b/homa_sock.c
index 859d1d2a..2b2d77a6 100644
--- a/homa_sock.c
+++ b/homa_sock.c
@@ -140,7 +140,7 @@ void homa_socktab_end_scan(struct homa_socktab_scan *scan)
  * @hsk: Object to initialize. The Homa-specific parts must have been
  *      initialized to zeroes by the caller.
  *
- * Return: 0 for success, otherwise a negative errno.
+ * Return: 0 for success, otherwise a negative errno.
  */
 int homa_sock_init(struct homa_sock *hsk)
 {
@@ -375,7 +375,8 @@ void homa_sock_destroy(struct sock *sk)
  *        becomes a no-op: the socket will continue to use
  *        its randomly assigned client port.
  *
- * Return: 0 for success, otherwise a negative errno.
+ * Return: 0 for success, otherwise a negative errno. If an error is
+ *         returned, hsk->error_msg is set.
  */
 int homa_sock_bind(struct homa_net *hnet, struct homa_sock *hsk,
		    u16 port)
@@ -386,11 +387,14 @@ int homa_sock_bind(struct homa_net *hnet, struct homa_sock *hsk,
	if (port == 0)
		return result;
 
-	if (port >= HOMA_MIN_DEFAULT_PORT)
+	if (port >= HOMA_MIN_DEFAULT_PORT) {
+		hsk->error_msg = "port number invalid: in the automatically assigned range";
		return -EINVAL;
+	}
	homa_sock_lock(hsk);
	spin_lock_bh(&socktab->write_lock);
	if (hsk->shutdown) {
+		hsk->error_msg = "socket has been shut down";
		result = -ESHUTDOWN;
		goto done;
	}
@@ -398,8 +402,10 @@ int homa_sock_bind(struct homa_net *hnet, struct homa_sock *hsk,
	owner = homa_sock_find(hnet, port);
	if (owner) {
		sock_put(&owner->sock);
-		if (owner != hsk)
+		if (owner != hsk) {
+			hsk->error_msg = "requested port number is already in use";
			result = -EADDRINUSE;
+		}
		goto done;
	}
	hlist_del_rcu(&hsk->socktab_links);
diff --git a/homa_sock.h b/homa_sock.h
index e2d904fa..9b433bb2 100644
--- a/homa_sock.h
+++ b/homa_sock.h
@@ -180,6 +180,14 @@ struct homa_sock {
	/** @socktab_links: Links this socket into a homa_socktab bucket. */
	struct hlist_node socktab_links;
 
+	/**
+	 * @error_msg: Static string giving human-readable information about
+	 * the reason for the last error returned by a Homa kernel call.
+	 * Applications can fetch this with the HOMAIOCINFO ioctl to figure
+	 * out why a call failed.
+	 */
+	char *error_msg;
+
	/* Information above is (almost) never modified; start a new
	 * cache line below for info that is modified frequently.
	 */
diff --git a/man/homa.7 b/man/homa.7
index 0064c02d..6c46d7a4 100644
--- a/man/homa.7
+++ b/man/homa.7
@@ -298,6 +298,195 @@ an abort is requested, it may still be executed on the server. Any
 response from the server will be discarded.
 .PP
 Only outgoing (client-side) RPCs may be aborted.
+.SH STATUS INFORMATION
+.PP
+To retrieve information about the state of a Homa socket, invoke
+.B ioctl
+with the
+.B HOMAIOCINFO
+operation. One additional argument must be specified for
+.BR ioctl ,
+consisting of a pointer to the following structure:
+.in +4n
+.ps -1
+.vs -2
+.EX
+struct homa_info {
+    struct homa_rpc_info *rpc_info;
+    size_t rpc_info_length;
+    __u64 bpool_avail_bytes;
+    __u32 port;
+    __u32 num_rpcs;
+    char error_msg[HOMA_ERROR_MSG_SIZE];
+};
+.EE
+.vs +2
+.ps +1
+.in
+.PP
+The caller must set the
+.B rpc_info
+and
+.B rpc_info_length
+fields before invoking
+.BR ioctl .
+The
+.B rpc_info
+field points to an area in application memory that can be used to
+return detailed information about all of the active RPCs on the socket;
+.B rpc_info_length
+indicates how much memory is available at
+.BR rpc_info ,
+in bytes. If
+.B rpc_info
+is NULL or
+.B rpc_info_length
+is zero then no detailed RPC information will be returned.
+.PP
+The other fields are used to return information about the Homa socket
+to the application:
+.TP 18n
+.BR bpool_avail_bytes
+The amount of memory currently available in the receive
+buffer region for the socket (previously specified with the
+.B SO_HOMA_RCVBUF
+socket option).
+.TP 18n
+.BR port
+The socket's port number.
+.TP 18n
+.BR num_rpcs
+The number of active RPCs on the socket.
+.TP 18n
+.BR error_msg
+A null-terminated string containing additional information about the
+most recent error returned by Homa for a system call on this socket. This
+is particularly useful for errors such as
+.B EINVAL
+and
+.BR EFAULT ,
+where there are numerous possible causes.
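+.PP
+For example, the following minimal sketch (error handling omitted, and
+assuming that homa.h and sys/ioctl.h have been included) retrieves the
+explanation for the most recent failure on socket
+.IR fd :
+.in +4n
+.EX
+struct homa_info hinfo;
+
+/* Zeroed rpc_info and rpc_info_length mean that no per-RPC
+ * details are requested.
+ */
+memset(&hinfo, 0, sizeof(hinfo));
+if (ioctl(fd, HOMAIOCINFO, &hinfo) == 0)
+    fprintf(stderr, "last Homa error: %s\en", hinfo.error_msg);
+.EE
+.in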
+.PP
+If
+.B num_rpcs
+is greater than zero and
+.B rpc_info
+has been specified, then details about active RPCs will be returned in
+.B rpc_info
+(if
+.B rpc_info_length
+is too small to hold all of the active RPCs, then some RPCs will
+not be recorded). Each element of
+.B rpc_info
+contains information about one RPC, in the following format:
+.in +4n
+.ps -1
+.vs -2
+.EX
+struct homa_rpc_info {
+    __u64 id;
+    union {
+        struct sockaddr_storage storage;
+        struct sockaddr_in in4;
+        struct sockaddr_in6 in6;
+    } peer;
+    __u64 completion_cookie;
+    __s32 tx_length;
+    __u32 tx_sent;
+    __u32 tx_granted;
+    __u32 tx_prio;
+    __s32 rx_length;
+    __u32 rx_remaining;
+    __u32 rx_gaps;
+    __u32 rx_gap_bytes;
+    __u32 rx_granted;
+    __u16 flags;
+};
+.EE
+.vs +2
+.ps +1
+.in
+.PP
+The fields have the following meaning:
+.TP 18n
+.BR id
+Identifier for the RPC. If the low-order bit is 1, this node is the server
+for the RPC; 0 means this node is the client.
+.TP 18n
+.BR peer
+Address of the peer application for the RPC, including both its host address
+and port number.
+.TP 18n
+.BR completion_cookie
+For client RPCs, this is the completion cookie specified when
+.B sendmsg
+was invoked to create the RPC. For server RPCs this field is always zero.
+.TP 18n
+.BR tx_length
+The length of the outgoing message for the RPC (in bytes) or -1 if this
+is a server RPC and
+.B sendmsg
+has not yet been invoked for the RPC's response.
+.TP 18n
+.BR tx_sent
+The number of bytes in the outgoing message that have been transmitted
+at least once (this is also the index within the message of the first
+byte that has not yet been transmitted).
+.TP 18n
+.BR tx_granted
+The number of bytes in the outgoing message that this node is authorized
+to transmit (includes any unscheduled bytes).
+.TP 18n
+.BR tx_prio
+The priority level currently being used to transmit data packets.
+.TP 18n
+.BR rx_length
+The length of the incoming message for the RPC (in bytes) or -1 if this
+is a client RPC and no packets have been received for the response yet.
+.TP 18n
+.BR rx_remaining
+The number of bytes in the incoming message that have not yet been received.
+.TP 18n
+.BR rx_gaps
+The number of gaps in the incoming message. A gap is a range of bytes that
+have not been received while the byte just after the end of the range has
+been received.
+.TP 18n
+.BR rx_gap_bytes
+The total number of bytes in all of the gaps of the incoming message.
+.TP 18n
+.BR rx_granted
+The number of bytes in the incoming message that this node has authorized
+the peer to transmit (this is also the index within the message of the
+first byte that has not been granted).
+.TP 18n
+.BR flags
+A bit mask containing various flags; see below.
+.PP
+The supported bits in
+.B flags
+are as follows:
+.TP 25n
+.BR HOMA_RPC_PRIVATE
+This is a client RPC and was declared private when
+.B sendmsg
+was invoked.
+.TP 25n
+.BR HOMA_RPC_BUF_STALL
+The incoming message has stalled because there is no buffer space
+available for it. Transmission of the message will restart when buffer space
+becomes available.
+.TP 25n
+.BR HOMA_RPC_RX_COPY
+There exist packets that have been received for the incoming message
+whose data has not yet been copied out of
+the packet(s) and into the buffer memory for the message.
+.TP 25n
+.BR HOMA_RPC_RX_READY
+The incoming message has been received successfully and has been copied
+to buffer memory; it is currently queued waiting for a thread to invoke
+.BR recvmsg .
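+.PP
+The sketch below (illustrative only; error handling condensed) combines
+the two structures to print one line for each active RPC on socket
+.IR fd :
+.in +4n
+.EX
+struct homa_rpc_info rinfo[64];
+struct homa_info hinfo;
+__u32 i, count;
+
+memset(&hinfo, 0, sizeof(hinfo));
+hinfo.rpc_info = rinfo;
+hinfo.rpc_info_length = sizeof(rinfo);
+if (ioctl(fd, HOMAIOCINFO, &hinfo) == 0) {
+    count = hinfo.num_rpcs;
+    if (count > 64)
+        count = 64;    /* rpc_info_length limited what was stored. */
+    for (i = 0; i < count; i++)
+        printf("RPC id %llu: tx_length %d, rx_remaining %u\en",
+               (unsigned long long)rinfo[i].id, rinfo[i].tx_length,
+               rinfo[i].rx_remaining);
+}
+.EE
+.in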
.SH SHUTDOWN .PP The diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 39a5f387..ef524e81 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -218,6 +218,8 @@ TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__cant_allocate_skb) skb = homa_tx_data_pkt_alloc(crpc, iter, 0, 500, 2000); EXPECT_TRUE(IS_ERR(skb)); EXPECT_EQ(ENOMEM, -PTR_ERR(skb)); + EXPECT_STREQ("couldn't allocate sk_buff for outgoing message", + self->hsk.error_msg); } TEST_F(homa_outgoing, homa_tx_data_pkt_alloc__include_acks) { @@ -412,6 +414,8 @@ TEST_F(homa_outgoing, homa_message_out_fill__message_too_long) EXPECT_EQ(EINVAL, -homa_message_out_fill(crpc, unit_iov_iter((void *) 1000, HOMA_MAX_MESSAGE_LENGTH+1), 0)); + EXPECT_STREQ("message length exceeded HOMA_MAX_MESSAGE_LENGTH", + self->hsk.error_msg); homa_rpc_unlock(crpc); EXPECT_EQ(0, crpc->msgout.skb_memory); EXPECT_EQ(1, refcount_read(&self->hsk.sock.sk_wmem_alloc)); @@ -560,6 +564,8 @@ TEST_F(homa_outgoing, homa_message_out_fill__error_in_homa_tx_data_packet_alloc) EXPECT_EQ(EFAULT, -homa_message_out_fill(crpc, unit_iov_iter((void *) 1000, 3000), 0)); + EXPECT_STREQ("couldn't copy message body into packet buffers", + self->hsk.error_msg); homa_rpc_unlock(crpc); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); EXPECT_EQ(1, crpc->msgout.num_skbs); @@ -577,6 +583,7 @@ TEST_F(homa_outgoing, homa_message_out_fill__rpc_freed_during_copy) hook_rpc = crpc; ASSERT_EQ(EINVAL, -homa_message_out_fill(crpc, unit_iov_iter((void *) 1000, 3000), 0)); + EXPECT_STREQ("rpc deleted while creating outgoing message", self->hsk.error_msg); EXPECT_EQ(0, crpc->msgout.num_skbs); EXPECT_EQ(RPC_DEAD, crpc->state); EXPECT_EQ(0, crpc->msgout.skb_memory); diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 2eea9fe6..2ddc5d10 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -156,6 +156,8 @@ TEST_F(homa_plumbing, homa_bind__version_mismatch) sock.sk = &self->hsk.inet.sk; result = homa_bind(&sock, &addr, sizeof(addr)); EXPECT_EQ(EAFNOSUPPORT, -result); + EXPECT_STREQ("address family in bind address didn't match socket", + self->hsk.error_msg); } TEST_F(homa_plumbing, homa_bind__ipv6_address_too_short) { @@ -172,6 +174,7 @@ TEST_F(homa_plumbing, homa_bind__ipv6_address_too_short) sock.sk = &self->hsk.inet.sk; result = homa_bind(&sock, &addr.sa, sizeof(addr.in6)-1); EXPECT_EQ(EINVAL, -result); + EXPECT_STREQ("ipv6 address too short", self->hsk.error_msg); } TEST_F(homa_plumbing, homa_bind__ipv6_ok) { @@ -208,6 +211,7 @@ TEST_F(homa_plumbing, homa_bind__ipv4_address_too_short) sock.sk = &self->hsk.inet.sk; result = homa_bind(&sock, &addr.sa, sizeof(addr.in4)-1); EXPECT_EQ(EINVAL, -result); + EXPECT_STREQ("ipv4 address too short", self->hsk.error_msg); } TEST_F(homa_plumbing, homa_bind__ipv4_ok) { @@ -249,6 +253,17 @@ TEST_F(homa_plumbing, homa_ioc_abort__cant_read_user_args) mock_copy_data_errors = 1; EXPECT_EQ(EFAULT, -homa_ioc_abort(&self->hsk.inet.sk, (int *) &args)); + EXPECT_STREQ("invalid address for homa_abort_args", + self->hsk.error_msg); +} +TEST_F(homa_plumbing, homa_ioc_abort__nonzero_reserved_fields) +{ + struct homa_abort_args args; + + args._pad1 = 777; + EXPECT_EQ(EINVAL, -homa_ioc_abort(&self->hsk.inet.sk, (int *) &args)); + EXPECT_STREQ("reserved fields in homa_abort_args must be zero", + self->hsk.error_msg); } TEST_F(homa_plumbing, homa_ioc_abort__abort_multiple_rpcs) { @@ -272,9 +287,188 @@ TEST_F(homa_plumbing, homa_ioc_abort__nonexistent_rpc) struct homa_abort_args args = {99, 0}; 
EXPECT_EQ(EINVAL, -homa_ioc_abort(&self->hsk.inet.sk, (int *) &args)); + EXPECT_STREQ("RPC identifier did not match any existing RPC", + self->hsk.error_msg); +} + +TEST_F(homa_plumbing, homa_ioc_info__cant_read_homa_info_from_user_space) +{ + struct homa_info hinfo; + + mock_copy_data_errors = 1; + EXPECT_EQ(EFAULT, -homa_ioc_info(&self->hsk.inet.sk, (int *) &hinfo)); + EXPECT_STREQ("invalid address for homa_info", self->hsk.error_msg); +} +TEST_F(homa_plumbing, homa_ioc_info__basics) +{ + struct homa_info hinfo; + + memset(&hinfo, 0, sizeof(hinfo)); + EXPECT_EQ(0, -homa_ioc_info(&self->hsk.inet.sk, (int *) &hinfo)); + EXPECT_EQ(100 * HOMA_BPAGE_SIZE, hinfo.bpool_avail_bytes); + EXPECT_EQ(99, hinfo.port); +} +TEST_F(homa_plumbing, homa_ioc_info__socket_shutdown) +{ + struct homa_info hinfo; + struct homa_sock hsk; + + mock_sock_init(&hsk, self->hnet, self->server_port); + homa_sock_shutdown(&hsk); + + EXPECT_EQ(ESHUTDOWN, -homa_ioc_info(&hsk.inet.sk, (int *) &hinfo)); + EXPECT_STREQ("socket has been shut down", hsk.error_msg); + unit_sock_destroy(&hsk); +} +TEST_F(homa_plumbing, homa_ioc_info__rpc_info) +{ + struct homa_rpc_info info[10]; + struct homa_info hinfo; + + unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, + self->server_ip, self->client_port, self->server_id, + 2000, 100); + unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, + self->server_ip, self->client_port, self->server_id + 2, + 5000, 2000); + hinfo.rpc_info = info; + hinfo.rpc_info_length = sizeof(info); + + EXPECT_EQ(0, -homa_ioc_info(&self->hsk.inet.sk, (int *) &hinfo)); + EXPECT_EQ(2, hinfo.num_rpcs); + EXPECT_EQ(self->server_id, info[0].id); + EXPECT_EQ(self->server_id + 2, info[1].id); +} +TEST_F(homa_plumbing, homa_ioc_info__ignore_dead_rpc) +{ + struct homa_rpc_info info[10]; + struct homa_info hinfo; + struct homa_rpc *srpc; + + srpc = unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, + self->server_ip, self->client_port, self->server_id, + 2000, 100); + EXPECT_EQ(RPC_IN_SERVICE, srpc->state); + srpc->state = RPC_DEAD; + unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, + self->server_ip, self->client_port, self->server_id + 2, + 5000, 2000); + hinfo.rpc_info = info; + hinfo.rpc_info_length = sizeof(info); + + EXPECT_EQ(0, -homa_ioc_info(&self->hsk.inet.sk, (int *) &hinfo)); + EXPECT_EQ(1, hinfo.num_rpcs); + EXPECT_EQ(self->server_id + 2, info[0].id); + srpc->state = RPC_IN_SERVICE; +} +TEST_F(homa_plumbing, homa_ioc_info__no_memory_for_rpc_info) +{ + struct homa_info hinfo; + + unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, + self->server_ip, self->client_port, self->server_id, + 2000, 100); + unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, + self->server_ip, self->client_port, self->server_id + 2, + 5000, 2000); + hinfo.rpc_info = NULL; + hinfo.rpc_info_length = 1000; + + EXPECT_EQ(0, -homa_ioc_info(&self->hsk.inet.sk, (int *) &hinfo)); + EXPECT_EQ(2, hinfo.num_rpcs); +} +TEST_F(homa_plumbing, homa_ioc_info__not_enough_space_for_all_rpcs) +{ + struct homa_rpc_info info[10]; + struct homa_info hinfo; + + unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, + self->server_ip, self->client_port, self->server_id, + 2000, 100); + unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, + self->server_ip, self->client_port, self->server_id + 2, + 5000, 2000); + memset(info, 0, sizeof(info)); + hinfo.rpc_info = info; + hinfo.rpc_info_length = sizeof(*info); + + EXPECT_EQ(0, -homa_ioc_info(&self->hsk.inet.sk, (int *) &hinfo)); + 
EXPECT_EQ(2, hinfo.num_rpcs); + EXPECT_EQ(self->server_id, info[0].id); + EXPECT_EQ(0, info[1].id); +} +TEST_F(homa_plumbing, homa_ioc_info__cant_copy_rpc_info_to_user) +{ + struct homa_rpc_info info[10]; + struct homa_info hinfo; + + unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, + self->server_ip, self->client_port, self->server_id, + 2000, 100); + unit_server_rpc(&self->hsk, UNIT_IN_SERVICE, self->client_ip, + self->server_ip, self->client_port, self->server_id + 2, + 5000, 2000); + memset(info, 0, sizeof(info)); + hinfo.rpc_info = info; + hinfo.rpc_info_length = sizeof(info); + + mock_copy_to_user_errors = 2; + EXPECT_EQ(EFAULT, -homa_ioc_info(&self->hsk.inet.sk, (int *) &hinfo)); + EXPECT_STREQ("couldn't copy homa_rpc_info to user space: invalid or read-only address?", + self->hsk.error_msg); + EXPECT_EQ(self->server_id, info[0].id); + EXPECT_EQ(0, info[1].id); +} +TEST_F(homa_plumbing, homa_ioc_info__error_msg) +{ + struct homa_info hinfo; + + /* First call: no error message. */ + strcpy(hinfo.error_msg, "Bogus message"); + EXPECT_EQ(0, -homa_ioc_info(&self->hsk.inet.sk, (int *) &hinfo)); + EXPECT_STREQ("", hinfo.error_msg); + + /* Second call: there is a message. */ + self->hsk.error_msg = "Sample error message"; + EXPECT_EQ(0, -homa_ioc_info(&self->hsk.inet.sk, (int *) &hinfo)); + EXPECT_STREQ("Sample error message", hinfo.error_msg); + + /* Third call: the message is too long. */ + self->hsk.error_msg = "This message is very long; " + "a lot longer than you might think; " + "so long that it exceeds the available space " + "for storing message in struct homa_info"; + EXPECT_EQ(0, -homa_ioc_info(&self->hsk.inet.sk, (int *) &hinfo)); + EXPECT_EQ(HOMA_ERROR_MSG_SIZE - 1, strlen(hinfo.error_msg)); +} +TEST_F(homa_plumbing, homa_ioc_info__cant_copy_back_to_user_space) +{ + struct homa_info hinfo; + + mock_copy_to_user_errors = 1; + EXPECT_EQ(EFAULT, -homa_ioc_info(&self->hsk.inet.sk, (int *) &hinfo)); + EXPECT_STREQ("couldn't copy homa_info to user space: read-only address?", + self->hsk.error_msg); } #endif /* See strip.py */ +TEST_F(homa_plumbing, homa_ioctl__HOMAIOCINFO) +{ + struct homa_info hinfo; + + hinfo.rpc_info = NULL; + self->hsk.error_msg = "Sample error message"; + EXPECT_EQ(0, -homa_ioctl(&self->hsk.inet.sk, HOMAIOCINFO, + (int *) &hinfo)); + EXPECT_STREQ("Sample error message", hinfo.error_msg); +} +TEST_F(homa_plumbing, homa_ioctl__unknown_ioctl_command) +{ + EXPECT_EQ(EINVAL, -homa_ioctl(&self->hsk.inet.sk, 47, NULL)); + EXPECT_STREQ("ioctl opcode isn't supported by Homa", + self->hsk.error_msg); +} + TEST_F(homa_plumbing, homa_socket__success) { struct homa_sock hsk; @@ -300,17 +494,16 @@ TEST_F(homa_plumbing, homa_setsockopt__bad_level) { EXPECT_EQ(ENOPROTOOPT, -homa_setsockopt(&self->hsk.sock, 0, 0, self->optval, sizeof(struct homa_rcvbuf_args))); -} -TEST_F(homa_plumbing, homa_setsockopt__bad_optname) -{ - EXPECT_EQ(ENOPROTOOPT, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, 0, - self->optval, sizeof(struct homa_rcvbuf_args))); + EXPECT_STREQ("homa_setsockopt invoked with level not IPPROTO_HOMA", + self->hsk.error_msg); } TEST_F(homa_plumbing, homa_setsockopt__recvbuf_bad_optlen) { EXPECT_EQ(EINVAL, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, SO_HOMA_RCVBUF, self->optval, sizeof(struct homa_rcvbuf_args) - 1)); + EXPECT_STREQ("invalid optlen argument: must be sizeof(struct homa_rcvbuf_args)", + self->hsk.error_msg); } TEST_F(homa_plumbing, homa_setsockopt__recvbuf_copy_from_sockptr_fails) { @@ -318,8 +511,10 @@ TEST_F(homa_plumbing, 
homa_setsockopt__recvbuf_copy_from_sockptr_fails)
	EXPECT_EQ(EFAULT, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA,
			SO_HOMA_RCVBUF, self->optval,
			sizeof(struct homa_rcvbuf_args)));
+	EXPECT_STREQ("invalid address for homa_rcvbuf_args",
+		     self->hsk.error_msg);
 }
-TEST_F(homa_plumbing, homa_setsockopt__recvbuf_copy_to_user_fails)
+TEST_F(homa_plumbing, homa_setsockopt__recvbuf_region_not_writable)
 {
	struct homa_rcvbuf_args args = {0x100000, 5*HOMA_BPAGE_SIZE};
 
@@ -328,6 +523,8 @@ TEST_F(homa_plumbing, homa_setsockopt__recvbuf_region_not_writable)
	EXPECT_EQ(EFAULT, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA,
			SO_HOMA_RCVBUF, self->optval,
			sizeof(struct homa_rcvbuf_args)));
+	EXPECT_STREQ("receive buffer region is not writable",
+		     self->hsk.error_msg);
 }
 TEST_F(homa_plumbing, homa_setsockopt__recvbuf_success)
 {
@@ -353,12 +550,16 @@ TEST_F(homa_plumbing, homa_setsockopt__server_bad_optlen)
 {
	EXPECT_EQ(EINVAL, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA,
			SO_HOMA_SERVER, self->optval, sizeof(int) - 1));
+	EXPECT_STREQ("invalid optlen argument: must be sizeof(int)",
+		     self->hsk.error_msg);
 }
 TEST_F(homa_plumbing, homa_setsockopt__server_copy_from_sockptr_fails)
 {
	mock_copy_data_errors = 1;
	EXPECT_EQ(EFAULT, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA,
			SO_HOMA_SERVER, self->optval, sizeof(int)));
+	EXPECT_STREQ("invalid address for SO_HOMA_SERVER value",
+		     self->hsk.error_msg);
 }
 TEST_F(homa_plumbing, homa_setsockopt__server_success)
 {
@@ -374,7 +575,13 @@ TEST_F(homa_plumbing, homa_setsockopt__server_success)
			SO_HOMA_SERVER, self->optval, sizeof(int)));
	EXPECT_EQ(0, self->hsk.is_server);
 }
-
+TEST_F(homa_plumbing, homa_setsockopt__bad_optname)
+{
+	EXPECT_EQ(ENOPROTOOPT, -homa_setsockopt(&self->hsk.sock, IPPROTO_HOMA, 0,
+			self->optval, sizeof(struct homa_rcvbuf_args)));
+	EXPECT_STREQ("setsockopt option not supported by Homa",
+		     self->hsk.error_msg);
+}
 TEST_F(homa_plumbing, homa_getsockopt__recvbuf_success)
 {
@@ -399,6 +606,8 @@ TEST_F(homa_plumbing, homa_getsockopt__cant_read_size)
	mock_copy_data_errors = 1;
	EXPECT_EQ(EFAULT, -homa_getsockopt(&self->hsk.sock, 0,
					   SO_HOMA_RCVBUF, (char *)&val, &size));
+	EXPECT_STREQ("invalid address for optlen argument to getsockopt",
+		     self->hsk.error_msg);
 }
 TEST_F(homa_plumbing, homa_getsockopt__bad_level)
 {
@@ -407,6 +616,8 @@ TEST_F(homa_plumbing, homa_getsockopt__bad_level)
	EXPECT_EQ(ENOPROTOOPT, -homa_getsockopt(&self->hsk.sock, 0,
						SO_HOMA_RCVBUF,
						(char *)&val, &size));
+	EXPECT_STREQ("homa_getsockopt invoked with level not IPPROTO_HOMA",
+		     self->hsk.error_msg);
 }
 TEST_F(homa_plumbing, homa_getsockopt__recvbuf_bad_length)
 {
@@ -415,6 +626,8 @@ TEST_F(homa_plumbing, homa_getsockopt__recvbuf_bad_length)
	EXPECT_EQ(EINVAL, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA,
					   SO_HOMA_RCVBUF,
					   (char *)&val, &size));
+	EXPECT_STREQ("invalid optlen argument: must be sizeof(struct homa_rcvbuf_args)",
+		     self->hsk.error_msg);
 }
 TEST_F(homa_plumbing, homa_getsockopt__server_bad_length)
 {
@@ -423,6 +636,8 @@ TEST_F(homa_plumbing, homa_getsockopt__server_bad_length)
	EXPECT_EQ(EINVAL, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA,
					   SO_HOMA_SERVER,
					   (char *)&is_server, &size));
+	EXPECT_STREQ("invalid optlen argument: must be sizeof(int)",
+		     self->hsk.error_msg);
 }
 TEST_F(homa_plumbing, homa_getsockopt__server_success)
 {
@@ -449,6 +664,8 @@ TEST_F(homa_plumbing, homa_getsockopt__bad_optname)
	EXPECT_EQ(ENOPROTOOPT, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA,
						SO_HOMA_RCVBUF-1,
						(char *)&val, &size));
+	EXPECT_STREQ("getsockopt option not supported by Homa",
+
self->hsk.error_msg); } TEST_F(homa_plumbing, homa_getsockopt__cant_copy_out_size) { @@ -459,6 +676,8 @@ TEST_F(homa_plumbing, homa_getsockopt__cant_copy_out_size) EXPECT_EQ(EFAULT, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, SO_HOMA_RCVBUF, (char *)&val, &size)); + EXPECT_STREQ("couldn't update optlen argument to getsockopt: read-only?", + self->hsk.error_msg); EXPECT_EQ(0, val.start); EXPECT_EQ(sizeof(val) + 10, size); } @@ -471,6 +690,8 @@ TEST_F(homa_plumbing, homa_getsockopt__cant_copy_out_value) EXPECT_EQ(EFAULT, -homa_getsockopt(&self->hsk.sock, IPPROTO_HOMA, SO_HOMA_RCVBUF, (char *)&val, &size)); + EXPECT_STREQ("couldn't update optval argument to getsockopt: read-only?", + self->hsk.error_msg); EXPECT_EQ(0, val.start); EXPECT_EQ(sizeof(val), size); } @@ -480,20 +701,26 @@ TEST_F(homa_plumbing, homa_sendmsg__msg_name_null) self->sendmsg_hdr.msg_name = NULL; EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("no msg_name passed to sendmsg", + self->hsk.error_msg); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } -TEST_F(homa_plumbing, homa_sendmsg__args_not_in_user_space) +TEST_F(homa_plumbing, homa_sendmsg__msg_control_not_in_user_space) { self->sendmsg_hdr.msg_control_is_user = 0; EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("msg_control argument for sendmsg isn't in user space", + self->hsk.error_msg); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } -TEST_F(homa_plumbing, homa_sendmsg__cant_read_args) +TEST_F(homa_plumbing, homa_sendmsg__cant_read_msg_control) { mock_copy_data_errors = 1; EXPECT_EQ(EFAULT, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("invalid address for msg_control argument to sendmsg", + self->hsk.error_msg); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } TEST_F(homa_plumbing, homa_sendmsg__illegal_flag) @@ -501,6 +728,8 @@ TEST_F(homa_plumbing, homa_sendmsg__illegal_flag) self->sendmsg_args.flags = 4; EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("reserved fields in homa_sendmsg_args must be zero", + self->hsk.error_msg); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } TEST_F(homa_plumbing, homa_sendmsg__nonzero_reserved_field) @@ -508,6 +737,8 @@ TEST_F(homa_plumbing, homa_sendmsg__nonzero_reserved_field) self->sendmsg_args.reserved = 0x1000; EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("reserved fields in homa_sendmsg_args must be zero", + self->hsk.error_msg); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } TEST_F(homa_plumbing, homa_sendmsg__bad_address_family) @@ -515,6 +746,8 @@ TEST_F(homa_plumbing, homa_sendmsg__bad_address_family) self->client_addr.in4.sin_family = 1; EXPECT_EQ(EAFNOSUPPORT, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("address family in sendmsg address must match the socket", + self->hsk.error_msg); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } TEST_F(homa_plumbing, homa_sendmsg__address_too_short) @@ -524,6 +757,7 @@ TEST_F(homa_plumbing, homa_sendmsg__address_too_short) self->sendmsg_hdr.msg_namelen = sizeof(struct sockaddr_in) - 1; EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + 
EXPECT_STREQ("msg_namelen too short", self->hsk.error_msg); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); self->client_addr.in4.sin_family = AF_INET6; @@ -531,13 +765,16 @@ TEST_F(homa_plumbing, homa_sendmsg__address_too_short) self->sendmsg_hdr.msg_namelen = sizeof(struct sockaddr_in6) - 1; EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("msg_namelen too short", self->hsk.error_msg); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } -TEST_F(homa_plumbing, homa_sendmsg__error_in_homa_rpc_new_client) +TEST_F(homa_plumbing, homa_sendmsg__error_in_homa_rpc_alloc_client) { mock_kmalloc_errors = 2; EXPECT_EQ(ENOMEM, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("couldn't allocate memory for homa_peer", + self->hsk.error_msg); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } TEST_F(homa_plumbing, homa_sendmsg__error_in_homa_message_out_fill) @@ -545,6 +782,8 @@ TEST_F(homa_plumbing, homa_sendmsg__error_in_homa_message_out_fill) self->sendmsg_hdr.msg_iter.count = HOMA_MAX_MESSAGE_LENGTH+1; EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("message length exceeded HOMA_MAX_MESSAGE_LENGTH", + self->hsk.error_msg); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } TEST_F(homa_plumbing, homa_sendmsg__cant_update_user_arguments) @@ -553,6 +792,8 @@ TEST_F(homa_plumbing, homa_sendmsg__cant_update_user_arguments) atomic64_set(&self->homa.next_outgoing_id, 1234); EXPECT_EQ(EFAULT, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("couldn't update homa_sendmsg_args argument to sendmsg: read-only?", + self->hsk.error_msg); EXPECT_SUBSTR("xmit DATA 200@0", unit_log_get()); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } @@ -591,6 +832,8 @@ TEST_F(homa_plumbing, homa_sendmsg__response_nonzero_completion_cookie) self->sendmsg_args.completion_cookie = 12345; EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("completion_cookie must be zero when sending responses", + self->hsk.error_msg); EXPECT_EQ(RPC_IN_SERVICE, srpc->state); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); } @@ -616,6 +859,8 @@ TEST_F(homa_plumbing, homa_sendmsg__response_error_in_rpc) srpc->error = -ENOMEM; EXPECT_EQ(ENOMEM, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("RPC has failed, so can't send response", + self->hsk.error_msg); EXPECT_EQ(RPC_DEAD, srpc->state); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } @@ -628,6 +873,8 @@ TEST_F(homa_plumbing, homa_sendmsg__response_wrong_state) self->sendmsg_args.id = self->server_id; EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("RPC is not in a state where a response can be sent", + self->hsk.error_msg); EXPECT_EQ(RPC_INCOMING, srpc->state); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); } @@ -641,6 +888,8 @@ TEST_F(homa_plumbing, homa_sendmsg__homa_message_out_fill_returns_error) self->sendmsg_hdr.msg_iter.count = HOMA_MAX_MESSAGE_LENGTH + 1; EXPECT_EQ(EINVAL, -homa_sendmsg(&self->hsk.inet.sk, &self->sendmsg_hdr, self->sendmsg_hdr.msg_iter.count)); + EXPECT_STREQ("message length exceeded HOMA_MAX_MESSAGE_LENGTH", + self->hsk.error_msg); EXPECT_EQ(RPC_DEAD, 
srpc->state); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } @@ -690,12 +939,16 @@ TEST_F(homa_plumbing, homa_recvmsg__wrong_args_length) self->recvmsg_hdr.msg_controllen -= 1; EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, 0, 0, &self->recvmsg_hdr.msg_namelen)); + EXPECT_STREQ("invalid msg_controllen in recvmsg", + self->hsk.error_msg); } TEST_F(homa_plumbing, homa_recvmsg__cant_read_args) { mock_copy_data_errors = 1; EXPECT_EQ(EFAULT, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, 0, 0, &self->recvmsg_hdr.msg_namelen)); + EXPECT_STREQ("invalid address for msg_control argument to recvmsg", + self->hsk.error_msg); } TEST_F(homa_plumbing, homa_recvmsg__clear_cookie) { @@ -712,12 +965,15 @@ TEST_F(homa_plumbing, homa_recvmsg__num_bpages_too_large) self->recvmsg_args.num_bpages = HOMA_MAX_BPAGES + 1; EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, 0, 0, &self->recvmsg_hdr.msg_namelen)); + EXPECT_STREQ("num_pages exceeds HOMA_MAX_BPAGES", self->hsk.error_msg); } TEST_F(homa_plumbing, homa_recvmsg__reserved_not_zero) { self->recvmsg_args.reserved = 1; EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, 0, 0, &self->recvmsg_hdr.msg_namelen)); + EXPECT_STREQ("reserved fields in homa_recvmsg_args must be zero", + self->hsk.error_msg); } TEST_F(homa_plumbing, homa_recvmsg__no_buffer_pool) { @@ -726,6 +982,8 @@ TEST_F(homa_plumbing, homa_recvmsg__no_buffer_pool) self->hsk.buffer_pool = NULL; EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, 0, 0, &self->recvmsg_hdr.msg_namelen)); + EXPECT_STREQ("SO_HOMA_RECVBUF socket option has not been set", + self->hsk.error_msg); self->hsk.buffer_pool = saved_pool; } TEST_F(homa_plumbing, homa_recvmsg__release_buffers) @@ -751,6 +1009,8 @@ TEST_F(homa_plumbing, homa_recvmsg__error_in_release_buffers) EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, 0, MSG_DONTWAIT, &self->recvmsg_hdr.msg_namelen)); + EXPECT_STREQ("error while releasing buffer pages", + self->hsk.error_msg); } TEST_F(homa_plumbing, homa_recvmsg__private_rpc_doesnt_exist) { @@ -758,6 +1018,8 @@ TEST_F(homa_plumbing, homa_recvmsg__private_rpc_doesnt_exist) EXPECT_EQ(EINVAL, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, 0, 0, &self->recvmsg_hdr.msg_namelen)); + EXPECT_STREQ("invalid RPC id passed to recvmsg", + self->hsk.error_msg); } TEST_F(homa_plumbing, homa_recvmsg__error_from_homa_wait_private) { @@ -772,6 +1034,8 @@ TEST_F(homa_plumbing, homa_recvmsg__error_from_homa_wait_private) EXPECT_EQ(EAGAIN, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, 0, MSG_DONTWAIT, &self->recvmsg_hdr.msg_namelen)); + EXPECT_STREQ("error while waiting for private RPC to complete", + self->hsk.error_msg); EXPECT_EQ(0, self->recvmsg_args.id); EXPECT_EQ(1, unit_list_length(&self->hsk.active_rpcs)); } @@ -789,6 +1053,7 @@ TEST_F(homa_plumbing, homa_recvmsg__private_rpc_has_error) EXPECT_EQ(ETIMEDOUT, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, 0, MSG_DONTWAIT, &self->recvmsg_hdr.msg_namelen)); + EXPECT_STREQ("RPC failed", self->hsk.error_msg); EXPECT_EQ(self->client_id, self->recvmsg_args.id); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } @@ -796,6 +1061,8 @@ TEST_F(homa_plumbing, homa_recvmsg__error_from_homa_wait_shared) { EXPECT_EQ(EAGAIN, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, 0, MSG_DONTWAIT, &self->recvmsg_hdr.msg_namelen)); + EXPECT_STREQ("error while waiting for shared RPC to complete", + self->hsk.error_msg); } 
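/* Note (reviewer aside, not part of the patch): the EXPECT_STREQ
 * assertions in these tests all rely on the same convention in
 * homa_plumbing.c: every error path first records a static string in
 * hsk->error_msg and then returns a negative errno. A minimal sketch of
 * the kernel-side pattern the tests assume (illustrative only; the
 * variable name "control" is hypothetical):
 *
 *	if (control.num_bpages > HOMA_MAX_BPAGES) {
 *		hsk->error_msg = "num_pages exceeds HOMA_MAX_BPAGES";
 *		return -EINVAL;
 *	}
 *
 * Because the message is a string literal, nothing has to be freed, and
 * HOMAIOCINFO can later copy the most recent message to user space in
 * homa_info.error_msg (see the homa_ioc_info tests above).
 */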
TEST_F(homa_plumbing, homa_recvmsg__MSG_DONT_WAIT) { @@ -808,6 +1075,8 @@ TEST_F(homa_plumbing, homa_recvmsg__MSG_DONT_WAIT) EXPECT_EQ(EAGAIN, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, 0, MSG_DONTWAIT, &self->recvmsg_hdr.msg_namelen)); + EXPECT_STREQ("error while waiting for shared RPC to complete", + self->hsk.error_msg); } TEST_F(homa_plumbing, homa_recvmsg__normal_completion_ipv4) { @@ -884,6 +1153,7 @@ TEST_F(homa_plumbing, homa_recvmsg__rpc_has_error) EXPECT_EQ(ETIMEDOUT, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, 0, 0, &self->recvmsg_hdr.msg_namelen)); + EXPECT_STREQ("RPC failed", self->hsk.error_msg); EXPECT_EQ(self->client_id, self->recvmsg_args.id); EXPECT_EQ(44444, self->recvmsg_args.completion_cookie); EXPECT_EQ(AF_INET6, self->addr.in6.sin6_family); @@ -969,6 +1239,8 @@ TEST_F(homa_plumbing, homa_recvmsg__error_copying_out_args) EXPECT_EQ(EFAULT, -homa_recvmsg(&self->hsk.inet.sk, &self->recvmsg_hdr, 0, 0, &self->recvmsg_hdr.msg_namelen)); + EXPECT_STREQ("couldn't update homa_recvmsg_args argument to recvmsg: read-only?", + self->hsk.error_msg); EXPECT_EQ(0, self->recvmsg_args.id); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index 717c6b18..cf59cac4 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -749,3 +749,35 @@ TEST_F(homa_pool, homa_pool_check_waiting__reallocation_fails) EXPECT_STREQ("", unit_log_get()); EXPECT_EQ(4, pool->bpages_needed); } + +TEST_F(homa_pool, homa_pool_avail_bytes__no_region) +{ + struct homa_pool *pool = homa_pool_alloc(&self->hsk); + + EXPECT_EQ(0, homa_pool_avail_bytes(pool)); + homa_pool_free(pool); +} +TEST_F(homa_pool, homa_pool_avail_bytes__a_few_pages_allocated) +{ + struct homa_pool *pool = self->hsk.buffer_pool; + u32 pages[10]; + + EXPECT_EQ(100 * HOMA_BPAGE_SIZE, homa_pool_avail_bytes(pool)); + EXPECT_EQ(0, homa_pool_get_pages(pool, 5, pages, 0)); + EXPECT_EQ(95 * HOMA_BPAGE_SIZE, homa_pool_avail_bytes(pool)); +} +TEST_F(homa_pool, homa_pool_avail_bytes__include_free_space_in_core_private_pages) +{ + struct homa_pool *pool = self->hsk.buffer_pool; + + pcpu_hot.cpu_number = 3; + EXPECT_EQ(100 * HOMA_BPAGE_SIZE, homa_pool_avail_bytes(pool)); + unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 2000); + EXPECT_EQ(100 * HOMA_BPAGE_SIZE - 2000, homa_pool_avail_bytes(pool)); + + pcpu_hot.cpu_number = 5; + unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, + &self->server_ip, 4000, 98, 1000, 50000); + EXPECT_EQ(100 * HOMA_BPAGE_SIZE - 52000, homa_pool_avail_bytes(pool)); +} diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 6040bfe3..6484fb55 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -53,7 +53,7 @@ FIXTURE_SETUP(homa_rpc) .sport = htons(self->client_port), .dport = htons(self->server_port), .type = DATA, - .sender_id = self->client_id + .sender_id = cpu_to_be64(self->client_id) }; self->data.message_length = htonl(10000); #ifndef __STRIP__ /* See strip.py */ @@ -85,7 +85,7 @@ static const char *dead_rpcs(struct homa_sock *hsk) return unit_log_get(); } -TEST_F(homa_rpc, homa_rpc_alloc_client_normal) +TEST_F(homa_rpc, homa_rpc_alloc_client__normal) { struct homa_rpc *crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); @@ -94,16 +94,17 @@ TEST_F(homa_rpc, homa_rpc_alloc_client_normal) homa_rpc_end(crpc); homa_rpc_unlock(crpc); } -TEST_F(homa_rpc, homa_rpc_alloc_client_malloc_error) +TEST_F(homa_rpc, homa_rpc_alloc_client__malloc_error) 
{ struct homa_rpc *crpc; mock_kmalloc_errors = 1; crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); EXPECT_TRUE(IS_ERR(crpc)); + EXPECT_STREQ("couldn't allocate memory for client RPC", self->hsk.error_msg); EXPECT_EQ(ENOMEM, -PTR_ERR(crpc)); } -TEST_F(homa_rpc, homa_rpc_alloc_client_route_error) +TEST_F(homa_rpc, homa_rpc_alloc_client__route_error) { struct homa_rpc *crpc; @@ -111,8 +112,9 @@ TEST_F(homa_rpc, homa_rpc_alloc_client_route_error) crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); EXPECT_TRUE(IS_ERR(crpc)); EXPECT_EQ(EHOSTUNREACH, -PTR_ERR(crpc)); + EXPECT_STREQ("couldn't find route for peer", self->hsk.error_msg); } -TEST_F(homa_rpc, homa_rpc_alloc_client_socket_shutdown) +TEST_F(homa_rpc, homa_rpc_alloc_client__socket_shutdown) { struct homa_rpc *crpc; @@ -120,6 +122,7 @@ TEST_F(homa_rpc, homa_rpc_alloc_client_socket_shutdown) crpc = homa_rpc_alloc_client(&self->hsk, &self->server_addr); EXPECT_TRUE(IS_ERR(crpc)); EXPECT_EQ(ESHUTDOWN, -PTR_ERR(crpc)); + EXPECT_STREQ("socket has been shut down", self->hsk.error_msg); self->hsk.shutdown = 0; } @@ -927,3 +930,158 @@ TEST_F(homa_rpc, homa_rpc_find_server) homa_rpc_unlock(srpc4); EXPECT_EQ(NULL, homa_rpc_find_server(&self->hsk, self->client_ip, 3)); } + +TEST_F(homa_rpc, homa_rpc_get_info__basics) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 1000, 20000); + struct homa_rpc_info info; + + crpc->completion_cookie = 1111; + + homa_rpc_get_info(crpc, &info); + EXPECT_EQ(AF_INET6, info.peer.in6.sin6_family); + EXPECT_EQ(0, info.peer.in6.sin6_addr.in6_u.u6_addr32[0]); + EXPECT_EQ(0, info.peer.in6.sin6_addr.in6_u.u6_addr32[1]); + EXPECT_EQ(htonl(0x0000ffff), info.peer.in6.sin6_addr.in6_u.u6_addr32[2]); + EXPECT_EQ(0x04030201, info.peer.in6.sin6_addr.in6_u.u6_addr32[3]); + EXPECT_EQ(99, ntohs(info.peer.in6.sin6_port)); + EXPECT_EQ(1234, info.id); + EXPECT_EQ(1111, info.completion_cookie); + EXPECT_EQ(1000, info.tx_length); + EXPECT_EQ(1000, info.tx_sent); + EXPECT_EQ(1000, info.tx_granted); + EXPECT_EQ(0, info.tx_prio); + EXPECT_EQ(20000, info.rx_length); + EXPECT_EQ(18600, info.rx_remaining); + EXPECT_EQ(0, info.rx_gaps); + EXPECT_EQ(0, info.rx_gap_bytes); + EXPECT_EQ(11400, info.rx_granted); +} +TEST_F(homa_rpc, homa_rpc_get_info__ipv4_address) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 1000, 20000); + struct homa_rpc_info info; + + self->hsk.inet.sk.sk_family = AF_INET; + + homa_rpc_get_info(crpc, &info); + EXPECT_EQ(AF_INET, info.peer.in4.sin_family); + EXPECT_EQ(0x04030201, info.peer.in4.sin_addr.s_addr); + EXPECT_EQ(99, ntohs(info.peer.in4.sin_port)); +} +TEST_F(homa_rpc, homa_rpc_get_info__tx_incomplete) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 5000, 20000); + struct homa_rpc_info info; + + crpc->msgout.granted = 4000; + crpc->msgout.next_xmit_offset = 1400; + crpc->msgout.sched_priority = 5; + + homa_rpc_get_info(crpc, &info); + EXPECT_EQ(5000, info.tx_length); + EXPECT_EQ(1400, info.tx_sent); + EXPECT_EQ(4000, info.tx_granted); + EXPECT_EQ(5, info.tx_prio); +} +TEST_F(homa_rpc, homa_rpc_get_info__tx_not_started) +{ + struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, + self->client_ip, self->server_ip, self->client_port, + self->server_id, 10000, 100); + struct homa_rpc_info 
info; + + homa_rpc_get_info(srpc, &info); + EXPECT_EQ(-1, info.tx_length); +} +TEST_F(homa_rpc, homa_rpc_get_info__rx_gaps) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_RCVD_ONE_PKT, self->client_ip, self->server_ip, + self->server_port, self->client_id, 5000, 20000); + struct homa_rpc_info info; + + homa_gap_alloc(&crpc->msgin.gaps, 1000, 2000); + homa_gap_alloc(&crpc->msgin.gaps, 4000, 6000); + + homa_rpc_get_info(crpc, &info); + EXPECT_EQ(2, info.rx_gaps); + EXPECT_EQ(3000, info.rx_gap_bytes); +} +TEST_F(homa_rpc, homa_rpc_get_info__rx_not_started) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 5000, 20000); + struct homa_rpc_info info; + + homa_rpc_get_info(crpc, &info); + EXPECT_EQ(-1, info.rx_length); +} +TEST_F(homa_rpc, homa_rpc_get_info__HOMA_RPC_BUF_STALL) +{ + struct homa_rpc_info info; + struct homa_rpc *crpc; + + atomic_set(&self->hsk.buffer_pool->free_bpages, 0); + crpc = unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, self->client_ip, + self->server_ip, self->server_port, + self->client_id, 5000, 20000); + + homa_rpc_get_info(crpc, &info); + EXPECT_EQ(1, info.flags); +} +TEST_F(homa_rpc, homa_rpc_get_info__HOMA_RPC_RX_READY_and_HOMA_RPC_RX_COPY) +{ + struct homa_rpc_info info; + struct homa_rpc *srpc; + struct homa_sock hsk; + + mock_sock_init(&hsk, self->hnet, self->server_port); + self->data.message_length = htonl(2400); + srpc = unit_server_rpc(&hsk, UNIT_RCVD_ONE_PKT, self->client_ip, + self->server_ip, self->client_port, + self->server_id, 2400, 100); + + /* First call: some bytes haven't been received. */ + homa_rpc_get_info(srpc, &info); + EXPECT_EQ(2400, info.rx_length); + EXPECT_EQ(1000, info.rx_remaining); + EXPECT_EQ(HOMA_RPC_RX_COPY, info.flags); + + /* Second call: all bytes received, but haven't been copied out. */ + self->data.seg.offset = htonl(1400); + homa_dispatch_pkts(mock_skb_alloc(self->client_ip, &self->data.common, + 1000, 0)); + homa_rpc_get_info(srpc, &info); + EXPECT_EQ(0, info.rx_remaining); + EXPECT_EQ(2, skb_queue_len(&srpc->msgin.packets)); + EXPECT_EQ(HOMA_RPC_RX_COPY, info.flags); + + /* Third call: all bytes copied out. 
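Copying drains srpc->msgin.packets, so HOMA_RPC_RX_COPY should give way to HOMA_RPC_RX_READY.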
*/ + homa_rpc_lock(srpc); + homa_copy_to_user(srpc); + homa_rpc_unlock(srpc); + homa_rpc_get_info(srpc, &info); + EXPECT_EQ(0, skb_queue_len(&srpc->msgin.packets)); + EXPECT_EQ(HOMA_RPC_RX_READY, info.flags); + + unit_sock_destroy(&hsk); +} +TEST_F(homa_rpc, homa_rpc_get_info__HOMA_RPC_PRIVATE) +{ + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 5000, 20000); + struct homa_rpc_info info; + + crpc->flags |= RPC_PRIVATE; + homa_rpc_get_info(crpc, &info); + EXPECT_EQ(HOMA_RPC_PRIVATE, info.flags); +} diff --git a/test/unit_homa_sock.c b/test/unit_homa_sock.c index f4770355..dd83df82 100644 --- a/test/unit_homa_sock.c +++ b/test/unit_homa_sock.c @@ -311,8 +311,12 @@ TEST_F(homa_sock, homa_sock_bind) EXPECT_EQ(HOMA_MIN_DEFAULT_PORT, self->hsk.port); EXPECT_EQ(EINVAL, -homa_sock_bind(self->hnet, &self->hsk, HOMA_MIN_DEFAULT_PORT + 100)); + EXPECT_STREQ("port number invalid: in the automatically assigned range", + self->hsk.error_msg); EXPECT_EQ(EADDRINUSE, -homa_sock_bind(self->hnet, &self->hsk, 100)); + EXPECT_STREQ("requested port number is already in use", + self->hsk.error_msg); EXPECT_EQ(0, -homa_sock_bind(self->hnet, &hsk2, 100)); EXPECT_EQ(0, -homa_sock_bind(self->hnet, &self->hsk, 110)); @@ -329,6 +333,7 @@ TEST_F(homa_sock, homa_sock_bind__socket_shutdown) { unit_sock_destroy(&self->hsk); EXPECT_EQ(ESHUTDOWN, -homa_sock_bind(self->hnet, &self->hsk, 100)); + EXPECT_STREQ("socket has been shut down", self->hsk.error_msg); } TEST_F(homa_sock, homa_sock_find__basics) diff --git a/util/cp_node.cc b/util/cp_node.cc index c9857ac9..42c511f6 100644 --- a/util/cp_node.cc +++ b/util/cp_node.cc @@ -2692,6 +2692,83 @@ void tcp_client::read(tcp_connection *connection, int pid) } } +/** + * log_homa_info() - Use the HOMAIOCINFO ioctl to extract the status of a + * Homa socket and print the information to the log. + * @fd: File descriptor for a Homa socket. + */ +void log_homa_info(int fd) { +#define MAX_RPCS 1000 + struct homa_rpc_info *rpcs, *rinfo; + struct homa_info hinfo; + const char *priv; + std::string flags; + bool is_server; + int status; + + rpcs = new homa_rpc_info[MAX_RPCS]; + hinfo.rpc_info = rpcs; + hinfo.rpc_info_length = MAX_RPCS * sizeof(*rpcs); + status = ioctl(fd, HOMAIOCINFO, &hinfo); + if (status != 0) { + log(NORMAL, "HOMAIOCINFO failed for fd %d (%p): %s\n", fd, + &hinfo, strerror(errno)); + goto done; + } + log(NORMAL, " Homa info for port %d (fd %d):\n", hinfo.port, fd); + log(NORMAL, " Free bytes in rx buffer pool: %llu\n", + hinfo.bpool_avail_bytes); + log(NORMAL, " %d active RPCs\n", hinfo.num_rpcs); + for (__u32 i = 0; i < hinfo.num_rpcs; i++) { + rinfo = &rpcs[i]; + is_server = rinfo->id & 1; + if (rinfo->flags & HOMA_RPC_PRIVATE) + priv = "(private)"; + else + priv = ""; + log(NORMAL, " %s RPC id %llu%s:\n", is_server ?
"Server" : "Client", + rinfo->id, priv); + log(NORMAL, " Peer: %s\n", print_address(reinterpret_cast< + union sockaddr_in_union *>(&rinfo->peer))); + if (!is_server) + log(NORMAL, " Completion cookie: %lld\n", + rinfo->completion_cookie); + if (rinfo->tx_length >= 0) + log(NORMAL, " Tx: %d/%d sent, %d granted, prio %u\n", + rinfo->tx_sent, rinfo->tx_length, + rinfo->tx_granted, rinfo->tx_prio); + else + log(NORMAL, " Tx: not yet initiated\n"); + if (rinfo->rx_length >= 0) { + char gap_info[100]; + const char *state; + + if (rinfo->rx_gaps != 0) + snprintf(gap_info, sizeof(gap_info), + ", %d gaps (%d missing bytes)", + rinfo->rx_gaps, + rinfo->rx_gap_bytes); + else + gap_info[0] = 0; + state = ""; + if (rinfo->flags & HOMA_RPC_BUF_STALL) + state = " (waiting for buffer space)"; + else if (rinfo->flags & HOMA_RPC_RX_COPY) + state = " (data available for copying to user space)"; + else if (rinfo->flags & (HOMA_RPC_RX_READY)) + state = " (queued waiting for recvmsg)"; + log(NORMAL, " Rx: %d/%d remaining, %d granted%s%s\n", + rinfo->rx_remaining, rinfo->rx_length, + rinfo->rx_granted, gap_info, state); + } else { + log(NORMAL, " Rx: no packets received yet\n"); + } + } + +done: + delete[] rpcs; +} + /** * server_stats() - Prints recent statistics collected from all * servers. @@ -2902,8 +2979,19 @@ void log_stats() uint64_t now = rdtsc(); server_stats(now); client_stats(now); - last_stats_time = now; + +#if 0 + for (client *client: clients) { + homa_client *hclient = + dynamic_cast(client); + if (hclient == NULL) + continue; + log_homa_info(hclient->fd); + } + for (homa_server *server: homa_servers) + log_homa_info(server->fd); +#endif } } From ac25e05a4e4d3d7b67e81d40afbf4ebacd33c213 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 13 Oct 2025 16:09:22 -0700 Subject: [PATCH 520/625] Fix stale comment --- util/test_utils.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/util/test_utils.cc b/util/test_utils.cc index eec95ee0..81d3b561 100644 --- a/util/test_utils.cc +++ b/util/test_utils.cc @@ -217,8 +217,6 @@ void seed_buffer(void *buffer, size_t length, int seed) /** * print_address() - Generate a human-readable description of an inet address. * @addr: The address to print - * @buffer: Where to store the human readable description. - * @size: Number of bytes available in buffer. * * Return: The address of the human-readable string (buffer). * From c65a142ff89896d9ff134700e439b90c8ca281c2 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 13 Oct 2025 16:52:21 -0700 Subject: [PATCH 521/625] Get __STRIP__ working again Also fix a few bugs that the recent HOMAIOCINFO commit introduced into unit tests. --- homa.h | 4 ++++ homa_devel.c | 2 ++ homa_devel.h | 5 ++++- homa_outgoing.c | 18 ++++++++++-------- homa_rpc.c | 12 ++++++++++-- test/mock.c | 6 ++++-- test/unit_homa_outgoing.c | 4 ++-- test/unit_homa_plumbing.c | 6 +++--- test/unit_homa_rpc.c | 6 ++++-- 9 files changed, 43 insertions(+), 20 deletions(-) diff --git a/homa.h b/homa.h index ceeeaafc..33dd686d 100644 --- a/homa.h +++ b/homa.h @@ -217,11 +217,15 @@ struct homa_rpc_info { */ __u32 tx_granted; +#ifndef __STRIP__ /* See strip.py */ /** * @tx_prio: Current priority level that the receiver has specified * for transmitting packets. */ __u32 tx_prio; +#else /* See strip.py */ + __u32 reserved; +#endif /* See strip.py */ /** * @rx_length: Length of the incoming message, in bytes. 
-1 means diff --git a/homa_devel.c b/homa_devel.c index 64d3abde..da1ebbb4 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -1109,6 +1109,7 @@ void homa_rpc_stats_log(void) snap.server_response_bytes_done); } +#ifndef __STRIP__ /* See strip.py */ /** * homa_rpcs_deferred() - Return true if there are any RPCs with packets * that have been deferred by homa_qdisc, false if there are none. @@ -1272,3 +1273,4 @@ void homa_validate_rbtree(struct rb_node *node, int depth, char *message) } #endif /* __UNIT_TEST__ */ } +#endif /* See strip.py */ diff --git a/homa_devel.h b/homa_devel.h index 8d091eb6..3db08a36 100644 --- a/homa_devel.h +++ b/homa_devel.h @@ -128,7 +128,6 @@ void homa_rpc_log_tt(struct homa_rpc *rpc); void homa_rpc_log_active_tt(struct homa *homa, int freeze_count); void homa_rpc_snapshot_log_tt(void); void homa_rpc_stats_log(void); -bool homa_rpcs_deferred(struct homa *homa); void homa_snapshot_get_stats(struct homa_rpc_snapshot *snap); void homa_snapshot_rpcs(void); int homa_snprintf(char *buffer, int size, int used, @@ -137,6 +136,10 @@ char *homa_symbol_for_type(uint8_t type); char *homa_symbol_for_state(struct homa_rpc *rpc); int homa_validate_incoming(struct homa *homa, int verbose, int *link_errors); + +#ifndef __STRIP__ /* See strip.py */ +bool homa_rpcs_deferred(struct homa *homa); void homa_validate_rbtree(struct rb_node *node, int depth, char *message); +#endif /* See strip.py */ #endif /* _HOMA_DEVEL_H */ diff --git a/homa_outgoing.c b/homa_outgoing.c index b4b112b8..35fba2a5 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -400,7 +400,11 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) INC_METRIC(sent_msg_bytes, rpc->msgout.length); refcount_add(rpc->msgout.skb_memory, &rpc->hsk->sock.sk_wmem_alloc); if (xmit) +#ifndef __STRIP__ /* See strip.py */ homa_xmit_data(rpc, false); +#else /* See strip.py */ + homa_xmit_data(rpc); +#endif /* See strip.py */ return 0; error: @@ -490,16 +494,8 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, hsk->inet.tos = hsk->homa->priority_map[priority] << 5; result = ip_queue_xmit(&hsk->inet.sk, skb, &peer->flow); } -#else /* See strip.py */ - if (hsk->inet.sk.sk_family == AF_INET6) - result = ip6_xmit(&hsk->inet.sk, skb, &peer->flow.u.ip6, 0, - NULL, 0, 0); - else - result = ip_queue_xmit(&hsk->inet.sk, skb, &peer->flow); -#endif /* See strip.py */ if (unlikely(result != 0)) INC_METRIC(control_xmit_errors, 1); -#ifndef __STRIP__ /* See strip.py */ if (skb->dev) { struct netdev_queue *txq; @@ -509,6 +505,12 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, be64_to_cpu(h->sender_id), skb->queue_mapping, txq->dql.num_queued, txq->dql.adj_limit); } +#else /* See strip.py */ + if (hsk->inet.sk.sk_family == AF_INET6) + result = ip6_xmit(&hsk->inet.sk, skb, &peer->flow.u.ip6, 0, + NULL, 0, 0); + else + result = ip_queue_xmit(&hsk->inet.sk, skb, &peer->flow); #endif /* See strip.py */ INC_METRIC(packets_sent[h->type - DATA], 1); INC_METRIC(priority_bytes[priority], skb->len); diff --git a/homa_rpc.c b/homa_rpc.c index 014dbbb8..44b0416a 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -58,7 +58,7 @@ struct homa_rpc *homa_rpc_alloc_client(struct homa_sock *hsk, crpc->dport = ntohs(dest->in6.sin6_port); crpc->msgin.length = -1; crpc->msgout.length = -1; - homa_qdisc_rpc_init(&crpc->qrpc); + IF_NO_STRIP(homa_qdisc_rpc_init(&crpc->qrpc)); INIT_LIST_HEAD(&crpc->ready_links); INIT_LIST_HEAD(&crpc->buf_links); INIT_LIST_HEAD(&crpc->dead_links); @@ -165,7 
+165,7 @@ struct homa_rpc *homa_rpc_alloc_server(struct homa_sock *hsk, srpc->id = id; srpc->msgin.length = -1; srpc->msgout.length = -1; - homa_qdisc_rpc_init(&srpc->qrpc); + IF_NO_STRIP(homa_qdisc_rpc_init(&srpc->qrpc)); INIT_LIST_HEAD(&srpc->ready_links); INIT_LIST_HEAD(&srpc->buf_links); INIT_LIST_HEAD(&srpc->dead_links); @@ -789,8 +789,12 @@ void homa_rpc_get_info(struct homa_rpc *rpc, struct homa_rpc_info *info) if (rpc->msgout.length >= 0) { info->tx_length = rpc->msgout.length; info->tx_sent = rpc->msgout.next_xmit_offset; +#ifndef __STRIP__ /* See strip.py */ info->tx_granted = rpc->msgout.granted; info->tx_prio = rpc->msgout.sched_priority; +#else /* See strip.py */ + info->tx_granted = rpc->msgout.length; +#endif /* See strip.py */ } else { info->tx_length = -1; } @@ -801,7 +805,11 @@ void homa_rpc_get_info(struct homa_rpc *rpc, struct homa_rpc_info *info) info->rx_gaps++; info->rx_gap_bytes += gap->end - gap->start; } +#ifndef __STRIP__ /* See strip.py */ info->rx_granted = rpc->msgin.granted; +#else /* See strip.py */ + info->rx_granted = rpc->msgin.length; +#endif /* See strip.py */ if (skb_queue_len(&rpc->msgin.packets) > 0) info->flags |= HOMA_RPC_RX_COPY; } else { diff --git a/test/mock.c b/test/mock.c index b3a3d220..ed6ee3bc 100644 --- a/test/mock.c +++ b/test/mock.c @@ -241,7 +241,8 @@ int mock_rht_num_walk_results; /* Used instead of HOMA_MIN_DEFAULT_PORT by homa_skb.c. */ __u16 mock_min_default_port = 0x8000; -/* Used as sk_socket for all sockets created by mock_sock_init. */ +/* Used as sk_socket for all sockets created by mock_sock_init. + * Its sk field points to the most recently created Homa socket. */ static struct socket mock_socket; /* Each of the entries in mock_hnets below is associated with the @@ -2271,8 +2272,9 @@ int mock_sock_init(struct homa_sock *hsk, struct homa_net *hnet, int port) sk->sk_data_ready = mock_data_ready; sk->sk_family = mock_ipv6 ? 
AF_INET6 : AF_INET; sk->sk_socket = &mock_socket; - sk->sk_net.net = mock_net_for_hnet(hnet); memset(&mock_socket, 0, sizeof(mock_socket)); + mock_socket.sk = sk; + sk->sk_net.net = mock_net_for_hnet(hnet); refcount_set(&sk->sk_wmem_alloc, 1); init_waitqueue_head(&mock_socket.wq.wait); rcu_assign_pointer(sk->sk_wq, &mock_socket.wq); diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index ef524e81..e667f717 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -1174,7 +1174,7 @@ TEST_F(homa_outgoing, homa_resend_data__error_copying_data) unit_log_get()); } #endif /* See strip.py */ -TEST_F(homa_outgoing, homa_resend_data__add_to_to_free_and_set_homa_info) +TEST_F(homa_outgoing, homa_resend_data__update_to_free_and_set_homa_info) { struct homa_skb_info *homa_info; struct homa_rpc *crpc; @@ -1197,7 +1197,7 @@ TEST_F(homa_outgoing, homa_resend_data__add_to_to_free_and_set_homa_info) EXPECT_EQ(8400, homa_info->offset); EXPECT_EQ(crpc, homa_info->rpc); EXPECT_EQ(1, refcount_read(&skb->users)); - EXPECT_EQ(6, crpc->msgout.num_skbs); + IF_NO_STRIP(EXPECT_EQ(6, crpc->msgout.num_skbs)); } TEST_F(homa_outgoing, homa_rpc_tx_end) diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 2ddc5d10..d032400e 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -458,13 +458,13 @@ TEST_F(homa_plumbing, homa_ioctl__HOMAIOCINFO) hinfo.rpc_info = NULL; self->hsk.error_msg = "Sample error message"; - EXPECT_EQ(0, -homa_ioctl(&self->hsk.inet.sk, HOMAIOCINFO, - (int *) &hinfo)); + EXPECT_EQ(0, -homa_ioctl(self->hsk.sock.sk_socket, HOMAIOCINFO, + (unsigned long) &hinfo)); EXPECT_STREQ("Sample error message", hinfo.error_msg); } TEST_F(homa_plumbing, homa_ioctl__unknown_ioctl_command) { - EXPECT_EQ(EINVAL, -homa_ioctl(&self->hsk.inet.sk, 47, NULL)); + EXPECT_EQ(EINVAL, -homa_ioctl(self->hsk.sock.sk_socket, 47, 0)); EXPECT_STREQ("ioctl opcode isn't supported by Homa", self->hsk.error_msg); } diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index 6484fb55..fec95713 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -952,12 +952,12 @@ TEST_F(homa_rpc, homa_rpc_get_info__basics) EXPECT_EQ(1000, info.tx_length); EXPECT_EQ(1000, info.tx_sent); EXPECT_EQ(1000, info.tx_granted); - EXPECT_EQ(0, info.tx_prio); + IF_NO_STRIP(EXPECT_EQ(0, info.tx_prio)); EXPECT_EQ(20000, info.rx_length); EXPECT_EQ(18600, info.rx_remaining); EXPECT_EQ(0, info.rx_gaps); EXPECT_EQ(0, info.rx_gap_bytes); - EXPECT_EQ(11400, info.rx_granted); + IF_NO_STRIP(EXPECT_EQ(11400, info.rx_granted)); } TEST_F(homa_rpc, homa_rpc_get_info__ipv4_address) { @@ -973,6 +973,7 @@ TEST_F(homa_rpc, homa_rpc_get_info__ipv4_address) EXPECT_EQ(0x04030201, info.peer.in4.sin_addr.s_addr); EXPECT_EQ(99, ntohs(info.peer.in4.sin_port)); } +#ifndef __STRIP__ /* See strip.py */ TEST_F(homa_rpc, homa_rpc_get_info__tx_incomplete) { struct homa_rpc *crpc = unit_client_rpc(&self->hsk, @@ -990,6 +991,7 @@ TEST_F(homa_rpc, homa_rpc_get_info__tx_incomplete) EXPECT_EQ(4000, info.tx_granted); EXPECT_EQ(5, info.tx_prio); } +#endif /* See strip.py */ TEST_F(homa_rpc, homa_rpc_get_info__tx_not_started) { struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, From 4e248eb8ef03eb9c09b614179cef66060e0c72c6 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 14 Oct 2025 10:40:56 -0700 Subject: [PATCH 522/625] Cleanup issues from checkpatch, kdoc, xmastree --- homa.h | 7 +++--- homa_grant.c | 2 +- homa_incoming.c | 4 +-- homa_offload.c | 2 +- homa_outgoing.c | 7 +++--- 
homa_pacer.c | 2 +- homa_peer.h | 2 +- homa_plumbing.c | 7 +++--- homa_qdisc.c | 13 +++++----- homa_qdisc.h | 4 +-- homa_rpc.c | 3 +-- homa_rpc.h | 3 ++- homa_wire.h | 2 +- test/unit_homa_plumbing.c | 53 ++++++++++++++++++++++++++------------- 14 files changed, 65 insertions(+), 46 deletions(-) diff --git a/homa.h b/homa.h index 33dd686d..5e5505fe 100644 --- a/homa.h +++ b/homa.h @@ -224,6 +224,7 @@ struct homa_rpc_info { */ __u32 tx_prio; #else /* See strip.py */ + /** @reserved: Reserved for future use. */ __u32 reserved; #endif /* See strip.py */ @@ -235,7 +236,7 @@ struct homa_rpc_info { __s32 rx_length; /** - * @rx_received: Number of bytes in the incoming message that have + * @rx_remaining: Number of bytes in the incoming message that have * not yet been received. */ __u32 rx_remaining; @@ -287,8 +288,8 @@ struct homa_rpc_info { */ struct homa_info { /** - * @rpc_info: (in) Address of memory region in which to store infomation - * about individual RPCs. + * @rpc_info: (in) Address of memory region in which to store + * information about individual RPCs. */ struct homa_rpc_info *rpc_info; diff --git a/homa_grant.c b/homa_grant.c index a7daa7a4..1ea24a8d 100644 --- a/homa_grant.c +++ b/homa_grant.c @@ -548,8 +548,8 @@ void homa_grant_unmanage_rpc(struct homa_rpc *rpc, __must_hold(rpc->bucket->lock) { struct homa_grant *grant = rpc->hsk->homa->grant; - bool removed = false; u64 time = homa_clock(); + bool removed = false; homa_grant_lock(grant); diff --git a/homa_incoming.c b/homa_incoming.c index fee19b88..3040fc4a 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -1130,9 +1130,9 @@ int homa_wait_private(struct homa_rpc *rpc, int nonblocking) } #ifndef __STRIP__ /* See strip.py */ - if (avail_immediately) + if (avail_immediately) { INC_METRIC(wait_none, 1); - else if (result == 0) { + } else if (result == 0) { if (blocked) INC_METRIC(wait_block, 1); else diff --git a/homa_offload.c b/homa_offload.c index a2b0a214..812b567c 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -288,9 +288,9 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, * gro_list by the caller, so it will be considered for merges * in the future. */ + struct homa *homa = homa_net(dev_net(skb->dev))->homa; u64 saved_softirq_metric, softirq_cycles; struct homa_offload_core *offload_core; - struct homa *homa = homa_net(dev_net(skb->dev))->homa; struct sk_buff *result = NULL; struct homa_data_hdr *h_new; u64 *softirq_cycles_metric; diff --git a/homa_outgoing.c b/homa_outgoing.c index 35fba2a5..86cc376d 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -385,7 +385,7 @@ int homa_message_out_fill(struct homa_rpc *rpc, struct iov_iter *iter, int xmit) * a significant amount time. On high-speed networks (e.g. * 100 Gbps and above), copying from user space is the * bottleneck, so transmitting the packets here will slow - * that down. Thes, we only transmit the unscheduled packets + * that down. Thus, we only transmit the unscheduled packets * here, to fill the pipe. 
Packets after that can be * transmitted by SoftIRQ in response to incoming grants; * this allows us to use two cores: this core copying data @@ -502,8 +502,9 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, txq = netdev_get_tx_queue(skb->dev, skb->queue_mapping); if (netif_tx_queue_stopped(txq)) tt_record4("__homa_xmit_control found stopped txq for id %d, qid %u, num_queued %u, limit %d", - be64_to_cpu(h->sender_id), skb->queue_mapping, - txq->dql.num_queued, txq->dql.adj_limit); + be64_to_cpu(h->sender_id), + skb->queue_mapping, + txq->dql.num_queued, txq->dql.adj_limit); } #else /* See strip.py */ if (hsk->inet.sk.sk_family == AF_INET6) diff --git a/homa_pacer.c b/homa_pacer.c index a92dd140..ebe466a8 100644 --- a/homa_pacer.c +++ b/homa_pacer.c @@ -159,8 +159,8 @@ int homa_pacer_check_nic_q(struct homa_pacer *pacer, struct sk_buff *skb, int homa_pacer_main(void *arg) { struct homa_pacer *pacer = arg; - u64 start; int status; + u64 start; while (1) { if (kthread_should_stop()) diff --git a/homa_peer.h b/homa_peer.h index 3d50dd98..65d6701a 100644 --- a/homa_peer.h +++ b/homa_peer.h @@ -183,7 +183,7 @@ struct homa_peer { /** * @flow: Addressing info used to create @dst and also required * when transmitting packets. - * */ + */ struct flowi flow; #ifndef __STRIP__ /* See strip.py */ diff --git a/homa_plumbing.c b/homa_plumbing.c index 563e3f59..7428394b 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -928,7 +928,7 @@ int homa_ioc_info(struct socket *sock, unsigned long arg) * homa_ioctl() - Implements the ioctl system call for Homa sockets. * @sock: Socket on which the system call was invoked. * @cmd: Identifier for a particular ioctl operation. - * karg: Operation-specific argument; typically the address of a block + * @arg: Operation-specific argument; typically the address of a block * of data in user address space. * * Return: 0 on success, otherwise a negative errno. Sets hsk->error_msg @@ -1177,9 +1177,8 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) if (!homa_sock_wmem_avl(hsk)) { result = homa_sock_wait_wmem(hsk, msg->msg_flags & MSG_DONTWAIT); - if (result != 0) { + if (result != 0) goto error; - } } if (addr->sa.sa_family != sk->sk_family) { @@ -1692,8 +1691,8 @@ int homa_softirq(struct sk_buff *skb) */ int homa_err_handler_v4(struct sk_buff *skb, u32 info) { - const struct icmphdr *icmp = icmp_hdr(skb); struct homa *homa = homa_net(dev_net(skb->dev))->homa; + const struct icmphdr *icmp = icmp_hdr(skb); struct in6_addr daddr; int type = icmp->type; int code = icmp->code; diff --git a/homa_qdisc.c b/homa_qdisc.c index 7485a0a5..4976f862 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -61,7 +61,7 @@ void homa_rcu_kfree(void *object) { struct homa_rcu_kfreer *freer; - freer = kmalloc(sizeof *freer, GFP_KERNEL); + freer = kmalloc(sizeof(*freer), GFP_KERNEL); if (!freer) { /* Can't allocate memory needed for asynchronous freeing, * so free synchronously. @@ -90,7 +90,7 @@ void homa_rcu_kfree_callback(struct rcu_head *head) } /** - * homa_qdisc_alloc_devs() - Allocate and initialize a new homa_qdisc_qdevs + * homa_qdisc_qdevs_alloc() - Allocate and initialize a new homa_qdisc_qdevs * object. * Return: The new object, or an ERR_PTR if an error occurred. 
*/ @@ -415,8 +415,8 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (h->common.type == DATA) { h = (struct homa_data_hdr *)skb_transport_header(skb); tt_record3("homa_qdisc_enqueue queuing homa data packet for id %d, offset %d on qid %d", - be64_to_cpu(h->common.sender_id), offset, - q->ix); + be64_to_cpu(h->common.sender_id), offset, + q->ix); } } else { tt_record2("homa_qdisc_enqueue queuing non-homa packet, qix %d, pacer_qix %d", @@ -477,7 +477,7 @@ void homa_qdisc_defer_homa(struct homa_qdisc_dev *qdev, struct sk_buff *skb) */ void homa_qdisc_insert_rb(struct homa_qdisc_dev *qdev, struct homa_rpc *rpc) { - struct rb_node **new = &(qdev->deferred_rpcs.rb_root.rb_node); + struct rb_node **new = &qdev->deferred_rpcs.rb_root.rb_node; struct rb_node *parent = NULL; struct homa_rpc *rpc2; bool leftmost = true; @@ -801,7 +801,8 @@ int homa_qdisc_redirect_skb(struct sk_buff *skb, * @homa: Overall information about the Homa transport; used to find * homa_qdisc_devs to check. */ -void homa_qdisc_pacer_check(struct homa *homa) { +void homa_qdisc_pacer_check(struct homa *homa) +{ struct homa_qdisc_dev *qdev; u64 now = homa_clock(); int max_cycles; diff --git a/homa_qdisc.h b/homa_qdisc.h index cb8d1be4..8aca30a9 100644 --- a/homa_qdisc.h +++ b/homa_qdisc.h @@ -132,7 +132,7 @@ struct homa_qdisc_dev { u64 last_defer; /** - * @defer_lock: Sychronizes access to information about deferred + * @defer_lock: Synchronizes access to information about deferred * packets, including deferred_rpcs, tcp_deferred, and last_defer. */ spinlock_t defer_lock; @@ -190,7 +190,7 @@ struct homa_rcu_kfreer { /** @rcu_head: Holds state of a pending call_rcu invocation. */ struct rcu_head rcu_head; - /** object: Kfree this after waiting until RCU has synced. */ + /** @object: Kfree this after waiting until RCU has synced. */ void *object; }; diff --git a/homa_rpc.c b/homa_rpc.c index 44b0416a..dc77b3c9 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -564,7 +564,6 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) rpc->msgout.num_skbs--; if (num_skbs >= batch_size) goto release; - } } @@ -823,4 +822,4 @@ void homa_rpc_get_info(struct homa_rpc *rpc, struct homa_rpc_info *info) info->flags |= HOMA_RPC_RX_READY; if (rpc->flags & RPC_PRIVATE) info->flags |= HOMA_RPC_PRIVATE; -} \ No newline at end of file +} diff --git a/homa_rpc.h b/homa_rpc.h index 269d747f..1bb9e8e3 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -229,7 +229,7 @@ struct homa_message_in { */ struct homa_rpc_qdisc { /** - * @deferred: List of tx skbs from this RPC that have been deferred + * @packets: List of tx skbs from this RPC that have been deferred * by homa_qdisc. Non-empty means this RPC is currently linked into * homa_qdisc_dev->deferred_rpcs. */ @@ -520,6 +520,7 @@ static inline int homa_rpc_try_lock(struct homa_rpc *rpc) * homa_rpc_lock_preempt() - Same as homa_rpc_lock, except sets the * APP_NEEDS_LOCK flags while waiting to encourage the existing lock * owner to relinquish the lock. + * @rpc: RPC to lock. */ static inline void homa_rpc_lock_preempt(struct homa_rpc *rpc) __acquires(rpc->bucket->lock) diff --git a/homa_wire.h b/homa_wire.h index eeca2b3d..680ee7eb 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -126,7 +126,7 @@ struct homa_common_hdr { u8 type; /** - * @doff: High order 4 bits corespond to the Data Offset field of a + * @doff: High order 4 bits correspond to the Data Offset field of a * TCP header. 
In DATA packets they hold the number of 4-byte chunks * in a homa_data_hdr; used by TSO to determine where the replicated * header portion ends. For other packets the offset is always 5 diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index d032400e..3cc9127e 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -243,7 +243,8 @@ TEST_F(homa_plumbing, homa_ioc_abort__basics) struct homa_abort_args args = {self->client_id, 0}; ASSERT_NE(NULL, crpc); - EXPECT_EQ(0, homa_ioc_abort(&self->hsk.inet.sk, (int *) &args)); + EXPECT_EQ(0, homa_ioc_abort(self->hsk.sock.sk_socket, + (unsigned long) &args)); EXPECT_EQ(RPC_DEAD, crpc->state); EXPECT_EQ(0, unit_list_length(&self->hsk.active_rpcs)); } @@ -252,7 +253,8 @@ TEST_F(homa_plumbing, homa_ioc_abort__cant_read_user_args) struct homa_abort_args args = {self->client_id, 0}; mock_copy_data_errors = 1; - EXPECT_EQ(EFAULT, -homa_ioc_abort(&self->hsk.inet.sk, (int *) &args)); + EXPECT_EQ(EFAULT, -homa_ioc_abort(self->hsk.sock.sk_socket, + (unsigned long) &args)); EXPECT_STREQ("invalid address for homa_abort_args", self->hsk.error_msg); } @@ -261,7 +263,8 @@ TEST_F(homa_plumbing, homa_ioc_abort__nonzero_reserved_fields) struct homa_abort_args args; args._pad1 = 777; - EXPECT_EQ(EINVAL, -homa_ioc_abort(&self->hsk.inet.sk, (int *) &args)); + EXPECT_EQ(EINVAL, -homa_ioc_abort(self->hsk.sock.sk_socket, + (unsigned long) &args)); EXPECT_STREQ("reserved fields in homa_abort_args must be zero", self->hsk.error_msg); } @@ -277,7 +280,8 @@ TEST_F(homa_plumbing, homa_ioc_abort__abort_multiple_rpcs) ASSERT_NE(NULL, crpc1); ASSERT_NE(NULL, crpc2); - EXPECT_EQ(0, homa_ioc_abort(&self->hsk.inet.sk, (int *) &args)); + EXPECT_EQ(0, homa_ioc_abort(self->hsk.sock.sk_socket, + (unsigned long) &args)); EXPECT_EQ(-ECANCELED, crpc1->error); EXPECT_EQ(-ECANCELED, crpc2->error); EXPECT_EQ(2, unit_list_length(&self->hsk.active_rpcs)); @@ -286,17 +290,20 @@ TEST_F(homa_plumbing, homa_ioc_abort__nonexistent_rpc) { struct homa_abort_args args = {99, 0}; - EXPECT_EQ(EINVAL, -homa_ioc_abort(&self->hsk.inet.sk, (int *) &args)); + EXPECT_EQ(EINVAL, -homa_ioc_abort(self->hsk.sock.sk_socket, + (unsigned long) &args)); EXPECT_STREQ("RPC identifier did not match any existing RPC", self->hsk.error_msg); } +#endif /* See strip.py */ TEST_F(homa_plumbing, homa_ioc_info__cant_read_homa_info_from_user_space) { struct homa_info hinfo; mock_copy_data_errors = 1; - EXPECT_EQ(EFAULT, -homa_ioc_info(&self->hsk.inet.sk, (int *) &hinfo)); + EXPECT_EQ(EFAULT, -homa_ioc_info(self->hsk.sock.sk_socket, + (unsigned long) &hinfo)); EXPECT_STREQ("invalid address for homa_info", self->hsk.error_msg); } TEST_F(homa_plumbing, homa_ioc_info__basics) @@ -304,7 +311,8 @@ TEST_F(homa_plumbing, homa_ioc_info__basics) struct homa_info hinfo; memset(&hinfo, 0, sizeof(hinfo)); - EXPECT_EQ(0, -homa_ioc_info(&self->hsk.inet.sk, (int *) &hinfo)); + EXPECT_EQ(0, -homa_ioc_info(self->hsk.sock.sk_socket, + (unsigned long) &hinfo)); EXPECT_EQ(100 * HOMA_BPAGE_SIZE, hinfo.bpool_avail_bytes); EXPECT_EQ(99, hinfo.port); } @@ -316,7 +324,8 @@ TEST_F(homa_plumbing, homa_ioc_info__socket_shutdown) mock_sock_init(&hsk, self->hnet, self->server_port); homa_sock_shutdown(&hsk); - EXPECT_EQ(ESHUTDOWN, -homa_ioc_info(&hsk.inet.sk, (int *) &hinfo)); + EXPECT_EQ(ESHUTDOWN, -homa_ioc_info(self->hsk.sock.sk_socket, + (unsigned long) &hinfo)); EXPECT_STREQ("socket has been shut down", hsk.error_msg); unit_sock_destroy(&hsk); } @@ -334,7 +343,8 @@ TEST_F(homa_plumbing, homa_ioc_info__rpc_info) 
hinfo.rpc_info = info; hinfo.rpc_info_length = sizeof(info); - EXPECT_EQ(0, -homa_ioc_info(&self->hsk.inet.sk, (int *) &hinfo)); + EXPECT_EQ(0, -homa_ioc_info(self->hsk.sock.sk_socket, + (unsigned long) &hinfo)); EXPECT_EQ(2, hinfo.num_rpcs); EXPECT_EQ(self->server_id, info[0].id); EXPECT_EQ(self->server_id + 2, info[1].id); @@ -356,7 +366,8 @@ TEST_F(homa_plumbing, homa_ioc_info__ignore_dead_rpc) hinfo.rpc_info = info; hinfo.rpc_info_length = sizeof(info); - EXPECT_EQ(0, -homa_ioc_info(&self->hsk.inet.sk, (int *) &hinfo)); + EXPECT_EQ(0, -homa_ioc_info(self->hsk.sock.sk_socket, + (unsigned long) &hinfo)); EXPECT_EQ(1, hinfo.num_rpcs); EXPECT_EQ(self->server_id + 2, info[0].id); srpc->state = RPC_IN_SERVICE; @@ -374,7 +385,8 @@ TEST_F(homa_plumbing, homa_ioc_info__no_memory_for_rpc_info) hinfo.rpc_info = NULL; hinfo.rpc_info_length = 1000; - EXPECT_EQ(0, -homa_ioc_info(&self->hsk.inet.sk, (int *) &hinfo)); + EXPECT_EQ(0, -homa_ioc_info(self->hsk.sock.sk_socket, + (unsigned long) &hinfo)); EXPECT_EQ(2, hinfo.num_rpcs); } TEST_F(homa_plumbing, homa_ioc_info__not_enough_space_for_all_rpcs) @@ -392,7 +404,8 @@ TEST_F(homa_plumbing, homa_ioc_info__not_enough_space_for_all_rpcs) hinfo.rpc_info = info; hinfo.rpc_info_length = sizeof(*info); - EXPECT_EQ(0, -homa_ioc_info(&self->hsk.inet.sk, (int *) &hinfo)); + EXPECT_EQ(0, -homa_ioc_info(self->hsk.sock.sk_socket, + (unsigned long) &hinfo)); EXPECT_EQ(2, hinfo.num_rpcs); EXPECT_EQ(self->server_id, info[0].id); EXPECT_EQ(0, info[1].id); @@ -413,7 +426,8 @@ TEST_F(homa_plumbing, homa_ioc_info__cant_copy_rpc_info_to_user) hinfo.rpc_info_length = sizeof(info); mock_copy_to_user_errors = 2; - EXPECT_EQ(EFAULT, -homa_ioc_info(&self->hsk.inet.sk, (int *) &hinfo)); + EXPECT_EQ(EFAULT, -homa_ioc_info(self->hsk.sock.sk_socket, + (unsigned long) &hinfo)); EXPECT_STREQ("couldn't copy homa_rpc_info to user space: invalid or read-only address?", self->hsk.error_msg); EXPECT_EQ(self->server_id, info[0].id); @@ -425,12 +439,14 @@ TEST_F(homa_plumbing, homa_ioc_info__error_msg) /* First call: no error message. */ strcpy(hinfo.error_msg, "Bogus message"); - EXPECT_EQ(0, -homa_ioc_info(&self->hsk.inet.sk, (int *) &hinfo)); + EXPECT_EQ(0, -homa_ioc_info(self->hsk.sock.sk_socket, + (unsigned long) &hinfo)); EXPECT_STREQ("", hinfo.error_msg); /* Second call: there is a message. */ self->hsk.error_msg = "Sample error message"; - EXPECT_EQ(0, -homa_ioc_info(&self->hsk.inet.sk, (int *) &hinfo)); + EXPECT_EQ(0, -homa_ioc_info(self->hsk.sock.sk_socket, + (unsigned long) &hinfo)); EXPECT_STREQ("Sample error message", hinfo.error_msg); /* Third call: the message is too long. 
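It should be truncated to HOMA_ERROR_MSG_SIZE - 1 characters, leaving room for the terminating NUL.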
*/ @@ -438,7 +454,8 @@ TEST_F(homa_plumbing, homa_ioc_info__error_msg) "a lot longer than you might think; " "so long that it exceeds the available space " "for storing message in struct homa_info"; - EXPECT_EQ(0, -homa_ioc_info(&self->hsk.inet.sk, (int *) &hinfo)); + EXPECT_EQ(0, -homa_ioc_info(self->hsk.sock.sk_socket, + (unsigned long) &hinfo)); EXPECT_EQ(HOMA_ERROR_MSG_SIZE - 1, strlen(hinfo.error_msg)); } TEST_F(homa_plumbing, homa_ioc_info__cant_copy_back_to_user_space) @@ -446,11 +463,11 @@ TEST_F(homa_plumbing, homa_ioc_info__cant_copy_back_to_user_space) struct homa_info hinfo; mock_copy_to_user_errors = 1; - EXPECT_EQ(EFAULT, -homa_ioc_info(&self->hsk.inet.sk, (int *) &hinfo)); + EXPECT_EQ(EFAULT, -homa_ioc_info(self->hsk.sock.sk_socket, + (unsigned long) &hinfo)); EXPECT_STREQ("couldn't copy homa_info to user space: read-only address?", self->hsk.error_msg); } -#endif /* See strip.py */ TEST_F(homa_plumbing, homa_ioctl__HOMAIOCINFO) { From d417dace2e31d9c2a72db1c8164f22a61f1d9afd Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 14 Oct 2025 10:54:44 -0700 Subject: [PATCH 523/625] Minor fix in comment --- homa_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/homa_impl.h b/homa_impl.h index 4d1a7761..ea508277 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -823,7 +823,7 @@ static inline u64 homa_clock(void) #else /* __UNIT_TEST__ */ #ifndef __UPSTREAM__ /* See strip.py */ /* As of August 2025, get_cycles takes only about 8 ns/call, vs. - * 14 ns/call for ktime_get_ns. This saves about .04 core when + * 14 ns/call for ktime_get_ns. This saves about .24 core when * driving a 25 Gbps network at high load (see perf.txt for details). * Unfortunately, Linux reviewers will not allow get_cycles in the * upstreamed version. From 59ffc247e30e18ab222973823ac85122478907c1 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 14 Oct 2025 11:15:11 -0700 Subject: [PATCH 524/625] Fix trivial comment typo in homa_sock.c --- homa_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/homa_sock.c b/homa_sock.c index 2b2d77a6..0c5e97f6 100644 --- a/homa_sock.c +++ b/homa_sock.c @@ -511,7 +511,7 @@ int homa_sock_wait_wmem(struct homa_sock *hsk, int nonblocking) int result; /* Note: we can't use sock_wait_for_wmem because that function - * is not available to modules (as of August 2025 it's static). + * is not available to modules (as of August 2025 it's static). */ if (nonblocking) From 61e523937d2ca8c17b3ac4bef16da38e6fbc8bb1 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 14 Oct 2025 11:21:58 -0700 Subject: [PATCH 525/625] Remove homa_rpc_qdisc from stripped versions --- homa_rpc.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/homa_rpc.h b/homa_rpc.h index 1bb9e8e3..16750746 100644 --- a/homa_rpc.h +++ b/homa_rpc.h @@ -223,6 +223,7 @@ struct homa_message_in { #endif /* See strip.py */ }; +#ifndef __STRIP__ /* See strip.py */ /** * struct homa_rpc_qdisc - Information that homa_qdisc needs to store in * each RPC. Managed entirely by homa_qdisc. 
@@ -250,6 +251,7 @@ struct homa_rpc_qdisc { */ int tx_left; }; +#endif /* See strip.py */ /** * struct homa_rpc - One of these structures exists for each active From f36b9d9488cc57da532af79e58e9c4633977c843 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 14 Oct 2025 11:56:45 -0700 Subject: [PATCH 526/625] Modify comment to be different when stripped --- homa_rpc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/homa_rpc.c b/homa_rpc.c index dc77b3c9..1b6a36e2 100644 --- a/homa_rpc.c +++ b/homa_rpc.c @@ -546,12 +546,18 @@ int homa_rpc_reap(struct homa_sock *hsk, bool reap_all) rpc->msgout.packets = NULL; } +#ifndef __STRIP__ /* See strip.py */ /* This tests whether skb is still in a * transmit queue somewhere; if so, * can't reap the RPC since homa_qdisc * may try to access the RPC via the * skb's homa_skb_info. */ +#else /* See strip.py */ + /* Don't reap RPC if anyone besides + * us has a reference to the skb. + */ +#endif /* See strip.py */ if (refcount_read(&skb->users) > 1) { INC_METRIC(reaper_active_skbs, 1); From c7b0d5465cf9e5a33aaf9c8c6f27a1190f45b9e8 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 15 Oct 2025 09:35:14 -0700 Subject: [PATCH 527/625] Move sysctl variables from homa_pacer.c to homa_qdisc.c This is a better place for them, since homa_pacer.c will eventually go away. Also renamed homa_qdisc_qdevs to homa_qdisc_shared, and renamed qdevs variables to qshared. --- homa_devel.c | 4 +- homa_impl.h | 11 ++- homa_outgoing.c | 2 +- homa_pacer.c | 88 +------------------ homa_pacer.h | 58 +------------ homa_plumbing.c | 2 + homa_pool.c | 2 +- homa_qdisc.c | 178 +++++++++++++++++++++++++++----------- homa_qdisc.h | 64 +++++++++++--- homa_utils.c | 20 ++--- test/unit_homa_grant.c | 2 +- test/unit_homa_incoming.c | 4 +- test/unit_homa_outgoing.c | 25 +++--- test/unit_homa_pacer.c | 50 ++++------- test/unit_homa_qdisc.c | 123 +++++++++++++++----------- test/unit_homa_rpc.c | 5 +- test/utils.c | 5 +- 17 files changed, 324 insertions(+), 319 deletions(-) diff --git a/homa_devel.c b/homa_devel.c index da1ebbb4..ba748987 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -1118,12 +1118,12 @@ void homa_rpc_stats_log(void) */ bool homa_rpcs_deferred(struct homa *homa) { - struct homa_qdisc_qdevs *qdevs = homa->qdevs; + struct homa_qdisc_shared *qshared = homa->qshared; struct homa_qdisc_dev *qdev; bool result = false; rcu_read_lock(); - list_for_each_entry_rcu(qdev, &qdevs->qdevs, links) { + list_for_each_entry_rcu(qdev, &qshared->qdevs, links) { if (homa_qdisc_any_deferred(qdev)) { result = true; break; diff --git a/homa_impl.h b/homa_impl.h index ea508277..61c4c912 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -108,6 +108,11 @@ struct homa { */ atomic64_t next_outgoing_id; +#ifndef __UPSTREAM__ /* See strip.py */ + /** @qshared: Contains information used by homa_qdisc.c. */ + struct homa_qdisc_shared *qshared; +#endif /* See strip.py */ + #ifndef __STRIP__ /* See strip.py */ /** * @pacer: Information related to the pacer; managed by homa_pacer.c. @@ -119,12 +124,6 @@ struct homa { * grants for incoming messages. */ struct homa_grant *grant; - - /** - * @qdevs: Contains information used by homa_qdisc.c to manage - * homa_qdisc_qdevs for this struct homa. 
- */ - struct homa_qdisc_qdevs *qdevs; #endif /* See strip.py */ /** diff --git a/homa_outgoing.c b/homa_outgoing.c index 86cc376d..d3bfef4d 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -607,7 +607,7 @@ void homa_xmit_data(struct homa_rpc *rpc) } if (rpc->msgout.length - rpc->msgout.next_xmit_offset > - homa->pacer->throttle_min_bytes && + homa->qshared->defer_min_bytes && !homa_qdisc_active(rpc->hsk->homa)) { if (!homa_pacer_check_nic_q(homa->pacer, skb, force)) { tt_record1("homa_xmit_data adding id %u to throttle queue", diff --git a/homa_pacer.c b/homa_pacer.c index ebe466a8..f7fe7efc 100644 --- a/homa_pacer.c +++ b/homa_pacer.c @@ -9,35 +9,6 @@ #include "homa_pacer.h" #include "homa_rpc.h" -/* Used to enable sysctl access to pacer-specific configuration parameters. The - * @data fields are actually offsets within a struct homa_pacer; these are - * converted to pointers into a net-specific struct homa later. - */ -#define OFFSET(field) ((void *)offsetof(struct homa_pacer, field)) -static struct ctl_table pacer_ctl_table[] = { - { - .procname = "max_nic_queue_ns", - .data = OFFSET(max_nic_queue_ns), - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = homa_pacer_dointvec - }, - { - .procname = "pacer_fifo_fraction", - .data = OFFSET(fifo_fraction), - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = homa_pacer_dointvec - }, - { - .procname = "throttle_min_bytes", - .data = OFFSET(throttle_min_bytes), - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = homa_pacer_dointvec - }, -}; - /** * homa_pacer_alloc() - Allocate and initialize a new pacer object, which * will hold pacer-related information for @homa. @@ -57,9 +28,6 @@ struct homa_pacer *homa_pacer_alloc(struct homa *homa) pacer->fifo_count = 1000; spin_lock_init(&pacer->throttle_lock); INIT_LIST_HEAD_RCU(&pacer->throttled_rpcs); - pacer->fifo_fraction = 50; - pacer->max_nic_queue_ns = 5000; - pacer->throttle_min_bytes = 1000; init_waitqueue_head(&pacer->wait_queue); pacer->kthread = kthread_run(homa_pacer_main, pacer, "homa_pacer"); if (IS_ERR(pacer->kthread)) { @@ -68,15 +36,6 @@ struct homa_pacer *homa_pacer_alloc(struct homa *homa) goto error; } atomic64_set(&pacer->link_idle_time, homa_clock()); - - pacer->sysctl_header = register_net_sysctl(&init_net, "net/homa", - pacer_ctl_table); - if (!pacer->sysctl_header) { - err = -ENOMEM; - pr_err("couldn't register sysctl parameters for Homa pacer\n"); - goto error; - } - homa_pacer_update_sysctl_deps(pacer); return pacer; error: @@ -92,10 +51,6 @@ struct homa_pacer *homa_pacer_alloc(struct homa *homa) */ void homa_pacer_free(struct homa_pacer *pacer) { - if (pacer->sysctl_header) { - unregister_net_sysctl_table(pacer->sysctl_header); - pacer->sysctl_header = NULL; - } if (pacer->kthread) { kthread_stop(pacer->kthread); pacer->kthread = NULL; @@ -132,8 +87,8 @@ int homa_pacer_check_nic_q(struct homa_pacer *pacer, struct sk_buff *skb, while (1) { clock = homa_clock(); idle = atomic64_read(&pacer->link_idle_time); - if ((clock + pacer->max_nic_queue_cycles) < idle && !force && - !(pacer->homa->flags & HOMA_FLAG_DONT_THROTTLE)) + if ((clock + pacer->homa->qshared->max_nic_queue_cycles) < idle && + !force && !(pacer->homa->flags & HOMA_FLAG_DONT_THROTTLE)) return 0; if (!list_empty(&pacer->throttled_rpcs)) INC_METRIC(pacer_bytes, bytes); @@ -216,7 +171,7 @@ void homa_pacer_xmit(struct homa_pacer *pacer) while (1) { queue_cycles = atomic64_read(&pacer->link_idle_time) - homa_clock(); - if (queue_cycles >= pacer->max_nic_queue_cycles) + if (queue_cycles >= 
pacer->homa->qshared->max_nic_queue_cycles) break; if (list_empty(&pacer->throttled_rpcs)) break; @@ -229,7 +184,7 @@ void homa_pacer_xmit(struct homa_pacer *pacer) * Locking Strategy" in homa_impl.h). */ homa_pacer_throttle_lock(pacer); - pacer->fifo_count -= pacer->fifo_fraction; + pacer->fifo_count -= pacer->homa->qshared->fifo_fraction; if (pacer->fifo_count <= 0) { struct homa_rpc *cur; u64 oldest = ~0; @@ -360,47 +315,12 @@ void homa_pacer_update_sysctl_deps(struct homa_pacer *pacer) { u64 tmp; - pacer->max_nic_queue_cycles = - homa_ns_to_cycles(pacer->max_nic_queue_ns); - /* Underestimate link bandwidth (overestimate time) by 1%. */ tmp = 101 * 8000 * (u64)homa_clock_khz(); do_div(tmp, pacer->homa->link_mbps * 100); pacer->cycles_per_mbyte = tmp; } -/** - * homa_pacer_dointvec() - This function is a wrapper around proc_dointvec. It - * is invoked to read and write pacer-related sysctl values. - * @table: sysctl table describing value to be read or written. - * @write: Nonzero means value is being written, 0 means read. - * @buffer: Address in user space of the input/output data. - * @lenp: Not exactly sure. - * @ppos: Not exactly sure. - * - * Return: 0 for success, nonzero for error. - */ -int homa_pacer_dointvec(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table table_copy; - struct homa_pacer *pacer; - int result; - - pacer = homa_net(current->nsproxy->net_ns)->homa->pacer; - - /* Generate a new ctl_table that refers to a field in the - * net-specific struct homa. - */ - table_copy = *table; - table_copy.data = ((char *)pacer) + (uintptr_t)table_copy.data; - - result = proc_dointvec(&table_copy, write, buffer, lenp, ppos); - if (write) - homa_pacer_update_sysctl_deps(pacer); - return result; -} - /** * homa_pacer_log_throttled() - Print information to the system log about the * RPCs on the throttled list. diff --git a/homa_pacer.h b/homa_pacer.h index 08975e5b..6379f012 100644 --- a/homa_pacer.h +++ b/homa_pacer.h @@ -10,9 +10,7 @@ #define _HOMA_PACER_H #include "homa_impl.h" -#ifndef __STRIP__ /* See strip.py */ -#include "homa_metrics.h" -#endif /* See strip.py */ +#include "homa_qdisc.h" /** * struct homa_pacer - Contains information that the pacer users to @@ -54,27 +52,6 @@ struct homa_pacer { */ u64 throttle_add; - /** - * @fifo_fraction: Out of every 1000 packets transmitted by the - * pacer, this number will be transmitted from the oldest message - * rather than the highest-priority message. Set externally via - * sysctl. - */ - int fifo_fraction; - - /** - * @max_nic_queue_ns: Limits the NIC queue length: we won't queue - * up a packet for transmission if link_idle_time is this many - * nanoseconds in the future (or more). Set externally via sysctl. - */ - int max_nic_queue_ns; - - /** - * @max_nic_queue_cycles: Same as max_nic_queue_ns except in - * homa_clock() units. - */ - int max_nic_queue_cycles; - /** * @cycles_per_mbyte: the number of homa_clock() cycles that it takes to * transmit 10**6 bytes on our uplink. This is actually a slight @@ -83,25 +60,6 @@ struct homa_pacer { */ u32 cycles_per_mbyte; - /** - * @throttle_min_bytes: If a packet has fewer bytes than this, then it - * bypasses the throttle mechanism and is transmitted immediately. - * We have this limit because for very small packets CPU overheads - * make it impossible to keep up with the NIC so (a) the NIC queue - * can't grow and (b) using the pacer would serialize all of these - * packets through a single core, which makes things even worse. 
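 * (For scale: at 25 Gbps a 1000-byte packet occupies the link for
 * only 8000 bits / 25 Gbps = 320 ns, i.e. roughly 3 million packets
 * per second at this threshold size, which is more than a single
 * pacer thread can push through one core.)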
- * Set externally via sysctl. - */ - int throttle_min_bytes; - -#ifndef __STRIP__ /* See strip.py */ - /** - * @sysctl_header: Used to remove sysctl values when this structure - * is destroyed. - */ - struct ctl_table_header *sysctl_header; -#endif /* See strip.py */ - /** * @wait_queue: Used to block the pacer thread when there * are no throttled RPCs. @@ -155,14 +113,13 @@ static inline void homa_pacer_check(struct homa_pacer *pacer) * to queue new packets; if the NIC queue becomes more than half * empty, then we will help out here. */ - if ((homa_clock() + (pacer->max_nic_queue_cycles >> 1)) < + if ((homa_clock() + (pacer->homa->qshared->max_nic_queue_cycles >> 1)) < atomic64_read(&pacer->link_idle_time)) return; tt_record("homa_check_pacer calling homa_pacer_xmit"); homa_pacer_xmit(pacer); } -#ifndef __STRIP__ /* See strip.py */ /** * homa_pacer_throttle_lock() - Acquire the throttle lock. If the lock * isn't immediately available, record stats on the waiting time. @@ -174,17 +131,6 @@ static inline void homa_pacer_throttle_lock(struct homa_pacer *pacer) if (!spin_trylock_bh(&pacer->throttle_lock)) homa_pacer_throttle_lock_slow(pacer); } -#else /* See strip.py */ -/** - * homa_pacer_throttle_lock() - Acquire the throttle lock. - * @pacer: Pacer information for a Homa transport. - */ -static inline void homa_pacer_throttle_lock(struct homa_pacer *pacer) - __acquires(pacer->throttle_lock) -{ - spin_lock_bh(&pacer->throttle_lock); -} -#endif /* See strip.py */ /** * homa_pacer_throttle_unlock() - Release the throttle lock. diff --git a/homa_plumbing.c b/homa_plumbing.c index 7428394b..92cf022b 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -1828,6 +1828,8 @@ int homa_dointvec(const struct ctl_table *table, int write, * dependent information). */ homa_incoming_sysctl_changed(homa); + homa_pacer_update_sysctl_deps(homa->pacer); + homa_qdisc_update_sysctl_deps(homa->qshared); /* For this value, only call the method when this * particular value was written (don't want to increment diff --git a/homa_pool.c b/homa_pool.c index a2684cad..9bc6b33f 100644 --- a/homa_pool.c +++ b/homa_pool.c @@ -551,7 +551,7 @@ void homa_pool_check_waiting(struct homa_pool *pool) * homa_pool_avail_bytes() - Return a count of the number of bytes currently * unused and available for allocation in a pool. * @pool: Pool of interest. - * Return See above. + * Return: See above. */ u64 homa_pool_avail_bytes(struct homa_pool *pool) { diff --git a/homa_qdisc.c b/homa_qdisc.c index 4976f862..d3f300d3 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -12,7 +12,6 @@ */ #include "homa_impl.h" -#include "homa_pacer.h" #include "homa_qdisc.h" #include "homa_rpc.h" #include "timetrace.h" @@ -20,6 +19,36 @@ #include #include +/* Used to enable sysctl access to configuration parameters related to + * homa_qdisc. The @data fields are actually offsets within a struct + * homa_qdisc_shared; these are converted to pointers into a net-specific + * struct homa later. 
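 * For example (sketch): an entry whose .data is OFFSET(max_nic_queue_ns)
 * is resolved in homa_qdisc_dointvec() as
 *
 *	table_copy.data = (char *)qshared + (uintptr_t)table_copy.data;
 *
 * so one static table can serve the homa_qdisc_shared of any network
 * namespace.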
+ */ +#define OFFSET(field) ((void *)offsetof(struct homa_qdisc_shared, field)) +static struct ctl_table homa_qdisc_ctl_table[] = { + { + .procname = "max_nic_queue_ns", + .data = OFFSET(max_nic_queue_ns), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_qdisc_dointvec + }, + { + .procname = "pacer_fifo_fraction", + .data = OFFSET(fifo_fraction), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_qdisc_dointvec + }, + { + .procname = "defer_min_bytes", + .data = OFFSET(defer_min_bytes), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_qdisc_dointvec + }, +}; + static struct Qdisc_ops homa_qdisc_ops __read_mostly = { .id = "homa", .priv_size = sizeof(struct homa_qdisc), @@ -90,40 +119,51 @@ void homa_rcu_kfree_callback(struct rcu_head *head) } /** - * homa_qdisc_qdevs_alloc() - Allocate and initialize a new homa_qdisc_qdevs + * homa_qdisc_shared_alloc() - Allocate and initialize a new homa_qdisc_shared * object. * Return: The new object, or an ERR_PTR if an error occurred. */ -struct homa_qdisc_qdevs *homa_qdisc_qdevs_alloc(void) +struct homa_qdisc_shared *homa_qdisc_shared_alloc(void) { - struct homa_qdisc_qdevs *qdevs; + struct homa_qdisc_shared *qshared; - qdevs = kzalloc(sizeof(*qdevs), GFP_KERNEL); - if (!qdevs) + qshared = kzalloc(sizeof(*qshared), GFP_KERNEL); + if (!qshared) return ERR_PTR(-ENOMEM); - mutex_init(&qdevs->mutex); - INIT_LIST_HEAD(&qdevs->qdevs); - return qdevs; + mutex_init(&qshared->mutex); + INIT_LIST_HEAD(&qshared->qdevs); + qshared->fifo_fraction = 50; + qshared->max_nic_queue_ns = 5000; + qshared->defer_min_bytes = 1000; + qshared->sysctl_header = register_net_sysctl(&init_net, "net/homa", + homa_qdisc_ctl_table); + if (!qshared->sysctl_header) { + pr_err("couldn't register sysctl parameters for Homa qdisc\n"); + kfree(qshared); + return ERR_PTR(-ENOMEM); + } + homa_qdisc_update_sysctl_deps(qshared); + return qshared; } /** - * homa_qdisc_qdevs_free() - Invoked when a struct homa is being freed; - * releases information related to all the assoiciated homa_qdiscs. - * @qdevs: Information about homa_qdisc_devs associated with a + * homa_qdisc_shared_free() - Invoked when a struct homa is being freed; + * releases information related to all the associated homa_qdiscs. + * @qshared: Information about homa_qdisc_devs associated with a * particular struct homa. */ -void homa_qdisc_qdevs_free(struct homa_qdisc_qdevs *qdevs) +void homa_qdisc_shared_free(struct homa_qdisc_shared *qshared) { struct homa_qdisc_dev *qdev; int stranded = 0; - /* At this point this object no-one else besides us should - * ever access this object again, but lock it just to be safe. + /* At this point no-one else besides us should ever access this object + * again, but lock it just to be safe. 
*/ - mutex_lock(&qdevs->mutex); + mutex_lock(&qshared->mutex); while (1) { - qdev = list_first_or_null_rcu(&qdevs->qdevs, + qdev = list_first_or_null_rcu(&qshared->qdevs, struct homa_qdisc_dev, links); if (!qdev) break; @@ -139,12 +179,16 @@ void homa_qdisc_qdevs_free(struct homa_qdisc_qdevs *qdevs) kthread_stop(qdev->pacer_kthread); qdev->pacer_kthread = NULL; } - if (stranded != 0) pr_err("homa_qdisc_devs_free found %d live qdevs (should have been none)\n", stranded); - mutex_unlock(&qdevs->mutex); - homa_rcu_kfree(qdevs); + + if (qshared->sysctl_header) { + unregister_net_sysctl_table(qshared->sysctl_header); + qshared->sysctl_header = NULL; + } + mutex_unlock(&qshared->mutex); + homa_rcu_kfree(qshared); } /** @@ -156,14 +200,14 @@ void homa_qdisc_qdevs_free(struct homa_qdisc_qdevs *qdevs) */ struct homa_qdisc_dev *homa_qdisc_qdev_get(struct net_device *dev) { - struct homa_qdisc_qdevs *qdevs; + struct homa_qdisc_shared *qshared; struct homa_qdisc_dev *qdev; struct homa_net *hnet; rcu_read_lock(); hnet = homa_net(dev_net(dev)); - qdevs = hnet->homa->qdevs; - list_for_each_entry_rcu(qdev, &qdevs->qdevs, links) { + qshared = hnet->homa->qshared; + list_for_each_entry_rcu(qdev, &qshared->qdevs, links) { if (qdev->dev == dev && refcount_inc_not_zero(&qdev->refs)) { rcu_read_unlock(); return qdev; @@ -175,8 +219,8 @@ struct homa_qdisc_dev *homa_qdisc_qdev_get(struct net_device *dev) * after acquiring the mutex, in case someone else already * created it). */ - mutex_lock(&qdevs->mutex); - list_for_each_entry_rcu(qdev, &qdevs->qdevs, links) { + mutex_lock(&qshared->mutex); + list_for_each_entry_rcu(qdev, &qshared->qdevs, links) { if (qdev->dev == dev && refcount_inc_not_zero(&qdev->refs)) { UNIT_LOG("; ", "race in homa_qdisc_qdev_get"); goto done; @@ -193,7 +237,7 @@ struct homa_qdisc_dev *homa_qdisc_qdev_get(struct net_device *dev) refcount_set(&qdev->refs, 1); qdev->pacer_qix = -1; qdev->redirect_qix = -1; - homa_qdisc_update_sysctl(qdev); + homa_qdev_update_sysctl(qdev); INIT_LIST_HEAD(&qdev->links); qdev->deferred_rpcs = RB_ROOT_CACHED; skb_queue_head_init(&qdev->tcp_deferred); @@ -212,10 +256,10 @@ struct homa_qdisc_dev *homa_qdisc_qdev_get(struct net_device *dev) qdev = ERR_PTR(error); goto done; } - list_add_rcu(&qdev->links, &qdevs->qdevs); + list_add_rcu(&qdev->links, &qshared->qdevs); done: - mutex_unlock(&qdevs->mutex); + mutex_unlock(&qshared->mutex); return qdev; } @@ -226,22 +270,22 @@ struct homa_qdisc_dev *homa_qdisc_qdev_get(struct net_device *dev) */ void homa_qdisc_qdev_put(struct homa_qdisc_dev *qdev) { - struct homa_qdisc_qdevs *qdevs; + struct homa_qdisc_shared *qshared; if (!refcount_dec_and_test(&qdev->refs)) return; /* Make this homa_qdisc_dev inaccessible, then schedule an RCU-safe * free. Think carefully before you modify this code, to ensure that - * concurrent RCU scans of qdevs->qdevs are safe. + * concurrent RCU scans of qshared->qdevs are safe. 
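 *
 * For reference, the reader side in homa_qdisc_qdev_get() (shown
 * earlier in this patch) is, in sketch form:
 *
 *	rcu_read_lock();
 *	list_for_each_entry_rcu(qdev, &qshared->qdevs, links)
 *		if (qdev->dev == dev &&
 *		    refcount_inc_not_zero(&qdev->refs))
 *			return qdev;
 *	rcu_read_unlock();
 *
 * A reader racing with removal either misses the unlinked entry or
 * sees refcount_inc_not_zero() fail, so it can never acquire a new
 * reference to a qdev whose grace period has already begun.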
*/ - qdevs = qdev->hnet->homa->qdevs; - mutex_lock(&qdevs->mutex); + qshared = qdev->hnet->homa->qshared; + mutex_lock(&qshared->mutex); list_del_rcu(&qdev->links); kthread_stop(qdev->pacer_kthread); qdev->pacer_kthread = NULL; call_rcu(&qdev->rcu_head, homa_qdisc_dev_callback); - mutex_unlock(&qdevs->mutex); + mutex_unlock(&qshared->mutex); } /** @@ -354,7 +398,7 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, { struct homa_qdisc *q = qdisc_priv(sch); struct homa_qdisc_dev *qdev = q->qdev; - struct homa *homa = qdev->hnet->homa; + struct homa_qdisc_shared *qshared = qdev->hnet->homa->qshared; struct homa_data_hdr *h; int pkt_len; int result; @@ -392,14 +436,14 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (offset == -1) offset = ntohl(h->common.sequence); if (h->common.type != DATA || ntohl(h->message_length) < - homa->pacer->throttle_min_bytes) { + qshared->defer_min_bytes) { homa_qdisc_update_link_idle(qdev, pkt_len, -1); goto enqueue; } if (!homa_qdisc_any_deferred(qdev) && homa_qdisc_update_link_idle(qdev, pkt_len, - homa->pacer->max_nic_queue_cycles)) + qshared->max_nic_queue_cycles)) goto enqueue; /* This packet needs to be deferred until the NIC queue has @@ -696,7 +740,7 @@ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev, bool help) /* If the NIC queue is too long, wait until it gets shorter. */ now = homa_clock(); idle_time = atomic64_read(&qdev->link_idle_time); - while ((now + qdev->hnet->homa->pacer->max_nic_queue_cycles) < + while ((now + qdev->hnet->homa->qshared->max_nic_queue_cycles) < idle_time) { /* If we've xmitted at least one packet then * return (this helps with testing and also @@ -807,9 +851,9 @@ void homa_qdisc_pacer_check(struct homa *homa) u64 now = homa_clock(); int max_cycles; - max_cycles = homa->pacer->max_nic_queue_cycles; + max_cycles = homa->qshared->max_nic_queue_cycles; rcu_read_lock(); - list_for_each_entry_rcu(qdev, &homa->qdevs->qdevs, links) { + list_for_each_entry_rcu(qdev, &homa->qshared->qdevs, links) { if (!homa_qdisc_any_deferred(qdev)) continue; @@ -828,11 +872,43 @@ void homa_qdisc_pacer_check(struct homa *homa) } /** - * homa_qdisc_update_sysctl() - Recompute information in a homa_qdisc_dev + * homa_qdisc_dointvec() - This function is a wrapper around proc_dointvec. It + * is invoked to read and write pacer-related sysctl values. + * @table: sysctl table describing value to be read or written. + * @write: Nonzero means value is being written, 0 means read. + * @buffer: Address in user space of the input/output data. + * @lenp: Not exactly sure. + * @ppos: Not exactly sure. + * + * Return: 0 for success, nonzero for error. + */ +int homa_qdisc_dointvec(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table table_copy; + struct homa_qdisc_shared *qshared; + int result; + + qshared = homa_net(current->nsproxy->net_ns)->homa->qshared; + + /* Generate a new ctl_table that refers to a field in the + * net-specific struct homa. + */ + table_copy = *table; + table_copy.data = ((char *)qshared) + (uintptr_t)table_copy.data; + + result = proc_dointvec(&table_copy, write, buffer, lenp, ppos); + if (write) + homa_qdisc_update_sysctl_deps(qshared); + return result; +} + +/** + * homa_qdev_update_sysctl() - Recompute information in a homa_qdisc_dev * that depends on sysctl parameters. * @qdev: Update information here that depends on sysctl values. 
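 *
 * This rereads the device's link speed via ethtool and recomputes the
 * derived cycles_per_mibyte value; when the device does not report a
 * speed, the code falls back to homa->link_mbps (the unit tests below
 * exercise both paths via mock_link_mbps and
 * mock_ethtool_ksettings_errors).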
*/ -void homa_qdisc_update_sysctl(struct homa_qdisc_dev *qdev) +void homa_qdev_update_sysctl(struct homa_qdisc_dev *qdev) { struct ethtool_link_ksettings ksettings; struct homa *homa = qdev->hnet->homa; @@ -869,20 +945,20 @@ void homa_qdisc_update_sysctl(struct homa_qdisc_dev *qdev) } /** - * homa_qdisc_update_all_sysctl() - Invoked whenever a sysctl value is changed; - * updates all qdisc structures to reflect new values. - * @hnet: Homa's information about a network namespace: changes will apply - * to qdiscs in this namespace. + * homa_qdisc_update_sysctl_deps() - Update any qdisc fields that depend + * on values set by sysctl. This function is invoked anytime a qdisc sysctl + * value is updated. + * @qshared: Qdisc data to update. */ -void homa_qdisc_update_all_sysctl(struct homa_net *hnet) +void homa_qdisc_update_sysctl_deps(struct homa_qdisc_shared *qshared) { struct homa_qdisc_dev *qdev; + qshared->max_nic_queue_cycles = + homa_ns_to_cycles(qshared->max_nic_queue_ns); + rcu_read_lock(); - list_for_each_entry_rcu(qdev, &hnet->homa->qdevs->qdevs, links) { - if (qdev->hnet != hnet) - continue; - homa_qdisc_update_sysctl(qdev); - } + list_for_each_entry_rcu(qdev, &qshared->qdevs, links) + homa_qdev_update_sysctl(qdev); rcu_read_unlock(); } diff --git a/homa_qdisc.h b/homa_qdisc.h index 8aca30a9..fee414e0 100644 --- a/homa_qdisc.h +++ b/homa_qdisc.h @@ -100,7 +100,7 @@ struct homa_qdisc_dev { /** * @links: Used to link this object into the qdevs list in a - * homa_qdisc_qdevs struct. + * homa_qdisc_shared struct. */ struct list_head links; @@ -164,11 +164,11 @@ struct homa_qdisc_dev { }; /** - * struct homa_qdisc_qdevs - There is one of these structs for each - * struct homa. Used to manage all of the homa_qdisc_devs for the - * struct homa. + * struct homa_qdisc_shared - There is one of these structs for each + * struct homa. Contains information that is shared across all homq_qdiscs + * and homa_qdisc_devs for the struct homa. */ -struct homa_qdisc_qdevs { +struct homa_qdisc_shared { /** * @mutex: Must hold when modifying qdevs. Can scan qdevs * without locking using RCU. @@ -180,6 +180,46 @@ struct homa_qdisc_qdevs { * exist for this struct homa. */ struct list_head qdevs; + + /** + * @fifo_fraction: Out of every 1000 packets transmitted by the + * pacer, this number will be transmitted from the oldest message + * rather than the highest-priority message. Set externally via + * sysctl. + */ + int fifo_fraction; + + /** + * @max_nic_queue_ns: Limits the NIC queue length: we won't queue + * up a packet for transmission if link_idle_time is this many + * nanoseconds in the future (or more). Set externally via sysctl. + */ + int max_nic_queue_ns; + + /** + * @max_nic_queue_cycles: Same as max_nic_queue_ns except in + * homa_clock() units. + */ + int max_nic_queue_cycles; + + /** + * @defer_min_bytes: If a packet has fewer bytes than this, then it + * will be transmitted immediately, regardless of NIC queue length. + * We have this limit because for very small packets CPU overheads + * make it impossible to keep up with the NIC so (a) the NIC queue + * can't grow and (b) using the pacer would serialize all of these + * packets through a single core, which makes things even worse. + * Set externally via sysctl. + */ + int defer_min_bytes; + +#ifndef __STRIP__ /* See strip.py */ + /** + * @sysctl_header: Used to remove sysctl values when this structure + * is destroyed. 
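 *
 * The parameters registered through this header appear under
 * /proc/sys/net/homa/ (e.g. /proc/sys/net/homa/max_nic_queue_ns),
 * per the register_net_sysctl() call in homa_qdisc_shared_alloc().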
+ */ + struct ctl_table_header *sysctl_header; +#endif /* See strip.py */ }; /** @@ -194,15 +234,15 @@ struct homa_rcu_kfreer { void *object; }; +void homa_qdev_update_sysctl(struct homa_qdisc_dev *qdev); void homa_qdisc_defer_homa(struct homa_qdisc_dev *qdev, struct sk_buff *skb); struct sk_buff * homa_qdisc_dequeue_homa(struct homa_qdisc_dev *qdev); void homa_qdisc_destroy(struct Qdisc *sch); void homa_qdisc_dev_callback(struct rcu_head *head); -struct homa_qdisc_qdevs * - homa_qdisc_qdevs_alloc(void); -void homa_qdisc_qdevs_free(struct homa_qdisc_qdevs *qdevs); +int homa_qdisc_dointvec(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free); void homa_qdisc_free_homa(struct homa_qdisc_dev *qdev); @@ -221,11 +261,13 @@ int homa_qdisc_redirect_skb(struct sk_buff *skb, bool pacer); int homa_qdisc_register(void); void homa_qdisc_set_qixs(struct homa_qdisc_dev *qdev); +struct homa_qdisc_shared * + homa_qdisc_shared_alloc(void); +void homa_qdisc_shared_free(struct homa_qdisc_shared *qshared); void homa_qdisc_unregister(void); -void homa_qdisc_update_all_sysctl(struct homa_net *hnet); int homa_qdisc_update_link_idle(struct homa_qdisc_dev *qdev, int bytes, int max_queue_ns); -void homa_qdisc_update_sysctl(struct homa_qdisc_dev *qdev); +void homa_qdisc_update_sysctl_deps(struct homa_qdisc_shared *qshared); void homa_rcu_kfree(void *object); void homa_rcu_kfree_callback(struct rcu_head *head); @@ -237,7 +279,7 @@ void homa_rcu_kfree_callback(struct rcu_head *head); */ static inline bool homa_qdisc_active(struct homa *homa) { - return list_first_or_null_rcu(&homa->qdevs->qdevs, + return list_first_or_null_rcu(&homa->qshared->qdevs, struct homa_qdisc_dev, links) != NULL; } diff --git a/homa_utils.c b/homa_utils.c index 1578c493..e11fe969 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -36,6 +36,12 @@ int homa_init(struct homa *homa) atomic64_set(&homa->next_outgoing_id, 2); homa->link_mbps = 25000; #ifndef __STRIP__ /* See strip.py */ + homa->qshared = homa_qdisc_shared_alloc(); + if (IS_ERR(homa->qshared)) { + err = PTR_ERR(homa->qshared); + homa->qshared = NULL; + return err; + } homa->pacer = homa_pacer_alloc(homa); if (IS_ERR(homa->pacer)) { err = PTR_ERR(homa->pacer); @@ -48,12 +54,6 @@ int homa_init(struct homa *homa) homa->grant = NULL; return err; } - homa->qdevs = homa_qdisc_qdevs_alloc(); - if (IS_ERR(homa->qdevs)) { - err = PTR_ERR(homa->qdevs); - homa->qdevs = NULL; - return err; - } #endif /* See strip.py */ homa->peertab = homa_peer_alloc_peertab(); if (IS_ERR(homa->peertab)) { @@ -139,10 +139,6 @@ void homa_destroy(struct homa *homa) homa->socktab = NULL; } #ifndef __STRIP__ /* See strip.py */ - if (homa->qdevs) { - homa_qdisc_qdevs_free(homa->qdevs); - homa->qdevs = NULL; - } if (homa->grant) { homa_grant_free(homa->grant); homa->grant = NULL; @@ -151,6 +147,10 @@ void homa_destroy(struct homa *homa) homa_pacer_free(homa->pacer); homa->pacer = NULL; } + if (homa->qshared) { + homa_qdisc_shared_free(homa->qshared); + homa->qshared = NULL; + } #endif /* See strip.py */ if (homa->peertab) { homa_peer_free_peertab(homa->peertab); diff --git a/test/unit_homa_grant.c b/test/unit_homa_grant.c index 5217e8fc..150edd99 100644 --- a/test/unit_homa_grant.c +++ b/test/unit_homa_grant.c @@ -94,7 +94,7 @@ FIXTURE_SETUP(homa_grant) self->homa.num_priorities = 1; self->homa.poll_cycles = 0; self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; - self->homa.pacer->fifo_fraction = 0; + 
self->homa.qshared->fifo_fraction = 0; self->homa.grant->fifo_fraction = 0; self->homa.grant->window = 10000; self->homa.grant->max_incoming = 50000; diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 928b2d23..4ce7e5e0 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -3,7 +3,6 @@ #include "homa_impl.h" #include "homa_grant.h" #include "homa_interest.h" -#include "homa_pacer.h" #include "homa_peer.h" #include "homa_pool.h" #define KSELFTEST_NOT_MAIN 1 @@ -14,6 +13,7 @@ #ifndef __STRIP__ /* See strip.py */ #include "homa_offload.h" +#include "homa_pacer.h" #endif /* See strip.py */ #ifndef __STRIP__ /* See strip.py */ @@ -101,7 +101,7 @@ FIXTURE_SETUP(homa_incoming) self->homa.num_priorities = 1; self->homa.poll_cycles = 0; self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; - self->homa.pacer->fifo_fraction = 0; + self->homa.qshared->fifo_fraction = 0; self->homa.unsched_bytes = 10000; self->homa.grant->window = 10000; #endif /* See strip.py */ diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index e667f717..520a2a21 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -2,20 +2,21 @@ #include "homa_impl.h" #include "homa_grant.h" -#include "homa_pacer.h" #include "homa_peer.h" #include "homa_rpc.h" +#define KSELFTEST_NOT_MAIN 1 +#include "kselftest_harness.h" +#include "ccutils.h" +#include "mock.h" +#include "utils.h" + #ifndef __STRIP__ /* See strip.py */ +#include "homa_pacer.h" #include "homa_qdisc.h" #include "homa_skb.h" #else /* See strip.py */ #include "homa_stub.h" #endif /* See strip.py */ -#define KSELFTEST_NOT_MAIN 1 -#include "kselftest_harness.h" -#include "ccutils.h" -#include "mock.h" -#include "utils.h" #ifndef __STRIP__ /* See strip.py */ #define XMIT_DATA(rpc, force) homa_xmit_data(rpc, force) @@ -96,7 +97,7 @@ FIXTURE_SETUP(homa_outgoing) self->homa.flags |= HOMA_FLAG_DONT_THROTTLE; self->homa.unsched_bytes = 10000; self->homa.grant->window = 10000; - self->homa.pacer->fifo_fraction = 0; + self->homa.qshared->fifo_fraction = 0; #endif /* See strip.py */ mock_sock_init(&self->hsk, self->hnet, self->client_port); self->server_addr.in6.sin6_family = AF_INET; @@ -830,8 +831,8 @@ TEST_F(homa_outgoing, homa_xmit_data__below_throttle_min) unit_log_clear(); atomic64_set(&self->homa.pacer->link_idle_time, 11000); - self->homa.pacer->max_nic_queue_cycles = 500; - self->homa.pacer->throttle_min_bytes = 250; + self->homa.qshared->max_nic_queue_cycles = 500; + self->homa.qshared->defer_min_bytes = 250; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; homa_rpc_lock(crpc); XMIT_DATA(crpc, false); @@ -852,7 +853,7 @@ TEST_F(homa_outgoing, homa_xmit_data__force) /* First, get an RPC on the throttled list. 
*/ atomic64_set(&self->homa.pacer->link_idle_time, 11000); - self->homa.pacer->max_nic_queue_cycles = 3000; + self->homa.qshared->max_nic_queue_cycles = 3000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; homa_rpc_lock(crpc1); XMIT_DATA(crpc1, false); @@ -883,7 +884,7 @@ TEST_F(homa_outgoing, homa_xmit_data__dont_throttle_because_homa_qdisc_in_use) self->client_id, 2000, 1000); unit_log_clear(); atomic64_set(&self->homa.pacer->link_idle_time, 1000000); - self->homa.pacer->max_nic_queue_cycles = 0; + self->homa.qshared->max_nic_queue_cycles = 0; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; homa_rpc_lock(crpc); @@ -903,7 +904,7 @@ TEST_F(homa_outgoing, homa_xmit_data__throttle) unit_log_clear(); atomic64_set(&self->homa.pacer->link_idle_time, 11000); - self->homa.pacer->max_nic_queue_cycles = 3000; + self->homa.qshared->max_nic_queue_cycles = 3000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; homa_rpc_lock(crpc); diff --git a/test/unit_homa_pacer.c b/test/unit_homa_pacer.c index 0a0ab54c..ab4a693f 100644 --- a/test/unit_homa_pacer.c +++ b/test/unit_homa_pacer.c @@ -70,8 +70,8 @@ FIXTURE_SETUP(homa_pacer) homa_init(&self->homa); self->hnet = mock_hnet(0, &self->homa); self->homa.pacer->cycles_per_mbyte = 1000000; - self->homa.pacer->throttle_min_bytes = 0; - self->homa.pacer->fifo_fraction = 0; + self->homa.qshared->defer_min_bytes = 0; + self->homa.qshared->fifo_fraction = 0; mock_sock_init(&self->hsk, self->hnet, self->client_port); } FIXTURE_TEARDOWN(homa_pacer) @@ -107,15 +107,6 @@ TEST_F(homa_pacer, homa_pacer_new__cant_create_pacer_thread) EXPECT_TRUE(IS_ERR(pacer)); EXPECT_EQ(EACCES, -PTR_ERR(pacer)); } -TEST_F(homa_pacer, homa_pacer_new__cant_register_sysctls) -{ - struct homa_pacer *pacer; - - mock_register_sysctl_errors = 1; - pacer = homa_pacer_alloc(&self->homa); - EXPECT_TRUE(IS_ERR(pacer)); - EXPECT_EQ(ENOMEM, -PTR_ERR(pacer)); -} TEST_F(homa_pacer, homa_pacer_free__basics) { @@ -125,8 +116,7 @@ TEST_F(homa_pacer, homa_pacer_free__basics) EXPECT_FALSE(IS_ERR(pacer)); unit_log_clear(); homa_pacer_free(pacer); - EXPECT_STREQ("unregister_net_sysctl_table; kthread_stop", - unit_log_get()); + EXPECT_STREQ("kthread_stop", unit_log_get()); } TEST_F(homa_pacer, homa_pacer_free__no_thread) { @@ -137,7 +127,7 @@ TEST_F(homa_pacer, homa_pacer_free__no_thread) pacer->kthread = NULL; unit_log_clear(); homa_pacer_free(pacer); - EXPECT_STREQ("unregister_net_sysctl_table", unit_log_get()); + EXPECT_STREQ("", unit_log_get()); } TEST_F(homa_pacer, homa_pacer_check_nic_q__success) @@ -152,7 +142,7 @@ TEST_F(homa_pacer, homa_pacer_check_nic_q__success) unit_log_clear(); atomic64_set(&self->homa.pacer->link_idle_time, 9000); mock_clock = 8000; - self->homa.pacer->max_nic_queue_cycles = 1000; + self->homa.qshared->max_nic_queue_cycles = 1000; EXPECT_EQ(1, homa_pacer_check_nic_q(self->homa.pacer, crpc->msgout.packets, false)); EXPECT_EQ(9500, atomic64_read(&self->homa.pacer->link_idle_time)); @@ -169,7 +159,7 @@ TEST_F(homa_pacer, homa_pacer_check_nic_q__queue_full) unit_log_clear(); atomic64_set(&self->homa.pacer->link_idle_time, 9000); mock_clock = 7999; - self->homa.pacer->max_nic_queue_cycles = 1000; + self->homa.qshared->max_nic_queue_cycles = 1000; EXPECT_EQ(0, homa_pacer_check_nic_q(self->homa.pacer, crpc->msgout.packets, false)); EXPECT_EQ(9000, atomic64_read(&self->homa.pacer->link_idle_time)); @@ -186,7 +176,7 @@ TEST_F(homa_pacer, homa_pacer_check_nic_q__queue_full_but_force) unit_log_clear(); atomic64_set(&self->homa.pacer->link_idle_time, 9000); mock_clock = 7999; - 
self->homa.pacer->max_nic_queue_cycles = 1000; + self->homa.qshared->max_nic_queue_cycles = 1000; EXPECT_EQ(1, homa_pacer_check_nic_q(self->homa.pacer, crpc->msgout.packets, true)); EXPECT_EQ(9500, atomic64_read(&self->homa.pacer->link_idle_time)); @@ -203,7 +193,7 @@ TEST_F(homa_pacer, homa_pacer_check_nic_q__queue_empty) unit_log_clear(); atomic64_set(&self->homa.pacer->link_idle_time, 9000); mock_clock = 10000; - self->homa.pacer->max_nic_queue_cycles = 1000; + self->homa.qshared->max_nic_queue_cycles = 1000; EXPECT_EQ(1, homa_pacer_check_nic_q(self->homa.pacer, crpc->msgout.packets, true)); EXPECT_EQ(10500, atomic64_read(&self->homa.pacer->link_idle_time)); @@ -229,7 +219,7 @@ TEST_F(homa_pacer, homa_pacer_main__xmit_data) homa_pacer_manage_rpc(crpc1); homa_pacer_manage_rpc(crpc2); - self->homa.pacer->max_nic_queue_cycles = 3000; + self->homa.qshared->max_nic_queue_cycles = 3000; mock_clock_tick = 200; unit_hook_register(exit_idle_hook); hook_pacer = self->homa.pacer; @@ -263,7 +253,7 @@ TEST_F(homa_pacer, homa_pacer_main__rpc_arrives_while_sleeping) mock_clock_tick = 200; unit_hook_register(manage_hook); hook_rpc = crpc; - self->homa.pacer->max_nic_queue_cycles = 2000; + self->homa.qshared->max_nic_queue_cycles = 2000; unit_log_clear(); homa_pacer_main(self->homa.pacer); @@ -298,7 +288,7 @@ TEST_F(homa_pacer, homa_pacer_xmit__basics) homa_pacer_manage_rpc(crpc1); homa_pacer_manage_rpc(crpc2); homa_pacer_manage_rpc(crpc3); - self->homa.pacer->max_nic_queue_cycles = 2000; + self->homa.qshared->max_nic_queue_cycles = 2000; unit_log_clear(); homa_pacer_xmit(self->homa.pacer); EXPECT_STREQ("xmit DATA 1400@0; xmit DATA 1400@1400", @@ -318,7 +308,7 @@ TEST_F(homa_pacer, homa_pacer_xmit__pacer_already_active) self->client_id, 10000, 1000); homa_pacer_manage_rpc(crpc); - self->homa.pacer->max_nic_queue_cycles = 2000; + self->homa.qshared->max_nic_queue_cycles = 2000; mock_trylock_errors = 1; unit_log_clear(); homa_pacer_xmit(self->homa.pacer); @@ -336,7 +326,7 @@ TEST_F(homa_pacer, homa_pacer_xmit__nic_queue_fills) self->client_id, 10000, 1000); homa_pacer_manage_rpc(crpc); - self->homa.pacer->max_nic_queue_cycles = 2001; + self->homa.qshared->max_nic_queue_cycles = 2001; mock_clock = 10000; atomic64_set(&self->homa.pacer->link_idle_time, 12000); unit_log_clear(); @@ -350,7 +340,7 @@ TEST_F(homa_pacer, homa_pacer_xmit__nic_queue_fills) } TEST_F(homa_pacer, homa_pacer_xmit__queue_empty) { - self->homa.pacer->max_nic_queue_cycles = 2000; + self->homa.qshared->max_nic_queue_cycles = 2000; unit_log_clear(); homa_pacer_xmit(self->homa.pacer); unit_log_throttled(&self->homa); @@ -377,9 +367,9 @@ TEST_F(homa_pacer, homa_pacer_xmit__xmit_fifo) homa_pacer_manage_rpc(crpc3); /* First attempt: pacer->fifo_count doesn't reach zero. 
*/ - self->homa.pacer->max_nic_queue_cycles = 1300; + self->homa.qshared->max_nic_queue_cycles = 1300; self->homa.pacer->fifo_count = 200; - self->homa.pacer->fifo_fraction = 150; + self->homa.qshared->fifo_fraction = 150; mock_clock= 13000; atomic64_set(&self->homa.pacer->link_idle_time, 10000); unit_log_clear(); @@ -416,7 +406,7 @@ TEST_F(homa_pacer, homa_pacer_xmit__rpc_removed_from_queue_before_locked) self->client_id, 10000, 1000); homa_pacer_manage_rpc(crpc); - self->homa.pacer->max_nic_queue_cycles = 10000; + self->homa.qshared->max_nic_queue_cycles = 10000; unit_log_clear(); unit_hook_register(unmanage_hook); hook_rpc = crpc; @@ -441,7 +431,7 @@ TEST_F(homa_pacer, homa_pacer_xmit__remove_from_queue) homa_pacer_manage_rpc(crpc1); homa_pacer_manage_rpc(crpc2); - self->homa.pacer->max_nic_queue_cycles = 2000; + self->homa.qshared->max_nic_queue_cycles = 2000; unit_log_clear(); /* First call completes id 2, but id 4 is still in the queue. */ @@ -455,7 +445,7 @@ TEST_F(homa_pacer, homa_pacer_xmit__remove_from_queue) /* Second call completes id 4, queue now empty. */ unit_log_clear(); - self->homa.pacer->max_nic_queue_cycles = 10000; + self->homa.qshared->max_nic_queue_cycles = 10000; homa_pacer_xmit(self->homa.pacer); EXPECT_STREQ("xmit DATA 600@1400; removing id 4 from throttled list", unit_log_get()); @@ -603,10 +593,8 @@ TEST_F(homa_pacer, homa_pacer_unmanage_rpc__metrics) TEST_F(homa_pacer, homa_pacer_update_sysctl_deps) { - self->homa.pacer->max_nic_queue_ns = 6000; self->homa.link_mbps = 10000; homa_pacer_update_sysctl_deps(self->homa.pacer); - EXPECT_EQ(6000, self->homa.pacer->max_nic_queue_cycles); EXPECT_EQ(808000, self->homa.pacer->cycles_per_mbyte); self->homa.link_mbps = 1000; diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c index a6a07309..dc528471 100644 --- a/test/unit_homa_qdisc.c +++ b/test/unit_homa_qdisc.c @@ -210,48 +210,59 @@ TEST_F(homa_qdisc, homa_rcu_kfree_callback) homa_rcu_kfree_callback(&freer->rcu_head); } -TEST_F(homa_qdisc, homa_qdisc_qdevs_alloc__success) +TEST_F(homa_qdisc, homa_qdisc_shared_alloc__success) { - struct homa_qdisc_qdevs *qdevs; + struct homa_qdisc_shared *qshared; - qdevs = homa_qdisc_qdevs_alloc(); - ASSERT_FALSE(IS_ERR(qdevs)); - EXPECT_EQ(0, unit_list_length(&qdevs->qdevs)); - kfree(qdevs); + qshared = homa_qdisc_shared_alloc(); + ASSERT_FALSE(IS_ERR(qshared)); + EXPECT_EQ(0, unit_list_length(&qshared->qdevs)); + kfree(qshared); } -TEST_F(homa_qdisc, homa_qdisc_qdevs_alloc__kmalloc_failure) +TEST_F(homa_qdisc, homa_qdisc_shared_alloc__kmalloc_failure) { - struct homa_qdisc_qdevs *qdevs; + struct homa_qdisc_shared *qshared; mock_kmalloc_errors = 1; - qdevs = homa_qdisc_qdevs_alloc(); - ASSERT_TRUE(IS_ERR(qdevs)); - EXPECT_EQ(ENOMEM, -PTR_ERR(qdevs)); + qshared = homa_qdisc_shared_alloc(); + ASSERT_TRUE(IS_ERR(qshared)); + EXPECT_EQ(ENOMEM, -PTR_ERR(qshared)); +} +TEST_F(homa_qdisc, homa_qdisc_shared_alloc__cant_register_sysctls) +{ + struct homa_qdisc_shared *qshared; + + mock_register_sysctl_errors = 1; + qshared = homa_qdisc_shared_alloc(); + ASSERT_TRUE(IS_ERR(qshared)); + EXPECT_EQ(ENOMEM, -PTR_ERR(qshared)); } -TEST_F(homa_qdisc, homa_qdisc_qdevs_free__basics) +TEST_F(homa_qdisc, homa_qdisc_shared_free__basics) { - struct homa_qdisc_qdevs *qdevs; + struct homa_qdisc_shared *qshared; /* Test infrastructure will report any inconsistencie in * memory allocation. 
*/ - qdevs = homa_qdisc_qdevs_alloc(); - homa_qdisc_qdevs_free(qdevs); + qshared = homa_qdisc_shared_alloc(); + homa_qdisc_shared_free(qshared); + EXPECT_STREQ("unregister_net_sysctl_table; call_rcu invoked", + unit_log_get()); } -TEST_F(homa_qdisc, homa_qdisc_qdevs_free__unfreed_qdevs) +TEST_F(homa_qdisc, homa_qdisc_shared_free__unfreed_qdevs) { - struct homa_qdisc_qdevs *qdevs, *saved_qdevs; + struct homa_qdisc_shared *qshared, *saved_qshared; struct homa_qdisc_dev *qdev; - qdevs = homa_qdisc_qdevs_alloc(); - saved_qdevs = self->homa.qdevs; - self->homa.qdevs = qdevs; + qshared = homa_qdisc_shared_alloc(); + saved_qshared = self->homa.qshared; + self->homa.qshared = qshared; qdev = homa_qdisc_qdev_get(self->dev); - EXPECT_EQ(1, unit_list_length(&qdevs->qdevs)); - self->homa.qdevs = saved_qdevs; + EXPECT_EQ(1, unit_list_length(&qshared->qdevs)); + self->homa.qshared = saved_qshared; mock_printk_output[0] = 0; - homa_qdisc_qdevs_free(qdevs); + homa_qdisc_shared_free(qshared); EXPECT_STREQ("homa_qdisc_devs_free found 1 live qdevs " "(should have been none)", mock_printk_output); homa_qdisc_qdev_put(qdev); @@ -264,7 +275,7 @@ TEST_F(homa_qdisc, homa_qdisc_qdev_get__basics) qdev = homa_qdisc_qdev_get(self->dev); EXPECT_FALSE(IS_ERR(qdev)); EXPECT_EQ(1, refcount_read(&qdev->refs)); - EXPECT_EQ(1, unit_list_length(&self->homa.qdevs->qdevs)); + EXPECT_EQ(1, unit_list_length(&self->homa.qshared->qdevs)); homa_qdisc_qdev_put(qdev); } @@ -279,7 +290,7 @@ TEST_F(homa_qdisc, homa_qdisc_get__use_existing) qdev2 = homa_qdisc_qdev_get(mock_dev(1, &self->homa)); EXPECT_FALSE(IS_ERR(qdev)); - EXPECT_EQ(2, unit_list_length(&self->homa.qdevs->qdevs)); + EXPECT_EQ(2, unit_list_length(&self->homa.qshared->qdevs)); EXPECT_EQ(1, refcount_read(&qdev->refs)); EXPECT_EQ(qdev, homa_qdisc_qdev_get(self->dev)); @@ -299,7 +310,7 @@ TEST_F(homa_qdisc, homa_qdisc_qdev_get__race_when_creating) unit_log_clear(); qdev = homa_qdisc_qdev_get(self->dev); EXPECT_FALSE(IS_ERR(qdev)); - EXPECT_EQ(1, unit_list_length(&self->homa.qdevs->qdevs)); + EXPECT_EQ(1, unit_list_length(&self->homa.qshared->qdevs)); EXPECT_EQ(2, refcount_read(&qdev->refs)); EXPECT_SUBSTR("race in homa_qdisc_qdev_get", unit_log_get()); @@ -342,15 +353,15 @@ TEST_F(homa_qdisc, homa_qdisc_qdev_put) /* First call: refcount doesn't hit zero. */ homa_qdisc_qdev_put(qdev2); EXPECT_EQ(1, refcount_read(&qdev2->refs)); - EXPECT_EQ(3, unit_list_length(&self->homa.qdevs->qdevs)); + EXPECT_EQ(3, unit_list_length(&self->homa.qshared->qdevs)); /* Second call: refcount hits zero. */ homa_qdisc_qdev_put(qdev2); - EXPECT_EQ(2, unit_list_length(&self->homa.qdevs->qdevs)); + EXPECT_EQ(2, unit_list_length(&self->homa.qshared->qdevs)); homa_qdisc_qdev_put(qdev3); homa_qdisc_qdev_put(qdev1); - EXPECT_EQ(0, unit_list_length(&self->homa.qdevs->qdevs)); + EXPECT_EQ(0, unit_list_length(&self->homa.qshared->qdevs)); } TEST_F(homa_qdisc, homa_qdisc_dev_callback) @@ -377,7 +388,7 @@ TEST_F(homa_qdisc, homa_qdisc_dev_callback) /* If skbs aren't freed, test infrastructure will complain. 
*/ homa_qdisc_qdev_put(qdev); - EXPECT_EQ(0, unit_list_length(&self->homa.qdevs->qdevs)); + EXPECT_EQ(0, unit_list_length(&self->homa.qshared->qdevs)); } TEST_F(homa_qdisc, homa_qdisc_init__basics) @@ -387,7 +398,7 @@ TEST_F(homa_qdisc, homa_qdisc_init__basics) struct homa_qdisc *q; EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); - qdev = list_first_or_null_rcu(&self->homa.qdevs->qdevs, + qdev = list_first_or_null_rcu(&self->homa.qshared->qdevs, struct homa_qdisc_dev, links); ASSERT_NE(NULL, qdev); EXPECT_EQ(1, refcount_read(&qdev->refs)); @@ -404,7 +415,7 @@ TEST_F(homa_qdisc, homa_qdisc_init__cant_create_new_qdisc_dev) mock_kmalloc_errors = 1; EXPECT_EQ(ENOMEM, -homa_qdisc_init(qdisc, NULL, NULL)); - EXPECT_EQ(0, unit_list_length(&self->homa.qdevs->qdevs)); + EXPECT_EQ(0, unit_list_length(&self->homa.qshared->qdevs)); kfree(qdisc); } TEST_F(homa_qdisc, homa_qdisc_init__set_qix) @@ -428,7 +439,7 @@ TEST_F(homa_qdisc, homa_qdisc_destroy) EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); qdisc2 = mock_alloc_qdisc(&mock_net_queue); EXPECT_EQ(0, homa_qdisc_init(qdisc2, NULL, NULL)); - qdev = list_first_or_null_rcu(&self->homa.qdevs->qdevs, + qdev = list_first_or_null_rcu(&self->homa.qshared->qdevs, struct homa_qdisc_dev, links); EXPECT_NE(NULL, qdev); EXPECT_EQ(2, refcount_read(&qdev->refs)); @@ -437,7 +448,7 @@ TEST_F(homa_qdisc, homa_qdisc_destroy) EXPECT_EQ(1, refcount_read(&qdev->refs)); homa_qdisc_destroy(qdisc); - EXPECT_EQ(0, unit_list_length(&self->homa.qdevs->qdevs)); + EXPECT_EQ(0, unit_list_length(&self->homa.qshared->qdevs)); kfree(qdisc); kfree(qdisc2); } @@ -590,7 +601,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_homa_packet) /* First packet is deferred because the NIC queue is full. */ EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); q = qdisc_priv(qdisc); - idle = mock_clock + 1 + self->homa.pacer->max_nic_queue_cycles + 1; + idle = mock_clock + 1 + self->homa.qshared->max_nic_queue_cycles + 1; atomic64_set(&q->qdev->link_idle_time, idle); skb = new_test_skb(srpc, &self->addr, 0, 1500); to_free = NULL; @@ -972,8 +983,10 @@ TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__last_packet_for_rpc) skb = new_test_skb(srpc1, &self->addr, 5000, 500); homa_qdisc_defer_homa(qdev, skb); - homa_qdisc_defer_homa(qdev, new_test_skb(srpc2, &self->addr, 2000, 500)); - homa_qdisc_defer_homa(qdev, new_test_skb(srpc2, &self->addr, 3000, 500)); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc2, &self->addr, 2000, + 500)); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc2, &self->addr, 3000, + 500)); unit_log_clear(); log_deferred(qdev); EXPECT_STREQ("[id 1235, offsets 5000]; [id 1237, offsets 2000 3000]", @@ -1181,7 +1194,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__spin_until_link_idle) mock_clock = 0; mock_clock_tick = 1000; atomic64_set(&qdev->link_idle_time, 10000); - self->homa.pacer->max_nic_queue_cycles = 3500; + self->homa.qshared->max_nic_queue_cycles = 3500; unit_log_clear(); unit_hook_register(xmit_hook); xmit_clock = 0; @@ -1295,10 +1308,11 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__return_after_one_packet) homa_qdisc_defer_homa(qdev, skb); unit_log_clear(); log_deferred(qdev); - EXPECT_STREQ("[id 1235, offsets 5000]; [id 1237, offsets 4000]", unit_log_get()); + EXPECT_STREQ("[id 1235, offsets 5000]; [id 1237, offsets 4000]", + unit_log_get()); mock_clock = atomic64_read(&qdev->link_idle_time); - self->homa.pacer->max_nic_queue_cycles = 100; + self->homa.qshared->max_nic_queue_cycles = 100; unit_log_clear(); homa_qdisc_pacer(qdev, false); @@ -1482,7 +1496,7 @@ TEST_F(homa_qdisc, 
homa_qdisc_pacer_check__enqueue_packet) atomic64_set(&qdev->link_idle_time, 20000); mock_clock = 15000; - self->homa.pacer->max_nic_queue_cycles = 12000; + self->homa.qshared->max_nic_queue_cycles = 12000; homa_qdisc_pacer_check(&self->homa); EXPECT_EQ(1, self->qdiscs[3]->q.qlen); @@ -1507,7 +1521,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer_check__no_deferred_rpcs) atomic64_set(&qdev->link_idle_time, 20000); mock_clock = 15000; - self->homa.pacer->max_nic_queue_cycles = 12000; + self->homa.qshared->max_nic_queue_cycles = 12000; homa_qdisc_pacer_check(&self->homa); EXPECT_EQ(0, self->qdiscs[3]->q.qlen); @@ -1535,7 +1549,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer_check__lag_not_long_enough) atomic64_set(&qdev->link_idle_time, 20000); mock_clock = 13000; - self->homa.pacer->max_nic_queue_cycles = 12000; + self->homa.qshared->max_nic_queue_cycles = 12000; homa_qdisc_pacer_check(&self->homa); EXPECT_EQ(0, self->qdiscs[3]->q.qlen); @@ -1545,7 +1559,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer_check__lag_not_long_enough) homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_update_sysctl__basics) +TEST_F(homa_qdisc, homa_qdevc_update_sysctl__basics) { struct homa_qdisc_dev *qdev; @@ -1554,13 +1568,13 @@ TEST_F(homa_qdisc, homa_qdisc_update_sysctl__basics) self->homa.link_mbps = 25000; mock_link_mbps = 8000; - homa_qdisc_update_sysctl(qdev); + homa_qdev_update_sysctl(qdev); EXPECT_EQ(8000, qdev->link_mbps); EXPECT_EQ(1059061, qdev->cycles_per_mibyte); homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_update_sysctl__cant_get_link_speed_from_dev) +TEST_F(homa_qdisc, homa_qdev_update_sysctl__cant_get_link_speed_from_dev) { struct homa_qdisc_dev *qdev; @@ -1570,14 +1584,21 @@ TEST_F(homa_qdisc, homa_qdisc_update_sysctl__cant_get_link_speed_from_dev) self->homa.link_mbps = 16000; mock_link_mbps = 8000; mock_ethtool_ksettings_errors = 1; - homa_qdisc_update_sysctl(qdev); + homa_qdev_update_sysctl(qdev); EXPECT_EQ(16000, qdev->link_mbps); EXPECT_EQ(529530, qdev->cycles_per_mibyte); homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_update_all_sysctl) +TEST_F(homa_qdisc, homa_qdisc_update_sysctl_deps__max_nic_queue_cycles) +{ + self->homa.qshared->max_nic_queue_ns = 6000; + self->homa.link_mbps = 10000; + homa_qdisc_update_sysctl_deps(self->homa.qshared); + EXPECT_EQ(6000, self->homa.qshared->max_nic_queue_cycles); +} +TEST_F(homa_qdisc, homa_qdisc_update_sysctl_deps__update_all_qdevs) { struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); struct netdev_queue txq2; @@ -1585,6 +1606,10 @@ TEST_F(homa_qdisc, homa_qdisc_update_all_sysctl) struct homa_qdisc *q, *q2; struct Qdisc *qdisc2; + /* qdisc has a net device that provides link speed; qdisc2, created + * below, has a net device that doesn't provide link speed, so it + * uses homa->link_mbps. 
+ */ memset(&txq2, 0, sizeof(txq2)); memset(&net_device2, 0, sizeof(net_device2)); txq2.dev = &net_device2; @@ -1603,7 +1628,7 @@ TEST_F(homa_qdisc, homa_qdisc_update_all_sysctl) self->homa.link_mbps = 25000; mock_link_mbps = 8000; - homa_qdisc_update_all_sysctl(self->hnet); + homa_qdisc_update_sysctl_deps(self->homa.qshared); EXPECT_EQ(8000, q->qdev->link_mbps); EXPECT_EQ(25000, q2->qdev->link_mbps); diff --git a/test/unit_homa_rpc.c b/test/unit_homa_rpc.c index fec95713..9cd00841 100644 --- a/test/unit_homa_rpc.c +++ b/test/unit_homa_rpc.c @@ -2,7 +2,6 @@ #include "homa_impl.h" #include "homa_grant.h" -#include "homa_pacer.h" #include "homa_peer.h" #include "homa_pool.h" #define KSELFTEST_NOT_MAIN 1 @@ -11,6 +10,10 @@ #include "mock.h" #include "utils.h" +#ifndef __STRIP__ /* See strip.py */ +#include "homa_pacer.h" +#endif /* See strip.py */ + #define n(x) htons(x) #define N(x) htonl(x) diff --git a/test/utils.c b/test/utils.c index 187cd54b..7193ad03 100644 --- a/test/utils.c +++ b/test/utils.c @@ -6,7 +6,6 @@ #include "homa_impl.h" #include "homa_grant.h" -#include "homa_pacer.h" #include "homa_peer.h" #include "homa_rpc.h" #include "ccutils.h" @@ -15,6 +14,10 @@ #include "mock.h" #include "utils.h" +#ifndef __STRIP__ /* See strip.py */ +#include "homa_pacer.h" +#endif /* See strip.py */ + /** * unit_client_rpc() - Create a homa_client_rpc and arrange for it to be * in a given state. From b331805b0b8483ce6dea82fe3c6f41cfc91ed546 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 15 Oct 2025 11:18:59 -0700 Subject: [PATCH 528/625] Fix sparse issue in homa_peer.c --- homa_peer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/homa_peer.c b/homa_peer.c index 2cfaf357..9907d4db 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -401,7 +401,7 @@ void homa_peer_free(struct rcu_head *head) struct homa_peer *peer; peer = container_of(head, struct homa_peer, rcu_head); - dst_release(peer->dst); + dst_release(rcu_dereference(peer->dst)); kfree(peer); } From c13a9dff1c250f1cebe06fda6c4b387f03dea280 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 15 Oct 2025 11:30:46 -0700 Subject: [PATCH 529/625] Change 'encoding' in cperf.py to avoid errors --- util/cperf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/cperf.py b/util/cperf.py index a91bc88e..5d9ee820 100644 --- a/util/cperf.py +++ b/util/cperf.py @@ -383,7 +383,7 @@ def init(options): vlog("Options: %s" % (s)) vlog("Homa configuration (node%d):" % (options.nodes[0])) result = subprocess.run(['ssh', 'node%d' % (options.nodes[0]), - 'sysctl', '-a'], capture_output=True, encoding="utf-8") + 'sysctl', '-a'], capture_output=True, encoding="iso8859-1") if (result.returncode != 0): log("sysctl -a on node%d exited with status %d:" % (options.nodes[0], result.returncode)) From 449b9db3a83e6f1c3df975ecd7af229e0d37b15b Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 16 Oct 2025 11:11:11 -0700 Subject: [PATCH 530/625] Cleanup flowi6 initialization in homa_peer.c Don't use flowic_tos field (which no longer exists), conform more closely to code in tcp_v6_connect. 
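(ip6_make_flowinfo() folds the traffic-class bits from hsk->inet.tos into
the flowlabel word, so the TOS information is still carried in the flow
descriptor even though the explicit flowic_tos assignment is gone.)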
--- homa_peer.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/homa_peer.c b/homa_peer.c index 9907d4db..6aa36a0c 100644 --- a/homa_peer.c +++ b/homa_peer.c @@ -542,23 +542,17 @@ int homa_peer_reset_dst(struct homa_peer *peer, struct homa_sock *hsk) dst = &rt->dst; peer->dst_cookie = 0; } else { - peer->flow.u.ip6.flowi6_oif = hsk->sock.sk_bound_dev_if; - peer->flow.u.ip6.flowi6_iif = LOOPBACK_IFINDEX; - peer->flow.u.ip6.flowi6_mark = hsk->sock.sk_mark; - peer->flow.u.ip6.flowi6_scope = RT_SCOPE_UNIVERSE; + /* This code is derived from code in tcp_v6_connect. */ peer->flow.u.ip6.flowi6_proto = hsk->sock.sk_protocol; - peer->flow.u.ip6.flowi6_flags = 0; - peer->flow.u.ip6.flowi6_secid = 0; - peer->flow.u.ip6.flowi6_tun_key.tun_id = 0; - peer->flow.u.ip6.flowi6_uid = hsk->sock.sk_uid; peer->flow.u.ip6.daddr = peer->addr; peer->flow.u.ip6.saddr = hsk->inet.pinet6->saddr; - peer->flow.u.ip6.fl6_dport = 0; - peer->flow.u.ip6.fl6_sport = 0; - peer->flow.u.ip6.mp_hash = 0; - peer->flow.u.ip6.__fl_common.flowic_tos = hsk->inet.tos; peer->flow.u.ip6.flowlabel = ip6_make_flowinfo(hsk->inet.tos, 0); + peer->flow.u.ip6.flowi6_oif = hsk->sock.sk_bound_dev_if; + peer->flow.u.ip6.flowi6_mark = hsk->sock.sk_mark; + peer->flow.u.ip6.fl6_dport = 0; + peer->flow.u.ip6.fl6_sport = 0; + peer->flow.u.ip6.flowi6_uid = hsk->sock.sk_uid; security_sk_classify_flow(&hsk->sock, &peer->flow.u.__fl_common); dst = ip6_dst_lookup_flow(sock_net(&hsk->sock), &hsk->sock, From c82a24ad03b2b5132a57731f9b122efc4114d299 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 20 Oct 2025 16:35:03 -0700 Subject: [PATCH 531/625] Add support for deferring TCP packets in homa_qdisc --- homa_metrics.c | 12 +- homa_metrics.h | 33 ++- homa_pacer.c | 6 +- homa_qdisc.c | 274 ++++++++++++++++--- homa_qdisc.h | 82 +++++- man/homa.7 | 12 + test/mock.c | 181 ++++++++----- test/mock.h | 6 +- test/unit_homa_plumbing.c | 8 +- test/unit_homa_qdisc.c | 552 ++++++++++++++++++++++++++++++++------ util/metrics.py | 31 ++- 11 files changed, 961 insertions(+), 236 deletions(-) diff --git a/homa_metrics.c b/homa_metrics.c index 5eba0607..b041e5a8 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -295,10 +295,14 @@ char *homa_metrics_print(void) "Execution time in pacer thread\n"); M("pacer_xmit_cycles", m->pacer_xmit_cycles, "Time pacer spent xmitting packets (vs. polling NIC queue)\n"); - M("pacer_packets", m->pacer_packets, - "Packets transmitted by the pacer\n"); - M("pacer_bytes", m->pacer_bytes, - "Bytes transmitted by the pacer (including headers)\n"); + M("pacer_homa_packets", m->pacer_homa_packets, + "Homa packets transmitted by the pacer\n"); + M("pacer_homa_bytes", m->pacer_homa_bytes, + "Homa bytes transmitted by the pacer (including headers)\n"); + M("pacer_tcp_packets", m->pacer_tcp_packets, + "TCP packets transmitted by the pacer\n"); + M("pacer_tcp_bytes", m->pacer_tcp_bytes, + "TCP bytes transmitted by the pacer (including headers)\n"); M("pacer_help_bytes", m->pacer_help_bytes, "Bytes transmitted via homa_qdisc_pacer_check"); M("homa_cycles", diff --git a/homa_metrics.h b/homa_metrics.h index f2d722e5..aeeb8986 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -395,23 +395,38 @@ struct homa_metrics { u64 pacer_xmit_cycles; /** - * @pacer_packets: total number of Homa packets that were transmitted - * by homa_qdisc_pacer (they were deferred because of NIC queue - * overload). 
+ * @pacer_homa_packets: total number of Homa packets that were + * transmitted by homa_qdisc_pacer (they were deferred because of + * NIC queue overload). + */ + u64 pacer_homa_packets; + + /** + * @pacer_homa_bytes: total number of bytes in Homa packets that were + * transmitted by homa_qdisc_pacer (they were deferred because of + * NIC queue overload). + */ + u64 pacer_homa_bytes; + + /** + * @pacer_tcp_packets: total number of TCP packets that were + * transmitted by homa_qdisc_pacer (they were deferred because of + * NIC queue overload). */ - u64 pacer_packets; + u64 pacer_tcp_packets; /** - * @pacer_bytes: total number of bytes in packets that were + * @pacer_tcp_bytes: total number of bytes in TCP packets that were * transmitted by homa_qdisc_pacer (they were deferred because of * NIC queue overload). */ - u64 pacer_bytes; + u64 pacer_tcp_bytes; /** - * @pacer_help_bytes: bytes in @pacer_bytes that were transmitted via - * calls to homa_qdisc_pacer_check (presumably because the pacer thread - * wasn't keeping up). Includes header bytes. + * @pacer_help_bytes: bytes that the pacer transmitted via calls to + * homa_qdisc_pacer_check (presumably because the pacer thread + * wasn't keeping up). Includes both TCP and Homa packets as well as + * header bytes. */ u64 pacer_help_bytes; diff --git a/homa_pacer.c b/homa_pacer.c index f7fe7efc..1b716af9 100644 --- a/homa_pacer.c +++ b/homa_pacer.c @@ -90,8 +90,10 @@ int homa_pacer_check_nic_q(struct homa_pacer *pacer, struct sk_buff *skb, if ((clock + pacer->homa->qshared->max_nic_queue_cycles) < idle && !force && !(pacer->homa->flags & HOMA_FLAG_DONT_THROTTLE)) return 0; - if (!list_empty(&pacer->throttled_rpcs)) - INC_METRIC(pacer_bytes, bytes); + if (!list_empty(&pacer->throttled_rpcs)) { + INC_METRIC(pacer_homa_packets, 1); + INC_METRIC(pacer_homa_bytes, bytes); + } if (idle < clock) new_idle = clock + cycles_for_packet; else diff --git a/homa_qdisc.c b/homa_qdisc.c index d3f300d3..f429fd62 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -47,6 +47,20 @@ static struct ctl_table homa_qdisc_ctl_table[] = { .mode = 0644, .proc_handler = homa_qdisc_dointvec }, + { + .procname = "homa_share", + .data = OFFSET(homa_share), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_qdisc_dointvec + }, + { + .procname = "tcp_credit_increment", + .data = OFFSET(tcp_credit_increment), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_qdisc_dointvec + }, }; static struct Qdisc_ops homa_qdisc_ops __read_mostly = { @@ -136,6 +150,8 @@ struct homa_qdisc_shared *homa_qdisc_shared_alloc(void) qshared->fifo_fraction = 50; qshared->max_nic_queue_ns = 5000; qshared->defer_min_bytes = 1000; + qshared->homa_share = 50; + qshared->tcp_credit_increment = 20000; qshared->sysctl_header = register_net_sysctl(&init_net, "net/homa", homa_qdisc_ctl_table); if (!qshared->sysctl_header) { @@ -240,7 +256,7 @@ struct homa_qdisc_dev *homa_qdisc_qdev_get(struct net_device *dev) homa_qdev_update_sysctl(qdev); INIT_LIST_HEAD(&qdev->links); qdev->deferred_rpcs = RB_ROOT_CACHED; - skb_queue_head_init(&qdev->tcp_deferred); + INIT_LIST_HEAD(&qdev->tcp_qdiscs); spin_lock_init(&qdev->defer_lock); init_waitqueue_head(&qdev->pacer_sleep); spin_lock_init(&qdev->pacer_mutex); @@ -299,7 +315,8 @@ void homa_qdisc_dev_callback(struct rcu_head *head) qdev = container_of(head, struct homa_qdisc_dev, rcu_head); homa_qdisc_free_homa(qdev); - skb_queue_purge(&qdev->tcp_deferred); + WARN_ON(!list_empty(&qdev->tcp_qdiscs)); + WARN_ON(qdev->cur_tcp_qdisc); kfree(qdev); } @@ 
-321,6 +338,7 @@ int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt, if (IS_ERR(qdev)) return PTR_ERR(qdev); + q->sch = sch; q->qdev = qdev; q->ix = -1; for (i = 0; i < qdev->dev->num_tx_queues; i++) { @@ -329,6 +347,8 @@ int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt, break; } } + skb_queue_head_init(&q->tcp_deferred); + INIT_LIST_HEAD(&q->defer_links); sch->limit = 10 * 1024; return 0; @@ -342,8 +362,13 @@ int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt, void homa_qdisc_destroy(struct Qdisc *qdisc) { struct homa_qdisc *q = qdisc_priv(qdisc); + unsigned long flags; qdisc_reset_queue(qdisc); + spin_lock_irqsave(&q->qdev->defer_lock, flags); + __skb_queue_purge(&q->tcp_deferred); + list_del_init(&q->defer_links); + spin_unlock_irqrestore(&q->qdev->defer_lock, flags); homa_qdisc_qdev_put(q->qdev); } @@ -388,9 +413,11 @@ void homa_qdisc_set_qixs(struct homa_qdisc_dev *qdev) } /** - * homa_qdisc_enqueue() - Add a packet to the queue for this qdisc. - * @skb: Packet to enqueue. - * @sch: Qdisc on which to enqueue @skb. + * homa_qdisc_enqueue() - Invoked when a new packet becomes available for + * transmission; this function determines whether to send it immediately + * or defer it until the NIC queue subsides. + * @skb: Packet to eventually transmit. + * @sch: Qdisc via which to transmit @skb. * @to_free: Used when dropping packets. */ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, @@ -398,28 +425,46 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, { struct homa_qdisc *q = qdisc_priv(sch); struct homa_qdisc_dev *qdev = q->qdev; - struct homa_qdisc_shared *qshared = qdev->hnet->homa->qshared; + struct homa_qdisc_shared *qshared; struct homa_data_hdr *h; int pkt_len; int result; int offset; - pkt_len = qdisc_skb_cb(skb)->pkt_len; - if (!is_homa_pkt(skb)) { - homa_qdisc_update_link_idle(qdev, pkt_len, -1); - goto enqueue; - } - - /* For Homa packets, transmit control packets and short messages - * immediately, bypassing the pacer mechanism completely. We do - * this because (a) we don't want to delay control packets, (b) the - * pacer's single thread doesn't have enough throughput to handle - * all the short packets (whereas processing here happens concurrently + /* This function tries to transmit short packets immediately for both + * Homa and TCP, even when the NIC queue is long. This is because + * (a) we don't want to delay Homa control packets, (b) the pacer's + * single thread doesn't have enough throughput to handle all the short + * packets at high load (whereas processing here happens concurrently * on multiple cores), and (c) there is no way to generate enough * short packets to cause NIC queue buildup, so bypassing the pacer * won't impact the SRPT mechanism significantly. - * - * Note: it's very important to use message length, not packet + */ + qshared = qdev->hnet->homa->qshared; + pkt_len = qdisc_pkt_len(skb); + if (!is_homa_pkt(skb)) { + /* This is a TCP packet (or something else other than Homa). + * In order to maintain the order of packets within a stream + * we must defer short packets if there are other packets + * already deferred for this qdisc. 
+ */ + if (!list_empty(&q->defer_links)) { + homa_qdisc_defer_tcp(q, skb); + return NET_XMIT_SUCCESS; + } + if (pkt_len < qshared->defer_min_bytes) { + homa_qdisc_update_link_idle(qdev, pkt_len, -1); + goto enqueue; + } + if (!homa_qdisc_any_deferred(qdev) && + homa_qdisc_update_link_idle(qdev, pkt_len, + qshared->max_nic_queue_cycles)) + goto enqueue; + homa_qdisc_defer_tcp(q, skb); + return NET_XMIT_SUCCESS; + } + + /* For Homa packets it's important to use message length, not packet * length when deciding whether to bypass the pacer. If packet * length were used, then the short packet at the end of a long * message might be transmitted when all the earlier packets in the @@ -471,7 +516,7 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, return qdisc_drop(skb, sch, to_free); result = qdisc_enqueue_tail(skb, sch); } else { - /* homa_enqueue_special is going to lock a different qdisc, + /* homa_qdisc_redirect_skb is going to lock a different qdisc, * so in order to avoid deadlocks we have to release the * lock for this qdisc. */ @@ -482,6 +527,32 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, return result; } +/** + * homa_qdisc_defer_tcp() - Add a non-Homa packet to the deferred list for + * a qdisc. + * @q: Qdisc where the packet was submitted. + * @skb: Packet to defer (must not be a Homa packet). + */ +void homa_qdisc_defer_tcp(struct homa_qdisc *q, struct sk_buff *skb) +{ + struct homa_qdisc_dev *qdev = q->qdev; + u64 now = homa_clock(); + unsigned long flags; + + spin_lock_irqsave(&qdev->defer_lock, flags); + __skb_queue_tail(&q->tcp_deferred, skb); + if (list_empty(&q->defer_links)) { + q->credit = 0; + list_add_tail(&q->defer_links, &qdev->tcp_qdiscs); + } + if (qdev->last_defer) + INC_METRIC(nic_backlog_cycles, now - qdev->last_defer); + else + wake_up(&qdev->pacer_sleep); + qdev->last_defer = now; + spin_unlock_irqrestore(&qdev->defer_lock, flags); +} + /** * homa_qdisc_defer_homa() - Add a Homa packet to the deferred list for * a qdev. @@ -544,13 +615,83 @@ void homa_qdisc_insert_rb(struct homa_qdisc_dev *qdev, struct homa_rpc *rpc) } /** - * homa_qdisc_dequeue_homa() - Return the highest-priority deferred Homa packet - * and dequeue it from the structures that manage deferred packets. + * homa_qdisc_xmit_deferred_tcp() - Transmit the "next" non-Homa packet + * that has been deferred for a particular homa_qdisc_dev and remove it + * from the structures that manage deferred packets. + * @qdev: Device on which to transmit packet. + * Return: The number of bytes in the transmitted packet, or 0 if there + * were no deferred TCP packets. 
+ */ +int homa_qdisc_xmit_deferred_tcp(struct homa_qdisc_dev *qdev) +{ + struct homa_qdisc_shared *qshared; + struct homa_qdisc *q; + unsigned long flags; + struct sk_buff *skb; + struct Qdisc *sch; + int pkt_len; + + qshared = qdev->hnet->homa->qshared; + spin_lock_irqsave(&qdev->defer_lock, flags); + if (list_empty(&qdev->tcp_qdiscs)) { + spin_unlock_irqrestore(&qdev->defer_lock, flags); + return 0; + } + + /* Find the next qdisc with positive credit.*/ + q = qdev->cur_tcp_qdisc; + if (!q) { + q = list_first_entry(&qdev->tcp_qdiscs, typeof(*q), + defer_links); + q->credit += qshared->tcp_credit_increment; + qdev->cur_tcp_qdisc = q; + } + while (q->credit <= 0) { + q = list_next_entry_circular(q, &qdev->tcp_qdiscs, + defer_links); + qdev->cur_tcp_qdisc = q; + q->credit += qshared->tcp_credit_increment; + continue; + } + + skb = __skb_dequeue(&q->tcp_deferred); + pkt_len = qdisc_pkt_len(skb); + q->credit -= qdisc_pkt_len(skb); + if (skb_queue_len(&q->tcp_deferred) == 0) { + qdev->cur_tcp_qdisc = + list_next_entry_circular(q, &qdev->tcp_qdiscs, + defer_links); + list_del_init(&q->defer_links); + if (list_empty(&qdev->tcp_qdiscs)) { + qdev->cur_tcp_qdisc = NULL; + if (!homa_qdisc_any_deferred(qdev)) { + INC_METRIC(nic_backlog_cycles, + homa_clock() - qdev->last_defer); + qdev->last_defer = 0; + } + } + } + spin_unlock_irqrestore(&qdev->defer_lock, flags); + + homa_qdisc_update_link_idle(qdev, pkt_len, -1); + tt_record2("homa_qdisc_pacer queuing tcp packet with length %d on qid %d", + pkt_len, q->ix); + sch = q->sch; + spin_lock_bh(qdisc_lock(sch)); + qdisc_enqueue_tail(skb, sch); + spin_unlock_bh(qdisc_lock(sch)); + __netif_schedule(sch); + return pkt_len; +} + +/** + * homa_qdisc_get_deferred_homa() - Return the highest-priority deferred Homa + * packet and dequeue it from the structures that manage deferred packets. * @qdev: Info about deferred packets is stored here. * Return: The next packet to transmit, or NULL if there are no deferred * Homa packets. */ -struct sk_buff *homa_qdisc_dequeue_homa(struct homa_qdisc_dev *qdev) +struct sk_buff *homa_qdisc_get_deferred_homa(struct homa_qdisc_dev *qdev) { struct homa_rpc_qdisc *qrpc; struct homa_skb_info *info; @@ -591,6 +732,33 @@ struct sk_buff *homa_qdisc_dequeue_homa(struct homa_qdisc_dev *qdev) return skb; } +/** + * homa_qdisc_xmit_deferred_homa() - Transmit the highest-priority deferred + * Homa packet and dequeue it from the structures that manage deferred packets. + * @qdev: Info about deferred packets is stored here. + * Return: The number of bytes in the transmitted packet (including headers) + * or 0 if there were no deferred Homa packets. + */ +int homa_qdisc_xmit_deferred_homa(struct homa_qdisc_dev *qdev) +{ + struct homa_data_hdr *h; + struct sk_buff *skb; + int pkt_len; + + skb = homa_qdisc_get_deferred_homa(qdev); + if (!skb) + return 0; + + pkt_len = qdisc_pkt_len(skb); + homa_qdisc_update_link_idle(qdev, pkt_len, -1); + h = (struct homa_data_hdr *)skb_transport_header(skb); + tt_record3("homa_qdisc_pacer queuing homa data packet for id %d, offset %d on qid %d", + be64_to_cpu(h->common.sender_id), + ntohl(h->seg.offset), qdev->pacer_qix); + homa_qdisc_redirect_skb(skb, qdev, true); + return pkt_len; +} + /** * homa_qdisc_free_homa() - Free all of the Homa packets that have been * deferred for @qdev. 
@@ -601,7 +769,7 @@ void homa_qdisc_free_homa(struct homa_qdisc_dev *qdev) struct sk_buff *skb; while (1) { - skb = homa_qdisc_dequeue_homa(qdev); + skb = homa_qdisc_get_deferred_homa(qdev); if (!skb) break; kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE); @@ -719,7 +887,7 @@ int homa_qdisc_pacer_main(void *device) */ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev, bool help) { - int i; + int i, xmit_bytes; /* Make sure only one instance of this function executes at a * time. @@ -733,8 +901,6 @@ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev, bool help) * homa_qdisc_pacer_main about interfering with softirq handlers). */ for (i = 0; i < 5; i++) { - struct homa_data_hdr *h; - struct sk_buff *skb; u64 idle_time, now; /* If the NIC queue is too long, wait until it gets shorter. */ @@ -754,26 +920,43 @@ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev, bool help) /* Note: when we get here, it's possible that the NIC queue is * still too long because other threads have queued packets, - * but we transmit anyway (don't want this thread to get - * starved by others). + * but we transmit anyway. If we don't, we could end up in a + * situation where the pacer thread is effectively starved by + * other "helper" threads. */ UNIT_HOOK("pacer_xmit"); - skb = homa_qdisc_dequeue_homa(qdev); - if (!skb) - break; - INC_METRIC(pacer_packets, 1); - INC_METRIC(pacer_bytes, qdisc_skb_cb(skb)->pkt_len); + /* Decide whether to transmit a Homa or TCP packet. If + * only one protocol has packets, reset homa_credit to + * prevent negative credit buildup for the protocol + * with packets. + */ + if (list_empty(&qdev->tcp_qdiscs)) { + if (!rb_first_cached(&qdev->deferred_rpcs)) + break; + qdev->homa_credit = 1; + } else if (!rb_first_cached(&qdev->deferred_rpcs)) { + qdev->homa_credit = 0; + } + if (qdev->homa_credit > 0) { + xmit_bytes = homa_qdisc_xmit_deferred_homa(qdev); + if (xmit_bytes > 0) { + INC_METRIC(pacer_homa_packets, 1); + INC_METRIC(pacer_homa_bytes, xmit_bytes); + qdev->homa_credit -= xmit_bytes * (100 - + qdev->hnet->homa->qshared->homa_share); + } + } else { + xmit_bytes = homa_qdisc_xmit_deferred_tcp(qdev); + if (xmit_bytes > 0) { + INC_METRIC(pacer_tcp_packets, 1); + INC_METRIC(pacer_tcp_bytes, xmit_bytes); + qdev->homa_credit += xmit_bytes * + qdev->hnet->homa->qshared->homa_share; + } + } if (help) - INC_METRIC(pacer_help_bytes, - qdisc_skb_cb(skb)->pkt_len); - homa_qdisc_update_link_idle(qdev, qdisc_skb_cb(skb)->pkt_len, - -1); - h = (struct homa_data_hdr *)skb_transport_header(skb); - tt_record3("homa_qdisc_pacer queuing homa data packet for id %d, offset %d on qid %d", - be64_to_cpu(h->common.sender_id), - ntohl(h->seg.offset), qdev->pacer_qix); - homa_qdisc_redirect_skb(skb, qdev, true); + INC_METRIC(pacer_help_bytes, xmit_bytes); INC_METRIC(pacer_xmit_cycles, homa_clock() - now); } done: @@ -957,6 +1140,11 @@ void homa_qdisc_update_sysctl_deps(struct homa_qdisc_shared *qshared) qshared->max_nic_queue_cycles = homa_ns_to_cycles(qshared->max_nic_queue_ns); + if (qshared->homa_share < 0) + qshared->homa_share = 0; + if (qshared->homa_share > 100) + qshared->homa_share = 100; + rcu_read_lock(); list_for_each_entry_rcu(qdev, &qshared->qdevs, links) homa_qdev_update_sysctl(qdev); diff --git a/homa_qdisc.h b/homa_qdisc.h index fee414e0..8d207615 100644 --- a/homa_qdisc.h +++ b/homa_qdisc.h @@ -25,6 +25,9 @@ * the homa queuing discipline */ struct homa_qdisc { + /** @sch: The Qdisc that this structure is associated with. 
	 */
+	struct Qdisc *sch;
+
 	/** @qdev: Info shared among all qdiscs for a net_device. */
 	struct homa_qdisc_dev *qdev;
 
@@ -33,6 +36,27 @@ struct homa_qdisc {
 	 * its net_device.
 	 */
 	int ix;
+
+	/**
+	 * @credit: Used to share bandwidth equally among qdiscs with
+	 * deferred TCP packets. Packets won't be transmitted from
+	 * tcp_deferred until this becomes positive.
+	 */
+	int credit;
+
+	/**
+	 * @tcp_deferred: TCP packets whose transmission was deferred
+	 * because the NIC queue was too long. The queue is in order of
+	 * packet arrival at the qdisc.
+	 */
+	struct sk_buff_head tcp_deferred;
+
+	/**
+	 * @defer_links: Used to link this qdisc into the tcp_qdiscs list
+	 * in homa_qdisc_dev. This will be an empty list whenever this
+	 * object is not queued on tcp_qdiscs.
+	 */
+	struct list_head defer_links;
 };
 
 /**
@@ -118,19 +142,36 @@ struct homa_qdisc_dev {
 	struct rb_root_cached deferred_rpcs;
 
 	/**
-	 * @tcp_deferred: TCP packets whose transmission was deferred
-	 * because the NIC queue was too long. The queue is in order of
-	 * packet arrival at the qdisc.
+	 * @tcp_qdiscs: List of all homa_qdiscs that have deferred TCP
+	 * packets.
 	 */
-	struct sk_buff_head tcp_deferred;
+	struct list_head tcp_qdiscs;
+
+	/**
+	 * @cur_tcp_qdisc: Points to an element of tcp_qdiscs or NULL; this is
+	 * the qdisc currently being serviced by the pacer. This pointer
+	 * rotates circularly through tcp_qdiscs.
+	 */
+	struct homa_qdisc *cur_tcp_qdisc;
 
 	/**
 	 * @last_defer: The most recent homa_clock() time when a packet was
-	 * added to homa_deferred or tcp_deferred, or 0 if there are currently
-	 * no deferred packets.
+	 * deferred, or 0 if there are currently no deferred packets.
 	 */
 	u64 last_defer;
 
+	/**
+	 * @homa_credit: When there are both Homa and TCP deferred packets,
+	 * this is used to balance output between them according to the
+	 * homa_share sysctl value. Positive means that Homa packets should
+	 * be transmitted next, zero or negative means TCP. When a TCP
+	 * packet is transmitted, this is incremented by the packet length
+	 * times homa_share; when a Homa packet is transmitted, it is
+	 * decremented by packet length times (100 - homa_share). Used only
+	 * by the pacer, so no need for synchronization.
+	 */
+	int homa_credit;
+
 	/**
 	 * @defer_lock: Synchronizes access to information about deferred
 	 * packets, including deferred_rpcs, tcp_deferred, and last_defer.
@@ -213,6 +254,21 @@ struct homa_qdisc_shared {
 	 */
 	int defer_min_bytes;
 
+	/**
+	 * @homa_share: When the uplink is overloaded, this determines how
+	 * to share bandwidth between TCP and Homa. It gives the percentage
+	 * of bandwidth that Homa will receive; TCP (and all other protocols,
+	 * such as UDP) get the remainder. Must be between 0 and 100,
+	 * inclusive.
+	 */
+	int homa_share;
+
+	/**
+	 * @tcp_credit_increment: Amount by which the credit field of
+	 * homa_qdisc is incremented each time the pacer rotates to that
+	 * qdisc in homa_qdisc_xmit_deferred_tcp; it sets the granularity
+	 * of round-robin bandwidth sharing among qdiscs.
+	 */
+	int tcp_credit_increment;
+
 #ifndef __STRIP__ /* See strip.py */
 	/**
 	 * @sysctl_header: Used to remove sysctl values when this structure
@@ -237,8 +293,7 @@ struct homa_rcu_kfreer {
 
 void homa_qdev_update_sysctl(struct homa_qdisc_dev *qdev);
 void homa_qdisc_defer_homa(struct homa_qdisc_dev *qdev, struct sk_buff *skb);
-struct sk_buff *
-	homa_qdisc_dequeue_homa(struct homa_qdisc_dev *qdev);
+void homa_qdisc_defer_tcp(struct homa_qdisc *q, struct sk_buff *skb);
 void homa_qdisc_destroy(struct Qdisc *sch);
 void homa_qdisc_dev_callback(struct rcu_head *head);
 int homa_qdisc_dointvec(const struct ctl_table *table, int write,
@@ -246,6 +301,7 @@ int homa_qdisc_dointvec(const struct ctl_table *table, int write,
 int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		       struct sk_buff **to_free);
 void homa_qdisc_free_homa(struct homa_qdisc_dev *qdev);
+struct sk_buff *homa_qdisc_get_deferred_homa(struct homa_qdisc_dev *qdev);
 int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt,
 		    struct netlink_ext_ack *extack);
 void homa_qdisc_insert_rb(struct homa_qdisc_dev *qdev,
@@ -268,6 +324,8 @@ void homa_qdisc_unregister(void);
 int homa_qdisc_update_link_idle(struct homa_qdisc_dev *qdev, int bytes,
 				int max_queue_ns);
 void homa_qdisc_update_sysctl_deps(struct homa_qdisc_shared *qshared);
+int homa_qdisc_xmit_deferred_homa(struct homa_qdisc_dev *qdev);
+int homa_qdisc_xmit_deferred_tcp(struct homa_qdisc_dev *qdev);
 void homa_rcu_kfree(void *object);
 void homa_rcu_kfree_callback(struct rcu_head *head);
 
@@ -302,13 +360,13 @@ static inline void homa_qdisc_rpc_init(struct homa_rpc_qdisc *qrpc)
 static inline bool homa_qdisc_any_deferred(struct homa_qdisc_dev *qdev)
 {
 	return rb_first_cached(&qdev->deferred_rpcs) ||
-	       !skb_queue_empty(&qdev->tcp_deferred);
+	       !list_empty(&qdev->tcp_qdiscs);
 }
 
 /**
- * homa_qdisc_precedes() - Return true if @rpc1 is considered "less"
- * than @rpc2 for the purposes of qdev->deferred_rpcs, or false if @rpc1
- * is consdered "greater" (ties not allowed).
+ * homa_qdisc_precedes() - Return true if @rpc1 is considered "less" than
+ * @rpc2 (i.e. higher priority) for the purposes of qdev->deferred_rpcs, or
+ * false if @rpc1 is considered "greater" (ties not allowed).
  * @rpc1:   RPC to compare
  * @rpc2:   RPC to compare; must be different from rpc1.
  */
diff --git a/man/homa.7 b/man/homa.7
index 6c46d7a4..64940a96 100644
--- a/man/homa.7
+++ b/man/homa.7
@@ -634,6 +634,13 @@ requires Homa to intercept all incoming TCP packets to see if they are
 actually Homa packets. Some might object to this interference with the
 rest of the Linux kernel.
 .TP
+.IR homa_share
+When there exist both Homa and TCP packets whose transmission has been
+deferred because the NIC queue is overloaded, this determines how the
+uplink bandwidth is allocated between Homa and TCP. This parameter is
+a value between 0 and 100 indicating what percent of the uplink bandwidth
+will be allocated to Homa; the remainder will be allocated to TCP.
+.TP
 .IR link_mbps
 An integer value specifying the bandwidth of this machine's uplink to
 the top-of-rack switch, in units of 1e06 bits per second.
@@ -852,6 +859,11 @@ not release pages from a pool if the amount of unused space in the pool
 has been less than this (specified in Kbytes) at any point in the
 recent past.
 .TP
+.IR tcp_credit_increment
+Determines the granularity of bandwidth sharing among qdiscs that have
+deferred TCP output packets. Read the code in homa_qdisc.c to learn more
+about this. You probably shouldn't ever modify this option.
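
As a concrete illustration of the two parameters above, here is a minimal
user-space sketch of the homa_credit rule, not module code; homa_share = 40
and fixed 1500-byte packets for both protocols are assumptions. A TCP
transmission adds len * homa_share to the credit, a Homa transmission
subtracts len * (100 - homa_share), and positive credit selects Homa:

#include <stdio.h>

int main(void)
{
	const int homa_share = 40, pkt_len = 1500;
	long credit = 0, homa_bytes = 0, tcp_bytes = 0;

	for (int i = 0; i < 100000; i++) {
		if (credit > 0) {		/* Homa's turn */
			credit -= (long)pkt_len * (100 - homa_share);
			homa_bytes += pkt_len;
		} else {			/* TCP's turn */
			credit += (long)pkt_len * homa_share;
			tcp_bytes += pkt_len;
		}
	}
	printf("Homa %.1f%%, TCP %.1f%%\n",
	       100.0 * homa_bytes / (homa_bytes + tcp_bytes),
	       100.0 * tcp_bytes / (homa_bytes + tcp_bytes));
	return 0;
}

With these constants the loop settles into a repeating pattern of 2 Homa and
3 TCP transmissions, i.e. a 40/60 byte split, matching the homa_share value.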
+.TP
 .IR throttle_min_bytes
 An integer value specifying the smallest packet size subject to
 output queue throttling.
diff --git a/test/mock.c b/test/mock.c
index ed6ee3bc..911ccc65 100644
--- a/test/mock.c
+++ b/test/mock.c
@@ -1667,6 +1667,27 @@ struct page *mock_alloc_pages(gfp_t gfp, unsigned int order)
 	return page;
 }
 
+#ifndef __STRIP__ /* See strip.py */
+/**
+ * mock_alloc_qdisc() - Allocate and initialize a new Qdisc suitable for
+ * use in unit tests as a homa qdisc.
+ * Return: The new Qdisc. The memory is dynamically allocated and must
+ * be kfree-d by the caller. homa_qdisc_init has not been invoked on
+ * this Qdisc yet.
+ */
+struct Qdisc *mock_alloc_qdisc(struct netdev_queue *dev_queue)
+{
+	struct Qdisc *qdisc;
+
+	qdisc = kzalloc(sizeof(struct Qdisc) + sizeof(struct homa_qdisc),
+			GFP_ATOMIC);
+	qdisc->dev_queue = dev_queue;
+	qdisc->ops = qdisc_ops;
+	spin_lock_init(&qdisc->q.lock);
+	return qdisc;
+}
+#endif /* See strip.py */
+
 /**
  * mock_check_error() - Determines whether a method should simulate an error
  * return.
@@ -1942,26 +1963,70 @@ void mock_put_page(struct page *page)
 	}
 }
 
-#ifndef __STRIP__ /* See strip.py */
 /**
- * mock_alloc_qdisc() - Allocate and initialize a new Qdisc suitable for
- * use in unit tests as a homa qdisc.
- * Return: The new Qdisc. The memory is dynamically allocated and must
- * be kfree-d by the caller. homa_qdisc_init has not been invoked on
- * this Qdisc yet.
+ * mock_raw_skb() - Performs most of the work of mock_skb_alloc and
+ * mock_tcp_skb. Allocates and initializes an skb.
+ * @saddr:     IPv6 address to use as the sender of the packet, in
+ *             network byte order.
+ * @protocol:  Protocol to use in the IP header, such as IPPROTO_HOMA.
+ * @length:    How many bytes of space to allocate after the IP header.
+ * Return:     The new packet buffer, initialized as if the packet just
+ *             arrived from the network and is about to be processed at
+ *             transport level (e.g. there will be an IP header before
+ *             skb->tail). The skb has room for @length additional bytes,
+ *             but they have not yet been allocated with skb_put(). The
+ *             caller must eventually free the skb.
  */
-struct Qdisc *mock_alloc_qdisc(struct netdev_queue *dev_queue)
+struct sk_buff *mock_raw_skb(struct in6_addr *saddr, int protocol, int length)
 {
-	struct Qdisc *qdisc;
+	int ip_size, data_size, shinfo_size;
+	struct sk_buff *skb;
 
-	qdisc = kzalloc(sizeof(struct Qdisc) + sizeof(struct homa_qdisc),
-			GFP_ATOMIC);
-	qdisc->dev_queue = dev_queue;
-	qdisc->ops = qdisc_ops;
-	spin_lock_init(&qdisc->q.lock);
-	return qdisc;
+	/* Don't let the IP header start at the beginning of the packet
+	 * buffer: that will confuse is_homa_pkt.
+	 */
+#define IP_HDR_OFFSET 4
+
+	skb = malloc(sizeof(struct sk_buff));
+	memset(skb, 0, sizeof(*skb));
+	if (!skbs_in_use)
+		skbs_in_use = unit_hash_new();
+	unit_hash_set(skbs_in_use, skb, "used");
+
+	ip_size = mock_ipv6 ? sizeof(struct ipv6hdr) : sizeof(struct iphdr);
+	data_size = SKB_DATA_ALIGN(IP_HDR_OFFSET + ip_size + length);
+	shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	skb->head = malloc(data_size + shinfo_size);
+	memset(skb->head, 0, data_size + shinfo_size);
+	skb->data = skb->head;
+	skb_reset_tail_pointer(skb);
+	skb->end = skb->tail + data_size;
+
+	/* Don't want IP header starting at the beginning of the packet
+	 * buffer (will confuse is_homa_pkt).
+ */ + skb_reserve(skb, IP_HDR_OFFSET + ip_size); + skb_reset_transport_header(skb); + skb_reset_network_header(skb); + skb_set_network_header(skb, -ip_size); + if (mock_ipv6) { + ipv6_hdr(skb)->version = 6; + ipv6_hdr(skb)->saddr = *saddr; + ipv6_hdr(skb)->nexthdr = protocol; + } else { + ip_hdr(skb)->version = 4; + ip_hdr(skb)->saddr = saddr->in6_u.u6_addr32[3]; + ip_hdr(skb)->protocol = protocol; + ip_hdr(skb)->check = 0; + } + skb->users.refs.counter = 1; + skb->_skb_refdst = 0; + skb->hash = 3; + skb->next = NULL; + skb->dev = &mock_devices[0]; + qdisc_skb_cb(skb)->pkt_len = length + 100; + return skb; } -#endif /* See strip.py */ /** * mock_rcu_read_lock() - Called instead of rcu_read_lock when Homa is compiled @@ -2108,7 +2173,7 @@ void mock_set_ipv6(struct homa_sock *hsk) } /** - * mock_skb_alloc() - Allocate and return a packet buffer. The buffer is + * mock_skb_alloc() - Allocate and return a Homa packet buffer. The buffer is * initialized as if it just arrived from the network. * @saddr: IPv6 address to use as the sender of the packet, in * network byte order. @@ -2124,12 +2189,13 @@ void mock_set_ipv6(struct homa_sock *hsk) * Return: A packet buffer containing the information described above. * The caller owns this buffer and is responsible for freeing it. */ -struct sk_buff *mock_skb_alloc(struct in6_addr *saddr, struct homa_common_hdr *h, - int extra_bytes, int first_value) +struct sk_buff *mock_skb_alloc(struct in6_addr *saddr, + struct homa_common_hdr *h, int extra_bytes, + int first_value) { - int header_size, ip_size, data_size, shinfo_size; struct sk_buff *skb; unsigned char *p; + int header_size; /* Don't let the IP header start at the beginning of the packet * buffer: that will confuse is_homa_pkt. @@ -2176,58 +2242,45 @@ struct sk_buff *mock_skb_alloc(struct in6_addr *saddr, struct homa_common_hdr *h } else { header_size = 0; } - skb = malloc(sizeof(struct sk_buff)); - memset(skb, 0, sizeof(*skb)); - if (!skbs_in_use) - skbs_in_use = unit_hash_new(); - unit_hash_set(skbs_in_use, skb, "used"); - - ip_size = mock_ipv6 ? sizeof(struct ipv6hdr) : sizeof(struct iphdr); - data_size = SKB_DATA_ALIGN(IP_HDR_OFFSET + ip_size + header_size + - extra_bytes); - shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - if (h) { - skb->head = malloc(data_size + shinfo_size); - memset(skb->head, 0, data_size + shinfo_size); - } else { - skb->head = malloc(extra_bytes); - memset(skb->head, 0, extra_bytes); - - } - skb->data = skb->head; - skb_reset_tail_pointer(skb); - skb->end = skb->tail + data_size; - - /* Don't want IP header starting at the beginning of the packet - * buffer (will confuse is_homa_pkt). 
- */ - skb_reserve(skb, IP_HDR_OFFSET + ip_size); - skb_reset_transport_header(skb); + skb = mock_raw_skb(saddr, IPPROTO_HOMA, header_size + extra_bytes); + p = skb_transport_header(skb); if (header_size != 0) { p = skb_put(skb, header_size); - memcpy(skb->data, h, header_size); + memcpy(p, h, header_size); } if (h && extra_bytes != 0) { p = skb_put(skb, extra_bytes); unit_fill_data(p, extra_bytes, first_value); } - skb->users.refs.counter = 1; - skb_reset_network_header(skb); - skb_set_network_header(skb, -ip_size); - if (mock_ipv6) { - ipv6_hdr(skb)->version = 6; - ipv6_hdr(skb)->saddr = *saddr; - ipv6_hdr(skb)->nexthdr = IPPROTO_HOMA; - } else { - ip_hdr(skb)->version = 4; - ip_hdr(skb)->saddr = saddr->in6_u.u6_addr32[3]; - ip_hdr(skb)->protocol = IPPROTO_HOMA; - ip_hdr(skb)->check = 0; - } - skb->_skb_refdst = 0; - skb->hash = 3; - skb->next = NULL; - skb->dev = &mock_devices[0]; + qdisc_skb_cb(skb)->pkt_len = extra_bytes + 100; + return skb; +} + +/** + * mock_tcp_skb() - Allocate and return a TCP packet buffer. The buffer is + * initialized as if it just arrived from the network. + * @saddr: IPv6 address to use as the sender of the packet, in + * network byte order. + * @sequence: Sequence number to store in the TCP header. + * @extra_bytes: How much additional data to add to the buffer after + * the TCP header. + * + * Return: A packet buffer containing the information described above. + * The caller owns this buffer and is responsible for freeing it. + */ +struct sk_buff *mock_tcp_skb(struct in6_addr *saddr, int sequence, + int extra_bytes) +{ + struct sk_buff *skb; + struct tcphdr *tcp; + + skb = mock_raw_skb(saddr, IPPROTO_TCP, + sizeof(struct tcphdr) + extra_bytes); + tcp = (struct tcphdr *)skb_put(skb, sizeof(struct tcphdr)); + tcp->seq = htonl(sequence); + tcp->doff = sizeof(struct tcphdr) / 4; + skb_put(skb, extra_bytes); + qdisc_skb_cb(skb)->pkt_len = extra_bytes + 100; return skb; } diff --git a/test/mock.h b/test/mock.h index aa1b7aaa..8e7c4319 100644 --- a/test/mock.h +++ b/test/mock.h @@ -208,6 +208,8 @@ void mock_preempt_disable(void); void mock_preempt_enable(void); int mock_processor_id(void); void mock_put_page(struct page *page); +struct sk_buff * + mock_raw_skb(struct in6_addr *saddr, int protocol, int length); void mock_rcu_read_lock(void); void mock_rcu_read_unlock(void); void mock_record_locked(void *lock); @@ -231,7 +233,7 @@ void mock_spin_lock(spinlock_t *lock); void mock_spin_unlock(spinlock_t *lock); struct sk_buff * mock_skb_alloc(struct in6_addr *saddr, struct homa_common_hdr *h, - int extra_bytes, int first_value); + int extra_bytes, int first_value); int mock_skb_count(void); void mock_sock_destroy(struct homa_sock *hsk, struct homa_socktab *socktab); @@ -239,6 +241,8 @@ void mock_sock_hold(struct sock *sk); int mock_sock_init(struct homa_sock *hsk, struct homa_net *hnet, int port); void mock_sock_put(struct sock *sk); +struct sk_buff * + mock_tcp_skb(struct in6_addr *saddr, int sequence, int extra_bytes); void mock_teardown(void); void *mock_vmalloc(size_t size); diff --git a/test/unit_homa_plumbing.c b/test/unit_homa_plumbing.c index 3cc9127e..8c1a292a 100644 --- a/test/unit_homa_plumbing.c +++ b/test/unit_homa_plumbing.c @@ -1471,7 +1471,7 @@ TEST_F(homa_plumbing, homa_err_handler_v4__port_unreachable) failed = mock_skb_alloc(self->server_ip, &self->data.common, 100, 0); ip_hdr(failed)->daddr = ipv6_to_ipv4(self->server_ip[0]); - icmp = mock_skb_alloc(self->server_ip, NULL, 1000, 0); + icmp = mock_raw_skb(self->server_ip, IPPROTO_ICMP, 1000); icmph = 
skb_put(icmp, sizeof *icmph); icmph->type = ICMP_DEST_UNREACH; icmph->code = ICMP_PORT_UNREACH; @@ -1499,7 +1499,7 @@ TEST_F(homa_plumbing, homa_err_handler_v4__host_unreachable) failed = mock_skb_alloc(self->server_ip, &self->data.common, 100, 0); ip_hdr(failed)->daddr = ipv6_to_ipv4(self->server_ip[0]); - icmp = mock_skb_alloc(self->server_ip, NULL, 1000, 0); + icmp = mock_raw_skb(self->server_ip, IPPROTO_ICMP, 1000); icmph = skb_put(icmp, sizeof *icmph); icmph->type = ICMP_DEST_UNREACH; icmph->code = ICMP_HOST_UNKNOWN; @@ -1526,7 +1526,7 @@ TEST_F(homa_plumbing, homa_err_handler_v6__port_unreachable) failed = mock_skb_alloc(self->server_ip, &self->data.common, 100, 0); ipv6_hdr(failed)->daddr = self->server_ip[0]; - icmp = mock_skb_alloc(self->server_ip, NULL, 1000, 0); + icmp = mock_raw_skb(self->server_ip, IPPROTO_ICMP, 1000); memcpy(skb_put(icmp, failed->len), skb_network_header(failed), failed->len); @@ -1550,7 +1550,7 @@ TEST_F(homa_plumbing, homa_err_handler_v6__protocol_not_supported) failed = mock_skb_alloc(self->server_ip, &self->data.common, 100, 0); ipv6_hdr(failed)->daddr = self->server_ip[0]; - icmp = mock_skb_alloc(self->server_ip, NULL, 1000, 0); + icmp = mock_raw_skb(self->server_ip, IPPROTO_ICMP, 1000); memcpy(skb_put(icmp, failed->len), skb_network_header(failed), failed->len); diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c index dc528471..2fc7882d 100644 --- a/test/unit_homa_qdisc.c +++ b/test/unit_homa_qdisc.c @@ -178,8 +178,12 @@ FIXTURE_TEARDOWN(homa_qdisc) { int i; - for (i = 0; i < NUM_TXQS; i++) + for (i = 0; i < NUM_TXQS; i++) { + struct homa_qdisc *q = qdisc_priv(self->qdiscs[i]); + if (q->qdev) + homa_qdisc_destroy(self->qdiscs[i]); kfree(self->qdiscs[i]); + } homa_destroy(&self->homa); homa_qdisc_unregister(); unit_teardown(); @@ -434,9 +438,11 @@ TEST_F(homa_qdisc, homa_qdisc_destroy) { struct Qdisc *qdisc, *qdisc2; struct homa_qdisc_dev *qdev; + struct homa_qdisc *q; qdisc = mock_alloc_qdisc(&mock_net_queue); EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); + q = qdisc_priv(qdisc); qdisc2 = mock_alloc_qdisc(&mock_net_queue); EXPECT_EQ(0, homa_qdisc_init(qdisc2, NULL, NULL)); qdev = list_first_or_null_rcu(&self->homa.qshared->qdevs, @@ -444,6 +450,9 @@ TEST_F(homa_qdisc, homa_qdisc_destroy) EXPECT_NE(NULL, qdev); EXPECT_EQ(2, refcount_read(&qdev->refs)); + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1000)); + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 6000, 1100)); + homa_qdisc_destroy(qdisc2); EXPECT_EQ(1, refcount_read(&qdev->refs)); @@ -491,7 +500,7 @@ TEST_F(homa_qdisc, _homa_qdisc_homa_qdisc_set_qixs_object) homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_enqueue__packet_not_homa) +TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_short_tcp_packet) { struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); struct sk_buff *skb, *to_free; @@ -507,11 +516,45 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__packet_not_homa) q = qdisc_priv(qdisc); atomic64_set(&q->qdev->link_idle_time, 1000000); q->ix = 3; - skb = new_test_skb(srpc, &self->addr, 0, 1500); - if (skb_is_ipv6(skb)) - ipv6_hdr(skb)->nexthdr = IPPROTO_TCP; - else - ip_hdr(skb)->protocol = IPPROTO_TCP; + + /* First packet is long and gets deferred because of link_idle_time. 
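	 * The mock packet's pkt_len is 1500 + 100 = 1600 bytes, which
	 * exceeds the defer_min_bytes threshold, and link_idle_time is far
	 * beyond the current clock, so homa_qdisc_update_link_idle refuses
	 * an immediate transmission.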
*/ + skb = mock_tcp_skb(&self->addr, 5000, 1500); + to_free = NULL; + homa_qdisc_enqueue(skb, qdisc, &to_free); + EXPECT_EQ(NULL, to_free); + EXPECT_EQ(1, q->tcp_deferred.qlen); + EXPECT_EQ(1000000, atomic64_read(&q->qdev->link_idle_time)); + + /* Second packet is short, but must be deferred to maintain order + * within qdisc. + */ + skb = mock_tcp_skb(&self->addr, 6000, 500); + to_free = NULL; + homa_qdisc_enqueue(skb, qdisc, &to_free); + EXPECT_EQ(NULL, to_free); + EXPECT_EQ(2, q->tcp_deferred.qlen); + EXPECT_EQ(1000000, atomic64_read(&q->qdev->link_idle_time)); + + homa_qdisc_destroy(qdisc); + kfree(qdisc); +} +TEST_F(homa_qdisc, homa_qdisc_enqueue__xmit_short_tcp_packet) +{ + struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct sk_buff *skb, *to_free; + struct homa_rpc *srpc; + struct homa_qdisc *q; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 100, 7100); + ASSERT_NE(NULL, srpc); + + EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); + q = qdisc_priv(qdisc); + atomic64_set(&q->qdev->link_idle_time, 1000000); + q->ix = 3; + skb = mock_tcp_skb(&self->addr, 5000, 500); to_free = NULL; unit_log_clear(); @@ -525,6 +568,45 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__packet_not_homa) homa_qdisc_destroy(qdisc); kfree(qdisc); } +TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_tcp_packet_because_of_homa_deferred) +{ + struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct sk_buff *skb, *to_free; + struct homa_rpc *srpc; + struct homa_qdisc *q; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 100, 7100); + ASSERT_NE(NULL, srpc); + + EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); + q = qdisc_priv(qdisc); + atomic64_set(&q->qdev->link_idle_time, 1000000); + q->ix = 3; + + /* First packet is Homa, gets deferred because of link_idle_time. */ + skb = new_test_skb(srpc, &self->addr, 0, 1500); + to_free = NULL; + homa_qdisc_enqueue(skb, qdisc, &to_free); + EXPECT_EQ(NULL, to_free); + EXPECT_TRUE(homa_qdisc_any_deferred(q->qdev)); + EXPECT_EQ(1000000, atomic64_read(&q->qdev->link_idle_time)); + + /* Second packet is TCP, gets deferred because of deferred Homa + * packet. + */ + mock_clock = 1000000; + skb = mock_tcp_skb(&self->addr, 6000, 1500); + to_free = NULL; + homa_qdisc_enqueue(skb, qdisc, &to_free); + EXPECT_EQ(NULL, to_free); + EXPECT_EQ(1, q->tcp_deferred.qlen); + EXPECT_EQ(1000000, atomic64_read(&q->qdev->link_idle_time)); + + homa_qdisc_destroy(qdisc); + kfree(qdisc); +} TEST_F(homa_qdisc, homa_qdisc_enqueue__short_message) { struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); @@ -687,9 +769,85 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__use_special_queue) EXPECT_FALSE(homa_qdisc_any_deferred(q->qdev)); EXPECT_EQ(0, self->qdiscs[1]->q.qlen); EXPECT_EQ(1, self->qdiscs[3]->q.qlen); +} + +TEST_F(homa_qdisc, homa_qdisc_defer_tcp__basics) +{ + struct homa_rpc *srpc; + struct homa_qdisc *q; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 100, 10000); + ASSERT_NE(NULL, srpc); + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); + q = qdisc_priv(self->qdiscs[2]); + + /* First packet: must add qdisc to qdev->tcp_qdiscs. 
*/ + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1500)); + EXPECT_EQ(1, skb_queue_len(&q->tcp_deferred)); + EXPECT_EQ(1, unit_list_length(&q->qdev->tcp_qdiscs)); - homa_qdisc_destroy(self->qdiscs[1]); - homa_qdisc_destroy(self->qdiscs[3]); + /* Second packet: qdisc already in qdev->tcp_qdiscs. */ + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 7000, 1500)); + EXPECT_EQ(2, skb_queue_len(&q->tcp_deferred)); + EXPECT_EQ(1, unit_list_length(&q->qdev->tcp_qdiscs)); +} +TEST_F(homa_qdisc, homa_qdisc_defer_tcp__multiple_qdiscs_on_list) +{ + struct homa_rpc *srpc; + struct homa_qdisc *q1, *q2, *q3; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 100, 10000); + ASSERT_NE(NULL, srpc); + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[1], NULL, NULL)); + q1 = qdisc_priv(self->qdiscs[1]); + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); + q2 = qdisc_priv(self->qdiscs[2]); + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + q3 = qdisc_priv(self->qdiscs[3]); + + homa_qdisc_defer_tcp(q1, mock_tcp_skb(&self->addr, 5000, 1000)); + homa_qdisc_defer_tcp(q2, mock_tcp_skb(&self->addr, 5000, 2000)); + homa_qdisc_defer_tcp(q3, mock_tcp_skb(&self->addr, 5000, 3000)); + EXPECT_EQ(3, unit_list_length(&q1->qdev->tcp_qdiscs)); + EXPECT_EQ(&q1->defer_links, q1->qdev->tcp_qdiscs.next); + EXPECT_EQ(&q2->defer_links, q1->defer_links.next); + EXPECT_EQ(&q3->defer_links, q2->defer_links.next); +} +TEST_F(homa_qdisc, homa_qdisc_defer_tcp__update_metrics_and_wakeup) +{ + struct homa_rpc *srpc; + struct homa_qdisc *q; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 100, 10000); + ASSERT_NE(NULL, srpc); + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); + q = qdisc_priv(self->qdiscs[2]); + mock_log_wakeups = 1; + + /* First packet: qdev->last_defer is 0. */ + EXPECT_EQ(0, q->qdev->last_defer); + mock_clock = 5000; + unit_log_clear(); + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1500)); + EXPECT_EQ(5000, q->qdev->last_defer); + EXPECT_EQ(0, homa_metrics_per_cpu()->nic_backlog_cycles); + EXPECT_STREQ("wake_up", unit_log_get()); + + /* Second packet: qdev->last_defer != 0. 
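	 * The 10000-cycle gap since the first deferral is added to
	 * nic_backlog_cycles, and no new wakeup is issued because one was
	 * already generated for the first packet.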
*/ + mock_clock = 15000; + unit_log_clear(); + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 7000, 1500)); + EXPECT_EQ(15000, q->qdev->last_defer); + EXPECT_EQ(10000, homa_metrics_per_cpu()->nic_backlog_cycles); + EXPECT_STREQ("", unit_log_get()); } TEST_F(homa_qdisc, homa_qdisc_defer_homa__basics) @@ -928,17 +1086,120 @@ TEST_F(homa_qdisc, homa_qdisc_insert_rb__long_right_chain) homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__no_deferred_rpcs) +TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__basics) +{ + struct homa_qdisc *q; + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); + q = qdisc_priv(self->qdiscs[2]); + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1000)); + atomic64_set(&q->qdev->link_idle_time, 20000); + + EXPECT_EQ(1100, homa_qdisc_xmit_deferred_tcp(q->qdev)); + EXPECT_EQ(1, self->qdiscs[2]->q.qlen); + EXPECT_EQ(0, skb_queue_len(&q->tcp_deferred)); + EXPECT_LT(20000, atomic64_read(&q->qdev->link_idle_time)); +} +TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__no_deferred_packets) +{ + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->dev); + unit_log_clear(); + EXPECT_EQ(0, homa_qdisc_xmit_deferred_tcp(qdev)); + EXPECT_EQ(0, self->qdiscs[2]->q.qlen); + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__scan_for_qdisc_with_credit) +{ + struct homa_qdisc *q1, *q2, *q3; + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[1], NULL, NULL)); + q1 = qdisc_priv(self->qdiscs[1]); + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); + q2 = qdisc_priv(self->qdiscs[2]); + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + q3 = qdisc_priv(self->qdiscs[3]); + homa_qdisc_defer_tcp(q1, mock_tcp_skb(&self->addr, 5000, 1000)); + homa_qdisc_defer_tcp(q2, mock_tcp_skb(&self->addr, 6000, 1100)); + homa_qdisc_defer_tcp(q3, mock_tcp_skb(&self->addr, 7000, 1200)); + EXPECT_EQ(3, unit_list_length(&q2->qdev->tcp_qdiscs)); + self->homa.qshared->tcp_credit_increment = 10000; + q1->credit = -30000; + q2->credit = -10000; + q3->credit = -15000; + + EXPECT_EQ(1200, homa_qdisc_xmit_deferred_tcp(q2->qdev)); + EXPECT_EQ(1, self->qdiscs[2]->q.qlen); + EXPECT_EQ(1, skb_queue_len(&q1->tcp_deferred)); + EXPECT_EQ(0, skb_queue_len(&q2->tcp_deferred)); + EXPECT_EQ(1, skb_queue_len(&q3->tcp_deferred)); +} +TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__unlink_qdiscs) +{ + struct homa_qdisc *q1, *q2; + + mock_clock = 10000; + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[1], NULL, NULL)); + q1 = qdisc_priv(self->qdiscs[1]); + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); + q2 = qdisc_priv(self->qdiscs[2]); + homa_qdisc_defer_tcp(q1, mock_tcp_skb(&self->addr, 5000, 1000)); + homa_qdisc_defer_tcp(q1, mock_tcp_skb(&self->addr, 6000, 1100)); + homa_qdisc_defer_tcp(q2, mock_tcp_skb(&self->addr, 7000, 1200)); + EXPECT_EQ(2, unit_list_length(&q2->qdev->tcp_qdiscs)); + self->homa.qshared->tcp_credit_increment = 1000; + + /* First call xmits packet from q1. */ + mock_clock = 11000; + EXPECT_EQ(1100, homa_qdisc_xmit_deferred_tcp(q2->qdev)); + EXPECT_EQ(2, unit_list_length(&q2->qdev->tcp_qdiscs)); + + /* Second call xmits packet from q2 and unlinks it. */ + mock_clock = 13000; + EXPECT_EQ(1300, homa_qdisc_xmit_deferred_tcp(q2->qdev)); + EXPECT_EQ(1, unit_list_length(&q2->qdev->tcp_qdiscs)); + EXPECT_FALSE(list_empty(&q1->defer_links)); + EXPECT_TRUE(list_empty(&q2->defer_links)); + + /* Third call xmits last packet from q1 and unlinks it. 
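	 * Once q1's queue empties, the tcp_qdiscs list becomes empty, so
	 * cur_tcp_qdisc is cleared and last_defer is reset to 0.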
*/ + mock_clock = 16000; + EXPECT_EQ(1200, homa_qdisc_xmit_deferred_tcp(q2->qdev)); + EXPECT_EQ(0, unit_list_length(&q2->qdev->tcp_qdiscs)); +} +TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__backlog_cycles_metric) +{ + struct homa_qdisc *q1; + + mock_clock = 10000; + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[1], NULL, NULL)); + q1 = qdisc_priv(self->qdiscs[1]); + homa_qdisc_defer_tcp(q1, mock_tcp_skb(&self->addr, 5000, 1000)); + homa_qdisc_defer_tcp(q1, mock_tcp_skb(&self->addr, 6000, 1100)); + homa_qdisc_defer_tcp(q1, mock_tcp_skb(&self->addr, 6000, 1200)); + + mock_clock = 11000; + EXPECT_EQ(1100, homa_qdisc_xmit_deferred_tcp(q1->qdev)); + EXPECT_EQ(0, homa_metrics_per_cpu()->nic_backlog_cycles); + mock_clock = 12000; + EXPECT_EQ(1200, homa_qdisc_xmit_deferred_tcp(q1->qdev)); + EXPECT_EQ(0, homa_metrics_per_cpu()->nic_backlog_cycles); + mock_clock = 13000; + EXPECT_EQ(1300, homa_qdisc_xmit_deferred_tcp(q1->qdev)); + EXPECT_EQ(3000, homa_metrics_per_cpu()->nic_backlog_cycles); +} + +TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__no_deferred_rpcs) { struct homa_qdisc_dev *qdev; qdev = homa_qdisc_qdev_get(self->dev); EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); - EXPECT_EQ(NULL, homa_qdisc_dequeue_homa(qdev)); + EXPECT_EQ(NULL, homa_qdisc_get_deferred_homa(qdev)); homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__multiple_packets_for_rpc) +TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__multiple_packets_for_rpc) { struct homa_qdisc_dev *qdev; struct homa_rpc *srpc; @@ -958,14 +1219,14 @@ TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__multiple_packets_for_rpc) log_deferred(qdev); EXPECT_STREQ("[id 1235, offsets 2000 3000 4000]", unit_log_get()); - EXPECT_EQ(skb, homa_qdisc_dequeue_homa(qdev)); + EXPECT_EQ(skb, homa_qdisc_get_deferred_homa(qdev)); unit_log_clear(); log_deferred(qdev); EXPECT_STREQ("[id 1235, offsets 3000 4000]", unit_log_get()); kfree_skb(skb); homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__last_packet_for_rpc) +TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__last_packet_for_rpc) { struct homa_rpc *srpc1, *srpc2; struct homa_qdisc_dev *qdev; @@ -992,14 +1253,14 @@ TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__last_packet_for_rpc) EXPECT_STREQ("[id 1235, offsets 5000]; [id 1237, offsets 2000 3000]", unit_log_get()); - EXPECT_EQ(skb, homa_qdisc_dequeue_homa(qdev)); + EXPECT_EQ(skb, homa_qdisc_get_deferred_homa(qdev)); unit_log_clear(); log_deferred(qdev); EXPECT_STREQ("[id 1237, offsets 2000 3000]", unit_log_get()); kfree_skb(skb); homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__update_tx_left) +TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__update_tx_left) { struct homa_qdisc_dev *qdev; struct homa_rpc *srpc; @@ -1015,16 +1276,16 @@ TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__update_tx_left) srpc->qrpc.tx_left = 6000; /* First packet doesn't update tx_left. */ - kfree_skb(homa_qdisc_dequeue_homa(qdev)); + kfree_skb(homa_qdisc_get_deferred_homa(qdev)); EXPECT_EQ(6000, srpc->qrpc.tx_left); /* Second packet does update tx_left. 
*/ - kfree_skb(homa_qdisc_dequeue_homa(qdev)); + kfree_skb(homa_qdisc_get_deferred_homa(qdev)); EXPECT_EQ(5500, srpc->qrpc.tx_left); homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__nic_backlog_cycles_metric) +TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__nic_backlog_cycles_metric) { struct homa_qdisc_dev *qdev; struct homa_rpc *srpc; @@ -1042,19 +1303,56 @@ TEST_F(homa_qdisc, homa_qdisc_dequeue_homa__nic_backlog_cycles_metric) EXPECT_EQ(5000, qdev->last_defer); mock_clock = 12000; - kfree_skb(homa_qdisc_dequeue_homa(qdev)); + kfree_skb(homa_qdisc_get_deferred_homa(qdev)); EXPECT_EQ(0, homa_metrics_per_cpu()->nic_backlog_cycles); EXPECT_EQ(5000, qdev->last_defer); EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); mock_clock = 14000; - kfree_skb(homa_qdisc_dequeue_homa(qdev)); + kfree_skb(homa_qdisc_get_deferred_homa(qdev)); EXPECT_EQ(9000, homa_metrics_per_cpu()->nic_backlog_cycles); EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); EXPECT_EQ(0, qdev->last_defer); homa_qdisc_qdev_put(qdev); } +TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_homa__no_packets_available) +{ + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->dev); + EXPECT_EQ(0, homa_qdisc_xmit_deferred_homa(qdev)); + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_homa__packet_available) +{ + struct homa_qdisc_dev *qdev; + struct homa_rpc *srpc; + u64 link_idle; + + mock_clock = 10000; + qdev = homa_qdisc_qdev_get(self->dev); + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc); + + link_idle = atomic64_read(&qdev->link_idle_time); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); + EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + qdev->pacer_qix = 3; + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + + mock_clock = 11000; + EXPECT_EQ(1100, homa_qdisc_xmit_deferred_homa(qdev)); + EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(1, self->qdiscs[3]->q.qlen); + EXPECT_LT(link_idle, atomic64_read(&qdev->link_idle_time)); + + homa_qdisc_qdev_put(qdev); +} + TEST_F(homa_qdisc, homa_qdisc_free_homa) { struct homa_qdisc_dev *qdev; @@ -1168,10 +1466,9 @@ TEST_F(homa_qdisc, homa_qdisc_pacer_main) homa_qdisc_pacer_main(qdev); EXPECT_EQ(1, self->qdiscs[3]->q.qlen); - EXPECT_EQ(1, homa_metrics_per_cpu()->pacer_packets); + EXPECT_EQ(1, homa_metrics_per_cpu()->pacer_homa_packets); EXPECT_EQ(0, exit_hook_count); - homa_qdisc_destroy(self->qdiscs[3]); homa_qdisc_qdev_put(qdev); } @@ -1204,7 +1501,48 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__spin_until_link_idle) EXPECT_EQ(1, self->qdiscs[3]->q.qlen); EXPECT_EQ(7000, xmit_clock); - homa_qdisc_destroy(self->qdiscs[3]); + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_pacer__return_after_one_packet) +{ + struct homa_rpc *srpc1, *srpc2; + struct homa_qdisc_dev *qdev; + struct sk_buff *skb; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc1); + srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 2, 10000, 10000); + ASSERT_NE(NULL, srpc2); + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + qdev->pacer_qix = 3; + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + + skb = new_test_skb(srpc1, &self->addr, 
5000, 1500); + homa_qdisc_defer_homa(qdev, skb); + skb = new_test_skb(srpc2, &self->addr, 4000, 1500); + homa_qdisc_defer_homa(qdev, skb); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1235, offsets 5000]; [id 1237, offsets 4000]", + unit_log_get()); + + mock_clock = atomic64_read(&qdev->link_idle_time); + self->homa.qshared->max_nic_queue_cycles = 100; + unit_log_clear(); + + homa_qdisc_pacer(qdev, false); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1237, offsets 4000]", unit_log_get()); + EXPECT_EQ(1, self->qdiscs[3]->q.qlen); + EXPECT_LT(mock_clock + 100, atomic64_read(&qdev->link_idle_time)); + homa_qdisc_qdev_put(qdev); } TEST_F(homa_qdisc, homa_qdisc_pacer__pacer_lock_unavailable) @@ -1233,27 +1571,25 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__pacer_lock_unavailable) EXPECT_EQ(0, self->qdiscs[3]->q.qlen); EXPECT_EQ(link_idle, atomic64_read(&qdev->link_idle_time)); - homa_qdisc_destroy(self->qdiscs[3]); homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_pacer__queue_empty) +TEST_F(homa_qdisc, homa_qdisc_pacer__no_deferred_packets) { struct homa_qdisc_dev *qdev; qdev = homa_qdisc_qdev_get(self->dev); - unit_log_clear(); + qdev->homa_credit = -1000; homa_qdisc_pacer(qdev, false); - EXPECT_STREQ("", unit_log_get()); EXPECT_EQ(0, atomic64_read(&qdev->link_idle_time)); + EXPECT_EQ(-1000, qdev->homa_credit); homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_pacer__enqueue_packet) +TEST_F(homa_qdisc, homa_qdisc_pacer__xmit_homa_packet_no_tcp) { struct homa_qdisc_dev *qdev; struct homa_rpc *srpc; - u64 link_idle; qdev = homa_qdisc_qdev_get(self->dev); srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, @@ -1261,68 +1597,113 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__enqueue_packet) self->server_id, 10000, 10000); ASSERT_NE(NULL, srpc); - link_idle = atomic64_read(&qdev->link_idle_time); homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); qdev->pacer_qix = 3; EXPECT_EQ(0, self->qdiscs[3]->q.qlen); - mock_clock_tick = 1000; - unit_log_clear(); + qdev->homa_credit = -100; + qdev->hnet->homa->qshared->homa_share = 40; homa_qdisc_pacer(qdev, false); EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); EXPECT_EQ(1, self->qdiscs[3]->q.qlen); - EXPECT_LT(link_idle, atomic64_read(&qdev->link_idle_time)); - EXPECT_EQ(1, homa_metrics_per_cpu()->pacer_packets); - EXPECT_EQ(1100, homa_metrics_per_cpu()->pacer_bytes); + EXPECT_EQ(-65999, qdev->homa_credit); + EXPECT_EQ(1, homa_metrics_per_cpu()->pacer_homa_packets); + EXPECT_EQ(1100, homa_metrics_per_cpu()->pacer_homa_bytes); EXPECT_EQ(0, homa_metrics_per_cpu()->pacer_help_bytes); - EXPECT_NE(0, homa_metrics_per_cpu()->pacer_xmit_cycles); - homa_qdisc_destroy(self->qdiscs[3]); homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_pacer__return_after_one_packet) +TEST_F(homa_qdisc, homa_qdisc_pacer__xmit_tcp_no_homa) { - struct homa_rpc *srpc1, *srpc2; struct homa_qdisc_dev *qdev; - struct sk_buff *skb; + struct homa_qdisc *q; qdev = homa_qdisc_qdev_get(self->dev); - srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); + q = qdisc_priv(self->qdiscs[2]); + + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1100)); + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1200)); + EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); + qdev->homa_credit = 1000; + qdev->hnet->homa->qshared->homa_share = 
40;
+
+	homa_qdisc_pacer(qdev, false);
+	EXPECT_FALSE(homa_qdisc_any_deferred(qdev));
+	EXPECT_EQ(2, self->qdiscs[2]->q.qlen);
+	EXPECT_EQ(52000, qdev->homa_credit);
+	EXPECT_EQ(2, homa_metrics_per_cpu()->pacer_tcp_packets);
+	EXPECT_EQ(2500, homa_metrics_per_cpu()->pacer_tcp_bytes);
+	EXPECT_EQ(0, homa_metrics_per_cpu()->pacer_help_bytes);
+
+	homa_qdisc_qdev_put(qdev);
+}
+TEST_F(homa_qdisc, homa_qdisc_pacer__both_protocols_have_packets_choose_tcp)
+{
+	struct homa_qdisc_dev *qdev;
+	struct homa_rpc *srpc;
+	struct homa_qdisc *q;
+
+	qdev = homa_qdisc_qdev_get(self->dev);
+	EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL));
+	q = qdisc_priv(self->qdiscs[2]);
+	EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL));
+	srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+			       &self->server_ip, self->client_port,
+			       self->server_id, 10000, 10000);
+	ASSERT_NE(NULL, srpc);
+
+	homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000));
+	homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1100));
+	EXPECT_TRUE(homa_qdisc_any_deferred(qdev));
+	qdev->pacer_qix = 3;
+	EXPECT_EQ(0, self->qdiscs[3]->q.qlen);
+	qdev->homa_credit = -100;
+	qdev->hnet->homa->qshared->homa_share = 40;
+
+	/* Arrange for the NIC queue to exceed its limit once the next
+	 * packet is transmitted.
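	 * With link_idle_time at 1000000 and the clock at 990100, the queue
	 * holds 9900 cycles of pending work, just under the 10000-cycle
	 * limit; sending the 1200-byte TCP packet pushes it past the limit,
	 * so the pacer stops after a single packet.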
+ */ + atomic64_set(&qdev->link_idle_time, 1000000); + qdev->hnet->homa->qshared->max_nic_queue_cycles = 10000; + mock_clock = 1000000 - 10000 + 100; - mock_clock = atomic64_read(&qdev->link_idle_time); - self->homa.qshared->max_nic_queue_cycles = 100; - unit_log_clear(); + homa_qdisc_pacer(qdev, false); + EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(1, self->qdiscs[2]->q.qlen); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + EXPECT_EQ(40*1200 - 100, qdev->homa_credit); + EXPECT_EQ(1, homa_metrics_per_cpu()->pacer_tcp_packets); + EXPECT_EQ(1200, homa_metrics_per_cpu()->pacer_tcp_bytes); + EXPECT_EQ(0, homa_metrics_per_cpu()->pacer_help_bytes); + + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_pacer__xmit_multiple_packets) +{ + struct homa_qdisc_dev *qdev; + struct homa_qdisc *q; + + qdev = homa_qdisc_qdev_get(self->dev); + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); + q = qdisc_priv(self->qdiscs[2]); + + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1100)); + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1200)); + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1300)); + EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); + qdev->hnet->homa->qshared->homa_share = 40; + qdev->hnet->homa->qshared->max_nic_queue_cycles = 100000; homa_qdisc_pacer(qdev, false); - unit_log_clear(); - log_deferred(qdev); - EXPECT_STREQ("[id 1237, offsets 4000]", unit_log_get()); - EXPECT_EQ(1, self->qdiscs[3]->q.qlen); - EXPECT_LT(mock_clock + 100, atomic64_read(&qdev->link_idle_time)); + EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(3, self->qdiscs[2]->q.qlen); + EXPECT_EQ(3, homa_metrics_per_cpu()->pacer_tcp_packets); + EXPECT_EQ(3900, homa_metrics_per_cpu()->pacer_tcp_bytes); + EXPECT_EQ(0, homa_metrics_per_cpu()->pacer_help_bytes); - homa_qdisc_destroy(self->qdiscs[3]); homa_qdisc_qdev_put(qdev); } TEST_F(homa_qdisc, homa_qdisc_pacer__pacer_help_bytes_metric) @@ -1345,11 +1726,10 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__pacer_help_bytes_metric) homa_qdisc_pacer(qdev, true); EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); - EXPECT_EQ(1, homa_metrics_per_cpu()->pacer_packets); - EXPECT_EQ(900, homa_metrics_per_cpu()->pacer_bytes); + EXPECT_EQ(1, homa_metrics_per_cpu()->pacer_homa_packets); + EXPECT_EQ(900, homa_metrics_per_cpu()->pacer_homa_bytes); EXPECT_EQ(900, homa_metrics_per_cpu()->pacer_help_bytes); - homa_qdisc_destroy(self->qdiscs[3]); homa_qdisc_qdev_put(qdev); } @@ -1372,9 +1752,6 @@ TEST_F(homa_qdisc, homa_qdisc_redirect_skb__use_pacer_qix) EXPECT_EQ(1, self->qdiscs[1]->q.qlen); EXPECT_EQ(0, self->qdiscs[3]->q.qlen); EXPECT_EQ(1, mock_netif_schedule_calls); - - homa_qdisc_destroy(self->qdiscs[1]); - homa_qdisc_destroy(self->qdiscs[3]); } TEST_F(homa_qdisc, homa_qdisc_redirect_skb__use_redirect_qix) { @@ -1394,9 +1771,6 @@ TEST_F(homa_qdisc, homa_qdisc_redirect_skb__use_redirect_qix) EXPECT_EQ(NET_XMIT_SUCCESS, status); EXPECT_EQ(0, self->qdiscs[1]->q.qlen); EXPECT_EQ(1, self->qdiscs[3]->q.qlen); - - homa_qdisc_destroy(self->qdiscs[1]); - homa_qdisc_destroy(self->qdiscs[3]); } TEST_F(homa_qdisc, homa_qdisc_redirect_skb__redirect_qix_invalid) { @@ -1418,9 +1792,6 @@ TEST_F(homa_qdisc, homa_qdisc_redirect_skb__redirect_qix_invalid) EXPECT_EQ(1, self->qdiscs[1]->q.qlen); EXPECT_EQ(0, qdev->pacer_qix); EXPECT_EQ(1, qdev->redirect_qix); - - for (i = 0; i < 4; i++) - homa_qdisc_destroy(self->qdiscs[i]); } TEST_F(homa_qdisc, homa_qdisc_redirect_skb__redirect_qix_not_a_homa_qdisc) { @@ -1443,9 +1814,6 @@ TEST_F(homa_qdisc, 
homa_qdisc_redirect_skb__redirect_qix_not_a_homa_qdisc) EXPECT_EQ(1, self->qdiscs[2]->q.qlen); EXPECT_EQ(1, qdev->pacer_qix); EXPECT_EQ(2, qdev->redirect_qix); - - for (i = 0; i < 4; i++) - homa_qdisc_destroy(self->qdiscs[i]); } TEST_F(homa_qdisc, homa_qdisc_redirect_skb__no_suitable_qdisc) { @@ -1469,9 +1837,6 @@ TEST_F(homa_qdisc, homa_qdisc_redirect_skb__no_suitable_qdisc) EXPECT_EQ(-1, qdev->pacer_qix); EXPECT_EQ(-1, qdev->redirect_qix); EXPECT_EQ(0, mock_netif_schedule_calls); - - for (i = 0; i < 4; i++) - homa_qdisc_destroy(self->qdiscs[i]); } TEST_F(homa_qdisc, homa_qdisc_pacer_check__enqueue_packet) @@ -1502,7 +1867,6 @@ TEST_F(homa_qdisc, homa_qdisc_pacer_check__enqueue_packet) EXPECT_EQ(1, self->qdiscs[3]->q.qlen); EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); - homa_qdisc_destroy(self->qdiscs[3]); homa_qdisc_qdev_put(qdev); homa_qdisc_qdev_put(qdev2); } @@ -1527,7 +1891,6 @@ TEST_F(homa_qdisc, homa_qdisc_pacer_check__no_deferred_rpcs) EXPECT_EQ(0, self->qdiscs[3]->q.qlen); EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); - homa_qdisc_destroy(self->qdiscs[3]); homa_qdisc_qdev_put(qdev); homa_qdisc_qdev_put(qdev2); } @@ -1555,7 +1918,6 @@ TEST_F(homa_qdisc, homa_qdisc_pacer_check__lag_not_long_enough) EXPECT_EQ(0, self->qdiscs[3]->q.qlen); EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); - homa_qdisc_destroy(self->qdiscs[3]); homa_qdisc_qdev_put(qdev); } @@ -1598,6 +1960,24 @@ TEST_F(homa_qdisc, homa_qdisc_update_sysctl_deps__max_nic_queue_cycles) homa_qdisc_update_sysctl_deps(self->homa.qshared); EXPECT_EQ(6000, self->homa.qshared->max_nic_queue_cycles); } +TEST_F(homa_qdisc, homa_qdisc_update_sysctl_deps__limit_homa_share) +{ + self->homa.qshared->homa_share = -1; + homa_qdisc_update_sysctl_deps(self->homa.qshared); + EXPECT_EQ(0, self->homa.qshared->homa_share); + + self->homa.qshared->homa_share = 0; + homa_qdisc_update_sysctl_deps(self->homa.qshared); + EXPECT_EQ(0, self->homa.qshared->homa_share); + + self->homa.qshared->homa_share = 100; + homa_qdisc_update_sysctl_deps(self->homa.qshared); + EXPECT_EQ(100, self->homa.qshared->homa_share); + + self->homa.qshared->homa_share = 101; + homa_qdisc_update_sysctl_deps(self->homa.qshared); + EXPECT_EQ(100, self->homa.qshared->homa_share); +} TEST_F(homa_qdisc, homa_qdisc_update_sysctl_deps__update_all_qdevs) { struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); diff --git a/util/metrics.py b/util/metrics.py index 678df556..ae9eac6b 100755 --- a/util/metrics.py +++ b/util/metrics.py @@ -402,26 +402,35 @@ def scale_number(number): print("GRO bypass for data packets: %5.1f%%" % (data_bypass_percent)) print("GRO bypass for grant packets: %5.1f%%" % (grant_bypass_percent)) - if deltas["pacer_packets"] != 0: + pacer_bytes = deltas["pacer_homa_bytes"] + deltas["pacer_tcp_bytes"] + if pacer_bytes != 0: print("\nPacer:") print("--------") - print("Packets sent: %5.3f M/sec (%.1f %% of all packets)" % - (1e-6*deltas["pacer_packets"]/elapsed_secs, - 100*deltas["pacer_packets"]/packets_sent)) - print("Throughput (including headers): %5.2f Gbps" % - (8e-9*deltas["pacer_bytes"]/elapsed_secs)) - print("Helper throughput: %5.2f Gbps (%.1f%% of all pacer bytes)" % + if packets_sent > 0: + print("Homa packets sent: %5.3f M/sec (%.1f %% of all Homa packets)" % + (1e-6*deltas["pacer_homa_packets"]/elapsed_secs, + 100*deltas["pacer_homa_packets"]/packets_sent)) + else: + print("Homa packets sent: 0.000 M/sec") + print("Homa throughput (inc. 
headers): %5.2f Gbps" % + (8e-9*deltas["pacer_homa_bytes"]/elapsed_secs)) + print("TCP packets sent: %5.3f M/sec" % + (1e-6*deltas["pacer_tcp_packets"]/elapsed_secs)) + print("TCP throughput (inc. headers): %5.2f Gbps" % + (8e-9*deltas["pacer_tcp_bytes"]/elapsed_secs)) + print("Helper throughput (Homa + TCP): %5.2f Gbps (%.1f%% of all pacer bytes)" % (8e-9*deltas["pacer_help_bytes"]/elapsed_secs, - 100*deltas["pacer_help_bytes"]/deltas["pacer_bytes"])) + 100*deltas["pacer_help_bytes"]/pacer_bytes)) backlog_secs = float(deltas["nic_backlog_cycles"])/(cpu_khz * 1000.0) print("Active throughput: %5.2f Gbps (NIC backlogged %.1f%% of time)" % ( - deltas["pacer_bytes"]*8e-09/backlog_secs, - 100*backlog_secs/elapsed_secs)) + pacer_bytes*8e-09/backlog_secs, 100*backlog_secs/elapsed_secs)) xmit_secs = float(deltas["pacer_xmit_cycles"])/(cpu_khz * 1000.0) print("Pacer thread duty cycle: %5.1f %%" % (100*deltas["pacer_cycles"]/time_delta)) print("Time xmitting packets: %5.1f %% (%.2f usecs/packet)" % - (100*xmit_secs/elapsed_secs, 1e6*xmit_secs/deltas["pacer_packets"])) + (100*xmit_secs/elapsed_secs, + 1e6*xmit_secs/(deltas["pacer_homa_packets"] + + deltas["pacer_tcp_packets"]))) print("\nMiscellaneous:") print("--------------") From aa41a15fa527cc15a2df30bc3ab836ba78aad4c3 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 20 Oct 2025 16:46:07 -0700 Subject: [PATCH 532/625] Move legend in RTT graphs from cp_vs_tcp --- util/cp_vs_tcp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/cp_vs_tcp b/util/cp_vs_tcp index e9372e59..fd3a72a6 100755 --- a/util/cp_vs_tcp +++ b/util/cp_vs_tcp @@ -143,7 +143,7 @@ for workload, bw, seconds in load_info: plot_histogram(ax, dctcp_exp, "p50", "DCTCP P50", color=dctcp_color2) plot_histogram(ax, homa_exp, "p99", "Homa P99", color=homa_color) plot_histogram(ax, homa_exp, "p50", "Homa P50", color=homa_color2) - ax.legend(loc="upper right", prop={'size': 9}) + ax.legend(loc="upper left", prop={'size': 9}) plt.tight_layout() plt.savefig("%s/reports/rtt_%s.pdf" % (options.log_dir, workload)) From 22d4ec01da1f98561eb83f76f307863c34c362fa Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 20 Oct 2025 17:28:43 -0700 Subject: [PATCH 533/625] Add qdisc_tcp_packets metric --- homa_metrics.c | 4 +++- homa_metrics.h | 7 +++++++ homa_qdisc.c | 1 + test/unit_homa_qdisc.c | 1 + util/metrics.py | 15 ++++++++++----- 5 files changed, 22 insertions(+), 6 deletions(-) diff --git a/homa_metrics.c b/homa_metrics.c index b041e5a8..ca2d91de 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -304,7 +304,9 @@ char *homa_metrics_print(void) M("pacer_tcp_bytes", m->pacer_tcp_bytes, "TCP bytes transmitted by the pacer (including headers)\n"); M("pacer_help_bytes", m->pacer_help_bytes, - "Bytes transmitted via homa_qdisc_pacer_check"); + "Bytes transmitted via homa_qdisc_pacer_check\n"); + M("qdisc_tcp_packets", m->qdisc_tcp_packets, + "TCP packets processed by homa_qdisc\n"); M("homa_cycles", m->softirq_cycles + m->napi_cycles + m->send_cycles + m->recv_cycles + diff --git a/homa_metrics.h b/homa_metrics.h index aeeb8986..c1a09c61 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -430,6 +430,13 @@ struct homa_metrics { */ u64 pacer_help_bytes; + /** + * @qdisc_tcp_packets: total number of TCP packets that passed through + * homa_qdisc; includes packets that were transmitted immediately as + * well as those that were deferred. + */ + u64 qdisc_tcp_packets; + /** * @resent_packets: total number of data packets issued in response to * RESEND packets. 
diff --git a/homa_qdisc.c b/homa_qdisc.c index f429fd62..88dd6c9f 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -448,6 +448,7 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, * we must defer short packets if there are other packets * already deferred for this qdisc. */ + INC_METRIC(qdisc_tcp_packets, 1); if (!list_empty(&q->defer_links)) { homa_qdisc_defer_tcp(q, skb); return NET_XMIT_SUCCESS; diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c index 2fc7882d..d862f5ad 100644 --- a/test/unit_homa_qdisc.c +++ b/test/unit_homa_qdisc.c @@ -564,6 +564,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__xmit_short_tcp_packet) EXPECT_EQ(1, qdisc->q.qlen); EXPECT_STREQ("", unit_log_get()); EXPECT_LT(1000000, atomic64_read(&q->qdev->link_idle_time)); + EXPECT_EQ(1, homa_metrics_per_cpu()->qdisc_tcp_packets); homa_qdisc_destroy(qdisc); kfree(qdisc); diff --git a/util/metrics.py b/util/metrics.py index ae9eac6b..5662e859 100755 --- a/util/metrics.py +++ b/util/metrics.py @@ -414,20 +414,25 @@ def scale_number(number): print("Homa packets sent: 0.000 M/sec") print("Homa throughput (inc. headers): %5.2f Gbps" % (8e-9*deltas["pacer_homa_bytes"]/elapsed_secs)) - print("TCP packets sent: %5.3f M/sec" % - (1e-6*deltas["pacer_tcp_packets"]/elapsed_secs)) + qdisc_tcp_packets = deltas["qdisc_tcp_packets"] + if qdisc_tcp_packets != 0: + print("TCP packets sent: %5.3f M/sec (%.1f %% of all TCP packets)" % + (1e-6*deltas["pacer_tcp_packets"]/elapsed_secs, + 100*deltas["pacer_tcp_packets"]/qdisc_tcp_packets)) + else: + print("TCP packets sent: 0.000 M/sec") print("TCP throughput (inc. headers): %5.2f Gbps" % (8e-9*deltas["pacer_tcp_bytes"]/elapsed_secs)) - print("Helper throughput (Homa + TCP): %5.2f Gbps (%.1f%% of all pacer bytes)" % + print("Helper throughput (Homa + TCP): %5.2f Gbps (%.1f%% of all pacer bytes)" % (8e-9*deltas["pacer_help_bytes"]/elapsed_secs, 100*deltas["pacer_help_bytes"]/pacer_bytes)) backlog_secs = float(deltas["nic_backlog_cycles"])/(cpu_khz * 1000.0) - print("Active throughput: %5.2f Gbps (NIC backlogged %.1f%% of time)" % ( + print("Active throughput: %5.2f Gbps (NIC backlogged %.1f%% of time)" % ( pacer_bytes*8e-09/backlog_secs, 100*backlog_secs/elapsed_secs)) xmit_secs = float(deltas["pacer_xmit_cycles"])/(cpu_khz * 1000.0) print("Pacer thread duty cycle: %5.1f %%" % (100*deltas["pacer_cycles"]/time_delta)) - print("Time xmitting packets: %5.1f %% (%.2f usecs/packet)" % + print("Time xmitting packets: %5.1f %% (%.2f usec/packet)" % (100*xmit_secs/elapsed_secs, 1e6*xmit_secs/(deltas["pacer_homa_packets"] + deltas["pacer_tcp_packets"]))) From 070b59075b1e778f771d4754d7069aeb09e377cc Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 20 Oct 2025 17:35:37 -0700 Subject: [PATCH 534/625] Add RTT latency plot to cp_both --- util/cp_both | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/util/cp_both b/util/cp_both index d2c71477..d01a278a 100755 --- a/util/cp_both +++ b/util/cp_both @@ -70,6 +70,18 @@ ax.legend(loc="upper right", prop={'size': 9}) plt.tight_layout() plt.savefig("%s/reports/both_%s.pdf" % (options.log_dir, options.workload)) +# Generate latency plot. 
+log("Generating RTT latency plot for %s" % (options.workload)) +ax = start_plot_vs_msg_length(title, [30, 30000], homa_exp, + y_label=r'RTT (µsec)') +plot_histogram(ax, tcp_exp, "p99", "TCP P99", color=tcp_color) +plot_histogram(ax, tcp_exp, "p50", "TCP P50", color=tcp_color2) +plot_histogram(ax, homa_exp, "p99", "Homa P99", color=homa_color) +plot_histogram(ax, homa_exp, "p50", "Homa P50", color=homa_color2) +ax.legend(loc="upper left", prop={'size': 9}) +plt.tight_layout() +plt.savefig("%s/reports/rtt_%s.pdf" % (options.log_dir, options.workload)) + # Generate CDF of small-message RTTs. log("Generating short message CDF for %s" % (options.workload)) homa_x, homa_y = get_short_cdf(homa_exp) From 04ef62f7e0464bb98779571aa78a8b99ff87b76f Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 21 Oct 2025 11:41:07 -0700 Subject: [PATCH 535/625] Fix bugs in unfreezing timetraces in cperf.py --- util/cperf.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/util/cperf.py b/util/cperf.py index 5d9ee820..03922b45 100644 --- a/util/cperf.py +++ b/util/cperf.py @@ -747,8 +747,9 @@ def run_experiment(name, clients, options): do_subprocess(["ssh", "node%d" % (id), "metrics.py"]) if not "no_rtt_files" in options: do_cmd("dump_times /dev/null %s" % (name), clients) - log("Unfreezing timetraces on %s" % (nodes)) - set_sysctl_parameter(".net.homa.action", "10", nodes) + if options.protocol == "homa" and options.tt_freeze: + log("Unfreezing timetraces on %s" % (nodes)) + set_sysctl_parameter(".net.homa.action", "10", nodes) do_cmd("log Starting measurements for %s experiment" % (name), server_nodes, clients) log("Starting measurements") @@ -925,6 +926,9 @@ def run_experiments(*args): vlog("Initializing metrics") do_ssh(["metrics.py > /dev/null"], homa_nodes) do_cmd("dump_times /dev/null", all_nodes) + if homa_nodes and exp.tt_freeze: + log("Unfreezing timetraces on %s" % (all_nodes)) + set_sysctl_parameter(".net.homa.action", "10", all_nodes) do_cmd("log Starting measurements", all_nodes) log("Starting measurements") From 51a3eca2ca84493198dfe7e8c8434ffe39718924 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 21 Oct 2025 15:54:31 -0700 Subject: [PATCH 536/625] Add buffers analyzer to tthoma.py --- util/tthoma.py | 143 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 142 insertions(+), 1 deletion(-) diff --git a/util/tthoma.py b/util/tthoma.py index 11c8d685..06d2da46 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -219,6 +219,7 @@ def __missing__(self, key): # softirq: Time when homa_softirq processed the packet # softirq_core: Core on which SoftIRQ processed the packet # tx_node: Node that sent grant (if known) +# rx_node: Node that received grant (if known) # id: Id of the RPC on the node that sent the grant # offset: Offset specified in the grant # increment: How much previously ungranted data is covered by this grant; @@ -497,6 +498,18 @@ def get_interval(node, usecs): return None return data[i] +def get_last_start(): + """ + Return the latest time at which any of the traces begins (i.e. the first + time that is present in all of the trace files). + """ + latest = -1e20 + for trace in traces.values(): + first = trace['first_time'] + if first > latest: + latest = first + return latest + def get_last_time(): """ Return the latest event time across all trace files. 
@@ -2049,6 +2062,132 @@ def output(self): print('%-10s %5d %6d' % (node, self.node_rpcs[node], self.node_bpages[node])) +#------------------------------------------------ +# Analyzer: buffers +#------------------------------------------------ +class AnalyzeBuffers: + """ + Estimates buffer occupancy in the egress ports between TOR and nodes + by computing one-way packet delays and assuming that additional time + beyond the minimum observed for a source-destination pair must be due + to queuing in the switch. Also displays minimum node-to-node latencies. + Uses the --gbps option. + """ + + def __init__(self, dispatcher): + dispatcher.interest('AnalyzePackets') + dispatcher.interest('AnalyzeIntervals') + + def output(self): + global grants, options, packets + nodes = get_sorted_nodes() + + # Node1 -> dictionary of: + # Node2 -> minimum observed latency (from nic to gro) of + # packets sent from Node1 to Node2. + latency = {} + for src in nodes: + latency[src] = {} + for dst in nodes: + latency[src][dst] = 1e20 + + # Compute (and output) minimum node-to-node latencies. + for pkt in packets.values(): + if not 'nic' in pkt or not 'gro' in pkt: + continue + delta = pkt['gro'] - pkt['nic'] + if delta < latency[pkt['tx_node']][pkt['rx_node']]: + latency[pkt['tx_node']][pkt['rx_node']] = delta + for pkt in grants.values(): + if not 'nic' in pkt or not 'gro' in pkt: + continue + delta = pkt['gro'] - pkt['nic'] + if delta < latency[pkt['tx_node']][pkt['rx_node']]: + latency[pkt['tx_node']][pkt['rx_node']] = delta + + print('\n-----------------') + print('Analyzer: buffers') + print('-----------------') + + print('\nMinimum one-way latency (microseconds) from when a packet ' + 'was passed') + print('to the NIC on a source node (rows) until it was received by ' + 'GRO on') + print('the destination node:') + print(' '*10, end='') + for dst in nodes: + print('%10s' % (dst), end='') + print('') + for src in nodes: + print('%-10s' % (src), end='') + for dst in nodes: + t = latency[src][dst] + if t >= 1e20: + print(' '*10, end='') + else: + print('%10.1f' % (latency[src][dst]), end='') + print('') + + # Augment the interval information with a new element max_delay + # which contains the highest incremental latency (beyond the + # node-to-node minimum) for any packet received in that interval. + for pkt in packets.values(): + if not 'nic' in pkt or not 'gro' in pkt: + continue + gro = pkt['gro'] + delay = (gro - pkt['nic']) - latency[pkt['tx_node']][pkt['rx_node']] + interval = get_interval(pkt['rx_node'], gro) + if not 'max_delay' in interval or interval['max_delay'] < delay: + interval['max_delay'] = delay + + # Print the interval information, converting delay into buffer + # space (assumes all incremental delays are caused by buffering + # at TOR downlinks). + print('\nEstimated buffer space occupied at TOR downlink ports for ' + 'each node, as') + print('a function of time. Buffer occupancy is estimated from the ' + 'largest packet ') + print('delay (beyond minimum times) observed during an interval. 
' + 'Assumes a network') + print('speed of %s Gbps' % (options.gbps)) + print('\nTime Total', end='') + for dst in nodes: + print('%10s' % (dst), end='') + print('') + t = get_last_start() + end = get_first_end() + totals = [] + per_node = [] + while t < end: + buffer_info = '' + total_kbytes = 0 + for node in nodes: + interval = get_interval(node, t) + if 'max_delay' in interval: + delay = interval['max_delay'] + kbytes_buffered = delay * options.gbps / 8 + total_kbytes += kbytes_buffered + buffer_info += '%10d' % (kbytes_buffered) + per_node.append(kbytes_buffered) + else: + buffer_info += ' N/A' + print('%8.1f %10d%s' % (interval['time'], total_kbytes, + buffer_info)) + totals.append(total_kbytes) + t += options.interval + + per_node.sort() + print("\nPer-node buffer utilization:") + print(" P50: %5d KB" % (per_node[len(per_node)//2])) + print(" P99: %5d KB" % (per_node[99*len(per_node)//100])) + print(" Maximum: %5d KB" % (per_node[-1])) + + totals.sort() + print("\nTotal buffer utilization:") + print(" P50: %5d KB" % (totals[len(totals)//2])) + print(" P99: %5d KB" % (totals[99*len(totals)//100])) + print(" Maximum: %5d KB" % (totals[-1])) + #------------------------------------------------ # Analyzer: copy #------------------------------------------------ @@ -6355,6 +6494,7 @@ def tt_gro_grant(self, trace, t, core, peer, id, offset, priority): rpcs[id]['gro_grant_pkts'].append(g) g['gro'] = t g['gro_core'] = core + g['rx_node'] = trace['node'] def tt_softirq_grant(self, trace, t, core, id, offset, priority, increment): global grants @@ -6364,6 +6504,7 @@ def tt_softirq_grant(self, trace, t, core, id, offset, priority, increment): g['softirq'] = t g['softirq_core'] = core g['increment'] = increment + g['rx_node'] = trace['node'] def analyze(self): """ @@ -8369,7 +8510,7 @@ def output(self): print('Summary statistics on delays related to outgoing packets:') print('Node: Name of node') print('Qid: Identifier of transmit queue') - print('TxQueue: Address of netdev_queue struct for Qid') + print('TxQueue: Address of netdev_queue struct for Qid') print('Tsos: Total number of TSO frames transmitted by node ' 'or queue') print('Segs: Total number of segments (packets received by GRO) ' From e54dd0ffd18eb93e46fb688748da9d53dd2ef9b6 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 22 Oct 2025 11:16:49 -0700 Subject: [PATCH 537/625] Add latency plot to cp_config --- util/cp_config | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/util/cp_config b/util/cp_config index 7641a677..4e3652fc 100755 --- a/util/cp_config +++ b/util/cp_config @@ -212,7 +212,7 @@ elif options.config == 'ports': }) elif options.config == 'prios': # Vary the number of available priority levels - for priority in [1, 2, 3, 4, 8]: + for priority in [1, 2, 3, 8]: specs.append({'exp_name': 'prios_%d' % (priority), 'label': '%d prios' % (priority), 'sysctl': ['.net.homa.num_priorities', priority]}) @@ -352,6 +352,23 @@ for workload, bw, seconds in load_info: plt.savefig("%s/reports/%s_%s.pdf" % (options.log_dir, options.config, workload)) + # Generate latency plot. 
+ log("Generating latency plot for %s" % (workload)) + title = "%s %d nodes, %.1f Gbps" % (workload.capitalize(), + options.num_nodes, bw) + ax = start_plot_vs_msg_length(title, [30, 30000], "%s_%s" % ( + specs[0]['exp_name'], workload), y_label=r'RTT (µsec)') + for spec in specs: + exp_name = "%s_%s" % (spec['exp_name'], workload) + plot_histogram(ax, exp_name, "p99", spec['label'] + ' P99') + for spec in specs: + exp_name = "%s_%s" % (spec['exp_name'], workload) + plot_histogram(ax, exp_name, "p50", spec['label'] + ' P50') + ax.legend(loc="upper left", prop={'size': 9}) + plt.tight_layout() + plt.savefig("%s/reports/%s_%s_rtt.pdf" % + (options.log_dir, options.config, workload)) + # Generate CDF of small-message RTTs. log("Generating short message CDFs for %s" % (workload)) title = "%s %d nodes" % (workload.capitalize(), options.num_nodes) From 245a2b0ebe6287430c00ab576ab3580baa9b5246 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 23 Oct 2025 13:18:47 -0700 Subject: [PATCH 538/625] Rename buffers analyzer to qdelay in tthoma.py Remove info on buffer usage (it isn't accurate in the presence of priorities), add scatter plots. --- util/tthoma.py | 344 +++++++++++++++++++++++++++++++------------------ 1 file changed, 216 insertions(+), 128 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index 06d2da46..f33e1fe1 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -12,6 +12,8 @@ from functools import cmp_to_key from glob import glob from itertools import count +import matplotlib +import matplotlib.pyplot as plt from optparse import OptionParser import math from operator import itemgetter @@ -2062,132 +2064,6 @@ def output(self): print('%-10s %5d %6d' % (node, self.node_rpcs[node], self.node_bpages[node])) -#------------------------------------------------ -# Analyzer: buffers -#------------------------------------------------ -class AnalyzeBuffers: - """ - Estimates buffer occupancy in the egress ports between TOR and nodes - by computing one-way packet delays and assuming that additional time - beyond the minimum observed for a source-destination pair must be due - to queuing in the switch. Also displays minimum node-to-node latencies. - Uses the --gbps option. - """ - - def __init__(self, dispatcher): - dispatcher.interest('AnalyzePackets') - dispatcher.interest('AnalyzeIntervals') - - def output(self): - global grants, options, packets - nodes = get_sorted_nodes() - - # Node1 -> dictionary of: - # Node2 -> minimum observed latency (from nic to gro) of - # packets sent from Node1 to Node2. - latency = {} - for src in nodes: - latency[src] = {} - for dst in nodes: - latency[src][dst] = 1e20 - - # Compute (and output) minimum node-to-node latencies. 
- for pkt in packets.values(): - if not 'nic' in pkt or not 'gro' in pkt: - continue - delta = pkt['gro'] - pkt['nic'] - if delta < latency[pkt['tx_node']][pkt['rx_node']]: - latency[pkt['tx_node']][pkt['rx_node']] = delta - for pkt in grants.values(): - if not 'nic' in pkt or not 'gro' in pkt: - continue - delta = pkt['gro'] - pkt['nic'] - if delta < latency[pkt['tx_node']][pkt['rx_node']]: - latency[pkt['tx_node']][pkt['rx_node']] = delta - - print('\n-----------------') - print('Analyzer: buffers') - print('-----------------') - - print('\nMinimum one-way latency (microseconds) from when a packet ' - 'was passed') - print('to the NIC on a source node (rows) until it was received by ' - 'GRO on') - print('the destination node:') - print(' '*10, end='') - for dst in nodes: - print('%10s' % (dst), end='') - print('') - for src in nodes: - print('%-10s' % (src), end='') - for dst in nodes: - t = latency[src][dst] - if t >= 1e20: - print(' '*10, end='') - else: - print('%10.1f' % (latency[src][dst]), end='') - print('') - - # Augment the interval information with a new element max_delay - # which contains the highest incremental latency (beyond the - # node-to-node minimum) for any packet received in that interval. - for pkt in packets.values(): - if not 'nic' in pkt or not 'gro' in pkt: - continue - gro = pkt['gro'] - delay = (gro - pkt['nic']) - latency[pkt['tx_node']][pkt['rx_node']] - interval = get_interval(pkt['rx_node'], gro) - if not 'max_delay' in interval or interval['max_delay'] < delay: - interval['max_delay'] = delay - - # Print the interval information, converting delay into buffer - # space (assumes all incremental delays are caused by buffering - # at TOR downlinks). - print('\nEstimated buffer space occupied at TOR downlink ports for ' - 'each node, as') - print('a function of time. Buffer occupancy is estimated from the ' - 'largest packet ') - print('delay (beyond minimum times) observed during an interval. 
' - 'Assumes a network') - print('speed of %s Gbps' % (options.gbps)) - print('\nTime Total', end='') - for dst in nodes: - print('%10s' % (dst), end='') - print('') - t = get_last_start() - end = get_first_end() - totals = [] - per_node = [] - while t < end: - buffer_info = '' - total_kbytes = 0 - for node in nodes: - interval = get_interval(node, t) - if 'max_delay' in interval: - delay = interval['max_delay'] - kbytes_buffered = delay * options.gbps / 8 - total_kbytes += kbytes_buffered - buffer_info += '%10d' % (kbytes_buffered) - per_node.append(kbytes_buffered) - else: - buffer_info += ' N/A' - print('%8.1f %10d%s' % (interval['time'], total_kbytes, - buffer_info)) - totals.append(total_kbytes) - t += options.interval - - per_node.sort() - print("\nPer-node buffer utilization:") - print(" P50: %5d KB" % (per_node[len(per_node)//2])) - print(" P99: %5d KB" % (per_node[99*len(per_node)//100])) - print(" Maximum: %5d KB" % (per_node[-1])) - - totals.sort() - print("\nTotal buffer utilization:") - print(" P50: %5d KB" % (totals[len(totals)//2])) - print(" P99: %5d KB" % (totals[99*len(totals)//100])) - print(" Maximum: %5d KB" % (totals[-1])) - #------------------------------------------------ # Analyzer: copy #------------------------------------------------ @@ -2439,7 +2315,6 @@ class AnalyzeCore: """ def __init__(self, dispatcher): - global options require_options('core', 'data', 'node') # List of all intervals over the life of the trace, each list entry @@ -6682,6 +6557,213 @@ def output(self): info['backlog'], len(delays))) first = False +#------------------------------------------------ +# Analyzer: qdelay +#------------------------------------------------ +class AnalyzeQdelay: + """ + Generates scatter plots that show the queuing delay for each packet + sent from (or received by) a given node. Queuing delay is the + actual latency for a packet (from when packet is queued for the NIC + until it is processed by GRO) minus the smallest latency observed for + the same source-destination pair. Queuing delay can be caused by either + queuing in the network or delays in invoking the GRO handler on the + destination. Requires the --plot option. + """ + + def __init__(self, dispatcher): + require_options('qdelay', 'plot') + dispatcher.interest('AnalyzePackets') + dispatcher.interest('AnalyzeRpcs') + dispatcher.interest('AnalyzeIntervals') + + def init_qdelay_axis(self, ax, title, x_min, x_max, max_qdelay, size=10): + """ + Initializes a pyplot axis that will be used for a scatter plot of + queuing delay for each packet over time. + + ax: Axis to initialize + title: Title for the plot; may be empty + x_min: Lowest value for x-axis (usecs) + x_max: Highest value for x-axis (usecs) + max_qdelay: Largest value that will be displayed as y (queuing + delay in usecs). + size: Size to use for fonts + figsize: Dimensions of plot + """ + global options + + if title != "": + ax.set_title(title, size=size) + ax.set_xlim(x_min, x_max) + ax.set_ylim(2, max_qdelay) + ax.set_yscale("log") + ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: '%d' % (x))) + ax.tick_params(right=True, which='both', direction='in', length=5) + ax.set_xlabel('Time of Packet Tx (μsec)', size=size) + ax.set_ylabel('Queuing Delay (μsec)', size=size) + + return ax + + def output(self): + global grants, options, packets, rpcs + nodes = get_sorted_nodes() + + # Node1 -> dictionary of: + # Node2 -> minimum observed latency (from nic to gro) of + # packets sent from Node1 to Node2. 
+ latency = {} + for src in nodes: + latency[src] = {} + for dst in nodes: + latency[src][dst] = 1e20 + + # Node -> for all of the data packets + # received by the node. Time is a list of packet GRO times, qdelays + # is a list of corresponding queuing delays, and colors is a list + # of colors to use for each point. + rx_delays = defaultdict(lambda: [[], [], []]) + + # Same as rx_delays except that packets are those transmitted by + # the node and times are NIC times. + tx_delays = defaultdict(lambda: [[], [], []]) + + # Compute (and output) minimum node-to-node latencies. + for pkt in packets.values(): + if not 'nic' in pkt or not 'gro' in pkt: + continue + delta = pkt['gro'] - pkt['nic'] + if delta < latency[pkt['tx_node']][pkt['rx_node']]: + latency[pkt['tx_node']][pkt['rx_node']] = delta + for pkt in grants.values(): + if not 'nic' in pkt or not 'gro' in pkt: + continue + delta = pkt['gro'] - pkt['nic'] + if delta < latency[pkt['tx_node']][pkt['rx_node']]: + latency[pkt['tx_node']][pkt['rx_node']] = delta + + print('\n-----------------') + print('Analyzer: qdelay') + print('-----------------') + + print('See graphs qdelay_tx.pdf and qdelay_rx.pdf in %s' + % (options.plot)) + + print('\nMinimum one-way latency (microseconds) from when a packet ' + 'was passed') + print('to the NIC on a source node (rows) until it was received by ' + 'GRO on') + print('the destination node:') + print(' '*10, end='') + for dst in nodes: + print('%10s' % (dst), end='') + print('') + for src in nodes: + line = '%-10s' % (src) + for dst in nodes: + t = latency[src][dst] + if t >= 1e20: + line += ' '*10 + else: + line += '%10.1f' % (latency[src][dst]) + print(line.rstrip()) + + # Augment the interval information with a new element max_delay + # which contains the highest incremental latency (beyond the + # node-to-node minimum) for any packet received in that interval. + # Also collect data for the scatter plots + overall_max_delay = 0 + for pkt_type, pkts in [['data', packets.values()], + ['grant', grants.values()]]: + for pkt in pkts: + if not 'nic' in pkt or not 'gro' in pkt: + continue + gro = pkt['gro'] + nic = pkt['nic'] + tx_node = pkt['tx_node'] + rx_node = pkt['rx_node'] + delay = (gro - nic) - latency[tx_node][rx_node] + interval = get_interval(rx_node, gro) + if not 'max_delay' in interval or interval['max_delay'] < delay: + interval['max_delay'] = delay + if pkt_type == 'grant': + color = '#844F1A' + else: + rpc = rpcs[pkt['id']] + if rpc['out_length'] < 1000: + color = '#c00000' + else: + color = '#1f77b4' + tx_delays[tx_node][0].append(nic) + tx_delays[tx_node][1].append(delay) + tx_delays[tx_node][2].append(color) + rx_delays[rx_node][0].append(nic) + rx_delays[rx_node][1].append(delay) + rx_delays[rx_node][2].append(color) + if delay > overall_max_delay: + overall_max_delay = delay + + # Print maximum queuing delay for each node and interval. 
+ print('\nLargest queuing delay (μsecs) for incoming packets on each ' + 'node, over') + print('%d μsec intervals:' % (options.interval) ) + print('Time ', end='') + for dst in nodes: + print('%10s' % (dst), end='') + print('') + t = options.interval * math.floor(get_last_start()/options.interval) + end = get_first_end() + while t < end: + buffer_info = '' + for node in nodes: + interval = get_interval(node, t) + if interval and 'max_delay' in interval: + buffer_info += '%10.1f' % (interval['max_delay']) + else: + buffer_info += ' '*10 + print(('%8.1f %s' % (t, buffer_info)).rstrip()) + t += options.interval + + # Generate plots + legend_handles = [ + matplotlib.lines.Line2D([], [], color=c, marker='o', + linestyle='None', markersize=8, label=label) + for c, label in [['#c00000', 'Data (messages < 1000B)'], + ['#1f77b4', 'Data (other messages)'], + ['#844F1A', 'Grants']] + ] + x_min = get_last_start() + x_max = get_first_end() + fig, axes = plt.subplots(nrows=len(nodes), ncols=1, sharex=False, + figsize=[8, len(nodes)*2]) + for i in range(len(nodes)): + node = nodes[i] + ax = axes[i] + self.init_qdelay_axis(ax, + 'Incoming Packets on %s' % node, + x_min, x_max, overall_max_delay) + ax.scatter(rx_delays[node][0], rx_delays[node][1], + marker='o', s=1, c=rx_delays[node][2]) + fig.legend(handles=legend_handles, loc='lower center', ncol=3, + bbox_to_anchor=(0.5, -0.03), frameon=False) + plt.tight_layout() + plt.savefig("%s/qdelay_rx.pdf" % (options.plot), bbox_inches='tight') + + fig, axes = plt.subplots(nrows=len(nodes), ncols=1, sharex=False, + figsize=[8, len(nodes)*2]) + for i in range(len(nodes)): + node = nodes[i] + ax = axes[i] + self.init_qdelay_axis(ax, + 'Outgoing Packets from %s' % node, + x_min, x_max, overall_max_delay) + ax.scatter(tx_delays[node][0], tx_delays[node][1], + marker='o', s=1, c=tx_delays[node][2]) + fig.legend(handles=legend_handles, loc='lower left', ncol=3, + bbox_to_anchor=(0.5, -0.03), frameon=False) + plt.tight_layout() + plt.savefig("%s/qdelay_tx.pdf" % (options.plot), bbox_inches='tight') + #------------------------------------------------ # Analyzer: rpcs #------------------------------------------------ @@ -7413,7 +7495,7 @@ def output(self): print('\n-----------------') - print('Analyzer: nicbufs') + print('Analyzer: rxbufs') print('-----------------') print('Maximum active NIC buffer space used for each GRO core over the') print('life of the traces (assuming Mellanox mlx5 buffer cache):') @@ -8957,6 +9039,10 @@ def output(self): 'odd means response) and OFF is an offset in the message; if this ' 'option is specified, some analyzers will output information specific ' 'to that packet.') +parser.add_option('--plot', '-p', dest='plot', default=None, + metavar='DIR', help='Some analyzers can generate data plots, but ' + 'they will do so only if this option is specified; DIR gives the ' + 'directory in which to place plots.') parser.add_option('--rx-core', dest='rx_core', type=int, default=None, metavar='C', help='If specified, some analyzers will ignore packets ' 'transmitted from cores other than C') @@ -9008,6 +9094,8 @@ def output(self): exit(1) if options.data: os.makedirs(options.data, exist_ok=True) +if options.plot: + os.makedirs(options.plot, exist_ok=True) if options.pkt: match = re.match('([0-9]+):([0-9]+)$', options.pkt) if not match: From 4afbd94c496b0c8d8d8fdab89e206487017ddff2 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 23 Oct 2025 14:09:09 -0700 Subject: [PATCH 539/625] Add minlatency analyzer to tthoma.py Pull code out of qdelay 
analyzer. --- util/tthoma.py | 155 ++++++++++++++++++++++++------------------------- 1 file changed, 75 insertions(+), 80 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index f33e1fe1..db368e22 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -11,7 +11,7 @@ from collections import defaultdict from functools import cmp_to_key from glob import glob -from itertools import count +import itertools import matplotlib import matplotlib.pyplot as plt from optparse import OptionParser @@ -319,6 +319,14 @@ def __missing__(self, key): # as of the end of the interval intervals = None +# Node (src) -> dictionary of +# Node(dst) -> minimum observed latency (from packet "nic" to "gro") +# among all data and grant packets sent from src to dst. +# A value of math.inf means there were no packets between +# the hosts. +# This structure is created only if the "minlatency" analyzer is active. +min_latency = {} + # Dispatcher used to parse the traces. dispatcher = None @@ -4728,7 +4736,7 @@ def analyze(self): # for the result. next = [1] * len(nodes) self.intervals = [] - for t in count(start, interval): + for t in itertools.count(start, interval): if t > end: break indices = [] @@ -5188,6 +5196,62 @@ def output(self): print('%9s %-10s %-10s %4s %12s %6d' % (xmit_info, tx_node, rx_node, core_info, id_info, pkt['offset'])) +#------------------------------------------------ +# Analyzer: minlatency +#------------------------------------------------ +class AnalyzeMinlatency: + """ + Analyzes packet information to compute the minimum one-way latency + between each pair of nodes. + """ + + def __init__(self, dispatcher): + dispatcher.interest('AnalyzePackets') + + def analyze(self): + global grants, min_latency, packets + + nodes = get_sorted_nodes() + for src in nodes: + min_latency[src] = {} + for dst in nodes: + min_latency[src][dst] = math.inf + + for pkt in itertools.chain(packets.values(), grants.values()): + if not 'nic' in pkt or not 'gro' in pkt: + continue + delta = pkt['gro'] - pkt['nic'] + if delta < min_latency[pkt['tx_node']][pkt['rx_node']]: + min_latency[pkt['tx_node']][pkt['rx_node']] = delta + + def output(self): + global min_latency + + print('\n--------------------') + print('Analyzer: minlatency') + print('--------------------') + + print('\nMinimum one-way latency (microseconds) from when a packet ' + 'was queued') + print('for the NIC on a source node (rows) until it was received by ' + 'GRO on') + print('the destination node:') + + nodes = get_sorted_nodes() + print(' '*10, end='') + for dst in nodes: + print('%10s' % (dst), end='') + print('') + for src in nodes: + line = '%-10s' % (src) + for dst in nodes: + t = min_latency[src][dst] + if t == math.inf: + line += ' '*10 + else: + line += '%10.1f' % (t) + print(line.rstrip()) + #------------------------------------------------ # Analyzer: msgrange #------------------------------------------------ @@ -6575,6 +6639,7 @@ def __init__(self, dispatcher): require_options('qdelay', 'plot') dispatcher.interest('AnalyzePackets') dispatcher.interest('AnalyzeRpcs') + dispatcher.interest('AnalyzeMinlatency') dispatcher.interest('AnalyzeIntervals') def init_qdelay_axis(self, ax, title, x_min, x_max, max_qdelay, size=10): @@ -6589,7 +6654,6 @@ def init_qdelay_axis(self, ax, title, x_min, x_max, max_qdelay, size=10): max_qdelay: Largest value that will be displayed as y (queuing delay in usecs). 
size: Size to use for fonts - figsize: Dimensions of plot """ global options @@ -6609,15 +6673,6 @@ def output(self): global grants, options, packets, rpcs nodes = get_sorted_nodes() - # Node1 -> dictionary of: - # Node2 -> minimum observed latency (from nic to gro) of - # packets sent from Node1 to Node2. - latency = {} - for src in nodes: - latency[src] = {} - for dst in nodes: - latency[src][dst] = 1e20 - # Node -> for all of the data packets # received by the node. Time is a list of packet GRO times, qdelays # is a list of corresponding queuing delays, and colors is a list @@ -6628,20 +6683,6 @@ def output(self): # the node and times are NIC times. tx_delays = defaultdict(lambda: [[], [], []]) - # Compute (and output) minimum node-to-node latencies. - for pkt in packets.values(): - if not 'nic' in pkt or not 'gro' in pkt: - continue - delta = pkt['gro'] - pkt['nic'] - if delta < latency[pkt['tx_node']][pkt['rx_node']]: - latency[pkt['tx_node']][pkt['rx_node']] = delta - for pkt in grants.values(): - if not 'nic' in pkt or not 'gro' in pkt: - continue - delta = pkt['gro'] - pkt['nic'] - if delta < latency[pkt['tx_node']][pkt['rx_node']]: - latency[pkt['tx_node']][pkt['rx_node']] = delta - print('\n-----------------') print('Analyzer: qdelay') print('-----------------') @@ -6649,29 +6690,7 @@ def output(self): print('See graphs qdelay_tx.pdf and qdelay_rx.pdf in %s' % (options.plot)) - print('\nMinimum one-way latency (microseconds) from when a packet ' - 'was passed') - print('to the NIC on a source node (rows) until it was received by ' - 'GRO on') - print('the destination node:') - print(' '*10, end='') - for dst in nodes: - print('%10s' % (dst), end='') - print('') - for src in nodes: - line = '%-10s' % (src) - for dst in nodes: - t = latency[src][dst] - if t >= 1e20: - line += ' '*10 - else: - line += '%10.1f' % (latency[src][dst]) - print(line.rstrip()) - - # Augment the interval information with a new element max_delay - # which contains the highest incremental latency (beyond the - # node-to-node minimum) for any packet received in that interval. - # Also collect data for the scatter plots + # Collect data for the scatter plots. overall_max_delay = 0 for pkt_type, pkts in [['data', packets.values()], ['grant', grants.values()]]: @@ -6682,10 +6701,7 @@ def output(self): nic = pkt['nic'] tx_node = pkt['tx_node'] rx_node = pkt['rx_node'] - delay = (gro - nic) - latency[tx_node][rx_node] - interval = get_interval(rx_node, gro) - if not 'max_delay' in interval or interval['max_delay'] < delay: - interval['max_delay'] = delay + qdelay = (gro - nic) - min_latency[tx_node][rx_node] if pkt_type == 'grant': color = '#844F1A' else: @@ -6695,36 +6711,15 @@ def output(self): else: color = '#1f77b4' tx_delays[tx_node][0].append(nic) - tx_delays[tx_node][1].append(delay) + tx_delays[tx_node][1].append(qdelay) tx_delays[tx_node][2].append(color) rx_delays[rx_node][0].append(nic) - rx_delays[rx_node][1].append(delay) + rx_delays[rx_node][1].append(qdelay) rx_delays[rx_node][2].append(color) - if delay > overall_max_delay: - overall_max_delay = delay - - # Print maximum queuing delay for each node and interval. 
- print('\nLargest queuing delay (μsecs) for incoming packets on each ' - 'node, over') - print('%d μsec intervals:' % (options.interval) ) - print('Time ', end='') - for dst in nodes: - print('%10s' % (dst), end='') - print('') - t = options.interval * math.floor(get_last_start()/options.interval) - end = get_first_end() - while t < end: - buffer_info = '' - for node in nodes: - interval = get_interval(node, t) - if interval and 'max_delay' in interval: - buffer_info += '%10.1f' % (interval['max_delay']) - else: - buffer_info += ' '*10 - print(('%8.1f %s' % (t, buffer_info)).rstrip()) - t += options.interval + if qdelay > overall_max_delay: + overall_max_delay = qdelay - # Generate plots + # Generate scatter plots legend_handles = [ matplotlib.lines.Line2D([], [], color=c, marker='o', linestyle='None', markersize=8, label=label) @@ -6759,7 +6754,7 @@ def output(self): x_min, x_max, overall_max_delay) ax.scatter(tx_delays[node][0], tx_delays[node][1], marker='o', s=1, c=tx_delays[node][2]) - fig.legend(handles=legend_handles, loc='lower left', ncol=3, + fig.legend(handles=legend_handles, loc='lower center', ncol=3, bbox_to_anchor=(0.5, -0.03), frameon=False) plt.tight_layout() plt.savefig("%s/qdelay_tx.pdf" % (options.plot), bbox_inches='tight') From e90b2e82a5936700f7c47f9f425a0c4d64613b02 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 27 Oct 2025 09:34:08 -0700 Subject: [PATCH 540/625] Update notes.txt --- notes.txt | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/notes.txt b/notes.txt index badfc361..0ce2e6e1 100755 --- a/notes.txt +++ b/notes.txt @@ -1,6 +1,11 @@ Notes for Homa implementation in Linux: --------------------------------------- +* Performance problems to track down: + * On c6620 cluster, Homa is getting RPC timeouts on w4 at 'b 80' and '-b 100' + * On xl170s, both TCP and Homa run slower with qdisc than pacer (P99 for + TCP small packets increases by 50%) + * Move interest cleanup code from homa_sock to a new function in homa_interest. Also move wakeup code from homa_rpc_handoff. @@ -36,6 +41,21 @@ Notes for Homa implementation in Linux: traffic? * Also consider the amount of data that is "stuck" in the NIC? +* Consider eliminating SoftIRQ: process packets completely at NAPI level? + * This eliminates the latency and cache overheads of switching cores + for SoftIRQ + * Should also help with tail latency: eliminates one opportunity for + hot-spots + * Load balancing should still be fine (especially if port number is used + for packet spraying) + * Or, always do SoftIRQ processing on same node as NAPI? + +* Eliminate use of link_mbps in homa_grant.c; perhaps replace with + configuration parameter fifo_mbps? Maybe the grant mechanism needs + to be net_device-specific? + +* Eliminate HOMA_FLAG_DONT_THROTTLE + * Optimizations for skb freeing: * In GRO, merge page frags out of skbs and return skbs to napi with napi_reuse_skb (return GRO_MERGED_FREE?). 
See also napi_get_frags (used From 86bc938788e7b293bdfd57383d942ab01625a3da Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 27 Oct 2025 09:34:40 -0700 Subject: [PATCH 541/625] Print size of grant headers when Homa initializes --- homa_plumbing.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index 92cf022b..2905631e 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -522,9 +522,10 @@ int __init homa_load(void) #ifndef __UPSTREAM__ /* See strip.py */ pr_err("Homa module loading\n"); - pr_notice("Homa structure sizes: homa_data_hdr %lu, homa_seg_hdr %lu, ack %lu, peer %lu, ip_hdr %lu flowi %lu ipv6_hdr %lu, flowi6 %lu tcp_sock %lu homa_rpc %lu sk_buff %lu skb_shared_info %lu rcvmsg_control %lu union sockaddr_in_union %lu HOMA_MAX_BPAGES %u NR_CPUS %u nr_cpu_ids %u, MAX_NUMNODES %d\n", + pr_notice("Homa structure sizes: homa_data_hdr %lu, homa_seg_hdr %lu, homa_grant_hdr %lu, ack %lu, peer %lu, ip_hdr %lu flowi %lu ipv6_hdr %lu, flowi6 %lu tcp_sock %lu homa_rpc %lu sk_buff %lu skb_shared_info %lu rcvmsg_control %lu union sockaddr_in_union %lu HOMA_MAX_BPAGES %u NR_CPUS %u nr_cpu_ids %u, MAX_NUMNODES %d\n", sizeof(struct homa_data_hdr), sizeof(struct homa_seg_hdr), + sizeof(struct homa_grant_hdr), sizeof(struct homa_ack), sizeof(struct homa_peer), sizeof(struct iphdr), From 87c6464920ccc219d484e3bbce0031705c738e91 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 27 Oct 2025 10:22:39 -0700 Subject: [PATCH 542/625] Minor documentation improvement in switch.py --- cloudlab/config_switch | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cloudlab/config_switch b/cloudlab/config_switch index f966c3c6..e801f8cb 100755 --- a/cloudlab/config_switch +++ b/cloudlab/config_switch @@ -13,8 +13,7 @@ import sys # Ports to configure -nodes = [81, 82, 83, 84, 85, 87, 88, 89, 91, 93, 94, 95, 96, 98, 100, 102, - 104, 105, 106, 107, 108, 110, 111, 112, 114, 116, 117, 118, 120] +nodes = [5, 20, 32, 34, 35, 38] ports = [] prev_switch = -1 for node in nodes: From b1873a81daff032a09515d56f60f8d6286f1dd40 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 28 Oct 2025 09:28:12 -0700 Subject: [PATCH 543/625] Update cloudlab/bin/config for c6620 cluster at CloudLab * Set hijack_tcp for certain NIC types. * Ignore case differences in cpu type --- cloudlab/bin/config | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/cloudlab/bin/config b/cloudlab/bin/config index 423c7d23..388a46d7 100755 --- a/cloudlab/bin/config +++ b/cloudlab/bin/config @@ -32,6 +32,9 @@ cpu_type = None # Cached result of get_link_speed (integer Mbits/sec.) link_mbps = None +# Cached result of get_nic_type. +nic_type = None + # Number of this node (e.g. 1 for "node1"). Set by get_node_num. node_num = None @@ -202,6 +205,25 @@ def get_nic_irqs(): f.close() return result +def get_nic_type(): + """ + Returns the type of driver for the primary NIC, such as "ice" for + the Intel driver. 
+ """ + + global nic_type + if nic_type != None: + return nic_type + info = exec_cmd(["sudo", "ethtool", "-i", get_interfaces()[0]], + stdout=subprocess.PIPE, encoding="utf-8", check=True).stdout + match = re.search(r'.*driver:\s*(\S+)', info, re.MULTILINE) + if not match: + raise Exception("Couldn't identify NIC type (no 'driver' info in " + "ethtool output") + nic_type = match.group(1) + print("Driver type for primary network interface is %s" % (nic_type)) + return nic_type + def get_node_num(): """ Returns the (integer) number of this node (e.g., 1 for "node1"). @@ -469,6 +491,10 @@ def config_homa(mod): raise Exception("Can't configure Homa: no config info available " "for link speed %d Mbps" % (link_mbps)) + if get_nic_type() == "ice": + print("Enabling TCP hijacking") + set_sysctl("hijack_tcp", 1) + def config_ecn_threshold(kb): """ Modify the configuration of this experiment's egress ports at the @@ -587,7 +613,7 @@ def config_nic(): interface = get_interfaces()[0] # Use a separate ethtool command for each paramemeter. Otherwise, - # if one parametere isn't supported the command will be aborted, + # if one parameter isn't supported the command will be aborted, # so no parameters will get set. exec_cmd(["sudo", "ethtool", "-C", interface, "adaptive-rx", "off"], check=False) @@ -598,7 +624,7 @@ def config_power(): """ Configure the machine's power management for best Homa performance. """ - if "Intel" in get_cpu_type(): + if "intel" in get_cpu_type().lower(): # For Intel processors, it's best to leave C-states enabled. This # can cause CPUs to sleep in power-saving mode, but if C-states # are disabled, then so is Turbo mode, and that will hurt peak peformance. From 932d67c6d0e099c1eb6ab3982c35747abbc1955d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 28 Oct 2025 11:27:52 -0700 Subject: [PATCH 544/625] Improve comment in switch.py --- cloudlab/bin/switch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudlab/bin/switch.py b/cloudlab/bin/switch.py index 0f9e3228..777d7c2b 100755 --- a/cloudlab/bin/switch.py +++ b/cloudlab/bin/switch.py @@ -12,7 +12,7 @@ import sys import time -# A Switch object represent the top-of-rack switch for a CloudLab +# A Switch object represents a Mellanox top-of-rack switch for a CloudLab # experiment, and it provides various operations on the switch such # as configuring ports and querying statistics such as maximum buffer # usage. From a1d713cac883a195dc7ab429aa0127ac08a17789 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 28 Oct 2025 11:30:32 -0700 Subject: [PATCH 545/625] Fix bug that caused loss of control packets under TCP hijacking * common->doff wasn't getting set in places where __homa_xmit_control was invoked. * Add homa_set_hijack function to encapsulate setting of fields needed for TCP hijacking. 
--- homa_devel.c | 3 +-- homa_incoming.c | 5 +---- homa_outgoing.c | 26 +++++--------------------- homa_wire.h | 14 ++++++++++++++ 4 files changed, 21 insertions(+), 27 deletions(-) diff --git a/homa_devel.c b/homa_devel.c index ba748987..197f1b3a 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -427,8 +427,7 @@ void homa_freeze_peers(void) freeze.common.type = FREEZE; freeze.common.sport = htons(hsk->port); freeze.common.dport = 0; - IF_NO_STRIP(freeze.common.flags = HOMA_TCP_FLAGS); - IF_NO_STRIP(freeze.common.urgent = htons(HOMA_TCP_URGENT)); + IF_NO_STRIP(homa_set_hijack(&freeze.common)); freeze.common.sender_id = 0; rhashtable_walk_enter(&hnet->homa->peertab->ht, &iter); diff --git a/homa_incoming.c b/homa_incoming.c index 3040fc4a..06181802 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -994,10 +994,7 @@ void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk, ack.common.type = ACK; ack.common.sport = h->dport; ack.common.dport = h->sport; -#ifndef __STRIP__ /* See strip.py */ - ack.common.flags = HOMA_TCP_FLAGS; - ack.common.urgent = htons(HOMA_TCP_URGENT); -#endif /* See strip.py */ + IF_NO_STRIP(homa_set_hijack(&ack.common)); ack.common.sender_id = cpu_to_be64(id); ack.num_acks = htons(homa_peer_get_acks(peer, HOMA_MAX_ACKS_PER_PKT, diff --git a/homa_outgoing.c b/homa_outgoing.c index d3bfef4d..a83ef7b2 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -159,24 +159,15 @@ struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc, h->common.dport = htons(rpc->dport); h->common.sequence = htonl(offset); h->common.type = DATA; + IF_NO_STRIP(homa_set_hijack(&h->common)); homa_set_doff(h, sizeof(struct homa_data_hdr)); -#ifndef __STRIP__ /* See strip.py */ - h->common.flags = HOMA_TCP_FLAGS; -#endif /* See strip.py */ h->common.checksum = 0; -#ifndef __STRIP__ /* See strip.py */ - h->common.urgent = htons(HOMA_TCP_URGENT); -#endif /* See strip.py */ h->common.sender_id = cpu_to_be64(rpc->id); h->message_length = htonl(rpc->msgout.length); -#ifndef __STRIP__ /* See strip.py */ - h->incoming = htonl(rpc->msgout.unscheduled); -#endif /* See strip.py */ + IF_NO_STRIP(h->incoming = htonl(rpc->msgout.unscheduled)); h->ack.client_id = 0; homa_peer_get_acks(rpc->peer, 1, &h->ack); -#ifndef __STRIP__ /* See strip.py */ - h->cutoff_version = rpc->peer->cutoff_version; -#endif /* See strip.py */ + IF_NO_STRIP(h->cutoff_version = rpc->peer->cutoff_version); h->retransmit = 0; #ifndef __STRIP__ /* See strip.py */ h->seg.offset = htonl(-1); @@ -435,11 +426,7 @@ int homa_xmit_control(enum homa_packet_type type, void *contents, h->type = type; h->sport = htons(rpc->hsk->port); h->dport = htons(rpc->dport); -#ifndef __STRIP__ /* See strip.py */ - h->flags = HOMA_TCP_FLAGS; - h->urgent = htons(HOMA_TCP_URGENT); - h->doff = 0x50; -#endif /* See strip.py */ + IF_NO_STRIP(homa_set_hijack(h)); h->sender_id = cpu_to_be64(rpc->id); return __homa_xmit_control(contents, length, rpc->peer, rpc->hsk); } @@ -544,10 +531,7 @@ void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk) unknown.common.sport = h->dport; unknown.common.dport = h->sport; unknown.common.type = RPC_UNKNOWN; -#ifndef __STRIP__ /* See strip.py */ - unknown.common.flags = HOMA_TCP_FLAGS; - unknown.common.urgent = htons(HOMA_TCP_URGENT); -#endif /* See strip.py */ + IF_NO_STRIP(homa_set_hijack(&unknown.common)); unknown.common.sender_id = cpu_to_be64(homa_local_id(h->sender_id)); peer = homa_peer_get(hsk, &saddr); if (!IS_ERR(peer)) diff --git a/homa_wire.h b/homa_wire.h index 680ee7eb..364c33ac 100644 --- 
a/homa_wire.h +++ b/homa_wire.h @@ -547,4 +547,18 @@ static inline u64 homa_local_id(__be64 sender_id) return be64_to_cpu(sender_id) ^ 1; } +#ifndef __STRIP__ /* See strip.py */ +/** + * homa_set_hijack() - Set fields in a Homa header that are needed for + * TCP hijacking to work properly. + * @common: Header in which to set fields. + */ +static inline void homa_set_hijack(struct homa_common_hdr *common) +{ + common->flags = HOMA_TCP_FLAGS; + common->urgent = htons(HOMA_TCP_URGENT); + common->doff = 0x50; +} +#endif /* See strip.py */ + #endif /* _HOMA_WIRE_H */ From 16e2e3cdcf8500ac75f1a89bf84fe6020244f9b9 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 28 Oct 2025 11:32:36 -0700 Subject: [PATCH 546/625] Update ttsync.py to work with Intel NICs --- util/ttsync.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/ttsync.py b/util/ttsync.py index e4733fa1..675fc9c7 100755 --- a/util/ttsync.py +++ b/util/ttsync.py @@ -143,7 +143,7 @@ def parse_tt(tt, node_num): continue match = re.match(' *([-0-9.]+) us .* us\) \[C([0-9]+)\] ' - 'mlx sent homa packet to ([^ ]+) id ([0-9]+), ' + '[^ ]+ sent homa packet to ([^ ]+) id ([0-9]+), ' 'type (0x[0-9a-f]+)', line) if match: time = float(match.group(1)) From 211164b6f1d82c61a9ecfc38ac77154e9217b53c Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 28 Oct 2025 13:38:19 -0700 Subject: [PATCH 547/625] Print more statistics about trace files in ttsync.py --- util/ttsync.py | 106 +++++++++++++++++++++++++------------------------ 1 file changed, 55 insertions(+), 51 deletions(-) diff --git a/util/ttsync.py b/util/ttsync.py index 675fc9c7..9ce3ca76 100755 --- a/util/ttsync.py +++ b/util/ttsync.py @@ -110,6 +110,7 @@ def parse_tt(tt, node_num): """ Reads a timetrace file and adds entries to send_pkts and recv_pkts. 
+ Also updates num_records and last_time tt: Name of the timetrace file node_num: Integer identifier for this file/node (should reflect the @@ -120,36 +121,45 @@ def parse_tt(tt, node_num): global retransmits sent = 0 recvd = 0 + num_records = 0 + first_time = None + last_time = None for line in open(tt): - match = re.match(' *([-0-9.]+) us .* us\) \[C([0-9]+)\]' - '.* id ([-0-9.]+),.* offset ([-0-9.]+)', line) + num_records += 1 + match = re.match(' *([-0-9.]+) us .* us\) \[C([0-9]+)\] (.*)', line) if not match: - match = re.match(' *([-0-9.]+) us .* us\) \[C([0-9]+)\] ' - 'retransmitting offset ([0-9.]+), .*id ([0-9.]+)', line) + continue + time = float(match.group(1)) + core = int(match.group(2)) + msg = match.group(3) + if first_time == None: + first_time = time + last_time = time + + match = re.match('.* id ([-0-9.]+),.* offset ([-0-9.]+)', msg) + if not match: + match = re.match('retransmitting offset ([0-9.]+), .*id ([0-9.]+)', + msg) if match: - offset = int(match.group(3)) - id = int(match.group(4)) + offset = int(match.group(1)) + id = int(match.group(2)) pktid = '%d:%d' % (id, offset) retransmits[pktid] = 1 continue - match = re.match(' *([-0-9.]+) us .* us\) \[C([0-9]+)\] ' - 'sending BUSY from resend, id ([0-9]+),', line) + match = re.match('sending BUSY from resend, id ([0-9]+),', msg) if match: - time = float(match.group(1)) - id = match.group(3) + id = match.group(1) send_ctl[node_num][id].append(time) continue - match = re.match(' *([-0-9.]+) us .* us\) \[C([0-9]+)\] ' - '[^ ]+ sent homa packet to ([^ ]+) id ([0-9]+), ' - 'type (0x[0-9a-f]+)', line) + match = re.match('[^ ]+ sent homa packet to ([^ ]+) id ([0-9]+), ' + 'type (0x[0-9a-f]+)', msg) if match: - time = float(match.group(1)) - addr = match.group(3) - id = match.group(4) - type = match.group(5) + addr = match.group(1) + id = match.group(2) + type = match.group(3) id_addr[peer_id(id)] = addr id_node_num[id] = node_num if type != '0x12' and type != '0x14': @@ -157,14 +167,12 @@ def parse_tt(tt, node_num): send_ctl[node_num][id].append(time) continue - match = re.match(' *([-0-9.]+) us .* us\) \[C([0-9]+)\] ' - 'homa_gro_receive got packet from ([^ ]+) id ([0-9]+), ' - 'type (0x[0-9a-f]+)', line) + match = re.match('homa_gro_receive got packet from ([^ ]+) id ' + '([0-9]+), type (0x[0-9a-f]+)', msg) if match: - time = float(match.group(1)) - addr = match.group(3) - id = match.group(4) - type = match.group(5) + addr = match.group(1) + id = match.group(2) + type = match.group(3) id_addr[peer_id(id)] = addr id_node_num[id] = node_num if type == '0x16': @@ -175,30 +183,17 @@ def parse_tt(tt, node_num): recv_ctl[id].append(time) continue - match = re.match(' *([-0-9.]+) us .* us\) \[C([0-9]+)\] ' - 'Sending freeze to (0x[0-9a-f]+)', line) + match = re.match('Sending freeze to (0x[0-9a-f]+)', msg) if match: - time = float(match.group(1)) - addr = match.group(3) + addr = match.group(1) send_freeze.append([time, node_num, addr]) continue - - # match = re.match(' *([-0-9.]+) us .* us\) \[C([0-9]+)\] ' - # 'Freezing because of request on port [0-9]+ ' - # 'from (0x[0-9a-f]+):', line) - # if match: - # time = float(match.group(1)) - # addr = match.group(3) - # recv_freeze[node_num] = [time, addr] - # continue continue - time = float(match.group(1)) - core = int(match.group(2)) - id = int(match.group(3)) - offset = int(match.group(4)) + id = int(match.group(1)) + offset = int(match.group(2)) - if re.match('.*calling .*_xmit: wire_bytes', line): + if re.match('.*calling .*_xmit: wire_bytes', msg): if (id in max_send_offsets) and 
(max_send_offsets[id] >= offset):
             continue
         pktid = '%d:%d' % (id, offset)
@@ -209,7 +204,7 @@ def parse_tt(tt, node_num):
             sent += 1
 
         match2 = re.match('.*Finished queueing packet: rpc id .*, offset .*, '
-                'len ([0-9.]+)', line)
+                'len ([0-9.]+)', msg)
         if match2:
             pktid = '%d:%d' % (id, offset)
             if pktid in retransmits:
@@ -219,7 +214,7 @@ def parse_tt(tt, node_num):
                 max_send_offsets[id] = last_offset
             continue
 
-        if "homa_gro_receive got packet" in line:
+        if "homa_gro_receive got packet" in msg:
             if (id in max_recv_offsets) and (max_recv_offsets[id] >= offset):
                 continue
             pktid = '%d:%d' % (id^1, offset)
@@ -228,30 +223,31 @@ def parse_tt(tt, node_num):
             recvd += 1
             continue
 
-        if "sending grant for" in line:
+        if "sending grant for" in msg:
             pktid = '%d:%dg' % (id, offset)
             if not pktid in send_pkts:
                 send_pkts[pktid] = [time, node_num]
                 sent += 1
             continue
 
-        if "homa_gro_receive got grant from" in line:
+        if "homa_gro_receive got grant from" in msg:
             pktid = '%d:%dg' % (id^1, offset)
             recv_pkts[pktid] = [time, node_num]
             recvd += 1
             continue
 
-        match = re.match(r' *([-0-9.]+) us .* us\) \[C([0-9]+)\] Sent RESEND '
-                r'for client RPC id ([0-9]+), server ([^:]+):', line)
+        match = re.match(r'Sent RESEND for client RPC id ([0-9]+), '
+                r'server ([^:]+):', msg)
         if False and match:
-            id = match.group(3)
-            addr = match.group(4)
+            id = match.group(1)
+            addr = match.group(2)
             id_addr[peer_id(id)] = addr
             id_node_num[id] = node_num
             send_ctl[node_num][id].append(time)
             continue
 
-    print("%s has %d packet sends, %d receives" % (tt, sent, recvd))
+    print('%-12s %8d %8d %8d %8.1f' % (tt, num_records, sent, recvd,
+            (last_time - first_time)/1000))
 
 def find_min_delays(num_nodes):
     """
@@ -376,6 +372,14 @@ def peer_id(id):
 tt_files.sort(key = lambda name : get_node_num(name))
 node_names = [Path(tt_file).stem for tt_file in tt_files]
 num_nodes = len(tt_files)
+print('Trace file statistics:')
+print('File:      Name of trace file')
+print('Records:   Total number of timetrace records')
+print('Sends:     Total number of packets sent')
+print('Receives:  Total number of packets received (will be more than Sends')
+print('           because of TSO)')
+print('Timespan:  Elapsed time between first and last timetrace records (ms)')
+print('\nFile          Records    Sends Receives Timespan')
 for i in range(num_nodes):
     parse_tt(tt_files[i],i)
 for id, addr in id_addr.items():

From 79dc167d1c5dee88a67149603e105f553d584275 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Wed, 29 Oct 2025 13:32:11 -0700
Subject: [PATCH 548/625] Add pass analyzer to tthoma.py

---
 util/tthoma.py | 155 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 150 insertions(+), 5 deletions(-)

diff --git a/util/tthoma.py b/util/tthoma.py
index db368e22..8013102d 100755
--- a/util/tthoma.py
+++ b/util/tthoma.py
@@ -8,7 +8,7 @@
 Invoke with the --help option for documentation.
 """
 
-from collections import defaultdict
+from collections import defaultdict, deque
 from functools import cmp_to_key
 from glob import glob
 import itertools
@@ -5940,9 +5940,10 @@ def output(self):
 #------------------------------------------------
 class AnalyzeOoo:
     """
-    Prints statistics about out-of-order packet arrivals. Also prints
-    details about out-of-order packets in the RPCs that experienced the
-    highest out-of-order delays (--verbose will print info for all OOO RPCs)
+    Prints statistics about out-of-order packet arrivals within a message.
+    Also prints details about out-of-order packets in the RPCs that
+    experienced the highest out-of-order delays (--verbose will print info
+    for all OOO RPCs).
""" def __init__(self, dispatcher): @@ -6489,7 +6490,7 @@ def analyze(self): pkt['tso_length'] = tso_length if not 'rx_node' in pkt: - if 'peer' in tx_rpc: + if 'peer' in tx_rpc and tx_rpc['peer'] in peer_nodes: pkt['rx_node'] = peer_nodes[tx_rpc['peer']] if 'qdisc_xmit' in pkt: @@ -6621,6 +6622,144 @@ def output(self): info['backlog'], len(delays))) first = False +#------------------------------------------------ +# Analyzer: pass +#------------------------------------------------ +class AnalyzePass: + """ + Compute statistics on "passing", where a packet A passes a packet B + if both are sent to the same destination and B was transmitted before + A, but A arrived before B. This information will indicate whether or + not priority queues are being used properly. If the --same-gro-core + option is specified, than packets must be processed by the same GRO + core at the destination in order to be considered for passing. + """ + + def __init__(self, dispatcher): + dispatcher.interest('AnalyzePackets') + + def output(self): + global packets + + print('\n--------------') + print('Analyzer: pass') + print('--------------') + doc = 'Statistics on passing. A packet A has passed a packet B if A ' + doc += 'and B are sent to the same destination node and A was ' + doc += 'transmitted after B but arrived at GRO before B. ' + if options.same_gro_core: + doc += 'Since the --same-gro-core option was specified, A and B ' + doc += 'must also have been handled by the same GRO core at the ' + doc += 'destination. ' + doc += 'The term "gain" refers to the largest difference in ' + doc += 'transmission times between a packet and any of the packets ' + doc += 'it passed.' + print(textwrap.fill(doc, width=70)) + print('Node: Destination node for packets') + print('Packets: Total data packets sent to Node') + print('PassFrac: Fraction of packets that passed a lower-priority packet') + print('GainP50: 50th percentile gain of packets that passed a ' + 'lower-priority') + print(' packet (usecs)') + print('GainP90: 90th percentile gain of packets that passed a ' + 'lower-priority') + print(' packet (usecs)') + print('GainMax: Maximum gain of any packet that passed a ' + 'lower-priority') + print(' packet (usecs)') + print('RFrac: Fraction of packets that passed a higher-priority ' + 'packet ("reverse")') + print('RP50: 50th percentile gain of packets that passed a ' + 'higher-priority') + print(' packet (usecs)') + print('RP90: 90th percentile gain of packets that passed a ' + 'higher-priority') + print(' packet (usecs)') + print('RMax: Maximum gain of any packet that passed a ' + 'higher-priority packet') + print('\nNode Packets PassFrac GainP50 GainP90 GainMax RFrac RP50 RP90 RMax') + + # Node -> list of all data packets sent to that node. The list will + # eventually be sorted by packet transmission time (nic). + node_pkts = defaultdict(list) + + for pkt in packets.values(): + if not 'nic' in pkt or not 'gro' in pkt or not 'rx_node' in pkt: + continue + if not 'priority' in pkt: + continue + node_pkts[pkt['rx_node']].append(pkt) + for pkts in node_pkts.values(): + pkts.sort(key=lambda d: d['nic']) + for node in get_sorted_nodes(): + pkts = node_pkts[node] + + # Active packets (those that have been sent but have not yet been + # discovered to have been received), sorted in order of 'nic'. 
+ active = deque() + + # For each packet that passed a lower-priority packet, this list + # contains one element (the "gain"), which is the largest + # difference in transmission times between the passing + # packet and any of the packets it passed. + gains = [] + + # Same as gains, except when a lower-priority packet passes + # a higher-priority one. + lgains = [] + + # Scan packets sent to the current node in order of 'nic' time, + # gathering data about inversions + for pkt in pkts: + nic = pkt['nic'] + gro = pkt['gro'] + priority = pkt['priority'] + gro_core = pkt['gro_core'] + + # Drop "active" packets that have completed. + while len(active) > 0 and active[0]['gro'] <= nic: + active.popleft() + + have_gain = False + have_lgain = False + for i, p2 in enumerate(active): + if gro >= p2['gro']: + continue + if options.same_gro_core and gro_core != p2['gro_core']: + continue + gain = nic - p2['nic'] + if p2['priority'] < priority: + if not have_gain: + # if node == 'node2': + # print('%9.3f -> %9.3f prio %d passed %9.3f-> ' + # '%9.3f prio %d, gain %.3f' % + # (nic, gro, priority, p2['nic'], p2['gro'], + # p2['priority'], gain)) + gains.append(gain) + have_gain = True + elif p2['priority'] > priority and not have_lgain: + # if gain > 89.0: + # print('%9.3f -> %9.3f prio %d passed %9.3f-> ' + # '%9.3f prio %d, gain %.3f' % + # (nic, gro, priority, p2['nic'], + # p2['gro'], p2['priority'], gain)) + lgains.append(gain) + have_lgain = True + active.append(pkt) + + # Print statistics + gains.sort() + num_passes = len(gains) + lgains.sort() + num_lpasses = len(lgains) + num_pkts = len(pkts) + print('%-10s %8d %6.3f %7.1f %7.1f %7.1f' % (node, num_pkts, + num_passes/num_pkts, gains[50*num_passes//100], + gains[90*num_passes//100], gains[-1]), end='') + print(' %6.3f%6.1f%6.1f %6.1f' % ( + num_lpasses/num_pkts, lgains[50*num_lpasses//100], + lgains[90*num_lpasses//100], lgains[-1])) + #------------------------------------------------ # Analyzer: qdelay #------------------------------------------------ @@ -8465,6 +8604,8 @@ def output(self): f.write('# link speed)\n') f.write('# InNic: KB of data that have been queued for the ' 'NIC but whose packets\n') + f.write('# have not yet been returned after ' + 'transmission') f.write('# NicRx: KB of data that are still in the NIC\'s ' 'possession (their packets\n') f.write('# haven\'t been returned after transmission) ' @@ -9050,6 +9191,10 @@ def output(self): parser.add_option('--rx-start', dest='rx_start', type=float, default=None, metavar='T', help='If specified, some analyzers will ignore packets ' 'received before time T') +parser.add_option('--same-gro-core', dest='same_gro_core', action="store_true", + default=False, help='If specified, the pass analyzer will only ' + 'consider passing for packets that are processed by GRO on the ' + 'same core') parser.add_option('--sort', dest='sort', default=None, metavar='S', help='Used by some analyzers to select a field to use ' 'for sorting data. 
The supported values depend on the analyzer; ' From 79569e6c3c6ca88712d71715f2597c0003186ee5 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 29 Oct 2025 17:04:26 -0700 Subject: [PATCH 549/625] Add cloudlab/dell_switch.txt --- cloudlab/dell_switch.txt | 186 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 cloudlab/dell_switch.txt diff --git a/cloudlab/dell_switch.txt b/cloudlab/dell_switch.txt new file mode 100644 index 00000000..0e60fe0d --- /dev/null +++ b/cloudlab/dell_switch.txt @@ -0,0 +1,186 @@ +# Commands for configuring a Dell switch (OS 10.6.0) to enable priority +# queues with strict priority: + +enable +configure terminal + +# Create maps from DSCP classes to egress queues, and from queues +# to service policies. + +class-map type queuing pq_cm0 +match queue 0 +exit +class-map type queuing pq_cm1 +match queue 1 +exit +class-map type queuing pq_cm2 +match queue 2 +exit +class-map type queuing pq_cm3 +match queue 3 +exit +class-map type queuing pq_cm4 +match queue 4 +exit +class-map type queuing pq_cm5 +match queue 5 +exit +class-map type queuing pq_cm6 +match queue 6 +exit +class-map type queuing pq_cm7 +match queue 7 +exit + +policy-map type queuing pq_pmap +class pq_cm0 +priority +class pq_cm1 +priority +class pq_cm2 +priority +class pq_cm3 +priority +class pq_cm4 +priority +class pq_cm5 +priority +class pq_cm6 +priority +class pq_cm7 +priority +exit + +# Configure interfaces to use the trust-map (on input) and policy- +# (for output) + +interface ethernet 1/1/36:3 +trust-map dscp default +service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/28:1 +trust-map dscp default +service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/57:3 +trust-map dscp default +service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/39:3 +trust-map dscp default +service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/31:2 +trust-map dscp default +service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/62:1 +trust-map dscp default +service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/36:2 +trust-map dscp default +service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/59:3 +trust-map dscp default +service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/58:2 +trust-map dscp default +service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/31:4 +trust-map dscp default +service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/7:4 +trust-map dscp default +service-policy output type queuing pq_pmap +exit + +# Reset ports + +interface ethernet 1/1/36:3 +no trust-map dscp +no service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/28:1 +no trust-map dscp +no service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/57:3 +no trust-map dscp +no service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/39:3 +no trust-map dscp +no service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/31:2 +no trust-map dscp +no service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/62:1 +no trust-map dscp +no service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/36:2 +no trust-map dscp +no service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/59:3 +no trust-map dscp +no service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/58:2 +no trust-map dscp +no service-policy 
output type queuing pq_pmap +exit +interface ethernet 1/1/31:4 +no trust-map dscp +no service-policy output type queuing pq_pmap +exit +interface ethernet 1/1/7:4 +no trust-map dscp +no service-policy output type queuing pq_pmap +exit + +# Create policy map that "flattens" the queues: they each get the +# same bandwidth. + +policy-map type queuing pmap_flat +class pq_cm0 +bandwidth percent 12 +class pq_cm1 +bandwidth percent 12 +class pq_cm2 +bandwidth percent 12 +class pq_cm3 +bandwidth percent 12 +class pq_cm4 +bandwidth percent 12 +class pq_cm5 +bandwidth percent 12 +class pq_cm6 +bandwidth percent 12 +class pq_cm7 +bandwidth percent 12 +exit + + +interface ethernet 1/1/36:2 +service-policy output type queuing pmap_flat +exit +interface ethernet 1/1/59:3 +service-policy output type queuing pmap_flat +exit +interface ethernet 1/1/58:2 +service-policy output type queuing pmap_flat +exit +interface ethernet 1/1/31:4 +service-policy output type queuing pmap_flat +exit +interface ethernet 1/1/7:4 +service-policy output type queuing pmap_flat +exit \ No newline at end of file From 026e4545d55fc76f039b423e0ec7a5a821959841 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Sat, 1 Nov 2025 22:27:50 -0700 Subject: [PATCH 550/625] Fix synchronization bug in timetrace.c Reading/updating next_index was not atomic, so an interrupt could occur between the read and the update. This could cause events recorded in the interrupt handler to be lost. --- test/unit_timetrace.c | 4 +- timetrace.c | 43 ++-- timetrace.h | 25 +- util/tthoma.py | 571 +++++++++++++++++++++++++++++++++++------- 4 files changed, 534 insertions(+), 109 deletions(-) diff --git a/test/unit_timetrace.c b/test/unit_timetrace.c index 33db5d90..bbc250bb 100644 --- a/test/unit_timetrace.c +++ b/test/unit_timetrace.c @@ -294,11 +294,11 @@ TEST_F(timetrace, tt_proc_release__unfreeze) EXPECT_EQ(2, tt_freeze_count.counter); EXPECT_TRUE(atomic_read(&tt_frozen)); EXPECT_NE(NULL, tt_buffers[1]->events[3].format); - EXPECT_EQ(2, tt_buffers[1]->next_index); + EXPECT_EQ(6, atomic_read(&tt_buffers[1]->next_index)); tt_proc_release(NULL, &file2); EXPECT_EQ(0, tt_freeze_count.counter); EXPECT_FALSE(atomic_read(&tt_frozen)); EXPECT_EQ(NULL, tt_buffers[1]->events[3].format); - EXPECT_EQ(0, tt_buffers[1]->next_index); + EXPECT_EQ(0, atomic_read(&tt_buffers[1]->next_index)); } diff --git a/timetrace.c b/timetrace.c index cafdc585..a06f3d2f 100644 --- a/timetrace.c +++ b/timetrace.c @@ -238,8 +238,8 @@ void tt_freeze(void) */ void tt_unfreeze(void) { + pr_err("%s invoked\n", __func__); if (atomic_xchg(&tt_frozen, 0) == 1) { - pr_err("%s invoked\n", __func__); atomic_dec(&tt_freeze_count); } } @@ -268,6 +268,7 @@ void tt_record_buf(struct tt_buffer *buffer, u64 timestamp, u32 arg3) { struct tt_event *event; + int index; if (unlikely(atomic_read(&tt_freeze_count) > 0)) { // In order to ensure that reads produce consistent @@ -276,13 +277,14 @@ void tt_record_buf(struct tt_buffer *buffer, u64 timestamp, return; } - event = &buffer->events[buffer->next_index]; - buffer->next_index = (buffer->next_index + 1) -#ifdef __UNIT_TEST__ - & (tt_buffer_size - 1); -#else /* __UNIT_TEST__ */ - & (TT_BUF_SIZE - 1); -#endif /* __UNIT_TEST__ */ + /* Even though there is a separate tt buffer for each core, we + * still have to use an atomic operation to update next_index, + * because an interrupt could occur while executing this function. + * Before the atomic increment was added, tt entries were occasionally + * lost.
+ */ + index = atomic_fetch_inc_relaxed(&buffer->next_index) & TT_BUF_MASK; + event = &buffer->events[index]; event->timestamp = timestamp; event->format = format; @@ -317,8 +319,8 @@ u64 tt_find_oldest(int *pos) if (!buffer->events[tt_buffer_size - 1].format) { pos[i] = 0; } else { - int index = (buffer->next_index + 1) - & (tt_buffer_size - 1); + int index = (atomic_read(&buffer->next_index) + 1) + & TT_BUF_MASK; struct tt_event *event = &buffer->events[index]; pos[i] = index; @@ -331,10 +333,13 @@ u64 tt_find_oldest(int *pos) * sure that there's no missing data in what we print. */ for (i = 0; i < nr_cpu_ids; i++) { + int next; + buffer = tt_buffers[i]; + next = tt_get_buf_index(buffer); while (buffer->events[pos[i]].timestamp < start_time && - pos[i] != buffer->next_index) { - pos[i] = (pos[i] + 1) & (tt_buffer_size - 1); + pos[i] != next) { + pos[i] = (pos[i] + 1) & TT_BUF_MASK; } } return start_time; @@ -430,7 +435,7 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf, struct tt_buffer *buffer = tt_buffers[i]; event = &buffer->events[pf->pos[i]]; - if (pf->pos[i] != buffer->next_index && + if (pf->pos[i] != tt_get_buf_index(buffer) && event->timestamp < earliest_time) { current_core = i; earliest_time = event->timestamp; @@ -553,7 +558,7 @@ int tt_proc_release(struct inode *inode, struct file *file) struct tt_buffer *buffer = tt_buffers[i]; buffer->events[tt_buffer_size - 1].format = NULL; - buffer->next_index = 0; + atomic_set(&buffer->next_index, 0); } } atomic_dec(&tt_freeze_count); @@ -627,7 +632,7 @@ void tt_print_file(char *path) struct tt_buffer *buffer = tt_buffers[i]; event = &buffer->events[pos[i]]; - if (pos[i] != buffer->next_index && + if (pos[i] != tt_get_buf_index(buffer) && event->timestamp < earliest_time) { current_core = i; earliest_time = event->timestamp; @@ -723,11 +728,11 @@ void tt_printk(void) start_time = tt_find_oldest(oldest); events = 0; for (i = 0; i < nr_cpu_ids; i++) { - if (oldest[i] == tt_buffers[i]->next_index) + if (oldest[i] == tt_get_buf_index(tt_buffers[i])) pos[i] = -1; else - pos[i] = (tt_buffers[i]->next_index - 1) & - (tt_buffer_size - 1); + pos[i] = (atomic_read(&tt_buffers[i]->next_index) - 1) & + TT_BUF_MASK; } #if 0 @@ -819,7 +824,7 @@ void tt_get_messages(char *buffer, size_t length) struct tt_buffer *buffer = tt_buffers[i]; event = &buffer->events[pos[i]]; - if (pos[i] != buffer->next_index && + if (pos[i] != tt_get_buf_index(buffer) && event->timestamp < earliest_time) { current_core = i; earliest_time = event->timestamp; diff --git a/timetrace.h b/timetrace.h index 60e6270c..f7fc8058 100644 --- a/timetrace.h +++ b/timetrace.h @@ -50,6 +50,12 @@ struct tt_event { /* The number of events in a tt_buffer, as a power of 2. */ #define TT_BUF_SIZE_EXP 14 #define TT_BUF_SIZE BIT(TT_BUF_SIZE_EXP) +#ifndef __UNIT_TEST__ +#define TT_BUF_MASK (TT_BUF_SIZE - 1) +#else +#define TT_BUF_MASK (tt_buffer_size - 1) +#endif +extern int tt_buffer_size; /* Represents a sequence of events, typically consisting of all those * generated by one thread. Has a fixed capacity, so slots are reused @@ -57,18 +63,30 @@ struct tt_event { */ struct tt_buffer { /** - * Index within events of the slot to use for the next tt_record call. + * The low-order bits of this value hold the index within events + * of the slot to use for the next tt_record call. High-order bits + * may be garbage (the value is incremented without checking for + * wraparound). 
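+ *
+ * (Editor's note, not in the original patch: readers mask this value
+ * before use; e.g., tt_record_buf above computes its slot as
+ * atomic_fetch_inc_relaxed(&next_index) & TT_BUF_MASK.)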
*/ - int next_index; + atomic_t next_index; /** - * Holds information from the most recent calls to tt_record. + * Holds information from the most recent calls to tt_record. * Updated circularly, so each new event replaces the oldest * existing event. */ struct tt_event events[TT_BUF_SIZE]; }; +/** + * tt_get_buf_index() - Returns the current position in a tt_buffer + * (masks off potentially garbage high-order bits) + */ +static inline int tt_get_buf_index(struct tt_buffer *buffer) +{ + return atomic_read(&buffer->next_index) & TT_BUF_MASK; +} /* Holds information about an attempt to read timetrace information * using a /proc file. Several of these can exist simultaneously. */ @@ -119,7 +137,6 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf, int tt_proc_release(struct inode *inode, struct file *file); loff_t tt_proc_lseek(struct file *file, loff_t offset, int whence); extern struct tt_buffer *tt_buffers[]; -extern int tt_buffer_size; extern atomic_t tt_freeze_count; extern atomic_t tt_frozen; extern int tt_pf_storage; diff --git a/util/tthoma.py b/util/tthoma.py index 8013102d..5b229282 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -141,7 +141,7 @@ def __missing__(self, id): # elapsed_time: Total time interval covered by the trace traces = {} -# Peer address -> node name. Computed by AnalyzeRpcs. +# Peer address -> node name. Computed by AnalyzeRpcs and AnalyzePackets. peer_nodes = {} # This variable holds information about every data packet in the traces. @@ -210,12 +210,12 @@ def __missing__(self, key): recv_offsets = {} # This variable holds information about every grant packet in the traces. -# it is created by AnalyzePackets. Keys have the form id:offset where id is +# It is created by AnalyzePackets. Keys have the form id:offset where id is # the RPC id on the sending side and offset is the offset in message of # the first byte of the packet. Each value is a dictionary containing # the following fields: # xmit: Time when ip*xmit was invoked -# nic: Time when the NIC transmitted the packet +# nic: Time when the packet was handed off to the NIC # gro: Time when GRO received (the first bytes of) the packet # gro_core: Core on which homa_gro_receive was invoked # softirq: Time when homa_softirq processed the packet @@ -234,11 +234,29 @@ def __missing__(self, key): return self[key] grants = GrantDict() +# This variable holds information about every TCP packet in the traces. +# It is created by AnalyzePackets. See get_tcp_packet for details on the keys +# used to look up packets. Each value is a dictionary containing the following +# fields: +# saddr: Source address of the packet (hex string) +# sport: Source port number +# daddr: Destination address of the packet (hex string) +# dport: Destination port number +# sequence: The sequence number in the packet +# data_bytes: The number of data bytes in the packet +# total_length: Total length of the packet, including IP and TCP headers +# ack: The ack sequence number in the packet +# nic: Time when the packet was handed off to the NIC +# gro: Time when GRO received the packet +# tx_node: Node that sent the packet (corresponds to saddr) +# rx_node: Node that received the packet (corresponds to daddr) +tcp_packets = {} # Node -> list of intervals for that node. Created by the intervals analyzer. # Each interval contains information about a particular time range, including # things that happened during that time range and the state of the node at # the end of the period.
The list entry for each interval is a dictionary with -# the following fields:repo_ +# the following fields: # time: Ending time of the interval (integer usecs); this time is # included in the interval # rpcs_live: Number of live RPCs for which this node is the client @@ -310,6 +328,14 @@ def __missing__(self, key): # rx_new_grants: Number of bytes of additional grants passed to ip*xmit # during the interval # +# The following fields are present only if the buffers analyzer is used. +# They count bytes of incoming packet data (including headers) for this node +# that are queued somewhere in the network between 'nic' and 'gro'. +# q_homa_unsched: Bytes from unscheduled Homa packets +# q_homa_sched: Bytes from scheduled Homa packets +# q_homa_grant: Bytes from Homa grant packets +# q_tcp: Bytes from TCP packets +# # The following fields are present only if the grants analyzer is used: # rx_grants: Number of incoming RPCs with outstanding grants # rx_grant_bytes: Total bytes of data in outstanding grants for incoming RPCs @@ -330,6 +356,22 @@ def __missing__(self, key): # Dispatcher used to parse the traces. dispatcher = None +# Total bytes in an IPv4 header +ipv4_hdr_length = 20 + +# Total header bytes in a Homa data packet, including Homa header and +# IP header. +data_hdr_length = 60 + ipv4_hdr_length + +# Total bytes in a Homa grant packet, including IP header (assume IPv4). +grant_pkt_length = 33 + ipv4_hdr_length + +# Various color values for plotting: +color_red = '#c00000' +color_blue = '#1f77b4' +color_brown = '#844f1a' +color_green = '#00b050' + def add_to_intervals(node, start, end, key, delta): """ Find all of the intervals for node whose end times overlap the range @@ -565,6 +607,30 @@ def get_packet(id, offset): global packets return packets['%d:%d' % (id, offset)] +def get_tcp_packet(saddr, sport, daddr, dport, sequence, data_bytes, ack): + """ + Returns the entry in tcp_packets corresponding to the arguments. Creates + a new packet if it doesn't already exist. + + saddr: IP address of source (hex string) + sport: Source port + daddr: IP address of destination (hex string) + dport: Destination port + sequence: Sequence number in packet + data_bytes: Amount of payload data in the packet + ack: Acknowledgment sequence number in the packet + """ + global tcp_packets + + key = '%s:%d %s:%d %d %d %d' % (saddr, sport, daddr, dport, sequence, + data_bytes, ack) + if key in tcp_packets: + return tcp_packets[key] + pkt = {'saddr': saddr, 'sport': sport, 'daddr': daddr, 'dport': dport, + 'sequence': sequence, 'data_bytes': data_bytes, 'ack': ack} + tcp_packets[key] = pkt + return pkt + def get_recv_length(offset, msg_length=None): """ Compute the length of a received packet. Uses information collected in the @@ -827,7 +893,7 @@ def __init__(self): # Values are the corresponding objects. self.analyzers = {} - # Pattern name -> list of objects interested in that pattern. + # Pattern name -> list of analyzer classes interested in that pattern. self.interests = {} # List of objects with tt_all methods, which will be invoked for @@ -860,6 +926,12 @@ def __init__(self): # files (whether they matched or not) self.regex_tries = 0 + # Core -> dictionary of saved values for that core. Used in situations + # where it takes multiple time trace entries to provide all the data + # needed for an event: info accumulates here until the last time + # trace entry is seen. 
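[Editor's note: not part of the patch. A minimal model of the accumulate-then-complete pattern that core_saved (below) supports, for trace events that span two records on the same core; record_part1 and record_part2 are hypothetical names.]

core_saved = {}

def record_part1(core, fields):
    # First record of a pair: stash its fields until part 2 arrives.
    core_saved[core] = fields

def record_part2(core, more_fields):
    # Second record: merge with the stashed fields, if any.
    if core not in core_saved:
        return None        # part 1 was lost; drop the event
    return {**core_saved.pop(core), **more_fields}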
+ self.core_saved = {} + for pattern in self.patterns: pattern['matches'] = 0 self.pattern_dict[pattern['name']] = pattern @@ -924,14 +996,21 @@ def interest(self, analyzer): if name == 'all': self.all_interests.append(obj) continue + name_len = len(name) for pattern in self.patterns: - if name != pattern['name']: + # Include all patterns whose names either match the given + # name or consist of the name followed by a number (used for + # situations where it takes multiple timetrace entries to + # supply relevant data). + if not pattern['name'].startswith(name): + continue + if (len(pattern['name']) != name_len and + not pattern['name'][name_len:].isdigit()): continue found_pattern = True - if not name in self.interests: - self.interests[name] = [] - self.interests[name].append(obj) - break + if not pattern['name'] in self.interests: + self.interests[pattern['name']] = [] + self.interests[pattern['name']].append(obj) if not name in self.interests: raise Exception('Couldn\'t find pattern %s for analyzer %s' % (name, analyzer)) @@ -1010,11 +1089,7 @@ def print_no_matches(self): if no_matches: print('No lines matched the following patterns:', file=sys.stderr) for pattern in no_matches: - print_string = pattern['regexp'] - match = re.search('[()[\].+*?\\^${}]', print_string) - if match: - print_string = print_string[:match.start()] - print(' %s...' % (print_string), file=sys.stderr) + print(' %s' % (pattern['regexp']), file=sys.stderr) def print_stats(self): """ @@ -1119,6 +1194,40 @@ def __gro_grant(self, trace, time, core, match, interests): 'offset ([0-9]+), priority ([0-9]+)' }) + def __gro_tcp(self, trace, time, core, match, interests): + saddr = match.group(1) + sport = int(match.group(2)) + daddr = match.group(3) + dport = int(match.group(4)) + self.core_saved[core] = {'saddr': saddr, 'sport': sport, + 'daddr': daddr, 'dport': dport} + + patterns.append({ + 'name': 'gro_tcp', + 'regexp': 'tcp_gro_receive got packet from ([^:]+):([0-9]+) to ' + '([^:]+):([0-9]+)' + }) + + def __gro_tcp2(self, trace, time, core, match, interests): + if not core in self.core_saved: + return + saved = self.core_saved[core] + sequence = int(match.group(1)) + data_bytes = int(match.group(2)) + total = int(match.group(3)) + ack = int(match.group(4)) + for interest in interests: + interest.tt_gro_tcp(trace, time, core, saved['saddr'], + saved['sport'], saved['daddr'], saved['dport'], sequence, + data_bytes, total, ack) + del self.core_saved[core] + + patterns.append({ + 'name': 'gro_tcp2', + 'regexp': r'tcp_gro_receive .2. 
sequence ([-0-9]+), data bytes ' + '([0-9]+), total length ([0-9]+), ack ([-0-9]+)' + }) + def __softirq_data(self, trace, time, core, match, interests): id = int(match.group(1)) offset = int(match.group(2)) @@ -1229,6 +1338,40 @@ def __nic_grant(self, trace, time, core, match, interests): 'offset ([0-9]+), queue (0x[0-9a-f]+)' }) + def __nic_tcp(self, trace, time, core, match, interests): + saddr = match.group(2) + sport = int(match.group(3)) + daddr = match.group(4) + dport = int(match.group(5)) + self.core_saved[core] = {'saddr': saddr, 'sport': sport, + 'daddr': daddr, 'dport': dport} + + patterns.append({ + 'name': 'nic_tcp', + 'regexp': '(mlx|ice) sent TCP packet from ([^:]+):([0-9]+) to ' + '([^:]+):([0-9]+)' + }) + + def __nic_tcp2(self, trace, time, core, match, interests): + if not core in self.core_saved: + return + saved = self.core_saved[core] + sequence = int(match.group(2)) + data_bytes = int(match.group(3)) + ack = int(match.group(4)) + gso_size = int(match.group(5)) + for interest in interests: + interest.tt_nic_tcp(trace, time, core, saved['saddr'], + saved['sport'], saved['daddr'], saved['dport'], + sequence, data_bytes, ack, gso_size) + del self.core_saved[core] + + patterns.append({ + 'name': 'nic_tcp2', + 'regexp': r'(mlx|ice) sent TCP packet .2. sequence ([-0-9]+), ' + 'data bytes ([0-9]+), ack ([-0-9]+), gso_size ([0-9]+)' + }) + def __free_tx_skb(self, trace, time, core, match, interests): id = int(match.group(1)) offset = int(match.group(2)) @@ -4176,46 +4319,13 @@ def init_intervals(self): t = get_first_interval_end(node) end = traces[node]['last_time'] + interval_length while t < end: - node_intervals.append({ - 'time': t, - 'rpcs_live': 0, - 'tx_starts': 0, - 'tx_live_req': 0, - 'tx_live_resp': 0, - 'tx_pkts': 0, - 'tx_bytes': 0, - 'tx_nic_pkts': 0, - 'tx_nic_bytes': 0, - 'tx_in_nic': 0, - 'tx_nic_rx': 0, - 'tx_qdisc': 0, - 'tx_q': 0, - 'tx_gro_bytes': 0, - 'tx_free_bytes': 0, - 'tx_max_free': 0, - 'tx_min_free': 0, - 'tx_max_gro_free': None, - 'tx_min_gro_free': None, - 'tx_grant_xmit': 0, - 'tx_grant_gro': 0, - 'tx_grant_avl': 0, - 'tx_new_grants': 0, - 'rx_starts': 0, - 'rx_live': 0, - 'rx_pkts': 0, - 'rx_bytes': 0, - 'rx_grantable': 0, - 'rx_granted': 0, - 'rx_data_qdisc': 0, - 'rx_data_net': 0, - 'rx_overdue': 0, - 'rx_data_gro': 0, - 'rx_new_grants': 0, - 'rx_grants': 0, - 'rx_grant_bytes': 0, - 'rx_grant_info': None, - 'tx_grant_info': None - }) + interval = defaultdict(lambda: 0) + interval['time'] = t + interval['tx_max_gro_free'] = None + interval['tx_min_gro_free'] = None + interval['rx_grant_info'] = None + interval['tx_grant_info'] = None + node_intervals.append(interval) t += interval_length intervals[node] = node_intervals @@ -6446,6 +6556,53 @@ def tt_softirq_grant(self, trace, t, core, id, offset, priority, increment): g['increment'] = increment g['rx_node'] = trace['node'] + def tt_nic_tcp(self, trace, t, core, saddr, sport, daddr, dport, sequence, + data_bytes, ack, gso_size): + # Break GSO packets up into multiple packets, matching what will + # be received on the other end. 
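[Editor's note: not part of the patch. The loop below splits one GSO transmit record into the per-segment packets the receiver will see; this standalone sketch shows the same arithmetic, including the 32-bit TCP sequence wraparound. gso_segments is a hypothetical name.]

def gso_segments(sequence, data_bytes, gso_size):
    # Yield (sequence, seg_bytes) for each wire segment of a GSO frame.
    left = data_bytes
    while True:
        seg = min(left, gso_size) if gso_size else left
        yield sequence, seg
        left -= seg
        sequence += seg
        if sequence > 0x80000000:
            sequence -= 0x100000000    # sequence numbers wrap at 2**32
        if left <= 0:
            break

# Example: list(gso_segments(0, 4000, 1448)) yields
# (0, 1448), (1448, 1448), (2896, 1104).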
+ bytes_left = data_bytes + node = trace['node'] + if not saddr in peer_nodes and saddr != '0x00000000': + peer_nodes[saddr] = node + while True: + pkt_bytes = bytes_left + if pkt_bytes > gso_size and gso_size != 0: + pkt_bytes = gso_size + tcp_pkt = get_tcp_packet(saddr, sport, daddr, dport, sequence, + pkt_bytes, ack) + # if (saddr == '0x0a000105' and sport == 36222 and daddr == '0x0a000103' and dport == 5000): + # print('%9.3f: tcp_pkt: %s' % (t, tcp_pkt)) + if 'nic' in tcp_pkt and data_bytes > 0: + print('TCP packet retransmission (nic %.3f and %.3f), node %s' % + (tcp_pkt['nic'], t, node), file=sys.stderr) + return + tcp_pkt['nic'] = t + tcp_pkt['tx_node'] = node + if bytes_left == data_bytes: + tcp_pkt['gso_pkt_size'] = data_bytes + bytes_left -= pkt_bytes + sequence += pkt_bytes + if sequence > 0x80000000: + # 32-bit sequence number has wrapped around + sequence -= 0x100000000 + if bytes_left <= 0: + break + + def tt_gro_tcp(self, trace, t, core, saddr, sport, daddr, dport, sequence, + data_bytes, total, ack): + tcp_pkt = get_tcp_packet(saddr, sport, daddr, dport, sequence, + data_bytes, ack) + node = trace['node'] + if 'gro' in tcp_pkt and data_bytes > 0: + print('TCP packet retransmission (gro %.3f and %.3f), node %s' % + (tcp_pkt['gro'], t, node), file=sys.stderr) + return + tcp_pkt['gro'] = t + tcp_pkt['total_length'] = total + tcp_pkt['rx_node'] = node + if not daddr in peer_nodes and daddr != '0x00000000': + peer_nodes[daddr] = node + def analyze(self): """ Try to deduce missing packet fields, such as message length. @@ -6760,6 +6917,255 @@ def output(self): num_lpasses/num_pkts, lgains[50*num_lpasses//100], lgains[90*num_lpasses//100], lgains[-1])) +#------------------------------------------------ +# Analyzer: qbytes +#------------------------------------------------ +class AnalyzeQbytes: + """ + Computes the amount of packet data of various kinds (Homa data, TCP + data, etc.) queued in the network at each point in time. Requires the + --plot option. + """ + def __init__(self, dispatcher): + require_options('qbytes', 'plot') + dispatcher.interest('AnalyzePackets') + dispatcher.interest('AnalyzeRpcs') + dispatcher.interest('AnalyzeMinlatency') + dispatcher.interest('AnalyzeIntervals') + + def analyze(self): + """ + Computes interval fields related to queued data. 
+ """ + global packets, rpcs, minlatency, intervals, grant_pkt_length + global tcp_packets + + for pkt_type, pkts in [['data', packets.values()], + ['grant', grants.values()], + ['tcp', tcp_packets.values()]]: + for pkt in pkts: + if not 'nic' in pkt or not 'gro' in pkt: + if pkt_type == 'tcp': + trace = None + if 'gro' in pkt: + t = pkt['gro'] + if pkt['saddr'] in peer_nodes: + trace = traces[peer_nodes[pkt['saddr']]] + else: + t = pkt['nic'] + if pkt['daddr'] in peer_nodes: + trace = traces[peer_nodes[pkt['daddr']]] + if (trace != None and trace['first_time'] < (t-1000) and + trace['last_time'] > (t+1000)): + print('%9.3f: incomplete TCP packet for %s:%d to %s:%d (peer %s, ' + 'start %.3f, end %.3f): %s' % + (t, pkt['saddr'], pkt['sport'], + pkt['daddr'], pkt['dport'], trace['node'], + trace['first_time'], trace['last_time'], + pkt)) + continue + gro = pkt['gro'] + nic = pkt['nic'] + tx_node = pkt['tx_node'] + rx_node = pkt['rx_node'] + + # The packet is assumed to be queued if its latency + # exceeds min_latency for the nodes; it is assumed to be + # queued for the last part of this time (that's not quite + # accurate since the queuing is probably in the switch and + # there is probably additional delay after the packet has + # been received but before GRO gets it). + q_start = nic + min_latency[tx_node][rx_node] + if q_start < gro: + if pkt_type == 'grant': + add_to_intervals(rx_node, q_start, gro, 'q_homa_grant', + grant_pkt_length) + elif pkt_type == 'data': + rpc = rpcs[pkt['id']^1] + length = pkt['length'] + data_hdr_length + if 'unsched' in rpc and pkt['offset'] < rpc['unsched']: + add_to_intervals(rx_node, q_start, gro, + 'q_homa_unsched', length) + else: + add_to_intervals(rx_node, q_start, gro, + 'q_homa_sched', length) + else: + add_to_intervals(rx_node, q_start, gro, 'q_tcp', + pkt['total_length']) + + def init_axis(self, ax, x_min, x_max, y_max, size=10): + """ + Initialize an axis for plotting queued bytes. + """ + ax.set_xlim(x_min, x_max) + ax.set_ylim(0, y_max) + ax.tick_params(right=True, which='both', direction='in', length=5) + ax.set_xlabel('Time (μsec)', size=size) + ax.set_ylabel('Queued Data (KB)', size=size) + + def output(self): + global grants, options, packets, rpcs + nodes = get_sorted_nodes() + + print('\n----------------') + print('Analyzer: qbytes') + print('----------------') + print('See qbytes.pdf in %s' % (options.plot)) + + # Node -> dictionary with ready-to-plot data series for the node: + # grant, unsched, sched, and tcp. The data are cumulative: sched + # includes sched, unsched, and grant. Values correspond to time_data + node_data = defaultdict(lambda: {'grant': [], 'unsched': [], + 'sched': [], 'tcp': []}) + + # End-of-interval time values correspond to data points in node_data. + time_data = [] + + # node-> dictionary with maximum observed queuing across various + # categories + node_max = defaultdict(lambda:{'grant': 0, 'unsched': 0, 'sched': 0, + 'tcp': 0, 'total': 0}) + + # Largest 'total' value in dictionary above'. + overall_node_max = 0 + + # Ready-to-plot data series that hold totals across all nodes, + # corresponding to time_data. + total_grant_data = [] + total_unsched_data = [] + total_sched_data = [] + total_tcp_data = [] + + # Maximum values of sums across all nodes + max_grant = 0 + max_unsched = 0 + max_sched = 0 + max_tcp = 0 + max_total = 0 + + # Generate data to plot. Each iteration of this outer loop processes + # the interval data for all nodes at a given time. 
+ t = options.interval * math.floor(get_last_start()/options.interval) + end_time = get_first_end() + while t < end_time: + total_grant = 0 + total_unsched = 0 + total_sched = 0 + total_tcp = 0 + for node in nodes: + data = node_data[node] + interval = get_interval(node, t) + max = node_max[node] + + val = interval['q_homa_grant'] + sum = val + data['grant'].append(sum/1000) + if val > max['grant']: + max['grant'] = val + total_grant += val + + val = interval['q_homa_unsched'] + sum += val + data['unsched'].append(sum/1000) + if val > max['unsched']: + max['unsched'] = val + total_unsched += val + + val = interval['q_homa_sched'] + sum += val + data['sched'].append(sum/1000) + if val > max['sched']: + max['sched'] = val + total_sched += val + + val = interval['q_tcp'] + sum += val + data['tcp'].append(sum/1000) + if val > max['tcp']: + max['tcp'] = val + total_tcp += val + + if sum > max['total']: + max['total'] = sum + if sum > overall_node_max: + overall_node_max = sum + + total_grant_data.append(total_grant/1000) + if total_grant > max_grant: + max_grant = total_grant + + sum = total_grant + total_unsched + total_unsched_data.append(sum/1000) + if total_unsched > max_unsched: + max_unsched= total_unsched + + sum += total_sched + total_sched_data.append(sum/1000) + if total_sched > max_sched: + max_sched= total_sched + + sum += total_tcp + total_tcp_data.append(sum/1000) + if total_tcp > max_tcp: + max_tcp= total_tcp + + if sum > max_total: + max_total = sum + time_data.append(t) + t += options.interval + + # Print summary statistics + print('\nLargest observed queued incoming data (KB):') + print('Node: Name of node') + print('Total: Maximum total queued bytes for the node') + print('Grants: Maximum queued bytes from grant packets') + print('Unsched: Maximum queued bytes in unscheduled data packets') + print('Sched: Maximum queued bytes in scheduled data packets') + print('Tcp: Maximum queued bytes in TCP packets\n') + print('The Total line shows the maximum instantaneous sum across all ' + 'nodes.') + print('Node Total Grants Unsched Sched Tcp') + for node in nodes: + max = node_max[node] + print('%-10s %8d %8d %8d %8d %8d' % (node, max['total']/1000, + max['grant']/1000, max['unsched']/1000, max['sched']/1000, + max['tcp']/1000)) + print('Total %8d %8d %8d %8d %8d' % (max_total/1000, max_grant/1000, + max_unsched/1000, max_sched/1000, max_tcp/1000)) + + # Generate a stacked graph. The top plot contains cluster-wide totals; + # subsequent plots show data for each individual node. 
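[Editor's note: not part of the patch. A minimal version of the stacked step plot generated below, using made-up data; only the step/legend/savefig calls mirror the real code.]

import matplotlib.pyplot as plt

times = [0, 20, 40, 60]          # end-of-interval times (usecs)
grant = [0.5, 1.0, 0.8, 1.2]     # cumulative KB series, smallest first
unsched = [2.0, 3.5, 2.9, 4.1]   # includes the grant series

fig, ax = plt.subplots(figsize=[8, 2])
ax.step(times, grant, where='pre', label='Homa grants', color='#c00000')
ax.step(times, unsched, where='pre', label='Homa unscheduled data',
        color='#1f77b4')
ax.set_xlabel('Time (usec)')
ax.set_ylabel('Queued Data (KB)')
ax.legend(prop={'size': 9})
fig.savefig('qbytes_sketch.pdf', bbox_inches='tight')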
+ fig, axes = plt.subplots(nrows=len(nodes) + 1, ncols=1, sharex=False, + figsize=[8, (1 + len(nodes))*2]) + ax = axes[0] + ax.set_title("Total Across All Nodes", size=10) + x_min = get_last_start() + x_max = get_first_end() + self.init_axis(ax, x_min, x_max, max_total/1000) + ax.step(time_data, total_grant_data, where='pre', + label='Homa grants', color=color_red) + ax.step(time_data, total_unsched_data, where='pre', + label='Homa unscheduled data', color=color_blue) + ax.step(time_data, total_sched_data, where='pre', + label='Homa scheduled data', color=color_brown) + ax.step(time_data, total_tcp_data, where='pre', + label='TCP', color=color_green) + for i in range(len(nodes)): + node = nodes[i] + data = node_data[node] + ax = axes[i+1] + self.init_axis(ax, x_min, x_max, overall_node_max/1000) + ax.set_title(node, size=10) + ax.step(time_data, data['grant'], where='pre', color=color_red) + ax.step(time_data, data['unsched'], where='pre', color=color_blue) + ax.step(time_data, data['sched'], where='pre', color=color_brown) + ax.step(time_data, data['tcp'], where='pre', color=color_green) + fig.legend(loc='lower center', ncol=4, bbox_to_anchor=(0.5, -0.02), + frameon=False, prop={'size': 9}) + # plt.legend(loc="upper left", prop={'size': 9}) + plt.tight_layout() + plt.savefig("%s/qbytes.pdf" % (options.plot), bbox_inches='tight') + #------------------------------------------------ # Analyzer: qdelay #------------------------------------------------ @@ -6842,13 +7248,13 @@ def output(self): rx_node = pkt['rx_node'] qdelay = (gro - nic) - min_latency[tx_node][rx_node] if pkt_type == 'grant': - color = '#844F1A' + color = color_brown else: rpc = rpcs[pkt['id']] if rpc['out_length'] < 1000: - color = '#c00000' + color = color_red else: - color = '#1f77b4' + color = color_blue tx_delays[tx_node][0].append(nic) tx_delays[tx_node][1].append(qdelay) tx_delays[tx_node][2].append(color) @@ -6862,9 +7268,9 @@ def output(self): legend_handles = [ matplotlib.lines.Line2D([], [], color=c, marker='o', linestyle='None', markersize=8, label=label) - for c, label in [['#c00000', 'Data (messages < 1000B)'], - ['#1f77b4', 'Data (other messages)'], - ['#844F1A', 'Grants']] + for c, label in [[color_red, 'Data (messages < 1000B)'], + [color_blue, 'Data (other messages)'], + [color_brown, 'Grants']] ] x_min = get_last_start() x_max = get_first_end() @@ -8269,33 +8675,30 @@ class AnalyzeTemp: debugging. Consult the code to see what it does right now. 
""" def __init__(self, dispatcher): - if True: + if False: dispatcher.interest('AnalyzeRpcs') dispatcher.interest('AnalyzePackets') + self.stream_pkts = defaultdict(list) - def output(self): - global traces, options, packets, rpcs - print('\n-------------------') - print('Analyzer: temp') - print('-------------------\n') + def tt_gro_tcp(self, trace, t, core, saddr, sport, daddr, dport, sequence, + data_bytes, total, ack): + if data_bytes == 0: + return + stream = '%s:%d to %s:%d' % (saddr, sport, daddr, dport) + self.stream_pkts[stream].append([t, sequence, data_bytes]) - bytes = 0 - for pkt in packets.values(): - if pkt['retransmits']: - print('Packet with %d retransmissions: %s\n' % ( - len(pkt['retransmits']), pkt)) - for r in pkt['retransmits']: - if 'tso_length' in r: - bytes += r['tso_length'] - elif 'length' in pkt: - bytes += pkt['length'] - else: - print('Can\'t find length for preceding packet') - elapsed = 0 - for trace in traces.values(): - elapsed += trace['elapsed_time'] - print('Total elapsed time %.1f ms, retransmitted bytes %d (%.3f MB/sec)' - % (elapsed * 1e-3, bytes, bytes / elapsed)) + def output(self): + for stream in self.stream_pkts.keys(): + pkts = sorted(self.stream_pkts[stream], key=lambda t: t[1]) + for i in range(1, len(pkts)): + prev = pkts[i-1] + pkt = pkts[i] + print('%9.3f: stream %s, sequence %d, data_bytes %d' % + (pkt[0], stream, pkt[1], pkt[2])) + gap = pkt[1] - (prev[1] + prev[2]) + if gap == 0: + continue + print(' Gap of %d bytes in %s' % (gap, stream)) def output_snapshot(self): global packets, rpcs From 08068d9c3b3835c7f8fa7616830c6aa3d77530d2 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 3 Nov 2025 13:23:21 -0800 Subject: [PATCH 551/625] Fix bug in RCU usage in homa_qdisc RCU was being used when the thread could potentially block; switch to a mutext for this case. --- homa_qdisc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/homa_qdisc.c b/homa_qdisc.c index 88dd6c9f..ff4a5e85 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -1146,8 +1146,13 @@ void homa_qdisc_update_sysctl_deps(struct homa_qdisc_shared *qshared) if (qshared->homa_share > 100) qshared->homa_share = 100; - rcu_read_lock(); + /* Use a mutex rather than RCU to prevent qdev deletion while we + * traverse the list. This is more expensive, but RCU isn't safe + * because homa_qdev_update_sysctl may block (and efficiency isn't + * paramount here). 
+ */ + mutex_lock(&qshared->mutex); list_for_each_entry_rcu(qdev, &qshared->qdevs, links) homa_qdev_update_sysctl(qdev); - rcu_read_unlock(); + mutex_unlock(&qshared->mutex); } From 74051ce4377a24c7aa9cc7522f40de2833701295 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 3 Nov 2025 13:25:46 -0800 Subject: [PATCH 552/625] Introduce homa_get_offset function --- homa_devel.c | 8 ++------ homa_qdisc.c | 4 +--- homa_wire.h | 13 +++++++++++++ 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/homa_devel.c b/homa_devel.c index 197f1b3a..31e05869 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -171,9 +171,7 @@ char *homa_print_packet(struct sk_buff *skb, char *buffer, int buf_len) seg_length = homa_info->data_bytes; data_left = homa_info->data_bytes - seg_length; } - offset = ntohl(h->seg.offset); - if (offset == -1) - offset = ntohl(h->common.sequence); + offset = homa_get_offset(h); #ifndef __STRIP__ /* See strip.py */ used = homa_snprintf(buffer, buf_len, used, ", message_length %d, offset %d, data_length %d, incoming %d", @@ -322,9 +320,7 @@ char *homa_print_packet_short(struct sk_buff *skb, char *buffer, int buf_len) seg_length = homa_info->seg_length; data_left = homa_info->data_bytes - seg_length; } - offset = ntohl(h->seg.offset); - if (offset == -1) - offset = ntohl(h->common.sequence); + offset = homa_get_offset(h); pos = skb_transport_offset(skb) + sizeof(*h) + seg_length; used = homa_snprintf(buffer, buf_len, 0, "DATA%s %d@%d", diff --git a/homa_qdisc.c b/homa_qdisc.c index ff4a5e85..0854fc11 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -478,9 +478,7 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, */ h = (struct homa_data_hdr *)skb_transport_header(skb); - offset = ntohl(h->seg.offset); - if (offset == -1) - offset = ntohl(h->common.sequence); + offset = homa_get_offset(h); if (h->common.type != DATA || ntohl(h->message_length) < qshared->defer_min_bytes) { homa_qdisc_update_link_idle(qdev, pkt_len, -1); diff --git a/homa_wire.h b/homa_wire.h index 364c33ac..948be506 100644 --- a/homa_wire.h +++ b/homa_wire.h @@ -559,6 +559,19 @@ static inline void homa_set_hijack(struct homa_common_hdr *common) common->urgent = htons(HOMA_TCP_URGENT); common->doff = 0x50; } + +/** + * homa_get_offset() - Returns the offset within message of the first byte + * of data in a Homa DATA packet (the offset is stored in different places + * in different situations). + * @h: Header for DATA packet + * Return: See above + */ +static inline int homa_get_offset(struct homa_data_hdr *h) +{ + return (h->seg.offset != -1) ? ntohl(h->seg.offset) : + ntohl(h->common.sequence); +} #endif /* See strip.py */ #endif /* _HOMA_WIRE_H */ From 31c018b9f9e839855276f10fb39f71f95cf157e3 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 4 Nov 2025 14:59:30 -0800 Subject: [PATCH 553/625] Force IPv4 in update_linux to avoid delays on CloudLab nodes --- cloudlab/bin/update_linux | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cloudlab/bin/update_linux b/cloudlab/bin/update_linux index 97fe00c0..fe22b543 100755 --- a/cloudlab/bin/update_linux +++ b/cloudlab/bin/update_linux @@ -32,8 +32,12 @@ for ((i = $first ; i <= $last; i++)); do node=node$i echo echo $node - ssh $node 'rm -rf tmp; mkdir -p tmp tmp/boot' - rsync -rtv /boot/initrd.img-$v /boot/config-$v /boot/System.map-$v \ + + # Forcing IPv4 below helps on CloudLab nodes (otherwise there will be + # a 10-20 second delay for each node if Homa hasn't been installed; + # IPv6 may be tried first)
+ ssh -4 $node 'rm -rf tmp; mkdir -p tmp tmp/boot' + rsync --ipv4 -rtv /boot/initrd.img-$v /boot/config-$v /boot/System.map-$v \ /boot/vmlinuz-$v $node:tmp/boot/ - ssh $node "sudo cp -f tmp/boot/* /boot; sudo reboot" + ssh -4 $node "sudo cp -f tmp/boot/* /boot; sudo reboot" done From 98d3eacdb965dbe3a5f78be0c30cc81b01e134fc Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 4 Nov 2025 15:01:18 -0800 Subject: [PATCH 554/625] Various improvements to tthoma.py --- util/tthoma.py | 170 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 110 insertions(+), 60 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index 5b229282..5e193fa3 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -274,6 +274,12 @@ def __missing__(self, key): # tx_in_nic: Number of bytes of data that have been passed to the NIC # but not yet returned via the tx completion queue, as of the # end of the interval +# tx_in_nic2: Same as tx_in_nic except excludes bytes that have been +# received at target; provides a tighter (over)estimate of +# "bytes that are queued in the NIC but have not yet been +# transmitted" +# tx_in_nic_qx: Same as tx_in_nic2 except only counts bytes in a particular +# tx queue (given by the --tx-qid option, default 0). # tx_nic_rx: Number of bytes of data that have been received by the # destination but their packet buffers haven't been returned # from the NIC via the completion queue, as of the end of @@ -431,6 +437,12 @@ def dict_avg(data, key): return 0 return total / count +def div_safe(num, denom): + if denom != 0: + return num/denom + else: + return 0 + def list_avg(data, index): """ Given a list of lists, return the average of the index'th elements @@ -1807,7 +1819,7 @@ def __qdisc_xmit(self, trace, time, core, match, interests): patterns.append({ 'name': 'qdisc_xmit', 'regexp': 'homa_qdisc_pacer queuing homa data packet for id ([0-9]+), ' - 'offset ([0-9]+)' + 'offset ([-0-9]+)' }) def __tcp_xmit(self, trace, time, core, match, interests): @@ -2103,10 +2115,8 @@ def output(self): node, msgs, rate, liveFrac, avgLive, avg_gbps, avg_gbps/liveFrac), end='') print(' %5.2f (C%02d) %6.3f (C%02d) %6.3f (C%02d)' % ( - max_gbps, max_core, - max_rpcs/total_rpcs if total_rpcs != 0 else 0, - max_rpcs_core, - max_pending/total_pending if total_pending != 0 else 0, + max_gbps, max_core, div_safe(max_rpcs, total_rpcs), + max_rpcs_core, div_safe(max_pending, total_pending), max_pending_core)) print('\nOutgoing messages:') print('Node Msgs MsgRate LiveFrac AvgLive Gbps LiveGbps') @@ -4433,6 +4443,8 @@ def analyze(self): self.init_intervals() late_usecs = options.late + qid = options.tx_qid if options.tx_qid != None else 0 + # See if packets include NIC xmit times nic_data = False for pkt in packets.values(): @@ -4454,17 +4466,43 @@ def analyze(self): txmit = pkt['xmit2'] if 'xmit2' in pkt else None if 'nic' in pkt: tnic = pkt['nic'] + nic_start = tnic nic_interval = get_interval(tx_node, tnic) else: tnic = None - tfree = pkt['free_tx_skb'] if 'free_tx_skb' in pkt else None - tgro = pkt['gro'] if 'gro' in pkt else None + if tx_node != None: + nic_start = traces[tx_node]['first_time'] + if 'free_tx_skb' in pkt: + tfree = pkt['free_tx_skb'] + nic_end = tfree + nic_end2 = tfree + else: + tfree = None + nic_end = 1e20 + nic_end2 = 1e20 + if 'gro' in pkt: + tgro = pkt['gro'] + if tgro < nic_end2: + nic_end2 = tgro + else: + tgro = None # For most tx statistics, process only the overall TSO frame, # not the individual segments if ('tso_length' in pkt): tso_length = pkt['tso_length'] + if 
tx_node != None: + if nic_end < 1e20: + add_to_intervals(tx_node, nic_start, nic_end, + 'tx_in_nic', tso_length) + if nic_end2 < 1e20: + add_to_intervals(tx_node, nic_start, nic_end2, + 'tx_in_nic2', tso_length) + if 'tx_qid' in pkt and pkt['tx_qid'] == qid: + add_to_intervals(tx_node, nic_start, nic_end2, + 'tx_in_nic_qx', tso_length) + if txmit != None: interval = get_interval(tx_node, txmit) interval['tx_pkts'] += 1 @@ -4473,8 +4511,7 @@ def analyze(self): add_to_intervals(tx_node, txmit, pkt['nic'], 'tx_qdisc', tso_length) - if 'nic' in pkt: - tnic = pkt['nic'] + if tnic != None: nic_interval = get_interval(tx_node, tnic) node_xmits[tx_node].append([pkt['nic'], tso_length + data_overhead_bytes]) @@ -4487,19 +4524,13 @@ def analyze(self): if tfree != None: interval = get_interval(tx_node, tfree) interval['tx_free_bytes'] += tso_length - if 'nic' in pkt: - add_to_intervals(tx_node, tnic, tfree, 'tx_in_nic', - tso_length) + if tnic != None: delay = tfree - tnic if delay > nic_interval['tx_max_free']: nic_interval['tx_max_free'] = delay if (nic_interval['tx_min_free'] == 0) or (delay < nic_interval['tx_min_free']): nic_interval['tx_min_free'] = delay - else: - start = traces[tx_node]['first_time'] - add_to_intervals(tx_node, start, tfree, 'tx_in_nic', - tso_length) if tgro != None: add_to_intervals(tx_node, tgro, tfree, 'tx_nic_rx', tso_length) @@ -6018,10 +6049,18 @@ def output(self): f.write('# Gro: Rate at which data bytes reached GRO on receivers\n') f.write('# Free: Rate at which packet buffers were freed ' 'after transmission complete\n') - f.write('# InNIC KB of data that has been queued in the NIC ' + f.write('# InNic KB of data that has been queued in the NIC ' 'but not yet freed\n') + f.write('# InNic2 KB of data that has been queued in the NIC ' + 'and has neither been\n') + f.write(' freed nor received at the destination\n') + f.write('# InNicQ Same as InNic2 except only counts bytes in ' + 'tx queue %d (use\n' % ( + options.tx_qid if options.tx_qid != None else 0)) + f.write(' the --tx-qid option to select a different ' + 'queue)\n') - f.write('\n# Time Tx ToNic Gro Free InNIC\n') + f.write('\n# Time Tx ToNic Gro Free InNic InNic2 InNicQ\n') node_intervals = intervals[node] bytes_to_gbps = 8 / (options.interval * 5 * 1000) @@ -6036,13 +6075,15 @@ def output(self): gro_bytes += interval['tx_gro_bytes'] free_bytes += interval['tx_free_bytes'] interval = node_intervals[i] - f.write('%8.1f %6.1f %6.1f %6.1f %6.1f %5d\n' % + f.write('%8.1f %6.1f %6.1f %6.1f %6.1f %5d %6d %6d\n' % (interval['time'], tx_bytes * bytes_to_gbps, to_nic_bytes * bytes_to_gbps, gro_bytes * bytes_to_gbps, free_bytes * bytes_to_gbps, - interval['tx_in_nic'] * 1e-3)) + interval['tx_in_nic'] * 1e-3, + interval['tx_in_nic2'] * 1e-3, + interval['tx_in_nic_qx'] * 1e-3)) f.close() #------------------------------------------------ @@ -6570,12 +6611,10 @@ def tt_nic_tcp(self, trace, t, core, saddr, sport, daddr, dport, sequence, pkt_bytes = gso_size tcp_pkt = get_tcp_packet(saddr, sport, daddr, dport, sequence, pkt_bytes, ack) - # if (saddr == '0x0a000105' and sport == 36222 and daddr == '0x0a000103' and dport == 5000): - # print('%9.3f: tcp_pkt: %s' % (t, tcp_pkt)) if 'nic' in tcp_pkt and data_bytes > 0: - print('TCP packet retransmission (nic %.3f and %.3f), node %s' % - (tcp_pkt['nic'], t, node), file=sys.stderr) - return + # Retransmitted packet: retain only the last transmission. 
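[Editor's note: not part of the patch. The keep-last rule applied in the lines below, shown in isolation; note_nic_xmit is a hypothetical name and pkt mirrors a tcp_packets entry.]

def note_nic_xmit(pkt, t):
    # Record a NIC handoff at time t. If the packet already carries a
    # 'gro' time from an earlier transmission, discard it so that only
    # the final transmission is analyzed.
    if 'nic' in pkt and 'gro' in pkt and pkt['gro'] < t:
        del pkt['gro']
    pkt['nic'] = t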
+ if 'gro' in tcp_pkt and tcp_pkt['gro'] < t: + del tcp_pkt['gro'] tcp_pkt['nic'] = t tcp_pkt['tx_node'] = node if bytes_left == data_bytes: @@ -6593,9 +6632,10 @@ def tt_gro_tcp(self, trace, t, core, saddr, sport, daddr, dport, sequence, tcp_pkt = get_tcp_packet(saddr, sport, daddr, dport, sequence, data_bytes, ack) node = trace['node'] - if 'gro' in tcp_pkt and data_bytes > 0: - print('TCP packet retransmission (gro %.3f and %.3f), node %s' % - (tcp_pkt['gro'], t, node), file=sys.stderr) + if 'nic' in tcp_pkt and t < tcp_pkt['nic']: + # This packet was apparently retransmitted; we want to retain + # only the last transmission, but this event appears to be from + # an earlier transmission; ignore it. return tcp_pkt['gro'] = t tcp_pkt['total_length'] = total @@ -8675,30 +8715,38 @@ class AnalyzeTemp: debugging. Consult the code to see what it does right now. """ def __init__(self, dispatcher): - if False: - dispatcher.interest('AnalyzeRpcs') - dispatcher.interest('AnalyzePackets') - self.stream_pkts = defaultdict(list) - - def tt_gro_tcp(self, trace, t, core, saddr, sport, daddr, dport, sequence, - data_bytes, total, ack): - if data_bytes == 0: - return - stream = '%s:%d to %s:%d' % (saddr, sport, daddr, dport) - self.stream_pkts[stream].append([t, sequence, data_bytes]) + dispatcher.interest('AnalyzeRpcs') + dispatcher.interest('AnalyzePackets') def output(self): - for stream in self.stream_pkts.keys(): - pkts = sorted(self.stream_pkts[stream], key=lambda t: t[1]) - for i in range(1, len(pkts)): - prev = pkts[i-1] - pkt = pkts[i] - print('%9.3f: stream %s, sequence %d, data_bytes %d' % - (pkt[0], stream, pkt[1], pkt[2])) - gap = pkt[1] - (prev[1] + prev[2]) - if gap == 0: - continue - print(' Gap of %d bytes in %s' % (gap, stream)) + global rpcs + + for rpc in rpcs.values(): + # print('RPC id %d: %s\n' % (rpc['id'], rpc)) + if not 'sendmsg' in rpc or not 'recvmsg_done' in rpc: + continue + if rpc['out_length'] < 1000 or rpc['out_length'] > 1400: + continue + if rpc['id'] & 1: + continue + rtt = rpc['recvmsg_done'] - rpc['sendmsg'] + if rtt < 150: + continue + peer = rpc['peer'] + print('RPC id %d (%s -> %s) took %.1f usecs, length %d, %.3f -> %.3f' % + (rpc['id'], rpc['node'], + peer_nodes[peer] if peer in peer_nodes else peer, + rtt, rpc['out_length'], rpc['sendmsg'], rpc['recvmsg_done'])) + if rpc['send_data_pkts']: + pkt = rpc['send_data_pkts'][0] + if 'xmit' in pkt and 'nic' in pkt: + print(' Request packet xmit time %.1f usecs (xmit %.3f, nic %.3f)' % + (pkt['nic'] - pkt['xmit'], pkt['xmit'], pkt['nic'])) + if rpc['softirq_data_pkts']: + pkt = rpc['softirq_data_pkts'][0] + if 'xmit' in pkt and 'nic' in pkt: + print(' Response packet xmit time %.1f usecs (xmit %.3f, nic %.3f)' % + (pkt['nic'] - pkt['xmit'], pkt['xmit'], pkt['nic'])) def output_snapshot(self): global packets, rpcs @@ -9239,6 +9287,7 @@ def output(self): f.write('# Generated at %s.\n' % (time.strftime('%I:%M %p on %m/%d/%Y'))) f.write('# Data packets transmitted from %s:\n' % (node)) + f.write('# Dest: Node to which packet was sent\n') f.write('# Xmit: Time when packet was passed to ip*xmit\n') f.write('# Qdisc: Time when homa_qdisc requeued packet after ' 'deferral, if any\n') @@ -9256,8 +9305,8 @@ def output(self): f.write('# Rx: Number of times segments in the packet were ' 'retransmitted\n\n') - f.write('# Xmit Qdisc RpcId Offset Length Qid') - f.write(' Nic NDelay MaxGro GDelay') + f.write('# Dest Xmit Qdisc RpcId Offset Length') + f.write(' Qid Nic NDelay MaxGro GDelay') f.write(' Free FDelay Rx\n') for pkt in
pkts: xmit = pkt['xmit'] @@ -9328,16 +9377,16 @@ def output(self): qid_slow_bytes[qid] += length qid_total_bytes[qid] += length - - line = '%10.3f %10s %10d %6d %6d %3s' % (xmit, qdisc_string, - pkt['id'], pkt['offset'], pkt['tso_length'], - qid_string) + line = '%-10s %10.3f %10s %10d %6d %6d' % ( + pkt['rx_node'] if 'rx_node' in pkt else "", + xmit, qdisc_string, pkt['id'], pkt['offset'], + pkt['tso_length']) nic_delay_string = '' if (nic_delay != None): nic_delay_string = '%.1f' % (nic_delay) - line += ' %10s %7s %10s %7s' % (print_if(nic, '%.3f'), - nic_delay_string, print_if(max_gro, '%.3f'), - gro_string) + line += ' %3s %10s %7s %10s %7s' % (qid_string, + print_if(nic, '%.3f'), nic_delay_string, + print_if(max_gro, '%.3f'), gro_string) free_delay_string = '' if (nic != None) and (free != None): free_delay_string = '%.1f' % (free - nic) @@ -9384,7 +9433,8 @@ def print_type(delays): node_info += '%-10s %5d %6.1f %5.2f %s %s %s\n' % ( node, total_pkts, 1e-3*sum(qid_backlog.values())/total_time, - sum(qid_slow_bytes.values())/sum(qid_total_bytes.values()), + div_safe(sum(qid_slow_bytes.values()), + sum(qid_total_bytes.values())), print_type(totals['nic']), print_type(totals['gro']), print_type(totals['free'])) From 6ecacd58fcea61913e3e7b4c7eaee2cde3d01a88 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 5 Nov 2025 09:44:29 -0800 Subject: [PATCH 555/625] Add --verbose option to install_homa and config --- cloudlab/bin/config | 48 +++++++++++++++++++++++---------------- cloudlab/bin/install_homa | 32 +++++++++++++++++--------- 2 files changed, 49 insertions(+), 31 deletions(-) diff --git a/cloudlab/bin/config b/cloudlab/bin/config index 388a46d7..ba8999f0 100755 --- a/cloudlab/bin/config +++ b/cloudlab/bin/config @@ -48,6 +48,9 @@ num_nodes = 0 # Cached result of get_node_names. node_names = [] +# Set from command line option to request detailed output. +verbose = False + # Contains information from /proc/cpuinfo with one entry for each # "processor" in /proc/cpuinfo. 
The entry is a dictionary with the # following entries (same names as in /proc/cpuinfo): @@ -138,8 +141,7 @@ def get_interfaces(): if interface: return [interface, vlan] available = "" - for line in exec_cmd(["ifconfig"], stdout=subprocess.PIPE, - encoding="utf-8", check=True).stdout.splitlines(): + for line in exec_cmd(["ifconfig"], check=True).stdout.splitlines(): match = re.match('^([a-z0-9]*):', line) if match: current = match.group(1) @@ -173,9 +175,7 @@ def get_link_speed(): nic = get_interfaces()[0] num_channels = -1 - for line in exec_cmd(["ethtool", nic], stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, encoding="utf-8", - check=True).stdout.splitlines(): + for line in exec_cmd(["ethtool", nic], check=True).stdout.splitlines(): match = re.match('.*Speed: ([0-9]+)Mb/s', line) if match: link_mbps = int(match.group(1)) @@ -215,7 +215,7 @@ def get_nic_type(): if nic_type != None: return nic_type info = exec_cmd(["sudo", "ethtool", "-i", get_interfaces()[0]], - stdout=subprocess.PIPE, encoding="utf-8", check=True).stdout + check=True).stdout match = re.search(r'.*driver:\s*(\S+)', info, re.MULTILINE) if not match: raise Exception("Couldn't identify NIC type (no 'driver' info in " @@ -232,8 +232,7 @@ def get_node_num(): global node_num if node_num != None: return node_num - hostname = exec_cmd(["hostname"], stdout=subprocess.PIPE, - encoding="utf-8", check=True).stdout + hostname = exec_cmd(["hostname"], check=True).stdout match = re.match(r'node([0-9]+)\.', hostname) if not match: raise Exception("Couldn't figure out node number for this node") @@ -314,7 +313,6 @@ def get_qdisc_config(): result['children'] = [] nic = get_interfaces()[0] for line in exec_cmd(['tc', 'qdisc', 'show', 'dev', nic], - stdout=subprocess.PIPE, encoding='utf-8', check=True).stdout.splitlines(): match = re.match('qdisc mq ([0-9a-f]+:[0-9a-f]*) root', line) if match: @@ -338,8 +336,8 @@ def print_rss(): nic = get_interfaces()[0] irqs = get_nic_irqs() num_channels = -1 - for line in exec_cmd(["ethtool", "-l", nic], stdout=subprocess.PIPE, - encoding="utf-8", check=True).stdout.splitlines(): + for line in exec_cmd(["ethtool", "-l", nic], + check=True).stdout.splitlines(): match = re.match('Combined:[^0-9]+([0-9]+)', line) if match: num_channels = int(match.group(1)) @@ -428,12 +426,21 @@ def add_ipv6_to_etc_hosts(num_hosts): def exec_cmd(*args, **kwargs): """ - This method is a wrapper around subprocess.run, which logs the command - before executing it. The arguments are the same as those to - subprocess.run. + This method is a wrapper around subprocess.run, which arranges for the + output to be captured and also performs logging. The arguments are the + same as those for subprocess.run. """ - print("%% %s" % (" ".join(args[0]))) - return subprocess.run(*args, **kwargs) + global verbose + + if verbose: + print("%% %s" % (" ".join(args[0]))) + try: + return subprocess.run(*args, text=True, capture_output=True, **kwargs) + except subprocess.CalledProcessError as e: + if not verbose: + print("%% %s" % (" ".join(args[0]))) + print(e.stderr) + raise def set_sysctl(name, value): """ @@ -461,8 +468,7 @@ def config_homa(mod): # See if Homa supports sysctls (if it has been stripped down for Linux # upstreaming, it might not). 
- result = exec_cmd(["sysctl", ".net.homa.num_priorities"], - capture_output=True) + result = exec_cmd(["sysctl", ".net.homa.num_priorities"], check=False) if result.returncode != 0: global sysctl_avl print("Homa doesn't appear to support sysctls") @@ -515,8 +521,7 @@ def config_ipv6(num_hosts, vlan): """ vlan = get_interfaces()[1] # Configure ifconfig and route if not already done. - if "inet6 fd00::" in exec_cmd(["ifconfig", vlan], - stdout=subprocess.PIPE, encoding="utf-8", check=True).stdout: + if "inet6 fd00::" in exec_cmd(["ifconfig", vlan], check=True).stdout: print("IPv6 already configured") else: print("Configuring IPv6:") @@ -762,6 +767,7 @@ def print_help(): print("Usage: config feature feature ...") print("\nEach feature may be one of the following:") print(" --help Print this help text and exit") + print(" --verbose Print details of commands executed and results") print(" default Normal configuration for Homa: equivalent to") print(" 'reset_qdisc homa ~/bin/homa.ko ipv6 nic power") print(" rps'") @@ -813,6 +819,8 @@ while i < len(sys.argv): if arg == "--help": print_help() exit(0) + elif arg == "--verbose": + verbose = True elif arg == "default": config_reset_qdisc() config_homa("~/bin/homa.ko") diff --git a/cloudlab/bin/install_homa b/cloudlab/bin/install_homa index c457a332..6749cda2 100755 --- a/cloudlab/bin/install_homa +++ b/cloudlab/bin/install_homa @@ -7,7 +7,7 @@ # or more target machines; it also loads the Homa kernel module. # # Usage: -# install_homa [--net-next] num_nodes [first] +# install_homa [--net-next] [--verbose] num_nodes [first] # # The "num_nodes" arguments indicates how many servers should be updated. # The "first" argument is optional; it is an integer identifying the @@ -23,10 +23,20 @@ root=~/homaModule set -e homa_ko=$root/homa.ko -if [ $1 = "--net-next" ]; then - homa_ko=/netnext/net-next/net/homa/homa.ko - shift -fi +verbose="" +rsync_switches="-rt" +while true; do + if [ $1 = "--net-next" ]; then + homa_ko=/netnext/net-next/net/homa/homa.ko + shift + elif [ $1 = "--verbose" ]; then + verbose=" --verbose" + rsync_switches="-rtv" + shift + else + break + fi +done if [ $# -eq 2 ]; then first=$2 elif [ $# -eq 1 ]; then @@ -41,11 +51,11 @@ for ((i = $first ; i <= $last; i++)); do node=node$i echo echo '*** Installing Homa on' $node '***' - rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" -rtv ~/.bashrc ~/.bash_profile ~/.gdbinit $node: - rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" -rtv --exclude __pycache__ ~/bin/ $node:bin/ - rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" -rtv $homa_ko $root/util/cp_node $root/util/homa_prio $root/util/server $root/util/homa_test $root/util/*.py $node:bin/ - ssh -4 $node 'sudo sysctl .kernel.printk="5 4 1 7"' - ssh -4 $node 'echo $PATH' - ssh -4 $node 'config default' + rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" $rsync_switches ~/.bashrc ~/.bash_profile ~/.gdbinit $node: + rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" $rsync_switches --exclude __pycache__ ~/bin/ $node:bin/ + rsync --ipv4 -e "ssh -4 -o StrictHostKeyChecking=no" $rsync_switches $homa_ko $root/util/cp_node $root/util/homa_prio $root/util/server $root/util/homa_test $root/util/*.py $node:bin/ + ssh -4 $node 'sudo sysctl .kernel.printk="5 4 1 7" > /dev/null' + # ssh -4 $node 'echo $PATH' + ssh -4 $node "config$verbose default" ssh -4 $node 'if ! 
grep -q mitigations=off /proc/cmdline; then echo WARNING: Meltdown/Spectre mitigations have not been disabled!; fi' done \ No newline at end of file From 58508f6a58bf6d25888e74b7710771caf9e8f0b5 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 6 Nov 2025 10:38:48 -0800 Subject: [PATCH 556/625] Separate print_pkts from the txpkts analyzer in tthoma.py Also add plot_ccdf function. --- util/tthoma.py | 346 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 240 insertions(+), 106 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index 5e193fa3..38ef0519 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -12,6 +12,7 @@ from functools import cmp_to_key from glob import glob import itertools +from io import StringIO import matplotlib import matplotlib.pyplot as plt from optparse import OptionParser @@ -278,6 +279,7 @@ def __missing__(self, key): # received at target; provides a tighter (over)estimate of # "bytes that are queued in the NIC but have not yet been # transmitted" +# pkts_in_nic2: The number of packets associated with tx_in_nic2 # tx_in_nic_qx: Same as tx_in_nic2 except only counts bytes in a particular # tx queue (given by the --tx-qid option, default 0). # tx_nic_rx: Number of bytes of data that have been received by the @@ -812,6 +814,37 @@ def percentile(data, pct, format, na): def pkt_id(id, offset): return '%d:%d' % (id, offset) +def plot_ccdf(data, file, fig_size=(8,6), title=None, size=10, + y_label="Cumulative Fraction", x_label="Delay (usecs)"): + """ + Generate a complementary CDF with log-scale y-axis. + + data: X-values for the graph, in ascending order + file: File in which to write the plot. + """ + + plt.figure(figsize=fig_size) + if title != None: + plt.title(title, size=size) + if x_label: + plt.xlabel(x_label, size=size) + plt.yscale("log") + plt.ylim(1e-3, 1.0) + if y_label: + plt.ylabel(y_label, size=size) + plt.tick_params(top=True, which="both", direction="in", labelsize=size, + length=5) + plt.grid(which="major", axis="x") + plt.grid(which="major", axis="y") + l = len(data) + y = [(l - i)/l for i in range(0, l)] + plt.plot(data, y, color=color_blue) + left, right = plt.xlim() + print('left: %s, right: %s' % (left, right)) + plt.xlim(left=0, right=right) + plt.tight_layout() + plt.savefig(file) + def print_analyzer_help(): """ Prints out documentation for all of the analyzers. @@ -863,6 +896,85 @@ def print_if(value, fmt, modifier=None): return fmt % (value) return '' +def print_pkts(pkts, header=True): + """ + Returns a string containing one line for each packet in pkts, which + contains various useful information about the packet. If header is True + then the string also includes initial text describing the fields that + are printed on each line. 
+ """ + + buf = StringIO() + buf.write('# Dest: Node to which packet was sent\n') + buf.write('# Xmit: Time when packet was passed to ip*xmit\n') + buf.write('# Qdisc: Time when homa_qdisc requeued packet after ' + 'deferral, if any\n') + buf.write('# RpcId: Identifier of packet\'s RPC\n') + buf.write('# Offset: Offset of packet within message\n') + buf.write('# Length: Size of packet (before segmentation)\n') + buf.write('# Qid: Transmit queue on which packet was sent\n') + buf.write('# Nic: Time when packet was queued for NIC\n') + buf.write('# NDelay: Nic - (later of Xmit and Qdisc)\n') + buf.write('# MaxGro: Time when last fragment of packet was ' + 'received by GRO\n') + buf.write('# GDelay: MaxGro - Nic\n') + buf.write('# Free: Time when sk_buff was released on sender\n') + buf.write('# FDelay: Free - Nic\n') + buf.write('# Rx: Number of times segments in the packet were ' + 'retransmitted\n\n') + buf.write('# Dest Xmit Qdisc RpcId Offset Length') + buf.write(' Qid Nic NDelay MaxGro GDelay') + buf.write(' Free FDelay Rx\n') + for pkt in pkts: + xmit = pkt['xmit'] + if 'qdisc_xmit' in pkt: + qdisc = pkt['qdisc_xmit'] + qdisc_string = '%10.3f' % (qdisc) + else: + qdisc = None + qdisc_string = '' + nic_delay = None + if 'nic' in pkt: + nic = pkt['nic'] + if qdisc != None: + nic_delay = nic - qdisc + elif xmit != None: + nic_delay = nic - xmit + else: + nic = None + gro = pkt['gro'] if 'gro' in pkt else None + free = pkt['free_tx_skb'] if 'free_tx_skb' in pkt else None + qid = pkt['tx_qid'] if 'tx_qid' in pkt else None + length = pkt['tso_length'] + + rx = len(pkt['retransmits']) + if 'segments' in pkt: + for seg in pkt['segments']: + rx += len(seg['retransmits']) + rx_msg = str(rx) if rx > 0 else "" + + line = '%-10s %10.3f %10s %10d %6d %6d' % ( + pkt['rx_node'] if 'rx_node' in pkt else "", + xmit, qdisc_string, pkt['id'], pkt['offset'], + pkt['tso_length']) + nic_delay_string = '' + if nic_delay != None: + nic_delay_string = '%.1f' % (nic_delay) + gro_delay_string = '' + if gro != None and nic != None: + gro_delay_string = '%.1f' % (gro - nic) + line += ' %3s %10s %7s %10s %7s' % (print_if(qid, '%d'), + print_if(nic, '%.3f'), nic_delay_string, + print_if(gro, '%.3f'), gro_delay_string) + free_delay_string = '' + if (nic != None) and (free != None): + free_delay_string = '%.1f' % (free - nic) + line += ' %10s %7s %2s' % (print_if(free, '%.3f'), + free_delay_string, rx_msg) + buf.write(line.rstrip()) + buf.write('\n') + return buf.getvalue() + def require_options(analyzer, *args): """ For each argument, ensures that the associated option has been specified; @@ -4499,6 +4611,8 @@ def analyze(self): if nic_end2 < 1e20: add_to_intervals(tx_node, nic_start, nic_end2, 'tx_in_nic2', tso_length) + add_to_intervals(tx_node, nic_start, nic_end2, + 'pkts_in_nic2', 1) if 'tx_qid' in pkt and pkt['tx_qid'] == qid: add_to_intervals(tx_node, nic_start, nic_end2, 'tx_in_nic_qx', tso_length) @@ -6059,8 +6173,9 @@ def output(self): options.tx_qid if options.tx_qid != None else 0)) f.write(' the --tx-qid option to select a different ' 'queue)\n') + f.write('# NicPkts Number of packets associated with InNic2') - f.write('\n# Time Tx ToNic Gro Free InNic InNic2 InNicQ\n') + f.write('\n# Time Tx ToNic Gro Free InNic InNic2 InNicQ NicPkts\n') node_intervals = intervals[node] bytes_to_gbps = 8 / (options.interval * 5 * 1000) @@ -6075,7 +6190,7 @@ def output(self): gro_bytes += interval['tx_gro_bytes'] free_bytes += interval['tx_free_bytes'] interval = node_intervals[i] - f.write('%8.1f %6.1f %6.1f %6.1f %6.1f 
%5d %6d %6d\n' % + f.write('%8.1f %6.1f %6.1f %6.1f %6.1f %5d %6d %6d %6d\n' % (interval['time'], tx_bytes * bytes_to_gbps, to_nic_bytes * bytes_to_gbps, @@ -6083,7 +6198,8 @@ def output(self): free_bytes * bytes_to_gbps, interval['tx_in_nic'] * 1e-3, interval['tx_in_nic2'] * 1e-3, - interval['tx_in_nic_qx'] * 1e-3)) + interval['tx_in_nic_qx'] * 1e-3, + interval['pkts_in_nic2'])) f.close() #------------------------------------------------ @@ -6533,6 +6649,8 @@ def tt_send_data(self, trace, t, core, id, offset, length): p['tso_length'] = length else: p['retransmits'][-1]['tso_length'] = length + if id == 202753545: + print("tt_send_data got packet: %s" % (p)) def tt_pacer_xmit(self, trace, t, core, id, offset, port, bytes_left): global packets @@ -8718,9 +8836,41 @@ def __init__(self, dispatcher): dispatcher.interest('AnalyzeRpcs') dispatcher.interest('AnalyzePackets') + def output_delays(self): + global packets, options, rpcs + + delays = [] + + for pkt in packets.values(): + if not 'nic' in pkt or not 'gro' in pkt: + continue + if options.node != None and pkt['tx_node'] != options.node: + continue + if not pkt['id'] in rpcs: + continue + rpc = rpcs[pkt['id']] + if not 'out_length' in rpc: + continue + length = rpc['out_length'] + if length <= 1000 or length > 1400: + continue + delays.append(pkt['gro'] - pkt['nic']) + if not delays: + print('No packets matched!') + return + delays.sort() + plot_ccdf(delays, 'temp_delays.pdf') + print('%d data points, P50 %.1f P90 %.1f P99 %.1f max %.1f' % ( + len(delays), delays[50*len(delays)//100], + delays[90*len(delays)//100], + delays[99*len(delays)//100], delays[-1])) + def output(self): - global rpcs + global packets, rpcs + matches = [] + max_rpc = None + max_rtt = 0 for rpc in rpcs.values(): # print('RPC id %d: %s\n' % (rpc['id'], rpc)) if not 'sendmsg' in rpc or not 'recvmsg_done' in rpc: @@ -8730,23 +8880,59 @@ def output(self): if rpc['id'] & 1: continue rtt = rpc['recvmsg_done'] - rpc['sendmsg'] - if rtt < 150: - continue + if rtt > max_rtt: + max_rpc = rpc + max_rtt = rtt + if rtt >= 150: + matches.append(rpc) + if not matches and max_rpc != None: + matches = [max_rpc] + for rpc in matches: peer = rpc['peer'] + rtt = rpc['recvmsg_done'] - rpc['sendmsg'] print('RPC id %d (%s -> %s) took %.1f usecs, length %d, %.3f -> %.3f' % (rpc['id'], rpc['node'], peer_nodes[peer] if peer in peer_nodes else peer, rtt, rpc['out_length'], rpc['sendmsg'], rpc['recvmsg_done'])) if rpc['send_data_pkts']: pkt = rpc['send_data_pkts'][0] - if 'xmit' in pkt and 'nic' in pkt: - print(' Request packet xmit time %.1f usecs (xmit %.3f, nic %.3f)' % - (pkt['nic'] - pkt['xmit'], pkt['xmit'], pkt['nic'])) + if 'nic' in pkt and 'gro' in pkt: + print(' Request packet network time %.1f usecs (nic %.3f, gro %.3f)' % + (pkt['gro'] - pkt['nic'], pkt['nic'], pkt['gro'])) if rpc['softirq_data_pkts']: pkt = rpc['softirq_data_pkts'][0] - if 'xmit' in pkt and 'nic' in pkt: - print(' Response packet xmit time %.1f usecs (xmit %.3f, nic %.3f)' % - (pkt['nic'] - pkt['xmit'], pkt['xmit'], pkt['nic'])) + if 'nic' in pkt and 'gro' in pkt: + print(' Response packet network time %.1f usecs (nic %.3f, gro %.3f)' % + (pkt['gro'] - pkt['nic'], pkt['nic'], pkt['gro'])) + + max_free_delay = 0 + max_pkt = None + max_gro_free = 0 + max_gro_free_pkt = None + for pkt in packets.values(): + if not 'nic' in pkt or not 'free_tx_skb' in pkt: + continue + if not 'tso_length' in pkt or not 'gro' in pkt: + continue + if 'tx_qid' in pkt and pkt['tx_qid'] <= 1: + delay = min(pkt['free_tx_skb'], pkt['gro']) - 
pkt['nic'] + if delay > max_free_delay: + max_free_delay = delay + max_pkt = pkt + delay = pkt['free_tx_skb'] - pkt['gro'] + if delay > max_gro_free: + max_gro_free = delay + max_gro_free_pkt = pkt + # print("New max_gro_free_pkt: %s" % (pkt)) + print('\nMax NIC delay: %.1f usecs, id %d, offset %d, node %s, nic %.3f, free %.3f, gro %.3f' % + (max_free_delay, max_pkt['id'], max_pkt['offset'], + max_pkt['tx_node'], max_pkt['nic'], max_pkt['free_tx_skb'], + max_pkt['gro'])) + print('\nMax GRO->free delay: %.1f usecs, id %d, offset %d, node %s, nic %.3f, free %.3f, gro %.3f' % + (max_gro_free, max_gro_free_pkt['id'], + max_gro_free_pkt['offset'], max_gro_free_pkt['tx_node'], + max_gro_free_pkt['nic'], max_gro_free_pkt['free_tx_skb'], + max_gro_free_pkt['gro'])) def output_snapshot(self): global packets, rpcs @@ -9138,12 +9324,12 @@ def output(self): class AnalyzeTxpkts: """ Generates one data file for each node showing information about every - data packet transmitted from that node, in time order. Also generates - aggregate statistics for each tx queue on each node. If either --node or + data packet transmitted from that node, in time order. If either --node or --tx-qid is specified, only packets matching those options will be considered. Packets will normally be sorted by the 'Xmit' column, but the --sort option can be used to specify a different column to use for sorting - ('Xmit', 'Nic', 'MaxGro', or 'Free'). + ('Xmit', 'Nic', 'MaxGro', or 'Free'). Also generates aggregate statistics + for each tx queue on each node. """ def __init__(self, dispatcher): @@ -9224,17 +9410,6 @@ def output(self): if (options.node != None) and (node != options.node): continue - # Create a data file for this node with packets in time order - # (or whatever order was requested on the command line). - pkts = sorted(node_pkts[node], key = lambda pkt : pkt['xmit']) - if sort_key == 'gro': - pkts = sorted(pkts, key = lambda pkt : get_max_gro(pkt)) - elif sort_key != 'xmit': - pkts = sorted(pkts, key = lambda pkt : - pkt[sort_key] if sort_key in pkt else 1e20) - if len(pkts) == 0: - continue - # Tx queue number -> dictionary mapping from delay type to a list # of delays of the given type on the given transmit queue. 
# Delay types currently used: @@ -9282,55 +9457,25 @@ def output(self): total_pkts = 0 - f = open('%s/txpkts_%s.dat' % (options.data, node), 'w') - f.write('# Node: %s\n' % (node)) - f.write('# Generated at %s.\n' % - (time.strftime('%I:%M %p on %m/%d/%Y'))) - f.write('# Data packets transmitted from %s:\n' % (node)) - f.write('# Dest: Node to which packet was sent\n') - f.write('# Xmit: Time when packet was passed to ip*xmit\n') - f.write('# Qdisc: Time when homa_qdisc requeued packet after ' - 'deferral, if any\n') - f.write('# RpcId: Identifier of packet\'s RPC\n') - f.write('# Offset: Offset of packet within message\n') - f.write('# Length: Size of packet (before segmentation)\n') - f.write('# Qid: Transmit queue on which packet was sent\n') - f.write('# Nic: Time when packet was queued for NIC\n') - f.write('# NDelay: Nic - (later of Xmit and Qdisc)\n') - f.write('# MaxGro: Time when last fragment of packet was ' - 'received by GRO\n') - f.write('# GDelay: MaxGro - Nic\n') - f.write('# Free: Time when sk_buff was released on sender\n') - f.write('# FDelay: Free - Nic\n') - f.write('# Rx: Number of times segments in the packet were ' - 'retransmitted\n\n') - - f.write('# Dest Xmit Qdisc RpcId Offset Length') - f.write(' Qid Nic NDelay MaxGro GDelay') - f.write(' Free FDelay Rx\n') - for pkt in pkts: + # Select packets to print for this node, plus gather statistics. + pkts = [] + for pkt in node_pkts[node]: xmit = pkt['xmit'] - if 'qdisc_xmit' in pkt: - qdisc = pkt['qdisc_xmit'] - qdisc_string = '%10.3f' % (qdisc) - else: - qdisc = None - qdisc_string = '' + gro = pkt['gro'] if 'gro' in pkt else None + free = pkt['free_tx_skb'] if 'free_tx_skb' in pkt else None + nic = pkt['nic'] if 'nic' in pkt else None + qdisc = pkt['qdisc_xmit'] if 'qdisc_xmit' in pkt else None + qid = pkt['tx_qid'] if 'tx_qid' in pkt else None + length = pkt['tso_length'] + nic_delay = None - if 'nic' in pkt: - nic = pkt['nic'] + if nic != None: if qdisc != None: nic_delay = nic - qdisc elif xmit != None: nic_delay = nic - xmit - else: - nic = None - max_gro = get_max_gro(pkt) - free = pkt['free_tx_skb'] if 'free_tx_skb' in pkt else None - length = pkt['tso_length'] - if 'tx_qid' in pkt: - qid = pkt['tx_qid'] + if qid != None: qid_tsos[qid] += 1 segs = 1 if 'segments' in pkt: @@ -9340,32 +9485,22 @@ def output(self): qid_bytes[qid] += length if 'pacer' in pkt: qid_pacer_tsos[qid] += 1 - if 'qdisc_xmit' in pkt: + if qdisc != None: qid_qdisc_tsos[qid] += 1 if 'tx_queue' in pkt: qid_tx_queue[qid] = pkt['tx_queue'] - qid_string = str(qid) - if (options.tx_qid != None) and (qid != options.tx_qid): - continue + if (options.tx_qid == None) or (qid == options.tx_qid): + pkts.append(pkt) else: - if options.tx_qid != None: - continue - qid = None - qid_string = '' + if options.tx_qid == None: + pkts.append(pkt) total_pkts += 1 - rx = len(pkt['retransmits']) - if 'segments' in pkt: - for seg in pkt['segments']: - rx += len(seg['retransmits']) - rx_msg = str(rx) if rx > 0 else "" - - gro_string = "" - if rx == 0 and qid != None and nic_delay != None: + if (len(pkt['retransmits']) == 0 and qid != None and + nic_delay != None): delays[qid]['nic'].append(nic_delay) - if max_gro != None: - delays[qid]['gro'].append(max_gro - nic) - gro_string = '%.1f' % (max_gro - nic) + if gro != None: + delays[qid]['gro'].append(gro - nic) if free != None: delays[qid]['free'].append(free - nic) @@ -9377,22 +9512,21 @@ def output(self): qid_slow_bytes[qid] += length qid_total_bytes[qid] += length - line = '%-10s %10.3f %10s %10d %6d %6d' % ( - 
pkt['rx_node'] if 'rx_node' in pkt else "", - xmit, qdisc_string, pkt['id'], pkt['offset'], - pkt['tso_length']) - nic_delay_string = '' - if (nic_delay != None): - nic_delay_string = '%.1f' % (nic_delay) - line += ' %3s %10s %7s %10s %7s' % (qid_string, - print_if(nic, '%.3f'), nic_delay_string, - print_if(max_gro, '%.3f'), gro_string) - free_delay_string = '' - if (nic != None) and (free != None): - free_delay_string = '%.1f' % (free - nic) - line += ' %10s %7s %2s' % (print_if(free, '%.3f'), - free_delay_string, rx_msg) - f.write(line.rstrip() + '\n') + # Create a data file for this node with packets in time order + # (or whatever order was requested on the command line). + pkts = sorted(pkts, key = lambda pkt : pkt['xmit']) + if sort_key == 'gro': + pkts = sorted(pkts, key = lambda pkt : get_max_gro(pkt)) + elif sort_key != 'xmit': + pkts = sorted(pkts, key = lambda pkt : + pkt[sort_key] if sort_key in pkt else 1e20) + + f = open('%s/txpkts_%s.dat' % (options.data, node), 'w') + f.write('# Node: %s\n' % (node)) + f.write('# Generated at %s.\n' % + (time.strftime('%I:%M %p on %m/%d/%Y'))) + f.write('# Data packets transmitted from %s:\n' % (node)) + f.write(print_pkts(pkts)) f.close() def print_type(delays): @@ -9610,6 +9744,9 @@ def output(self): parser.add_option('--max', dest='max', type=float, default=None, metavar='T', help='Upper bound to consider for some parameter; ' 'specific meaning depends on analyzer') +parser.add_option('--max-rtt', dest='max_rtt', type=float, default=None, + metavar='T', help='Only consider RPCs with RTTs <= T usecs. Used by ' + 'rpc analyzer to select which specific RTTs to print out.') parser.add_option('--min', dest='min', type=float, default=None, metavar='T', help='Lower bound to consider for some parameter; ' 'specific meaning depends on analyzer') @@ -9619,9 +9756,6 @@ def output(self): parser.add_option('--node', dest='node', default=None, metavar='N', help='Specifies a particular node (the name of its ' 'trace file without the extension); required by some analyzers') -parser.add_option('--max-rtt', dest='max_rtt', type=float, default=None, - metavar='T', help='Only consider RPCs with RTTs <= T usecs. Used by ' - 'rpc analyzer to select which specific RTTs to print out.') parser.add_option('--pkt', dest='pkt', default=None, metavar='ID:OFF', help='Identifies a specific packet with ID:OFF, ' 'where ID is the RPC id on the sender (even means request message, ' From dcffc598ede8d59cc378ded352301983d003ac20 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 13 Nov 2025 15:10:52 -0800 Subject: [PATCH 557/625] Update config for Intel NICs (increase tx ring size) --- cloudlab/bin/config | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/cloudlab/bin/config b/cloudlab/bin/config index ba8999f0..682a396c 100755 --- a/cloudlab/bin/config +++ b/cloudlab/bin/config @@ -150,7 +150,7 @@ def get_interfaces(): available += current if (('s1f1' in current) or ('s1f0' in current) or ('s0f0' in current) or ('s0f1' in current) - or (current == 'eno1')): + or (current == 'eno1')) and not interface: interface = current continue if re.match(r'^[ ]+ inet 10\.0\.1\.', line): @@ -453,6 +453,17 @@ def set_sysctl(name, value): exec_cmd(["sudo", "sysctl", ".net.homa.%s=%s" % (name, value)], check=True) +def config_ecn_threshold(kb): + """ + Modify the configuration of this experiment's egress ports at the + top-of-rack switch to enable optimal Homa performance. 
+ """ + s = Switch(True) + for port in get_exp_ports(): + print("Configuring ECN threshold for port %d" % (port)) + s.set_ecn_threshold(port, kb) + s.close() + def config_homa(mod): """ Install the Homa kernel driver and configure it appropriately for @@ -501,17 +512,6 @@ def config_homa(mod): print("Enabling TCP hijacking") set_sysctl("hijack_tcp", 1) -def config_ecn_threshold(kb): - """ - Modify the configuration of this experiment's egress ports at the - top-of-rack switch to enable optimal Homa performance. - """ - s = Switch(True) - for port in get_exp_ports(): - print("Configuring ECN threshold for port %d" % (port)) - s.set_ecn_threshold(port, kb) - s.close() - def config_ipv6(num_hosts, vlan): """ Configure this node to enable IPv6. @@ -620,11 +620,20 @@ def config_nic(): # Use a separate ethtool command for each paramemeter. Otherwise, # if one parameter isn't supported the command will be aborted, # so no parameters will get set. + print("Configuring NIC to reduce interrupt latency") exec_cmd(["sudo", "ethtool", "-C", interface, "adaptive-rx", "off"], check=False) exec_cmd(["sudo", "ethtool", "-C", interface, "rx-usecs", "0"], check=False) exec_cmd(["sudo", "ethtool", "-C", interface, "rx-frames", "1"], check=False) + if get_nic_type() == "ice": + print("Increasing tx ring size for Intel NIC") + exec_cmd(["sudo", "ethtool", "-G", interface, "tx", "1024"], + check=True) + print("Disabling adaptive-tx for Intel NIC to recover tx buffers faster") + exec_cmd(["sudo", "ethtool", "-C", interface, "adaptive-tx", "off", + "tx-usecs", "10"], check=True) + def config_power(): """ Configure the machine's power management for best Homa performance. @@ -651,6 +660,8 @@ def config_qdisc(): nic = get_interfaces()[0] config = get_qdisc_config() root = config['root_handle'] + + print('Installing Homa qdisc') if root == '0:': # Must reset the root qdisc (it isn't possible to modify the # default one) @@ -678,6 +689,7 @@ def config_reset_qdisc(): config = get_qdisc_config() root = config['root_handle'] + print('Removing Homa qdisc, restoring fq_codel') for child in config['children']: if child['type'] != 'homa': continue From 7b98ba140ad7832cf6cc5f8ef5ef4e85860098e9 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 13 Nov 2025 15:11:43 -0800 Subject: [PATCH 558/625] Force IPv4 in on_nodes scripts (Otherwise can run very slowly on CloudLab nodes) --- cloudlab/bin/on_nodes | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudlab/bin/on_nodes b/cloudlab/bin/on_nodes index bc3ae67c..a13763d9 100755 --- a/cloudlab/bin/on_nodes +++ b/cloudlab/bin/on_nodes @@ -25,5 +25,5 @@ for ((i = $first ; i <= $last; i++)); do node=node$i echo "" echo $node: - ssh $node $@ + ssh -4 $node $@ done \ No newline at end of file From e31f0b5834a83debf63e57c61c88d8931a83502d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 13 Nov 2025 15:16:19 -0800 Subject: [PATCH 559/625] Initialize timetracing earlier in homa_plumbing.c --- homa_plumbing.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/homa_plumbing.c b/homa_plumbing.c index 2905631e..e8a4a020 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -544,6 +544,10 @@ int __init homa_load(void) MAX_NUMNODES); #endif /* See strip.py */ +#ifndef __UPSTREAM__ /* See strip.py */ + tt_init("timetrace"); +#endif /* See strip.py */ + status = homa_init(homa); if (status) goto error; @@ -641,7 +645,6 @@ int __init homa_load(void) homa_gro_hook_tcp(); #endif /* See strip.py */ #ifndef __UPSTREAM__ /* See strip.py */ - 
tt_init("timetrace"); tt_set_temp(homa->temp); #endif /* See strip.py */ From 8342219ec471556d153107bbf7cc5888f0756c56 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 13 Nov 2025 15:22:08 -0800 Subject: [PATCH 560/625] Cleanup a few comments in homa_skb.c --- homa_skb.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/homa_skb.c b/homa_skb.c index 7df69482..58f10807 100644 --- a/homa_skb.c +++ b/homa_skb.c @@ -112,7 +112,7 @@ void homa_skb_cleanup(struct homa *homa) * function will allocate additional space for IP and * Ethernet headers, as well as for the homa_skb_info. * Return: New sk_buff, or NULL if there was insufficient memory. - * The sk_buff will be configured with so that the next + * The sk_buff will be configured so that the next * skb_put will be for the transport (Homa) header. The * homa_skb_info is not initialized. */ @@ -121,9 +121,6 @@ struct sk_buff *homa_skb_alloc_tx(int length) u64 start = homa_clock(); struct sk_buff *skb; - /* Note: allocate space for an IPv6 header, which is larger than - * an IPv4 header. - */ skb = alloc_skb(HOMA_SKB_EXTRA + sizeof(struct homa_skb_info) + length, GFP_ATOMIC); if (likely(skb)) { From 5bcf4a6c001dc29eb31673a6fe8dfbb276441b83 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 14 Nov 2025 16:20:47 -0800 Subject: [PATCH 561/625] Upgrade to Linux 6.17.8 Only unit tests were affected. --- README.md | 3 ++- test/mock.c | 18 ++++++++++------ test/mock.h | 5 ++++- test/unit_homa_offload.c | 46 ++++++++++++++++++++-------------------- test/unit_homa_pool.c | 4 ++-- 5 files changed, 43 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 5c03c4cc..24186b9b 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ This repo contains an implementation of the Homa transport protocol as a Linux k - Please contact me if you have any problems using this repo; I'm happy to provide advice and support. -- The head is known to work under Linux 6.13.9. In the past, Homa has +- The head is known to work under Linux 6.17.8. In the past, Homa has run under several earlier versions of Linux. There is a separate branch for each of these older versions, with names such as linux_4.15.18. Older branches are @@ -123,6 +123,7 @@ This repo contains an implementation of the Homa transport protocol as a Linux k sysctl mechanism. For details, see the man page `homa.7`. ## Significant changes +- November 2025: upgraded to Linux 6.17.8. - October 2025: added the HOMAIOCINFO ioctl for retrieving status information about a Homa socket. See man/homa.7 for details. 
- May 2025: `homa_api.c` has been removed, so the functions `homa_abort`, diff --git a/test/mock.c b/test/mock.c index 911ccc65..187744f9 100644 --- a/test/mock.c +++ b/test/mock.c @@ -289,7 +289,7 @@ unsigned long vmemmap_base; kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES]; #endif int __preempt_count; -struct pcpu_hot pcpu_hot = {.cpu_number = 1, .current_task = &mock_task}; +int cpu_number = 1; char sock_flow_table[RPS_SOCK_FLOW_TABLE_SIZE(1024)]; struct net_hotdata net_hotdata = { .rps_cpu_mask = 0x1f, @@ -301,6 +301,9 @@ struct static_call_key __SCK__might_resched; struct static_call_key __SCK__preempt_schedule; struct paravirt_patch_template pv_ops; struct workqueue_struct *system_wq; +struct static_key_true validate_usercopy_range; +unsigned long __per_cpu_offset[NR_CPUS]; +struct tracepoint __tracepoint_sched_set_state_tp; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map rcu_lock_map; @@ -1060,7 +1063,7 @@ void lock_sock_nested(struct sock *sk, int subclass) sk->sk_lock.owned = 1; } -ssize_t __modver_version_show(struct module_attribute *a, +ssize_t __modver_version_show(const struct module_attribute *a, struct module_kobject *b, char *c) { return 0; @@ -1579,6 +1582,9 @@ void tasklet_init(struct tasklet_struct *t, void tasklet_kill(struct tasklet_struct *t) {} +void __trace_set_current_state(int state_value) +{} + void unregister_net_sysctl_table(struct ctl_table_header *header) { UNIT_LOG("; ", "unregister_net_sysctl_table"); @@ -1943,7 +1949,7 @@ void mock_preempt_enable() int mock_processor_id() { - return pcpu_hot.cpu_number; + return cpu_number; } void mock_put_page(struct page *page) @@ -2155,7 +2161,7 @@ void mock_set_clock_vals(u64 t, ...) */ void mock_set_core(int num) { - pcpu_hot.cpu_number = num; + cpu_number = num; } /** @@ -2372,8 +2378,8 @@ void mock_teardown(void) { int count, i; - pcpu_hot.cpu_number = 1; - pcpu_hot.current_task = &mock_task; + cpu_number = 1; + current_task = &mock_task; mock_alloc_page_errors = 0; mock_alloc_skb_errors = 0; mock_cmpxchg_errors = 0; diff --git a/test/mock.h b/test/mock.h index 8e7c4319..670d8d1f 100644 --- a/test/mock.h +++ b/test/mock.h @@ -101,7 +101,10 @@ #define spin_unlock mock_spin_unlock #undef this_cpu_ptr -#define this_cpu_ptr(name) (&name[pcpu_hot.cpu_number]) +#define this_cpu_ptr(name) (&name[cpu_number]) + +#undef __this_cpu_read +#define __this_cpu_read(name) (name) #undef vmalloc #define vmalloc mock_vmalloc diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index ffe8ddd6..c64dd7a0 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -59,10 +59,10 @@ FIXTURE_SETUP(homa_offload) self->header.incoming = htonl(10000); self->header.seg.offset = htonl(2000); for (i = 0; i < GRO_HASH_BUCKETS; i++) { - INIT_LIST_HEAD(&self->napi.gro_hash[i].list); - self->napi.gro_hash[i].count = 0; + INIT_LIST_HEAD(&self->napi.gro.hash[i].list); + self->napi.gro.hash[i].count = 0; } - self->napi.gro_bitmask = 0; + self->napi.gro.bitmask = 0; self->skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 2000); NAPI_GRO_CB(self->skb)->same_flow = 0; @@ -75,10 +75,10 @@ FIXTURE_SETUP(homa_offload) NAPI_GRO_CB(self->skb2)->same_flow = 0; NAPI_GRO_CB(self->skb2)->last = self->skb2; NAPI_GRO_CB(self->skb2)->count = 1; - self->napi.gro_bitmask = 6; - self->napi.gro_hash[2].count = 2; - list_add_tail(&self->skb->list, &self->napi.gro_hash[2].list); - list_add_tail(&self->skb2->list, &self->napi.gro_hash[2].list); + self->napi.gro.bitmask = 6; + self->napi.gro.hash[2].count = 2; + 
list_add_tail(&self->skb->list, &self->napi.gro.hash[2].list); + list_add_tail(&self->skb2->list, &self->napi.gro.hash[2].list); INIT_LIST_HEAD(&self->empty_list); self->tcp_offloads.callbacks.gro_receive = test_tcp_gro_receive; inet_offloads[IPPROTO_TCP] = &self->tcp_offloads; @@ -98,7 +98,7 @@ FIXTURE_TEARDOWN(homa_offload) struct sk_buff *skb, *tmp; homa_offload_end(); - list_for_each_entry_safe(skb, tmp, &self->napi.gro_hash[2].list, list) + list_for_each_entry_safe(skb, tmp, &self->napi.gro.hash[2].list, list) kfree_skb(skb); homa_destroy(&self->homa); unit_teardown(); @@ -375,7 +375,7 @@ TEST_F(homa_offload, homa_gro_receive__no_held_skb) NAPI_GRO_CB(skb)->same_flow = 0; cur_offload_core->held_skb = NULL; cur_offload_core->held_bucket = 2; - EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro_hash[2].list, skb)); + EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro.hash[2].list, skb)); same_flow = NAPI_GRO_CB(skb)->same_flow; EXPECT_EQ(0, same_flow); EXPECT_EQ(skb, cur_offload_core->held_skb); @@ -393,7 +393,7 @@ TEST_F(homa_offload, homa_gro_receive__empty_merge_list) NAPI_GRO_CB(skb)->same_flow = 0; cur_offload_core->held_skb = self->skb; cur_offload_core->held_bucket = 3; - EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro_hash[2].list, skb)); + EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro.hash[2].list, skb)); same_flow = NAPI_GRO_CB(skb)->same_flow; EXPECT_EQ(0, same_flow); EXPECT_EQ(skb, cur_offload_core->held_skb); @@ -411,7 +411,7 @@ TEST_F(homa_offload, homa_gro_receive__held_skb_not_in_merge_list) NAPI_GRO_CB(skb)->same_flow = 0; cur_offload_core->held_skb = skb; cur_offload_core->held_bucket = 2; - EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro_hash[3].list, skb)); + EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro.hash[3].list, skb)); same_flow = NAPI_GRO_CB(skb)->same_flow; EXPECT_EQ(0, same_flow); EXPECT_EQ(skb, cur_offload_core->held_skb); @@ -433,7 +433,7 @@ TEST_F(homa_offload, homa_gro_receive__held_skb__in_merge_list_but_wrong_proto) else ip_hdr(self->skb)->protocol = IPPROTO_TCP; cur_offload_core->held_bucket = 2; - EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro_hash[3].list, skb)); + EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro.hash[3].list, skb)); same_flow = NAPI_GRO_CB(skb)->same_flow; EXPECT_EQ(0, same_flow); EXPECT_EQ(skb, cur_offload_core->held_skb); @@ -452,7 +452,7 @@ TEST_F(homa_offload, homa_gro_receive__merge) self->header.common.sender_id = cpu_to_be64(1002); skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); NAPI_GRO_CB(skb)->same_flow = 0; - EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro_hash[3].list, skb)); + EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro.hash[3].list, skb)); same_flow = NAPI_GRO_CB(skb)->same_flow; EXPECT_EQ(1, same_flow); EXPECT_EQ(2, NAPI_GRO_CB(self->skb2)->count); @@ -461,7 +461,7 @@ TEST_F(homa_offload, homa_gro_receive__merge) self->header.common.sender_id = cpu_to_be64(1004); skb2 = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); NAPI_GRO_CB(skb2)->same_flow = 0; - EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro_hash[3].list, skb2)); + EXPECT_EQ(NULL, homa_gro_receive(&self->napi.gro.hash[3].list, skb2)); same_flow = NAPI_GRO_CB(skb)->same_flow; EXPECT_EQ(1, same_flow); EXPECT_EQ(3, NAPI_GRO_CB(self->skb2)->count); @@ -481,23 +481,23 @@ TEST_F(homa_offload, homa_gro_receive__max_gro_skbs) cur_offload_core->held_bucket = 2; self->header.seg.offset = htonl(6000); skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); - homa_gro_receive(&self->napi.gro_hash[3].list, skb); + 
homa_gro_receive(&self->napi.gro.hash[3].list, skb); EXPECT_EQ(2, NAPI_GRO_CB(self->skb2)->count); - EXPECT_EQ(2, self->napi.gro_hash[2].count); + EXPECT_EQ(2, self->napi.gro.hash[2].count); // Second packet hits the limit. self->header.common.sport = htons(40001); skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); unit_log_clear(); EXPECT_EQ(EINPROGRESS, -PTR_ERR(homa_gro_receive( - &self->napi.gro_hash[3].list, skb))); + &self->napi.gro.hash[3].list, skb))); EXPECT_EQ(3, NAPI_GRO_CB(self->skb2)->count); - EXPECT_EQ(1, self->napi.gro_hash[2].count); + EXPECT_EQ(1, self->napi.gro.hash[2].count); EXPECT_STREQ("netif_receive_skb, id 1002, offset 4000", unit_log_get()); kfree_skb(self->skb2); - EXPECT_EQ(1, self->napi.gro_hash[2].count); - EXPECT_EQ(6, self->napi.gro_bitmask); + EXPECT_EQ(1, self->napi.gro.hash[2].count); + EXPECT_EQ(6, self->napi.gro.bitmask); // Third packet also hits the limit for skb, causing the bucket // to become empty. @@ -506,10 +506,10 @@ TEST_F(homa_offload, homa_gro_receive__max_gro_skbs) skb = mock_skb_alloc(&self->ip, &self->header.common, 1400, 0); unit_log_clear(); EXPECT_EQ(EINPROGRESS, -PTR_ERR(homa_gro_receive( - &self->napi.gro_hash[3].list, skb))); + &self->napi.gro.hash[3].list, skb))); EXPECT_EQ(2, NAPI_GRO_CB(self->skb)->count); - EXPECT_EQ(0, self->napi.gro_hash[2].count); - EXPECT_EQ(2, self->napi.gro_bitmask); + EXPECT_EQ(0, self->napi.gro.hash[2].count); + EXPECT_EQ(2, self->napi.gro.bitmask); EXPECT_STREQ("netif_receive_skb, id 1000, offset 2000", unit_log_get()); kfree_skb(self->skb); diff --git a/test/unit_homa_pool.c b/test/unit_homa_pool.c index cf59cac4..4bcfb249 100644 --- a/test/unit_homa_pool.c +++ b/test/unit_homa_pool.c @@ -770,13 +770,13 @@ TEST_F(homa_pool, homa_pool_avail_bytes__include_free_space_in_core_private_page { struct homa_pool *pool = self->hsk.buffer_pool; - pcpu_hot.cpu_number = 3; + mock_set_core(3); EXPECT_EQ(100 * HOMA_BPAGE_SIZE, homa_pool_avail_bytes(pool)); unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, 4000, 98, 1000, 2000); EXPECT_EQ(100 * HOMA_BPAGE_SIZE - 2000, homa_pool_avail_bytes(pool)); - pcpu_hot.cpu_number = 5; + mock_set_core(5); unit_client_rpc(&self->hsk, UNIT_RCVD_ONE_PKT, &self->client_ip, &self->server_ip, 4000, 98, 1000, 50000); EXPECT_EQ(100 * HOMA_BPAGE_SIZE - 52000, homa_pool_avail_bytes(pool)); From a8b4026b2636b6103bfc046a25d606e391c9a737 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 14 Nov 2025 16:21:38 -0800 Subject: [PATCH 562/625] Fix bug in setting skb_shinfo(skb)->gso_type in tx skbs Previously it was always set to SKB_GSO_TCPV6, but recent changes to Linux broke this (GSO packets got dropped under IPv4): it must be set to SKB_GSO_TCPV4 or SKB_GSO_TCPV6 depending on whether the socket uses IPv4 or IPv6. 
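For readers skimming the patch, the core of the fix is a family-dependent flag
choice. A minimal sketch follows (the helper name is hypothetical; the actual
change below inlines this logic in homa_tx_data_pkt_alloc and also preserves
the 0xd value used when gso_force_software is set):

    /* Pick the GSO type to match the socket's address family; using
     * SKB_GSO_TCPV6 for IPv4 traffic causes GSO packets to be dropped
     * on recent kernels.
     */
    static inline unsigned int homa_gso_type(struct sock *sk)
    {
            return sk->sk_family == AF_INET6 ? SKB_GSO_TCPV6
                                             : SKB_GSO_TCPV4;
    }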
--- homa_outgoing.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/homa_outgoing.c b/homa_outgoing.c index a83ef7b2..72dd58f3 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -130,6 +130,7 @@ struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc, int length, int max_seg_data) __must_hold(rpc->bucket->lock) { + struct homa_sock *hsk = rpc->hsk; struct homa_skb_info *homa_info; struct homa_data_hdr *h; struct sk_buff *skb; @@ -147,7 +148,7 @@ struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc, (segs - 1) * sizeof(struct homa_seg_hdr)); #endif /* See strip.py */ if (!skb) { - rpc->hsk->error_msg = "couldn't allocate sk_buff for outgoing message"; + hsk->error_msg = "couldn't allocate sk_buff for outgoing message"; return ERR_PTR(-ENOMEM); } @@ -155,7 +156,7 @@ struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc, * network packet by GSO). */ h = (struct homa_data_hdr *)skb_put(skb, sizeof(struct homa_data_hdr)); - h->common.sport = htons(rpc->hsk->port); + h->common.sport = htons(hsk->port); h->common.dport = htons(rpc->dport); h->common.sequence = htonl(offset); h->common.type = DATA; @@ -178,14 +179,14 @@ struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc, homa_info = homa_get_skb_info(skb); homa_info->next_skb = NULL; homa_info->wire_bytes = length + segs * (sizeof(struct homa_data_hdr) - + rpc->hsk->ip_header_length + HOMA_ETH_OVERHEAD); + + hsk->ip_header_length + HOMA_ETH_OVERHEAD); homa_info->data_bytes = length; homa_info->seg_length = max_seg_data; homa_info->offset = offset; homa_info->rpc = rpc; #ifndef __STRIP__ /* See strip.py */ - if (segs > 1 && rpc->hsk->sock.sk_protocol != IPPROTO_TCP) { + if (segs > 1 && hsk->sock.sk_protocol != IPPROTO_TCP) { #else /* See strip.py */ if (segs > 1) { #endif /* See strip.py */ @@ -198,11 +199,10 @@ struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc, err = homa_fill_data_interleaved(rpc, skb, iter); } else { gso_size = max_seg_data; - err = homa_skb_append_from_iter(rpc->hsk->homa, skb, iter, - length); + err = homa_skb_append_from_iter(hsk->homa, skb, iter, length); } if (err) { - rpc->hsk->error_msg = "couldn't copy message body into packet buffers"; + hsk->error_msg = "couldn't copy message body into packet buffers"; goto error; } @@ -214,12 +214,14 @@ struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc, * GSO; the value below seems to work... */ skb_shinfo(skb)->gso_type = - rpc->hsk->homa->gso_force_software ? 0xd : SKB_GSO_TCPV6; + hsk->homa->gso_force_software ? 0xd : + (hsk->inet.sk.sk_family == AF_INET6) ? 
SKB_GSO_TCPV6 : + SKB_GSO_TCPV4; } return skb; error: - homa_skb_free_tx(rpc->hsk->homa, skb); + homa_skb_free_tx(hsk->homa, skb); return ERR_PTR(err); } From 006eb4d28eb82bb7abdd8a05813cc5f2c6ec7bda Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 14 Nov 2025 16:24:30 -0800 Subject: [PATCH 563/625] Minor changes to tthoma.py --- util/tthoma.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index 38ef0519..eab20de8 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -915,15 +915,14 @@ def print_pkts(pkts, header=True): buf.write('# Qid: Transmit queue on which packet was sent\n') buf.write('# Nic: Time when packet was queued for NIC\n') buf.write('# NDelay: Nic - (later of Xmit and Qdisc)\n') - buf.write('# MaxGro: Time when last fragment of packet was ' - 'received by GRO\n') - buf.write('# GDelay: MaxGro - Nic\n') + buf.write('# Gro: Time when packet was received by GRO\n') + buf.write('# GDelay: Gro - Nic\n') buf.write('# Free: Time when sk_buff was released on sender\n') buf.write('# FDelay: Free - Nic\n') buf.write('# Rx: Number of times segments in the packet were ' 'retransmitted\n\n') buf.write('# Dest Xmit Qdisc RpcId Offset Length') - buf.write(' Qid Nic NDelay MaxGro GDelay') + buf.write(' Qid Nic NDelay Gro GDelay') buf.write(' Free FDelay Rx\n') for pkt in pkts: xmit = pkt['xmit'] @@ -945,7 +944,7 @@ def print_pkts(pkts, header=True): gro = pkt['gro'] if 'gro' in pkt else None free = pkt['free_tx_skb'] if 'free_tx_skb' in pkt else None qid = pkt['tx_qid'] if 'tx_qid' in pkt else None - length = pkt['tso_length'] + length = pkt['tso_length'] if 'tso_length' in pkt else pkt['length'] rx = len(pkt['retransmits']) if 'segments' in pkt: @@ -955,8 +954,7 @@ def print_pkts(pkts, header=True): line = '%-10s %10.3f %10s %10d %6d %6d' % ( pkt['rx_node'] if 'rx_node' in pkt else "", - xmit, qdisc_string, pkt['id'], pkt['offset'], - pkt['tso_length']) + xmit, qdisc_string, pkt['id'], pkt['offset'], length) nic_delay_string = '' if nic_delay != None: nic_delay_string = '%.1f' % (nic_delay) @@ -8836,6 +8834,22 @@ def __init__(self, dispatcher): dispatcher.interest('AnalyzeRpcs') dispatcher.interest('AnalyzePackets') + def output(self): + self.output_slow_pkts() + + def output_slow_pkts(self): + pkts = [] + for pkt in packets.values(): + if not 'nic' in pkt or not 'gro' in pkt: + continue + if pkt['length'] > 1000 or pkt['offset'] !=0: + continue + delay = pkt['gro'] - pkt['nic'] + if delay >= 150 and delay <= 300: + pkts.append(pkt) + print("# Packets with nic->gro delays between 150 and 300 usecs:") + print(print_pkts(pkts), end='') + def output_delays(self): global packets, options, rpcs @@ -8865,7 +8879,7 @@ def output_delays(self): delays[90*len(delays)//100], delays[99*len(delays)//100], delays[-1])) - def output(self): + def output_slow_rpcs(self): global packets, rpcs matches = [] From 9185174c175e812a31cca711c602c6d8d747bd82 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 17 Nov 2025 09:55:07 -0800 Subject: [PATCH 564/625] Various cleanups for cperf.py and cp_vs_tcp.py * Remove --old-slowdown option * Remove code related to unloaded experiments * Set baseline RTT for slowdown from node type (was hardwired at 15 usecs previously) --- util/cp_vs_tcp | 23 -------- util/cperf.py | 152 ++++++++++++++++++------------------------------- 2 files changed, 56 insertions(+), 119 deletions(-) diff --git a/util/cp_vs_tcp b/util/cp_vs_tcp index fd3a72a6..c785b78f 100755 --- a/util/cp_vs_tcp +++ 
b/util/cp_vs_tcp @@ -25,9 +25,6 @@ parser.add_argument('--servers', dest='num_servers', type=int, metavar='count', "and this value indicates the number of server nodes; all other " "nodes will be clients. If 0, each node runs both a client and a " "server (default: 0)") -parser.add_argument('--skip-unloaded', dest='skip_unloaded', type=boolean, - default=True, help="Boolean value:: true means don't measure" - "Homa latency under low load (default: true)") options = parser.parse_args() init(options) @@ -57,23 +54,12 @@ if not options.plot_only: options.workload = workload options.gbps = bw * bw_multiplier options.seconds = seconds - unloaded_exp = "unloaded_" + workload homa_exp = "homa_" + workload tcp_exp = "tcp_" + workload dctcp_exp = "dctcp_" + workload try: options.protocol = "homa" - if not options.skip_unloaded: - start_servers(unloaded_exp, options.nodes[1:2], options) - o = copy.deepcopy(options) - o.gbps = 0.0 - o.client_ports = 1 - o.client_max = 1 - o.server_ports = 1 - o.unloaded = 500 - run_experiment(unloaded_exp, options.nodes[0:1], o) - start_servers(homa_exp, options.servers, options) run_experiment(homa_exp, options.clients, options) @@ -103,15 +89,11 @@ if not options.plot_only: # Generate plots and reports for workload, bw, seconds in load_info: - unloaded_exp = "unloaded_" + workload homa_exp = "homa_" + workload tcp_exp = "tcp_" + workload dctcp_exp = "dctcp_" + workload scan_metrics(homa_exp) - if not options.skip_unloaded: - set_unloaded(unloaded_exp) - # Generate slowdown plot. log("Generating slowdown plot for %s" % (workload)) title = "%s %d nodes, %.1f Gbps" % (workload.capitalize(), @@ -149,8 +131,6 @@ for workload, bw, seconds in load_info: # Generate CDF of small-message RTTs. log("Generating short message CDF for %s" % (workload)) - if not options.skip_unloaded: - unloaded_x, unloaded_y = get_short_cdf(unloaded_exp) homa_x, homa_y = get_short_cdf(homa_exp) if options.tcp: tcp_x, tcp_y = get_short_cdf(tcp_exp) @@ -163,8 +143,5 @@ for workload, bw, seconds in load_info: if options.dctcp: plt.plot(dctcp_x, dctcp_y, label="DCTCP", color=dctcp_color) plt.plot(homa_x, homa_y, label="Homa", color=homa_color) - if not options.skip_unloaded: - plt.plot(unloaded_x, unloaded_y, label="Homa best case", - color=unloaded_color) plt.legend(loc="upper right", prop={'size': 9}) plt.savefig("%s/reports/short_cdf_%s.pdf" % (options.log_dir, workload)) diff --git a/util/cperf.py b/util/cperf.py index 03922b45..7c249890 100644 --- a/util/cperf.py +++ b/util/cperf.py @@ -51,11 +51,6 @@ # Open file (in the log directory) where log messages should be written. log_file = 0 -# True means use new slowdown calculation, where denominator is calculated -# using best-case Homa unloaded RTT plus link bandwidth; False means use -# original calculation where the denominator is Homa P50 unloaded latency. -old_slowdown = False - # Indicates whether we should generate additional log messages for debugging verbose = False @@ -71,6 +66,11 @@ # Speed of host uplinks. link_mbps = None +# "Best possible RTT for short messages", depending on CloudLab node type. +# Used to compute slowdowns. +baseline_rtts = {"xl170": 15, "c6620": 25, "c6525-25g": 25, "c6525-100g": 25, + "default": 25} + # Defaults for command-line options; assumes that servers and clients # share nodes. 
default_defaults = { @@ -91,7 +91,6 @@ 'tcp_port_receivers': 1, 'tcp_server_ports': 8, 'tcp_port_threads': 1, - 'unloaded': 0, 'unsched': 0, 'unsched_boost': 0.0, 'workload': '' @@ -116,13 +115,6 @@ # avg_slowdown: Average slowdown across all messages of all sizes digests = {} -# A dictionary where keys are message lengths, and each value is the median -# unloaded RTT (usecs) for messages of that length. -unloaded_p50 = {} - -# Minimum RTT for any measurement in the unloaded dataset -min_rtt = 1e20; - # Keys are filenames, and each value is a dictionary containing data read # from that file. Within that dictionary, each key is the name of a column # within the file, and the value is a list of numbers read from the given @@ -247,11 +239,6 @@ def get_parser(description, usage, defaults = {}): parser.add_argument('--no-homa-prio', dest='no_homa_prio', action='store_true', default=False, help='Don\'t run homa_prio on nodes to adjust unscheduled cutoffs') - parser.add_argument('--old-slowdown', dest='old_slowdown', - action='store_true', default=False, - help='Compute slowdowns using the approach of the Homa ATC ' - 'paper (default: use 15 usec RTT and 100%% link throughput as ' - 'reference)') parser.add_argument('--plot-only', dest='plot_only', action='store_true', help='Don\'t run experiments; generate plot(s) with existing data') parser.add_argument('--port-receivers', type=int, dest='port_receivers', @@ -336,10 +323,9 @@ def init(options): """ Initialize various global state, such as the log file. """ - global old_slowdown, log_dir, log_file, verbose, delete_rtts, link_mbps + global log_dir, log_file, verbose, delete_rtts, link_mbps global stripped log_dir = options.log_dir - old_slowdown = options.old_slowdown if not options.plot_only: if os.path.exists(log_dir): shutil.rmtree(log_dir) @@ -599,6 +585,18 @@ def set_sysctl_parameter(name, value, nodes): do_subprocess(["ssh", "node%d" % id, "sudo", "sysctl", "%s=%s" % (name, value)]) +def get_baseline_rtt(): + """ + Return the "best possible" RTT for short messages, for use in computing + slowdowns. + """ + global baseline_rtts + + node_type = get_node_type() + if node_type in baseline_rtts: + return baseline_rtts[node_type] + return baseline_rtts["default"] + def get_node_type(): """ Returns the node type for this machine. 
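To make the new slowdown baseline concrete, here is a hedged standalone
example of how these pieces combine (hypothetical code; the real computation
is in get_digest below, which uses the same formula):

    # Slowdown = measured RTT divided by the best-case time: the node
    # type's baseline RTT plus the time to serialize the message bytes
    # on the uplink (link_mbps is in Mbits/sec, as elsewhere in cperf.py,
    # so bits/Mbps yields microseconds).
    def slowdown(rtt_usecs, length, link_mbps, baseline_rtt):
        optimal = baseline_rtt + length * 8 / link_mbps
        return rtt_usecs / optimal

    # Example: a 10000-byte message measured at 120 usecs on a 25000 Mbps
    # link with a 15 usec baseline gives optimal = 15 + 3.2 = 18.2 usecs,
    # so the slowdown is about 6.6.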
@@ -702,8 +700,6 @@ def run_experiment(name, clients, options): id, name, options.ipv6) - if "unloaded" in options: - command += " --unloaded %d" % (options.unloaded) else: if "no_trunc" in options: trunc = '--no-trunc' @@ -735,41 +731,40 @@ def run_experiment(name, clients, options): nodes.append(id) vlog("Command for node%d: %s" % (id, command)) wait_output("% ", nodes, command, 40.0) - if not "unloaded" in options: - if options.protocol == "homa": - # Wait a bit so that homa_prio can set priorities appropriately - time.sleep(2) - if stripped: - vlog("Skipping initial read of metrics (Homa is stripped)") - else: - vlog("Recording initial metrics") - for id in exp_nodes: - do_subprocess(["ssh", "node%d" % (id), "metrics.py"]) - if not "no_rtt_files" in options: - do_cmd("dump_times /dev/null %s" % (name), clients) - if options.protocol == "homa" and options.tt_freeze: - log("Unfreezing timetraces on %s" % (nodes)) - set_sysctl_parameter(".net.homa.action", "10", nodes) - do_cmd("log Starting measurements for %s experiment" % (name), - server_nodes, clients) - log("Starting measurements") - debug_delay = 0 - if debug_delay > 0: - time.sleep(debug_delay) - if False and "dctcp" in name: - log("Setting debug info") - do_cmd("debug 2000 3000", clients) - log("Finished setting debug info") - time.sleep(options.seconds - debug_delay) - if options.protocol == "homa" and options.tt_freeze: - log("Freezing timetraces via node%d" % nodes[0]) - set_sysctl_parameter(".net.homa.action", "7", nodes[0:1]) - do_cmd("log Ending measurements for %s experiment" % (name), - server_nodes, clients) + if options.protocol == "homa": + # Wait a bit so that homa_prio can set priorities appropriately + time.sleep(2) + if stripped: + vlog("Skipping initial read of metrics (Homa is stripped)") + else: + vlog("Recording initial metrics") + for id in exp_nodes: + do_subprocess(["ssh", "node%d" % (id), "metrics.py"]) + if not "no_rtt_files" in options: + do_cmd("dump_times /dev/null %s" % (name), clients) + if options.protocol == "homa" and options.tt_freeze: + log("Unfreezing timetraces on %s" % (nodes)) + set_sysctl_parameter(".net.homa.action", "10", nodes) + do_cmd("log Starting measurements for %s experiment" % (name), + server_nodes, clients) + log("Starting measurements") + debug_delay = 0 + if debug_delay > 0: + time.sleep(debug_delay) + if False and "dctcp" in name: + log("Setting debug info") + do_cmd("debug 2000 3000", clients) + log("Finished setting debug info") + time.sleep(options.seconds - debug_delay) + if options.protocol == "homa" and options.tt_freeze: + log("Freezing timetraces via node%d" % nodes[0]) + set_sysctl_parameter(".net.homa.action", "7", nodes[0:1]) + do_cmd("log Ending measurements for %s experiment" % (name), + server_nodes, clients) log("Retrieving data for %s experiment" % (name)) if not "no_rtt_files" in options: do_cmd("dump_times rtts %s" % (name), clients) - if (options.protocol == "homa") and not "unloaded" in options: + if (options.protocol == "homa"): if stripped: vlog("Skipping final read of metrics (Homa is stripped)") else: @@ -1097,11 +1092,7 @@ def scan_logs(): vlog("\n%ss for %s experiment:" % (type.capitalize(), name)) for node in sorted(exp.keys()): if not gbps_key in exp[node]: - if name.startswith("unloaded"): - exp[node][gbps_key] = [0.0] - exp[node][kops_key] = [0.0] - else: - continue + continue gbps = exp[node][gbps_key] avg = sum(gbps)/len(gbps) vlog("%s: %.2f Gbps (%s)" % (node, avg, @@ -1287,29 +1278,6 @@ def get_buckets(rtts, total): buckets.append([length, 
cumulative/total]) return buckets -def set_unloaded(experiment): - """ - Collect measurements from an unloaded system to use in computing slowdowns. - - experiment: Name of experiment that measured RTTs under low load - """ - global unloaded_p50, min_rtt - - # Find (or generate) unloaded data for comparison. - files = sorted(glob.glob("%s/%s-*.rtts" % (log_dir, experiment))) - if len(files) == 0: - raise Exception("Couldn't find %s RTT data" % (experiment)) - rtts = {} - for file in files: - read_rtts(file, rtts) - unloaded_p50.clear() - min_rtt = 1e20 - for length in rtts.keys(): - sorted_rtts = sorted(rtts[length]) - unloaded_p50[length] = sorted_rtts[len(rtts[length])//2] - min_rtt = min(min_rtt, sorted_rtts[0]) - vlog("Computed unloaded_p50: %d entries" % len(unloaded_p50)) - def get_digest(experiment): """ Returns an element of digest that contains data for a particular @@ -1320,7 +1288,7 @@ def get_digest(experiment): experiment: Name of the desired experiment """ - global old_slowdown, digests, log_dir, min_rtt, unloaded_p50, delete_rtts + global digests, log_dir, delete_rtts global link_mbps if experiment in digests: @@ -1339,6 +1307,7 @@ def get_digest(experiment): digest["slow_999"] = [] avg_slowdowns = [] + baseline_rtt = get_baseline_rtt() # Read in the RTT files for this experiment. files = sorted(glob.glob(log_dir + ("/%s-*.rtts" % (experiment)))) @@ -1348,13 +1317,13 @@ def get_digest(experiment): sys.stdout.write("Reading RTT data for %s experiment: " % (experiment)) sys.stdout.flush() for file in files: - count, slowdown = read_rtts(file, digest["rtts"], min_rtt, link_mbps) + count, slowdown = read_rtts(file, digest["rtts"], baseline_rtt, link_mbps) digest["total_messages"] += count avg_slowdowns.append([file, slowdown]) sys.stdout.write("#") sys.stdout.flush() - if delete_rtts and not ("unloaded" in file): + if delete_rtts: os.remove(file) log("") @@ -1368,9 +1337,6 @@ def get_digest(experiment): log("Outlier alt-slowdown in %s: %.1f vs. 
%.1f overall average" % (info[0], info[1],
                    overall_avg))

-    if old_slowdown and (len(unloaded_p50) == 0):
-        raise Exception("No unloaded data: must invoke set_unloaded")
-
     rtts = digest["rtts"]
     buckets = get_buckets(rtts, digest["total_messages"])
     bucket_length, bucket_cum_frac = buckets[0]
@@ -1381,10 +1347,7 @@
     slowdown_sum = 0.0
     lengths = sorted(rtts.keys())
     lengths.append(999999999)       # Force one extra loop iteration
-    if old_slowdown:
-        optimal = unloaded_p50[min(unloaded_p50.keys())]
-    else:
-        optimal = 15 + lengths[0]*8/link_mbps
+    optimal = baseline_rtt + lengths[0]*8/link_mbps
     for length in lengths:
         if length > bucket_length:
             digest["lengths"].append(bucket_length)
@@ -1408,10 +1371,7 @@
             bucket_count = 0
             bucket_length, bucket_cum_frac = buckets[next_bucket]
             next_bucket += 1
-            if old_slowdown:
-                optimal = unloaded_p50[length]
-            else:
-                optimal = 15 + length*8/link_mbps
+            optimal = baseline_rtt + length*8/link_mbps
         bucket_count += len(rtts[length])
         for rtt in rtts[length]:
             bucket_rtts.append(rtt)

From 34937f1fa0403f3ea23dd326173b68bd2e42bfec Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 17 Nov 2025 10:40:16 -0800
Subject: [PATCH 565/625] Document the defer_min_bytes configuration parameter

---
 man/homa.7 | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/man/homa.7 b/man/homa.7
index 64940a96..2ee5b3df 100644
--- a/man/homa.7
+++ b/man/homa.7
@@ -556,6 +556,15 @@ aggressively (which could impact application performance) until the number
 of dead packet buffers drops below
 .I dead_buffs_limit .
 .TP
+.IR defer_min_bytes
+Messages shorter than this value will always be transmitted immediately,
+without worrying about NIC queue length. Messages of this length or greater
+will be queued if the NIC queue becomes too long, in order to implement
+SRPT for outgoing messages. Short messages are transmitted immediately
+because (a) it's unlikely that they can be generated rapidly enough
+to produce significant queuing in the NIC and (b) deferring them can overload
+the pacer to the point where it cannot keep the uplink fully saturated.
+.TP
 .IR fifo_grant_increment
 An integer value.
 When Homa decides to issue a grant to the oldest message (because of

From f064cf5185505d882e7bb9f4826d0f198a2b94bb Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 17 Nov 2025 15:38:18 -0800
Subject: [PATCH 566/625] Add info to perf.txt

Discussion of separating outbound pacer traffic from non-paced.

---
 perf.txt | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/perf.txt b/perf.txt
index 5ffcb608..f76a535f 100644
--- a/perf.txt
+++ b/perf.txt
@@ -2,6 +2,20 @@ This file contains various notes and lessons learned concerning performance
 of the Homa Linux kernel module. The notes are in reverse
 chronological order.
 
+64. (November 2025) Separating pacer traffic from non-paced traffic in
+homa_qdisc (use tx queue 0 for paced traffic; non-paced traffic is spread
+across other queues, using default queues except that traffic for queue 0
+goes to queue 1 instead). In comparison to the old pacer (measurements
+with w4 and w5 on c6620 cluster at 80 Gbps load; see log book for graphs):
+* P99 for messages shorter than defer_min_bytes is 20-30% faster with separation
+* P99 for messages between defer_min_bytes and unsched_limit is about 2x
+  slower with separation
+* P99 for messages longer than unsched_limit starts off 40-50% slower with
+  separation, but gradually converges.
+* Increasing defer_min_bytes provides upside with no apparent downside.
+* Average slowdowns are better with the old pacer: 3.45 vs. 3.77 for W4,
+  9.40 vs. 7.72 for W5 (W5 has no messages shorter than defer_min_bytes).
+
 63. (September 2025) Compared CPU utilization against TCP. Measured with
 top, running cp_vs_tcp -w w4 -b20 on a 6-node xl170 cluster (20 cores):

From dda2c125587821b6e86eaaf92958d4eaf3db155a Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 17 Nov 2025 15:39:58 -0800
Subject: [PATCH 567/625] Update notes.txt

---
 notes.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/notes.txt b/notes.txt
index 0ce2e6e1..92642c10 100755
--- a/notes.txt
+++ b/notes.txt
@@ -2,7 +2,6 @@ Notes for Homa implementation in Linux:
 ---------------------------------------
 
 * Performance problems to track down:
-  * On c6620 cluster, Homa is getting RPC timeouts on w4 at 'b 80' and '-b 100'
   * On xl170s, both TCP and Homa run slower with qdisc than pacer (P99
     for TCP small packets increases by 50%)

From a8fd87fdd4ca4515588e657d3192acd8e87138f8 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 17 Nov 2025 15:40:48 -0800
Subject: [PATCH 568/625] Add "-c defer_min_bytes" option to cp_config

Also a few formatting changes in graphs

---
 util/cp_config | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/util/cp_config b/util/cp_config
index 4e3652fc..482d6604 100755
--- a/util/cp_config
+++ b/util/cp_config
@@ -17,8 +17,8 @@ parser = get_parser(description=
         usage='%(prog)s [options]')
 parser.add_argument('-c', '--config', dest='config',
         choices=['balance', 'buffers', 'busy_usecs', 'client_threads',
-        'dctcp_buffers', 'fifo', 'gbps', 'gen2', 'gen3',
-        'grant_policy', 'gro_busy_usecs', 'load',
+        'dctcp_buffers', 'defer_min_bytes', 'fifo', 'gbps',
+        'gen2', 'gen3', 'grant_policy', 'gro_busy_usecs', 'load',
         'max_gro', 'max_gso', 'mtu', 'nic_queue', 'poll',
         'ports', 'prios', 'receivers', 'repeat', 'tcp_buffers',
         'throttle', 'time', 'unsched_bytes'],
@@ -87,6 +87,12 @@ elif options.config == 'dctcp_buffers':
                 'options': ['protocol', 'dctcp'],
                 'sysctl': ['.net.ipv4.tcp_congestion_control', 'dctcp'],
                 'switch_buffer': mb})
+elif options.config == 'defer_min_bytes':
+    # Vary the threshold below which messages are always transmitted
+    # immediately (never deferred because of NIC queue length)
+    for value in [1000, 3000, 10000]:
+        specs.append({'exp_name': 'defer_%d' % (value),
+                'label': 'defer_min_bytes %d' % (value),
+                'sysctl': ['.net.homa.defer_min_bytes', value]})
 elif options.config == 'fifo':
     # Vary the fraction of bandwidth reserved for the oldest message
     for fifo in [0, 5, 10, 20]:
@@ -334,8 +340,8 @@ if switch:
 for workload, bw, seconds in load_info:
     # Generate slowdown plot.
     log("Generating slowdown plot for %s" % (workload))
-    title = "%s %d nodes, %.1f Gbps" % (workload.capitalize(),
-            options.num_nodes, bw)
+    title = "%s, %d %s nodes, %.1f Gbps" % (workload.capitalize(),
+            options.num_nodes, get_node_type(), bw)
     ax = start_plot_vs_msg_length(title, plot_max_y, "%s_%s" % (
             specs[0]['exp_name'], workload), y_label=" Slowdown")
     for spec in specs:
@@ -354,9 +360,9 @@ for workload, bw, seconds in load_info:

     # Generate latency plot.
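The defer_min_bytes sweep added above exercises the sysctl parameter that
patch 565 documents. For reference, outside of cp_config the parameter can
be set the same way cperf.py applies it; a sketch (the node count and
threshold value below are examples only):

    # Set defer_min_bytes to 3000 bytes on node0-node3, using the
    # set_sysctl_parameter helper from cperf.py (it runs
    # "sudo sysctl .net.homa.defer_min_bytes=3000" on each node via ssh):
    set_sysctl_parameter(".net.homa.defer_min_bytes", "3000", range(4))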
log("Generating latency plot for %s" % (workload)) - title = "%s %d nodes, %.1f Gbps" % (workload.capitalize(), - options.num_nodes, bw) - ax = start_plot_vs_msg_length(title, [30, 30000], "%s_%s" % ( + title = "%s, %d %s nodes, %.1f Gbps" % (workload.capitalize(), + options.num_nodes, get_node_type(), bw) + ax = start_plot_vs_msg_length(title, [10, 10000], "%s_%s" % ( specs[0]['exp_name'], workload), y_label=r'RTT (µsec)') for spec in specs: exp_name = "%s_%s" % (spec['exp_name'], workload) @@ -371,7 +377,8 @@ for workload, bw, seconds in load_info: # Generate CDF of small-message RTTs. log("Generating short message CDFs for %s" % (workload)) - title = "%s %d nodes" % (workload.capitalize(), options.num_nodes) + title = "%s, %d %s nodes" % (workload.capitalize(), options.num_nodes, + get_node_type()) start_cdf_plot(title, 10, 0.99e05, 1e-05, "RTT (usecs)", "Cumulative Fraction of Short Messages") for spec in specs: From 6778a379de3c029df55a5ba0ef8e082a0b8d33f0 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 17 Nov 2025 15:42:42 -0800 Subject: [PATCH 569/625] Add read_digest function to cperf.py --- util/cperf.py | 55 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 5 deletions(-) diff --git a/util/cperf.py b/util/cperf.py index 7c249890..34927be8 100644 --- a/util/cperf.py +++ b/util/cperf.py @@ -1280,9 +1280,9 @@ def get_buckets(rtts, total): def get_digest(experiment): """ - Returns an element of digest that contains data for a particular + Returns an element of digests that contains data for a particular experiment; if this is the first request for a given experiment, the - method reads the data for experiment and generates the digest. For + method reads the raw RTT data for experiment and generates the digest. For each new digest generated, a .data file is generated in the "reports" subdirectory of the log directory. @@ -1411,6 +1411,52 @@ def get_digest(experiment): digests[experiment] = digest return digest +def read_digest(file): + """ + Read digest data from a file return the parsed digest. All digest fields + are populated except rtts. 
+
+    file: Name of the file to read
+    """
+
+    digest = {
+        "total_messages": 0,
+        "lengths": [],
+        "cum_frac": [],
+        "counts": [],
+        "p50": [],
+        "p99": [],
+        "p999": [],
+        "s50": [],
+        "s99": [],
+        "s999": []
+    }
+    line_num = 0
+    f = open(file)
+    for line in f:
+        line_num += 1
+        if line.startswith('#'):
+            continue
+        values = line.strip().split()
+        if len(values) != 9:
+            print("Line %d in %s had %d field(s), expected 9: %s" % (line_num,
+                    file, len(values), line.rstrip()), file=sys.stderr)
+            continue
+        length, cum_frac, count, p50, p99, p999, s50, s99, s999 = values
+        count = int(count)
+        digest["total_messages"] += count
+        digest["lengths"].append(int(length))
+        digest["cum_frac"].append(float(cum_frac))
+        digest["counts"].append(count)
+        digest["p50"].append(float(p50))
+        digest["p99"].append(float(p99))
+        digest["p999"].append(float(p999))
+        digest["s50"].append(float(s50))
+        digest["s99"].append(float(s99))
+        digest["s999"].append(float(s999))
+    f.close()
+    return digest
+
 def start_plot_vs_msg_length(title, y_range, x_experiment, size=10,
         show_top_label=True, show_bot_label=True, figsize=[6,4],
         y_label="Slowdown", show_upper_x_axis=True):
@@ -1484,10 +1530,9 @@
     target_count = 0
     tick = 0
     digest = get_digest(x_experiment)
-    rtts = digest["rtts"]
     total = digest["total_messages"]
-    for length in sorted(rtts.keys()):
-        cumulative_count += len(rtts[length])
+    for length, count in zip(digest["lengths"], digest["counts"]):
+        cumulative_count += count
         while cumulative_count >= target_count:
             ticks.append(target_count/total)
             if length < 1000:

From 73b9ee43984c128f12dbb8d2bd57283ffdd78ef6 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 17 Nov 2025 15:43:18 -0800
Subject: [PATCH 570/625] A few format changes in graph output

---
 util/cp_vs_tcp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/util/cp_vs_tcp b/util/cp_vs_tcp
index c785b78f..e6d29bd0 100755
--- a/util/cp_vs_tcp
+++ b/util/cp_vs_tcp
@@ -96,8 +96,8 @@ for workload, bw, seconds in load_info:

     # Generate slowdown plot.
     log("Generating slowdown plot for %s" % (workload))
-    title = "%s %d nodes, %.1f Gbps" % (workload.capitalize(),
-            options.num_nodes, bw)
+    title = "%s, %d %s nodes, %.1f Gbps" % (workload.capitalize(),
+            options.num_nodes, get_node_type(), bw)
     ax = start_plot_vs_msg_length(title, 1000, homa_exp)
     if options.tcp:
         plot_slowdown(ax, tcp_exp, "p99", "TCP P99", color=tcp_color)
@@ -113,9 +113,9 @@ for workload, bw, seconds in load_info:

     # Generate latency plot.
     log("Generating RTT latency plot for %s" % (workload))
-    title = "%s %d nodes, %.1f Gbps" % (workload.capitalize(),
-            options.num_nodes, bw)
-    ax = start_plot_vs_msg_length(title, [30, 30000], homa_exp,
+    title = "%s, %d %s nodes, %.1f Gbps" % (workload.capitalize(),
+            options.num_nodes, get_node_type(), bw)
+    ax = start_plot_vs_msg_length(title, [10, 10000], homa_exp,
             y_label=r'RTT (µsec)')
     if options.tcp:
         plot_histogram(ax, tcp_exp, "p99", "TCP P99", color=tcp_color)

From 548056faa1b7030ba35c099e57bd67de6b20225f Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Tue, 18 Nov 2025 13:40:31 -0800
Subject: [PATCH 571/625] Fix bug in location of priority bits for IPv6

Bits were positioned one bit too low in the Traffic Class field.
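For reference, the arithmetic behind this fix: the tclass argument to
ip6_xmit() carries the full 8-bit IPv6 Traffic Class byte, whose upper 6
bits are the DSCP field and lower 2 bits are ECN. A priority mapped into
the uppermost DSCP bits therefore needs a left shift of 5; the previous
shift of 4 landed it one bit lower. A sketch of the bit arithmetic (the
priority value is just an example):

    priority = 5                  # example 3-bit priority (0b101)
    tclass_old = priority << 4    # 0x50: DSCP = 0b010100, one bit too low
    tclass_new = priority << 5    # 0xa0: DSCP = 0b101000, priority in the
                                  # top three DSCP bits
    assert (tclass_new >> 2) == (priority << 3)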
--- homa_outgoing.c | 4 ++-- test/mock.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/homa_outgoing.c b/homa_outgoing.c index 72dd58f3..12cce77a 100644 --- a/homa_outgoing.c +++ b/homa_outgoing.c @@ -476,7 +476,7 @@ int __homa_xmit_control(void *contents, size_t length, struct homa_peer *peer, #ifndef __STRIP__ /* See strip.py */ if (hsk->inet.sk.sk_family == AF_INET6) { result = ip6_xmit(&hsk->inet.sk, skb, &peer->flow.u.ip6, 0, - NULL, hsk->homa->priority_map[priority] << 4, + NULL, hsk->homa->priority_map[priority] << 5, 0); } else { /* This will find its way to the DSCP field in the IPv4 hdr. */ @@ -688,7 +688,7 @@ void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc) #ifndef __STRIP__ /* See strip.py */ err = ip6_xmit(&rpc->hsk->inet.sk, skb, &rpc->peer->flow.u.ip6, 0, NULL, - rpc->hsk->homa->priority_map[priority] << 4, 0); + rpc->hsk->homa->priority_map[priority] << 5, 0); #else /* See strip.py */ ip6_xmit(&rpc->hsk->inet.sk, skb, &rpc->peer->flow.u.ip6, 0, NULL, 0, 0); diff --git a/test/mock.c b/test/mock.c index 187744f9..611ca239 100644 --- a/test/mock.c +++ b/test/mock.c @@ -767,7 +767,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, mock_xmit_prios_offset += snprintf( mock_xmit_prios + mock_xmit_prios_offset, sizeof(mock_xmit_prios) - mock_xmit_prios_offset, - "%s%d", prefix, tclass >> 4); + "%s%d", prefix, tclass >> 5); if (mock_xmit_log_verbose) homa_print_packet(skb, buffer, sizeof(buffer)); else From 23b11989a85bedc20a4d8af7dba49bf411365b7e Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 18 Nov 2025 16:00:30 -0800 Subject: [PATCH 572/625] Updates to dell_switch.txt Added commands to use global defaults, with overrides. --- cloudlab/dell_switch.txt | 163 +++++++++++++++++++++++++-------------- 1 file changed, 103 insertions(+), 60 deletions(-) diff --git a/cloudlab/dell_switch.txt b/cloudlab/dell_switch.txt index 0e60fe0d..5320dbc3 100644 --- a/cloudlab/dell_switch.txt +++ b/cloudlab/dell_switch.txt @@ -1,12 +1,10 @@ -# Commands for configuring a Dell switch (OS 10.6.0) to enable priority -# queues with strict priority: - +! Commands for configuring a Dell switch (OS 10.6.0) to enable priority +! queues with strict priority. First, enter configure mode: enable configure terminal -# Create maps from DSCP classes to egress queues, and from queues -# to service policies. - +! Create maps from DSCP classes to egress queues, and from queues +! to service policies. class-map type queuing pq_cm0 match queue 0 exit @@ -51,9 +49,94 @@ class pq_cm7 priority exit -# Configure interfaces to use the trust-map (on input) and policy- -# (for output) +! Configure all ports to use the trust-map (on input) and the +! policy-map (for output) by default. +system qos +trust-map dscp default +service-policy output type queuing pq_pmap +exit + +! Create a qos-map that maps all traffic classes to queue 0 of egress ports +qos-map traffic-class qos_map_null +queue 0 qos-group 0-7 type ucast +queue 0 qos-group 0-7 type mcast +exit + +! Apply the above qos-map to a few specific ports: this overrides the global +! default so that there are no priorities for these egress ports (queue 0 is +! used for all traffic). 
+interface ethernet 1/1/15 +qos-map traffic-class qos_map_null +exit +interface ethernet 1/1/16 +qos-map traffic-class qos_map_null +exit +interface ethernet 1/1/17 +qos-map traffic-class qos_map_null +exit +interface ethernet 1/1/18 +qos-map traffic-class qos_map_null +exit +interface ethernet 1/1/47 +qos-map traffic-class qos_map_null +exit +interface ethernet 1/1/48 +qos-map traffic-class qos_map_null +exit +interface ethernet 1/1/49 +qos-map traffic-class qos_map_null +exit +interface ethernet 1/1/50 +qos-map traffic-class qos_map_null +exit + +! Do not apply commands below here: I've provided these to show how to +! undo the effects of the commands above, plus a few other things in +! case you need them in the future. + +! Remove the override for specific ports: +interface ethernet 1/1/36:3 +no qos-map traffic-class +exit +interface ethernet 1/1/28:1 +no qos-map traffic-class +exit +interface ethernet 1/1/57:3 +no qos-map traffic-class +exit +interface ethernet 1/1/39:3 +no qos-map traffic-class +exit +interface ethernet 1/1/31:2 +no qos-map traffic-class +exit +interface ethernet 1/1/62:1 +no qos-map traffic-class +exit +interface ethernet 1/1/36:2 +no qos-map traffic-class +exit +interface ethernet 1/1/59:3 +no qos-map traffic-class +exit +interface ethernet 1/1/58:2 +no qos-map traffic-class +exit +interface ethernet 1/1/31:4 +no qos-map traffic-class +exit +interface ethernet 1/1/7:4 +no qos-map traffic-class +exit + +! Remove the global default +system qos +no trust-map dscp +no service-policy output type queuing +exit +! Configure specific interfaces to use the trust-map (on input) and policy-map +! (for output). This is an alternative to the global default. interface ethernet 1/1/36:3 trust-map dscp default service-policy output type queuing pq_pmap @@ -99,88 +182,48 @@ trust-map dscp default service-policy output type queuing pq_pmap exit -# Reset ports - +! Remove the port-specific configuration. interface ethernet 1/1/36:3 no trust-map dscp -no service-policy output type queuing pq_pmap +no service-policy output type queuing exit interface ethernet 1/1/28:1 no trust-map dscp -no service-policy output type queuing pq_pmap +no service-policy output type queuing exit interface ethernet 1/1/57:3 no trust-map dscp -no service-policy output type queuing pq_pmap +no service-policy output type queuing exit interface ethernet 1/1/39:3 no trust-map dscp -no service-policy output type queuing pq_pmap +no service-policy output type queuing exit interface ethernet 1/1/31:2 no trust-map dscp -no service-policy output type queuing pq_pmap +no service-policy output type queuing exit interface ethernet 1/1/62:1 no trust-map dscp -no service-policy output type queuing pq_pmap +no service-policy output type queuing exit interface ethernet 1/1/36:2 no trust-map dscp -no service-policy output type queuing pq_pmap +no service-policy output type queuing exit interface ethernet 1/1/59:3 no trust-map dscp -no service-policy output type queuing pq_pmap +no service-policy output type queuing exit interface ethernet 1/1/58:2 no trust-map dscp -no service-policy output type queuing pq_pmap +no service-policy output type queuing exit interface ethernet 1/1/31:4 no trust-map dscp -no service-policy output type queuing pq_pmap +no service-policy output type queuing exit interface ethernet 1/1/7:4 no trust-map dscp -no service-policy output type queuing pq_pmap +no service-policy output type queuing exit - -# Create policy map that "flattens" the queues: they each get the -# same bandwidth. 
- -policy-map type queuing pmap_flat -class pq_cm0 -bandwidth percent 12 -class pq_cm1 -bandwidth percent 12 -class pq_cm2 -bandwidth percent 12 -class pq_cm3 -bandwidth percent 12 -class pq_cm4 -bandwidth percent 12 -class pq_cm5 -bandwidth percent 12 -class pq_cm6 -bandwidth percent 12 -class pq_cm7 -bandwidth percent 12 -exit - - -interface ethernet 1/1/36:2 -service-policy output type queuing pmap_flat -exit -interface ethernet 1/1/59:3 -service-policy output type queuing pmap_flat -exit -interface ethernet 1/1/58:2 -service-policy output type queuing pmap_flat -exit -interface ethernet 1/1/31:4 -service-policy output type queuing pmap_flat -exit -interface ethernet 1/1/7:4 -service-policy output type queuing pmap_flat -exit \ No newline at end of file From c420beb4108eadd2550bdcc8d70091e08df6ede0 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 18 Nov 2025 16:01:05 -0800 Subject: [PATCH 573/625] Minor improvements in graph formatting for cp_both --- util/cp_both | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/util/cp_both b/util/cp_both index d01a278a..94916de6 100755 --- a/util/cp_both +++ b/util/cp_both @@ -58,9 +58,9 @@ scan_metrics(tcp_exp) # Generate slowdown plot. log("Generating slowdown plot for %s" % (options.workload)) -title = "TCP (%.1f Gbps) and Homa (%.1f Gbps) together, %s %d nodes" % ( +title = "TCP (%.1f Gbps) and Homa (%.1f Gbps) together, %s, %d %s nodes" % ( options.gbps - options.homa_gbps, options.homa_gbps, - options.workload.capitalize(), options.num_nodes) + options.workload.capitalize(), options.num_nodes, get_node_type()) ax = start_plot_vs_msg_length(title, 1000, homa_exp) plot_slowdown(ax, tcp_exp, "p99", "TCP P99", color=tcp_color) plot_slowdown(ax, tcp_exp, "p50", "TCP P50", color=tcp_color2) @@ -72,7 +72,7 @@ plt.savefig("%s/reports/both_%s.pdf" % (options.log_dir, options.workload)) # Generate latency plot. 
log("Generating RTT latency plot for %s" % (options.workload)) -ax = start_plot_vs_msg_length(title, [30, 30000], homa_exp, +ax = start_plot_vs_msg_length(title, [10, 10000], homa_exp, y_label=r'RTT (µsec)') plot_histogram(ax, tcp_exp, "p99", "TCP P99", color=tcp_color) plot_histogram(ax, tcp_exp, "p50", "TCP P50", color=tcp_color2) From 636447c4ec350549b85a7030a2c26ad3701d7196 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 19 Nov 2025 09:48:11 -0800 Subject: [PATCH 574/625] Use homa_get_offset to retrieve pkt seg offsets correctly --- homa_devel.c | 2 +- homa_qdisc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/homa_devel.c b/homa_devel.c index 31e05869..272aede9 100644 --- a/homa_devel.c +++ b/homa_devel.c @@ -750,7 +750,7 @@ void homa_rpc_log_tt(struct homa_rpc *rpc) h = (struct homa_data_hdr *) skb->data; tt_record3("RPC id %d has %d bpages allocated, first uncopied offset %d", rpc->id, rpc->msgin.num_bpages, - ntohl(h->seg.offset)); + homa_get_offset(h)); } } } else if (rpc->state == RPC_OUTGOING) { diff --git a/homa_qdisc.c b/homa_qdisc.c index 0854fc11..bc8c38e8 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -753,7 +753,7 @@ int homa_qdisc_xmit_deferred_homa(struct homa_qdisc_dev *qdev) h = (struct homa_data_hdr *)skb_transport_header(skb); tt_record3("homa_qdisc_pacer queuing homa data packet for id %d, offset %d on qid %d", be64_to_cpu(h->common.sender_id), - ntohl(h->seg.offset), qdev->pacer_qix); + homa_get_offset(h), qdev->pacer_qix); homa_qdisc_redirect_skb(skb, qdev, true); return pkt_len; } From 47100eee0b367c2791ce7c21b814b8266d6f3d62 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 19 Nov 2025 16:48:43 -0800 Subject: [PATCH 575/625] Add more TCP timetraces to homa_qdisc --- homa_qdisc.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/homa_qdisc.c b/homa_qdisc.c index bc8c38e8..26c77315 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -673,8 +673,21 @@ int homa_qdisc_xmit_deferred_tcp(struct homa_qdisc_dev *qdev) spin_unlock_irqrestore(&qdev->defer_lock, flags); homa_qdisc_update_link_idle(qdev, pkt_len, -1); - tt_record2("homa_qdisc_pacer queuing tcp packet with length %d on qid %d", - pkt_len, q->ix); + if (ip_hdr(skb)->protocol == IPPROTO_TCP) { + struct tcphdr *th; + + th = (struct tcphdr*) skb_transport_header(skb); + ltt_record4("homa_qdisc_pacer requeued TCP packet " + "from 0x%08x:%d to 0x%08x:%d", + ntohl(ip_hdr(skb)->saddr), ntohs(th->source), + ntohl(ip_hdr(skb)->daddr), ntohs(th->dest)); + ltt_record4("homa_qdisc_pacer requeued TCP packet (2) " + "sequence %u, data bytes %d, ack %u, gso_size %d", + ntohl(th->seq), + skb->len - skb_transport_offset(skb) - + tcp_hdrlen(skb), ntohl(th->ack_seq), + skb_shinfo(skb)->gso_size); + } sch = q->sch; spin_lock_bh(qdisc_lock(sch)); qdisc_enqueue_tail(skb, sch); From f85102b3884e8780bdbfc258e312a64d53da2b2d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 19 Nov 2025 16:49:22 -0800 Subject: [PATCH 576/625] Include TCP packets in tthoma tkpkts analyzer * Add more fields to TCP packets. * Revise TCP timetraces in various ways. * Also added free delays to delay analyzer. --- util/tthoma.py | 444 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 330 insertions(+), 114 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index eab20de8..9a3f0934 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -239,18 +239,28 @@ def __missing__(self, key): # It is created by AnalyzePackets. 
See get_tcp_packet for details on the keys # used to look up packets. Each value is a dictionary containing the following # fields: +# id: Always zero; this can be used to distinguish TCP packets from +# Homa packets, where there is always a nonzero id. # saddr: Source address of the packet (hex string) # sport: Source port number # daddr: Destination address of the packet (hex string) # dport: Destination port number # sequence: The sequence number in the packet -# data_bytes: The number of data bytes in the packet +# length: # bytes of message data in the received packet +# tso_length: The number of data bytes in the packet (before TSO) # total_length: Total length of the packet, including IP and TCP headers # ack: The ack sequence number in the packet +# xmit: Time when ip*xmit was invoked for the packet +# qdisc_xmit: Time when homa_qdisc requeued a packet that was deferred +# because of NIC queue length (only present for deferred +# packets) # nic: Time when the the packet was handed off to the NIC +# free_tx_skb: Time when NAPI released the skb on the sender, which can't +# happen until the packet has been fully transmitted. # gro: Time when GRO received the packet # tx_node: Node that sent the packet (corresponds to saddr) # rx_node: Node that received the packet (corresponds to daddr) +# retransmits: Always empty (for compatibility with Homa packets) tcp_packets = {} # Node -> list of intervals for that node. Created by the intervals analyzer. @@ -636,12 +646,21 @@ def get_tcp_packet(saddr, sport, daddr, dport, sequence, data_bytes, ack): """ global tcp_packets - key = '%s:%d %s:%d %d %d %d' % (saddr, sport, daddr, dport, sequence, - data_bytes, ack) + # This is tricky because a 'data' packet can arrive with no data, + # just an ack. This code will create one packet if there is data, + # ignoring amount of data and ack. If there is no data, then one + # packet is created for each distinct sequence/ack combination. + + if data_bytes > 0: + key = '%s:%d %s:%d %d' % (saddr, sport, daddr, dport, sequence) + else: + key = '%s:%d %s:%d %d ack %d' % (saddr, sport, daddr, dport, sequence, + ack) if key in tcp_packets: return tcp_packets[key] - pkt = {'saddr': saddr, 'sport': sport, 'daddr': daddr, 'dport': dport, - 'sequence': sequence, 'data_bytes': data_bytes, 'ack': ack} + pkt = {'id': 0, 'saddr': saddr, 'sport': sport, 'daddr': daddr, + 'dport': dport, 'sequence': sequence, 'length': data_bytes, + 'ack': ack, 'retransmits': []} tcp_packets[key] = pkt return pkt @@ -899,18 +918,22 @@ def print_if(value, fmt, modifier=None): def print_pkts(pkts, header=True): """ Returns a string containing one line for each packet in pkts, which - contains various useful information about the packet. If header is True + contains various useful information about the packet. The entries in + pkts can be either Homa packets or TCP packets. If header is True then the string also includes initial text describing the fields that are printed on each line. 
""" buf = StringIO() + buf.write('# Source: Node that sent packet\n') buf.write('# Dest: Node to which packet was sent\n') buf.write('# Xmit: Time when packet was passed to ip*xmit\n') buf.write('# Qdisc: Time when homa_qdisc requeued packet after ' 'deferral, if any\n') - buf.write('# RpcId: Identifier of packet\'s RPC\n') - buf.write('# Offset: Offset of packet within message\n') + buf.write('# Id/Seq: RPC identifier for Homa packets, sequence ' + 'number for TCP\n') + buf.write('# Offset: Offset of packet within message or "TCP" if ' + 'packet is TCP\n') buf.write('# Length: Size of packet (before segmentation)\n') buf.write('# Qid: Transmit queue on which packet was sent\n') buf.write('# Nic: Time when packet was queued for NIC\n') @@ -921,8 +944,8 @@ def print_pkts(pkts, header=True): buf.write('# FDelay: Free - Nic\n') buf.write('# Rx: Number of times segments in the packet were ' 'retransmitted\n\n') - buf.write('# Dest Xmit Qdisc RpcId Offset Length') - buf.write(' Qid Nic NDelay Gro GDelay') + buf.write('Source Dest Xmit Qdisc Id/Seq Offset') + buf.write(' Length Qid Nic NDelay Gro GDelay') buf.write(' Free FDelay Rx\n') for pkt in pkts: xmit = pkt['xmit'] @@ -952,17 +975,22 @@ def print_pkts(pkts, header=True): rx += len(seg['retransmits']) rx_msg = str(rx) if rx > 0 else "" - line = '%-10s %10.3f %10s %10d %6d %6d' % ( + line = '%-8s %-8s %10.3f %10s' % (pkt['tx_node'], pkt['rx_node'] if 'rx_node' in pkt else "", - xmit, qdisc_string, pkt['id'], pkt['offset'], length) + xmit, qdisc_string) + if pkt['id'] != 0: + line += ' %10d %6d' % (pkt['id'], pkt['offset']) + else: + # This is a TCP packet + line += ' %10d TCP' % (pkt['sequence']) nic_delay_string = '' if nic_delay != None: nic_delay_string = '%.1f' % (nic_delay) gro_delay_string = '' if gro != None and nic != None: gro_delay_string = '%.1f' % (gro - nic) - line += ' %3s %10s %7s %10s %7s' % (print_if(qid, '%d'), - print_if(nic, '%.3f'), nic_delay_string, + line += ' %6d %3s %10s %7s %10s %7s' % (length, + print_if(qid, '%d'), print_if(nic, '%.3f'), nic_delay_string, print_if(gro, '%.3f'), gro_delay_string) free_delay_string = '' if (nic != None) and (free != None): @@ -1316,40 +1344,6 @@ def __gro_grant(self, trace, time, core, match, interests): 'offset ([0-9]+), priority ([0-9]+)' }) - def __gro_tcp(self, trace, time, core, match, interests): - saddr = match.group(1) - sport = int(match.group(2)) - daddr = match.group(3) - dport = int(match.group(4)) - self.core_saved[core] = {'saddr': saddr, 'sport': sport, - 'daddr': daddr, 'dport': dport} - - patterns.append({ - 'name': 'gro_tcp', - 'regexp': 'tcp_gro_receive got packet from ([^:]+):([0-9]+) to ' - '([^:]+):([0-9]+)' - }) - - def __gro_tcp2(self, trace, time, core, match, interests): - if not core in self.core_saved: - return - saved = self.core_saved[core] - sequence = int(match.group(1)) - data_bytes = int(match.group(2)) - total = int(match.group(3)) - ack = int(match.group(4)) - for interest in interests: - interest.tt_gro_tcp(trace, time, core, saved['saddr'], - saved['sport'], saved['daddr'], saved['dport'], sequence, - data_bytes, total, ack) - del self.core_saved[core] - - patterns.append({ - 'name': 'gro_tcp2', - 'regexp': r'tcp_gro_receive .2. 
sequence ([-0-9]+), data bytes ' - '([0-9]+), total length ([0-9]+), ack ([-0-9]+)' - }) - def __softirq_data(self, trace, time, core, match, interests): id = int(match.group(1)) offset = int(match.group(2)) @@ -1460,40 +1454,6 @@ def __nic_grant(self, trace, time, core, match, interests): 'offset ([0-9]+), queue (0x[0-9a-f]+)' }) - def __nic_tcp(self, trace, time, core, match, interests): - saddr = match.group(2) - sport = int(match.group(3)) - daddr = match.group(4) - dport = int(match.group(5)) - self.core_saved[core] = {'saddr': saddr, 'sport': sport, - 'daddr': daddr, 'dport': dport} - - patterns.append({ - 'name': 'nic_tcp', - 'regexp': '(mlx|ice) sent TCP packet from ([^:]+):([0-9]+) to ' - '([^:]+):([0-9]+)' - }) - - def __nic_tcp2(self, trace, time, core, match, interests): - if not core in self.core_saved: - return - saved = self.core_saved[core] - sequence = int(match.group(2)) - data_bytes = int(match.group(3)) - ack = int(match.group(4)) - gso_size = int(match.group(5)) - for interest in interests: - interest.tt_nic_tcp(trace, time, core, saved['saddr'], - saved['sport'], saved['daddr'], saved['dport'], - sequence, data_bytes, ack, gso_size) - del self.core_saved[core] - - patterns.append({ - 'name': 'nic_tcp2', - 'regexp': r'(mlx|ice) sent TCP packet .2. sequence ([-0-9]+), ' - 'data bytes ([0-9]+), ack ([-0-9]+), gso_size ([0-9]+)' - }) - def __free_tx_skb(self, trace, time, core, match, interests): id = int(match.group(1)) offset = int(match.group(2)) @@ -2013,6 +1973,175 @@ def __snapshot_server_response(self, trace, time, core, match, interests): 'kbytes_started ([0-9]+), kbytes_done ([0-9]+), done ([0-9]+)' }) + def __xmit_tcp(self, trace, time, core, match, interests): + saddr = match.group(1) + sport = int(match.group(2)) + daddr = match.group(3) + dport = int(match.group(4)) + self.core_saved[core] = {'saddr': saddr, 'sport': sport, + 'daddr': daddr, 'dport': dport} + + patterns.append({ + 'name': 'xmit_tcp', + 'regexp': 'Transmitting TCP packet from ([^:]+):([0-9]+) to ' + '([^:]+):([0-9]+)' + }) + + def __xmit_tcp2(self, trace, time, core, match, interests): + if not core in self.core_saved: + return + saved = self.core_saved[core] + sequence = int(match.group(1)) + data_bytes = int(match.group(2)) + total_length = int(match.group(3)) + ack = int(match.group(4)) + for interest in interests: + interest.tt_xmit_tcp(trace, time, core, saved['saddr'], + saved['sport'], saved['daddr'], saved['dport'], + sequence, data_bytes, total_length, ack) + del self.core_saved[core] + + patterns.append({ + 'name': 'xmit_tcp2', + 'regexp': r'Transmitting TCP packet .2. 
sequence ([-0-9]+), ' + 'data bytes ([0-9]+), total length ([-0-9]+), ack ([-0-9]+)' + }) + + def __qdisc_tcp(self, trace, time, core, match, interests): + saddr = match.group(1) + sport = int(match.group(2)) + daddr = match.group(3) + dport = int(match.group(4)) + self.core_saved[core] = {'saddr': saddr, 'sport': sport, + 'daddr': daddr, 'dport': dport} + + patterns.append({ + 'name': 'qdisc_tcp', + 'regexp': 'homa_qdisc_pacer requeued TCP packet from ([^:]+):([0-9]+) ' + 'to ([^:]+):([0-9]+)' + }) + + def __qdisc_tcp2(self, trace, time, core, match, interests): + if not core in self.core_saved: + return + saved = self.core_saved[core] + sequence = int(match.group(1)) + data_bytes = int(match.group(2)) + ack = int(match.group(3)) + for interest in interests: + interest.tt_qdisc_tcp(trace, time, core, saved['saddr'], + saved['sport'], saved['daddr'], saved['dport'], + sequence, data_bytes, ack) + del self.core_saved[core] + + patterns.append({ + 'name': 'qdisc_tcp2', + 'regexp': r'homa_qdisc_pacer requeued TCP packet .2. sequence ([-0-9]+), ' + 'data bytes ([0-9]+), ack ([-0-9]+)' + }) + + def __nic_tcp(self, trace, time, core, match, interests): + saddr = match.group(2) + sport = int(match.group(3)) + daddr = match.group(4) + dport = int(match.group(5)) + self.core_saved[core] = {'saddr': saddr, 'sport': sport, + 'daddr': daddr, 'dport': dport} + + patterns.append({ + 'name': 'nic_tcp', + 'regexp': '(mlx|ice) sent TCP packet from ([^:]+):([0-9]+) to ' + '([^:]+):([0-9]+)' + }) + + def __nic_tcp2(self, trace, time, core, match, interests): + if not core in self.core_saved: + return + saved = self.core_saved[core] + sequence = int(match.group(2)) + data_bytes = int(match.group(3)) + ack = int(match.group(4)) + gso_size = int(match.group(5)) + for interest in interests: + interest.tt_nic_tcp(trace, time, core, saved['saddr'], + saved['sport'], saved['daddr'], saved['dport'], + sequence, data_bytes, ack, gso_size) + del self.core_saved[core] + + patterns.append({ + 'name': 'nic_tcp2', + 'regexp': r'(mlx|ice) sent TCP packet .2. sequence ([-0-9]+), ' + 'data bytes ([0-9]+), ack ([-0-9]+), gso_size ([0-9]+)' + }) + + def __free_tcp(self, trace, time, core, match, interests): + saddr = match.group(1) + sport = int(match.group(2)) + daddr = match.group(3) + dport = int(match.group(4)) + self.core_saved[core] = {'saddr': saddr, 'sport': sport, + 'daddr': daddr, 'dport': dport} + + patterns.append({ + 'name': 'free_tcp', + 'regexp': 'napi freeing TCP skb from ([^:]+):([0-9]+) to ' + '([^:]+):([0-9]+)' + }) + + def __free_tcp2(self, trace, time, core, match, interests): + if not core in self.core_saved: + return + saved = self.core_saved[core] + sequence = int(match.group(1)) + data_bytes = int(match.group(2)) + ack = int(match.group(3)) + qid = int(match.group(4)) + for interest in interests: + interest.tt_free_tcp(trace, time, core, saved['saddr'], + saved['sport'], saved['daddr'], saved['dport'], + sequence, data_bytes, ack, qid) + del self.core_saved[core] + + patterns.append({ + 'name': 'free_tcp2', + 'regexp': r'napi freeing TCP skb .2. 
sequence ([-0-9]+), ' + 'data bytes ([0-9]+), ack ([-0-9]+), qid ([-0-9]+)' + }) + + def __gro_tcp(self, trace, time, core, match, interests): + saddr = match.group(1) + sport = int(match.group(2)) + daddr = match.group(3) + dport = int(match.group(4)) + self.core_saved[core] = {'saddr': saddr, 'sport': sport, + 'daddr': daddr, 'dport': dport} + + patterns.append({ + 'name': 'gro_tcp', + 'regexp': 'tcp_gro_receive got packet from ([^:]+):([0-9]+) to ' + '([^:]+):([0-9]+)' + }) + + def __gro_tcp2(self, trace, time, core, match, interests): + if not core in self.core_saved: + return + saved = self.core_saved[core] + sequence = int(match.group(1)) + data_bytes = int(match.group(2)) + total = int(match.group(3)) + ack = int(match.group(4)) + for interest in interests: + interest.tt_gro_tcp(trace, time, core, saved['saddr'], + saved['sport'], saved['daddr'], saved['dport'], sequence, + data_bytes, total, ack) + del self.core_saved[core] + + patterns.append({ + 'name': 'gro_tcp2', + 'regexp': r'tcp_gro_receive .2. sequence ([-0-9]+), data bytes ' + '([0-9]+), total length ([0-9]+), ack ([-0-9]+)' + }) + #------------------------------------------------ # Analyzer: activity #------------------------------------------------ @@ -2900,16 +3029,19 @@ def print_pkt_delays(self): short_to_nic = [] short_to_gro = [] short_to_softirq = [] + short_free = [] short_total = [] long_to_nic = [] long_to_gro = [] long_to_softirq = [] + long_free = [] long_total = [] grant_to_nic = [] grant_to_gro = [] grant_to_softirq = [] + grant_free = [] grant_total = [] # Collect statistics about delays within individual packets. @@ -2934,6 +3066,10 @@ def print_pkt_delays(self): delay = pkt['softirq'] - pkt['xmit'] if delay > 0: short_total.append([delay, p, pkt['softirq']]) + if ('nic' in pkt) and ('free_tx_skb' in pkt): + delay = pkt['free_tx_skb'] - pkt['nic'] + if delay > 0: + short_free.append([delay, p, pkt['free_tx_skb']]) else: if 'tso_length' in pkt: if 'nic' in pkt: @@ -2944,6 +3080,10 @@ def print_pkt_delays(self): delay = pkt['nic'] - pkt['xmit'] if delay > 0: long_to_nic.append([delay, p, pkt['nic']]) + if 'free_tx_skb' in pkt: + delay = pkt['free_tx_skb'] - pkt['nic'] + if delay > 0: + long_free.append([delay, p, pkt['free_tx_skb']]) if ('nic' in pkt) and ('gro' in pkt): delay = pkt['gro'] - pkt['nic'] if delay > 0: @@ -2974,6 +3114,10 @@ def print_pkt_delays(self): delay = pkt['softirq'] - pkt['xmit'] if delay > 0: grant_total.append([delay, p, pkt['softirq']]) + if ('nic' in pkt) and ('free_tx_skb' in pkt): + delay = pkt['free_tx_skb'] - pkt['nic'] + if delay > 0: + grant_free.append([delay, p, pkt['free_tx_skb']]) print('\n----------------') print('Analyzer: delay') @@ -2985,6 +3129,8 @@ def print_pkt_delays(self): print(' homa_xmit_control)') print('Net: Time from when NIC received packet until GRO started processing') print('SoftIRQ: Time from GRO until SoftIRQ started processing') + print('Free: Time from when NIC received packet until packet was returned') + print(' to Linux and freed') print('Total: Total time from ip*xmit call until SoftIRQ processing') def print_pcts(data, label): @@ -3003,18 +3149,21 @@ def print_pcts(data, label): print_pcts(short_to_nic, 'Xmit') print_pcts(short_to_gro, 'Net') print_pcts(short_to_softirq, 'SoftIRQ') + print_pcts(short_free, 'Free') print_pcts(short_total, 'Total') print('\nData packets from multi-packet messages:') print_pcts(long_to_nic, 'Xmit') print_pcts(long_to_gro, 'Net') print_pcts(long_to_softirq, 'SoftIRQ') + print_pcts(long_free, 'Free') 
print_pcts(long_total, 'Total') print('\nGrants:') print_pcts(grant_to_nic, 'Xmit') print_pcts(grant_to_gro, 'Net') print_pcts(grant_to_softirq, 'SoftIRQ') + print_pcts(grant_free, 'Free') print_pcts(grant_total, 'Total') # Handle --verbose for packet-related delays. @@ -3099,14 +3248,19 @@ def print_worst(data, label): short_to_nic = [] short_to_gro = [] short_to_softirq = [] + short_free = [] long_to_nic = [] long_to_gro = [] long_to_softirq = [] + long_free = [] grant_to_nic = [] grant_to_gro = [] grant_to_softirq = [] + grant_free = [] + + print('Number of packets is now %d' % (len(packets))) for p, pkt in packets.items(): if (not 'softirq' in pkt) or (not 'xmit' in pkt): @@ -3114,7 +3268,7 @@ def print_worst(data, label): total = pkt['softirq'] - pkt['xmit'] if (pkt['msg_length'] != None) and (pkt['msg_length'] <= mtu): if (total < min_short) or (total > max_short): - continue; + continue if ('xmit' in pkt) and ('nic' in pkt): short_to_nic.append( [pkt['nic'] - pkt['xmit'], p, pkt['nic']]) @@ -3124,6 +3278,9 @@ def print_worst(data, label): if ('gro' in pkt) and ('softirq' in pkt): short_to_softirq.append( [pkt['softirq'] - pkt['gro'], p, pkt['softirq']]) + if ('nic' in pkt) and ('free_tx_skb' in pkt): + short_free.append( + [pkt['free_tx_skb'] - pkt['nic'], p, pkt['free_tx_skb']]) else: if (total < min_long) or (total > max_long): continue @@ -3134,6 +3291,10 @@ def print_worst(data, label): elif ('xmit' in pkt) and ('nic' in pkt): long_to_nic.append( [pkt['nic'] - pkt['xmit'], p, pkt['nic']]) + if ('nic' in pkt) and ('free_tx_skb' in pkt): + long_free.append( + [pkt['free_tx_skb'] - pkt['nic'], p, + pkt['free_tx_skb']]) if ('nic' in pkt) and ('gro' in pkt): long_to_gro.append( [pkt['gro'] - pkt['nic'], p, pkt['gro']]) @@ -3156,6 +3317,9 @@ def print_worst(data, label): if ('gro' in pkt) and ('softirq' in pkt): grant_to_softirq.append( [pkt['softirq'] - pkt['gro'], p, pkt['softirq']]) + if ('nic' in pkt) and ('free_tx_skb' in pkt): + grant_free.append( + [pkt['free_tx_skb'] - pkt['nic'], p, pkt['free_tx_skb']]) def get_slow_summary(data): if not data: @@ -3165,21 +3329,24 @@ def get_slow_summary(data): list_avg(data, 0)) print('\nPhase breakdown for P98-P99 packets:') - print(' Xmit Net SoftIRQ') - print(' Pkts P50 Avg P50 Avg P50 Avg') - print('-------------------------------------------------------------') - print('Single-packet %5d %s %s %s' % (len(short_to_nic), + print(' Xmit Net SoftIRQ Free') + print(' Pkts P50 Avg P50 Avg P50 Avg P50 Avg') + print('---------------------------------------------------------------------------') + print('Single-packet %5d %s %s %s %s' % (len(short_to_nic), get_slow_summary(short_to_nic), get_slow_summary(short_to_gro), - get_slow_summary(short_to_softirq))) - print('Multi-packet %5d %s %s %s' % (len(long_to_nic), + get_slow_summary(short_to_softirq), + get_slow_summary(short_free))) + print('Multi-packet %5d %s %s %s %s' % (len(long_to_nic), get_slow_summary(long_to_nic), get_slow_summary(long_to_gro), - get_slow_summary(long_to_softirq))) - print('Grants %5d %s %s %s' % (len(grant_to_nic), + get_slow_summary(long_to_softirq), + get_slow_summary(long_free))) + print('Grants %5d %s %s %s %s' % (len(grant_to_nic), get_slow_summary(grant_to_nic), get_slow_summary(grant_to_gro), - get_slow_summary(grant_to_softirq))) + get_slow_summary(grant_to_softirq), + get_slow_summary(grant_free))) return verbose def print_wakeup_delays(self): @@ -6713,19 +6880,45 @@ def tt_softirq_grant(self, trace, t, core, id, offset, priority, increment): g['increment'] = 
increment g['rx_node'] = trace['node'] + def tt_xmit_tcp(self, trace, t, core, saddr, sport, daddr, dport, sequence, + data_bytes, total, ack): + tcp_pkt = get_tcp_packet(saddr, sport, daddr, dport, sequence, + data_bytes, ack) + node = trace['node'] + tcp_pkt['xmit'] = t + tcp_pkt['total_length'] = total + tcp_pkt['tx_node'] = node + if not saddr in peer_nodes and saddr != '0x00000000': + peer_nodes[saddr] = node + + def tt_qdisc_tcp(self, trace, t, core, saddr, sport, daddr, dport, sequence, + data_bytes, ack): + tcp_pkt = get_tcp_packet(saddr, sport, daddr, dport, sequence, + data_bytes, ack) + node = trace['node'] + tcp_pkt['qdisc_xmit'] = t + tcp_pkt['tx_node'] = node + if not saddr in peer_nodes and saddr != '0x00000000': + peer_nodes[saddr] = node + def tt_nic_tcp(self, trace, t, core, saddr, sport, daddr, dport, sequence, data_bytes, ack, gso_size): - # Break GSO packets up into multiple packets, matching what will - # be received on the other end. - bytes_left = data_bytes node = trace['node'] + if sequence == 3666610099: + print('%9.3f got sequence %u on %s, data_bytes %d, gso_size %d' % + (t, sequence, node, data_bytes, gso_size), file=sys.stderr) if not saddr in peer_nodes and saddr != '0x00000000': peer_nodes[saddr] = node + + # Break TSO packets up into multiple packets, matching what will + # be received on the other end. + bytes_left = data_bytes + pkt_sequence = sequence while True: pkt_bytes = bytes_left if pkt_bytes > gso_size and gso_size != 0: pkt_bytes = gso_size - tcp_pkt = get_tcp_packet(saddr, sport, daddr, dport, sequence, + tcp_pkt = get_tcp_packet(saddr, sport, daddr, dport, pkt_sequence, pkt_bytes, ack) if 'nic' in tcp_pkt and data_bytes > 0: # Retransmitted packet: retain only the last transmission. @@ -6733,16 +6926,24 @@ def tt_nic_tcp(self, trace, t, core, saddr, sport, daddr, dport, sequence, del tcp_pkt['gro'] tcp_pkt['nic'] = t tcp_pkt['tx_node'] = node - if bytes_left == data_bytes: - tcp_pkt['gso_pkt_size'] = data_bytes + if pkt_sequence == sequence: + tcp_pkt['tso_length'] = data_bytes bytes_left -= pkt_bytes - sequence += pkt_bytes - if sequence > 0x80000000: - # 32-bit sequence number has wrapped around - sequence -= 0x100000000 + pkt_sequence = (pkt_sequence + pkt_bytes) & 0xffffffff if bytes_left <= 0: break + def tt_free_tcp(self, trace, t, core, saddr, sport, daddr, dport, sequence, + data_bytes, ack, qid): + tcp_pkt = get_tcp_packet(saddr, sport, daddr, dport, sequence, + data_bytes, ack) + node = trace['node'] + tcp_pkt['free_tx_skb'] = t + tcp_pkt['tx_qid'] = qid + tcp_pkt['tx_node'] = node + if not saddr in peer_nodes and saddr != '0x00000000': + peer_nodes[saddr] = node + def tt_gro_tcp(self, trace, t, core, saddr, sport, daddr, dport, sequence, data_bytes, total, ack): tcp_pkt = get_tcp_packet(saddr, sport, daddr, dport, sequence, @@ -6837,7 +7038,7 @@ def analyze(self): new_pkts.append([pid, pkt2]) for key in ['xmit', 'qdisc_xmit', 'xmit2', 'nic', 'id', 'msg_length', 'priority', 'tx_node', 'tx_core', - 'free_tx_skb']: + 'free_tx_skb', 'tx_qid']: if key in pkt: pkt2[key] = pkt[key] if pkt2['msg_length'] != None and pkt2['offset'] > pkt2['msg_length']: @@ -8839,15 +9040,23 @@ def output(self): def output_slow_pkts(self): pkts = [] + delays = [] for pkt in packets.values(): - if not 'nic' in pkt or not 'gro' in pkt: + if (pkt['msg_length'] == None or pkt['msg_length'] <= 1000 or + pkt['msg_length'] >= 60000): continue - if pkt['length'] > 1000 or pkt['offset'] !=0: + if not 'nic' in pkt or not 'gro' in pkt or not 'tso_length' in pkt: 
continue delay = pkt['gro'] - pkt['nic'] - if delay >= 150 and delay <= 300: + if delay >= 300: pkts.append(pkt) - print("# Packets with nic->gro delays between 150 and 300 usecs:") + else: + if pkt['id'] == 400376130 or pkt['id'] == 400376131: + print('Packet id %d, offset %d, delay %.1f: %s' % + (pkt['id'], pkt['offset'], delay, pkt)) + delays.append(delay) + print('# Packets from messages with length 1000-60000 and') + print('# nic->gro delays > 300 usecs:') print(print_pkts(pkts), end='') def output_delays(self): @@ -9256,7 +9465,7 @@ def output(self): f.write('# InNic: KB of data that have been queued for the ' 'NIC but whose packets\n') f.write('# have not yet been returned after ' - 'transmission') + 'transmission\n') f.write('# NicRx: KB of data that are still in the NIC\'s ' 'possession (their packets\n') f.write('# haven\'t been returned after transmission) ' @@ -9297,8 +9506,9 @@ def output(self): total = 0 for interval in intervals[node]: if not 'tx_bytes' in interval: - print('Bogus interval: %s' % (interval)) - print('Trace: %s' % (traces[node])) + interval['tx_bytes'] = 0 + # print('Bogus interval: %s' % (interval)) + # print('Trace: %s' % (traces[node])) gbps = interval['tx_bytes'] * 8 / (options.interval * 1000) total += gbps f.write('%8.1f %6.1f %5.0f %5d %5d %5d' % @@ -9342,7 +9552,7 @@ class AnalyzeTxpkts: --tx-qid is specified, only packets matching those options will be considered. Packets will normally be sorted by the 'Xmit' column, but the --sort option can be used to specify a different column to use for sorting - ('Xmit', 'Nic', 'MaxGro', or 'Free'). Also generates aggregate statistics + ('Xmit', 'Nic', 'Gro', or 'Free'). Also generates aggregate statistics for each tx queue on each node. """ @@ -9352,18 +9562,24 @@ def __init__(self, dispatcher): dispatcher.interest('AnalyzePackets') def output(self): - global packets, options, traces + global packets, tcp_packets, options, traces # node -> list of packets transmitted by that node node_pkts = defaultdict(list) # Bucket all of the packets by transmitting node. 
for pkt in packets.values(): - if (not 'xmit' in pkt) or not ('tso_length' in pkt): + if not 'xmit' in pkt or not 'tso_length' in pkt: + continue + node_pkts[pkt['tx_node']].append(pkt) + for pkt in tcp_packets.values(): + if pkt['sequence'] == 3666610099: + print('Found TCP packet: %s' % (pkt)) + if not 'xmit' in pkt or not ('tso_length' in pkt): continue node_pkts[pkt['tx_node']].append(pkt) - sort_keys = {'Xmit': 'xmit', 'Nic': 'nic', 'MaxGro': 'gro', + sort_keys = {'Xmit': 'xmit', 'Nic': 'nic', 'Gro': 'gro', 'Free': 'free_tx_skb'} sort_key = 'xmit' if options.sort != None: From df6ac3541eb2337c18d17a5b95685ec3f796cf76 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 20 Nov 2025 20:47:36 -0800 Subject: [PATCH 577/625] Add nicbacklog analyzer to tthoma.py Also clean up a few other minor things --- util/tthoma.py | 281 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 251 insertions(+), 30 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index 9a3f0934..c7aae9d6 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -11,6 +11,7 @@ from collections import defaultdict, deque from functools import cmp_to_key from glob import glob +import heapq import itertools from io import StringIO import matplotlib @@ -1455,17 +1456,17 @@ def __nic_grant(self, trace, time, core, match, interests): }) def __free_tx_skb(self, trace, time, core, match, interests): - id = int(match.group(1)) - offset = int(match.group(2)) - qid = int(match.group(3)) - msg_length = int(match.group(4)) + id = int(match.group(2)) + offset = int(match.group(3)) + qid = int(match.group(4)) + msg_length = int(match.group(5)) for interest in interests: interest.tt_free_tx_skb(trace, time, core, id, offset, qid, msg_length) patterns.append({ 'name': 'free_tx_skb', - 'regexp': 'napi freeing tx skb for homa data, id ([0-9]+), ' + 'regexp': '(mlx|ice) freeing tx skb for homa data, id ([0-9]+), ' 'offset ([0-9]+), qid ([0-9]+), msg_length ([0-9]+)' }) @@ -2075,16 +2076,16 @@ def __nic_tcp2(self, trace, time, core, match, interests): }) def __free_tcp(self, trace, time, core, match, interests): - saddr = match.group(1) - sport = int(match.group(2)) - daddr = match.group(3) - dport = int(match.group(4)) + saddr = match.group(2) + sport = int(match.group(3)) + daddr = match.group(4) + dport = int(match.group(5)) self.core_saved[core] = {'saddr': saddr, 'sport': sport, 'daddr': daddr, 'dport': dport} patterns.append({ 'name': 'free_tcp', - 'regexp': 'napi freeing TCP skb from ([^:]+):([0-9]+) to ' + 'regexp': '(mlx|ice) freeing TCP skb from ([^:]+):([0-9]+) to ' '([^:]+):([0-9]+)' }) @@ -2092,10 +2093,10 @@ def __free_tcp2(self, trace, time, core, match, interests): if not core in self.core_saved: return saved = self.core_saved[core] - sequence = int(match.group(1)) - data_bytes = int(match.group(2)) - ack = int(match.group(3)) - qid = int(match.group(4)) + sequence = int(match.group(2)) + data_bytes = int(match.group(3)) + ack = int(match.group(4)) + qid = int(match.group(5)) for interest in interests: interest.tt_free_tcp(trace, time, core, saved['saddr'], saved['sport'], saved['daddr'], saved['dport'], @@ -2104,7 +2105,7 @@ def __free_tcp2(self, trace, time, core, match, interests): patterns.append({ 'name': 'free_tcp2', - 'regexp': r'napi freeing TCP skb .2. sequence ([-0-9]+), ' + 'regexp': r'(mlx|ice) freeing TCP skb .2. 
sequence ([-0-9]+), '
                'data bytes ([0-9]+), ack ([-0-9]+), qid ([-0-9]+)'
     })
 
@@ -3125,7 +3126,7 @@ def print_pkt_delays(self):
         print('Delays in the transmission and processing of data and grant packets')
         print('(all times in usecs):')
         print('Xmit:    Time from ip*xmit call until driver queued packet for NIC')
-        print('         (for grants, includes time in homa_send_grants and ')
+        print('         (for grants, includes time in homa_send_grants and')
         print('         homa_xmit_control)')
         print('Net:     Time from when NIC received packet until GRO started processing')
         print('SoftIRQ: Time from GRO until SoftIRQ started processing')
@@ -3145,14 +3146,14 @@ def print_pcts(data, label):
                     list_avg(data, 0)))
         print('\nPhase    Count       Min       P10       P50       P90       P99       Max       Avg')
         print('-------------------------------------------------------------------------')
-        print('Data packets from single-packet messages:')
+        print('Data packets from messages <= %d bytes:' % (mtu))
         print_pcts(short_to_nic, 'Xmit')
         print_pcts(short_to_gro, 'Net')
         print_pcts(short_to_softirq, 'SoftIRQ')
         print_pcts(short_free, 'Free')
         print_pcts(short_total, 'Total')
 
-        print('\nData packets from multi-packet messages:')
+        print('\nData packets from messages > %d bytes:' % (mtu))
         print_pcts(long_to_nic, 'Xmit')
         print_pcts(long_to_gro, 'Net')
         print_pcts(long_to_softirq, 'SoftIRQ')
@@ -3206,13 +3207,13 @@ def print_worst(data, label):
         verbose += ('--------------------------------------------------------'
                 '-------------\n')
-        verbose += 'Data packets from single-packet messages:\n'
+        verbose += 'Data packets from messages <= %d bytes:\n' % (mtu)
         verbose += print_worst(short_to_nic, 'Xmit')
         verbose += print_worst(short_to_gro, 'Net')
         verbose += print_worst(short_to_softirq, 'SoftIRQ')
         verbose += print_worst(short_total, 'Total')
 
-        verbose += '\nData packets from multi-packet messages:\n'
+        verbose += '\nData packets from messages > %d bytes:\n' % (mtu)
         verbose += print_worst(long_to_nic, 'Xmit')
         verbose += print_worst(long_to_gro, 'Net')
         verbose += print_worst(long_to_softirq, 'SoftIRQ')
@@ -3332,12 +3333,12 @@ def get_slow_summary(data):
         print('                     Xmit           Net          SoftIRQ        Free')
         print('          Pkts    P50    Avg    P50    Avg    P50    Avg    P50    Avg')
         print('---------------------------------------------------------------------------')
-        print('Single-packet %5d %s %s %s %s' % (len(short_to_nic),
+        print('Short msgs    %5d %s %s %s %s' % (len(short_to_nic),
                 get_slow_summary(short_to_nic),
                 get_slow_summary(short_to_gro),
                 get_slow_summary(short_to_softirq),
                 get_slow_summary(short_free)))
-        print('Multi-packet  %5d %s %s %s %s' % (len(long_to_nic),
+        print('Long msgs     %5d %s %s %s %s' % (len(long_to_nic),
                 get_slow_summary(long_to_nic),
                 get_slow_summary(long_to_gro),
                 get_slow_summary(long_to_softirq),
                 get_slow_summary(long_free)))
@@ -6139,6 +6140,223 @@ def output(self):
                     core_data['max_backlog'] * 1e-3,
                     core_data['max_backlog_time']))
 
+#------------------------------------------------
+# Analyzer: nicbacklog
+#------------------------------------------------
+class AnalyzeNicbacklog:
+    """
+    Prints a time-series analysis of backlog in the NIC (packets that
+    have been passed to the NIC but not yet returned after transmission)
+    along with information about the rate of packets flowing into the
+    NIC and being returned from the NIC. Requires the --data option.
+    """
+
+    def __init__(self, dispatcher):
+        dispatcher.interest('AnalyzePackets')
+        require_options('nicbacklog', 'data')
+
+    def output(self):
+        global packets, tcp_packets, options, traces
+
+        # Microseconds in the smallest interval we'll consider for
+        # computing rates.
+        base_interval = 50
+
+        # node -> list of packets transmitted by that node
+        node_pkts = defaultdict(list)
+
+        # Bytes and packets owned by the NIC as of current time
+        nic_pkts = 0
+        nic_bytes = 0
+
+        print('\n--------------------')
+        print('Analyzer: nicbacklog')
+        print('--------------------')
+        print('See data files %s/nicbacklog_*.dat' % (options.data))
+        print('\nMaximum values observed for each node:')
+        print('Node: Name of node')
+        print('MaxPkts: Maximum packets owned by NIC at one time')
+        print('MaxKB: Maximum Kbytes of data in packets owned by NIC '
+                'at one time')
+        print('MaxInP: Maximum packets passed to NIC in a %d usec interval' %
+                (4 * base_interval))
+        print('MaxInD: Maximum data rate from pkts passed to NIC in a %d '
+                'usec interval (Gbps)' % (4 * base_interval))
+        print('MaxFrP: Maximum packets freed in a %d usec interval after '
+                'return from NIC' % (4 * base_interval))
+        print('MaxFrD: Maximum data rate from pkts freed in a %d usec '
+                'interval (Gbps)' % (4 * base_interval))
+        print()
+        print('Node MaxPkts MaxKB MaxInP MaxInD MaxFrP MaxFrD')
+        print('------------------------------------------------------')
+
+        # Bucket all of the packets by transmitting node.
+        for pkt in itertools.chain(packets.values(), tcp_packets.values()):
+            if (not 'nic' in pkt or not 'free_tx_skb' in pkt or
+                    not 'tso_length' in pkt):
+                continue
+            node_pkts[pkt['tx_node']].append(pkt)
+
+        # Each iteration of this loop generates data for one node.
+        for node in get_sorted_nodes():
+            f = open('%s/nicbacklog_%s.dat' % (options.data, node), 'w')
+            f.write('# Node: %s\n' % (node))
+            f.write('# Generated at %s.\n\n' %
+                    (time.strftime('%I:%M %p on %m/%d/%Y')))
+            f.write('# NIC backlog (packets passed to the NIC but not yet '
+                    'returned to the\n')
+            f.write('# kernel) as a function of time\n')
+            f.write('# Time: Time of measurement (usecs)\n')
+            f.write('# NicPkts: Packets currently owned by NIC\n')
+            f.write('# NicKB: Kbytes of data in packets currently owned by NIC\n')
+
+            f.write('# %-8s Packets passed to NIC in last %d usecs\n' %
+                    ('InP%d:' % (base_interval), base_interval))
+            f.write('# %-8s Data rate from packets passed to NIC in last %d '
+                    'usecs (Gbps)\n' %
+                    ('InB%d:' % (base_interval), base_interval))
+            f.write('# %-8s Packets freed in last %d usecs after return from '
+                    'NIC\n' %
+                    ('FrP%d:' % (base_interval), base_interval))
+            f.write('# %-8s Data rate from packets freed in last %d usecs '
+                    '(Gbps)\n' %
+                    ('FrB%d:' % (base_interval), base_interval))
+
+            f.write('# %-8s Packets passed to NIC in last %d usecs\n' %
+                    ('InP%d:' % (2*base_interval), 2*base_interval))
+            f.write('# %-8s Data rate from packets passed to NIC in last %d '
+                    'usecs (Gbps)\n' %
+                    ('InB%d:' % (2*base_interval), 2*base_interval))
+            f.write('# %-8s Packets freed in last %d usecs after return from '
+                    'NIC\n' %
+                    ('FrP%d:' % (2*base_interval), 2*base_interval))
+            f.write('# %-8s Data rate from packets freed in last %d usecs '
+                    '(Gbps)\n' %
+                    ('FrB%d:' % (2*base_interval), 2*base_interval))
+
+            f.write('# %-8s Packets passed to NIC in last %d usecs\n' %
+                    ('InP%d:' % (4*base_interval), 4*base_interval))
+            f.write('# %-8s Data rate from packets passed to NIC in last %d '
+                    'usecs (Gbps)\n' %
+                    ('InB%d:' % (4*base_interval), 4*base_interval))
+            f.write('# %-8s Packets freed in last %d usecs after return from '
+                    'NIC\n' %
+                    ('FrP%d:' % (4*base_interval), 4*base_interval))
+            f.write('# %-8s Data rate from packets freed in last %d usecs '
+                    '(Gbps)\n' %
+                    ('FrB%d:' % (4*base_interval),
+                     4*base_interval))
+
+            f.write('\nTime NicPkts NicKB')
+            for i in [base_interval, base_interval*2, base_interval*4]:
+                f.write(' %6s' % ('InP%d' % (i)))
+                f.write(' %7s' % ('InB%d' % (i)))
+                f.write(' %6s' % ('FrP%d' % (i)))
+                f.write(' %7s' % ('FrB%d' % (i)))
+            f.write('\n')
+
+            # heapq of all active packets (those that are currently in
+            # the possession of the NIC) in increasing order of free time.
+            active = []
+
+            # list of [in_pkts, in_bytes, free_pkts, free_bytes] for each of
+            # 4 intervals, where intervals[0] is the newest interval.
+            #   in_pkts:    packets passed to the NIC in the interval
+            #   in_bytes:   bytes of data in packets passed to the NIC
+            #   free_pkts:  packets returned to Linux and freed in the interval
+            #   free_bytes: bytes of data in packets freed in the interval
+            intervals = deque()
+            for _ in range(4):
+                intervals.appendleft([0, 0, 0, 0])
+
+            # End of the current interval (the next one to be added to
+            # intervals)
+            interval_end = 0
+
+            # Maximum values observed (the In/Fr maxima are for the largest
+            # interval size).
+            max_pkts = 0
+            max_bytes = 0
+            max_in_pkts = 0
+            max_in_bytes = 0
+            max_free_pkts = 0
+            max_free_bytes = 0
+
+            pkts = sorted(node_pkts[node], key = lambda pkt : pkt['nic'])
+            interval_end = (math.ceil(pkts[0]['nic'] / base_interval) *
+                    base_interval)
+            cur = 0
+            # print('\n%s: %d packets:' % (node, len(node_pkts[node])))
+
+            # Each iteration of this loop handles a new interval.
+            while cur < len(pkts) or len(active) > 0:
+                in_pkts = 0
+                in_bytes = 0
+                free_pkts = 0
+                free_bytes = 0
+
+                while cur < len(pkts) and pkts[cur]['nic'] <= interval_end:
+                    pkt = pkts[cur]
+                    cur += 1
+                    in_pkts += 1
+                    in_bytes += pkt['tso_length']
+                    heapq.heappush(active, [pkt['free_tx_skb'], cur, pkt])
+                    # print('\n%9.3f: to Nic: %s' % (pkt['nic'], pkt['free_tx_skb']))
+                while len(active) > 0 and active[0][0] < interval_end:
+                    pkt = heapq.heappop(active)[2]
+                    free_pkts += 1
+                    free_bytes += pkt['tso_length']
+                    # print('\n%9.3f: freed: %s' % (pkt['free_tx_skb'], pkt))
+
+                nic_pkts += in_pkts - free_pkts
+                nic_bytes += in_bytes - free_bytes
+                intervals.pop()
+                intervals.appendleft([in_pkts, in_bytes, free_pkts, free_bytes])
+
+                # print('%7.1f: %8d %8d %8d %8d %8d %8d' % (interval_end,
+                #         in_pkts, in_bytes, free_pkts, free_bytes,
+                #         in_pkts - free_pkts, in_bytes - free_bytes))
+
+                f.write('%7.1f %5d %6d' % (interval_end, nic_pkts,
+                        nic_bytes/1000))
+                f.write(' %6d %7.2f %6d %7.2f' % (
+                        in_pkts, in_bytes*8/(1000*base_interval),
+                        free_pkts, free_bytes*8/(1000*base_interval)))
+                in_pkts += intervals[1][0]
+                in_bytes += intervals[1][1]
+                free_pkts += intervals[1][2]
+                free_bytes += intervals[1][3]
+                f.write(' %6d %7.2f %6d %7.2f' % (
+                        in_pkts, in_bytes*8/(2000*base_interval),
+                        free_pkts, free_bytes*8/(2000*base_interval)))
+                in_pkts += intervals[2][0] + intervals[3][0]
+                in_bytes += intervals[2][1] + intervals[3][1]
+                free_pkts += intervals[2][2] + intervals[3][2]
+                free_bytes += intervals[2][3] + intervals[3][3]
+                f.write(' %6d %7.2f %6d %7.2f' % (
+                        in_pkts, in_bytes*8/(4000*base_interval),
+                        free_pkts, free_bytes*8/(4000*base_interval)))
+                f.write('\n')
+
+                # Update maximum values
+                if nic_pkts > max_pkts:
+                    max_pkts = nic_pkts
+                if nic_bytes > max_bytes:
+                    max_bytes = nic_bytes
+                if in_pkts > max_in_pkts:
+                    max_in_pkts = in_pkts
+                if in_bytes > max_in_bytes:
+                    max_in_bytes = in_bytes
+                if free_pkts > max_free_pkts:
+                    max_free_pkts = free_pkts
+                if free_bytes > max_free_bytes:
+                    max_free_bytes = free_bytes
+
+                interval_end += base_interval
+            f.close()
+            print('%-10s %6d %6d %6d %7.2f %6d %7.2f' % (
+                    node, max_pkts, max_bytes/1000,
+                    max_in_pkts,
+                    max_in_bytes*8/(4000*base_interval),
+                    max_free_pkts, max_free_bytes*8/(4000*base_interval)))
+
 #------------------------------------------------
 # Analyzer: nicqueues
 #------------------------------------------------
@@ -6321,7 +6539,7 @@ def output(self):
                 (time.strftime('%I:%M %p on %m/%d/%Y')))
         f.write('# Statistics about NIC transmit throughput from node ')
         f.write('%s over %d usec intervals\n' % (node, options.interval))
-        f.write('All rates are in gbps, averaged over the 5 preceding intervals\n')
+        f.write('# All rates are in gbps, averaged over the 5 preceding intervals\n')
         f.write('# Time: End of the time interval\n')
         f.write('# Tx: Rate at which new data bytes were passed to ip*xmit\n')
         f.write('# ToNic: Rate at which new data bytes were queued in the NIC\n')
@@ -6332,11 +6550,11 @@ def output(self):
                 'but not yet freed\n')
         f.write('# InNic2 KB of data that has been queued in the NIC '
                 'and has neither been\n')
-        f.write(' freed nor received at the destination\n')
+        f.write('# freed nor received at the destination\n')
         f.write('# InNicQ Same as InNic2 except only counts bytes in '
                 'tx queue %d (use\n' % (
                 options.tx_qid if options.tx_qid != None else 0))
-        f.write(' the --tx-qid option to select a different '
+        f.write('# the --tx-qid option to select a different '
                 'queue)\n')
         f.write('# NicPkts Number of packets associated with InNic2')

@@ -6888,6 +7106,8 @@ def tt_xmit_tcp(self, trace, t, core, saddr, sport, daddr, dport, sequence,
         tcp_pkt['xmit'] = t
         tcp_pkt['total_length'] = total
         tcp_pkt['tx_node'] = node
+        if sequence == 1749134782:
+            print('tt_xmit_tcp setting tx_node to %s' % (node))
         if not saddr in peer_nodes and saddr != '0x00000000':
             peer_nodes[saddr] = node

@@ -6898,15 +7118,14 @@ def tt_qdisc_tcp(self, trace, t, core, saddr, sport, daddr, dport, sequence,
         node = trace['node']
         tcp_pkt['qdisc_xmit'] = t
         tcp_pkt['tx_node'] = node
+        if sequence == 1749134782:
+            print('tt_qdisc_tcp setting tx_node to %s' % (node))
         if not saddr in peer_nodes and saddr != '0x00000000':
             peer_nodes[saddr] = node

     def tt_nic_tcp(self, trace, t, core, saddr, sport, daddr, dport, sequence,
             data_bytes, ack, gso_size):
         node = trace['node']
-        if sequence == 3666610099:
-            print('%9.3f got sequence %u on %s, data_bytes %d, gso_size %d' %
-                    (t, sequence, node, data_bytes, gso_size), file=sys.stderr)
         if not saddr in peer_nodes and saddr != '0x00000000':
             peer_nodes[saddr] = node

@@ -6926,6 +7145,8 @@ def tt_nic_tcp(self, trace, t, core, saddr, sport, daddr, dport, sequence,
                 del tcp_pkt['gro']
             tcp_pkt['nic'] = t
             tcp_pkt['tx_node'] = node
+            if pkt_sequence == 1749134782:
+                print('tt_nic_tcp setting tx_node to %s' % (node))
             if pkt_sequence == sequence:
                 tcp_pkt['tso_length'] = data_bytes
             bytes_left -= pkt_bytes
@@ -6941,6 +7162,8 @@ def tt_free_tcp(self, trace, t, core, saddr, sport, daddr, dport, sequence,
         tcp_pkt['free_tx_skb'] = t
         tcp_pkt['tx_qid'] = qid
         tcp_pkt['tx_node'] = node
+        if sequence == 1749134782:
+            print('%9.3f: tt_free_tcp setting tx_node to %s' % (t, node))
         if not saddr in peer_nodes and saddr != '0x00000000':
             peer_nodes[saddr] = node

@@ -9573,8 +9796,6 @@ def output(self):
                 continue
             node_pkts[pkt['tx_node']].append(pkt)
         for pkt in tcp_packets.values():
-            if pkt['sequence'] == 3666610099:
-                print('Found TCP packet: %s' % (pkt))
             if not 'xmit' in pkt or not ('tso_length' in pkt):
                 continue
             node_pkts[pkt['tx_node']].append(pkt)

From af4fbe1bc47eec32f62d420706dae539376424e5 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Thu, 20 Nov 2025 20:48:48 -0800
Subject: [PATCH 578/625] Remove nictx
analyzer from tthoma.py The new nicbacklog analyzer is a more useful superset. --- util/tthoma.py | 78 -------------------------------------------------- 1 file changed, 78 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index c7aae9d6..a7aa1c6e 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -6507,84 +6507,6 @@ def output(self): interval_end += interval file.close() -#------------------------------------------------ -# Analyzer: nictx -#------------------------------------------------ -class AnalyzeNictx: - """ - Generates statistics about NIC transmit throughput, intended for debugging - situations where the NIC does not seem to be transmitting at line rate. - Requires the --data option; also uses the --interval option. - """ - - def __init__(self, dispatcher): - interval_analyzer = dispatcher.interest('AnalyzeIntervals') - require_options('nictx', 'data') - - def output(self): - global intervals, options, traces - - print('\n---------------') - print('Analyzer: nictx') - print('---------------') - if options.data == None: - print('--data option wasn\'t specified, so no output generated.') - return - print('See data files nictx_*.dat in %s\n' % (options.data)) - - for node in get_sorted_nodes(): - f = open('%s/nictx_%s.dat' % (options.data, node), 'w') - f.write('# Node: %s\n' % (node)) - f.write('# Generated at %s.\n' % - (time.strftime('%I:%M %p on %m/%d/%Y'))) - f.write('# Statistics about NIC transmit throughput from node ') - f.write('%s over %d usec intervals\n' % (node, options.interval)) - f.write('# All rates are in gbps, averaged over the 5 preceding intervals\n') - f.write('# Time: End of the time interval\n') - f.write('# Tx: Rate at which new data bytes were passed to ip*xmit\n') - f.write('# ToNic: Rate at which new data bytes were queued in the NIC\n') - f.write('# Gro: Rate at which data bytes reached GRO on receivers\n') - f.write('# Free: Rate at which packet buffers were freed ' - 'after transmission complete\n') - f.write('# InNic KB of data that has been queued in the NIC ' - 'but not yet freed\n') - f.write('# InNic2 KB of data that has been queued in the NIC ' - 'and has neither been\n') - f.write('# freed nor received at the destination\n') - f.write('# InNicQ Same as InNic2 except only counts bytes in ' - 'tx queue %d (use\n' % ( - options.tx_qid if options.tx_qid != None else 0)) - f.write('# the --tx-qid option to select a different ' - 'queue)\n') - f.write('# NicPkts Number of packets associated with InNic2') - - f.write('\n# Time Tx ToNic Gro Free InNic InNic2 InNicQ NicPkts\n') - - node_intervals = intervals[node] - bytes_to_gbps = 8 / (options.interval * 5 * 1000) - for i in range(4, len(node_intervals)): - tx_bytes = 0 - to_nic_bytes = 0 - gro_bytes = 0 - free_bytes = 0 - for interval in node_intervals[i-4:i+1]: - tx_bytes += interval['tx_bytes'] - to_nic_bytes += interval['tx_nic_bytes'] - gro_bytes += interval['tx_gro_bytes'] - free_bytes += interval['tx_free_bytes'] - interval = node_intervals[i] - f.write('%8.1f %6.1f %6.1f %6.1f %6.1f %5d %6d %6d %6d\n' % - (interval['time'], - tx_bytes * bytes_to_gbps, - to_nic_bytes * bytes_to_gbps, - gro_bytes * bytes_to_gbps, - free_bytes * bytes_to_gbps, - interval['tx_in_nic'] * 1e-3, - interval['tx_in_nic2'] * 1e-3, - interval['tx_in_nic_qx'] * 1e-3, - interval['pkts_in_nic2'])) - f.close() - #------------------------------------------------ # Analyzer: ooo #------------------------------------------------ From 3b5d07101e978b14dbb8411077847c640d650015 Mon Sep 17 00:00:00 2001 From: John 
Ousterhout Date: Mon, 24 Nov 2025 08:35:03 -0800 Subject: [PATCH 579/625] Add another timetrace record to homa_qdisc.c --- homa_qdisc.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/homa_qdisc.c b/homa_qdisc.c index 26c77315..fbdf665b 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -461,6 +461,14 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, homa_qdisc_update_link_idle(qdev, pkt_len, qshared->max_nic_queue_cycles)) goto enqueue; + tt_record4("homa_qdisc_enqueue deferring TCP packet from 0x%08x " + "to 0x%08x, ports %x, length %d", + ntohl(ip_hdr(skb)->saddr), + ntohl(ip_hdr(skb)->daddr), + (ntohs(tcp_hdr(skb)->source) << 16) + + ntohs(tcp_hdr(skb)->dest), + skb->len - skb_transport_offset(skb) - + tcp_hdrlen(skb)); homa_qdisc_defer_tcp(q, skb); return NET_XMIT_SUCCESS; } @@ -677,11 +685,11 @@ int homa_qdisc_xmit_deferred_tcp(struct homa_qdisc_dev *qdev) struct tcphdr *th; th = (struct tcphdr*) skb_transport_header(skb); - ltt_record4("homa_qdisc_pacer requeued TCP packet " + tt_record4("homa_qdisc_pacer requeued TCP packet " "from 0x%08x:%d to 0x%08x:%d", ntohl(ip_hdr(skb)->saddr), ntohs(th->source), ntohl(ip_hdr(skb)->daddr), ntohs(th->dest)); - ltt_record4("homa_qdisc_pacer requeued TCP packet (2) " + tt_record4("homa_qdisc_pacer requeued TCP packet (2) " "sequence %u, data bytes %d, ack %u, gso_size %d", ntohl(th->seq), skb->len - skb_transport_offset(skb) - From 3ab37ac236c5530850980c8169e4232684e745d1 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 24 Nov 2025 10:07:19 -0800 Subject: [PATCH 580/625] Add max_link_usage sysctl parameter --- homa_qdisc.c | 42 +++++++++++++++++++++++++++--------------- homa_qdisc.h | 8 ++++++++ man/homa.7 | 13 +++++++++++++ test/unit_homa_qdisc.c | 23 +++++++++++++++++++++-- 4 files changed, 69 insertions(+), 17 deletions(-) diff --git a/homa_qdisc.c b/homa_qdisc.c index fbdf665b..f40a4d57 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -54,6 +54,13 @@ static struct ctl_table homa_qdisc_ctl_table[] = { .mode = 0644, .proc_handler = homa_qdisc_dointvec }, + { + .procname = "max_link_usage", + .data = OFFSET(max_link_usage), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_qdisc_dointvec + }, { .procname = "tcp_credit_increment", .data = OFFSET(tcp_credit_increment), @@ -152,6 +159,7 @@ struct homa_qdisc_shared *homa_qdisc_shared_alloc(void) qshared->defer_min_bytes = 1000; qshared->homa_share = 50; qshared->tcp_credit_increment = 20000; + qshared->max_link_usage = 99; qshared->sysctl_header = register_net_sysctl(&init_net, "net/homa", homa_qdisc_ctl_table); if (!qshared->sysctl_header) { @@ -1116,7 +1124,7 @@ void homa_qdev_update_sysctl(struct homa_qdisc_dev *qdev) struct ethtool_link_ksettings ksettings; struct homa *homa = qdev->hnet->homa; const struct ethtool_ops *ops; - u64 tmp; + u64 tmp, tmp2; qdev->link_mbps = homa->link_mbps; ops = qdev->dev->ethtool_ops; @@ -1125,25 +1133,25 @@ void homa_qdev_update_sysctl(struct homa_qdisc_dev *qdev) qdev->link_mbps = ksettings.base.speed; } - /* Underestimate link bandwidth (overestimate time) by 1%. 
+ /* Compute cycles_per_mibyte based on the link speed (mibytes/sec) + * and max_link_usage: * - * cycles/sec - * cycles/mibyte = (101/100) * ------------- - * mibytes/sec + * cycles/sec + * cycles/mibyte = (100/max_link_usage) * ------------- + * mibytes/sec * - * 101 * homa_clock_khz() * 1000 - * = --------------------------------------- - * 100 * link_mbps * (1<<20 / 1000000) / 8 + * 100 * homa_clock_khz() * 1000 + * = -------------------------------------------------- + * max_link_usage * link_mbps * (1000000 / 1<<20) / 8 * - * 8 * 1010 * homa_clock_khz() 1<<20 - * = ----------------------------- * --------- - * link_mbps 1000000 + * 8 * homa_clock_khz() 1<<20 + * = ----------------------------- * ------- + * max_link_usage * link_mbps 10 */ - tmp = 8ULL * 1010; - tmp *= homa_clock_khz(); - do_div(tmp, qdev->link_mbps); + tmp = 8ULL * homa_clock_khz(); tmp <<= 20; - do_div(tmp, 1000000); + tmp2 = 10ULL * homa->qshared->max_link_usage * qdev->link_mbps; + do_div(tmp, tmp2); qdev->cycles_per_mibyte = tmp; } @@ -1164,6 +1172,10 @@ void homa_qdisc_update_sysctl_deps(struct homa_qdisc_shared *qshared) qshared->homa_share = 0; if (qshared->homa_share > 100) qshared->homa_share = 100; + if (qshared->max_link_usage < 5) + qshared->max_link_usage = 5; + if (qshared->max_link_usage > 100) + qshared->max_link_usage = 100; /* Use a mutex rather than RCU to prevent qdev deletion while we * traverse the list. This is more expensive, but RCU isn't safe diff --git a/homa_qdisc.h b/homa_qdisc.h index 8d207615..f54d58b7 100644 --- a/homa_qdisc.h +++ b/homa_qdisc.h @@ -269,6 +269,14 @@ struct homa_qdisc_shared { */ int tcp_credit_increment; + /** + * @max_link_usage: An integer <= 100 indicating the maximum percentage + * of uplink bandwidth that Homa will attempt to utilize. A smaller + * value reduces the likelihood of queue buildup in the NIC, but + * also prevents full link utilization. + */ + int max_link_usage; + #ifndef __STRIP__ /* See strip.py */ /** * @sysctl_header: Used to remove sysctl values when this structure diff --git a/man/homa.7 b/man/homa.7 index 2ee5b3df..2d01ce0f 100644 --- a/man/homa.7 +++ b/man/homa.7 @@ -679,6 +679,19 @@ have appeared) then Homa will not issue grants until enough data has been received to get below the limit. Used to control the total utilization of TOR switch buffers. .TP +.IR max_link_usage +In order to reduce the likelihood of queues forming in the NIC (which would +reduce the effectiveness of Homa's SRPT policy) Homa limits the rate at +which it submits outgoing packets to the NIC to slightly less than the +full uplink bandwidth. This parameter determines the degree of undercommitment. +It is an integer between 5 and 100 (inclusive) that specifies the maximum +percentage of link bandwidth that Homa will attempt to utilize. 100 means +Homa will attempt to utilize the full bandwidth. Smaller values reduce the +likelihood of queues forming, but also limit uplink utilization, which can +affect performance. Note that queues can sometimes form even with values less +than 100, since most NICs cannot transmit at full link speed under all +conditions. 
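The arithmetic above is easy to check outside the kernel. The sketch below
is a hypothetical userspace reworking of the cycles_per_mibyte computation,
not module code: it assumes homa_clock_khz() returns 1000000 (a 1 GHz clock,
which is what the unit-test expectation below implies) and substitutes
ordinary 64-bit division for do_div().

#include <stdint.h>
#include <stdio.h>

/* Userspace sketch of the cycles_per_mibyte computation; clock_khz
 * stands in for homa_clock_khz(). Callers are expected to pass a
 * max_link_usage already clamped to the 5..100 range.
 */
static uint64_t cycles_per_mibyte(uint64_t clock_khz, uint64_t link_mbps,
				  uint64_t max_link_usage)
{
	uint64_t tmp = 8ULL * clock_khz;
	uint64_t tmp2 = 10ULL * max_link_usage * link_mbps;

	tmp <<= 20;
	return tmp / tmp2;
}

int main(void)
{
	/* 8000 Mbps link, max_link_usage 90, 1 GHz clock. */
	printf("%llu\n", (unsigned long long)
	       cycles_per_mibyte(1000000, 8000, 90));
	return 0;
}

This prints 1165084, matching the value expected by unit_homa_qdisc.c below.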
+.TP .IR max_nic_queue_ns An integer value specifying a NIC queue length in units of nanoseconds (how long it will take the existing packets in the queue diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c index d862f5ad..84c59ded 100644 --- a/test/unit_homa_qdisc.c +++ b/test/unit_homa_qdisc.c @@ -1931,9 +1931,10 @@ TEST_F(homa_qdisc, homa_qdevc_update_sysctl__basics) self->homa.link_mbps = 25000; mock_link_mbps = 8000; + self->homa.qshared->max_link_usage = 90; homa_qdev_update_sysctl(qdev); EXPECT_EQ(8000, qdev->link_mbps); - EXPECT_EQ(1059061, qdev->cycles_per_mibyte); + EXPECT_EQ(1165084, qdev->cycles_per_mibyte); homa_qdisc_qdev_put(qdev); } @@ -1949,7 +1950,7 @@ TEST_F(homa_qdisc, homa_qdev_update_sysctl__cant_get_link_speed_from_dev) mock_ethtool_ksettings_errors = 1; homa_qdev_update_sysctl(qdev); EXPECT_EQ(16000, qdev->link_mbps); - EXPECT_EQ(529530, qdev->cycles_per_mibyte); + EXPECT_EQ(529583, qdev->cycles_per_mibyte); homa_qdisc_qdev_put(qdev); } @@ -1979,6 +1980,24 @@ TEST_F(homa_qdisc, homa_qdisc_update_sysctl_deps__limit_homa_share) homa_qdisc_update_sysctl_deps(self->homa.qshared); EXPECT_EQ(100, self->homa.qshared->homa_share); } +TEST_F(homa_qdisc, homa_qdisc_update_sysctl_deps__limit_max_link_usage) +{ + self->homa.qshared->max_link_usage = 4; + homa_qdisc_update_sysctl_deps(self->homa.qshared); + EXPECT_EQ(5, self->homa.qshared->max_link_usage); + + self->homa.qshared->max_link_usage = 6; + homa_qdisc_update_sysctl_deps(self->homa.qshared); + EXPECT_EQ(6, self->homa.qshared->max_link_usage); + + self->homa.qshared->max_link_usage = 100; + homa_qdisc_update_sysctl_deps(self->homa.qshared); + EXPECT_EQ(100, self->homa.qshared->max_link_usage); + + self->homa.qshared->max_link_usage = 101; + homa_qdisc_update_sysctl_deps(self->homa.qshared); + EXPECT_EQ(100, self->homa.qshared->max_link_usage); +} TEST_F(homa_qdisc, homa_qdisc_update_sysctl_deps__update_all_qdevs) { struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); From 09eb25323ac7eeee560daa8fcc3501ee661c6710 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 24 Nov 2025 16:18:58 -0800 Subject: [PATCH 581/625] Refactor handling of TCP packet deferrals in homa_qdisc Use a single global queue to hold all the deferred packets, rather than a per-dev_queue queue. This simplifies the code and eliminates the tcp_credit_increment option. 
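As a rough illustration of the refactored scheme (a toy userspace model
under assumed simplifications, not the kernel code in the diff below): all
deferred packets sit in one device-wide FIFO, and each qdisc keeps only a
count of how many of its packets are waiting, so a later packet from a
given tx queue can never overtake an earlier deferred one.

#include <stdio.h>

#define NQUEUES 4
#define MAXPKTS 16	/* toy bound; no overflow checks */

struct pkt {
	int qix;	/* tx queue the packet was submitted on */
	int len;	/* bytes */
};

static struct pkt fifo[MAXPKTS];	/* device-wide deferral FIFO */
static int head, tail;
static int num_deferred[NQUEUES];	/* role of num_deferred_tcp */

static void defer(int qix, int len)
{
	fifo[tail++] = (struct pkt) {qix, len};
	num_deferred[qix]++;
}

/* Pacer side: release the oldest deferred packet, whatever queue it
 * came from, and update that queue's count.
 */
static int xmit_one(void)
{
	struct pkt p;

	if (head == tail)
		return 0;
	p = fifo[head++];
	num_deferred[p.qix]--;
	printf("xmit %d bytes from queue %d (%d still deferred there)\n",
	       p.len, p.qix, num_deferred[p.qix]);
	return p.len;
}

int main(void)
{
	defer(1, 1000);
	defer(3, 1500);
	defer(1, 200);	/* short, but must wait behind the first defer */
	while (xmit_one())
		;
	return 0;
}

Strict arrival order replaces the old credit-based round-robin across
qdiscs.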
--- homa_qdisc.c | 114 +++++++++++++---------------------- homa_qdisc.h | 47 ++++----------- man/homa.7 | 5 -- test/mock.c | 5 ++ test/mock.h | 1 + test/unit_homa_qdisc.c | 131 +++++++++++------------------------------ 6 files changed, 94 insertions(+), 209 deletions(-) diff --git a/homa_qdisc.c b/homa_qdisc.c index f40a4d57..ba87fff4 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -61,13 +61,6 @@ static struct ctl_table homa_qdisc_ctl_table[] = { .mode = 0644, .proc_handler = homa_qdisc_dointvec }, - { - .procname = "tcp_credit_increment", - .data = OFFSET(tcp_credit_increment), - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = homa_qdisc_dointvec - }, }; static struct Qdisc_ops homa_qdisc_ops __read_mostly = { @@ -158,7 +151,6 @@ struct homa_qdisc_shared *homa_qdisc_shared_alloc(void) qshared->max_nic_queue_ns = 5000; qshared->defer_min_bytes = 1000; qshared->homa_share = 50; - qshared->tcp_credit_increment = 20000; qshared->max_link_usage = 99; qshared->sysctl_header = register_net_sysctl(&init_net, "net/homa", homa_qdisc_ctl_table); @@ -264,7 +256,7 @@ struct homa_qdisc_dev *homa_qdisc_qdev_get(struct net_device *dev) homa_qdev_update_sysctl(qdev); INIT_LIST_HEAD(&qdev->links); qdev->deferred_rpcs = RB_ROOT_CACHED; - INIT_LIST_HEAD(&qdev->tcp_qdiscs); + skb_queue_head_init(&qdev->deferred_tcp); spin_lock_init(&qdev->defer_lock); init_waitqueue_head(&qdev->pacer_sleep); spin_lock_init(&qdev->pacer_mutex); @@ -323,8 +315,7 @@ void homa_qdisc_dev_callback(struct rcu_head *head) qdev = container_of(head, struct homa_qdisc_dev, rcu_head); homa_qdisc_free_homa(qdev); - WARN_ON(!list_empty(&qdev->tcp_qdiscs)); - WARN_ON(qdev->cur_tcp_qdisc); + WARN_ON(!skb_queue_empty(&qdev->deferred_tcp)); kfree(qdev); } @@ -355,8 +346,6 @@ int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt, break; } } - skb_queue_head_init(&q->tcp_deferred); - INIT_LIST_HEAD(&q->defer_links); sch->limit = 10 * 1024; return 0; @@ -370,12 +359,19 @@ int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt, void homa_qdisc_destroy(struct Qdisc *qdisc) { struct homa_qdisc *q = qdisc_priv(qdisc); + struct sk_buff *skb, *tmp; unsigned long flags; qdisc_reset_queue(qdisc); + + /* Delete any deferred skb's for this qdisc. */ spin_lock_irqsave(&q->qdev->defer_lock, flags); - __skb_queue_purge(&q->tcp_deferred); - list_del_init(&q->defer_links); + skb_queue_walk_safe(&q->qdev->deferred_tcp, skb, tmp) { + if (skb_get_queue_mapping(skb) == q->ix) { + __skb_unlink(skb, &q->qdev->deferred_tcp); + kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP); + } + } spin_unlock_irqrestore(&q->qdev->defer_lock, flags); homa_qdisc_qdev_put(q->qdev); } @@ -457,7 +453,7 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, * already deferred for this qdisc. 
*/ INC_METRIC(qdisc_tcp_packets, 1); - if (!list_empty(&q->defer_links)) { + if (q->num_deferred_tcp > 0) { homa_qdisc_defer_tcp(q, skb); return NET_XMIT_SUCCESS; } @@ -469,14 +465,6 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, homa_qdisc_update_link_idle(qdev, pkt_len, qshared->max_nic_queue_cycles)) goto enqueue; - tt_record4("homa_qdisc_enqueue deferring TCP packet from 0x%08x " - "to 0x%08x, ports %x, length %d", - ntohl(ip_hdr(skb)->saddr), - ntohl(ip_hdr(skb)->daddr), - (ntohs(tcp_hdr(skb)->source) << 16) + - ntohs(tcp_hdr(skb)->dest), - skb->len - skb_transport_offset(skb) - - tcp_hdrlen(skb)); homa_qdisc_defer_tcp(q, skb); return NET_XMIT_SUCCESS; } @@ -554,12 +542,18 @@ void homa_qdisc_defer_tcp(struct homa_qdisc *q, struct sk_buff *skb) u64 now = homa_clock(); unsigned long flags; + tt_record4("homa_qdisc deferring TCP packet from 0x%08x to 0x%08x, " + "ports %x, length %d", + ntohl(ip_hdr(skb)->saddr), + ntohl(ip_hdr(skb)->daddr), + (ntohs(tcp_hdr(skb)->source) << 16) + + ntohs(tcp_hdr(skb)->dest), + skb->len - skb_transport_offset(skb) - + tcp_hdrlen(skb)); + spin_lock_irqsave(&qdev->defer_lock, flags); - __skb_queue_tail(&q->tcp_deferred, skb); - if (list_empty(&q->defer_links)) { - q->credit = 0; - list_add_tail(&q->defer_links, &qdev->tcp_qdiscs); - } + __skb_queue_tail(&qdev->deferred_tcp, skb); + q->num_deferred_tcp++; if (qdev->last_defer) INC_METRIC(nic_backlog_cycles, now - qdev->last_defer); else @@ -631,60 +625,34 @@ void homa_qdisc_insert_rb(struct homa_qdisc_dev *qdev, struct homa_rpc *rpc) /** * homa_qdisc_xmit_deferred_tcp() - Transmit the "next" non-Homa packet - * that has been deferred for a particular homa_qdisc_dev and remove it - * from the structures that manage deferred packets. + * that has been deferred for a particular homa_qdisc_dev. * @qdev: Device on which to transmit packet. * Return: The number of bytes in the transmitted packet, or 0 if there * were no deferred TCP packets. 
*/ int homa_qdisc_xmit_deferred_tcp(struct homa_qdisc_dev *qdev) { - struct homa_qdisc_shared *qshared; + struct netdev_queue *txq; struct homa_qdisc *q; unsigned long flags; struct sk_buff *skb; - struct Qdisc *sch; + struct Qdisc *qdisc; int pkt_len; - qshared = qdev->hnet->homa->qshared; spin_lock_irqsave(&qdev->defer_lock, flags); - if (list_empty(&qdev->tcp_qdiscs)) { + if (skb_queue_empty(&qdev->deferred_tcp)) { spin_unlock_irqrestore(&qdev->defer_lock, flags); return 0; } - - /* Find the next qdisc with positive credit.*/ - q = qdev->cur_tcp_qdisc; - if (!q) { - q = list_first_entry(&qdev->tcp_qdiscs, typeof(*q), - defer_links); - q->credit += qshared->tcp_credit_increment; - qdev->cur_tcp_qdisc = q; - } - while (q->credit <= 0) { - q = list_next_entry_circular(q, &qdev->tcp_qdiscs, - defer_links); - qdev->cur_tcp_qdisc = q; - q->credit += qshared->tcp_credit_increment; - continue; - } - - skb = __skb_dequeue(&q->tcp_deferred); + skb = __skb_dequeue(&qdev->deferred_tcp); pkt_len = qdisc_pkt_len(skb); - q->credit -= qdisc_pkt_len(skb); - if (skb_queue_len(&q->tcp_deferred) == 0) { - qdev->cur_tcp_qdisc = - list_next_entry_circular(q, &qdev->tcp_qdiscs, - defer_links); - list_del_init(&q->defer_links); - if (list_empty(&qdev->tcp_qdiscs)) { - qdev->cur_tcp_qdisc = NULL; - if (!homa_qdisc_any_deferred(qdev)) { - INC_METRIC(nic_backlog_cycles, - homa_clock() - qdev->last_defer); - qdev->last_defer = 0; - } - } + txq = netdev_get_tx_queue(skb->dev, skb_get_queue_mapping(skb)); + qdisc = rcu_dereference_bh(txq->qdisc); + q = qdisc_priv(qdisc); + q->num_deferred_tcp--; + if (!homa_qdisc_any_deferred(qdev)) { + INC_METRIC(nic_backlog_cycles, homa_clock() - qdev->last_defer); + qdev->last_defer = 0; } spin_unlock_irqrestore(&qdev->defer_lock, flags); @@ -704,11 +672,11 @@ int homa_qdisc_xmit_deferred_tcp(struct homa_qdisc_dev *qdev) tcp_hdrlen(skb), ntohl(th->ack_seq), skb_shinfo(skb)->gso_size); } - sch = q->sch; - spin_lock_bh(qdisc_lock(sch)); - qdisc_enqueue_tail(skb, sch); - spin_unlock_bh(qdisc_lock(sch)); - __netif_schedule(sch); + + spin_lock_bh(qdisc_lock(qdisc)); + qdisc_enqueue_tail(skb, qdisc); + spin_unlock_bh(qdisc_lock(qdisc)); + __netif_schedule(qdisc); return pkt_len; } @@ -959,7 +927,7 @@ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev, bool help) * prevent negative credit buildup for the protocol * with packets. */ - if (list_empty(&qdev->tcp_qdiscs)) { + if (skb_queue_empty(&qdev->deferred_tcp)) { if (!rb_first_cached(&qdev->deferred_rpcs)) break; qdev->homa_credit = 1; diff --git a/homa_qdisc.h b/homa_qdisc.h index f54d58b7..c7fa3df2 100644 --- a/homa_qdisc.h +++ b/homa_qdisc.h @@ -38,25 +38,12 @@ struct homa_qdisc { int ix; /** - * @credit: Used to share bandwidth equally among qdiscs with - * deferred TCP packets. Packets won't be transmitted from - * tcp_deferred until this becomes positive. + * @num_deferred_tcp: Count of the number of TCP packets for this + * qdisc that are currently in qdev->deferred_tcp. Must hold both + * the qdisc lock and qdev->defer_lock to increment this; must + * hold qdev->defer_lock to decrement. */ - int credit; - - /** - * @tcp_deferred: TCP packets whose transmission was deferred - * because the NIC queue was too long. The queue is in order of - * packet arrival at the qdisc. - */ - struct sk_buff_head tcp_deferred; - - /** - * @defer_links: Used to link this qdisc into the tcp_qdiscs list - * in homa_qdisc_dev. This will be an empty list whenever this - * object is not queued on tcp_qdiscs. 
- */ - struct list_head defer_links; + int num_deferred_tcp; }; /** @@ -142,17 +129,11 @@ struct homa_qdisc_dev { struct rb_root_cached deferred_rpcs; /** - * @tcp_qdiscs: List of all homa_qdiscs that have deferred TCP - * packets. + * @deferred_tcp: List of all non-Homa packets that have been deferred + * because of NIC overload, in order of when they were deferred. + * The internal lock isn't used (defer_lock is used instead) */ - struct list_head tcp_qdiscs; - - /** - * @cur_tcp_qdisc: Points to an element of tcp_qdiscs or NULL; this is - * the qdisc currently being serviced by the pacer. This pointer - * rotates circularly through tcp_qdiscs. - */ - struct homa_qdisc *cur_tcp_qdisc; + struct sk_buff_head deferred_tcp; /** * @last_defer: The most recent homa_clock() time when a packet was @@ -174,7 +155,7 @@ struct homa_qdisc_dev { /** * @defer_lock: Synchronizes access to information about deferred - * packets, including deferred_rpcs, tcp_deferred, and last_defer. + * packets, including deferred_rpcs, deferred_tcp, and last_defer. */ spinlock_t defer_lock; @@ -263,12 +244,6 @@ struct homa_qdisc_shared { */ int homa_share; - /** - * @tcp_credit_increment: Amount by which the credit field of - * homa_qdisc is incremented. - */ - int tcp_credit_increment; - /** * @max_link_usage: An integer <= 100 indicating the maximum percentage * of uplink bandwidth that Homa will attempt to utilize. A smaller @@ -368,7 +343,7 @@ static inline void homa_qdisc_rpc_init(struct homa_rpc_qdisc *qrpc) static inline bool homa_qdisc_any_deferred(struct homa_qdisc_dev *qdev) { return rb_first_cached(&qdev->deferred_rpcs) || - !list_empty(&qdev->tcp_qdiscs); + !skb_queue_empty(&qdev->deferred_tcp); } /** diff --git a/man/homa.7 b/man/homa.7 index 2d01ce0f..afe49055 100644 --- a/man/homa.7 +++ b/man/homa.7 @@ -881,11 +881,6 @@ not release pages from a pool if the amount of unused space in the pool has been less than this (specified in Kbytes) at any point in the recent past. .TP -.IR tcp_credit_increment -Determines the granularity of bandwidth sharing among qdiscs that have -deferred TCP output packets. Read the code in homa_qdisc.c to learn more -about this. You probably shouldn't ever modify this option. -.TP .IR throttle_min_bytes An integer value specifying the smallest packet size subject to output queue throttling. diff --git a/test/mock.c b/test/mock.c index 611ca239..0566e44c 100644 --- a/test/mock.c +++ b/test/mock.c @@ -268,6 +268,9 @@ struct dst_ops mock_dst_ops = { .check = mock_dst_check}; struct netdev_queue mock_net_queue = {.state = 0}; +/* Use this as the dev queue index in new skbs. */ +int mock_queue_index = 0; + /* Number of invocations of netif_schedule_queue. 
*/ int mock_netif_schedule_calls; @@ -2030,6 +2033,7 @@ struct sk_buff *mock_raw_skb(struct in6_addr *saddr, int protocol, int length) skb->hash = 3; skb->next = NULL; skb->dev = &mock_devices[0]; + skb_set_queue_mapping(skb, mock_queue_index); qdisc_skb_cb(skb)->pkt_len = length + 100; return skb; } @@ -2443,6 +2447,7 @@ void mock_teardown(void) memset(mock_devices, 0, sizeof(mock_devices)); mock_peer_free_no_fail = 0; mock_link_mbps = 10000; + mock_queue_index = 0; mock_netif_schedule_calls = 0; memset(inet_offloads, 0, sizeof(inet_offloads)); inet_offloads[IPPROTO_TCP] = (struct net_offload __rcu *) &tcp_offload; diff --git a/test/mock.h b/test/mock.h index 670d8d1f..96c21fb2 100644 --- a/test/mock.h +++ b/test/mock.h @@ -161,6 +161,7 @@ extern int mock_page_nid_mask; extern int mock_peer_free_no_fail; extern int mock_prepare_to_wait_status; extern char mock_printk_output[]; +extern int mock_queue_index; extern int mock_rht_init_errors; extern int mock_rht_insert_errors; extern void **mock_rht_walk_results; diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c index 84c59ded..6c5e423f 100644 --- a/test/unit_homa_qdisc.c +++ b/test/unit_homa_qdisc.c @@ -438,25 +438,33 @@ TEST_F(homa_qdisc, homa_qdisc_destroy) { struct Qdisc *qdisc, *qdisc2; struct homa_qdisc_dev *qdev; - struct homa_qdisc *q; + struct homa_qdisc *q, *q2; qdisc = mock_alloc_qdisc(&mock_net_queue); EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); q = qdisc_priv(qdisc); + q->ix = 3; qdisc2 = mock_alloc_qdisc(&mock_net_queue); EXPECT_EQ(0, homa_qdisc_init(qdisc2, NULL, NULL)); + q2 = qdisc_priv(qdisc2); + q2->ix = 4; qdev = list_first_or_null_rcu(&self->homa.qshared->qdevs, struct homa_qdisc_dev, links); EXPECT_NE(NULL, qdev); EXPECT_EQ(2, refcount_read(&qdev->refs)); + mock_queue_index = 3; homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1000)); + mock_queue_index = 4; homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 6000, 1100)); + mock_queue_index = 3; + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 7000, 1100)); - homa_qdisc_destroy(qdisc2); + homa_qdisc_destroy(qdisc); EXPECT_EQ(1, refcount_read(&qdev->refs)); + EXPECT_EQ(1, skb_queue_len(&qdev->deferred_tcp)); - homa_qdisc_destroy(qdisc); + homa_qdisc_destroy(qdisc2); EXPECT_EQ(0, unit_list_length(&self->homa.qshared->qdevs)); kfree(qdisc); kfree(qdisc2); @@ -516,13 +524,14 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_short_tcp_packet) q = qdisc_priv(qdisc); atomic64_set(&q->qdev->link_idle_time, 1000000); q->ix = 3; + mock_queue_index = 3; /* First packet is long and gets deferred because of link_idle_time. */ skb = mock_tcp_skb(&self->addr, 5000, 1500); to_free = NULL; homa_qdisc_enqueue(skb, qdisc, &to_free); EXPECT_EQ(NULL, to_free); - EXPECT_EQ(1, q->tcp_deferred.qlen); + EXPECT_EQ(1, q->num_deferred_tcp); EXPECT_EQ(1000000, atomic64_read(&q->qdev->link_idle_time)); /* Second packet is short, but must be deferred to maintain order @@ -532,7 +541,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_short_tcp_packet) to_free = NULL; homa_qdisc_enqueue(skb, qdisc, &to_free); EXPECT_EQ(NULL, to_free); - EXPECT_EQ(2, q->tcp_deferred.qlen); + EXPECT_EQ(2, q->num_deferred_tcp); EXPECT_EQ(1000000, atomic64_read(&q->qdev->link_idle_time)); homa_qdisc_destroy(qdisc); @@ -585,6 +594,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_tcp_packet_because_of_homa_deferred q = qdisc_priv(qdisc); atomic64_set(&q->qdev->link_idle_time, 1000000); q->ix = 3; + mock_queue_index = 3; /* First packet is Homa, gets deferred because of link_idle_time. 
*/ skb = new_test_skb(srpc, &self->addr, 0, 1500); @@ -602,7 +612,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_tcp_packet_because_of_homa_deferred to_free = NULL; homa_qdisc_enqueue(skb, qdisc, &to_free); EXPECT_EQ(NULL, to_free); - EXPECT_EQ(1, q->tcp_deferred.qlen); + EXPECT_EQ(1, q->num_deferred_tcp); EXPECT_EQ(1000000, atomic64_read(&q->qdev->link_idle_time)); homa_qdisc_destroy(qdisc); @@ -784,41 +794,16 @@ TEST_F(homa_qdisc, homa_qdisc_defer_tcp__basics) EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); q = qdisc_priv(self->qdiscs[2]); + q->ix = 2; + mock_queue_index = 2; - /* First packet: must add qdisc to qdev->tcp_qdiscs. */ homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1500)); - EXPECT_EQ(1, skb_queue_len(&q->tcp_deferred)); - EXPECT_EQ(1, unit_list_length(&q->qdev->tcp_qdiscs)); + EXPECT_EQ(1, skb_queue_len(&q->qdev->deferred_tcp)); + EXPECT_EQ(1, q->num_deferred_tcp); - /* Second packet: qdisc already in qdev->tcp_qdiscs. */ homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 7000, 1500)); - EXPECT_EQ(2, skb_queue_len(&q->tcp_deferred)); - EXPECT_EQ(1, unit_list_length(&q->qdev->tcp_qdiscs)); -} -TEST_F(homa_qdisc, homa_qdisc_defer_tcp__multiple_qdiscs_on_list) -{ - struct homa_rpc *srpc; - struct homa_qdisc *q1, *q2, *q3; - - srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, - &self->server_ip, self->client_port, - self->server_id, 100, 10000); - ASSERT_NE(NULL, srpc); - - EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[1], NULL, NULL)); - q1 = qdisc_priv(self->qdiscs[1]); - EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); - q2 = qdisc_priv(self->qdiscs[2]); - EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); - q3 = qdisc_priv(self->qdiscs[3]); - - homa_qdisc_defer_tcp(q1, mock_tcp_skb(&self->addr, 5000, 1000)); - homa_qdisc_defer_tcp(q2, mock_tcp_skb(&self->addr, 5000, 2000)); - homa_qdisc_defer_tcp(q3, mock_tcp_skb(&self->addr, 5000, 3000)); - EXPECT_EQ(3, unit_list_length(&q1->qdev->tcp_qdiscs)); - EXPECT_EQ(&q1->defer_links, q1->qdev->tcp_qdiscs.next); - EXPECT_EQ(&q2->defer_links, q1->defer_links.next); - EXPECT_EQ(&q3->defer_links, q2->defer_links.next); + EXPECT_EQ(2, skb_queue_len(&q->qdev->deferred_tcp)); + EXPECT_EQ(2, q->num_deferred_tcp); } TEST_F(homa_qdisc, homa_qdisc_defer_tcp__update_metrics_and_wakeup) { @@ -831,6 +816,8 @@ TEST_F(homa_qdisc, homa_qdisc_defer_tcp__update_metrics_and_wakeup) ASSERT_NE(NULL, srpc); EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); q = qdisc_priv(self->qdiscs[2]); + q->ix = 7; + mock_queue_index = 7; mock_log_wakeups = 1; /* First packet: qdev->last_defer is 0. 
*/ @@ -1093,12 +1080,14 @@ TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__basics) EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); q = qdisc_priv(self->qdiscs[2]); + q->ix = 2; + mock_queue_index = 2; homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1000)); atomic64_set(&q->qdev->link_idle_time, 20000); EXPECT_EQ(1100, homa_qdisc_xmit_deferred_tcp(q->qdev)); EXPECT_EQ(1, self->qdiscs[2]->q.qlen); - EXPECT_EQ(0, skb_queue_len(&q->tcp_deferred)); + EXPECT_EQ(0, skb_queue_len(&q->qdev->deferred_tcp)); EXPECT_LT(20000, atomic64_read(&q->qdev->link_idle_time)); } TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__no_deferred_packets) @@ -1111,63 +1100,6 @@ TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__no_deferred_packets) EXPECT_EQ(0, self->qdiscs[2]->q.qlen); homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__scan_for_qdisc_with_credit) -{ - struct homa_qdisc *q1, *q2, *q3; - - EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[1], NULL, NULL)); - q1 = qdisc_priv(self->qdiscs[1]); - EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); - q2 = qdisc_priv(self->qdiscs[2]); - EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); - q3 = qdisc_priv(self->qdiscs[3]); - homa_qdisc_defer_tcp(q1, mock_tcp_skb(&self->addr, 5000, 1000)); - homa_qdisc_defer_tcp(q2, mock_tcp_skb(&self->addr, 6000, 1100)); - homa_qdisc_defer_tcp(q3, mock_tcp_skb(&self->addr, 7000, 1200)); - EXPECT_EQ(3, unit_list_length(&q2->qdev->tcp_qdiscs)); - self->homa.qshared->tcp_credit_increment = 10000; - q1->credit = -30000; - q2->credit = -10000; - q3->credit = -15000; - - EXPECT_EQ(1200, homa_qdisc_xmit_deferred_tcp(q2->qdev)); - EXPECT_EQ(1, self->qdiscs[2]->q.qlen); - EXPECT_EQ(1, skb_queue_len(&q1->tcp_deferred)); - EXPECT_EQ(0, skb_queue_len(&q2->tcp_deferred)); - EXPECT_EQ(1, skb_queue_len(&q3->tcp_deferred)); -} -TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__unlink_qdiscs) -{ - struct homa_qdisc *q1, *q2; - - mock_clock = 10000; - EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[1], NULL, NULL)); - q1 = qdisc_priv(self->qdiscs[1]); - EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); - q2 = qdisc_priv(self->qdiscs[2]); - homa_qdisc_defer_tcp(q1, mock_tcp_skb(&self->addr, 5000, 1000)); - homa_qdisc_defer_tcp(q1, mock_tcp_skb(&self->addr, 6000, 1100)); - homa_qdisc_defer_tcp(q2, mock_tcp_skb(&self->addr, 7000, 1200)); - EXPECT_EQ(2, unit_list_length(&q2->qdev->tcp_qdiscs)); - self->homa.qshared->tcp_credit_increment = 1000; - - /* First call xmits packet from q1. */ - mock_clock = 11000; - EXPECT_EQ(1100, homa_qdisc_xmit_deferred_tcp(q2->qdev)); - EXPECT_EQ(2, unit_list_length(&q2->qdev->tcp_qdiscs)); - - /* Second call xmits packet from q2 and unlinks it. */ - mock_clock = 13000; - EXPECT_EQ(1300, homa_qdisc_xmit_deferred_tcp(q2->qdev)); - EXPECT_EQ(1, unit_list_length(&q2->qdev->tcp_qdiscs)); - EXPECT_FALSE(list_empty(&q1->defer_links)); - EXPECT_TRUE(list_empty(&q2->defer_links)); - - /* Third call xmits last packet from q1 and unlinks it. 
*/ - mock_clock = 16000; - EXPECT_EQ(1200, homa_qdisc_xmit_deferred_tcp(q2->qdev)); - EXPECT_EQ(0, unit_list_length(&q2->qdev->tcp_qdiscs)); -} TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__backlog_cycles_metric) { struct homa_qdisc *q1; @@ -1175,6 +1107,8 @@ TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__backlog_cycles_metric) mock_clock = 10000; EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[1], NULL, NULL)); q1 = qdisc_priv(self->qdiscs[1]); + q1->ix = 1; + mock_queue_index = 1; homa_qdisc_defer_tcp(q1, mock_tcp_skb(&self->addr, 5000, 1000)); homa_qdisc_defer_tcp(q1, mock_tcp_skb(&self->addr, 6000, 1100)); homa_qdisc_defer_tcp(q1, mock_tcp_skb(&self->addr, 6000, 1200)); @@ -1188,6 +1122,7 @@ TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__backlog_cycles_metric) mock_clock = 13000; EXPECT_EQ(1300, homa_qdisc_xmit_deferred_tcp(q1->qdev)); EXPECT_EQ(3000, homa_metrics_per_cpu()->nic_backlog_cycles); + EXPECT_EQ(0, q1->qdev->last_defer); } TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__no_deferred_rpcs) @@ -1624,6 +1559,8 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__xmit_tcp_no_homa) qdev = homa_qdisc_qdev_get(self->dev); EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); q = qdisc_priv(self->qdiscs[2]); + q->ix = 2; + mock_queue_index = 2; homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1100)); homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1200)); @@ -1650,6 +1587,8 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__both_protocols_have_packets_choose_tcp) qdev = homa_qdisc_qdev_get(self->dev); EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); q = qdisc_priv(self->qdiscs[2]); + q->ix = 2; + mock_queue_index = 2; EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, @@ -1690,6 +1629,8 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__xmit_multiple_packets) qdev = homa_qdisc_qdev_get(self->dev); EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); q = qdisc_priv(self->qdiscs[2]); + q->ix = 2; + mock_queue_index = 2; homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1100)); homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1200)); From b9089d2624622bc206675a5886dddd3e7684023c Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 24 Nov 2025 20:24:17 -0800 Subject: [PATCH 582/625] Update perf.txt --- perf.txt | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/perf.txt b/perf.txt index f76a535f..41e2aea0 100644 --- a/perf.txt +++ b/perf.txt @@ -6,7 +6,7 @@ order. homa_qdisc (use tx queue 0 for paced traffic; non-paced traffic is spread across other queues, using default queues except that traffic for queue 0 goes to queue 1 instead). In comparison to the old pacer (measurements -with w4 and w5 on c6620 cluster at 80 Gbps load; see log book for graphs)): +with w4 and w5 on c6620 cluster at 80 Gbps load; see log book for graphs): * P99 for messages shorter than defer_min_bytes is 20-30% faster with separation * P99 for messages between defer_min_bytes and unsched_limit is about 2x slower with separation @@ -15,6 +15,16 @@ with w4 and w5 on c6620 cluster at 80 Gbps load; see log book for graphs)): * Increasing defer_min_bytes provides upside with no apparent downside. * Average slowdowns are better with the old pacer: 3.45 vs. 3.77 for W4, 9.40 vs. 7.72 for W5 (W5 has no messages shorter than defer_min_bytes). 
+* It appears that Intel NICs cannot always transmit at full link bandwidth,
+  so some queuing occurs in the NIC even with Homa's output
+  pacing.
+* When packets build up in the NIC, it appears to use some sort of fair
+  sharing mechanism between the queues. By placing a disproportionate
+  share of outgoing bytes in a single queue, those bytes effectively get
+  lower priority and bytes in other queues get higher priority, which
+  explains the behaviors observed above.
+* Overall, it appears that placing pacer traffic in a dedicated queue is
+  not a good idea.

 63. (September 2025) Compared CPU utilization against TCP. Measured with
 top, running cp_vs_tcp -w w4 -b20 on a 6-node xl170 cluster (20 cores):

From 2fdfa7ece8724d504fdc29123e0e0bd1f985baf5 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 24 Nov 2025 21:52:53 -0800
Subject: [PATCH 583/625] Remove support for segregated tx queue for pacer in
 homa_qdisc

It didn't provide the hoped-for performance improvement. See perf.txt
for details.
---
 homa_qdisc.c           | 163 +++++------------------
 homa_qdisc.h           |  55 +++-----
 test/mock.h            |   4 +
 test/unit_homa_qdisc.c | 287 +++++++++++------------------------------
 4 files changed, 129 insertions(+), 380 deletions(-)

diff --git a/homa_qdisc.c b/homa_qdisc.c
index ba87fff4..119e8430 100755
--- a/homa_qdisc.c
+++ b/homa_qdisc.c
@@ -251,8 +251,6 @@ struct homa_qdisc_dev *homa_qdisc_qdev_get(struct net_device *dev)
 	qdev->dev = dev;
 	qdev->hnet = hnet;
 	refcount_set(&qdev->refs, 1);
-	qdev->pacer_qix = -1;
-	qdev->redirect_qix = -1;
 	homa_qdev_update_sysctl(qdev);
 	INIT_LIST_HEAD(&qdev->links);
 	qdev->deferred_rpcs = RB_ROOT_CACHED;
@@ -376,46 +374,6 @@ void homa_qdisc_destroy(struct Qdisc *qdisc)
 	homa_qdisc_qdev_put(q->qdev);
 }

-/**
- * homa_qdisc_set_qixs() - Recompute the @pacer_qix and @redirect_qix
- * fields in @qdev. Upon return, both fields will be valid unless there
- * are no Homa qdiscs associated with qdev's net_device.
- * @qdev: Identifies net_device containing qnetdev_queues to choose
- *        between.
- */
-void homa_qdisc_set_qixs(struct homa_qdisc_dev *qdev)
-{
-	int i, pacer_qix, redirect_qix;
-	struct netdev_queue *txq;
-	struct Qdisc *qdisc;
-
-	/* Note: it's safe for multiple instances of this function to
-	 * execute concurrently so no synchronization is needed (other
-	 * than using RCU to protect against deletion of the underlying
-	 * data structures).
-	 */
-
-	pacer_qix = -1;
-	redirect_qix = -1;
-	rcu_read_lock();
-	for (i = 0; i < qdev->dev->num_tx_queues; i++) {
-		txq = netdev_get_tx_queue(qdev->dev, i);
-		qdisc = rcu_dereference_bh(txq->qdisc);
-		if (!qdisc || qdisc->ops != &homa_qdisc_ops)
-			continue;
-		if (pacer_qix == -1) {
-			pacer_qix = i;
-			redirect_qix = i;
-		} else {
-			redirect_qix = i;
-			break;
-		}
-	}
-	qdev->pacer_qix = pacer_qix;
-	qdev->redirect_qix = redirect_qix;
-	rcu_read_unlock();
-}
-
 /**
  * homa_qdisc_enqueue() - Invoked when a new packet becomes available for
  * transmission; this function determines whether to send it immediately
@@ -432,7 +390,6 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	struct homa_qdisc_shared *qshared;
 	struct homa_data_hdr *h;
 	int pkt_len;
-	int result;
 	int offset;

 	/* This function tries to transmit short packets immediately for both
@@ -453,7 +410,7 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	 * already deferred for this qdisc.
*/ INC_METRIC(qdisc_tcp_packets, 1); - if (q->num_deferred_tcp > 0) { + if (atomic_read(&q->num_deferred_tcp) > 0) { homa_qdisc_defer_tcp(q, skb); return NET_XMIT_SUCCESS; } @@ -480,7 +437,6 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, * issue any grants, even though the "incoming" data isn't going to * be transmitted anytime soon. */ - h = (struct homa_data_hdr *)skb_transport_header(skb); offset = homa_get_offset(h); if (h->common.type != DATA || ntohl(h->message_length) < @@ -498,7 +454,7 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, * been drained a bit. */ tt_record3("homa_qdisc_enqueue deferring homa data packet for id %d, offset %d on qid %d", - be64_to_cpu(h->common.sender_id), offset, qdev->pacer_qix); + be64_to_cpu(h->common.sender_id), offset, q->ix); homa_qdisc_defer_homa(qdev, skb); return NET_XMIT_SUCCESS; @@ -511,23 +467,12 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, q->ix); } } else { - tt_record2("homa_qdisc_enqueue queuing non-homa packet, qix %d, pacer_qix %d", - q->ix, qdev->pacer_qix); + tt_record1("homa_qdisc_enqueue queuing non-homa packet, qid %d", + q->ix); } - if (q->ix != qdev->pacer_qix) { - if (unlikely(sch->q.qlen >= READ_ONCE(sch->limit))) - return qdisc_drop(skb, sch, to_free); - result = qdisc_enqueue_tail(skb, sch); - } else { - /* homa_qdisc_redirect_skb is going to lock a different qdisc, - * so in order to avoid deadlocks we have to release the - * lock for this qdisc. - */ - spin_unlock(qdisc_lock(sch)); - result = homa_qdisc_redirect_skb(skb, qdev, false); - spin_lock(qdisc_lock(sch)); - } - return result; + if (unlikely(sch->q.qlen >= READ_ONCE(sch->limit))) + return qdisc_drop(skb, sch, to_free); + return qdisc_enqueue_tail(skb, sch); } /** @@ -553,7 +498,7 @@ void homa_qdisc_defer_tcp(struct homa_qdisc *q, struct sk_buff *skb) spin_lock_irqsave(&qdev->defer_lock, flags); __skb_queue_tail(&qdev->deferred_tcp, skb); - q->num_deferred_tcp++; + atomic_inc(&q->num_deferred_tcp); if (qdev->last_defer) INC_METRIC(nic_backlog_cycles, now - qdev->last_defer); else @@ -645,17 +590,13 @@ int homa_qdisc_xmit_deferred_tcp(struct homa_qdisc_dev *qdev) return 0; } skb = __skb_dequeue(&qdev->deferred_tcp); - pkt_len = qdisc_pkt_len(skb); - txq = netdev_get_tx_queue(skb->dev, skb_get_queue_mapping(skb)); - qdisc = rcu_dereference_bh(txq->qdisc); - q = qdisc_priv(qdisc); - q->num_deferred_tcp--; if (!homa_qdisc_any_deferred(qdev)) { INC_METRIC(nic_backlog_cycles, homa_clock() - qdev->last_defer); qdev->last_defer = 0; } spin_unlock_irqrestore(&qdev->defer_lock, flags); + pkt_len = qdisc_pkt_len(skb); homa_qdisc_update_link_idle(qdev, pkt_len, -1); if (ip_hdr(skb)->protocol == IPPROTO_TCP) { struct tcphdr *th; @@ -673,10 +614,17 @@ int homa_qdisc_xmit_deferred_tcp(struct homa_qdisc_dev *qdev) skb_shinfo(skb)->gso_size); } - spin_lock_bh(qdisc_lock(qdisc)); - qdisc_enqueue_tail(skb, qdisc); - spin_unlock_bh(qdisc_lock(qdisc)); - __netif_schedule(qdisc); + rcu_read_lock_bh(); + txq = netdev_get_tx_queue(skb->dev, skb_get_queue_mapping(skb)); + qdisc = rcu_dereference_bh(txq->qdisc); + if (qdisc->ops == &homa_qdisc_ops) { + q = qdisc_priv(qdisc); + atomic_dec(&q->num_deferred_tcp); + homa_qdisc_schedule_skb(skb, qdisc); + } else { + kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP); + } + rcu_read_unlock_bh(); return pkt_len; } @@ -737,7 +685,9 @@ struct sk_buff *homa_qdisc_get_deferred_homa(struct homa_qdisc_dev *qdev) */ int homa_qdisc_xmit_deferred_homa(struct homa_qdisc_dev *qdev) { + struct netdev_queue 
*txq; struct homa_data_hdr *h; + struct Qdisc *qdisc; struct sk_buff *skb; int pkt_len; @@ -750,8 +700,16 @@ int homa_qdisc_xmit_deferred_homa(struct homa_qdisc_dev *qdev) h = (struct homa_data_hdr *)skb_transport_header(skb); tt_record3("homa_qdisc_pacer queuing homa data packet for id %d, offset %d on qid %d", be64_to_cpu(h->common.sender_id), - homa_get_offset(h), qdev->pacer_qix); - homa_qdisc_redirect_skb(skb, qdev, true); + homa_get_offset(h), skb_get_queue_mapping(skb)); + + rcu_read_lock_bh(); + txq = netdev_get_tx_queue(skb->dev, skb_get_queue_mapping(skb)); + qdisc = rcu_dereference_bh(txq->qdisc); + if (qdisc->ops == &homa_qdisc_ops) + homa_qdisc_schedule_skb(skb, qdisc); + else + kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP); + rcu_read_unlock_bh(); return pkt_len; } @@ -959,63 +917,6 @@ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev, bool help) spin_unlock_bh(&qdev->pacer_mutex); } -/** - * homa_qdisc_redirect_skb() - Enqueue a packet on a different queue from - * the one it was originally passed to and wakeup that queue for - * transmission. This is used to transmit all pacer packets via a single - * queue and to redirect other packets originally sent to that queue to - * another queue. - * @skb: Packet to resubmit. - * @qdev: Homa data about the network device on which the packet should - * be resubmitted. - * @pacer: True means queue the packet on qdev->pacer_qix, false means - * qdev->redirect_qix. - * Return: Standard enqueue return code (usually NET_XMIT_SUCCESS). - */ -int homa_qdisc_redirect_skb(struct sk_buff *skb, - struct homa_qdisc_dev *qdev, bool pacer) -{ - struct netdev_queue *txq; - struct Qdisc *qdisc; - int result; - int qix; - int i; - - rcu_read_lock(); - - /* Must make sure that the queue index is still valid (refers - * to a Homa qdisc). - */ - for (i = 0; ; i++) { - qix = pacer ? qdev->pacer_qix : qdev->redirect_qix; - if (qix >= 0 && qix < qdev->dev->num_tx_queues) { - txq = netdev_get_tx_queue(qdev->dev, qix); - qdisc = rcu_dereference_bh(txq->qdisc); - if (qdisc->ops == &homa_qdisc_ops) - break; - } - if (i > 0) { - /* Couldn't find a Homa qdisc to use; drop the skb. - * Shouldn't ever happen? - */ - kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP); - result = NET_XMIT_DROP; - goto done; - } - homa_qdisc_set_qixs(qdev); - } - - skb_set_queue_mapping(skb, qix); - spin_lock_bh(qdisc_lock(qdisc)); - result = qdisc_enqueue_tail(skb, qdisc); - spin_unlock_bh(qdisc_lock(qdisc)); - netif_schedule_queue(txq); - -done: - rcu_read_unlock(); - return result; -} - /** * homa_qdisc_pacer_check() - Check whether any of the homa_qdisc pacer * threads associated with @homa have fallen behind (e.g. because they diff --git a/homa_qdisc.h b/homa_qdisc.h index c7fa3df2..6adeec89 100644 --- a/homa_qdisc.h +++ b/homa_qdisc.h @@ -39,11 +39,10 @@ struct homa_qdisc { /** * @num_deferred_tcp: Count of the number of TCP packets for this - * qdisc that are currently in qdev->deferred_tcp. Must hold both - * the qdisc lock and qdev->defer_lock to increment this; must - * hold qdev->defer_lock to decrement. + * qdisc that are currently in qdev->deferred_tcp. Incremented and + * decremented without holding a lock. */ - int num_deferred_tcp; + atomic_t num_deferred_tcp; }; /** @@ -67,36 +66,6 @@ struct homa_qdisc_dev { */ refcount_t refs; - /** - * @pacer_qix: Index of a netdev_queue within dev that is reserved - * for the pacer to use for transmitting packets. 
We segregate paced - * traffic (which is almost entirely large packets) from non-paced - * traffic (mostly small packets). All the paced traffic goes to a - * single transmit queue, and though we try to limit the length of - * this queue, there are situations where the queue can still build - * up (under some scenarios it appears that NICs cannot actually - * transmit at line rate). If the pacer queue is segregated, queue - * buildup there will not affect non-paced packets. In order to - * reserve pacer_qix for pacer traffic, short-packet traffic that - * is assigned to that queue must be redirected to another queue; - * redirect_qix is used for that. -1 means there currently isn't - * a netdev_queue assigned for pacer traffic. Note: this field is - * a hint; the value must be verified under RCU to have a Homa qdisc - * before using. - */ - int pacer_qix; - - /** - * @redirect_qix: Index of a netdev_queue within dev; packets - * originally passed to pacer_qix are redirected here, so that - * pacer_qix is used only for packets sent by the pacer. -1 means - * there isn't currently a netdev_queue assigned for this purpose. - * This field is a hint that must be verified under RCU before using - * to be sure it still refers to a Homa qdisc. May be the same as - * pacer_qix if there is only one Homa qdisc associated with dev. - */ - int redirect_qix; - /** @link_mbps: Speed of the link associated with @dev, in Mbps. */ int link_mbps; @@ -295,11 +264,7 @@ int homa_qdisc_pacer_main(void *device); struct homa_qdisc_dev * homa_qdisc_qdev_get(struct net_device *dev); void homa_qdisc_qdev_put(struct homa_qdisc_dev *qdev); -int homa_qdisc_redirect_skb(struct sk_buff *skb, - struct homa_qdisc_dev *qdev, - bool pacer); int homa_qdisc_register(void); -void homa_qdisc_set_qixs(struct homa_qdisc_dev *qdev); struct homa_qdisc_shared * homa_qdisc_shared_alloc(void); void homa_qdisc_shared_free(struct homa_qdisc_shared *qshared); @@ -346,6 +311,20 @@ static inline bool homa_qdisc_any_deferred(struct homa_qdisc_dev *qdev) !skb_queue_empty(&qdev->deferred_tcp); } +/** + * homa_qdisc_schedule_skb() - Enqueue an skb on a qdisc and schedule the + * qdisc for execution. + * @skb: Packet buffer to queue for output + * @qdisc: homa_qdisc on which to schedule it. + */ +static inline void homa_qdisc_schedule_skb(struct sk_buff *skb, + struct Qdisc *qdisc) { + spin_lock_bh(qdisc_lock(qdisc)); + qdisc_enqueue_tail(skb, qdisc); + spin_unlock_bh(qdisc_lock(qdisc)); + __netif_schedule(qdisc); +} + /** * homa_qdisc_precedes() - Return true if @rpc1 is considered "less" than * @rpc2 (i.e. 
higher priority) for the purposes of qdev->deferred_rpcs, or diff --git a/test/mock.h b/test/mock.h index 96c21fb2..82aaa10e 100644 --- a/test/mock.h +++ b/test/mock.h @@ -84,8 +84,12 @@ #define rcu_read_lock mock_rcu_read_lock +#define rcu_read_lock_bh mock_rcu_read_lock + #define rcu_read_unlock mock_rcu_read_unlock +#define rcu_read_unlock_bh mock_rcu_read_unlock + #undef register_net_sysctl #define register_net_sysctl mock_register_net_sysctl diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c index 6c5e423f..46399c70 100644 --- a/test/unit_homa_qdisc.c +++ b/test/unit_homa_qdisc.c @@ -422,17 +422,6 @@ TEST_F(homa_qdisc, homa_qdisc_init__cant_create_new_qdisc_dev) EXPECT_EQ(0, unit_list_length(&self->homa.qshared->qdevs)); kfree(qdisc); } -TEST_F(homa_qdisc, homa_qdisc_init__set_qix) -{ - struct Qdisc *qdisc = mock_alloc_qdisc(&self->txqs[2]); - struct homa_qdisc *q; - - EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); - q = qdisc_priv(qdisc); - EXPECT_EQ(2, q->ix); - homa_qdisc_destroy(qdisc); - kfree(qdisc); -} TEST_F(homa_qdisc, homa_qdisc_destroy) { @@ -470,44 +459,6 @@ TEST_F(homa_qdisc, homa_qdisc_destroy) kfree(qdisc2); } -TEST_F(homa_qdisc, _homa_qdisc_homa_qdisc_set_qixs_object) -{ - struct homa_qdisc_dev *qdev; - - qdev = homa_qdisc_qdev_get(self->dev); - - /* Simple working case. */ - homa_qdisc_set_qixs(qdev); - EXPECT_EQ(0, qdev->pacer_qix); - EXPECT_EQ(1, qdev->redirect_qix); - - /* No qdisc in devnet_queue. */ - self->txqs[0].qdisc = NULL; - homa_qdisc_set_qixs(qdev); - EXPECT_EQ(1, qdev->pacer_qix); - EXPECT_EQ(2, qdev->redirect_qix); - - /* Qdisc isn't Homa. */ - self->txqs[2].qdisc->ops = NULL; - homa_qdisc_set_qixs(qdev); - EXPECT_EQ(1, qdev->pacer_qix); - EXPECT_EQ(3, qdev->redirect_qix); - - /* Can't find separate qdisc for short_pkt_qix. */ - self->txqs[3].qdisc->ops = NULL; - homa_qdisc_set_qixs(qdev); - EXPECT_EQ(1, qdev->pacer_qix); - EXPECT_EQ(1, qdev->redirect_qix); - - /* Can't find any Homa qdiscs. 
*/ - self->txqs[1].qdisc->ops = NULL; - homa_qdisc_set_qixs(qdev); - EXPECT_EQ(-1, qdev->pacer_qix); - EXPECT_EQ(-1, qdev->redirect_qix); - - homa_qdisc_qdev_put(qdev); -} - TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_short_tcp_packet) { struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); @@ -531,7 +482,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_short_tcp_packet) to_free = NULL; homa_qdisc_enqueue(skb, qdisc, &to_free); EXPECT_EQ(NULL, to_free); - EXPECT_EQ(1, q->num_deferred_tcp); + EXPECT_EQ(1, atomic_read(&q->num_deferred_tcp)); EXPECT_EQ(1000000, atomic64_read(&q->qdev->link_idle_time)); /* Second packet is short, but must be deferred to maintain order @@ -541,7 +492,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_short_tcp_packet) to_free = NULL; homa_qdisc_enqueue(skb, qdisc, &to_free); EXPECT_EQ(NULL, to_free); - EXPECT_EQ(2, q->num_deferred_tcp); + EXPECT_EQ(2, atomic_read(&q->num_deferred_tcp)); EXPECT_EQ(1000000, atomic64_read(&q->qdev->link_idle_time)); homa_qdisc_destroy(qdisc); @@ -612,7 +563,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_tcp_packet_because_of_homa_deferred to_free = NULL; homa_qdisc_enqueue(skb, qdisc, &to_free); EXPECT_EQ(NULL, to_free); - EXPECT_EQ(1, q->num_deferred_tcp); + EXPECT_EQ(1, atomic_read(&q->num_deferred_tcp)); EXPECT_EQ(1000000, atomic64_read(&q->qdev->link_idle_time)); homa_qdisc_destroy(qdisc); @@ -753,34 +704,6 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__drop_packet_queue_over_limit) homa_qdisc_destroy(qdisc); kfree(qdisc); } -TEST_F(homa_qdisc, homa_qdisc_enqueue__use_special_queue) -{ - struct sk_buff *skb, *to_free; - struct homa_rpc *srpc; - struct homa_qdisc *q; - - srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, - &self->server_ip, self->client_port, - self->server_id, 100, 10000); - ASSERT_NE(NULL, srpc); - - EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[1], NULL, NULL)); - EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); - q = qdisc_priv(self->qdiscs[1]); - q->qdev->pacer_qix = 1; - q->qdev->redirect_qix = 3; - skb = new_test_skb(srpc, &self->addr, 0, 1500); - unit_log_clear(); - - spin_lock(qdisc_lock(self->qdiscs[1])); - EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, self->qdiscs[1], - &to_free)); - spin_unlock(qdisc_lock(self->qdiscs[1])); - ASSERT_NE(NULL, to_free); - EXPECT_FALSE(homa_qdisc_any_deferred(q->qdev)); - EXPECT_EQ(0, self->qdiscs[1]->q.qlen); - EXPECT_EQ(1, self->qdiscs[3]->q.qlen); -} TEST_F(homa_qdisc, homa_qdisc_defer_tcp__basics) { @@ -799,11 +722,11 @@ TEST_F(homa_qdisc, homa_qdisc_defer_tcp__basics) homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1500)); EXPECT_EQ(1, skb_queue_len(&q->qdev->deferred_tcp)); - EXPECT_EQ(1, q->num_deferred_tcp); + EXPECT_EQ(1, atomic_read(&q->num_deferred_tcp)); homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 7000, 1500)); EXPECT_EQ(2, skb_queue_len(&q->qdev->deferred_tcp)); - EXPECT_EQ(2, q->num_deferred_tcp); + EXPECT_EQ(2, atomic_read(&q->num_deferred_tcp)); } TEST_F(homa_qdisc, homa_qdisc_defer_tcp__update_metrics_and_wakeup) { @@ -1124,6 +1047,24 @@ TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__backlog_cycles_metric) EXPECT_EQ(3000, homa_metrics_per_cpu()->nic_backlog_cycles); EXPECT_EQ(0, q1->qdev->last_defer); } +TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__qdisc_not_homa) +{ + const struct Qdisc_ops *saved_ops; + struct homa_qdisc *q; + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); + q = qdisc_priv(self->qdiscs[2]); + q->ix = 2; + mock_queue_index = 2; + homa_qdisc_defer_tcp(q, 
mock_tcp_skb(&self->addr, 5000, 1000)); + saved_ops = self->qdiscs[2]->ops; + self->qdiscs[2]->ops = NULL; + + EXPECT_EQ(1100, homa_qdisc_xmit_deferred_tcp(q->qdev)); + EXPECT_EQ(0, self->qdiscs[2]->q.qlen); + EXPECT_EQ(0, skb_queue_len(&q->qdev->deferred_tcp)); + self->qdiscs[2]->ops = saved_ops; +} TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__no_deferred_rpcs) { @@ -1266,7 +1207,10 @@ TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_homa__packet_available) struct homa_rpc *srpc; u64 link_idle; + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); mock_clock = 10000; + mock_queue_index = 3; qdev = homa_qdisc_qdev_get(self->dev); srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, @@ -1276,15 +1220,44 @@ TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_homa__packet_available) link_idle = atomic64_read(&qdev->link_idle_time); homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); + + mock_clock = 11000; + EXPECT_EQ(1100, homa_qdisc_xmit_deferred_homa(qdev)); + EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(1, self->qdiscs[3]->q.qlen); + EXPECT_LT(link_idle, atomic64_read(&qdev->link_idle_time)); + + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_homa__qdisc_not_homa) +{ + const struct Qdisc_ops *saved_ops; + struct homa_qdisc_dev *qdev; + struct homa_rpc *srpc; + u64 link_idle; + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); - qdev->pacer_qix = 3; EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + mock_clock = 10000; + mock_queue_index = 3; + qdev = homa_qdisc_qdev_get(self->dev); + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc); + + link_idle = atomic64_read(&qdev->link_idle_time); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); + EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); mock_clock = 11000; + saved_ops = self->qdiscs[3]->ops; + self->qdiscs[3]->ops = NULL; EXPECT_EQ(1100, homa_qdisc_xmit_deferred_homa(qdev)); EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); - EXPECT_EQ(1, self->qdiscs[3]->q.qlen); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); EXPECT_LT(link_idle, atomic64_read(&qdev->link_idle_time)); + self->qdiscs[3]->ops = saved_ops; homa_qdisc_qdev_put(qdev); } @@ -1385,8 +1358,8 @@ TEST_F(homa_qdisc, homa_qdisc_pacer_main) * (b) proper thread exit */ qdev = homa_qdisc_qdev_get(self->dev); - qdev->pacer_qix = 3; EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + mock_queue_index = 3; srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, @@ -1413,15 +1386,16 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__spin_until_link_idle) struct homa_qdisc_dev *qdev; struct homa_rpc *srpc; + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + mock_queue_index = 3; qdev = homa_qdisc_qdev_get(self->dev); + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 10000, 10000); ASSERT_NE(NULL, srpc); - EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); - qdev->pacer_qix = 3; - EXPECT_EQ(0, self->qdiscs[3]->q.qlen); homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); mock_clock = 0; @@ -1445,6 +1419,10 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__return_after_one_packet) struct 
homa_qdisc_dev *qdev; struct sk_buff *skb; + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + mock_queue_index = 3; + qdev = homa_qdisc_qdev_get(self->dev); srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, @@ -1455,10 +1433,6 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__return_after_one_packet) self->server_id + 2, 10000, 10000); ASSERT_NE(NULL, srpc2); - EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); - qdev->pacer_qix = 3; - EXPECT_EQ(0, self->qdiscs[3]->q.qlen); - skb = new_test_skb(srpc1, &self->addr, 5000, 1500); homa_qdisc_defer_homa(qdev, skb); skb = new_test_skb(srpc2, &self->addr, 4000, 1500); @@ -1497,7 +1471,6 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__pacer_lock_unavailable) homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); - qdev->pacer_qix = 3; EXPECT_EQ(0, self->qdiscs[3]->q.qlen); unit_log_clear(); @@ -1527,6 +1500,9 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__xmit_homa_packet_no_tcp) struct homa_qdisc_dev *qdev; struct homa_rpc *srpc; + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + mock_queue_index = 3; qdev = homa_qdisc_qdev_get(self->dev); srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, @@ -1535,9 +1511,6 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__xmit_homa_packet_no_tcp) homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); - EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); - qdev->pacer_qix = 3; - EXPECT_EQ(0, self->qdiscs[3]->q.qlen); qdev->homa_credit = -100; qdev->hnet->homa->qshared->homa_share = 40; @@ -1598,7 +1571,6 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__both_protocols_have_packets_choose_tcp) homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1100)); EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); - qdev->pacer_qix = 3; EXPECT_EQ(0, self->qdiscs[3]->q.qlen); qdev->homa_credit = -100; qdev->hnet->homa->qshared->homa_share = 40; @@ -1653,6 +1625,9 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__pacer_help_bytes_metric) struct homa_qdisc_dev *qdev; struct homa_rpc *srpc; + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + mock_queue_index = 3; qdev = homa_qdisc_qdev_get(self->dev); srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, @@ -1661,9 +1636,6 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__pacer_help_bytes_metric) homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 800)); EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); - EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); - qdev->pacer_qix = 3; - EXPECT_EQ(0, self->qdiscs[3]->q.qlen); unit_log_clear(); homa_qdisc_pacer(qdev, true); @@ -1675,117 +1647,15 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__pacer_help_bytes_metric) homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_redirect_skb__use_pacer_qix) -{ - struct homa_qdisc_dev *qdev; - struct sk_buff *skb; - int status; - - EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[1], NULL, NULL)); - EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); - qdev = ((struct homa_qdisc *) qdisc_priv(self->qdiscs[1]))->qdev; - qdev->pacer_qix = 1; - qdev->redirect_qix = 3; - skb = 
mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); - unit_log_clear(); - - status = homa_qdisc_redirect_skb(skb, qdev, true); - EXPECT_EQ(NET_XMIT_SUCCESS, status); - EXPECT_EQ(1, self->qdiscs[1]->q.qlen); - EXPECT_EQ(0, self->qdiscs[3]->q.qlen); - EXPECT_EQ(1, mock_netif_schedule_calls); -} -TEST_F(homa_qdisc, homa_qdisc_redirect_skb__use_redirect_qix) -{ - struct homa_qdisc_dev *qdev; - struct sk_buff *skb; - int status; - - EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[1], NULL, NULL)); - EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); - qdev = ((struct homa_qdisc *) qdisc_priv(self->qdiscs[1]))->qdev; - qdev->pacer_qix = 1; - qdev->redirect_qix = 3; - skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); - unit_log_clear(); - - status = homa_qdisc_redirect_skb(skb, qdev, false); - EXPECT_EQ(NET_XMIT_SUCCESS, status); - EXPECT_EQ(0, self->qdiscs[1]->q.qlen); - EXPECT_EQ(1, self->qdiscs[3]->q.qlen); -} -TEST_F(homa_qdisc, homa_qdisc_redirect_skb__redirect_qix_invalid) -{ - struct homa_qdisc_dev *qdev; - struct sk_buff *skb; - int status; - int i; - - for (i = 0; i < 4; i++) - EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[i], NULL, NULL)); - qdev = ((struct homa_qdisc *) qdisc_priv(self->qdiscs[0]))->qdev; - qdev->pacer_qix = 3; - qdev->redirect_qix = 5; - skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); - unit_log_clear(); - - status = homa_qdisc_redirect_skb(skb, qdev, false); - EXPECT_EQ(NET_XMIT_SUCCESS, status); - EXPECT_EQ(1, self->qdiscs[1]->q.qlen); - EXPECT_EQ(0, qdev->pacer_qix); - EXPECT_EQ(1, qdev->redirect_qix); -} -TEST_F(homa_qdisc, homa_qdisc_redirect_skb__redirect_qix_not_a_homa_qdisc) -{ - struct homa_qdisc_dev *qdev; - struct sk_buff *skb; - int status; - int i; - - for (i = 0; i < 4; i++) - EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[i], NULL, NULL)); - qdev = ((struct homa_qdisc *) qdisc_priv(self->qdiscs[0]))->qdev; - qdev->pacer_qix = 3; - qdev->redirect_qix = 0; - self->qdiscs[0]->ops = NULL; - skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); - unit_log_clear(); - - status = homa_qdisc_redirect_skb(skb, qdev, false); - EXPECT_EQ(NET_XMIT_SUCCESS, status); - EXPECT_EQ(1, self->qdiscs[2]->q.qlen); - EXPECT_EQ(1, qdev->pacer_qix); - EXPECT_EQ(2, qdev->redirect_qix); -} -TEST_F(homa_qdisc, homa_qdisc_redirect_skb__no_suitable_qdisc) -{ - struct homa_qdisc_dev *qdev; - struct sk_buff *skb; - int status; - int i; - - for (i = 0; i < 4; i++) { - EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[i], NULL, NULL)); - self->qdiscs[i]->ops = NULL; - } - qdev = ((struct homa_qdisc *) qdisc_priv(self->qdiscs[0]))->qdev; - qdev->pacer_qix = 3; - qdev->redirect_qix = 0; - skb = mock_skb_alloc(&self->addr, &self->data.common, 1500, 0); - unit_log_clear(); - - status = homa_qdisc_redirect_skb(skb, qdev, false); - EXPECT_EQ(NET_XMIT_DROP, status); - EXPECT_EQ(-1, qdev->pacer_qix); - EXPECT_EQ(-1, qdev->redirect_qix); - EXPECT_EQ(0, mock_netif_schedule_calls); -} - TEST_F(homa_qdisc, homa_qdisc_pacer_check__enqueue_packet) { struct homa_qdisc_dev *qdev, *qdev2; struct homa_rpc *srpc; + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + mock_queue_index = 3; + /* Create 2 qdevs to verify that homa_qdisc_pacer_check loops over * all qdevs. 
*/
@@ -1795,9 +1665,6 @@ TEST_F(homa_qdisc, homa_qdisc_pacer_check__enqueue_packet)
 &self->server_ip, self->client_port,
 self->server_id, 10000, 10000);
 ASSERT_NE(NULL, srpc);
- EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL));
- qdev->pacer_qix = 3;
- EXPECT_EQ(0, self->qdiscs[3]->q.qlen);
 homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000));
 EXPECT_TRUE(homa_qdisc_any_deferred(qdev));
@@ -1822,7 +1689,6 @@ TEST_F(homa_qdisc, homa_qdisc_pacer_check__no_deferred_rpcs)
 qdev2 = homa_qdisc_qdev_get(mock_dev(1, &self->homa));
 qdev = homa_qdisc_qdev_get(self->dev);
 EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL));
- qdev->pacer_qix = 3;
 EXPECT_EQ(0, self->qdiscs[3]->q.qlen);
 atomic64_set(&qdev->link_idle_time, 20000);
@@ -1847,7 +1713,6 @@ TEST_F(homa_qdisc, homa_qdisc_pacer_check__lag_not_long_enough)
 self->server_id, 10000, 10000);
 ASSERT_NE(NULL, srpc);
 EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL));
- qdev->pacer_qix = 3;
 EXPECT_EQ(0, self->qdiscs[3]->q.qlen);
 homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000));
 EXPECT_TRUE(homa_qdisc_any_deferred(qdev));

From 91a4167fe48ac188965589b464df6985dd552982 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Tue, 25 Nov 2025 14:11:01 -0800
Subject: [PATCH 584/625] Various improvements to tthoma.py

* New analyzer nicpkts
* Rewrote filter analyzer
* Added sort_pkts and get_range functions
* Various changes to command-line options in support of the changes above
---
 util/tthoma.py | 546 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 425 insertions(+), 121 deletions(-)

diff --git a/util/tthoma.py b/util/tthoma.py
index a7aa1c6e..4ece854c 100755
--- a/util/tthoma.py
+++ b/util/tthoma.py
@@ -632,6 +632,34 @@ def get_packet(id, offset):
 global packets
 return packets['%d:%d' % (id, offset)]
+def get_range(s, option_name=None, parse_float=False):
+ """
+ Parse a range defined by two endpoints and return the endpoints as a list.
+ If only one value is specified in the list then it is returned as the lower
+ end of the range, with None as the upper end.
+ s: The input string to parse; may contain either one or
+ two values
+ option_name: If specified, contains the name of the option that was
+ specified as a range; used for error messages
+ parse_float: True means parse values as floating-point numbers; False
+ means integers
+ """
+
+ values = s.split()
+ if len(values) == 1:
+ min = float(values[0]) if parse_float else int(values[0])
+ return [min, None]
+ if len(values) == 2:
+ min = float(values[0]) if parse_float else int(values[0])
+ max = float(values[1]) if parse_float else int(values[1])
+ return [min, max]
+ if option_name != None:
+ raise Exception('Bad %s value \'%s\'; must be \'value\' or '
+ '\'value1 value2\'' % (option_name, s))
+ else:
+ raise Exception('Bad range spec \'%s\'; must be \'value\' or '
+ '\'value1 value2\'' % (s))
+
 def get_tcp_packet(saddr, sport, daddr, dport, sequence, data_bytes, ack):
 """
 Returns the entry in tcp_packets corresponding to the arguments. Creates
@@ -1014,6 +1042,23 @@ def require_options(analyzer, *args):
 raise Exception('The %s analyzer requires the --%s option' % (
 analyzer, arg))
+def sort_pkts(pkts, key):
+ """
+ Sort a list of packets using a given key and return the sorted list.
+ pkts: Packets to sort + key: Determines sort order (typically the value of the --sort + option); must be 'Xmit', 'Nic', 'Gro', 'SoftIRQ', or 'Free' + """ + + sort_keys = {'Xmit': 'xmit', 'Nic': 'nic', 'Gro': 'gro', + 'SoftIRQ': 'softirq', 'Free': 'free_tx_skb'} + if not key in sort_keys: + raise Exception('Invalid sort option %s: must be one of %s' % ( + key, sort_keys.keys())) + sort_key = sort_keys[key] + return sorted(pkts, key = lambda pkt : + pkt[sort_key] if sort_key in pkt else 1e20) + def sum_fields(list, field): """ Given a list of dictionaries, return the sum of a given field in each @@ -3046,11 +3091,12 @@ def print_pkt_delays(self): grant_total = [] # Collect statistics about delays within individual packets. - mtu = get_mtu() - if mtu == 0: - mtu = 1000000 + # short_limit = get_mtu() + # if short_limit == 0: + # short_limit = 1000000 + short_limit = 1000 for p, pkt in packets.items(): - if (pkt['msg_length'] != None) and (pkt['msg_length'] <= mtu): + if (pkt['msg_length'] != None) and (pkt['msg_length'] <= short_limit): if ('xmit' in pkt) and ('nic' in pkt): delay = pkt['nic'] - pkt['xmit'] if delay > 0: @@ -3146,14 +3192,14 @@ def print_pcts(data, label): list_avg(data, 0))) print('\nPhase Count Min P10 P50 P90 P99 Max Avg') print('-------------------------------------------------------------------------') - print('Data packets from messages <= %d bytes:' % (mtu)) + print('Data packets from messages <= %d bytes:' % (short_limit)) print_pcts(short_to_nic, 'Xmit') print_pcts(short_to_gro, 'Net') print_pcts(short_to_softirq, 'SoftIRQ') print_pcts(short_free, 'Free') print_pcts(short_total, 'Total') - print('\nData packets from messages > %d bytes:' % (mtu)) + print('\nData packets from messages > %d bytes:' % (short_limit)) print_pcts(long_to_nic, 'Xmit') print_pcts(long_to_gro, 'Net') print_pcts(long_to_softirq, 'SoftIRQ') @@ -3207,13 +3253,13 @@ def print_worst(data, label): verbose += ('--------------------------------------------------------' '-------------\n') - verbose += 'Data packets from messages <= %d bytes:\n' % (mtu) + verbose += 'Data packets from messages <= %d bytes:\n' % (short_limit) verbose += print_worst(short_to_nic, 'Xmit') verbose += print_worst(short_to_gro, 'Net') verbose += print_worst(short_to_softirq, 'SoftIRQ') verbose += print_worst(short_total, 'Total') - verbose += '\nData packets from messages > %d bytes:\n' % (mtu) + verbose += '\nData packets from messages > %d bytes:\n' % (short_limit) verbose += print_worst(long_to_nic, 'Xmit') verbose += print_worst(long_to_gro, 'Net') verbose += print_worst(long_to_softirq, 'SoftIRQ') @@ -3267,7 +3313,7 @@ def print_worst(data, label): if (not 'softirq' in pkt) or (not 'xmit' in pkt): continue total = pkt['softirq'] - pkt['xmit'] - if (pkt['msg_length'] != None) and (pkt['msg_length'] <= mtu): + if (pkt['msg_length'] != None) and (pkt['msg_length'] <= short_limit): if (total < min_short) or (total > max_short): continue if ('xmit' in pkt) and ('nic' in pkt): @@ -3428,9 +3474,11 @@ def output(self): #------------------------------------------------ class AnalyzeFilter: """ - Prints information about the packets selected by the following command-line - options: --tx-node, --tx-core, --tx-start, --tx-end, --rx-node, --rx-core, - --rx-start, --rx-end. + Select packets based on various criteria, then print summary statistics + for those packets. The following command-line options are used to filter + the packets: --tx-node, --rx-node, --tx-qid, --msglen, --grolat, --segs, + and --filter. 
If --verbose is specified then the matching packets are printed + in detail; the --sort option determines the order of printing. """ def __init__(self, dispatcher): dispatcher.interest('AnalyzeRpcs') @@ -3441,42 +3489,64 @@ def filter_packets(self, options): """ Returns a list containing all of the packets that match options. In addition, all returned packets will have valid 'xmit' and 'gro' - fields, and the sending and receiving RPCs will exist. + fields. options: A dictionary of option values (see class doc for list of valid options); usually contains the command-line options. """ global packets, rpcs + filter_func = None + if options.filter != None: + name = 'filter_' + options.filter + filter_func = getattr(self, name, None) + if filter_func == None or not callable(filter_func): + raise Exception('Couldn\'t find a filter method %s in the ' + '%s class' % (name, self.__class__.__name__)) + if options.msglen != None: + min_length, max_length = get_range(options.msglen, + option_name='--msglen') + if max_length == None: + max_length = min_length + min_length = 0 + if options.grolat != None: + min_gro, max_gro = get_range(options.grolat, + option_name='--grolat', parse_float=True) + if max_gro == None: + max_gro = 1e20 + result = [] for pkt in packets.values(): - if not 'id' in pkt: - print('No id in pkt: %s' % (pkt)) tx_id = pkt['id'] rx_id = tx_id ^ 1 - if not 'gro' in pkt: - continue - if not 'xmit' in pkt: - continue - if (not rx_id in rpcs) or (not rx_id in rpcs): + if not 'gro' in pkt or not 'xmit' in pkt: continue - if (options.tx_start != None) and (pkt['xmit'] < options.tx_start): + if not rx_id in rpcs or not rx_id in rpcs: continue - if (options.tx_end != None) and (pkt['xmit'] >= options.tx_end): + if options.tx_node != None and options.tx_node != pkt['tx_node']: continue - if (options.rx_start != None) and (pkt['gro'] < options.rx_start): + if options.rx_node != None and options.rx_node != pkt['rx_node']: continue - if (options.rx_end != None) and (pkt['gro'] >= options.rx_end): + if options.tx_core != None and options.tx_core != pkt['tx_core']: continue - if (options.tx_node != None) and (options.tx_node - != rpcs[tx_id]['node']): + if options.rx_core != None and options.rx_core != pkt['gro_core']: continue - if (options.rx_node != None) and (options.rx_node - != rpcs[rx_id]['node']): + if options.tx_qid != None and (not 'tx_qid' in pkt or + options.tx_qid != pkt['tx_qid']): continue - if (options.tx_core != None) and (options.tx_core != pkt['tx_core']): + if options.msglen != None: + if not 'msg_length' in pkt: + continue + length = pkt['msg_length'] + if length < min_length or length > max_length: + continue + if options.grolat != None: + latency = pkt['gro'] - pkt['xmit'] + if latency < min_gro or latency > max_gro: + continue + if not options.segs and not 'tso_length' in pkt: continue - if (options.rx_core != None) and (options.rx_core != pkt['gro_core']): + if filter_func != None and not filter_func(pkt): continue result.append(pkt) return result @@ -3486,72 +3556,85 @@ def output(self): pkts = self.filter_packets(options) - print('\n-------------------') + print('\n----------------') print('Analyzer: filter') - print('-------------------\n') + print('----------------\n') if not pkts: print('No packets matched filters') return - tx_filter = 'xmit:' + print('%d Homa data packets were selected using the following filters:' % + (len(pkts))) if options.tx_node != None: - tx_filter += ' node %s' % (options.tx_node) + print(' --tx-node %s' % (options.tx_node)) if 
options.tx_core != None: - tx_filter += ' core %d' % (options.tx_core) - if options.tx_start != None: - if options.tx_end != None: - tx_filter += ' time %9.3f-%9.3f' % ( - options.tx_start, options.tx_end) - else: - tx_filter += ' time >= %9.3f' % (options.tx_start) - elif options.tx_end != None: - tx_filter += ' time < %9.3f' % (options.tx_end) - - rx_filter = 'gro:' + print(' --tx-core %d' % (options.tx_core)) + if options.tx_qid != None: + print(' --tx-qid %d' % (options.tx_qid)) if options.rx_node != None: - rx_filter += ' node %s' % (options.rx_node) + print(' --rx-node %s' % (options.rx_node)) if options.rx_core != None: - rx_filter += ' core %d' % (options.rx_core) - if options.rx_start != None: - if options.rx_end != None: - rx_filter += ' time %9.3f-%9.3f' % ( - options.rx_start, options.rx_end) - else: - rx_filter += ' time >= %9.3f' % (options.rx_start) - elif options.rx_end != None: - rx_filter += ' time < %9.3f' % (options.rx_end) - - print('Packets below matched these filters:') - if len(tx_filter) > 5: - print(tx_filter) - if len(rx_filter) > 4: - print(rx_filter) - print('Packet information:') - print('TxTime: Time when ip*xmit was invoked for packet') - print('TxNode: Node that transmitted packet') - print('TxCore: Core on which ip*xmit was invoked for packet') - print('RxTime: Time when homa_gro_receive was invoked for packet') - print('Delay: RxTime - TxTime') - print('RxNode: Node that received packet') - print('RxCore: Core where home_gro_receive was invoked for packet') - print('Prio: Priority of packet') - print('Len: Bytes of message data in packet') - print('Tx_Id: RPC ID on sender') - print('Offset: Offset of first byte of data in packet') - print('') + print(' --rx-core %s' % (options.rx_core)) + if options.segs: + print(' --segs True') + if options.msglen: + print(' --msglen %s' % (options.msglen)) + if options.filter != None: + print(' --filter %s' % (options.filter)) + + nic = [] + gro = [] + softirq = [] + free = [] + total = [] - print('TxTime TxNode TxCore RxTime Delay RxNode RxCore Prio Len Tx_Id Offset') - print('-----------------------------------------------------------------------------------------') - pkts.sort(key=lambda d : d['xmit'] if 'xmit' in d else 0) for pkt in pkts: - tx_id = pkt['id'] - rx_id = tx_id ^ 1 - print('%9.3f %10s %s %9.3f %6.1f %10s %3d %2d %6d %10d %7d' % ( - pkt['xmit'], rpcs[tx_id]['node'], - print_field_if(pkt, 'tx_core', '%4d'), - pkt['gro'], pkt['gro'] - pkt['xmit'], - rpcs[rx_id]['node'], pkt['gro_core'], pkt['priority'], - get_recv_length(pkt['offset'], pkt['msg_length']), - tx_id, pkt['offset'])) + if 'xmit' in pkt and 'nic' in pkt: + nic.append(pkt['nic'] - pkt['xmit']) + if 'nic' in pkt and 'gro' in pkt: + gro.append(pkt['gro'] - pkt['nic']) + if 'gro' in pkt and 'softirq' in pkt: + softirq.append(pkt['softirq'] - pkt['gro']) + if 'softirq' in pkt and 'xmit' in pkt: + total.append(pkt['softirq'] - pkt['xmit']) + if 'nic' in pkt and 'free_tx_skb' in pkt: + free.append(pkt['free_tx_skb'] - pkt['nic']) + + print('\nDelays (in usecs) for each of the following phases of the ' + 'selected packets\' lifetimes:') + print('Xmit: Time from ip*xmit call until driver queued packet for NIC') + print('Net: Time from when NIC received packet until GRO started processing') + print('SoftIRQ: Time from GRO until SoftIRQ started processing') + print('Free: Time from when NIC received packet until packet was returned') + print(' to Linux and freed') + print('Total: Total time from ip*xmit call until SoftIRQ started') + print() + + def 
print_pcts(data, label):
+ data.sort()
+ if not data:
+ print('%-10s 0' % (label))
+ else:
+ print('%-10s %6d %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f' % (label,
+ len(data), data[0], data[10*len(data)//100],
+ data[50*len(data)//100], data[90*len(data)//100],
+ data[99*len(data)//100], data[len(data)-1],
+ sum(data)/len(data)))
+
+ print('Phase Count Min P10 P50 P90 P99 Max Avg')
+ print('------------------------------------------------------------------')
+ print_pcts(nic, 'Xmit')
+ print_pcts(gro, 'Net')
+ print_pcts(softirq, 'SoftIRQ')
+ print_pcts(free, 'Free')
+ print_pcts(total, 'Total')
+
+ if not options.verbose:
+ return
+
+ print()
+ print('# Details of the selected packets, sorted by \'%s\':' %
+ (options.sort))
+ print(print_pkts(sort_pkts(pkts, options.sort), header=True), end='')

#------------------------------------------------
# Analyzer: grantlock
@@ -6165,6 +6248,16 @@ def output(self):
 # node -> list of packets transmitted by that node
 node_pkts = defaultdict(list)
+ # node-> list of interval stats for the largest size interval
+ # (4*base_interval) on that node. Each list element consists of
+ # [time, nic_pkts, nic_bytes, free_pkts, free_bytes]:
+ # time: End time of the interval
+ # nic_pkts: # packets owned by NIC at time
+ # nic_bytes: Bytes of data in packets owned by NIC at time
+ # free_pkts: # packets returned to Linux in the interval
+ # free_bytes: Bytes of data in free_pkts
+ node_intervals = defaultdict(list)
+
 # Bytes and packets owned by the NIC as of current time
 nic_pkts = 0
 nic_bytes = 0
@@ -6197,7 +6290,7 @@
 continue
 node_pkts[pkt['tx_node']].append(pkt)
- # Each iteraction in this loops generates data for one node.
+ # Each iteration in this loop generates data for one node.
 for node in get_sorted_nodes():
 f = open('%s/nicbacklog_%s.dat' % (options.data, node), 'w')
 f.write('# Node: %s\n' % (node))
@@ -6255,7 +6348,7 @@
 f.write('\n')
 # heapq of all active packets (those that are currently in
- # the posessions of the NIC) in increasing order of free time.
+ # the possession of the NIC) in increasing order of free time.
 active = []
 # list of [in_pkts, in_bytes, free_pkts, free_bytes] for each of the most recent intervals
@@ -6283,6 +6376,7 @@
 pkts = sorted(node_pkts[node], key = lambda pkt : pkt['nic'])
 interval_end = (math.ceil(pkts[0]['nic'] / base_interval)
 * base_interval)
+ interval_ix = 0
 cur = 0
 # print('\n%s: %d packets:' % (node, len(node_pkts[node])))
@@ -6320,6 +6414,12 @@
 f.write(' %6d %7.2f %6d %7.2f' % (
 in_pkts, in_bytes*8/(1000*base_interval),
 free_pkts, free_bytes*8/(1000*base_interval)))
+
+ interval_ix += 1
+ if nic_bytes >= 200000:
+ node_intervals[node].append([interval_end, nic_pkts,
+ nic_bytes, free_pkts, free_bytes])
+
 in_pkts += intervals[1][0]
 in_bytes += intervals[1][1]
 free_pkts += intervals[1][2]
@@ -6357,6 +6457,180 @@
 max_in_pkts, max_in_bytes*8/(4000*base_interval),
 max_free_pkts, max_free_bytes*8/(4000*base_interval)))
+ # Output a table showing stats for the intervals with the highest
+ # and lowest free_bytes.
+ print()
+ print('Average interval statistics for each node, measured over %d '
+ 'usec intervals.'
% (base_interval))
+ print('For each node, intervals with at least 200 Kbytes of NIC data '
+ 'are selected;')
+ print('from this group the 10% slowest intervals (those with fewest '
+ 'bytes freed)')
+ print('and 10% fastest intervals (those with the most NIC bytes freed)'
+ ' are selected,')
+ print('and the following statistics are printed from each group:')
+ print('Node: Name of node')
+ print('NicPS: Average # packets owned by NIC at the end of '
+ 'slow intervals')
+ print('NicKbS: Average KB of data owned by NIC at the end of '
+ 'slow intervals')
+ print('FreePS: Average # packets freed during slow intervals')
+ print('FreeKbS: Average KB of data freed during slow intervals')
+ print('NicPF: Average # packets owned by NIC at the end of '
+ 'fast intervals')
+ print('NicKbF: Average KB of data owned by NIC at the end of '
+ 'fast intervals')
+ print('FreePF: Average # packets freed during fast intervals')
+ print('FreeKbF: Average KB of data freed during fast intervals')
+ print()
+ print('Node NicPS NicKbS FreePS FreeKbS NicPF NicKbF FreePF FreeKbF')
+ print('-----------------------------------------------------------------------')
+ for node in get_sorted_nodes():
+ intervals = sorted(node_intervals[node], key=lambda t: t[4])
+ slow = intervals[0:(len(intervals)//10)]
+ fast = intervals[9*len(intervals)//10:]
+ if len(intervals) < 10:
+ print('%s has only %d intervals' % (node, len(intervals)))
+ continue
+ print('%-10s %5.1f %6.1f %5.1f %6.1f %5.1f %6.1f %5.1f %6.1f' %
+ (node,
+ sum(t[1] for t in slow)/len(slow),
+ sum(t[2] for t in slow)/(1000*len(slow)),
+ sum(t[3] for t in slow)/len(slow),
+ sum(t[4] for t in slow)/(1000*len(slow)),
+ sum(t[1] for t in fast)/len(fast),
+ sum(t[2] for t in fast)/(1000*len(fast)),
+ sum(t[3] for t in fast)/len(fast),
+ sum(t[4] for t in fast)/(1000*len(fast)),))
+
+#------------------------------------------------
+# Analyzer: nicpkts
+#------------------------------------------------
+class AnalyzeNicpkts:
+ """
+ Generate a history for each node of the packets owned by the NIC
+ (packets passed to the NIC but not yet returned after transmission),
+ showing the state of the NIC queues at each point in time and the
+ order in which packets are returned to Linux after transmission.
+ Requires the --data option.
+ """
+
+ def __init__(self, dispatcher):
+ dispatcher.interest('AnalyzePackets')
+ require_options('nicpkts', 'data')
+
+ def print_active(self, f, active, free_index):
+ """
+ Print out the list of active NIC packets for a node.
+ f: File in which to print information
+ active: List of packets currently owned by NIC
+ free_index: Index in active of the next packet to be freed;
+ this packet will be highlighted in the printout
+ """
+
+ num_this_line = 0
+ index = 0
+ for pkt in active:
+ if num_this_line == 4:
+ f.write('\n')
+ num_this_line = 0
+ elif num_this_line > 0:
+ f.write(' ')
+ if index == free_index:
+ f.write('%19s' % (''))
+ else:
+ f.write('%9.3f:%3d:%5d' % (pkt['nic'], pkt['tx_qid'],
+ pkt['tso_length']))
+ num_this_line += 1
+ index += 1
+ if num_this_line > 0:
+ f.write('\n')
+
+ def output(self):
+ global packets, tcp_packets, options, traces
+
+ # node -> list of packets transmitted by that node
+ node_pkts = defaultdict(list)
+
+ print('\n--------------------')
+ print('Analyzer: nicpkts')
+ print('--------------------')
+ print('See data files %s/nicpkts_*.dat' % (options.data))
+
+ # Bucket all of the packets by transmitting node.
+ for pkt in itertools.chain(packets.values(), tcp_packets.values()):
+ if (not 'nic' in pkt or not 'free_tx_skb' in pkt or
+ not 'tso_length' in pkt):
+ continue
+ node_pkts[pkt['tx_node']].append(pkt)
+
+ # Each iteration in this loop generates data for one node.
+ for node in get_sorted_nodes():
+ f = open('%s/nicpkts_%s.dat' % (options.data, node), 'w')
+ f.write('# Node: %s\n' % (node))
+ f.write('# Generated at %s.\n\n' %
+ (time.strftime('%I:%M %p on %m/%d/%Y')))
+ f.write('# Each block of lines shows the packets owned by the '
+ 'NIC when the next\n')
+ f.write('# packet was returned to Linux after transmission. The '
+ 'first line contains\n')
+ f.write('# the time when the next packet was freed, plus '
+ 'information about the freed\n')
+ f.write('# packet.\n')
+ f.write('#\n')
+ f.write('# The following lines describe each of the packets '
+ 'owned by the NIC as\n')
+ f.write('# a tuple [t:qid:length]:\n')
+ f.write('# t: Time when the packet was passed to the NIC\n')
+ f.write('# qid: Id of the NIC queue containing the packet\n')
+ f.write('# length: Bytes in the packet\n')
+ f.write('# The freed packet is not displayed in the list; '
+ '\'\' is displayed\n')
+ f.write('# in its place to highlight its position in the list.\n')
+
+ # Contains all packets currently owned by the NIC, in increasing
+ # order of nic time.
+ active = []
+
+ pkts = sorted(node_pkts[node], key = lambda pkt : pkt['nic'])
+ cur = 0
+
+ # Each iteration of this loop handles the next 'free_tx_skb'
+ # event, which includes adding packets to the active list,
+ # printing the list, and removing the next packet to be freed.
+ while cur < len(pkts):
+ # Find the first free time among active packets
+ min_free = 1e20
+ min_index = -1
+ for i in range(len(active)):
+ pkt = active[i]
+ if pkt['free_tx_skb'] < min_free:
+ min_index = i
+ min_free = pkt['free_tx_skb']
+
+ # Add more packets to the active list until the free time
+ # is reached; this could cause the first free time to change.
+ while cur < len(pkts):
+ pkt = pkts[cur]
+ if pkt['nic'] >= min_free:
+ break
+ cur += 1
+ active.append(pkt)
+ if pkt['free_tx_skb'] < min_free:
+ min_index = len(active) - 1
+ min_free = pkt['free_tx_skb']
+
+ # Print information about the active list and the packet
+ # that was just freed.
+ pkt = active[min_index]
+ f.write('\n%9.3f: qid %d, slot %d, %d bytes, queued at %.3f\n' %
+ (pkt['free_tx_skb'], pkt['tx_qid'], min_index,
+ pkt['tso_length'], pkt['nic']))
+ self.print_active(f, active, min_index)
+
+ del active[min_index]
+ f.close()
+
#------------------------------------------------
# Analyzer: nicqueues
#------------------------------------------------
@@ -9392,6 +9666,45 @@ def output_snapshot(self):
 rpc = rpc_counts[id]
 print('%-10d %5d %8d' % (id, rpc['pkts'], rpc['bytes']))
+#------------------------------------------------
+# Analyzer: temp2
+#------------------------------------------------
+class AnalyzeTemp2:
+ """
+ This analyzer is used to implement temporary checks used during
+ debugging. Consult the code to see what it does right now.
+ """ + def __init__(self, dispatcher): + dispatcher.interest('AnalyzeRpcs') + dispatcher.interest('AnalyzePackets') + + def output(self): + global packets + + tcp_headers = 20 + 20 + 18 + homa_headers = 56 + 20 + 18 + data_bytes = 0 + total_bytes = 0 + pkts = 0 + for pkt in itertools.chain(packets.values(), tcp_packets.values()): + if not 'tx_node' in pkt or pkt['tx_node'] != 'node4': + continue + if not 'nic' in pkt or pkt['nic'] < 17750 or pkt['nic'] >= 17950: + continue + if not 'tso_length' in pkt: + continue + bytes = pkt['tso_length'] + data_bytes += bytes + if pkt['id'] == 0: + total_bytes += bytes + tcp_headers + else: + total_bytes += bytes + homa_headers + pkts += 1 + + print('%d packets, data %d bytes (%.3f usec), total %d bytes (%.3f usec)' + % (pkts, data_bytes, data_bytes / 12.5e03, total_bytes, + total_bytes / 12.5e03)) + #------------------------------------------------ # Analyzer: timeline #------------------------------------------------ @@ -9696,9 +10009,8 @@ class AnalyzeTxpkts: data packet transmitted from that node, in time order. If either --node or --tx-qid is specified, only packets matching those options will be considered. Packets will normally be sorted by the 'Xmit' column, but the - --sort option can be used to specify a different column to use for sorting - ('Xmit', 'Nic', 'Gro', or 'Free'). Also generates aggregate statistics - for each tx queue on each node. + --sort option can be used to specify a different column to use for sorting. + Also generates aggregate statistics for each tx queue on each node. """ def __init__(self, dispatcher): @@ -9722,15 +10034,6 @@ def output(self): continue node_pkts[pkt['tx_node']].append(pkt) - sort_keys = {'Xmit': 'xmit', 'Nic': 'nic', 'Gro': 'gro', - 'Free': 'free_tx_skb'} - sort_key = 'xmit' - if options.sort != None: - if not options.sort in sort_keys.keys(): - raise Exception('Invalid --sort option %s: must be one of %s' % ( - options.sort, sort_keys.keys())) - sort_key = sort_keys[options.sort] - print('\n----------------') print('Analyzer: txpkts') print('----------------') @@ -9887,12 +10190,7 @@ def output(self): # Create a data file for this node with packets in time order # (or whatever order was requested on the command line). - pkts = sorted(pkts, key = lambda pkt : pkt['xmit']) - if sort_key == 'gro': - pkts = sorted(pkts, key = lambda pkt : get_max_gro(pkt)) - elif sort_key != 'xmit': - pkts = sorted(pkts, key = lambda pkt : - pkt[sort_key] if sort_key in pkt else 1e20) + pkts = sort_pkts(pkts, options.sort) f = open('%s/txpkts_%s.dat' % (options.data, node), 'w') f.write('# Node: %s\n' % (node)) @@ -10103,9 +10401,19 @@ def output(self): 'output data files (suitable for graphing) in the directory given ' 'by DIR. 
If this option is not specified, no data files will ' 'be generated') +parser.add_option('--filter', dest='filter', default=None, + metavar='FUNC', help='\'filter_FUNC\' is the name of a function in the ' + 'analyzer class; used by some analyzers as an additional filter for ' + 'packets.') parser.add_option('--gbps', dest='gbps', type=float, default=100.0, metavar='G', help='Link speed in Gbps (default: 100); used by some ' 'analyzers.') +parser.add_option('--grolat', dest='grolat', default=None, + metavar='L', help='Used by some analyzers to filter packets based on ' + 'the elapsed time from when the packet was passed to ip*xmit until ' + 'it was received by GRO on the destination; it can contain either a ' + 'single floating-point value (minimum latency) or two values (min and ' + 'max, inclusive).') parser.add_option('-h', '--help', dest='help', action='store_true', help='Show this help message and exit') parser.add_option('--interval', dest='interval', type=int, default=20, @@ -10123,6 +10431,10 @@ def output(self): parser.add_option('--min', dest='min', type=float, default=None, metavar='T', help='Lower bound to consider for some parameter; ' 'specific meaning depends on analyzer') +parser.add_option('--msglen', dest='msglen', default=None, + metavar='L', help='Used by some analyzers to filter packets based on ' + 'message length; it can contain either a single integer value ' + '(largest allowable length) or two values (min and max, inclusive).') parser.add_option('--negative-ok', action='store_true', default=False, dest='negative_ok', help='Don\'t print warnings when negative delays are encountered') @@ -10142,23 +10454,21 @@ def output(self): parser.add_option('--rx-core', dest='rx_core', type=int, default=None, metavar='C', help='If specified, some analyzers will ignore packets ' 'transmitted from cores other than C') -parser.add_option('--rx-end', dest='rx_end', type=float, default=None, - metavar='T', help='If specified, some analyzers will ignore packets ' - 'received at or after time T') parser.add_option('--rx-node', dest='rx_node', default=None, metavar='N', help='If specified, some analyzers will ignore packets ' 'received by nodes other than N') -parser.add_option('--rx-start', dest='rx_start', type=float, default=None, - metavar='T', help='If specified, some analyzers will ignore packets ' - 'received before time T') parser.add_option('--same-gro-core', dest='same_gro_core', action="store_true", default=False, help='If specified, the pass analyzer will only ' 'consider passing for packets that are processed by GRO on the ' 'same core') -parser.add_option('--sort', dest='sort', default=None, +parser.add_option('--segs', action='store_true', default=False, dest='segs', + help='By default some analyzers will consider only the first segment ' + 'of packets that are segmented by TSO segmentation; if this option ' + 'is specified then they will consider all of the derived segments') +parser.add_option('--sort', dest='sort', default='Xmit', metavar='S', help='Used by some analyzers to select a field to use ' - 'for sorting data. The supported values depend on the analyzer; ' - 'see analyzer documentation for details') + 'for sorting packets. 
Must be \'Xmit\', \'Nic\', \'Gro\', \'SoftIRQ\', '
 'or \'Free\' (default: \'Xmit\')')
parser.add_option('--threshold', dest='threshold', type=int, default=100,
 metavar='T', help='Used by some analyzers as a threshold time value, '
 'in microseconds (default: 100)')
@@ -10167,18 +10477,12 @@
parser.add_option('--tx-core', dest='tx_core', type=int, default=None,
 metavar='C', help='If specified, some analyzers will ignore packets '
 'transmitted from cores other than C')
-parser.add_option('--tx-end', dest='tx_end', type=float, default=None,
- metavar='T', help='If specified, some analyzers will ignore packets '
- 'transmitted at or after time T')
parser.add_option('--tx-qid', dest='tx_qid', type=int, default=None,
 metavar='C', help='Specifies a transmit queue identifier; used '
 'by some analyzers to select a specific queue.')
parser.add_option('--tx-node', dest='tx_node', default=None,
 metavar='N', help='If specified, some analyzers will ignore packets '
 'transmitted by nodes other than N')
-parser.add_option('--tx-start', dest='tx_start', type=float, default=None,
- metavar='T', help='If specified, some analyzers will ignore packets '
- 'transmitted before time T')
parser.add_option('--verbose', '-v', action='store_true', default=False,
 dest='verbose', help='Print additional output with more details')

From f5958e54ecf36f13e860be2c7347fb3526f4f57b Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 28 Nov 2025 11:20:01 -0800
Subject: [PATCH 585/625] A few minor improvements to tthoma.py

E.g., added --pkt-types option to filter analyzer.
---
 util/tthoma.py | 143 +++++++++++++++++++++++++++++--------------------
 1 file changed, 84 insertions(+), 59 deletions(-)

diff --git a/util/tthoma.py b/util/tthoma.py
index 4ece854c..a2e197c1 100755
--- a/util/tthoma.py
+++ b/util/tthoma.py
@@ -966,7 +966,7 @@ def print_pkts(pkts, header=True):
 buf.write('# Length: Size of packet (before segmentation)\n')
 buf.write('# Qid: Transmit queue on which packet was sent\n')
 buf.write('# Nic: Time when packet was queued for NIC\n')
- buf.write('# NDelay: Nic - (later of Xmit and Qdisc)\n')
+ buf.write('# NDelay: Nic - Xmit\n')
 buf.write('# Gro: Time when packet was received by GRO\n')
 buf.write('# GDelay: Gro - Nic\n')
 buf.write('# Free: Time when sk_buff was released on sender\n')
@@ -987,9 +987,7 @@ def print_pkts(pkts, header=True):
 nic_delay = None
 if 'nic' in pkt:
 nic = pkt['nic']
- if qdisc != None:
- nic_delay = nic - qdisc
- elif xmit != None:
+ if xmit != None:
 nic_delay = nic - xmit
 else:
 nic = None
@@ -3477,8 +3475,9 @@ class AnalyzeFilter:
 Select packets based on various criteria, then print summary statistics
 for those packets. The following command-line options are used to filter
 the packets: --tx-node, --rx-node, --tx-qid, --msglen, --grolat, --segs,
- and --filter. If --verbose is specified then the matching packets are printed
- in detail; the --sort option determines the order of printing.
+ --pkt-types, and --filter. If --verbose is specified then the matching
+ packets are printed in detail; the --sort option determines the order of
+ printing.
 """
 def __init__(self, dispatcher):
 dispatcher.interest('AnalyzeRpcs')
@@ -3494,7 +3493,7 @@ def filter_packets(self, options):
 options: A dictionary of option values (see class doc for list
 of valid options); usually contains the command-line options.
""" - global packets, rpcs + global packets, tcp_packets, grants, rpcs filter_func = None if options.filter != None: @@ -3503,52 +3502,68 @@ def filter_packets(self, options): if filter_func == None or not callable(filter_func): raise Exception('Couldn\'t find a filter method %s in the ' '%s class' % (name, self.__class__.__name__)) + if options.msglen != None: min_length, max_length = get_range(options.msglen, option_name='--msglen') if max_length == None: max_length = min_length min_length = 0 + if options.grolat != None: min_gro, max_gro = get_range(options.grolat, option_name='--grolat', parse_float=True) if max_gro == None: max_gro = 1e20 + pkt_dict = {} + for name in options.pkt_types.split(): + if name == 'all': + pkt_dict['data'] = packets.values() + pkt_dict['tcp'] = tcp_packets.values() + pkt_dict['grant'] = grants.values() + elif name == 'data': + pkt_dict['data'] = packets.values() + elif name == 'tcp': + pkt_dict['tcp'] = tcp_packets.values() + elif name == 'grant': + pkt_dict['grant'] = grants.values() + else: + raise Exception('Unknown packet type \'%s\'; must be \'data\', ' + '\'tcp\', or \'grant\'' % (name)) + result = [] - for pkt in packets.values(): - tx_id = pkt['id'] - rx_id = tx_id ^ 1 - if not 'gro' in pkt or not 'xmit' in pkt: - continue - if not rx_id in rpcs or not rx_id in rpcs: - continue - if options.tx_node != None and options.tx_node != pkt['tx_node']: - continue - if options.rx_node != None and options.rx_node != pkt['rx_node']: - continue - if options.tx_core != None and options.tx_core != pkt['tx_core']: - continue - if options.rx_core != None and options.rx_core != pkt['gro_core']: - continue - if options.tx_qid != None and (not 'tx_qid' in pkt or - options.tx_qid != pkt['tx_qid']): - continue - if options.msglen != None: - if not 'msg_length' in pkt: + for pkt_list in pkt_dict.values(): + for pkt in pkt_list: + if not 'gro' in pkt or not 'xmit' in pkt: continue - length = pkt['msg_length'] - if length < min_length or length > max_length: + if options.tx_node != None and options.tx_node != pkt['tx_node']: continue - if options.grolat != None: - latency = pkt['gro'] - pkt['xmit'] - if latency < min_gro or latency > max_gro: + # print('%s\n' % (pkt)) + if options.rx_node != None and options.rx_node != pkt['rx_node']: continue - if not options.segs and not 'tso_length' in pkt: - continue - if filter_func != None and not filter_func(pkt): - continue - result.append(pkt) + if options.tx_core != None and options.tx_core != pkt['tx_core']: + continue + if options.rx_core != None and options.rx_core != pkt['gro_core']: + continue + if options.tx_qid != None and (not 'tx_qid' in pkt or + options.tx_qid != pkt['tx_qid']): + continue + if options.msglen != None: + if not 'msg_length' in pkt: + continue + length = pkt['msg_length'] + if length < min_length or length > max_length: + continue + if options.grolat != None: + latency = pkt['gro'] - pkt['xmit'] + if latency < min_gro or latency > max_gro: + continue + if not options.segs and not 'tso_length' in pkt: + continue + if filter_func != None and not filter_func(pkt): + continue + result.append(pkt) return result def output(self): @@ -3562,24 +3577,27 @@ def output(self): if not pkts: print('No packets matched filters') return - print('%d Homa data packets were selected using the following filters:' % + print('%d packets were selected using the following filters:' % (len(pkts))) + print(' --pkt_types %s' % (options.pkt_types)) if options.tx_node != None: - print(' --tx-node %s' % (options.tx_node)) + print(' 
--tx-node %s' % (options.tx_node))
 if options.tx_core != None:
- print('  --tx-core %d' % (options.tx_core))
+ print('    --tx-core %d' % (options.tx_core))
 if options.tx_qid != None:
- print('  --tx-qid %d' % (options.tx_qid))
+ print('    --tx-qid %d' % (options.tx_qid))
 if options.rx_node != None:
- print('  --rx-node %s' % (options.rx_node))
+ print('    --rx-node %s' % (options.rx_node))
 if options.rx_core != None:
- print('  --rx-core %s' % (options.rx_core))
+ print('    --rx-core %s' % (options.rx_core))
 if options.segs:
- print('  --segs True')
+ print('    --segs True')
 if options.msglen:
- print('  --msglen %s' % (options.msglen))
+ print('    --msglen %s' % (options.msglen))
+ if options.grolat:
+ print('    --grolat %s' % (options.grolat))
 if options.filter != None:
- print('  --filter %s' % (options.filter))
+ print('    --filter %s' % (options.filter))

 nic = []
 gro = []
@@ -3600,12 +3618,13 @@
 free.append(pkt['free_tx_skb'] - pkt['nic'])

 print('\nDelays (in usecs) for each of the following phases of the '
- 'selected packets\' lifetimes:')
- print('Xmit: Time from ip*xmit call until driver queued packet for NIC')
- print('Net: Time from when NIC received packet until GRO started processing')
+ 'selected packets\'')
+ print('lifetimes:')
+ print('Xmit: Time from ip*xmit call until NIC handoff')
+ print('Net: Time from NIC handoff until GRO started processing')
 print('SoftIRQ: Time from GRO until SoftIRQ started processing')
- print('Free: Time from when NIC received packet until packet was returned')
- print(' to Linux and freed')
+ print('Free: Time from NIC handoff until packet was returned to')
+ print(' Linux and freed')
 print('Total: Total time from ip*xmit call until SoftIRQ started')
 print()
@@ -10028,9 +10047,13 @@ def output(self):
 for pkt in packets.values():
 if not 'xmit' in pkt or not 'tso_length' in pkt:
 continue
+ if not 'gro' in pkt:
+ continue
 node_pkts[pkt['tx_node']].append(pkt)
 for pkt in tcp_packets.values():
- if not 'xmit' in pkt or not ('tso_length' in pkt):
+ if not 'xmit' in pkt or not 'tso_length' in pkt:
+ continue
+ if not 'gro' in pkt:
 continue
 node_pkts[pkt['tx_node']].append(pkt)
@@ -10068,9 +10091,9 @@
 'handoff)')
 print('NicP50: Median NIC delay')
 print('NicP90: 90th percentile of NIC delay')
- print('GroP10: 10th percentile of GRO delay (maximum time across '
- 'segments of TSO')
- print(' from NIC handoff to receipt by destination GRO)')
+ print('GroP10: 10th percentile of GRO delay (time from NIC handoff '
+ 'to receipt by')
+ print(' destination GRO)')
 print('GroP50: Median GRO delay')
 print('GroP90: 90th percentile of GRO delay')
 print('FreP10: 10th percentile of free delay (time from NIC handoff '
@@ -10146,10 +10169,7 @@
 nic_delay = None
 if nic != None:
- if qdisc != None:
- nic_delay = nic - qdisc
- elif xmit != None:
- nic_delay = nic - xmit
+ nic_delay = nic - xmit
 if qid != None:
 qid_tsos[qid] += 1
@@ -10451,6 +10471,11 @@
 metavar='DIR', help='Some analyzers can generate data plots, but '
 'they will do so only if this option is specified; DIR gives the '
 'directory in which to place plots.')
+parser.add_option('--pkt-types', dest='pkt_types', default='data',
+ metavar='T', help='Used by some analyzers to determine which types of '
+ 'packets to include for analysis; a list of the values \'data\' for '
+ 'Homa data packets, \'tcp\' for TCP packets, and \'grant\' for Homa '
+ 'grants, or \'all\' to select all types (default: \'data\')')
parser.add_option('--rx-core', dest='rx_core', type=int,
default=None,
 metavar='C', help='If specified, some analyzers will ignore packets '
 'transmitted from cores other than C')

From 0803bfe2076cc74e57e2636ac50068fcbea31d07 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 1 Dec 2025 15:53:30 -0800
Subject: [PATCH 586/625] Modify ttsync.py to consider TCP packets as well as
 Homa packets

---
 util/ttsync.py | 67 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 63 insertions(+), 4 deletions(-)

diff --git a/util/ttsync.py b/util/ttsync.py
index 9ce3ca76..b61c2142 100755
--- a/util/ttsync.py
+++ b/util/ttsync.py
@@ -66,6 +66,14 @@
 # busy packets were received for rpc_id (rpc_id is the id on the receiver).
 recv_ctl = defaultdict(list)
+# pkt_id -> [time, node] for each TCP packet sent. pkt_id is a unique
+# identifier for the packet: saddr:sport:daddr:dport:sequence:data_bytes:ack.
+send_tcp = {}
+
+# pkt_id -> [time, node] for each TCP packet received. pkt_id has the same
+# structure as for send_tcp.
+recv_tcp = {}
+
 # List of [time, sender, receiver] with one entry for each FREEZE packet
 # sent. Time is the unadjusted time on the sender when the packet was sent.
 # sender is the sender node index, and receiver is the receiver *address*.
@@ -125,6 +133,11 @@ def parse_tt(tt, node_num):
 first_time = None
 last_time = None
+ # core -> saddr:sport:daddr:dport for a TCP packet, derived from the
+ # first of two associated time trace records for an event. Saves info
+ # for use by the second record.
+ tcp_id = defaultdict(lambda: None)
+
 for line in open(tt):
 num_records += 1
 match = re.match(' *([-0-9.]+) us .* us\) \[C([0-9]+)\] (.*)', line)
@@ -139,6 +152,42 @@
 match = re.match('.* id ([-0-9.]+),.* offset ([-0-9.]+)', msg)
 if not match:
+ match = re.match('Transmitting TCP packet from ([^:]+):([0-9]+) to '
+ '([^:]+):([0-9]+)', msg)
+ if match:
+ tcp_id[core] = '%s:%s:%s:%s' % (match.group(1), match.group(2),
+ match.group(3), match.group(4))
+ continue
+
+ match = re.match(r'Transmitting TCP packet .2. sequence ([-0-9]+), '
+ 'data bytes ([0-9]+), .* ack ([-0-9]+)', msg)
+ if match:
+ if tcp_id[core] != None:
+ id = '%s:%s:%s:%s' % (tcp_id[core], match.group(1),
+ match.group(2), match.group(3))
+ send_tcp[id] = [time, node_num]
+ sent += 1
+ tcp_id[core] = None
+ continue
+
+ match = re.match('tcp_gro_receive got packet from ([^:]+):([0-9]+) '
+ 'to ([^:]+):([0-9]+)', msg)
+ if match:
+ tcp_id[core] = '%s:%s:%s:%s' % (match.group(1), match.group(2),
+ match.group(3), match.group(4))
+ continue
+
+ match = re.match(r'tcp_gro_receive .2. sequence ([-0-9]+), '
+ 'data bytes ([0-9]+), .* ack ([-0-9]+)', msg)
+ if match:
+ if tcp_id[core] != None:
+ id = '%s:%s:%s:%s' % (tcp_id[core], match.group(1),
+ match.group(2), match.group(3))
+ recv_tcp[id] = [time, node_num]
+ recvd += 1
+ tcp_id[core] = None
+ continue
+
 match = re.match('retransmitting offset ([0-9.]+), .*id ([0-9.]+)',
 msg)
 if match:
@@ -251,8 +300,8 @@ def parse_tt(tt, node_num):

 def find_min_delays(num_nodes):
 """
- Combines the information in send_pkts and recv_pkts to fill in
- min_delays
+ Combines the information in send_pkts, recv_pkts, send_tcp, and
+ recv_tcp to fill in min_delays
 num_nodes: Total number of distinct nodes; node numbers in
 send_pkts and recv_pkts must be < num_nodes.
 """

 # Iterate over all the client-side events and match them to server-side
 # events if possible.
- for id, send_pkt in send_pkts.items(): + for id, send_info in send_pkts.items(): if not id in recv_pkts: continue - send_time, send_node = send_pkt + send_time, send_node = send_info recv_time, recv_node = recv_pkts[id] delay = recv_time - send_time if delay < min_delays[send_node][recv_node]: min_delays[send_node][recv_node] = delay min_recv_times[send_node][recv_node] = recv_time + for id, send_info in send_tcp.items(): + if not id in recv_tcp: + continue + send_time, send_node = send_info + recv_time, recv_node = recv_tcp[id] + delay = recv_time - send_time + if delay < min_delays[send_node][recv_node]: + min_delays[send_node][recv_node] = delay + min_recv_times[send_node][recv_node] = recv_time + def find_min_delays_alt(num_nodes): """ This function provides an alternate way to compute minimum delays, From 9f072f0af9ddd087fa78ee15e9f74e928b013e24 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 1 Dec 2025 15:54:32 -0800 Subject: [PATCH 587/625] Add more statistics to the nicbacklog analyzer in tthoma.py Print average bytes and packets owned by the NIC for each node. --- util/tthoma.py | 58 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index a2e197c1..60d77ab3 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -6267,7 +6267,7 @@ def output(self): # node -> list of packets transmitted by that node node_pkts = defaultdict(list) - # node-> list of interval stats for the largest size interval + # node -> list of interval stats for the largest size interval # (4*base_interval) on that node. Each list element consists of # : # time: End time of the interval @@ -6277,16 +6277,20 @@ def output(self): # free_bytes: Bytes of data in free_pkts node_intervals = defaultdict(list) - # Bytes and packets owned by the NIC as of current time - nic_pkts = 0 - nic_bytes = 0 + # node -> running sum of packets owned by NIC on that node * time. + node_nic_pkts = defaultdict(lambda : 0) + + # node -> running sum of bytes owned by NIC on that node * time. + node_nic_bytes = defaultdict(lambda : 0) print('\n--------------------') print('Analyzer: nicbacklog') print('--------------------') print('See data files %s/nicbacklog_*.dat' % (options.data)) - print('\nMaximum values observed for each node:') + print('\nSummary data for each node:') print('Node: Name of node') + print('AvgPkts: Average # packets owned by NIC at one time') + print('AvgKB: Average Kbytes of data in packets owned by NIC') print('MaxPkts: Maximum packets owned by NIC at one time') print('MaxKB: Maximum Kbytes of data in packets owned by NIC ' 'at one time') @@ -6299,15 +6303,32 @@ def output(self): print('MaxFrD: Maximum data rate from pkts freed in a %d usec ' 'interval (Gbps)' % (4 * base_interval)) print() - print('Node MaxPkts MaxKB MaxInP MaxInD MaxFrP MaxFrD') - print('------------------------------------------------------') + print('Node AvgPkts AvgKB MaxPkts MaxKB MaxInP MaxInD MaxFrP MaxFrD') + print('---------------------------------------------------------------------') - # Bucket all of the packets by transmitting node. + # Bucket all of the packets by transmitting node. Also compute + # average backlog data (this calculation will consider packets + # that don't have enough data to use in later calculations). 
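+        # (The averages are time-weighted integrals: each packet
+        # contributes the interval during which the NIC owned it to
+        # node_nic_pkts, and that interval times its tso_length to
+        # node_nic_bytes; dividing by the trace duration when printing
+        # yields the average packets and bytes owned by the NIC.)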
for pkt in itertools.chain(packets.values(), tcp_packets.values()): - if (not 'nic' in pkt or not 'free_tx_skb' in pkt or - not 'tso_length' in pkt): + if not 'tso_length' in pkt or not 'tx_node' in pkt: continue - node_pkts[pkt['tx_node']].append(pkt) + node = pkt['tx_node'] + if 'nic' in pkt: + t1 = pkt['nic'] + else: + if not 'free_tx_skb' in pkt: + continue + t1 = traces[node]['first_time'] + if 'free_tx_skb' in pkt: + t2 = pkt['free_tx_skb'] + else: + t2 = traces[node]['last_time'] + delta_t = t2 - t1 + node_nic_pkts[node] += delta_t + node_nic_bytes[node] += delta_t * pkt['tso_length'] + + if 'nic' in pkt and 'free_tx_skb' in pkt: + node_pkts[node].append(pkt) # Each iteration in this loops generates data for one node. for node in get_sorted_nodes(): @@ -6392,6 +6413,10 @@ def output(self): max_free_pkts = 0 max_free_bytes = 0 + # Bytes and packets owned by the NIC as of current time + nic_pkts = 0 + nic_bytes = 0 + pkts = sorted(node_pkts[node], key = lambda pkt : pkt['nic']) interval_end = (math.ceil(pkts[0]['nic'] / base_interval) * base_interval) @@ -6408,16 +6433,16 @@ def output(self): while cur < len(pkts) and pkts[cur]['nic'] <= interval_end: pkt = pkts[cur] + # print('\n%9.3f: to Nic: %s' % (pkt['nic'], pkt['free_tx_skb'])) cur += 1 in_pkts += 1 in_bytes += pkt['tso_length'] heapq.heappush(active, [pkt['free_tx_skb'], cur, pkt]) - # print('\n%9.3f: to Nic: %s' % (pkt['nic'], pkt['free_tx_skb'])) while len(active) > 0 and active[0][0] < interval_end: pkt = heapq.heappop(active)[2] + # print('\n%9.3f: freed: %s' % (pkt['free_tx_skb'], pkt)) free_pkts += 1 free_bytes += pkt['tso_length'] - # print('\n%9.3f: freed: %s' % (pkt['free_tx_skb'], pkt)) nic_pkts += in_pkts - free_pkts nic_bytes += in_bytes - free_bytes @@ -6471,8 +6496,11 @@ def output(self): interval_end += base_interval f.close() - print('%-10s %6d %6d %6d %7.2f %6d %7.2f' % ( - node, max_pkts, max_bytes/1000, + node_time = traces[node]['last_time'] - traces[node]['first_time'] + print('%-10s %6d %6d %7d %6d %6d %7.2f %6d %7.2f' % (node, + node_nic_pkts[node]/node_time, + 1e-3*node_nic_bytes[node]/node_time, + max_pkts, max_bytes/1000, max_in_pkts, max_in_bytes*8/(4000*base_interval), max_free_pkts, max_free_bytes*8/(4000*base_interval))) From f389234c71f69481ab60859454690694f08f49c8 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 2 Dec 2025 14:51:32 -0800 Subject: [PATCH 588/625] More features for tthoma.py * New function print_rpcs * New analyzer: p99short * Add statistics on packet rates and throughput to activity analyzer --- util/tthoma.py | 242 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 218 insertions(+), 24 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index 60d77ab3..8f3fc7b5 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -954,28 +954,29 @@ def print_pkts(pkts, header=True): """ buf = StringIO() - buf.write('# Source: Node that sent packet\n') - buf.write('# Dest: Node to which packet was sent\n') - buf.write('# Xmit: Time when packet was passed to ip*xmit\n') - buf.write('# Qdisc: Time when homa_qdisc requeued packet after ' - 'deferral, if any\n') - buf.write('# Id/Seq: RPC identifier for Homa packets, sequence ' - 'number for TCP\n') - buf.write('# Offset: Offset of packet within message or "TCP" if ' - 'packet is TCP\n') - buf.write('# Length: Size of packet (before segmentation)\n') - buf.write('# Qid: Transmit queue on which packet was sent\n') - buf.write('# Nic: Time when packet was queued for NIC\n') - buf.write('# NDelay: Nic - Xmit\n') - 
buf.write('# Gro: Time when packet was received by GRO\n') - buf.write('# GDelay: Gro - Nic\n') - buf.write('# Free: Time when sk_buff was released on sender\n') - buf.write('# FDelay: Free - Nic\n') - buf.write('# Rx: Number of times segments in the packet were ' - 'retransmitted\n\n') - buf.write('Source Dest Xmit Qdisc Id/Seq Offset') - buf.write(' Length Qid Nic NDelay Gro GDelay') - buf.write(' Free FDelay Rx\n') + if header: + buf.write('# Source: Node that sent packet\n') + buf.write('# Dest: Node to which packet was sent\n') + buf.write('# Xmit: Time when packet was passed to ip*xmit\n') + buf.write('# Qdisc: Time when homa_qdisc requeued packet after ' + 'deferral, if any\n') + buf.write('# Id/Seq: RPC identifier for Homa packets, sequence ' + 'number for TCP\n') + buf.write('# Offset: Offset of packet within message or "TCP" if ' + 'packet is TCP\n') + buf.write('# Length: Size of packet (before segmentation)\n') + buf.write('# Qid: Transmit queue on which packet was sent\n') + buf.write('# Nic: Time when packet was queued for NIC\n') + buf.write('# NDelay: Nic - Xmit\n') + buf.write('# Gro: Time when packet was received by GRO\n') + buf.write('# GDelay: Gro - Nic\n') + buf.write('# Free: Time when sk_buff was released on sender\n') + buf.write('# FDelay: Free - Nic\n') + buf.write('# Rx: Number of times segments in the packet were ' + 'retransmitted\n\n') + buf.write('Source Dest Xmit Qdisc Id/Seq Offset') + buf.write(' Length Qid Nic NDelay Gro GDelay') + buf.write(' Free FDelay Rx\n') for pkt in pkts: xmit = pkt['xmit'] if 'qdisc_xmit' in pkt: @@ -1028,6 +1029,63 @@ def print_pkts(pkts, header=True): buf.write('\n') return buf.getvalue() +def print_rpcs(client_rpcs, header=True): + """ + Returns a string containing one line for each RPC in client_rpcs, which + contains various useful statistics about the RPC. The RPCs are all + assumed to be client-side RPCs. If header is True then the string also + includes initial text describing the fields that are printed on each line. 
+ """ + global rpcs + + buf = StringIO() + if header: + buf.write('# Client: Node that sent the RPC request\n') + buf.write('# Server: Node that handled the RPC and sent response\n') + buf.write('# Id: RPC identifier (client side)\n') + buf.write('# Length: Length of request message\n') + buf.write('# RqNic: Elapsed time from sendmsg until first ' + 'request packet handed\n') + buf.write('# off to NIC\n') + buf.write('# RqGRO: Time from NIC handoff to GRO receipt for ' + 'first request packet\n') + buf.write('# RqSoft: Time from GRO to SoftIRQ for first request ' + 'packet\n') + buf.write('# RqRecv: Time from SoftIRQ for first request packet ' + 'until recvmsg completes\n') + buf.write('# on server\n') + buf.write('# Srvc: Time from recvmsg return on server until ' + 'sendmsg for response\n') + buf.write('# RspNic: Elapsed time from sendmsg of response until ' + 'first packet handed\n') + buf.write('# off to NIC\n') + buf.write('# RspGRO: Time from NIC handoff to GRO receipt for ' + 'first response packet\n') + buf.write('# RspSoft: Time from GRO to SoftIRQ for first response ' + 'packet\n') + buf.write('# RspRecv: Time from SoftIRQ for first response packet ' + 'until RPC completes\n') + buf.write('# Total: End-to-end RTT\n\n') + buf.write('Client Server Id Length RqNic RqGRO ') + buf.write('RqSoft RqRecv Srvc RspNic RspGRO ') + buf.write('RspSoft RspRecv Total\n') + for rpc in client_rpcs: + srpc = rpcs[rpc['id'] ^ 1] + tx = rpc['send_data_pkts'][0] + rx = rpc['softirq_data_pkts'][0] + buf.write('%-8s %-8s %10s %7d %6.1f %6.1f' % ( + tx['tx_node'], tx['rx_node'], rpc['id'], rpc['out_length'], + tx['nic'] - rpc['sendmsg'], tx['gro'] - tx['nic'])) + buf.write(' %6.1f %6.1f %6.1f %6.1f %6.1f' % ( + tx['softirq'] - tx['gro'], + srpc['recvmsg_done'] - tx['softirq'], + srpc['sendmsg'] - srpc['recvmsg_done'], + rx['nic'] - srpc['sendmsg'], rx['gro'] - rx['nic'])) + buf.write(' %7.1f %7.1f %6.1f\n' % ( + rx['softirq'] - rx['gro'], rpc['recvmsg_done'] - rx['softirq'], + rpc['recvmsg_done'] - rpc['sendmsg'])) + return buf.getvalue() + def require_options(analyzer, *args): """ For each argument, ensures that the associated option has been specified; @@ -2305,6 +2363,75 @@ def analyze(self): for time, offset, length in rpc['send_data']: self.node_out_bytes[node] += length + def print_rates(self): + """ + Print summary information about packet and data rates for both Homa + and TCP. 
+ """ + global packets, grants, tcp_packets + + # node -> dictionary of information about that node: + # homa_pkts: Total Homa data packets sent by the node + # homa_bytes: Total Homa message bytes sent by the node + # homa_grants: Homa grants sent by the node + # tcp_pkts: TCP packets with data sent by the node + # tcp_bytes: Total data sent in TCP packets + # tcp_acks: TCP packets with no data + nodes = defaultdict(lambda : defaultdict(lambda: 0)) + + for pkt in packets.values(): + if not 'tx_node' in pkt or not 'tso_length' in pkt: + continue + node_stats = nodes[pkt['tx_node']] + node_stats['homa_pkts'] += 1 + node_stats['homa_bytes'] += pkt['tso_length'] + + for pkt in grants.values(): + if not 'tx_node' in pkt: + continue + node_stats = nodes[pkt['tx_node']] + node_stats['homa_grants'] += 1 + + for pkt in tcp_packets.values(): + if not 'tx_node' in pkt: + continue + node_stats = nodes[pkt['tx_node']] + if not 'tso_length' in pkt: + continue + length = pkt['tso_length'] + if length > 0: + node_stats['tcp_pkts'] += 1 + node_stats['tcp_bytes'] += length + else: + node_stats['tcp_acks'] += 1 + + print('Summary of packet and data rates for Homa and TCP for each node:') + print('HomaGbps: Rate of outgoing Homa message data from node ' + '(Gbps)') + print('HomaPkts: Rate of outgoing Homa data packets from node ' + '(K pkts/sec)') + print('HomaGrants: Rate of outgoing Homa grant packets from node ' + '(K pkts/sec)') + print('TcpGbps: Rate of outgoing TCP data from node (Gbps)') + print('TcpPkts: Rate of outgoing TCP packets with data from node ' + '(K pkts/sec)') + print('TcpAcks: Rate of outgoing TCP ack packets from node ' + '(K pkts/sec)') + print() + print('Node HomaGbps HomaPkts HomaGrants TcpGbps TcpPkts TcpAcks') + print('-----------------------------------------------------------------') + for node in get_sorted_nodes(): + node_stats = nodes[node] + usecs = traces[node]['last_time'] - traces[node]['first_time'] + print('%-10s %7.2f %7.1f %7.1f %7.2f %7.1f %7.1f' % ( + node, node_stats['homa_bytes'] * 8e-3 / usecs, + node_stats['homa_pkts'] * 1e3 / usecs, + node_stats['homa_grants'] * 1e3 / usecs, + node_stats['tcp_bytes'] * 8e-3 / usecs, + node_stats['tcp_pkts'] * 1e3 / usecs, + node_stats['tcp_acks'] * 1e3 / usecs + )) + def sum_list(self, events): """ Given a list of entries where event is 'start' or 'end', @@ -2318,6 +2445,9 @@ def sum_list(self, events): cur_live = 0 live_time = 0 live_integral = 0 + + if not events: + return [0, 0, 0] last_time = events[0][0] for time, event in events: @@ -2396,7 +2526,8 @@ def output(self): avg_gbps = total_bytes*8e-3 / elapsed print('%-10s %6d %7.2f %9.3f %8.2f %7.2f %7.2f' % ( node, msgs, rate, liveFrac, avgLive, avg_gbps, - avg_gbps/liveFrac), end='') + avg_gbps/liveFrac if liveFrac != 0 else 0, + ), end='') print(' %5.2f (C%02d) %6.3f (C%02d) %6.3f (C%02d)' % ( max_gbps, max_core, div_safe(max_rpcs, total_rpcs), max_rpcs_core, div_safe(max_pending, total_pending), @@ -2415,7 +2546,10 @@ def output(self): avg_gbps = bytes*8e-3 / elapsed print('%-10s %6d %7.2f %9.3f %8.2f %7.2f %7.2f' % ( node, msgs, rate, liveFrac, avgLive, avg_gbps, - avg_gbps/liveFrac)) + avg_gbps/liveFrac if liveFrac != 0 else 0)) + + print() + self.print_rates() if options.data: for node in get_sorted_nodes(): @@ -6984,6 +7118,66 @@ def output(self): print(info, end='') count += 1 +#------------------------------------------------ +# Analyzer: p99short +#------------------------------------------------ +class AnalyzeP99short: + """ + Selects the 1% of short RPCs (those 
with single-packet request and + response messages) and breaks down the delay both for the overall + RPCs and for their constituent packets. + """ + def __init__(self, dispatcher): + dispatcher.interest('AnalyzeRpcs') + dispatcher.interest('AnalyzePackets') + + def output(self): + global rpcs + + # tuples for all short rpcs. + short_rpcs = [] + + for rpc in rpcs.values(): + # Select only client RPCs, and make sure there is complete + # information for each RPC. + if rpc['id'] & 0x1: + continue + if not 'sendmsg' in rpc or not 'recvmsg_done' in rpc: + continue + peer = get_rpc_node(rpc['id'] ^ 1) + if not peer: + continue + if rpc['sendmsg'] < traces[peer]['first_time']: + continue + if rpc['recvmsg_done'] > traces[peer]['last_time']: + continue + if len(rpc['send_data_pkts']) != 1: + continue + if len(rpc['softirq_data_pkts']) != 1: + continue + short_rpcs.append([rpc['recvmsg_done'] - rpc['sendmsg'], rpc]) + + print('\n------------------') + print('Analyzer: p99short') + print('------------------') + + if not short_rpcs: + print('Couldn\'t find any single-packet RPCs') + return + short_rpcs.sort(key=lambda t: t[0]) + packets = [] + slow_rpcs = [] + for rtt, rpc in reversed(short_rpcs[99*len(short_rpcs)//100:]): + slow_rpcs.append(rpc) + packets.append(rpc['send_data_pkts'][0]) + packets.append(rpc['softirq_data_pkts'][0]) + print('The slowest 1% of short RPCs (those with a single packet for ' + 'request and') + print('response):') + print(print_rpcs(slow_rpcs, header=True), end='') + print('\nPackets from the slow RPCs:') + print(print_pkts(packets, header=True), end='') + #------------------------------------------------ # Analyzer: packet #------------------------------------------------ From ed468e0c594503bf6327218603334de24cae75b4 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 2 Dec 2025 15:11:19 -0800 Subject: [PATCH 589/625] Add more data to delay analyzer For SoftIRQ to app delays, separate those on client vs. server (goal: make it easier to see that there aren't enough threads for the workload). --- util/tthoma.py | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index 8f3fc7b5..99e9cadc 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -3146,7 +3146,12 @@ def __init__(self, dispatcher): self.app_sleep_wakeups = [] # for softirq->app handoffs when RPC was queued - self.app_queue_wakeups = [] + # (for request messages, i.e. on server) + self.app_queue_req_wakeups = [] + + # for softirq->app handoffs when RPC was queued + # (for response messages, i.e. 
on client) + self.app_queue_rsp_wakeups = [] # An entry exists for RPC id if a handoff occurred while a # thread was polling @@ -3189,8 +3194,12 @@ def tt_wait_found_rpc(self, trace, time, core, id, type, blocked): self.app_poll_wakeups.append([delay, time, trace['node']]) del self.rpc_handoffs[id] elif id in self.rpc_queued and blocked == 0: - self.app_queue_wakeups.append([time - self.rpc_queued[id], time, - trace['node']]) + if id & 0x1: + self.app_queue_req_wakeups.append([time - self.rpc_queued[id], + time, trace['node']]) + else: + self.app_queue_rsp_wakeups.append([time - self.rpc_queued[id], + time, trace['node']]) del self.rpc_queued[id] def print_pkt_delays(self): @@ -3541,31 +3550,34 @@ def print_wakeup_delays(self): app_poll.sort() app_sleep = self.app_sleep_wakeups app_sleep.sort() - app_queue = self.app_queue_wakeups - app_queue.sort() + app_queue_req = self.app_queue_req_wakeups + app_queue_req.sort() + app_queue_rsp = self.app_queue_rsp_wakeups + app_queue_rsp.sort() print('\nDelays in handing off from one core to another:') - print(' Count Min P10 P50 P90 P99 ' - 'Max Avg') + print(' Count Min P10 P50 ' + 'P90 P99 Max Avg') print('------------------------------------------------------------' - '---------------------') + '------------------------') def print_percentiles(label, data): num = len(data) if num == 0: - print('%-26s %6d' % (label, 0)) + print('%-30s %6d' % (label, 0)) else: - print('%-26s %6d %5.1f %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f' + print('%-30s %6d %5.1f %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f' % (label, num, data[0][0], data[10*num//100][0], data[50*num//100][0], data[90*num//100][0], data[99*num//100][0], data[num-1][0], list_avg(data, 0))) print_percentiles('GRO to SoftIRQ:', soft) print_percentiles('SoftIRQ to polling app:', app_poll) print_percentiles('SoftIRQ to sleeping app:', app_sleep) - print_percentiles('SoftIRQ to app via queue:', app_queue) + print_percentiles('SoftIRQ to server via queue:', app_queue_req) + print_percentiles('SoftIRQ to client via queue:', app_queue_rsp) verbose = 'Worst-case handoff delays:\n' - verbose += 'Type Delay (us) End Time Node Pctl\n' - verbose += '--------------------------------------------------------------\n' + verbose += 'Type Delay (us) End Time Node Pctl\n' + verbose += '------------------------------------------------------------------\n' def print_worst(label, data): # The goal is to print about 10 records covering the 98th-100th @@ -3579,7 +3591,7 @@ def print_worst(label, data): if i < 0: break time, delay, node = data[i] - result += '%-26s %6.1f %9.3f %10s %5.1f\n' % ( + result += '%-30s %6.1f %9.3f %10s %5.1f\n' % ( label, time, delay, node, 100*i/(num-1) if num > 1 else 100) return result @@ -3587,7 +3599,8 @@ def print_worst(label, data): verbose += print_worst('GRO to SoftIRQ', soft) verbose += print_worst('SoftIRQ to polling app', app_poll) verbose += print_worst('SoftIRQ to sleeping app', app_sleep) - verbose += print_worst('SoftIRQ to app via queue', app_queue) + verbose += print_worst('SoftIRQ to server via queue', app_queue_req) + verbose += print_worst('SoftIRQ to client via queue', app_queue_rsp) return verbose def output(self): From 9729dff30930cce3646cb2aadaebfb8f98ce895b Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 3 Dec 2025 09:08:04 -0800 Subject: [PATCH 590/625] Add TCP packets to the intervals analyzer in tthoma.py This makes TCP information available in other analyzers that use intervals, such as txintervals. 
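
As a minimal sketch (not part of the diff below), the iteration pattern
this change introduces looks like the following: analyzers walk both
packet tables with itertools.chain, and TCP entries can be recognized
by their 'id' field, which is always 0 (Homa packets always have a
nonzero id). The dictionaries here are toy stand-ins for the real
packets and tcp_packets tables:

    import itertools

    packets = {'1234:0': {'id': 1234, 'tso_length': 4200}}      # Homa
    tcp_packets = {'tcp-example': {'id': 0, 'tso_length': 9000}}

    homa_bytes = tcp_bytes = 0
    for pkt in itertools.chain(packets.values(), tcp_packets.values()):
        if not 'tso_length' in pkt:
            continue
        if pkt['id'] == 0:
            tcp_bytes += pkt['tso_length']      # TCP packet
        else:
            homa_bytes += pkt['tso_length']     # Homa packet
    print(homa_bytes, tcp_bytes)                # prints 4200 9000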
--- util/tthoma.py | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index 99e9cadc..4b9deeb5 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -252,6 +252,8 @@ def __missing__(self, key): # total_length: Total length of the packet, including IP and TCP headers # ack: The ack sequence number in the packet # xmit: Time when ip*xmit was invoked for the packet +# xmit2: qdisc_xmit if it exists, otherwise xmit: a time when Homa +# has decided to transmit the packet (after any Homa queuing) # qdisc_xmit: Time when homa_qdisc requeued a packet that was deferred # because of NIC queue length (only present for deferred # packets) @@ -260,6 +262,7 @@ def __missing__(self, key): # happen until the packet has been fully transmitted. # gro: Time when GRO received the packet # tx_node: Node that sent the packet (corresponds to saddr) +# tx_qid: NIC channel on which packet was transmitted # rx_node: Node that received the packet (corresponds to daddr) # retransmits: Always empty (for compatibility with Homa packets) tcp_packets = {} @@ -4974,20 +4977,20 @@ def analyze(self): # See if packets include NIC xmit times nic_data = False - for pkt in packets.values(): + for pkt in itertools.chain(packets.values(), tcp_packets.values()): if ('xmit' in pkt) and ('gro' in pkt): if 'nic' in pkt: nic_data = True break # Extract information from packets - for pkt in packets.values(): + for pkt in itertools.chain(packets.values(), tcp_packets.values()): if (self.tx_qid != None) and ((not 'tx_qid' in pkt) or (pkt['tx_qid'] != self.tx_qid)): continue tx_node = pkt['tx_node'] if 'tx_node' in pkt else None if not 'length' in pkt: - print('Packet with no length: %s' % (pkt)) + print('Packet with no length: %s' % (pkt), file=sys.stderr) continue length = pkt['length'] txmit = pkt['xmit2'] if 'xmit2' in pkt else None @@ -5014,8 +5017,8 @@ def analyze(self): else: tgro = None - # For most tx statistics, process only the overall TSO frame, - # not the individual segments + # For most tx statistics, process only the original TSO frame, + # not the generated segments if ('tso_length' in pkt): tso_length = pkt['tso_length'] @@ -5036,13 +5039,12 @@ def analyze(self): interval = get_interval(tx_node, txmit) interval['tx_pkts'] += 1 interval['tx_bytes'] += tso_length - if 'nic' in pkt: - add_to_intervals(tx_node, txmit, pkt['nic'], + if tnic != None: + add_to_intervals(tx_node, txmit, tnic, 'tx_qdisc', tso_length) if tnic != None: - nic_interval = get_interval(tx_node, tnic) - node_xmits[tx_node].append([pkt['nic'], + node_xmits[tx_node].append([tnic, tso_length + data_overhead_bytes]) nic_interval['tx_nic_pkts'] += 1 nic_interval['tx_nic_bytes'] += tso_length @@ -5113,7 +5115,9 @@ def analyze(self): else: add_to_intervals(rx_node, traces[rx_node]['first_time'], tsoftirq, 'rx_data_gro', length) - elif tgro != None: + elif tgro != None and pkt['id'] != 0: + # Note: TCP doesn't yet provide softirq times, hence the + # exclusion above. 
add_to_intervals(rx_node, tgro, traces[rx_node]['last_time'], 'rx_data_gro', length) @@ -6486,9 +6490,9 @@ def output(self): f.write('# NIC backlog (packets passed to the NIC but not yet ' 'returned to the\n') f.write('# kernel) as a function of time\n') - f.write('# Time: Time of measurement (usecs)\n') - f.write('# NicPkts: Packets currently owned by NIC\n') - f.write('# NicKB: Kbytes of data in packets currently owned by NIC\n') + f.write('# Time: Time of measurement (usecs)\n') + f.write('# NicPkts: Packets currently owned by NIC\n') + f.write('# NicKB: Kbytes of data in packets currently owned by NIC\n') f.write('# %-8s Packets passed to NIC in last %d usecs\n' % ('InP%d:' % (base_interval), base_interval)) @@ -7552,12 +7556,14 @@ def tt_xmit_tcp(self, trace, t, core, saddr, sport, daddr, dport, sequence, data_bytes, total, ack): tcp_pkt = get_tcp_packet(saddr, sport, daddr, dport, sequence, data_bytes, ack) + # if 'xmit' in tcp_pkt: + # print('%9.3f: Duplicate TCP packet transmission on node %s (previous: %.3f)' % (t, + # trace['node'], tcp_pkt['xmit'])) node = trace['node'] tcp_pkt['xmit'] = t + tcp_pkt['xmit2'] = t tcp_pkt['total_length'] = total tcp_pkt['tx_node'] = node - if sequence == 1749134782: - print('tt_xmit_tcp setting tx_node to %s' % (node)) if not saddr in peer_nodes and saddr != '0x00000000': peer_nodes[saddr] = node @@ -7567,6 +7573,7 @@ def tt_qdisc_tcp(self, trace, t, core, saddr, sport, daddr, dport, sequence, data_bytes, ack) node = trace['node'] tcp_pkt['qdisc_xmit'] = t + tcp_pkt['xmit2'] = t tcp_pkt['tx_node'] = node if sequence == 1749134782: print('tt_qdisc_tcp setting tx_node to %s' % (node)) From b66a304f1166ef5597ae844956495bcdc65eb891 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 3 Dec 2025 11:50:21 -0800 Subject: [PATCH 591/625] Add txqstop analyzer to tthoma.py --- util/tthoma.py | 140 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) diff --git a/util/tthoma.py b/util/tthoma.py index 4b9deeb5..f352326d 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -2247,6 +2247,29 @@ def __gro_tcp2(self, trace, time, core, match, interests): '([0-9]+), total length ([0-9]+), ack ([-0-9]+)' }) + def __txq_stop(self, trace, time, core, match, interests): + queue = match.group(1) + limit = int(match.group(2)) + queued = int(match.group(3)) + for interest in interests: + interest.tt_txq_stop(trace, time, core, queue, limit, queued) + + patterns.append({ + 'name': 'txq_stop', + 'regexp': r'netdev_tx_sent_queue stopped queue (0x[a-f0-9]+): limit ' + '([0-9]+), queued ([0-9]+)' + }) + + def __txq_restart(self, trace, time, core, match, interests): + queue = match.group(1) + for interest in interests: + interest.tt_txq_restart(trace, time, core, queue) + + patterns.append({ + 'name': 'txq_restart', + 'regexp': r'netdev_tx_completed_queue restarted queue (0x[a-f0-9]+)' + }) + #------------------------------------------------ # Analyzer: activity #------------------------------------------------ @@ -10516,6 +10539,123 @@ def print_type(delays): print(node_info) print(q_details, end='') +#------------------------------------------------ +# Analyzer: txqstop +#------------------------------------------------ +class AnalyzeTxqstop: + """ + Prints information about transmit queue stoppage, where netdev_tx + refuses to transmit packets on a dev_queue because there is too much + data that has been handed off to the NIC but not yet returned after + transmission. 
+ """ + + def __init__(self, dispatcher): + # node -> list of events for that node. Each event is a tuple + # , where queue is the identifier for a + # dev_queue and what is either "stop" or "restart". Events are + # not guaranteed to be in time order. + self.events = defaultdict(list) + + # node -> maximum queue length limit observed for that node + self.max_limit = defaultdict(lambda: 0) + + # node -> maximum queue length limit observed for that node + self.min_limit = defaultdict(lambda: 1e20) + + def init_trace(self, trace): + # queue identifier -> 1. An entry exists for a queue if a + # queue stoppage event has been seen for that queue (used to + # fill in missing stop events). + self.stopped = {} + + # Name of node for the current trace file. + self.node = trace['node'] + + def tt_txq_stop(self, trace, t, core, queue, limit, queued): + self.stopped[queue] = 1 + self.events[self.node].append([t, queue, 'stop']) + if limit > self.max_limit[self.node]: + self.max_limit[self.node] = limit + if limit < self.min_limit[self.node]: + self.min_limit[self.node] = limit + + def tt_txq_restart(self, trace, t, core, queue): + if not queue in self.stopped: + self.events[self.node].append([trace['first_time'], queue, 'stop']) + self.events[self.node].append([t, queue, 'restart']) + + def output(self): + + print('\n-----------------') + print('Analyzer: txqstop') + print('-----------------') + print() + print('Statistics on dev_queues that have been stopped by Linux ' + 'because there') + print('are too many bytes of packet data currently in the NIC\'s ' + 'possession for') + print('that queue:') + print('Node: Node whose data follows on this line') + print('Stopped: Fraction of time when at least one txq was stopped') + print('Avg: Average number of txqs stopped') + print('Stop1: Fraction of time when 1 txq was stopped') + print('Stop2: Fraction of time when 2 txqs were stopped') + print('Stop3: Fraction of time when 3 txqs were stopped') + print('StopMany: Fraction of time when >3 txqs were stopped') + print('LimitMin: Minimum observed value of length limit for a txq') + print('LimitMax: Maximum observed value of length limit for a txq') + print() + print('Node Stopped Avg Stop1 Stop2 Stop3 StopMany LimitMin LimitMax') + + for node in get_sorted_nodes(): + # queue identifier -> 1 if that queue is currently stopped; + # no entry if queue is running + stopped = {} + + # Used to compute the average number of queues stopped; sum of + # (time_delta * stopped) + avg_stopped = 0 + + # Total time that [1, 2, 3, >3] queues were stopped. + stop_time = [0, 0, 0, 0] + + # Time of last event processed. 
+ prev_t = traces[node]['first_time'] + + if not self.events[node]: + print('%-8s No queue stoppage events detected' % (node)) + continue + + for event in sorted(self.events[node], key = lambda t: t[0]): + t, queue, what = event + + interval = t - prev_t + num_stopped = len(stopped) + if num_stopped > 0: + avg_stopped += interval * num_stopped + index = num_stopped - 1 if num_stopped <= 3 else 3 + stop_time[index] += interval + # if num_stopped > 3: + # print('%9.3f: %d queues stopped on %s: %s' % (t, + # num_stopped, node, sorted(stopped.keys()))) + if what == 'stop': + stopped[queue] = 1 + elif what == 'restart': + if queue in stopped: + del stopped[queue] + else: + raise Exception('Bad \'what\' field in txqstop event: %s' % + (what)) + prev_t = t + + total_t = prev_t - traces[node]['first_time'] + print('%-8s %5.3f %6.2f %5.3f %5.3f %5.3f %5.3f %7d %7d' % ( + node, sum(stop_time) / total_t, avg_stopped / total_t, + stop_time[0] / total_t, stop_time[1] / total_t, + stop_time[2] / total_t, stop_time[3] / total_t, + self.min_limit[node], self.max_limit[node])) + #------------------------------------------------ # Analyzer: txsnapshot #------------------------------------------------ From a15950c5349906592db3b87f45c64b62227a3263 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 10 Dec 2025 16:27:30 -0800 Subject: [PATCH 592/625] Restructure message_header in cp_node.cc Distinguish request and response messages (needed to detect TCP RPCs using timetrace records). --- util/cp_node.cc | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/util/cp_node.cc b/util/cp_node.cc index 42c511f6..aad59d3f 100644 --- a/util/cp_node.cc +++ b/util/cp_node.cc @@ -482,7 +482,20 @@ struct message_header { * @length: total number of bytes in the message, including this * header. */ - int length:30; + int32_t length; + + /** + * @cid: uniquely identifies the connection between a client + * and a server. + */ + conn_id cid; + + /** + * @msg_id: created by client, returned by server so client can + * match responses to requests; distinguishes among concurrent + * outstanding requests from a client. Not unique across all time. + */ + uint16_t msg_id; /** @freeze: true means the recipient should freeze its time trace. */ unsigned int freeze:1; @@ -494,16 +507,12 @@ struct message_header { unsigned int short_response:1; /** - * @cid: uniquely identifies the connection between a client - * and a server. + * @response: nonzero means this is a response messages, zero means + * request */ - conn_id cid; + unsigned int response:1; - /** - * @msg_id: unique identifier for this message among all those - * from a given client machine. 
- */ - uint32_t msg_id; + unsigned int reserved:13; }; /** @@ -1111,6 +1120,7 @@ void homa_server::server(int thread_id, server_metrics *metrics) if ((header->short_response) && (header->length > 100)) { header->length = 100; } + header->response = 1; num_vecs = 0; offset = 0; @@ -1497,6 +1507,7 @@ void tcp_server::read(int fd, int pid) } if ((header->short_response) && (header->length > 100)) header->length = 100; + header->response = 1; metrics->bytes_out += header->length; if (!connections[fd]->send_message(header)) connections[fd]->set_epoll_events(epoll_fd, @@ -2175,9 +2186,10 @@ void homa_client::sender() rinfos[slot].request_length = header->length; header->cid = server_conns[server]; header->cid.client_port = id; + header->msg_id = slot; header->freeze = freeze[header->cid.server]; header->short_response = one_way; - header->msg_id = slot; + header->response = 0; tt("sending request, cid 0x%08x, id %u, length %d", header->cid, header->msg_id, header->length); @@ -2588,6 +2600,7 @@ void tcp_client::sender() header.msg_id = slot; header.freeze = freeze[header.cid.server]; header.short_response = one_way; + header.response = 0; size_t old_pending = connections[server]->pending(); tt("Sending TCP request, cid 0x%08x, id %u, length %d, pid %d", header.cid, header.msg_id, header.length, From f56c222cbcabe789891f42a7fbf57bebfe971d64 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 10 Dec 2025 16:37:56 -0800 Subject: [PATCH 593/625] Update ttsync.py for new TCP timetrace record formats --- util/ttsync.py | 57 ++++++++++++++++---------------------------------- 1 file changed, 18 insertions(+), 39 deletions(-) diff --git a/util/ttsync.py b/util/ttsync.py index b61c2142..8c651b13 100755 --- a/util/ttsync.py +++ b/util/ttsync.py @@ -133,11 +133,6 @@ def parse_tt(tt, node_num): first_time = None last_time = None - # core -> saddr:sport:daddr:dport for a TCP packet, derived from the - # first of two associated time trace records for an event. Saves info - # for use by the second record - tcp_id = defaultdict(lambda: None) - for line in open(tt): num_records += 1 match = re.match(' *([-0-9.]+) us .* us\) \[C([0-9]+)\] (.*)', line) @@ -152,40 +147,23 @@ def parse_tt(tt, node_num): match = re.match('.* id ([-0-9.]+),.* offset ([-0-9.]+)', msg) if not match: - match = re.match('Transmitting TCP packet from ([^:]+):([0-9]+) to ' - '([^:]+):([0-9]+)', msg) + match = re.match('Transmitting TCP packet from (0x[a-f0-9]+) to ' + '(0x[a-f0-9]+), data bytes ([0-9]+), seq/ack ([0-9]+)', msg) if match: - tcp_id[core] = '%s:%s:%s:%s' % (match.group(1), match.group(2), + id = '%s:%s:%s:%s' % (match.group(1), match.group(2), match.group(3), match.group(4)) + send_tcp[id] = [time, node_num] + sent += 1 continue - match = re.match(r'Transmitting TCP packet .2. sequence ([-0-9]+), ' - 'data bytes ([0-9]+), .* ack ([-0-9]+)', msg) - if match: - if tcp_id[core] != None: - id = '%s:%s:%s:%s' % (tcp_id[core], match.group(1), - match.group(2), match.group(3)) - send_tcp[id] = [time, node_num] - sent += 1 - tcp_id[core] = None - continue - - match = re.match('tcp_gro_receive got packet from ([^:]+):([0-9]+) ' - 'to ([^:]+):([0-9]+)', msg) + match = re.match('tcp_gro_receive got packet from ' + '(0x[a-f0-9]+) to (0x[a-f0-9]+), data bytes ([0-9]+), ' + 'seq/ack ([0-9]+)', msg) if match: - tcp_id[core] = '%s:%s:%s:%s' % (match.group(1), match.group(2), + id = '%s:%s:%s:%s' % (match.group(1), match.group(2), match.group(3), match.group(4)) - continue - - match = re.match(r'tcp_gro_receive .2. 
sequence ([-0-9]+), ' - 'data bytes ([0-9]+), .* ack ([-0-9]+)', msg) - if match: - if tcp_id[core] != None: - id = '%s:%s:%s:%s' % (tcp_id[core], match.group(1), - match.group(2), match.group(3)) - recv_tcp[id] = [time, node_num] - recvd += 1 - tcp_id[core] = None + recv_tcp[id] = [time, node_num] + recvd += 1 continue match = re.match('retransmitting offset ([0-9.]+), .*id ([0-9.]+)', @@ -344,7 +322,7 @@ def find_min_delays_alt(num_nodes): global send_ctl, recv_ctl, send_freeze, recv_freeze global min_delays, min_recv_times, addr_node_num - # Resend and busy packet are problematic because they are not unique: + # Resend and busy packets are problematic because they are not unique: # there can be several identical packets between the same pair of nodes. # Here's how this function matches up sends and receives: # * Start from freeze packets, which are unique; use them to compute @@ -435,7 +413,7 @@ def peer_id(id): print('File: Name of trace file') print('Records: Total number of timetrace records') print('Sends: Total number of packets sent') -print('Receives: Total number of packets redeived (will be more than Sends') +print('Receives: Total number of packets received (will be more than Sends') print(' because of TSO)') print('Timespan: Elapsed time between first and last timetrace records (ms)') print('\nFile Records Sends Receives Timespan') @@ -485,10 +463,11 @@ def peer_id(id): # ref can potentially serve as reference for i. rtt = min_delays[ref][node] + min_delays[node][ref] if rtt < 0: - print('Negative RTT %.1f between %s (recv %.3f) and ' - '%s (recv %.3f),' % (rtt, node_names[ref], - min_recv_times[node][ref], node_names[node], - min_recv_times[ref][node])) + print('Negative RTT %.1f between %s (recv %.3f, delay %.3f) and ' + '%s (recv %.3f, delay %.3f),' % (rtt, node_names[ref], + min_recv_times[node][ref], min_delays[node][ref], + node_names[node], min_recv_times[ref][node], + min_delays[ref][node])) if (rtt < best_rtt) and (rtt > 0): best_node = node best_ref = ref From 1b4fe8157814c12a58de40e02671cb7102550c24 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 10 Dec 2025 16:39:27 -0800 Subject: [PATCH 594/625] Add nicbacklog2 and tcp_rpcs analyzers to tthoma.py * Required major refactoring of the TCP timetrace records. * Also adds several new functions such as print_tcp_rpcs. 
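
For reference, the new records encode each TCP endpoint as a single hex
word whose low 16 bits are the port number and whose high 16 bits are
the low-order 16 bits of the IPv4 address. A hypothetical helper (not
part of the patch) illustrating the decoding that get_tcp_node and
set_tcp_ip_node below rely on:

    def split_endpoint(addr_port):
        # addr_port is a string such as '0x01050268', where 0x0105 is
        # the low 16 bits of the IPv4 address and 0x0268 is port 616.
        value = int(addr_port, 16)
        return hex(value >> 16), value & 0xffff

    print(split_endpoint('0x01050268'))     # prints ('0x105', 616)

get_tcp_node keys ip_to_node with the endpoint string minus its last
four hex digits (the port), which is why it indexes the table with
addr_port[:-4].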
--- homa_qdisc.c | 19 +- timetrace.c | 105 ++++- timetrace.h | 3 + util/tthoma.py | 1147 +++++++++++++++++++++++++++++++++++++----------- 4 files changed, 988 insertions(+), 286 deletions(-) diff --git a/homa_qdisc.c b/homa_qdisc.c index 119e8430..8422a764 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -598,21 +598,10 @@ int homa_qdisc_xmit_deferred_tcp(struct homa_qdisc_dev *qdev) pkt_len = qdisc_pkt_len(skb); homa_qdisc_update_link_idle(qdev, pkt_len, -1); - if (ip_hdr(skb)->protocol == IPPROTO_TCP) { - struct tcphdr *th; - - th = (struct tcphdr*) skb_transport_header(skb); - tt_record4("homa_qdisc_pacer requeued TCP packet " - "from 0x%08x:%d to 0x%08x:%d", - ntohl(ip_hdr(skb)->saddr), ntohs(th->source), - ntohl(ip_hdr(skb)->daddr), ntohs(th->dest)); - tt_record4("homa_qdisc_pacer requeued TCP packet (2) " - "sequence %u, data bytes %d, ack %u, gso_size %d", - ntohl(th->seq), - skb->len - skb_transport_offset(skb) - - tcp_hdrlen(skb), ntohl(th->ack_seq), - skb_shinfo(skb)->gso_size); - } + if (ip_hdr(skb)->protocol == IPPROTO_TCP) + tt_record_tcp("homa_qdisc_pacer requeued TCP packet from " + "0x%x to 0x%x, data bytes %d, seq/ack %d", + skb, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); rcu_read_lock_bh(); txq = netdev_get_tx_queue(skb->dev, skb_get_queue_mapping(skb)); diff --git a/timetrace.c b/timetrace.c index a06f3d2f..958ede0a 100644 --- a/timetrace.c +++ b/timetrace.c @@ -36,6 +36,12 @@ extern void homa_trace(u64 u0, u64 u1, int i0, int i1); extern void ltt_record_nop(struct tt_buffer *buffer, u64 timestamp, const char *format, u32 arg0, u32 arg1, u32 arg2, u32 arg3); +extern void (*ltt_record_sendmsg)(struct sock *sk, struct msghdr *msg); +extern void ltt_record_sendmsg_nop(struct sock *sk, struct msghdr *msg); +extern void (*ltt_record_tcp)(char *format, struct sk_buff *skb, + __be32 saddr, __be32 daddr); +extern void ltt_record_tcp_nop(char *format, struct sk_buff *skb, + __be32 saddr, __be32 daddr); #endif void tt_inc_metric(int metric, u64 count); @@ -139,6 +145,8 @@ int tt_init(char *proc_file) tt_linux_freeze_count = &tt_freeze_count; tt_linux_inc_metrics = tt_inc_metric; tt_linux_printk = tt_printk; + ltt_record_sendmsg = tt_record_sendmsg; + ltt_record_tcp = tt_record_tcp; tt_linux_dbg1 = tt_dbg1; tt_linux_dbg2 = tt_dbg2; tt_linux_dbg3 = tt_dbg3; @@ -201,6 +209,8 @@ void tt_destroy(void) tt_linux_buffers[i] = NULL; tt_linux_inc_metrics = tt_linux_skip_metrics; tt_linux_printk = tt_linux_nop; + ltt_record_sendmsg = ltt_record_sendmsg_nop; + ltt_record_tcp = ltt_record_tcp_nop; tt_linux_dbg1 = (void (*)(char *, ...)) tt_linux_nop; tt_linux_dbg2 = (void (*)(char *, ...)) tt_linux_nop; tt_linux_dbg3 = (void (*)(char *, ...)) tt_linux_nop; @@ -866,11 +876,13 @@ void tt_get_messages(char *buffer, size_t length) */ void tt_dbg1(char *msg, ...) { - pr_err("tt_dbg1 starting\n"); - if (atomic_read(&tt_frozen)) - return; - tt_freeze(); - tt_printk(); + pr_err("printk is currently disabled in tt_dbg1"); + return; + // pr_err("tt_dbg1 starting\n"); + // if (atomic_read(&tt_frozen)) + // return; + // tt_freeze(); + // tt_printk(); } /** @@ -917,3 +929,86 @@ void tt_inc_metric(int metric, u64 count) *metric_addr += count; #endif /* See strip.py */ } + +/** + * tt_record_sendmsg() - Invoked by tcp_sendmsg to create a timetrace + * record for the kernel call (if a new message is being started). + * @sk: Socket on which tcp_sendmsg was invoked. + * msg: The data to transmit on the socket (in user space). 
+ */
+void tt_record_sendmsg(struct sock *sk, struct msghdr *msg)
+{
+        struct inet_sock *inet = inet_sk(sk);
+        struct tcp_sock *tp = tcp_sk(sk);
+        struct iov_iter iter;
+        int header[3];
+        int copied;
+        int length;
+
+        /* This design assumes that cp_node is generating the requests,
+         * so new messages will always start at the beginning of
+         * msg (but in some cases it may take multiple calls to
+         * sendmsg to transmit an entire message).
+         */
+        if (!tp->homa_init) {
+                tp->homa_next_seq = tp->write_seq;
+                tp->homa_init = 1;
+        }
+
+        /* This function is intended only for use with requests generated
+         * by cp_node, in which case new messages will always start at the
+         * beginning of msg (but in some cases it may take multiple
+         * calls to sendmsg to transmit an entire message). Check to see
+         * if we're in the middle of a message, or if this isn't cp_node;
+         * if so, do nothing.
+         */
+        iter = msg->msg_iter;
+        if (iov_iter_count(&iter) < sizeof(header))
+                return;
+        copied = copy_from_iter(&header, sizeof(header), &iter);
+        if (copied != sizeof(header)) {
+                tt_record1("copy_from_iter returned %d in tt_record_sendmsg",
+                                copied);
+                return;
+        }
+        length = header[0];
+        if (length < iov_iter_count(&msg->msg_iter)) {
+                /* There isn't a Homa message at the expected place. Most
+                 * likely this isn't a Homa socket.
+                 */
+                return;
+        }
+        if (tp->homa_next_seq != tp->write_seq)
+                return;
+
+        tp->homa_next_seq += length;
+        /* header[2] holds message_header.msg_id in its low 16 bits; bit
+         * 18 is the response flag (see message_header in util/cp_node.cc).
+         */
+        tt_record2("tcp_sendmsg new message slot is %d, response %d",
+                        header[2] & 0xffff, (header[2] & 0x40000) ? 1 : 0);
+        tt_record4("tcp_sendmsg invoked for message from 0x%x to 0x%x, "
+                        "length %d, starting sequence %u",
+                        (htonl(inet->inet_saddr) << 16) + htons(inet->inet_sport),
+                        (htonl(inet->inet_daddr) << 16) + htons(inet->inet_dport),
+                        length, tp->write_seq);
+}
+
+/**
+ * tt_record_tcp() - Create a timetrace record for a TCP packet, formatting
+ * data in a standard way.
+ * @format:     Format string for tt_record4; must have % specs for
+ *              source, dest, length, and ack/seq, in that order.
+ * @skb:        Contains TCP packet with valid transport header.
+ * @saddr:      Source address for packet.
+ * @daddr:      Destination address for packet.
+ */
+void tt_record_tcp(char *format, struct sk_buff *skb, __be32 saddr,
+                __be32 daddr)
+{
+        struct tcphdr *th;
+        int data_length;
+
+        th = (struct tcphdr*) skb_transport_header(skb);
+        data_length = skb->len - skb_transport_offset(skb) - th->doff * 4;
+        tt_record4(format, (ntohl(saddr) << 16) + ntohs(th->source),
+                        (ntohl(daddr) << 16) + ntohs(th->dest), data_length,
+                        data_length == 0 ? ntohl(th->ack_seq) : ntohl(th->seq));
+}
\ No newline at end of file
diff --git a/timetrace.h b/timetrace.h
index f7fc8058..9e8eca38 100644
--- a/timetrace.h
+++ b/timetrace.h
@@ -136,6 +136,9 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf,
                size_t length, loff_t *offset);
 int tt_proc_release(struct inode *inode, struct file *file);
 loff_t tt_proc_lseek(struct file *file, loff_t offset, int whence);
+void tt_record_sendmsg(struct sock *sk, struct msghdr *msg);
+void tt_record_tcp(char *format, struct sk_buff *skb, __be32 saddr,
+                __be32 daddr);
 extern struct tt_buffer *tt_buffers[];
 extern atomic_t tt_freeze_count;
 extern atomic_t tt_frozen;
diff --git a/util/tthoma.py b/util/tthoma.py
index f352326d..b324359e 100755
--- a/util/tthoma.py
+++ b/util/tthoma.py
@@ -143,8 +143,12 @@ def __missing__(self, id):
 #   elapsed_time:     Total time interval covered by the trace
 traces = {}

-# Peer address -> node name. Computed by AnalyzeRpcs and AnalyzePackets.
-peer_nodes = {}
+# IP address -> node name. Computed by AnalyzeRpcs and AnalyzePackets.
+# A single node may be in the table twice, once using the full 4-byte
+# IP address, such as "0xa000105", which is available for Homa packets,
+# and once using only the low-order two bytes (e.g. "0x0105") which is
+# all that is available in TCP packets.
+ip_to_node = {}

 # This variable holds information about every data packet in the traces.
 # it is created by AnalyzePackets. Packets sent with TSO can turn into
@@ -242,15 +246,21 @@ def __missing__(self, key):
 # fields:
 # id:            Always zero; this can be used to distinguish TCP packets from
 #                Homa packets, where there is always a nonzero id.
-# saddr:         Source address of the packet (hex string)
-# sport:         Source port number
-# daddr:         Destination address of the packet (hex string)
-# dport:         Destination port number
-# sequence:      The sequence number in the packet
+# source:        Hex string identifying source port for the packet: lower 16
+#                bits are port number, upper 16 bits are low-order 16 bits of
+#                IPv4 address
+# dest:          Destination port for the packet; same format as source.
 # length:        # bytes of message data in the received packet
-# tso_length:    The number of data bytes in the packet (before TSO)
+# seq_ack:       If length is non-zero then this is the sequence number of
+#                the first byte of data in the packet; otherwise it is the
+#                acknowledgment sequence number.
+# tso_length:    The number of data bytes in the packet (before TSO). This
+#                field is not present in packets that are generated by TSO.
+# segments:      This field will be present in the first packet of each
+#                TSO packet (the one with tso_length set); it will be a
+#                list of all the other segments deriving from the same
+#                TSO packet.
 # total_length:  Total length of the packet, including IP and TCP headers
-# ack:           The ack sequence number in the packet
 # xmit:          Time when ip*xmit was invoked for the packet
 # xmit2:         qdisc_xmit if it exists, otherwise xmit: a time when Homa
 #                has decided to transmit the packet (after any Homa queuing)
 # qdisc_xmit:    Time when homa_qdisc requeued a packet that was deferred
 #                because of NIC queue length (only present for deferred
 #                packets)
 # nic:           Time when the packet was handed off to the NIC. Note: this may not
 #                happen until the packet has been fully transmitted.
 # gro:           Time when GRO received the packet
 # tx_node:       Node that sent the packet (corresponds to saddr)
 # tx_qid:        NIC channel on which packet was transmitted
 # rx_node:       Node that received the packet (corresponds to daddr)
 # retransmits:   Always empty (for compatibility with Homa packets)
 tcp_packets = {}

+# This variable holds information about every identifiable RPC sent via
+# TCP in the traces. It is created by the tcp_rpcs analyzer. Keys are
+# unique identifiers for RPCs in the form 'client server req_seq' with
+# the meanings described below. Each value is a dictionary with the following
+# entries:
+# client:       Client port: hex string whose low 16 bits are the port number,
+#               high 16 bits are the low-order bits of the client's IP address
+# server:       Server port (same format as client)
+# slot:         Slot allocated by cp_node on the client for the message;
+#               used to differentiate concurrent RPCs between the same
+#               client and server ports
+# req_send:     Time when tcp_sendmsg was invoked for the first bytes of
+#               the request
+# req_seq:      Sequence number of the first byte of the request message
+# req_length:   Size of the request message in bytes
+# req_end_seq:  req_seq + req_length
+# req_pkts:     List of request packets, in the order sent.
Includes only +# data packets +# resp_recvd: Time when tcp_recvmsg returned on the client (RPC complete) +tcp_rpcs = {} + # Node -> list of intervals for that node. Created by the intervals analyzer. # Each interval contains information about a particular time range, including # things that happened during that time range and the state of the node at @@ -388,6 +427,9 @@ def __missing__(self, key): # Total bytes in a Homa grant packet, including IP header (assume IPv4). grant_pkt_length = 33 + ipv4_hdr_length +# Total header bytes in a TCP packet (TCP and IP headers, assuming IPv4). +tcp_hdr_length = ipv4_hdr_length + 20 + # Various color values for plotting: color_red = '#c00000' color_blue = '#1f77b4' @@ -635,21 +677,29 @@ def get_packet(id, offset): global packets return packets['%d:%d' % (id, offset)] -def get_range(s, option_name=None, parse_float=False): +def get_range(s, option_name=None, parse_float=False, one_value=True): """ Parse a range defined by two endpoints and return the endpoints as a list. - If only one value is specified in list then it is returned as the lower - end of the range, with None as the upper end. s: The input string to parse; may contain either one or two values option_name: If specified, contains the name of the option that was specified as range; used for error messages parse_float: True means parse values as floating-point numbers; False means integers + one_value: True means it is OK for s to contain only one value, in + which case it is returned as the lower end of the range, + with None as the upper end. """ values = s.split() if len(values) == 1: + if not one_value: + if option_name != None: + raise Exception('Bad %s spec \'%s\'; must contain two values' % + (option_name, s)) + else: + raise Exception('Bad range spec \'%s\'; must contain two values' + % (s)) min = float(values[0]) if parse_float else int(values[0]) return [min, None] if len(values) == 2: @@ -657,42 +707,51 @@ def get_range(s, option_name=None, parse_float=False): max = float(values[1]) if parse_float else int(values[1]) return [min, max] if option_name != None: - raise Exception('Bad %s value \'%s\'; must be \'value\' or ' + raise Exception('Bad %s spec \'%s\'; must be \'value\' or ' '\'value1 value2\'' % (option_name, s)) else: raise Exception('Bad range spec \'%s\'; must be \'value\' or ' '\'value1 value2\'' % (s)) -def get_tcp_packet(saddr, sport, daddr, dport, sequence, data_bytes, ack): +def get_tcp_node(addr_port): + """ + Return the name of the node corresponding to the argument. + addr_port: A hex string used in TCP timetrace entries: the lower + 16 bits are a port number and the upper 16 bits are + the low 16 bits of a node's IP address. + """ + global ip_to_node + + key = addr_port[:-4] + if key in ip_to_node: + return ip_to_node[key] + return key + +def get_tcp_packet(source, dest, data_bytes, seq_ack): """ Returns the entry in tcp_packets corresponding to the arguments. Creates a new packet if it doesn't already exist. 
- saddr: IP address of source (hex string) - sport: Source port - daddr: IP address of destination (hex string) - dport: Destination port - sequence: Sequence number in packet + source: Hex string identifying source for packet; lower 16 bits are + port number, upper 16 bits are low-order 16-bits of IP address + dest: Hex string identifying destination for packet; same format + as source data_bytes: Amount of payload data in the packet - ack: Acknowledgment sequence number in the packet + seq_ack: Packet sequence number if data_bytes != 0, otherwise + ack sequence number from packet """ global tcp_packets - # This is tricky because a 'data' packet can arrive with no data, - # just an ack. This code will create one packet if there is data, - # ignoring amount of data and ack. If there is no data, then one - # packet is created for each distinct sequence/ack combination. - - if data_bytes > 0: - key = '%s:%d %s:%d %d' % (saddr, sport, daddr, dport, sequence) + # Distinguish data packets (those where data_bytes is nonzero) from + # packets that are purely acknowledgments (data_bytes is zero). + if data_bytes != 0: + key = f'{source} {dest} {seq_ack} data' else: - key = '%s:%d %s:%d %d ack %d' % (saddr, sport, daddr, dport, sequence, - ack) + key = f'{source} {dest} {seq_ack} ack' if key in tcp_packets: return tcp_packets[key] - pkt = {'id': 0, 'saddr': saddr, 'sport': sport, 'daddr': daddr, - 'dport': dport, 'sequence': sequence, 'length': data_bytes, - 'ack': ack, 'retransmits': []} + pkt = {'id': 0, 'source': source, 'dest': dest, 'seq_ack': seq_ack, + 'retransmits': [], 'segments': []} tcp_packets[key] = pkt return pkt @@ -774,7 +833,7 @@ def get_rpc_node(id): if id^1 in rpcs: rpc = rpcs[id^1] if 'peer' in rpc: - return peer_nodes[rpc['peer']] + return ip_to_node[rpc['peer']] return '' def get_sorted_nodes(): @@ -981,7 +1040,7 @@ def print_pkts(pkts, header=True): buf.write(' Length Qid Nic NDelay Gro GDelay') buf.write(' Free FDelay Rx\n') for pkt in pkts: - xmit = pkt['xmit'] + xmit = pkt['xmit'] if 'xmit' in pkt else None if 'qdisc_xmit' in pkt: qdisc = pkt['qdisc_xmit'] qdisc_string = '%10.3f' % (qdisc) @@ -1006,14 +1065,14 @@ def print_pkts(pkts, header=True): rx += len(seg['retransmits']) rx_msg = str(rx) if rx > 0 else "" - line = '%-8s %-8s %10.3f %10s' % (pkt['tx_node'], + line = '%-8s %-8s %10s %10s' % (pkt['tx_node'], pkt['rx_node'] if 'rx_node' in pkt else "", - xmit, qdisc_string) + print_if(xmit, '%.3f'), qdisc_string) if pkt['id'] != 0: line += ' %10d %6d' % (pkt['id'], pkt['offset']) else: # This is a TCP packet - line += ' %10d TCP' % (pkt['sequence']) + line += ' %10d TCP' % (pkt['seq_ack']) nic_delay_string = '' if nic_delay != None: nic_delay_string = '%.1f' % (nic_delay) @@ -1089,6 +1148,93 @@ def print_rpcs(client_rpcs, header=True): rpc['recvmsg_done'] - rpc['sendmsg'])) return buf.getvalue() +def print_tcp_rpcs(rpcs, header=True): + """ + Returns a string containing one line for each RPC in tcp_rpcs, which + contains various useful statistics about the RPC. + rpcs: RPCs to print; must be entries in tcp_rpcs + header: If True then the result will include initial text describing + the fields printed on each line. 
+ """ + buf = StringIO() + if header: + buf.write('Start: Time when tcp_sendmsg was invoked for request\n') + buf.write('Client: Node that sent the RPC request\n') + buf.write('Server: Node that handled the RPC and sent response\n') + buf.write('Length: Length of request message\n') + buf.write('ReqSeq: Sequence number of first byte of request\n') + buf.write('RspSeq: Sequence number of first byte of response\n') + buf.write('ReqNic: Elapsed time from sendmsg until first ' + 'request packet handed\n') + buf.write(' off to NIC\n') + buf.write('ReqGRO: Time from NIC handoff to GRO receipt for ' + 'first request packet\n') + buf.write('ReqRecv: Time from GRO for first request packet ' + 'until recvmsg completes\n') + buf.write(' on server\n') + buf.write('Srvc: Time from recvmsg return on server until ' + 'sendmsg for response\n') + buf.write('RspNic: Elapsed time from sendmsg of response until ' + 'first packet handed\n') + buf.write(' off to NIC\n') + buf.write('RspGRO: Time from NIC handoff to GRO receipt for ' + 'first response packet\n') + buf.write('RspRecv: Time from GRO for first response packet ' + 'until End\n') + buf.write('End: Time when response was returned to client\n') + buf.write('Rtt: RspRecv - Start\n\n') + buf.write('Start Client Server Length ReqSeq RspSeq ') + buf.write('ReqNic ReqGRO ReqRecv Srvc ') + buf.write('RspNic RspGRO RspRecv End Rtt\n') + for rpc in rpcs: + request_pkt = rpc['req_pkts'][0] if rpc['req_pkts'] else {} + response_pkt = rpc['resp_pkts'][0] if rpc['resp_pkts'] else {} + if 'nic' in request_pkt: + rqnic = '%.1f' % (request_pkt['nic'] - rpc['req_send']) + else: + rqnic = '' + if 'gro' in request_pkt and 'nic' in request_pkt: + rqgro = '%.1f' % (request_pkt['gro'] - request_pkt['nic']) + else: + rqgro = '' + if 'gro' in request_pkt and 'req_recvd' in rpc: + rqrecv = '%.1f' % (rpc['req_recvd'] - request_pkt['gro']) + else: + rqrecv = '' + if 'req_recvd' in rpc and 'resp_send' in rpc: + srvc = '%.1f' % (rpc['resp_send'] - rpc['req_recvd']) + else: + srvc = '' + if 'nic' in response_pkt: + rspnic = '%.1f' % (response_pkt['nic'] - rpc['resp_send']) + else: + rspnic = '' + if 'gro' in response_pkt and 'nic' in response_pkt: + rspgro = '%.1f' % (response_pkt['gro'] - response_pkt['nic']) + else: + rspgro = '' + if 'gro' in response_pkt and 'resp_recvd' in rpc: + rsprecv = '%.1f' % (rpc['resp_recvd'] - response_pkt['gro']) + else: + rsprecv = '' + if 'req_send' in rpc and 'resp_recvd' in rpc: + rtt = '%.1f' % (rpc['resp_recvd'] - rpc['req_send']) + else: + rtt = '' + if 'resp_recvd' in rpc: + end = '%.3f' % (rpc['resp_recvd']) + else: + end = '' + line = ('%9.3f %-8s %-8s %7d %10d %10d' % ( + rpc['req_send'], get_tcp_node(rpc['client']), + get_tcp_node(rpc['server']), rpc['req_length'], + rpc['req_seq'], rpc['resp_seq'])) + line += (' %7s %6s %7s %6s' % (rqnic, rqgro, rqrecv, srvc)) + line += (' %7s %6s %7s %9s %7s' % (rspnic, rspgro, rsprecv, end, rtt)) + buf.write(line.rstrip()) + buf.write('\n') + return buf.getvalue() + def require_options(analyzer, *args): """ For each argument, ensures that the associated option has been specified; @@ -1101,6 +1247,17 @@ def require_options(analyzer, *args): raise Exception('The %s analyzer requires the --%s option' % ( analyzer, arg)) +def set_tcp_ip_node(tcp_endpoint, node): + """ + Add a mapping from IP address to node to the ip_to_node table. + tcp_endpoint: An endpoint spec from a TCP packet. Must be a hex string + (with leading "0x") whose high-order 16 bits are the + low-order 16 bits of an IPv4 address. 
+ node: Name of the node corresponding to tcp_endpoint
+ """
+ key = tcp_endpoint[:-4]
+ ip_to_node[key] = node
+
 def sort_pkts(pkts, key):
 """
 Sort a list of packets using a given key and return the sorted list.
@@ -1185,7 +1342,7 @@ def __init__(self):
 # where it takes multiple time trace entries to provide all the data
 # needed for an event: info accumulates here until the last time
 # trace entry is seen.
- self.core_saved = {}
+ self.core_saved = defaultdict(dict)

 for pattern in self.patterns:
 pattern['matches'] = 0
@@ -2078,173 +2235,132 @@ def __snapshot_server_response(self, trace, time, core, match, interests):
 'kbytes_started ([0-9]+), kbytes_done ([0-9]+), done ([0-9]+)'
 })

- def __xmit_tcp(self, trace, time, core, match, interests):
- saddr = match.group(1)
- sport = int(match.group(2))
- daddr = match.group(3)
- dport = int(match.group(4))
- self.core_saved[core] = {'saddr': saddr, 'sport': sport,
- 'daddr': daddr, 'dport': dport}
+ def __tcp_sendmsg2(self, trace, time, core, match, interests):
+ saved = self.core_saved[core]
+ saved['sendmsg_slot'] = int(match.group(1))
+ saved['sendmsg_response'] = int(match.group(2))

 patterns.append({
- 'name': 'xmit_tcp',
- 'regexp': 'Transmitting TCP packet from ([^:]+):([0-9]+) to '
- '([^:]+):([0-9]+)'
+ 'name': 'tcp_sendmsg2',
+ 'regexp': 'tcp_sendmsg new message slot is ([0-9]+), response ([0-9]+)'
 })

- def __xmit_tcp2(self, trace, time, core, match, interests):
- if not core in self.core_saved:
- return
+ def __tcp_sendmsg(self, trace, time, core, match, interests):
 saved = self.core_saved[core]
- sequence = int(match.group(1))
- data_bytes = int(match.group(2))
- total_length = int(match.group(3))
- ack = int(match.group(4))
+ if not 'sendmsg_slot' in saved:
+ return
+ source = match.group(1)
+ dest = match.group(2)
+ msg_length = int(match.group(3))
+ sequence = int(match.group(4))
 for interest in interests:
- interest.tt_xmit_tcp(trace, time, core, saved['saddr'],
- saved['sport'], saved['daddr'], saved['dport'],
- sequence, data_bytes, total_length, ack)
- del self.core_saved[core]
-
- patterns.append({
- 'name': 'xmit_tcp2',
- 'regexp': r'Transmitting TCP packet .2.
sequence ([-0-9]+), ' - 'data bytes ([0-9]+), total length ([-0-9]+), ack ([-0-9]+)' - }) - - def __qdisc_tcp(self, trace, time, core, match, interests): - saddr = match.group(1) - sport = int(match.group(2)) - daddr = match.group(3) - dport = int(match.group(4)) - self.core_saved[core] = {'saddr': saddr, 'sport': sport, - 'daddr': daddr, 'dport': dport} + interest.tt_tcp_sendmsg(trace, time, core, source, dest, msg_length, + sequence, saved['sendmsg_slot'], saved['sendmsg_response']) patterns.append({ - 'name': 'qdisc_tcp', - 'regexp': 'homa_qdisc_pacer requeued TCP packet from ([^:]+):([0-9]+) ' - 'to ([^:]+):([0-9]+)' + 'name': 'tcp_sendmsg', + 'regexp': 'tcp_sendmsg invoked for message from (0x[a-f0-9]+) to ' + '(0x[a-f0-9]+), length ([0-9]+), starting sequence ([0-9]+)' }) - def __qdisc_tcp2(self, trace, time, core, match, interests): - if not core in self.core_saved: - return - saved = self.core_saved[core] - sequence = int(match.group(1)) - data_bytes = int(match.group(2)) - ack = int(match.group(3)) + def __tcp_xmit(self, trace, time, core, match, interests): + source = match.group(1) + dest = match.group(2) + data_bytes = int(match.group(3)) + seq_ack = int(match.group(4)) for interest in interests: - interest.tt_qdisc_tcp(trace, time, core, saved['saddr'], - saved['sport'], saved['daddr'], saved['dport'], - sequence, data_bytes, ack) - del self.core_saved[core] + interest.tt_tcp_xmit(trace, time, core, source, dest, data_bytes, + seq_ack) patterns.append({ - 'name': 'qdisc_tcp2', - 'regexp': r'homa_qdisc_pacer requeued TCP packet .2. sequence ([-0-9]+), ' - 'data bytes ([0-9]+), ack ([-0-9]+)' + 'name': 'tcp_xmit', + 'regexp': 'Transmitting TCP packet from (0x[a-f0-9]+) to ' + '(0x[a-f0-9]+), data bytes ([0-9]+), seq/ack ([0-9]+)' }) - def __nic_tcp(self, trace, time, core, match, interests): - saddr = match.group(2) - sport = int(match.group(3)) - daddr = match.group(4) - dport = int(match.group(5)) - self.core_saved[core] = {'saddr': saddr, 'sport': sport, - 'daddr': daddr, 'dport': dport} + def __tcp_qdisc(self, trace, time, core, match, interests): + source = match.group(1) + dest = match.group(2) + data_bytes = int(match.group(3)) + seq_ack = int(match.group(4)) + for interest in interests: + interest.tt_tcp_qdisc(trace, time, core, source, dest, data_bytes, + seq_ack) patterns.append({ - 'name': 'nic_tcp', - 'regexp': '(mlx|ice) sent TCP packet from ([^:]+):([0-9]+) to ' - '([^:]+):([0-9]+)' + 'name': 'tcp_qdisc', + 'regexp': 'homa_qdisc_pacer requeued TCP packet from (0x[a-f0-9]+) to ' + '(0x[a-f0-9]+), data bytes ([0-9]+), seq/ack ([0-9]+)' }) - def __nic_tcp2(self, trace, time, core, match, interests): - if not core in self.core_saved: - return - saved = self.core_saved[core] - sequence = int(match.group(2)) - data_bytes = int(match.group(3)) - ack = int(match.group(4)) - gso_size = int(match.group(5)) + def __tcp_nic(self, trace, time, core, match, interests): + source = match.group(2) + dest = match.group(3) + data_bytes = int(match.group(4)) + seq_ack = int(match.group(5)) for interest in interests: - interest.tt_nic_tcp(trace, time, core, saved['saddr'], - saved['sport'], saved['daddr'], saved['dport'], - sequence, data_bytes, ack, gso_size) - del self.core_saved[core] + interest.tt_tcp_nic(trace, time, core, source, dest, data_bytes, + seq_ack) patterns.append({ - 'name': 'nic_tcp2', - 'regexp': r'(mlx|ice) sent TCP packet .2. 
sequence ([-0-9]+), ' - 'data bytes ([0-9]+), ack ([-0-9]+), gso_size ([0-9]+)' + 'name': 'tcp_nic', + 'regexp': '(mlx|ice) sent TCP packet from (0x[a-f0-9]+) to ' + '(0x[a-f0-9]+), data bytes ([0-9]+), seq/ack ([0-9]+)' }) - def __free_tcp(self, trace, time, core, match, interests): - saddr = match.group(2) - sport = int(match.group(3)) - daddr = match.group(4) - dport = int(match.group(5)) - self.core_saved[core] = {'saddr': saddr, 'sport': sport, - 'daddr': daddr, 'dport': dport} + def __tcp_free2(self, trace, time, core, match, interests): + self.core_saved[core]['tcp_free_qid'] = int(match.group(2)) patterns.append({ - 'name': 'free_tcp', - 'regexp': '(mlx|ice) freeing TCP skb from ([^:]+):([0-9]+) to ' - '([^:]+):([0-9]+)' + 'name': 'tcp_free2', + 'regexp': '(mlx|ice) freeing TCP skb for qid ([0-9]+)' }) - def __free_tcp2(self, trace, time, core, match, interests): - if not core in self.core_saved: - return + def __tcp_free(self, trace, time, core, match, interests): saved = self.core_saved[core] - sequence = int(match.group(2)) - data_bytes = int(match.group(3)) - ack = int(match.group(4)) - qid = int(match.group(5)) + if not 'tcp_free_qid' in saved: + return + source = match.group(2) + dest = match.group(3) + data_bytes = int(match.group(4)) + seq_ack = int(match.group(5)) for interest in interests: - interest.tt_free_tcp(trace, time, core, saved['saddr'], - saved['sport'], saved['daddr'], saved['dport'], - sequence, data_bytes, ack, qid) - del self.core_saved[core] + interest.tt_tcp_free(trace, time, core, source, dest, data_bytes, + seq_ack, saved['tcp_free_qid']) patterns.append({ - 'name': 'free_tcp2', - 'regexp': r'(mlx|ice) freeing TCP skb .2. sequence ([-0-9]+), ' - 'data bytes ([0-9]+), ack ([-0-9]+), qid ([-0-9]+)' + 'name': 'tcp_free', + 'regexp': '(mlx|ice) freeing TCP skb from (0x[a-f0-9]+) to ' + '(0x[a-f0-9]+), data bytes ([0-9]+), seq/ack ([0-9]+)' }) - def __gro_tcp(self, trace, time, core, match, interests): - saddr = match.group(1) - sport = int(match.group(2)) - daddr = match.group(3) - dport = int(match.group(4)) - self.core_saved[core] = {'saddr': saddr, 'sport': sport, - 'daddr': daddr, 'dport': dport} - + def __tcp_gro(self, trace, time, core, match, interests): + source = match.group(1) + dest = match.group(2) + data_bytes = int(match.group(3)) + seq_ack = int(match.group(4)) + for interest in interests: + interest.tt_tcp_gro(trace, time, core, source, dest, data_bytes, + seq_ack) patterns.append({ - 'name': 'gro_tcp', - 'regexp': 'tcp_gro_receive got packet from ([^:]+):([0-9]+) to ' - '([^:]+):([0-9]+)' + 'name': 'tcp_gro', + 'regexp': 'tcp_gro_receive got packet from (0x[a-f0-9]+) to ' + '(0x[a-f0-9]+), data bytes ([0-9]+), seq/ack ([0-9]+)' }) - def __gro_tcp2(self, trace, time, core, match, interests): - if not core in self.core_saved: - return - saved = self.core_saved[core] - sequence = int(match.group(1)) - data_bytes = int(match.group(2)) - total = int(match.group(3)) - ack = int(match.group(4)) + def __tcp_recvmsg(self, trace, time, core, match, interests): + source = match.group(1) + dest = match.group(2) + msg_length = int(match.group(3)) + sequence = int(match.group(4)) for interest in interests: - interest.tt_gro_tcp(trace, time, core, saved['saddr'], - saved['sport'], saved['daddr'], saved['dport'], sequence, - data_bytes, total, ack) - del self.core_saved[core] + interest.tt_tcp_recvmsg(trace, time, core, source, dest, msg_length, + sequence) patterns.append({ - 'name': 'gro_tcp2', - 'regexp': r'tcp_gro_receive .2. 
sequence ([-0-9]+), data bytes ' - '([0-9]+), total length ([0-9]+), ack ([-0-9]+)' + 'name': 'tcp_recvmsg', + 'regexp': 'tcp_recvmsg returning message from (0x[a-f0-9]+) to ' + '(0x[a-f0-9]+), length ([0-9]+), ending sequence ([0-9]+)' }) def __txq_stop(self, trace, time, core, match, interests): @@ -3649,14 +3765,28 @@ class AnalyzeFilter: for those packets. The following command-line options are used to filter the packets: --tx-node, --rx-node, --tx-qid, --msglen, --grolat, --segs, --pkt_type, and --filter. If --verbose is specified then the matching - packets are printed in detail; the --sort option determines the order of - printing. + packets are printed in detail. The --sort option selects a column to + use for sorting the packets; it must be one of Xmit, Nic, Gro, SoftIRQ, + or Free (default is Xmit). """ def __init__(self, dispatcher): dispatcher.interest('AnalyzeRpcs') dispatcher.interest('AnalyzePackets') return + def filter_short_tcp(self, pkt): + """ + Returns True if pkt is a short TCP packet: it has some data, but + no more than 1500 bytes. + """ + if pkt['id'] != 0: + return False + if not 'tso_length' in pkt: + return False + length = pkt['tso_length'] + # print('\nLength %d: pkt %s' % (length, pkt)) + return length > 10 and length <= 1500 + def filter_packets(self, options): """ Returns a list containing all of the packets that match options. @@ -3678,14 +3808,14 @@ def filter_packets(self, options): if options.msglen != None: min_length, max_length = get_range(options.msglen, - option_name='--msglen') + option_name='--msglen', one_value=True) if max_length == None: max_length = min_length min_length = 0 if options.grolat != None: min_gro, max_gro = get_range(options.grolat, - option_name='--grolat', parse_float=True) + option_name='--grolat', parse_float=True, one_value=True) if max_gro == None: max_gro = 1e20 @@ -3712,7 +3842,6 @@ def filter_packets(self, options): continue if options.tx_node != None and options.tx_node != pkt['tx_node']: continue - # print('%s\n' % (pkt)) if options.rx_node != None and options.rx_node != pkt['rx_node']: continue if options.tx_core != None and options.tx_core != pkt['tx_core']: @@ -3787,6 +3916,8 @@ def output(self): softirq.append(pkt['softirq'] - pkt['gro']) if 'softirq' in pkt and 'xmit' in pkt: total.append(pkt['softirq'] - pkt['xmit']) + elif 'gro' in pkt and 'xmit' in pkt: + total.append(pkt['gro'] - pkt['xmit']) if 'nic' in pkt and 'free_tx_skb' in pkt: free.append(pkt['free_tx_skb'] - pkt['nic']) @@ -6457,6 +6588,12 @@ def output(self): # node -> running sum of bytes owned by NIC on that node * time. 
node_nic_bytes = defaultdict(lambda : 0)

+ # node -> total bytes passed to NIC over trace
+ node_total_in = defaultdict(lambda: 0)
+
+ # node -> total bytes freed after return from NIC over trace
+ node_total_freed = defaultdict(lambda: 0)
+
 print('\n--------------------')
 print('Analyzer: nicbacklog')
 print('--------------------')
@@ -6476,9 +6613,11 @@ def output(self):
 'return from NIC' % (4 * base_interval))
 print('MaxFrD: Maximum data rate from pkts freed in a %d usec '
 'interval (Gbps)' % (4 * base_interval))
+ print('AvgIn: Average rate of data handed off to NIC (Gbps)')
+ print('AvgFr: Average rate of data freed after return from NIC (Gbps)')
 print()
- print('Node AvgPkts AvgKB MaxPkts MaxKB MaxInP MaxInD MaxFrP MaxFrD')
- print('---------------------------------------------------------------------')
+ print('Node AvgPkts AvgKB MaxPkts MaxKB MaxInP MaxInD MaxFrP MaxFrD AvgIn AvgFr')
+ print('-------------------------------------------------------------------------------------')

 # Bucket all of the packets by transmitting node. Also compute
 # average backlog data (this calculation will consider packets
@@ -6486,20 +6625,23 @@ def output(self):
 for pkt in itertools.chain(packets.values(), tcp_packets.values()):
 if not 'tso_length' in pkt or not 'tx_node' in pkt:
 continue
+ length = pkt['tso_length']
 node = pkt['tx_node']
 if 'nic' in pkt:
 t1 = pkt['nic']
+ node_total_in[node] += length
 else:
 if not 'free_tx_skb' in pkt:
 continue
 t1 = traces[node]['first_time']
 if 'free_tx_skb' in pkt:
 t2 = pkt['free_tx_skb']
+ node_total_freed[node] += length
 else:
 t2 = traces[node]['last_time']
 delta_t = t2 - t1
 node_nic_pkts[node] += delta_t
- node_nic_bytes[node] += delta_t * pkt['tso_length']
+ node_nic_bytes[node] += delta_t * length
 if 'nic' in pkt and 'free_tx_skb' in pkt:
 node_pkts[node].append(pkt)

@@ -6563,6 +6705,10 @@ def output(self):
 # heapq of all active packets (those that are currently in
 # the posession of the NIC) in increasing order of free time.
+ # Entries are <free, index, pkt>, where free is the packet's
+ # free_tx_skb time (for sorting), index is the packet's index
+ # in the list of all packets (for resolving sorting ties), and
+ # pkt is the packet.
 active = []

 # list of for each of
@@ -6671,12 +6817,14 @@ def output(self):
 interval_end += base_interval
 f.close()
 node_time = traces[node]['last_time'] - traces[node]['first_time']
- print('%-10s %6d %6d %7d %6d %6d %7.2f %6d %7.2f' % (node,
+ print('%-10s %6d %6d %7d %6d %6d %7.2f %6d %7.2f %7.2f %7.2f' % (node,
 node_nic_pkts[node]/node_time,
 1e-3*node_nic_bytes[node]/node_time,
 max_pkts, max_bytes/1000,
 max_in_pkts, max_in_bytes*8/(4000*base_interval),
- max_free_pkts, max_free_bytes*8/(4000*base_interval)))
+ max_free_pkts, max_free_bytes*8/(4000*base_interval),
+ node_total_in[node]*8e-3/node_time,
+ node_total_freed[node]*8e-3/node_time))

 # Output a table showing stats for the intervals with the highest
 # and lowest free_bytes.
@@ -6724,6 +6872,173 @@ def output(self):
 sum(t[3] for t in fast)/len(fast),
 sum(t[4] for t in fast)/(1000*len(fast)),))

+#------------------------------------------------
+# Analyzer: nicbacklog2
+#------------------------------------------------
+class AnalyzeNicbacklog2:
+ """
+ Generates a time-series analysis of backlog in the NIC (packets that
+ have been passed to the NIC but not yet returned after transmission).
+ This differs from the nicbacklog analyzer in that it analyzes
+ the distribution of traffic between device queues. Requires the
+ --data option.
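+ Output is written to a file nicbacklog2_<node>.dat in the --data
+ directory for each node.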
+ """
+
+ def __init__(self, dispatcher):
+ dispatcher.interest('AnalyzePackets')
+ require_options('nicbacklog2', 'data')
+
+ def output(self):
+ global packets, tcp_packets, options, traces
+
+ # node -> list of packets transmitted by that node
+ node_pkts = defaultdict(list)
+
+ # Bucket all of the packets by transmitting node.
+ for pkt in itertools.chain(packets.values(), tcp_packets.values()):
+ if not 'tso_length' in pkt or not 'tx_node' in pkt:
+ continue
+ if not 'tx_qid' in pkt:
+ continue
+ length = pkt['tso_length']
+ node = pkt['tx_node']
+ if 'nic' in pkt or 'free_tx_skb' in pkt:
+ node_pkts[node].append(pkt)
+
+ # Each iteration in this loop generates data for one node.
+ for node in get_sorted_nodes():
+ f = open('%s/nicbacklog2_%s.dat' % (options.data, node), 'w')
+ f.write('# Node: %s\n' % (node))
+ f.write('# Generated at %s.\n\n' %
+ (time.strftime('%I:%M %p on %m/%d/%Y')))
+ f.write('# Statistics on NIC backlog (packets handed off to the '
+ 'NIC for transmission\n')
+ f.write('# but not yet returned to the kernel after xmit):\n')
+ f.write('# Time: End of interval (usecs)\n')
+ f.write('# Pkts: Total # packets owned by NIC\n')
+ f.write('# KB: Total kbytes of packet data owned by NIC\n')
+ f.write('# TcpKB: Total kbytes of TCP packet data in NIC\n')
+ f.write('# PktQs: Number of queues with packets\n')
+ f.write('# FreeQs: Number of queues for which a packet was freed\n')
+ f.write('# in the interval\n')
+ f.write('# Qid1: Id of queue with the most data\n')
+ f.write('# Pkts1: Number of packets in Qid1\n')
+ f.write('# KB1: Kbytes of packet data in Qid1 (Homa and TCP)\n')
+ f.write('# TcpKB1: Kbytes of TCP packet data in Qid1\n')
+ f.write('# Qid2, etc: Info for queue with second most data\n')
+ f.write('# Qid3, etc: Info for queue with third most data\n')
+ f.write('\n')
+ f.write('Time Pkts KB TcpKB PktQs FreeQs '
+ 'Qid1 Pkts1 KB1 TcpKB1 '
+ 'Qid2 Pkts2 KB2 TcpKB2 '
+ 'Qid3 Pkts3 KB3 TcpKB3\n')
+
+ # heapq of all active packets (those that are currently in
+ # the possession of the NIC) in increasing order of free time.
+ # Entries are <free, index, pkt>, where free is the packet's
+ # free_tx_skb time (for sorting), index is the packet's index
+ # in the list of all packets (for resolving sorting ties), and
+ # pkt is the packet.
+ active = []
+
+ # queue -> 1 for all queues for which a packet was freed (i.e.
+ # transmitted) in the current interval.
+ queues_freed = {}
+
+ # queue -> count of bytes currently owned by this queue.
+ queue_bytes = defaultdict(lambda: 0)
+
+ # queue -> same as queue_bytes except count only TCP bytes.
+ queue_tcp_bytes = defaultdict(lambda: 0)
+
+ # queue -> count of packets currently owned by this queue.
+ queue_packets = defaultdict(lambda: 0)
+
+ pkts = sorted(node_pkts[node], key = lambda pkt :
+ pkt['nic'] if 'nic' in pkt else -1e20)
+ cur = 0
+ t = traces[node]['first_time']
+ interval_end = (math.ceil(traces[node]['first_time'] /
+ options.interval) * options.interval)
+
+ # Each iteration of this loop processes one event: either a
+ # packet handed off to the NIC or a packet freed.
+ while True:
+ # Decide on next event
+ if cur < len(pkts):
+ pkt = pkts[cur]
+ if 'nic' in pkt:
+ nic = pkt['nic']
+ else:
+ nic = traces[node]['first_time']
+ else:
+ nic = None
+ if nic != None and (not active or active[0][0] > nic):
+ free = False
+ t = nic
+ cur += 1
+ elif active:
+ t, _, pkt = heapq.heappop(active)
+ free = True
+ else:
+ break
+
+ # Handle end of interval(s)
+ while t >= interval_end:
+ # Format info for the queues with the most data.
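+ # Scan queues in decreasing order of backlog: emit stats
+ # for the three largest and count how many queues have a
+ # nonzero backlog.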
+ details = '' + printed = 0 + nonzero = 0 + for qid, bytes in sorted(queue_bytes.items(), + key = lambda t: t[1], reverse = True): + if bytes == 0: + break + nonzero += 1 + if printed < 3: + details += ' %4d %5d %5d %6d' % (qid, + queue_packets[qid], queue_bytes[qid]//1000, + queue_tcp_bytes[qid]//1000) + printed += 1 + + # Generate output line + f.write('%7.1f %5d %5d %5d %5d %6d%s\n' % ( + interval_end, sum(queue_packets.values()), + sum(queue_bytes.values())//1000, + sum(queue_tcp_bytes.values())//1000, + nonzero, len(queues_freed), details)) + + interval_end += options.interval + queues_freed.clear() + + # Update statistics with current event + qid = pkt['tx_qid'] + if free: + queues_freed[qid] = 1 + queue_packets[qid] -= 1 + queue_bytes[qid] -= pkt['tso_length'] + if pkt['id'] == 0: + queue_tcp_bytes[qid] -= pkt['tso_length'] + if queue_packets[qid] == 0: + if queue_bytes[qid] != 0: + raise Exception('%9.3f: queue_bytes[%d] is %d but ' + 'queue_packets is 0' % (t, qid, + queue_bytes[qid])) + else: + queue_packets[qid] += 1 + queue_bytes[qid] += pkt['tso_length'] + if pkt['id'] == 0: + queue_tcp_bytes[qid] += pkt['tso_length'] + free = (pkt['free_tx_skb'] if 'free_tx_skb' in pkt else + traces[node]['last_time']) + heapq.heappush(active, [free, cur, pkt]) + + f.close() + + print('\n---------------------') + print('Analyzer: nicbacklog2') + print('---------------------') + print('See data files %s/nicbacklog2_*.dat' % (options.data)) + #------------------------------------------------ # Analyzer: nicpkts #------------------------------------------------ @@ -7235,7 +7550,7 @@ def __init__(self, dispatcher): return def output(self): - global rpcs, traces, options, peer_nodes, packets + global rpcs, traces, options, ip_to_node, packets print('\n-----------------') print('Analyzer: packet') @@ -7575,93 +7890,106 @@ def tt_softirq_grant(self, trace, t, core, id, offset, priority, increment): g['increment'] = increment g['rx_node'] = trace['node'] - def tt_xmit_tcp(self, trace, t, core, saddr, sport, daddr, dport, sequence, - data_bytes, total, ack): - tcp_pkt = get_tcp_packet(saddr, sport, daddr, dport, sequence, - data_bytes, ack) + def tt_tcp_xmit(self, trace, t, core, source, dest, data_bytes, seq_ack): + global tcp_hdr_length + + tcp_pkt = get_tcp_packet(source, dest, data_bytes, seq_ack) # if 'xmit' in tcp_pkt: # print('%9.3f: Duplicate TCP packet transmission on node %s (previous: %.3f)' % (t, # trace['node'], tcp_pkt['xmit'])) node = trace['node'] tcp_pkt['xmit'] = t tcp_pkt['xmit2'] = t - tcp_pkt['total_length'] = total + tcp_pkt['tso_length'] = data_bytes + tcp_pkt['total_length'] = data_bytes + tcp_hdr_length tcp_pkt['tx_node'] = node - if not saddr in peer_nodes and saddr != '0x00000000': - peer_nodes[saddr] = node + set_tcp_ip_node(source, node) - def tt_qdisc_tcp(self, trace, t, core, saddr, sport, daddr, dport, sequence, - data_bytes, ack): - tcp_pkt = get_tcp_packet(saddr, sport, daddr, dport, sequence, - data_bytes, ack) + def tt_tcp_qdisc(self, trace, t, core, source, dest, data_bytes, seq_ack): + tcp_pkt = get_tcp_packet(source, dest, data_bytes, seq_ack) node = trace['node'] tcp_pkt['qdisc_xmit'] = t tcp_pkt['xmit2'] = t + tcp_pkt['tso_length'] = data_bytes + tcp_pkt['total_length'] = data_bytes + tcp_hdr_length tcp_pkt['tx_node'] = node - if sequence == 1749134782: - print('tt_qdisc_tcp setting tx_node to %s' % (node)) - if not saddr in peer_nodes and saddr != '0x00000000': - peer_nodes[saddr] = node + set_tcp_ip_node(source, node) - def tt_nic_tcp(self, trace, t, core, 
saddr, sport, daddr, dport, sequence,
- data_bytes, ack, gso_size):
+ def tt_tcp_nic(self, trace, t, core, source, dest, data_bytes, seq_ack):
+ tcp_pkt = get_tcp_packet(source, dest, data_bytes, seq_ack)
 node = trace['node']
- if not saddr in peer_nodes and saddr != '0x00000000':
- peer_nodes[saddr] = node
-
- # Break TSO packets up into multiple packets, matching what will
- # be received on the other end.
- bytes_left = data_bytes
- pkt_sequence = sequence
- while True:
- pkt_bytes = bytes_left
- if pkt_bytes > gso_size and gso_size != 0:
- pkt_bytes = gso_size
- tcp_pkt = get_tcp_packet(saddr, sport, daddr, dport, pkt_sequence,
- pkt_bytes, ack)
- if 'nic' in tcp_pkt and data_bytes > 0:
- # Retransmitted packet: retain only the last transmission.
- if 'gro' in tcp_pkt and tcp_pkt['gro'] < t:
- del tcp_pkt['gro']
- tcp_pkt['nic'] = t
- tcp_pkt['tx_node'] = node
- if pkt_sequence == 1749134782:
- print('tt_xmit_tcp setting tx_node to %s' % (node))
- if pkt_sequence == sequence:
- tcp_pkt['tso_length'] = data_bytes
- bytes_left -= pkt_bytes
- pkt_sequence = (pkt_sequence + pkt_bytes) & 0xffffffff
- if bytes_left <= 0:
- break
+ tcp_pkt['nic'] = t
+ tcp_pkt['tso_length'] = data_bytes
+ tcp_pkt['tx_node'] = node
+ set_tcp_ip_node(source, node)

- def tt_free_tcp(self, trace, t, core, saddr, sport, daddr, dport, sequence,
- data_bytes, ack, qid):
- tcp_pkt = get_tcp_packet(saddr, sport, daddr, dport, sequence,
- data_bytes, ack)
+ def tt_tcp_free(self, trace, t, core, source, dest, data_bytes, seq_ack,
+ qid):
+ tcp_pkt = get_tcp_packet(source, dest, data_bytes, seq_ack)
 node = trace['node']
 tcp_pkt['free_tx_skb'] = t
+ tcp_pkt['tso_length'] = data_bytes
 tcp_pkt['tx_qid'] = qid
 tcp_pkt['tx_node'] = node
- if sequence == 1749134782:
- print('%9.3f: tt_free_tcp setting tx_node to %s' % (t, node))
- if not saddr in peer_nodes and saddr != '0x00000000':
- peer_nodes[saddr] = node
-
- def tt_gro_tcp(self, trace, t, core, saddr, sport, daddr, dport, sequence,
- data_bytes, total, ack):
- tcp_pkt = get_tcp_packet(saddr, sport, daddr, dport, sequence,
- data_bytes, ack)
+ set_tcp_ip_node(source, node)
+
+ def tt_tcp_gro(self, trace, t, core, source, dest, data_bytes, seq_ack):
+ global tcp_hdr_length
+
+ tcp_pkt = get_tcp_packet(source, dest, data_bytes, seq_ack)
 node = trace['node']
- if 'nic' in tcp_pkt and t < tcp_pkt['nic']:
- # This packet was apparently retransmitted; we want to retain
- # only the last transmission, but this event appears to be from
- # an earlier transmision; ignore it.
- return
+ tcp_pkt['length'] = data_bytes
 tcp_pkt['gro'] = t
- tcp_pkt['total_length'] = total
+ tcp_pkt['total_length'] = data_bytes + tcp_hdr_length
 tcp_pkt['rx_node'] = node
- if not daddr in peer_nodes and daddr != '0x00000000':
- peer_nodes[daddr] = node
+ set_tcp_ip_node(dest, node)
+
+ def cleanup_tcp_pkts(self):
+ """
+ This method post-processes all of the TCP packets to fill in missing
+ fields.
+ """
+ global tcp_packets
+
+ # "source dest" -> list of data packets (nonzero length) from
+ # source to dest, where source and dest come from fields in packets
+ # with the same name.
+ stream_pkts = defaultdict(list)
+
+ # Pass 1: divide data packets into buckets for unidirectional
+ # streams, and also fill in a few fields.
+ for pkt in tcp_packets.values(): + if not 'tx_node' in pkt: + pkt['tx_node'] = get_tcp_node(pkt['source']) + if not 'rx_node' in pkt: + pkt['rx_node'] = get_tcp_node(pkt['dest']) + if not 'length' in pkt: + if not 'tso_length' in pkt: + print('No tso_length in packet: %s' % (pkt)) + pkt['length'] = pkt['tso_length'] + + if pkt['length'] == 0: + continue + stream_pkts[f"{pkt['source']} {pkt['dest']}"].append(pkt) + + # Pass 2: process the packets in a stream in sequence order, in + # order to copy information from a source TSO packet into each of + # the segments generated from it. + for pkts in stream_pkts.values(): + tso_pkt = None + tso_end = None + for pkt in sorted(pkts, key = lambda pkt: pkt['seq_ack']): + if 'tso_length' in pkt: + tso_pkt = pkt + tso_end = pkt['seq_ack'] + pkt['tso_length'] + continue + if tso_pkt == None or pkt['seq_ack'] >= tso_end: + continue + tso_pkt['segments'].append(pkt) + for field in ['xmit', 'qdisc_xmit', 'xmit2', 'tx_qid', + 'nic', 'free_tx_skb']: + if field in tso_pkt: + pkt[field] = tso_pkt[field] def analyze(self): """ @@ -7707,8 +8035,8 @@ def analyze(self): pkt['tso_length'] = tso_length if not 'rx_node' in pkt: - if 'peer' in tx_rpc and tx_rpc['peer'] in peer_nodes: - pkt['rx_node'] = peer_nodes[tx_rpc['peer']] + if 'peer' in tx_rpc and tx_rpc['peer'] in ip_to_node: + pkt['rx_node'] = ip_to_node[tx_rpc['peer']] if 'qdisc_xmit' in pkt: pkt['xmit2'] = pkt['qdisc_xmit'] @@ -7754,6 +8082,8 @@ def analyze(self): for pid, pkt in new_pkts: packets[pid] = pkt + self.cleanup_tcp_pkts() + #------------------------------------------------ # Analyzer: pairs #------------------------------------------------ @@ -8009,12 +8339,12 @@ def analyze(self): trace = None if 'gro' in pkt: t = pkt['gro'] - if pkt['saddr'] in peer_nodes: - trace = traces[peer_nodes[pkt['saddr']]] + if pkt['saddr'] in ip_to_node: + trace = traces[ip_to_node[pkt['saddr']]] else: t = pkt['nic'] - if pkt['daddr'] in peer_nodes: - trace = traces[peer_nodes[pkt['daddr']]] + if pkt['daddr'] in ip_to_node: + trace = traces[ip_to_node[pkt['daddr']]] if (trace != None and trace['first_time'] < (t-1000) and trace['last_time'] > (t+1000)): print('%9.3f: incomplete TCP packet for %s:%d to %s:%d (peer %s, ' @@ -8624,7 +8954,7 @@ def analyze(self): """ Fill in various additional information related to RPCs """ - global rpcs, traces, peer_nodes + global rpcs, traces, ip_to_node for id, rpc in rpcs.items(): peer_id = id ^ 1 @@ -8636,8 +8966,8 @@ def analyze(self): # Fill in peer_nodes if 'peer' in rpc: peer = rpc['peer'] - if (not peer in peer_nodes) and peer_rpc: - peer_nodes[peer] = peer_rpc['node'] + if (not peer in ip_to_node) and peer_rpc: + ip_to_node[peer] = peer_rpc['node'] # Deduce out_length if not already present. 
if not 'out_length' in rpc:
@@ -8683,7 +9013,7 @@ def __init__(self, dispatcher):
 return

 def output(self):
- global rpcs, peer_nodes, options
+ global rpcs, ip_to_node, options

 # List with one entry for each short RPC, containing a tuple
 # where rtt is the round-trip
@@ -8703,7 +9033,7 @@ def output(self):
 continue
 rtts.append([rpc['recvmsg_done'] - rpc['sendmsg'], id,
 rpc['sendmsg'], rpc['recvmsg_done'], rpc['node'],
- peer_nodes[rpc['peer']]])
+ ip_to_node[rpc['peer']]])

 rtts.sort(key=lambda t : t[0])

@@ -9541,7 +9871,7 @@ def output(self):
 info += '%sgranted %d' % (prefix, got_gro)
 prefix = ', '
 if 'peer' in rx_rpc:
- info += '%speer %s' % (prefix, peer_nodes[rx_rpc['peer']])
+ info += '%speer %s' % (prefix, ip_to_node[rx_rpc['peer']])
 if info:
 info += ')'

@@ -9726,6 +10056,287 @@ def output(self):
 start, end, node = smi
 print('%9.3f %9.3f %6.1f %s' % (start, end, end - start, node))

+#------------------------------------------------
+# Analyzer: tcp_rpcs
+#------------------------------------------------
+class AnalyzeTcp_rpcs:
+ """
+ Print information about RPCs that were transmitted with TCP. The options
+ --msglen, --rpc-start, and --rtt may be used to filter the RPCs to print.
+ By default the RPCs are printed in order of start time, but that may be
+ changed with the --sort option. The --sort option is a list of the
+ column names Start, End, and Rtt; the RPCs will be sorted by each keyword
+ in order before printing. If --verbose is specified then the packets from
+ the selected RPCs are also printed.
+ """
+
+ def __init__(self, dispatcher):
+ dispatcher.interest('AnalyzePackets')
+
+ # "source dest" -> list of entries in tcp_rpcs whose client and
+ # server fields match source and dest.
+ self.rpcs = defaultdict(list)
+
+ # "source dest" -> list of <time, sequence, length> tuples for
+ # all of the recvmsg completions for this stream. Time is the time
+ # when recvmsg completed, sequence is the sequence number
+ # just after the last one returned, and length is the number of
+ # bytes returned.
+ self.recvs = defaultdict(list)
+
+ def tt_tcp_sendmsg(self, trace, t, core, source, dest, msg_length,
+ sequence, slot, response):
+ global tcp_rpcs
+
+ # Create a new entry in tcp_rpcs for this message. At this point
+ # we don't have enough information to pair request and response
+ # messages, so create separate "rpcs" for each, pretending all
+ # messages are requests. The analyze method will combine requests
+ # and responses into a single entry.
+ rpc = {
+ 'client': source,
+ 'server': dest,
+ 'req_send': t,
+ 'req_length': msg_length,
+ 'req_seq': sequence,
+ 'req_end_seq': sequence + msg_length,
+ 'slot': slot,
+ 'req_pkts': [],
+ 'resp_pkts': []
+ }
+ if response:
+ rpc['response'] = 1
+ tcp_rpcs[f'{source} {dest} {sequence}'] = rpc
+ self.rpcs[f'{source} {dest}'].append(rpc)
+
+ def tt_tcp_recvmsg(self, trace, t, core, source, dest, msg_length,
+ sequence):
+ self.recvs[f'{source} {dest}'].append([t, sequence, msg_length])
+
+ def del_rpc(self, rpc):
+ """
+ Remove an entry from tcp_rpcs.
+ rpc: Entry to remove (it's incomplete: describes either a
+ request or a response but not both)
+ """
+ global tcp_rpcs
+
+ del tcp_rpcs[f"{rpc['client']} {rpc['server']} {rpc['req_seq']}"]
+
+ def merge(self, request, response):
+ """
+ Move information from an RPC that contains only a response to
+ an RPC that currently contains only a request.
+ request: Entry in tcp_rpcs that describes a request message
+ response: Entry in tcp_rpcs that describes the response
+ corresponding to request.
This will be deleted.
+ """
+ request['resp_send'] = response['req_send']
+ request['resp_seq'] = response['req_seq']
+ request['resp_length'] = response['req_length']
+ request['resp_end_seq'] = response['req_end_seq']
+ request['resp_pkts'] = response['req_pkts']
+ if 'req_recvd' in response:
+ request['resp_recvd'] = response['req_recvd']
+ self.del_rpc(response)
+
+ def analyze(self):
+ """
+ Finish the creation of tcp_rpcs.
+ """
+ global tcp_packets, tcp_rpcs
+
+ # "source dest" -> list of data packets (nonzero length) from
+ # source to dest.
+ stream_pkts = defaultdict(list)
+
+ # Bucket TCP packets in the same way as self.rpcs.
+ for pkt in tcp_packets.values():
+ if pkt['length'] == 0:
+ continue
+ key = '%s %s' % (pkt['source'], pkt['dest'])
+ stream_pkts[key].append(pkt)
+
+ # Assign TCP packets to messages. Each iteration through this
+ # loop processes a source-dest pair, working through messages
+ # and packets in sequence order. A single packet could contain
+ # parts of multiple messages.
+ for key, rpcs in self.rpcs.items():
+ rpcs.sort(key = lambda rpc: rpc['req_seq'])
+ pkts = sorted(stream_pkts[key],
+ key = lambda pkt: pkt['seq_ack'])
+ pkt_ix = 0
+
+ for rpc in rpcs:
+ rpc_start = rpc['req_seq']
+ rpc_end = rpc['req_end_seq']
+ while pkt_ix < len(pkts):
+ pkt = pkts[pkt_ix]
+ pkt_start = pkt['seq_ack']
+ pkt_end = pkt_start + pkt['length']
+ if pkt_end <= rpc_start:
+ pkt_ix += 1
+ continue
+ if pkt_start >= rpc_end:
+ break
+ rpc['req_pkts'].append(pkt)
+ if pkt_end <= rpc_end:
+ pkt_ix += 1
+ else:
+ break
+
+ # Add req_recvd times to RPCs.
+ for key, rpcs in self.rpcs.items():
+ rpcs.sort(key = lambda rpc: rpc['req_end_seq'])
+ recvs = sorted(self.recvs[key], key = lambda t: t[1],
+ reverse = True)
+
+ for rpc in rpcs:
+ while recvs:
+ t, sequence, length = recvs[-1]
+ rpc_end = rpc['req_end_seq']
+ if sequence < rpc_end:
+ recvs.pop()
+ continue
+ if sequence - length < rpc_end:
+ rpc['req_recvd'] = t
+ break
+
+ # Find matching pairs of request and response messages and merge
+ # them into single (and complete) RPCs.
+ for key, rpcs in self.rpcs.items():
+ # slot -> List of requests in the forward direction and responses
+ # in the reverse direction for this slot and client-server pair,
+ # sorted by sendmsg time.
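+ # A request and its response are sent on the same slot, so
+ # they can be paired by alternating request and response
+ # entries in each slot's time-ordered list.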
+ slot_rpcs = defaultdict(list)
+
+ for rpc in rpcs:
+ if 'response' in rpc:
+ continue
+ slot_rpcs[rpc['slot']].append(rpc)
+ source, dest = key.split()
+ for rpc in self.rpcs[f'{dest} {source}']:
+ if not 'response' in rpc:
+ continue
+ slot_rpcs[rpc['slot']].append(rpc)
+ for rpcs in slot_rpcs.values():
+ rpcs.sort(key = lambda msg: msg['req_send'])
+ request = None
+ for rpc in rpcs:
+ if 'response' in rpc:
+ if (request == None or
+ rpc['req_send'] <= request['req_send']):
+ # Unmatchable response
+ self.del_rpc(rpc)
+ continue
+ self.merge(request, rpc)
+ request = None
+ else:
+ if request != None:
+ # Unmatchable request
+ self.del_rpc(request)
+ request = rpc
+ if request != None:
+ # Unmatchable trailing request
+ self.del_rpc(request)
+
+ def filter_rpcs(self, rpcs, options):
+ """
+ Returns a list of all the TCP RPCs that match a set of command-line
+ options.
+ rpcs: List of TCP RPCs to filter (must be entries in tcp_rpcs)
+ options: Command-line options to use for filtering; see below for
+ valid options.
+ """
+ if options.msglen != None:
+ min_length, max_length = get_range(options.msglen,
+ option_name='--msglen', one_value=True)
+ if max_length == None:
+ max_length = min_length
+ min_length = 0
+ if options.rpc_start != None:
+ min_start, max_start = get_range(options.rpc_start,
+ parse_float=True, option_name='--rpc-start')
+ if options.rtt != None:
+ min_rtt, max_rtt = get_range(options.rtt, parse_float = True,
+ option_name='--rtt')
+
+ result = []
+ for rpc in rpcs:
+ if options.msglen != None:
+ if not 'req_length' in rpc:
+ continue
+ length = rpc['req_length']
+ if length < min_length or length > max_length:
+ continue
+ if options.rpc_start != None:
+ if not 'req_send' in rpc:
+ continue
+ start = rpc['req_send']
+ if start < min_start or start > max_start:
+ continue
+ if options.rtt != None:
+ if not 'req_send' in rpc or not 'resp_recvd' in rpc:
+ continue
+ rtt = rpc['resp_recvd'] - rpc['req_send']
+ if rtt < min_rtt or rtt > max_rtt:
+ continue
+ result.append(rpc)
+ return result
+
+ def output(self):
+ global tcp_rpcs, options
+
+ print('\n------------------')
+ print('Analyzer: tcp_rpcs')
+ print('------------------')
+
+ print_rpcs = self.filter_rpcs(tcp_rpcs.values(), options)
+
+ if (options.msglen != None or options.rpc_start != None or
+ options.rtt != None):
+ print('%d TCP RPCs were selected using the following filters:' %
+ (len(print_rpcs)))
+ if options.msglen:
+ print(' --msglen %s' % (options.msglen))
+ if options.rpc_start:
+ print(' --rpc-start %s' % (options.rpc_start))
+ if options.rtt:
+ print(' --rtt %s' % (options.rtt))
+ else:
+ print('There are %d TCP RPCs in the traces:' % (len(print_rpcs)))
+
+ sort_keys = options.sort
+ if sort_keys == None:
+ sort_keys = 'Start'
+ for key in sort_keys.split():
+ if key == 'Start':
+ print_rpcs.sort(key = lambda rpc:
+ rpc['req_send'] if 'req_send' in rpc else 1e20)
+ elif key == 'End':
+ print_rpcs.sort(key = lambda rpc:
+ rpc['resp_recvd'] if 'resp_recvd' in rpc else 1e20)
+ elif key == 'Rtt':
+ print_rpcs.sort(reverse = True, key = lambda rpc:
+ rpc['resp_recvd'] - rpc['req_send']
+ if 'resp_recvd' in rpc and 'req_send' in rpc else 0)
+ else:
+ raise Exception('Unknown sort key \'%s\' for tcp_rpcs '
+ 'analyzer' % (key))
+
+ print(print_tcp_rpcs(print_rpcs, header = True), end='')
+
+ if options.verbose:
+ first = True
+ print('\nPackets from the selected RPCs (in the same RPC order as')
+ print('above):')
+ for rpc in print_rpcs:
+ if not first:
+ print()
+ print(print_pkts(rpc['req_pkts'],
header=first), end='') + print(print_pkts(rpc['resp_pkts'], header=False), end='') + first = False + #------------------------------------------------ # Analyzer: temp #------------------------------------------------ @@ -9818,7 +10429,7 @@ def output_slow_rpcs(self): rtt = rpc['recvmsg_done'] - rpc['sendmsg'] print('RPC id %d (%s -> %s) took %.1f usecs, length %d, %.3f -> %.3f' % (rpc['id'], rpc['node'], - peer_nodes[peer] if peer in peer_nodes else peer, + ip_to_node[peer] if peer in ip_to_node else peer, rtt, rpc['out_length'], rpc['sendmsg'], rpc['recvmsg_done'])) if rpc['send_data_pkts']: pkt = rpc['send_data_pkts'][0] @@ -10309,13 +10920,7 @@ def output(self): node_pkts = defaultdict(list) # Bucket all of the packets by transmitting node. - for pkt in packets.values(): - if not 'xmit' in pkt or not 'tso_length' in pkt: - continue - if not 'gro' in pkt: - continue - node_pkts[pkt['tx_node']].append(pkt) - for pkt in tcp_packets.values(): + for pkt in itertools.chain(packets.values(), tcp_packets.values()): if not 'xmit' in pkt or not 'tso_length' in pkt: continue if not 'gro' in pkt: @@ -10475,7 +11080,10 @@ def output(self): # Create a data file for this node with packets in time order # (or whatever order was requested on the command line). - pkts = sort_pkts(pkts, options.sort) + key = options.sort + if key == None: + key = 'Xmit' + pkts = sort_pkts(pkts, key) f = open('%s/txpkts_%s.dat' % (options.data, node), 'w') f.write('# Node: %s\n' % (node)) @@ -10858,6 +11466,13 @@ def output(self): 'packets to include for analysis; a list of the values \'data\' for ' 'Homa data packets, \'tcp\' for TCP packets, and \'grant\' for Homa ' 'grants, or \'all\' to select all types (default: \'homa\')') +parser.add_option('--rpc-start', dest='rpc_start', default=None, + metavar='T', help='Used by some analyzers to filter RPCs based on ' + 'starting time; contains two values (min and max, inclusive).') +parser.add_option('--rtt', dest='rtt', default=None, + metavar='T', help='Used by some analyzers to filter RPCs based on ' + 'end-to-end round trip time; contains two values (min and max, ' + 'inclusive).') parser.add_option('--rx-core', dest='rx_core', type=int, default=None, metavar='C', help='If specified, some analyzers will ignore packets ' 'transmitted from cores other than C') @@ -10872,11 +11487,11 @@ def output(self): help='By default some analyzers will consider only the first segment ' 'of packets that are segmented by TSO segmentation; if this option ' 'is specified then they will consider all of the derived segments') -parser.add_option('--sort', dest='sort', default='Xmit', +parser.add_option('--sort', dest='sort', default=None, metavar='S', help='Used by some analyzers to select a field to use ' - 'for sorting packets. 
Must be \'Xmit\', \'Nic\', \'Gro\', \'SoftIRQ\', '
- 'or \'Free\' (default: \'Xmit\')')
-parser.add_option('--threshold', dest='threshold', type=int, default=100,
+ 'for sorting packets; legal values and the default depend on the '
+ 'analyzer')
+parser.add_option('--threshold', dest='threshold', type=int, default=50,
 metavar='T', help='Used by some analyzers as a threshold time value, '
- 'in microseconds (default: 100)')
+ 'in microseconds (default: 50)')
 parser.add_option('--time', dest='time', type=float, default=None,

From 428b1c68e6e8e7566ba66b2ccc942bd96c816899 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Wed, 10 Dec 2025 17:25:28 -0800
Subject: [PATCH 595/625] Rework tt patterns to speed up parsing in tthoma.py

Special characters at the start of a pattern caused all patterns to end
up in a single bucket.
---
 util/tthoma.py | 43 +++++++++++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/util/tthoma.py b/util/tthoma.py
index b324359e..e65f57c8 100755
--- a/util/tthoma.py
+++ b/util/tthoma.py
@@ -1457,6 +1457,7 @@ def parse(self, file):
 # Parse each line in 2 phases: first the time and core information
 # that is common to all patterns, then the message, which will
 # select at most one pattern.
+ # print('\n%s' % (trace['line'].rstrip()))
 self.trace_lines += 1
 self.regex_tries += 1
 match = prefix_matcher.match(trace['line'])
@@ -1474,6 +1475,7 @@ def parse(self, file):
 if prefix in self.parse_table:
 for pattern in self.parse_table[prefix]:
 self.regex_tries += 1
+ # print(' %s' % (pattern['regexp']))
 match = pattern['cregexp'].match(msg)
 if match:
 pattern['matches'] += 1
@@ -1514,6 +1516,11 @@ def print_stats(self):
 print('(%.1f usec/line, %.1f usec/regex attempt)' % (
 ((self.parse_ns/self.trace_lines)*1e-3),
 ((self.parse_ns/self.regex_tries)*1e-3)))
+ sum = 0
+ for bucket in self.parse_table.values():
+ sum += len(bucket)
+ print('Parse table has %d patterns in %d buckets' % (
+ sum, len(self.parse_table)))

 def __build_parse_table(self):
 """
@@ -1540,7 +1547,11 @@ def __build_parse_table(self):
 else:
 length = match.start()
 if length < self.prefix_length:
- self.prefix_length = length;
+ self.prefix_length = length
+ if length == 0:
+ print('Warning: parse table has only 1 bucket because '
+ 'of the following pattern:\n %s' % (
+ pattern['regexp']), file = sys.stderr)

 # Pass 2: fill in self.parse_table
 for pattern in self.patterns:
@@ -1698,7 +1709,7 @@ def __nic_data(self, trace, time, core, match, interests):

 patterns.append({
 'name': 'nic_data',
- 'regexp': '(mlx|ice) sent homa data packet to ([^,]+), id ([0-9]+), '
+ 'regexp': 'sent homa data packet via (mlx|ice) to ([^,]+), id ([0-9]+), '
 'offset ([0-9]+), queue (0x[0-9a-f]+)'
 })

@@ -1712,22 +1723,22 @@ def __nic_grant(self, trace, time, core, match, interests):

 patterns.append({
 'name': 'nic_grant',
- 'regexp': '(mlx|ice) sent homa grant to ([^,]+), id ([0-9]+), '
+ 'regexp': 'sent homa grant via (mlx|ice) to ([^,]+), id ([0-9]+), '
 'offset ([0-9]+), queue (0x[0-9a-f]+)'
 })

 def __free_tx_skb(self, trace, time, core, match, interests):
- id = int(match.group(2))
- offset = int(match.group(3))
- qid = int(match.group(4))
- msg_length = int(match.group(5))
+ id = int(match.group(1))
+ offset = int(match.group(2))
+ qid = int(match.group(3))
+ msg_length = int(match.group(4))
 for interest in interests:
 interest.tt_free_tx_skb(trace, time, core, id, offset, qid,
 msg_length)

 patterns.append({
 'name': 'free_tx_skb',
- 'regexp': '(mlx|ice) freeing tx skb for homa data, id ([0-9]+), '
+ 'regexp': 'freeing tx skb for homa
data, id ([0-9]+), ' 'offset ([0-9]+), qid ([0-9]+), msg_length ([0-9]+)' }) @@ -2304,33 +2315,33 @@ def __tcp_nic(self, trace, time, core, match, interests): patterns.append({ 'name': 'tcp_nic', - 'regexp': '(mlx|ice) sent TCP packet from (0x[a-f0-9]+) to ' + 'regexp': 'sent TCP packet via (mlx|ice) from (0x[a-f0-9]+) to ' '(0x[a-f0-9]+), data bytes ([0-9]+), seq/ack ([0-9]+)' }) def __tcp_free2(self, trace, time, core, match, interests): - self.core_saved[core]['tcp_free_qid'] = int(match.group(2)) + self.core_saved[core]['tcp_free_qid'] = int(match.group(1)) patterns.append({ 'name': 'tcp_free2', - 'regexp': '(mlx|ice) freeing TCP skb for qid ([0-9]+)' + 'regexp': 'freeing TCP skb for qid ([0-9]+)' }) def __tcp_free(self, trace, time, core, match, interests): saved = self.core_saved[core] if not 'tcp_free_qid' in saved: return - source = match.group(2) - dest = match.group(3) - data_bytes = int(match.group(4)) - seq_ack = int(match.group(5)) + source = match.group(1) + dest = match.group(2) + data_bytes = int(match.group(3)) + seq_ack = int(match.group(4)) for interest in interests: interest.tt_tcp_free(trace, time, core, source, dest, data_bytes, seq_ack, saved['tcp_free_qid']) patterns.append({ 'name': 'tcp_free', - 'regexp': '(mlx|ice) freeing TCP skb from (0x[a-f0-9]+) to ' + 'regexp': 'freeing TCP skb from (0x[a-f0-9]+) to ' '(0x[a-f0-9]+), data bytes ([0-9]+), seq/ack ([0-9]+)' }) From f086ee1ecacf83ade875081005c94f4bcc6d411e Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 12 Dec 2025 16:49:49 -0800 Subject: [PATCH 596/625] Improve error message in cperf.py --- util/cperf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/util/cperf.py b/util/cperf.py index 34927be8..cc2d80b3 100644 --- a/util/cperf.py +++ b/util/cperf.py @@ -1124,7 +1124,8 @@ def scan_logs(): for key in ["client_gbps", "client_kops", "server_gbps", "server_kops"]: if not key in totals: - log("%s missing in node log files" % (key)) + log("%s missing in node log files for experiment %s" % ( + key, name)) totals[key] = 0 log("\nClients for %s experiment: %d nodes, %.2f Gbps, %.1f Kops/sec " From 2fac1a51b3b56cd1f5713fd9b9d5fa7965e310a0 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 12 Dec 2025 16:50:15 -0800 Subject: [PATCH 597/625] Restore cp_tcp_config to life and add nic-queue config option --- util/cp_tcp_config | 203 ++++++++++++++++++++------------------------- 1 file changed, 92 insertions(+), 111 deletions(-) diff --git a/util/cp_tcp_config b/util/cp_tcp_config index 3cdf6e70..1ab427f1 100755 --- a/util/cp_tcp_config +++ b/util/cp_tcp_config @@ -4,7 +4,7 @@ # SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This cperf benchmark measures TCP and DCTCP while varying one or more -# aspects of Homa's configuration (such as duty cycle). +# aspects of Homa's configuration (such as number of server threads). # Type "cp_tcp_config --help" for documentation. 
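+#
+# Example invocation (hypothetical; assumes the cluster is already set up):
+#   cp_tcp_config --config nic_queue --dctcp true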
from cperf import * @@ -16,88 +16,86 @@ parser = get_parser(description= 'varies.', usage='%(prog)s [options]') parser.add_argument('-c', '--config', dest='config', - choices=['cports', 'sports', 'threads'], + choices=['cports', 'nic_queue', 'sports', 'threads'], required = True, help='Aspect of configuration to change') -parser.add_argument('--tcp', dest='tcp', type=boolean, - default=True, help="Boolean value: indicates whether measurements " - "should be run on TCP (default: true)") parser.add_argument('--dctcp', dest='dctcp', type=boolean, default=False, help="Boolean value:: indicates whether measurements " "should be run on DCTCP (default: false)") options = parser.parse_args() init(options) -servers = range(0, options.num_nodes) -clients = range(0, options.num_nodes) if options.workload != "": load_info = [[options.workload, options.gbps]] specs = [] -if options.config == 'threads': - for client, server in [[3, 6], [4, 8], [5, 10], [6, 12], [7, 14]]: - o = copy.deepcopy(options) - o.tcp_server_ports = server - o.tcp_client_ports = client - name = "s%dc%d" % (server, client) - specs.append({'options': o, 'exp_name': name, 'label': name}) -elif options.config == 'cports': +if options.config == 'cports': for ports in [2, 3, 4, 6, 8]: - o = copy.deepcopy(options) - o.tcp_client_ports = ports - specs.append({'options': o, - 'exp_name': "cports%d" % (ports), - 'label': "%d client ports" % (ports)}) + specs.append({'exp_name': "cports%d" % (ports), + 'label': "%d client ports" % (ports), + 'options': ['tcp_client_ports', ports]}) +elif options.config == 'nic_queue': + for usec in [5, 10, 20, 40]: + specs.append({'exp_name': "nicq%d" % (usec), + 'label': r'NIC queue max %d µsec' % (usec), + 'sysctl': ['.net.homa.max_nic_queue_ns', usec * 1000]}) elif options.config == 'sports': for ports in [6, 9, 12, 15, 18]: - o = copy.deepcopy(options) - o.tcp_server_ports = ports - specs.append({'options': o, - 'exp_name': "sports%d" % (ports), - 'label': "%d server ports" % (ports)}) + specs.append({ 'exp_name': "sports%d" % (ports), + 'label': "%d server ports" % (ports), + 'options': ['tcp_server_ports', ports]}) +elif options.config == 'threads': + for client, server in [[3, 6], [4, 8], [5, 10], [6, 12], [7, 14]]: + name = "s%dc%d" % (server, client) + specs.append({'exp_name': name, + 'label': name, + 'options': ['tcp_server_ports', server, + 'tcp_client_ports', client]}) +# sysctl parameter name -> old value to restore. +old_values = {} if not options.plot_only: - congestion = get_sysctl_parameter("net.ipv4.tcp_congestion_control") + congestion = get_sysctl_parameter("net.ipv4.tcp_congestion_control", + options.nodes[0]) try: # For each workload, run a set of experiments with different # configurations. 
for workload, bw in load_info: - o = copy.deepcopy(options) - o.protocol = "homa" - o.workload = workload - o.client_ports = 1 - o.client_max = 1 - o.server_ports = 1 - o.server_nodes = 1 - o.first_server = 1 - o.unloaded = 500 - start_servers(range(1, 2), o) - run_experiment("unloaded_" + workload, range(0, 1), o) - for spec in specs: - o = options - if 'options' in spec: - o = spec['options'] + o = copy.deepcopy(options) o.protocol = "tcp" o.workload = workload o.gbps = bw/2.0 - start_servers(servers, o) - if options.tcp: - set_sysctl_parameter("net.ipv4.tcp_congestion_control", - "cubic", range(0, options.num_nodes)) - run_experiment("tcp_%s_%s" % (spec['exp_name'], workload), - clients, o) + if 'sysctl' in spec: + for i in range(0, len(spec['sysctl']), 2): + name = spec['sysctl'][i] + value = spec['sysctl'][i+1] + if name not in old_values: + old_values[name] = get_sysctl_parameter(name, + options.nodes[0]) + log("Setting %s = %s" % (name, value)) + set_sysctl_parameter(name, value, options.nodes) + if 'options' in spec: + for i in range(0, len(spec['options']), 2): + name = spec['options'][i] + value = spec['options'][i+1] + setattr(o, name, value) + exp_name = "%s_%s" % (spec['exp_name'], workload) + start_servers(exp_name, o.servers, o) if options.dctcp: set_sysctl_parameter("net.ipv4.tcp_congestion_control", - "dctcp", range(0, options.num_nodes)) - run_experiment("dctcp_%s_%s" % (spec['exp_name'], workload), - clients, o) + "dctcp", options.nodes) + run_experiment("%s_%s" % (spec['exp_name'], workload), + o.clients, o) except Exception as e: log(traceback.format_exc()) print("Resetting TCP congestion control to %s" % (congestion)) set_sysctl_parameter("net.ipv4.tcp_congestion_control", congestion, - range(0, options.num_nodes)) + options.nodes) + for name, value in old_values.items(): + print("Restoring %s to %s" % (name, value)) + set_sysctl_parameter(name, value, options.nodes) log("Stopping nodes") stop_nodes() @@ -105,66 +103,49 @@ if not options.plot_only: # Generate plots and reports for workload, bw in load_info: - set_unloaded("unloaded_" + workload) - log("Generating slowdown plots for %s" % (workload)) - if options.tcp: - title = "%s %d nodes, %.1f Gbps" % (workload.capitalize(), - options.num_nodes, bw) - ax = start_plot_vs_msg_length(title, 1000, - "tcp_%s_%s" % (specs[0]['exp_name'], workload), - y_label="TCP Slowdown") - for spec in specs: - exp_name = "tcp_%s_%s" % (spec['exp_name'], workload) - plot_slowdown(ax, exp_name, "p99", spec['label']+' P99') - for spec in specs: - exp_name = "tcp_%s_%s" % (spec['exp_name'], workload) - plot_slowdown(ax, exp_name, "p50", spec['label']+' P50') - ax.legend(loc="upper right", prop={'size': 9}) - plt.tight_layout() - plt.savefig("%s/reports/tcp_%s_%s.pdf" % - (options.log_dir, options.config, workload)) - - if options.dctcp: - title = "%s %d nodes, %.1f Gbps" % (workload.capitalize(), - options.num_nodes, bw) - ax = start_plot_vs_msg_length(title, 10000, - "dctcp_%s_%s" % (specs[0]['exp_name'], workload), - y_label="DCTCP Slowdown") - for spec in specs: - exp_name = "dctcp_%s_%s" % (spec['exp_name'], workload) - plot_slowdown(ax, exp_name, "p99", spec['label']+' P99') - for spec in specs: - exp_name = "dctcp_%s_%s" % (spec['exp_name'], workload) - plot_slowdown(ax, exp_name, "p50", spec['label']+' P50') - ax.legend(loc="upper right", prop={'size': 9}) - plt.tight_layout() - plt.savefig("%s/reports/dctcp_%s_%s.pdf" % - (options.log_dir, options.config, workload)) + title = "%s, %d %s nodes, %.1f Gbps" % (workload.capitalize(), + 
options.num_nodes, get_node_type(), bw) + ax = start_plot_vs_msg_length(title, 1000, + "%s_%s" % (specs[0]['exp_name'], workload), + y_label="Slowdown") + for spec in specs: + exp_name = "%s_%s" % (spec['exp_name'], workload) + plot_slowdown(ax, exp_name, "p99", spec['label']+' P99') + for spec in specs: + exp_name = "%s_%s" % (spec['exp_name'], workload) + plot_slowdown(ax, exp_name, "p50", spec['label']+' P50') + ax.legend(loc="upper right", prop={'size': 9}) + plt.tight_layout() + plt.savefig("%s/reports/%s_%s.pdf" % + (options.log_dir, options.config, workload)) + + log("Generating latency plots for %s" % (workload)) + title = "%s, %d %s nodes, %.1f Gbps" % (workload.capitalize(), + options.num_nodes, get_node_type(), bw) + ax = start_plot_vs_msg_length(title, [10, 10000], "%s_%s" % ( + specs[0]['exp_name'], workload), y_label=r'RTT (µsec)') + for spec in specs: + exp_name = "%s_%s" % (spec['exp_name'], workload) + plot_histogram(ax, exp_name, "p99", spec['label'] + ' P99') + for spec in specs: + exp_name = "%s_%s" % (spec['exp_name'], workload) + plot_histogram(ax, exp_name, "p50", spec['label'] + ' P50') + ax.legend(loc="lower right", prop={'size': 8}) + plt.tight_layout() + plt.savefig("%s/reports/%s_%s_rtt.pdf" % + (options.log_dir, options.config, workload)) log("Generating short message CDFs for %s" % (workload)) - if options.tcp: - title = "%s %d nodes" % (workload.capitalize(), options.num_nodes) - start_cdf_plot(title, 10, 0.99e05, 1e-05, "TCP RTT (usecs)", - "Cumulative Fraction of Short Messages") - for spec in specs: - exp_name = "tcp_%s_%s" % (spec['exp_name'], workload) - x, y = get_short_cdf(exp_name) - plt.plot(x, y, label=spec['label']) - - plt.legend(loc="upper right", prop={'size': 9}) - plt.savefig("%s/reports/tcp_%s_%s_cdfs.pdf" % - (options.log_dir, options.config, workload)) - - if options.dctcp: - title = "%s %d nodes" % (workload.capitalize(), options.num_nodes) - start_cdf_plot(title, 10, 0.99e05, 1e-05, "DCTCP RTT (usecs)", - "Cumulative Fraction of Short Messages") - for spec in specs: - exp_name = "dctcp_%s_%s" % (spec['exp_name'], workload) - x, y = get_short_cdf(exp_name) - plt.plot(x, y, label=spec['label']) - - plt.legend(loc="upper right", prop={'size': 9}) - plt.savefig("%s/reports/dctcp_%s_%s_cdfs.pdf" % - (options.log_dir, options.config, workload)) + title = "%s, %d %s nodes, %.1f Gbps" % (workload.capitalize(), + options.num_nodes, get_node_type(), bw) + start_cdf_plot(title, 10, 0.99e05, 1e-05, "RTT (usecs)", + "Cumulative Fraction of Short Messages") + for spec in specs: + exp_name = "%s_%s" % (spec['exp_name'], workload) + x, y = get_short_cdf(exp_name) + plt.plot(x, y, label=spec['label']) + + plt.legend(loc="upper right", prop={'size': 9}) + plt.savefig("%s/reports/%s_%s_cdfs.pdf" % + (options.log_dir, options.config, workload)) From 3266bfbd0b9fd5c2f4d686c940ddfd28dde25ea9 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 15 Dec 2025 11:45:52 -0800 Subject: [PATCH 598/625] Slight modifications to tt_records in homa_qdisc.c --- homa_qdisc.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/homa_qdisc.c b/homa_qdisc.c index 8422a764..c48856b9 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -487,14 +487,9 @@ void homa_qdisc_defer_tcp(struct homa_qdisc *q, struct sk_buff *skb) u64 now = homa_clock(); unsigned long flags; - tt_record4("homa_qdisc deferring TCP packet from 0x%08x to 0x%08x, " - "ports %x, length %d", - ntohl(ip_hdr(skb)->saddr), - ntohl(ip_hdr(skb)->daddr), - (ntohs(tcp_hdr(skb)->source) << 
16) + - ntohs(tcp_hdr(skb)->dest), - skb->len - skb_transport_offset(skb) - - tcp_hdrlen(skb)); + tt_record_tcp("homa_qdisc deferring TCP packet from " + "0x%x to 0x%x, data bytes %d, seq/ack %u", + skb, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); spin_lock_irqsave(&qdev->defer_lock, flags); __skb_queue_tail(&qdev->deferred_tcp, skb); @@ -600,7 +595,7 @@ int homa_qdisc_xmit_deferred_tcp(struct homa_qdisc_dev *qdev) homa_qdisc_update_link_idle(qdev, pkt_len, -1); if (ip_hdr(skb)->protocol == IPPROTO_TCP) tt_record_tcp("homa_qdisc_pacer requeued TCP packet from " - "0x%x to 0x%x, data bytes %d, seq/ack %d", + "0x%x to 0x%x, data bytes %d, seq/ack %u", skb, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); rcu_read_lock_bh(); From 9ac23d68fd21ae30edf8d0077af51391e2d8eb85 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 15 Dec 2025 11:47:22 -0800 Subject: [PATCH 599/625] Various improvements to tthoma.py --- util/tthoma.py | 350 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 233 insertions(+), 117 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index e65f57c8..22ed4b86 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -523,6 +523,51 @@ def extract_num(s): return int(match.group(1)) return None +def filter_tcp_rpcs(rpcs, msglen=None, rpc_start=None, rtt=None): + """ + Returns a list of all the TCP RPCs that match a set of command-line + options + rpcs: List of TCP RPCs to filter (must be entries in tcp_rpcs) + msglen: If not None, filter on msglen (see --msglen arg) + rpc_start: If not None, filter on RPC start time (see --rpc-start arg) + rtt: If not None, filter on round-trip time (see --rtt arg) + """ + if msglen != None: + min_length, max_length = get_range(msglen, + option_name='--msglen', one_value=True) + if max_length == None: + max_length = min_length + min_length = 0 + if rpc_start != None: + min_start, max_start = get_range(rpc_start, + parse_float=True, option_name='--rpc-start') + if rtt != None: + min_rtt, max_rtt = get_range(rtt, parse_float = True, + option_name='--rtt') + + result = [] + for rpc in rpcs: + if msglen != None: + if not 'req_length' in rpc: + continue + length = rpc['req_length'] + if length < min_length or length > max_length: + continue + if rpc_start != None: + if not 'req_send' in rpc: + continue + start = rpc['req_send'] + if start < min_start or start > max_start: + continue + if rtt != None: + if not 'req_send' in rpc or not 'resp_recvd' in rpc: + continue + rtt = rpc['resp_recvd'] - rpc['req_send'] + if rtt < min_rtt or rtt > max_rtt: + continue + result.append(rpc) + return result + def gbps(bytes, usecs): """ Compute the data rate in Gbps for data transmitted or received in @@ -715,7 +760,8 @@ def get_range(s, option_name=None, parse_float=False, one_value=True): def get_tcp_node(addr_port): """ - Return the name of the node corresponding to the argument. + Return the name of the node corresponding to the argument, or None + if no corresponding node could be found. addr_port: A hex string used in TCP timetrace entries: the lower 16 bits are a port number and the upper 16 bits are the low 16 bits of a node's IP address. 
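(As an illustrative sketch of the endpoint encoding that set_tcp_ip_node and
get_tcp_node agree on; the endpoint value and node name below are
hypothetical, not taken from a trace:)

    # Endpoint 0x12ab7631: the upper 16 bits (0x12ab) are the low-order
    # 16 bits of the sender's IPv4 address; the lower 16 bits (0x7631)
    # are the TCP port.
    endpoint = '0x12ab7631'
    key = endpoint[:-4]          # '0x12ab': drop the 4 port digits
    ip_to_node = {}
    ip_to_node[key] = 'node3'    # what set_tcp_ip_node records
    assert key in ip_to_node     # what get_tcp_node then looks up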
@@ -725,7 +771,7 @@ def get_tcp_node(addr_port):
     key = addr_port[:-4]
     if key in ip_to_node:
         return ip_to_node[key]
-    return key
+    return None
 
 def get_tcp_packet(source, dest, data_bytes, seq_ack):
     """
@@ -907,20 +953,6 @@ def get_xmit_time(offset, rpc, rx_time=1e20):
             return fallback
     return xmit
 
-def percentile(data, pct, format, na):
-    """
-    Finds the element of data corresponding to a given percentile pct
-    (0 is first, 100 or more is last), formats it according to format,
-    and returns the result. Returns na if the list is empty. Data must
-    be sorted in percentile order
-    """
-    if len(data) == 0:
-        return na
-    i = int(pct*len(data)/100)
-    if i >= len(data):
-        i = len(data) - 1
-    return format % (data[i])
-
 def pkt_id(id, offset):
     return '%d:%d' % (id, offset)
 
@@ -1006,6 +1038,21 @@ def print_if(value, fmt, modifier=None):
         return fmt % (value)
     return ''
 
+def print_pctl(values, pctl, fmt):
+    """
+    Return a formatted string describing a given percentile from a list
+    of values.
+    values: List of values, sorted from 0th percentile to 100th percentile.
+            If empty then an empty string is returned.
+    pctl:   Desired percentile, from 0-1000 (e.g. 900 selects P90)
+    fmt:    printf-style format string containing a single % specifier for
+            the selected percentile.
+    """
+    if len(values) == 0:
+        return ''
+    ix = len(values) * pctl // 1000
+    return fmt % (values[ix] if ix < len(values) else values[-1])
+
 def print_pkts(pkts, header=True):
     """
     Returns a string containing one line for each packet in pkts, which
@@ -1164,57 +1211,71 @@ def print_tcp_rpcs(rpcs, header=True):
         buf.write('Length: Length of request message\n')
         buf.write('ReqSeq: Sequence number of first byte of request\n')
         buf.write('RspSeq: Sequence number of first byte of response\n')
-        buf.write('ReqNic: Elapsed time from sendmsg until first '
+        buf.write('ReqXmit: Elapsed time from sendmsg until first '
                 'request packet handed\n')
         buf.write(' off to NIC\n')
-        buf.write('ReqGRO: Time from NIC handoff to GRO receipt for '
+        buf.write('ReqNet: Time from NIC handoff to GRO receipt for '
                 'first request packet\n')
-        buf.write('ReqRecv: Time from GRO for first request packet '
+        buf.write('ReqRecv: Time from GRO for last request packet '
                 'until recvmsg completes\n')
         buf.write(' on server\n')
         buf.write('Srvc: Time from recvmsg return on server until '
                 'sendmsg for response\n')
-        buf.write('RspNic: Elapsed time from sendmsg of response until '
+        buf.write('RspXmit: Elapsed time from sendmsg of response until '
                 'first packet handed\n')
         buf.write(' off to NIC\n')
-        buf.write('RspGRO: Time from NIC handoff to GRO receipt for '
+        buf.write('RspNet: Time from NIC handoff to GRO receipt for '
                 'first response packet\n')
-        buf.write('RspRecv: Time from GRO for first response packet '
+        buf.write('RspRecv: Time from GRO for last response packet '
                 'until End\n')
         buf.write('End: Time when response was returned to client\n')
         buf.write('Rtt: End - Start\n\n')
-        buf.write('Start Client Server Length ReqSeq RspSeq ')
-        buf.write('ReqNic ReqGRO ReqRecv Srvc ')
-        buf.write('RspNic RspGRO RspRecv End Rtt\n')
+        buf.write('Start Client Server Length ReqSeq RspSeq ')
+        buf.write('ReqXmit ReqNet ReqRecv Srvc ')
+        buf.write('RspXmit RspNet RspRecv End Rtt\n')
     for rpc in rpcs:
-        request_pkt = rpc['req_pkts'][0] if rpc['req_pkts'] else {}
-        response_pkt = rpc['resp_pkts'][0] if rpc['resp_pkts'] else {}
-        if 'nic' in request_pkt:
-            rqnic = '%.1f' % (request_pkt['nic'] - rpc['req_send'])
+        if rpc['req_pkts']:
+            first_req_pkt = rpc['req_pkts'][0]
+            last_req_pkt = 
rpc['req_pkts'][-1] else: - rqnic = '' - if 'gro' in request_pkt and 'nic' in request_pkt: - rqgro = '%.1f' % (request_pkt['gro'] - request_pkt['nic']) + first_req_pkt = [] + last_req_pkt = [] + if rpc['resp_pkts']: + first_resp_pkt = rpc['resp_pkts'][0] + last_resp_pkt = rpc['resp_pkts'][-1] else: - rqgro = '' - if 'gro' in request_pkt and 'req_recvd' in rpc: - rqrecv = '%.1f' % (rpc['req_recvd'] - request_pkt['gro']) + first_resp_pkt = [] + last_resp_pkt = [] + if 'resp_seq' in rpc: + resp_seq = '%d' % (rpc['resp_seq']) + else: + resp_seq = '' + if 'nic' in first_req_pkt: + rqxmit = '%.1f' % (first_req_pkt['nic'] - rpc['req_send']) + else: + rqxmit = '' + if 'gro' in first_req_pkt and 'nic' in first_req_pkt: + rqnet = '%.1f' % (first_req_pkt['gro'] - first_req_pkt['nic']) + else: + rqnet = '' + if 'gro' in last_req_pkt and 'req_recvd' in rpc: + rqrecv = '%.1f' % (rpc['req_recvd'] - last_req_pkt['gro']) else: rqrecv = '' if 'req_recvd' in rpc and 'resp_send' in rpc: srvc = '%.1f' % (rpc['resp_send'] - rpc['req_recvd']) else: srvc = '' - if 'nic' in response_pkt: - rspnic = '%.1f' % (response_pkt['nic'] - rpc['resp_send']) + if 'nic' in first_resp_pkt: + rspxmit = '%.1f' % (first_resp_pkt['nic'] - rpc['resp_send']) else: - rspnic = '' - if 'gro' in response_pkt and 'nic' in response_pkt: - rspgro = '%.1f' % (response_pkt['gro'] - response_pkt['nic']) + rspxmit = '' + if 'gro' in first_resp_pkt and 'nic' in first_resp_pkt: + rspnet = '%.1f' % (first_resp_pkt['gro'] - first_resp_pkt['nic']) else: - rspgro = '' - if 'gro' in response_pkt and 'resp_recvd' in rpc: - rsprecv = '%.1f' % (rpc['resp_recvd'] - response_pkt['gro']) + rspnet = '' + if 'gro' in last_resp_pkt and 'resp_recvd' in rpc: + rsprecv = '%.1f' % (rpc['resp_recvd'] - last_resp_pkt['gro']) else: rsprecv = '' if 'req_send' in rpc and 'resp_recvd' in rpc: @@ -1225,12 +1286,12 @@ def print_tcp_rpcs(rpcs, header=True): end = '%.3f' % (rpc['resp_recvd']) else: end = '' - line = ('%9.3f %-8s %-8s %7d %10d %10d' % ( + line = ('%9.3f %-8s %-8s %7d %10d %10s' % ( rpc['req_send'], get_tcp_node(rpc['client']), get_tcp_node(rpc['server']), rpc['req_length'], - rpc['req_seq'], rpc['resp_seq'])) - line += (' %7s %6s %7s %6s' % (rqnic, rqgro, rqrecv, srvc)) - line += (' %7s %6s %7s %9s %7s' % (rspnic, rspgro, rsprecv, end, rtt)) + rpc['req_seq'], resp_seq)) + line += (' %7s %6s %7s %6s' % (rqxmit, rqnet, rqrecv, srvc)) + line += (' %7s %6s %7s %9s %7s' % (rspxmit, rspnet, rsprecv, end, rtt)) buf.write(line.rstrip()) buf.write('\n') return buf.getvalue() @@ -2165,16 +2226,6 @@ def __qdisc_xmit(self, trace, time, core, match, interests): 'offset ([-0-9]+)' }) - def __tcp_xmit(self, trace, time, core, match, interests): - length = int(match.group(1)) - for interest in interests: - interest.tt_tcp_xmit(trace, time, core, length) - - patterns.append({ - 'name': 'tcp_xmit', - 'regexp': '__tcp_transmit_skb sent packet with ([0-9]+) bytes' - }) - def __snapshot_clock(self, trace, time, core, match, interests): usecs = int(match.group(1)) for interest in interests: @@ -5004,8 +5055,9 @@ def __init__(self, dispatcher): self.tcp_xmits = defaultdict(list) return - def tt_tcp_xmit(self, trace, t, core, length): - self.tcp_xmits[trace['node']].append([t, length]) + def tt_tcp_xmit(self, trace, t, core, source, dest, data_bytes, + seq_ack): + self.tcp_xmits[trace['node']].append([t, data_bytes]) def restrict_qid(self, qid): """ @@ -5166,6 +5218,9 @@ def analyze(self): else: tnic = None if tx_node != None: + if not tx_node in traces: + print('Bogus node name 
%s. Packet: %s' % (tx_node, pkt))
+                    print('\nTraces: %s' % (traces))
                 nic_start = traces[tx_node]['first_time']
             if 'free_tx_skb' in pkt:
                 tfree = pkt['free_tx_skb']
@@ -7203,8 +7258,8 @@ def __init__(self, dispatcher):
     def tt_send_grant(self, trace, t, core, id, offset, priority, increment):
         self.nodes[trace['node']].append([t, 34, 0, "homa_grant"])
 
-    def tt_tcp_xmit(self, trace, t, core, length):
-        self.nodes[trace['node']].append([t, length, 0, "tcp"])
+    def tt_tcp_xmit(self, trace, t, core, source, dest, data_bytes, seq_ack):
+        self.nodes[trace['node']].append([t, data_bytes, 0, "tcp"])
 
     def output(self):
         global options, traces, packets, dispatcher
@@ -7490,8 +7545,8 @@ def output(self):
 class AnalyzeP99short:
     """
     Selects the 1% of short RPCs (those with single-packet request and
-    response messages) and breaks down the delay both for the overall
-    RPCs and for their constituent packets.
+    response messages) with highest RTT and breaks down the delay both
+    for the overall RPCs and for their constituent packets.
     """
     def __init__(self, dispatcher):
         dispatcher.interest('AnalyzeRpcs')
@@ -7967,13 +8022,17 @@ def cleanup_tcp_pkts(self):
         # with the same name.
         stream_pkts = defaultdict(list)
 
-        # Pass 1: divide data packets into buckets for uinidirectional
+        # Pass 1: divide data packets into buckets for unidirectional
         # streams, and also fill in a few fields.
         for pkt in tcp_packets.values():
             if not 'tx_node' in pkt:
-                pkt['tx_node'] = get_tcp_node(pkt['source'])
+                node = get_tcp_node(pkt['source'])
+                if node != None:
+                    pkt['tx_node'] = node
             if not 'rx_node' in pkt:
-                pkt['rx_node'] = get_tcp_node(pkt['dest'])
+                node = get_tcp_node(pkt['dest'])
+                if node != None:
+                    pkt['rx_node'] = node
             if not 'length' in pkt:
                 if not 'tso_length' in pkt:
                     print('No tso_length in packet: %s' % (pkt))
@@ -10215,7 +10274,7 @@ def analyze(self):
 
         # Find matching pairs of request and response messages and merge
        # them into single (and complete) RPCs.
-        for key, rpcs in self.rpcs.items():
+        for key, rpcs in list(self.rpcs.items()):
            # slot -> List of requests in the forward direction and responses
            # in the reverse direction for this slot and client-server pair,
            # sorted by sendmsg time. 
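
# Why the matching loop above iterates over list(self.rpcs.items()) rather
# than the live dict view: the loop may call del_rpc(), which removes
# entries from self.rpcs, and mutating a dict during direct iteration
# raises RuntimeError in Python 3. A minimal standalone illustration
# (d, k and v are throwaway names used only in this sketch):
d = {'a': 1, 'b': 2}
for k, v in list(d.items()):    # iterate over a snapshot; safe to mutate d
    if v == 1:
        del d[k]                # would raise RuntimeError without list()
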
@@ -10251,50 +10310,6 @@ def analyze(self): # Unmatchable trailing request self.del_rpc(request) - def filter_rpcs(self, rpcs, options): - """ - Returns a list of all the TCP RPCs that match a set of command-line - options - rpcs: List of TCP RPCs to filter (must be entries in tcp_rpcs) - options: Command-line options to use for filtering; see below for - valid options - """ - if options.msglen != None: - min_length, max_length = get_range(options.msglen, - option_name='--msglen', one_value=True) - if max_length == None: - max_length = min_length - min_length = 0 - if options.rpc_start != None: - min_start, max_start = get_range(options.rpc_start, - parse_float=True, option_name='--rpc-start') - if options.rtt != None: - min_rtt, max_rtt = get_range(options.rtt, parse_float = True, - option_name='--rtt') - - result = [] - for rpc in rpcs: - if options.msglen != None: - if not 'req_length' in rpc: - continue - length = rpc['req_length'] - if length < min_length or length > max_length: - continue - if options.rpc_start != None: - if not 'req_send' in rpc: - continue - start = rpc['req_send'] - if start < min_start or start > max_start: - continue - if options.rtt != None: - if not 'req_send' in rpc or not 'resp_recvd' in rpc: - continue - rtt = rpc['resp_recvd'] - rpc['req_send'] - if rtt < min_rtt or rtt > max_rtt: - continue - result.append(rpc) - return result - def output(self): global tcp_rpcs, options @@ -10302,10 +10317,12 @@ def output(self): print('Analyzer: tcp_rpcs') print('------------------') - print_rpcs = self.filter_rpcs(tcp_rpcs.values(), options) - if (options.msglen != None or options.rpc_start != None or options.rtt != None): + print_rpcs = filter_tcp_rpcs(tcp_rpcs.values(), + msglen=options.msglen, + rpc_start=options.rpc_start, + rtt=options.rtt) print('%d TCP RPCs were selected using the following filters:' % (len(print_rpcs))) if options.msglen: @@ -10315,7 +10332,8 @@ def output(self): if options.rtt: print(' --rtt %s' % (options.rtt)) else: - print('There are %d TCP RPCs in the traces:' % (len(print_rpcs))) + print_rpcs = tcp_rpcs.values() + print('There are %d TCP RPCs in the traces' % (len(print_rpcs))) sort_keys = options.sort if sort_keys == None: @@ -10335,6 +10353,79 @@ def output(self): raise Exception('Unknwon sort key \'%s\' for tcp_rpcs ' 'analyzer' % (key)) + # Collect and print overall statistics about the RPCs. 
+ xmit = [] + net = [] + free = [] + recv = [] + srvc = [] + rtt = [] + for rpc in print_rpcs: + if rpc['req_pkts']: + first_req_pkt = rpc['req_pkts'][0] + last_req_pkt = rpc['req_pkts'][-1] + else: + first_req_pkt = [] + last_req_pkt = [] + if rpc['resp_pkts']: + first_resp_pkt = rpc['resp_pkts'][0] + last_resp_pkt = rpc['resp_pkts'][-1] + else: + first_resp_pkt = [] + last_resp_pkt = [] + if 'nic' in first_req_pkt: + xmit.append(first_req_pkt['nic'] - rpc['req_send']) + if 'nic' in first_resp_pkt: + xmit.append(first_resp_pkt['nic'] - rpc['resp_send']) + for pkt in itertools.chain(rpc['req_pkts'], rpc['resp_pkts']): + if 'gro' in pkt and 'nic' in pkt: + net.append(pkt['gro'] - pkt['nic']) + if 'free_tx_skb' in pkt and 'nic' in pkt: + free.append(pkt['free_tx_skb'] - pkt['nic']) + if 'gro' in last_req_pkt and 'req_recvd' in rpc: + recv.append(rpc['req_recvd'] - last_req_pkt['gro']) + if 'gro' in last_resp_pkt and 'resp_recvd' in rpc: + recv.append(rpc['resp_recvd'] - last_resp_pkt['gro']) + if 'req_recvd' in rpc and 'resp_send' in rpc: + srvc.append(rpc['resp_send'] - rpc['req_recvd']) + if 'req_send' in rpc and 'resp_recvd' in rpc: + rtt.append(rpc['resp_recvd'] - rpc['req_send']) + for l in [xmit, net, free, recv, srvc, rtt]: + l.sort() + + print('\nOverall statistics about the selected RPCs. Most of these ' + 'statistics') + print('combine data from request messages and response messages.') + print('Xmit: Time from sendmsg until driver queued first ' + 'packet for NIC') + print('Net: Time from NIC handoff to GRO receipt for packets') + print('Free: Time from when NIC received packet until packet ' + 'was returned') + print(' to Linux and freed') + print('Recv: Time from GRO for last packet in a message ' + 'until recvmsg completes') + print('Srvc: Time from recvmsg return on server until ' + 'sendmsg for response') + print('Rtt: Total time from request sendmsg until recvmsg ' + 'completes for response\n') + + print(' Min P10 P50 P90 P99 Max') + pctls = [0, 100, 500, 900, 990, 1000] + print('Xmit %8s %8s %8s %8s %8s %8s' % tuple( + print_pctl(xmit, p, '%.1f') for p in pctls)) + print('Net %8s %8s %8s %8s %8s %8s' % tuple( + print_pctl(net, p, '%.1f') for p in pctls)) + print('Free %8s %8s %8s %8s %8s %8s' % tuple( + print_pctl(free, p, '%.1f') for p in pctls)) + print('Recv %8s %8s %8s %8s %8s %8s' % tuple( + print_pctl(recv, p, '%.1f') for p in pctls)) + print('Srvc %8s %8s %8s %8s %8s %8s' % tuple( + print_pctl(srvc, p, '%.1f') for p in pctls)) + print('Rtt %8s %8s %8s %8s %8s %8s' % tuple( + print_pctl(rtt, p, '%.1f') for p in pctls)) + + # Print a summary line for each RPC. + print('\nSummary information for each selected RPC:') print(print_tcp_rpcs(print_rpcs, header = True), end='') if options.verbose: @@ -10357,11 +10448,36 @@ class AnalyzeTemp: debugging. Consult the code to see what it does right now. 
""" def __init__(self, dispatcher): - dispatcher.interest('AnalyzeRpcs') - dispatcher.interest('AnalyzePackets') + dispatcher.interest('AnalyzeTcp_rpcs') + # dispatcher.interest('AnalyzeRpcs') + # dispatcher.interest('AnalyzePackets') def output(self): - self.output_slow_pkts() + global tcp_rpcs + + qdisc = 0 + rpcs = filter_tcp_rpcs(tcp_rpcs.values(), msglen='1500') + rtts = [] + for rpc in rpcs: + if not 'req_send' in rpc or not 'resp_recvd' in rpc: + continue + if not rpc['req_pkts'] or not rpc['resp_pkts']: + continue + if ('qdisc_xmit' in rpc['req_pkts'][0] or + 'qdisc_xmit' in rpc['resp_pkts'][0]): + qdisc += 1 + continue + rtts.append(rpc['resp_recvd'] - rpc['req_send']) + + rtts.sort() + print('%d RPCS smaller than 1500 bytes left after filtering out ' + '%d deferred' % (len(rtts), qdisc)) + print('Min RTT: %8s' % (print_pctl(rtts, 0, '%.1f'))) + print('P10: %8s' % (print_pctl(rtts, 100, '%.1f'))) + print('P50: %8s' % (print_pctl(rtts, 500, '%.1f'))) + print('P90: %8s' % (print_pctl(rtts, 900, '%.1f'))) + print('P99: %8s' % (print_pctl(rtts, 990, '%.1f'))) + print('Max RTT: %8s' % (print_pctl(rtts, 1000, '%.1f'))) def output_slow_pkts(self): pkts = [] From 9c28b23fff107d797727303f43d0e5a70d0edde0 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 16 Dec 2025 14:15:38 -0800 Subject: [PATCH 600/625] Add nicsnapshot analyzer to tthoma.py --- util/tthoma.py | 287 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 246 insertions(+), 41 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index 22ed4b86..90c5fa65 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -157,6 +157,7 @@ def __missing__(self, id): # offset is the offset in message of the first byte of the packet. Each # value is a dictionary containing the following fields (some may not # be present, depending on which events were present in the traces): +# type: Packet type: always 'data' # xmit: Time when ip*xmit was invoked # qdisc_defer: If the packet was deferred by homa_qdisc, gives the # time when the deferral decision was made. This field @@ -206,8 +207,8 @@ def __missing__(self, id): class PacketDict(dict): def __missing__(self, key): id_str, offset_str = key.split(':') - self[key] = {'id': int(id_str), 'offset': int(offset_str), - 'retransmits': []} + self[key] = {'type': 'data', 'id': int(id_str), + 'offset': int(offset_str), 'retransmits': []} return self[key] packets = PacketDict() @@ -220,6 +221,7 @@ def __missing__(self, key): # the RPC id on the sending side and offset is the offset in message of # the first byte of the packet. Each value is a dictionary containing # the following fields: +# type: Packet type: always 'grant' # xmit: Time when ip*xmit was invoked # nic: Time when the the packet was handed off to the NIC # gro: Time when GRO received (the first bytes of) the packet @@ -235,8 +237,8 @@ def __missing__(self, key): class GrantDict(dict): def __missing__(self, key): id_str, offset_str = key.split(':') - self[key] = {'id': int(id_str), 'offset': int(offset_str), - 'increment': 0} + self[key] = {'type': 'grant', 'id': int(id_str), + 'offset': int(offset_str), 'increment': 0} return self[key] grants = GrantDict() @@ -244,8 +246,7 @@ def __missing__(self, key): # It is created by AnalyzePackets. See get_tcp_packet for details on the keys # used to look up packets. Each value is a dictionary containing the following # fields: -# id: Always zero; this can be used to distinguish TCP packets from -# Homa packets, where there is always a nonzero id. 
+# type: Packet type: always 'tcp' # source: Hex string identifying source port for the packet: lower 16 # bits are port number, upper 16 bits are low-order 16-bits of # IPv4 address @@ -638,6 +639,27 @@ def get_granted(rpc, time): return max_offset return None +def get_hdr_length(pkt, tx=True): + """ + Returns the total amount of header data for a packet (i.e. everything + except message data). + pkt: A packet (either Homa data, Homa grant, or TCP) + tx: If true, compute the total headers for the transmitted packet, + which can include multiple segments for TSO frames. If false, + compute the header for the (single) received packet. + """ + global data_hdr_length, grant_pkt_length, tcp_hdr_length + + pkt_type = pkt['type'] + if pkt_type == 'data': + return data_hdr_length * (1 + tx * len(pkt['segments'])) + elif pkt_type == 'tcp': + return tcp_hdr_length * (1 + tx * len(pkt['segments'])) + elif pkt_type == 'grant': + return grant_pkt_length + else: + return 0 + def get_interval(node, usecs): """ Returns the interval dictionary corresponding to the arguments. A @@ -796,7 +818,7 @@ def get_tcp_packet(source, dest, data_bytes, seq_ack): key = f'{source} {dest} {seq_ack} ack' if key in tcp_packets: return tcp_packets[key] - pkt = {'id': 0, 'source': source, 'dest': dest, 'seq_ack': seq_ack, + pkt = {'type': 'tcp', 'source': source, 'dest': dest, 'seq_ack': seq_ack, 'retransmits': [], 'segments': []} tcp_packets[key] = pkt return pkt @@ -1053,36 +1075,40 @@ def print_pctl(values, pctl, fmt): ix = len(values) * pctl // 1000 return fmt % (values[ix] if ix < len(values) else values[-1]) -def print_pkts(pkts, header=True): +def print_pkts(pkts, header=True, comment=False): """ Returns a string containing one line for each packet in pkts, which - contains various useful information about the packet. The entries in - pkts can be either Homa packets or TCP packets. If header is True - then the string also includes initial text describing the fields that - are printed on each line. + contains various useful information about the packet. + pkts: Packets to print (either Homa packets or TCP packets) + header: If True, the result string will include initial text describing + the fields that are printed on each line. + comment: If True, all of the initial lines except column headers will + be preceded by '# '. 
""" buf = StringIO() + prefix = '# ' if comment else '' if header: - buf.write('# Source: Node that sent packet\n') - buf.write('# Dest: Node to which packet was sent\n') - buf.write('# Xmit: Time when packet was passed to ip*xmit\n') - buf.write('# Qdisc: Time when homa_qdisc requeued packet after ' - 'deferral, if any\n') - buf.write('# Id/Seq: RPC identifier for Homa packets, sequence ' + buf.write(prefix + 'Source: Node that sent packet\n') + buf.write(prefix + 'Dest: Node to which packet was sent\n') + buf.write(prefix + 'Xmit: Time when packet was passed to ip*xmit\n') + buf.write(prefix + 'Qdisc: Time when homa_qdisc requeued packet ' + 'after deferral, if any\n') + buf.write(prefix + 'Id/Seq: RPC identifier for Homa packets, sequence ' 'number for TCP\n') - buf.write('# Offset: Offset of packet within message or "TCP" if ' - 'packet is TCP\n') - buf.write('# Length: Size of packet (before segmentation)\n') - buf.write('# Qid: Transmit queue on which packet was sent\n') - buf.write('# Nic: Time when packet was queued for NIC\n') - buf.write('# NDelay: Nic - Xmit\n') - buf.write('# Gro: Time when packet was received by GRO\n') - buf.write('# GDelay: Gro - Nic\n') - buf.write('# Free: Time when sk_buff was released on sender\n') - buf.write('# FDelay: Free - Nic\n') - buf.write('# Rx: Number of times segments in the packet were ' - 'retransmitted\n\n') + buf.write(prefix + 'Offset: Offset of packet within message or ' + '"TCP" if packet is TCP\n') + buf.write(prefix + 'Length: Size of packet (before segmentation)\n') + buf.write(prefix + 'Qid: Transmit queue on which packet was sent\n') + buf.write(prefix + 'Nic: Time when packet was queued for NIC\n') + buf.write(prefix + 'NDelay: Nic - Xmit\n') + buf.write(prefix + 'Gro: Time when packet was received by GRO\n') + buf.write(prefix + 'GDelay: Gro - Nic\n') + buf.write(prefix + 'Free: Time when sk_buff was released on ' + 'sender\n') + buf.write(prefix + 'FDelay: Free - Nic\n') + buf.write(prefix + 'Rx: Number of times segments in the packet ' + 'were retransmitted\n\n') buf.write('Source Dest Xmit Qdisc Id/Seq Offset') buf.write(' Length Qid Nic NDelay Gro GDelay') buf.write(' Free FDelay Rx\n') @@ -1115,7 +1141,7 @@ def print_pkts(pkts, header=True): line = '%-8s %-8s %10s %10s' % (pkt['tx_node'], pkt['rx_node'] if 'rx_node' in pkt else "", print_if(xmit, '%.3f'), qdisc_string) - if pkt['id'] != 0: + if pkt['type'] == 'data': line += ' %10d %6d' % (pkt['id'], pkt['offset']) else: # This is a TCP packet @@ -3841,7 +3867,7 @@ def filter_short_tcp(self, pkt): Returns True if pkt is a short TCP packet: it has some data, but no more than 1500 bytes. """ - if pkt['id'] != 0: + if pkt['type'] == '': return False if not 'tso_length' in pkt: return False @@ -5335,7 +5361,7 @@ def analyze(self): else: add_to_intervals(rx_node, traces[rx_node]['first_time'], tsoftirq, 'rx_data_gro', length) - elif tgro != None and pkt['id'] != 0: + elif tgro != None and pkt['type'] == 'homa': # Note: TCP doesn't yet provide softirq times, hence the # exclusion above. 
                add_to_intervals(rx_node, tgro, traces[rx_node]['last_time'],
@@ -7082,7 +7108,7 @@ def output(self):
                     queues_freed[qid] = 1
                     queue_packets[qid] -= 1
                     queue_bytes[qid] -= pkt['tso_length']
-                    if pkt['id'] == 0:
+                    if pkt['type'] == 'tcp':
                         queue_tcp_bytes[qid] -= pkt['tso_length']
                     if queue_packets[qid] == 0:
                         if queue_bytes[qid] != 0:
@@ -7092,7 +7118,7 @@ def output(self):
                 else:
                     queue_packets[qid] += 1
                     queue_bytes[qid] += pkt['tso_length']
-                    if pkt['id'] == 0:
+                    if pkt['type'] == 'tcp':
                         queue_tcp_bytes[qid] += pkt['tso_length']
                     free = (pkt['free_tx_skb'] if 'free_tx_skb' in pkt
                             else traces[node]['last_time'])
@@ -7383,6 +7409,188 @@ def output(self):
             interval_end += interval
         file.close()
 
+#------------------------------------------------
+# Analyzer: nicsnapshot
+#------------------------------------------------
+class AnalyzeNicsnapshot:
+    """
+    Print information about the state of the NIC queues on a particular
+    node at a particular point in time. Requires the --time and --node
+    options. If --verbose is specified then all of the packets in the
+    possession of the NIC at the reference time are printed.
+    """
+
+    def __init__(self, dispatcher):
+        dispatcher.interest('AnalyzePackets')
+        require_options('nicsnapshot', 'time', 'node')
+
+    def output(self):
+        global options, packets, tcp_packets
+
+        # Queue id -> packets in queue at reference time.
+        id_pkts = defaultdict(list)
+
+        # Intervals to use for id_queued_interval and other variables.
+        intervals = [10, 20, 50, 100]
+
+        # Queue id -> dict of interval -> count, where interval is a time
+        # in usecs and count is the number of bytes that were handed
+        # off to the NIC for that queue in the interval preceding the
+        # reference time.
+        id_queued_interval = defaultdict(lambda: defaultdict(lambda: 0))
+
+        # Queue id -> dict of interval -> count, where interval is a time
+        # in usecs and count is the number of bytes in packets for that
+        # queue that were freed after transmission in the interval preceding
+        # the reference time.
+        id_freed_interval = defaultdict(lambda: defaultdict(lambda: 0))
+
+        # Queue id -> last packet freed for that queue before the reference
+        # time
+        id_last_freed = {}
+
+        # All packets active in the NIC at the reference time
+        all_active = []
+
+        # Scan all packets and fill in the variables above. 
+        for pkt in itertools.chain(packets.values(), tcp_packets.values()):
+            if not 'tx_node' in pkt or pkt['tx_node'] != options.node:
+                continue
+            if not 'tso_length' in pkt:
+                continue
+            length = pkt['tso_length'] + get_hdr_length(pkt)
+            qid = pkt['tx_qid'] if 'tx_qid' in pkt else 'unknown'
+            nic = pkt['nic'] if 'nic' in pkt else None
+            free = pkt['free_tx_skb'] if 'free_tx_skb' in pkt else None
+            if (nic != None and free != None and nic < options.time
+                    and free > options.time):
+                all_active.append(pkt)
+                id_pkts[qid].append(pkt)
+            for i in intervals:
+                if (nic != None and nic < options.time and
+                        nic >= (options.time - i)):
+                    id_queued_interval[qid][i] += length
+                if (free != None and free < options.time and
+                        free >= (options.time - i)):
+                    id_freed_interval[qid][i] += length
+            if free != None and free < options.time:
+                if (not qid in id_last_freed or
+                        id_last_freed[qid]['free_tx_skb'] < free):
+                    id_last_freed[qid] = pkt
+
+        print('\n---------------------')
+        print('Analyzer: nicsnapshot')
+        print('---------------------')
+        print('Information about the state of the NIC queues on %s at time %.3f:'
+                % (options.node, options.time))
+        print('Qid: Identifier of transmit queue')
+        print('Pkts: Packets in Qid at the reference time')
+        print('KB: Kbytes of packet data in Qid at the reference time')
+        print('Oldest: Time when the oldest packet in Qid was handed '
+                'off to the NIC')
+        print('ODiff: Time difference between Oldest and reference time '
+                '(usecs)')
+        print('RecFree: Most recent time when a packet for Qid was returned '
+                'to Linux and freed')
+        print('RFDiff: Time difference between RecFree and reference time '
+                '(usecs)')
+        print('QXX: Kbytes of packet data handed off to NIC for Qid '
+                'in the XX usecs')
+        print(' preceding the reference time')
+        print('FXX: Kbytes of packet data freed after transmission '
+                'in the XX usecs')
+        print(' preceding the reference time')
+        print()
+        print(' Qid Pkts KB Oldest ODiff RecFree RFDiff', end='')
+        for i in intervals:
+            print('%7s' % ('Q%d' % (i)), end='')
+        for i in intervals:
+            print('%7s' % ('F%d' % (i)), end='')
+        print()
+
+        nic_oldest = {'nic': 1e20}
+        total_bytes = 0
+        total_pkts = 0
+
+        qids = sorted(id_pkts.keys() | id_queued_interval.keys() |
+                id_freed_interval.keys())
+        for qid in qids:
+            pkts = id_pkts[qid]
+            oldest_queued = None
+            q_bytes = 0
+            total_pkts += len(pkts)
+
+            for pkt in pkts:
+                nic = pkt['nic']
+                if oldest_queued == None or nic < oldest_queued['nic']:
+                    oldest_queued = pkt
+                if nic < nic_oldest['nic']:
+                    nic_oldest = pkt
+                length = pkt['tso_length'] + get_hdr_length(pkt)
+                q_bytes += length
+                total_bytes += length
+
+            if qid in id_last_freed:
+                t = id_last_freed[qid]['free_tx_skb']
+                freed = '%.3f' % (t)
+                free_diff = '%.1f' % (options.time - t)
+            else:
+                freed = ''
+                free_diff = ''
+            if oldest_queued != None:
+                old_queue_time = '%.3f' % (oldest_queued['nic'])
+                odiff = '%.1f' % (options.time - oldest_queued['nic'])
+            else:
+                old_queue_time = ''
+                odiff = ''
+            line = '%5d %4d %6.1f %9s %7s %9s %7s' % (qid, len(pkts),
+                    q_bytes*1e-3, old_queue_time, odiff, freed, free_diff)
+            for i in intervals:
+                bytes = id_queued_interval[qid][i]
+                if bytes != 0:
+                    msg = '%.1f' % (bytes*1e-3)
+                else:
+                    msg = ''
+                line += '%7s' % (msg)
+            for i in intervals:
+                bytes = id_freed_interval[qid][i]
+                if bytes != 0:
+                    msg = '%.1f' % (bytes*1e-3)
+                else:
+                    msg = ''
+                line += '%7s' % (msg)
+            print(line.rstrip())
+        if id_last_freed:
+            rec_free = max(pkt['free_tx_skb'] for pkt in id_last_freed.values())
+            freed = '%.3f' % (rec_free)
+            free_diff = '%.1f' % (options.time - rec_free)
+        else:
+            freed = ''
+            free_diff = ''
+        print('Total %4d %6.1f %9.3f %7.1f %9s %7s' % (total_pkts, total_bytes*1e-3,
+                nic_oldest['nic'], options.time - nic_oldest['nic'],
+                freed, free_diff), end='')
+        for i in intervals:
+            print('%7.1f' % (
+                    sum(q[i] for q in id_queued_interval.values())*1e-3),
+                    end='')
+        for i in intervals:
+            print('%7.1f' % (
+                    sum(q[i] for q in id_freed_interval.values())*1e-3),
+                    end='')
+        print()
+
+        if options.verbose:
+            print('\nDetails for all of the packets owned by the NIC at time %.1f:'
+                    % (options.time))
+            all_active.sort(key=lambda pkt: [pkt['tx_qid'], pkt['nic']])
+            print(print_pkts(all_active, header=True), end='')
+
 #------------------------------------------------
 # Analyzer: ooo
 #------------------------------------------------
@@ -7967,7 +8175,6 @@ def tt_tcp_xmit(self, trace, t, core, source, dest, data_bytes, seq_ack):
         tcp_pkt['xmit'] = t
         tcp_pkt['xmit2'] = t
         tcp_pkt['tso_length'] = data_bytes
-        tcp_pkt['total_length'] = data_bytes + tcp_hdr_length
         tcp_pkt['tx_node'] = node
         set_tcp_ip_node(source, node)
 
@@ -7977,7 +8184,6 @@ def tt_tcp_qdisc(self, trace, t, core, source, dest, data_bytes, seq_ack):
         tcp_pkt['qdisc_xmit'] = t
         tcp_pkt['xmit2'] = t
         tcp_pkt['tso_length'] = data_bytes
-        tcp_pkt['total_length'] = data_bytes + tcp_hdr_length
         tcp_pkt['tx_node'] = node
         set_tcp_ip_node(source, node)
 
@@ -8006,7 +8212,6 @@ def tt_tcp_gro(self, trace, t, core, source, dest, data_bytes, seq_ack):
         node = trace['node']
         tcp_pkt['length'] = data_bytes
         tcp_pkt['gro'] = t
-        tcp_pkt['total_length'] = data_bytes + tcp_hdr_length
         tcp_pkt['rx_node'] = node
         set_tcp_ip_node(dest, node)
 
@@ -8451,7 +8656,7 @@ def analyze(self):
                             'q_homa_sched', length)
                 else:
                     add_to_intervals(rx_node, q_start, gro, 'q_tcp',
-                            pkt['total_length'])
+                            pkt['length'] + tcp_hdr_length)
 
     def init_axis(self, ax, x_min, x_max, y_max, size=10):
         """
@@ -10717,7 +10922,7 @@ def output(self):
                 continue
             bytes = pkt['tso_length']
             data_bytes += bytes
-            if pkt['id'] == 0:
+            if pkt['type'] == 'tcp':
                 total_bytes += 
bytes + tcp_headers else: total_bytes += bytes + homa_headers @@ -11217,7 +11422,7 @@ def output(self): f.write('# Generated at %s.\n' % (time.strftime('%I:%M %p on %m/%d/%Y'))) f.write('# Data packets transmitted from %s:\n' % (node)) - f.write(print_pkts(pkts)) + f.write(print_pkts(pkts, comment=True)) f.close() def print_type(delays): From 7c44fac10507bd22df0e2efd930d732015d59d6e Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 19 Dec 2025 09:38:39 -0800 Subject: [PATCH 601/625] First cut at nictx analyzer for tthoma.py --- util/tthoma.py | 472 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 437 insertions(+), 35 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index 90c5fa65..bbb7b0c8 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -23,6 +23,7 @@ from pathlib import Path import re from socket import NI_NUMERICHOST +from statistics import stdev import string import sys import textwrap @@ -208,7 +209,7 @@ class PacketDict(dict): def __missing__(self, key): id_str, offset_str = key.split(':') self[key] = {'type': 'data', 'id': int(id_str), - 'offset': int(offset_str), 'retransmits': []} + 'offset': int(offset_str), 'retransmits': [], 'segments': []} return self[key] packets = PacketDict() @@ -228,6 +229,9 @@ def __missing__(self, key): # gro_core: Core on which homa_gro_receive was invoked # softirq: Time when homa_softirq processed the packet # softirq_core: Core on which SoftIRQ processed the packet +# free_tx_skb: Time when NAPI released the skb on the sender, which can't +# happen until the packet has been transmitted. +# tx_qid: NIC channel on which packet was transmitted # tx_node: Node that sent grant (if known) # rx_node: Node that received grant (if known) # id: Id of the RPC on the node that sent the grant @@ -265,6 +269,7 @@ def __missing__(self, key): # xmit: Time when ip*xmit was invoked for the packet # xmit2: qdisc_xmit if it exists, otherwise xmit: a time when Homa # has decided to transmit the packet (after any Homa queuing) +# tx_core: Core on which ip*xmit was invoked for the packet # qdisc_xmit: Time when homa_qdisc requeued a packet that was deferred # because of NIC queue length (only present for deferred # packets) @@ -1829,6 +1834,19 @@ def __free_tx_skb(self, trace, time, core, match, interests): 'offset ([0-9]+), qid ([0-9]+), msg_length ([0-9]+)' }) + def __free_grant(self, trace, time, core, match, interests): + id = int(match.group(1)) + offset = int(match.group(2)) + qid = int(match.group(3)) + for interest in interests: + interest.tt_free_grant(trace, time, core, id, offset, qid) + + patterns.append({ + 'name': 'free_grant', + 'regexp': 'freeing tx skb for homa grant, id ([0-9]+), ' + 'offset ([0-9]+), qid ([0-9]+)' + }) + def __sendmsg_request(self, trace, time, core, match, interests): peer = match.group(1) id = int(match.group(2)) @@ -6647,7 +6665,7 @@ class AnalyzeNicbacklog: Prints a time-series analysis of backlog in the NIC (packets that have been passed to the NIC but not yet returned after transmission) along with information about the rate of packets flowing into the - NIC and being returned from the NIC. Requries the --data option. + NIC and being returned from the NIC. Requires the --data option. """ def __init__(self, dispatcher): @@ -6657,7 +6675,7 @@ def __init__(self, dispatcher): def output(self): global packets, tcp_packets, options, traces - # Microseconds in the smalleset interval we'll consider for + # Microseconds in the smallest interval we'll consider for # computing rates. 
base_interval = 50 @@ -6848,13 +6866,13 @@ def output(self): # print('\n%9.3f: to Nic: %s' % (pkt['nic'], pkt['free_tx_skb'])) cur += 1 in_pkts += 1 - in_bytes += pkt['tso_length'] + in_bytes += pkt['tso_length'] + get_hdr_length(pkt) heapq.heappush(active, [pkt['free_tx_skb'], cur, pkt]) while len(active) > 0 and active[0][0] < interval_end: pkt = heapq.heappop(active)[2] # print('\n%9.3f: freed: %s' % (pkt['free_tx_skb'], pkt)) free_pkts += 1 - free_bytes += pkt['tso_length'] + free_bytes += pkt['tso_length'] + get_hdr_length(pkt) nic_pkts += in_pkts - free_pkts nic_bytes += in_bytes - free_bytes @@ -6972,7 +6990,7 @@ class AnalyzeNicbacklog2: Prints a time-series analysis of backlog in the NIC (packets that have been passed to the NIC but not yet returned after transmission). This differs from the nicbacklog analyzer in that it analyzes - the distribution of traffic between device queues. Requries the + the distribution of traffic between device queues. Requires the --data option. """ @@ -7012,7 +7030,7 @@ def output(self): f.write('# TcpKB: Total kbytes of TCP packet data in NIC\n') f.write('# PktQs: Number of queues with packets\n') f.write('# FreeQs: Number of queues for which a packet was freed\n') - f.write('# a packet in the interval\n') + f.write('# in the interval\n') f.write('# Qid1: Id of queue with the most data\n') f.write('# Pkts1: Number of packets in Qid1\n') f.write('# KB1: Kbytes of packet data in Qid1 (Homa and TCP)\n') @@ -7043,7 +7061,7 @@ def output(self): # queue -> same as queue_bytes except count only TCP bytes. queue_tcp_bytes = defaultdict(lambda: 0) - # queue -> count of bytes currently owned by this queue. + # queue -> count of packets currently owned by this queue. queue_packets = defaultdict(lambda: 0) pkts = sorted(node_pkts[node], key = lambda pkt : @@ -7140,7 +7158,7 @@ class AnalyzeNicpkts: (packets passed to the NIC but not yet returned after transmission), showing the state of the NIC queues at each point in time and the order in which packets are returned to Linux after transmission. - Requries the --data option. + Requires the --data option. """ def __init__(self, dispatcher): @@ -7591,6 +7609,380 @@ def output(self): all_active.sort(key=lambda pkt: [pkt['tx_qid'], pkt['nic']]) print(print_pkts(all_active, header=True), end='') +#------------------------------------------------ +# Analyzer: nictx +#------------------------------------------------ +class AnalyzeNictx: + """ + Analyze NIC throughput for packet transmission and generate plots + of throughput as a function of number of bytes queued in the NIC + and number of active NIC queues. Requires the --plot option + """ + + def __init__(self, dispatcher): + dispatcher.interest('AnalyzePackets') + require_options('nictx', 'plot') + + def output(self): + global packets, grants, tcp_packets, options, traces + + # node -> list of packets transmitted by that node + node_pkts = defaultdict(list) + + # Bucket all of the packets by transmitting node. 
+        type_counts = defaultdict(lambda: 0)
+        for pkt in itertools.chain(packets.values(), tcp_packets.values(),
+                grants.values()):
+            if not 'tx_node' in pkt or not 'tx_qid' in pkt:
+                continue
+            if pkt['type'] == 'grant':
+                length = 0
+            elif not 'tso_length' in pkt:
+                continue
+            node = pkt['tx_node']
+            if 'nic' in pkt or 'free_tx_skb' in pkt:
+                node_pkts[node].append(pkt)
+                type_counts[pkt['type']] += 1
+
+        # List of <queues, bytes, pkts, freed, queued> tuples, one for each
+        # interval across all nodes, where queues is the number of tx queues
+        # in the NIC with packets at the start of the interval, bytes is
+        # the number of packet bytes queued in the NIC at the start of
+        # the interval, pkts is the total number of queued packets at the
+        # start of the interval, freed is the number of packet bytes freed
+        # after transmission during the interval, and queued is the number
+        # of new packet bytes queued during the interval.
+        intervals = []
+
+        # Process node_pkts, one node at a time, to populate intervals.
+        for node in get_sorted_nodes():
+            # heapq with one entry for each packet currently in the NIC's
+            # possession. Each entry is a <free, index, pkt> tuple,
+            # where free is the packet's free_tx_skb time, index is the
+            # packet's index in the list of all packets (for resolving sorting
+            # ties), and pkt is the packet.
+            active = []
+
+            # qid -> count of packets currently owned by this queue.
+            qid_packets = defaultdict(lambda: 0)
+
+            # Total bytes owned by NIC
+            nic_bytes = 0
+
+            # Total packets owned by NIC
+            nic_pkts = 0
+
+            # The next tuple that will be added to intervals.
+            next = [0, 0, 0, 0, 0]
+
+            pkts = sorted(node_pkts[node], key = lambda pkt :
+                    pkt['nic'] if 'nic' in pkt else -1e20)
+            cur = 0
+            t = traces[node]['first_time']
+            interval_end = (math.ceil(traces[node]['first_time'] /
+                    options.interval) * options.interval)
+
+            # True means there was at least one point in the current interval
+            # where the total # of bytes queued in the NIC dropped below
+            # a threshold value.
+            below_threshold = False
+
+            interval_pkts = []
+
+            # Each iteration of this loop processes one event: either a
+            # packet handed off to the NIC or a packet freed. 
+ while True: + # Decide on next event + if cur < len(pkts): + pkt = pkts[cur] + if 'nic' in pkt: + nic = pkt['nic'] + else: + nic = traces[node]['first_time'] + else: + nic = None + if nic != None and (not active or active[0][0] > nic): + free_event = False + t = nic + cur += 1 + elif active: + t, _, pkt = heapq.heappop(active) + free_event = True + else: + break + + # Handle end of interval(s) + while t >= interval_end: + if traces[node]['first_time'] <= (interval_end - + options.interval): + gbps_in = next[4] * 8e-3 / options.interval + gbps_out = next[3] * 8e-3 / options.interval + # if gbps_in >= gbps_out + 5: + if gbps_in > 100: + print('%9.1f %s has %.1f KB queued data, tput %.1f ' + 'Gbps, input %.1f Gbps' + % (interval_end, node, nic_bytes * 1e-3, + gbps_out, gbps_in)) + if not below_threshold: + intervals.append(next) + active_queues = sum(n > 0 for n in qid_packets.values()) + next = [active_queues, nic_bytes, nic_pkts, 0, 0] + below_threshold = False + interval_pkts = [] + interval_end += options.interval + + # Update statistics with current event + qid = pkt['tx_qid'] + if pkt['type'] == 'grant': + length = get_hdr_length(pkt) + else: + length = pkt['tso_length'] + get_hdr_length(pkt) + if free_event: + qid_packets[qid] -= 1 + nic_pkts -= 1 + nic_bytes -= length + # if nic_bytes < 1000000: + # below_threshold = True + next[3] += length + else: + qid_packets[qid] += 1 + nic_pkts += 1 + nic_bytes += length + if 'nic' in pkt: + interval_pkts.append(pkt) + next[4] += length + free_event = (pkt['free_tx_skb'] if 'free_tx_skb' in pkt else + traces[node]['last_time']) + heapq.heappush(active, [free_event, cur, pkt]) + + # Generate scatter plots of throughput vs. queues occupied, NIC bytes, + # and NIC pkts. + occupied = [] + kbytes = [] + num_pkts = [] + gbps = [] + for queues, bytes, pkts, freed, queued in intervals: + occupied.append(queues) + kbytes.append(bytes * 1e-3) + num_pkts.append(pkts) + gbps.append(freed * 8e-3 / options.interval) + + fig = plt.figure(figsize=[6,4]) + ax = fig.add_subplot(111) + ax.set_xlim(0, 30) + ax.set_xlabel('# Nic Queues Occupied') + ax.set_ylim(0, 120) + ax.set_ylabel('Tx Completion Rate(Gbps)') + ax.scatter(occupied, gbps, marker='o', s=1) + plt.tight_layout() + plt.savefig('%s/nictx_vs_queues_scat.pdf' % (options.plot)) + + fig = plt.figure(figsize=[6,4]) + ax = fig.add_subplot(111) + ax.set_xlim(0, 2500) + ax.set_xlabel('KBytes in NIC Queues') + ax.set_ylim(0, 120) + ax.set_ylabel('Tx Completion Rate (Gbps)') + ax.scatter(kbytes, gbps, marker='o', s=1) + plt.tight_layout() + plt.savefig('%s/nictx_vs_kb_scat.pdf' % (options.plot)) + + fig = plt.figure(figsize=[6,4]) + ax = fig.add_subplot(111) + ax.set_xlim(0, 30) + ax.set_xlabel('# Packets Owned by NIC') + ax.set_ylim(0, 120) + ax.set_ylabel('Tx Completion Rate (Gbps)') + ax.scatter(num_pkts, gbps, marker='o', s=1) + plt.tight_layout() + plt.savefig('%s/nictx_vs_pkts_scat.pdf' % (options.plot)) + + # Now generate point plots with mean and standard deviation for + # buckets of similar intervals. + + # Throughput vs. 
number of active queues + xmax = 30 + x = range(xmax) + buckets = [[] for _ in x] + for queues, bytes, pkts, freed, queued in intervals: + if queues < xmax: + buckets[queues].append(freed * 8e-3 / options.interval) + y = [] + yerr = [] + for bucket in buckets: + if not bucket: + y.append(0) + yerr.append(0) + continue + y.append(sum(bucket) / len(bucket)) + b = sorted(bucket) + if len(bucket) >= 2: + yerr.append(stdev(bucket)) + else: + yerr.append(0) + + fig = plt.figure(figsize=[6,4]) + ax = fig.add_subplot(111) + ax.set_xlim(0, xmax) + ax.set_xlabel('# Nic Queues Occupied') + ax.set_ylim(0, 120) + ax.set_ylabel('Tx Completion Rate(Gbps)') + ax.errorbar(x, y, yerr=yerr, fmt='o', capsize=4) + plt.tight_layout() + plt.savefig('%s/nictx_vs_queues.pdf' % (options.plot)) + + # Throughput vs. Kbytes owned by NIC + xmax = 2500 + bucket_size = 100 + x = range(0, xmax, bucket_size) + buckets = [[] for _ in x] + for queues, bytes, pkts, freed, queued in intervals: + kb = bytes//1000 + if kb < xmax: + buckets[kb//bucket_size].append(freed * 8e-3 / options.interval) + y = [] + yerr = [] + for bucket in buckets: + if not bucket: + y.append(0) + yerr.append(0) + continue + y.append(sum(bucket) / len(bucket)) + if len(bucket) >= 2: + yerr.append(stdev(bucket)) + else: + yerr.append(0) + + fig = plt.figure(figsize=[6,4]) + ax = fig.add_subplot(111) + ax.set_xlim(0, xmax) + ax.set_xlabel('KBytes in NIC Queues') + ax.set_ylim(0, 120) + ax.set_ylabel('Tx Completion Rate(Gbps)') + ax.errorbar(x, y, yerr=yerr, fmt='o', capsize=4) + plt.tight_layout() + plt.savefig('%s/nictx_vs_kb.pdf' % (options.plot)) + + # Throughput vs. packets owned by NIC + xmax = 30 + x = range(xmax) + buckets = [[] for _ in x] + for queues, bytes, pkts, freed, queued in intervals: + if pkts < xmax: + buckets[pkts].append(freed * 8e-3 / options.interval) + y = [] + yerr = [] + for bucket in buckets: + if not bucket: + y.append(0) + yerr.append(0) + continue + y.append(sum(bucket) / len(bucket)) + if len(bucket) >= 2: + yerr.append(stdev(bucket)) + else: + yerr.append(0) + + fig = plt.figure(figsize=[6,4]) + ax = fig.add_subplot(111) + ax.set_xlim(0, xmax) + ax.set_xlabel('# Packets Owned by NIC') + ax.set_ylim(0, 120) + ax.set_ylabel('Tx Completion Rate(Gbps)') + ax.errorbar(x, y, yerr=yerr, fmt='o', capsize=4) + plt.tight_layout() + plt.savefig('%s/nictx_vs_pkts.pdf' % (options.plot)) + + # Generate CDF of throughput for intervals. + tput = [(i[3] * 8e-3 / options.interval) for i in intervals] + tput.sort() + y = [i / len(tput) for i in range(len(tput))] + fig = plt.figure(figsize=[6,4]) + ax = fig.add_subplot(111) + ax.set_xlim(0, 120) + ax.set_xlabel('Tx Completion Rate (Gbps)') + ax.set_ylim(0, 1.0) + ax.set_ylabel('Fraction of Intervals') + plt.grid(which="major", axis="y") + plt.grid(which="major", axis="x") + plt.plot(tput, y) + plt.tight_layout() + plt.savefig('%s/nictx_tput_cdf.pdf' % (options.plot)) + + # Generate CDF of active queues for intervals. + active = [i[0] for i in intervals] + active.sort() + y = [i / len(active) for i in range(len(active))] + fig = plt.figure(figsize=[6,4]) + ax = fig.add_subplot(111) + ax.set_xlim(0, 20) + ax.xaxis.set_major_locator(matplotlib.ticker.MultipleLocator(2)) + ax.set_xlabel('Active NIC queues') + ax.set_ylim(0, 1.0) + ax.set_ylabel('Fraction of Intervals') + plt.grid(which="major", axis="y") + plt.plot(active, y) + plt.tight_layout() + plt.savefig('%s/nictx_queues_cdf.pdf' % (options.plot)) + + # Generate CDF of KBytes in queued packets. 
+ kb = [i[1] * 1e-3 for i in intervals] + kb.sort() + y = [i / len(tput) for i in range(len(tput))] + fig = plt.figure(figsize=[6,4]) + ax = fig.add_subplot(111) + ax.set_xlim(0, 1500) + ax.set_xlabel('KBytes in Queued Packets') + ax.set_ylim(0, 1.0) + ax.set_ylabel('Fraction of Intervals') + plt.grid(which="major", axis="y") + plt.plot(kb, y) + plt.tight_layout() + plt.savefig('%s/nictx_kb_cdf.pdf' % (options.plot)) + + # Generate CDF of queued packets for intervals. + pkts = [i[2] for i in intervals] + pkts.sort() + y = [i / len(active) for i in range(len(active))] + fig = plt.figure(figsize=[6,4]) + ax = fig.add_subplot(111) + ax.set_xlim(0, 50) + ax.set_xlabel('Packets Queued in NIC') + ax.set_ylim(0, 1.0) + ax.set_ylabel('Fraction of Intervals') + plt.grid(which="major", axis="y") + plt.plot(pkts, y) + plt.tight_layout() + plt.savefig('%s/nictx_pkts_cdf.pdf' % (options.plot)) + + # Generate CDF of input to the NIC for intervals. + input = [(i[4] * 8e-3 / options.interval) for i in intervals] + input.sort() + y = [i / len(input) for i in range(len(input))] + fig = plt.figure(figsize=[6,4]) + ax = fig.add_subplot(111) + ax.set_xlim(0, 120) + ax.set_xlabel('Rate of New Bytes Queued in NIC (Gbps)') + ax.set_ylim(0, 1.0) + ax.set_ylabel('Fraction of Intervals') + plt.grid(which="major", axis="y") + plt.grid(which="major", axis="x") + plt.plot(input, y) + plt.tight_layout() + plt.savefig('%s/nictx_input_cdf.pdf' % (options.plot)) + + + print('\n---------------') + print('Analyzer: nictx') + print('---------------') + print('Analyzed %d Homa data packets, %d Homa grants, %d ' + 'TCP packets.' % (type_counts['data'], type_counts['grant'], + type_counts['tcp'])) + print('Analyzed %d intervals of length %d usecs (use --interval ' + 'option to' % (len(intervals), options.interval)) + print('change interval length).') + print('See files %s/nictx_*.pdf for plots.' % (options.plot)) + #------------------------------------------------ # Analyzer: ooo #------------------------------------------------ @@ -8145,6 +8537,13 @@ def tt_nic_grant(self, trace, t, core, peer, id, offset, tx_queue): g['tx_node'] = trace['node'] g['tx_queue'] = tx_queue + def tt_free_grant(self, trace, t, core, id, offset, qid): + global grants + g = grants[pkt_id(id, offset)] + g['free_tx_skb'] = t + g['tx_qid'] = qid + g['tx_node'] = trace['node'] + def tt_gro_grant(self, trace, t, core, peer, id, offset, priority): global grants g = grants[pkt_id(id^1, offset)] @@ -8176,6 +8575,7 @@ def tt_tcp_xmit(self, trace, t, core, source, dest, data_bytes, seq_ack): tcp_pkt['xmit2'] = t tcp_pkt['tso_length'] = data_bytes tcp_pkt['tx_node'] = node + tcp_pkt['tx_core'] = core set_tcp_ip_node(source, node) def tt_tcp_qdisc(self, trace, t, core, source, dest, data_bytes, seq_ack): @@ -8193,6 +8593,7 @@ def tt_tcp_nic(self, trace, t, core, source, dest, data_bytes, seq_ack): tcp_pkt['nic'] = t tcp_pkt['tso_length'] = data_bytes tcp_pkt['tx_node'] = node + tcp_pkt['nic_core'] = core set_tcp_ip_node(source, node) def tt_tcp_free(self, trace, t, core, source, dest, data_bytes, seq_ack, @@ -8344,7 +8745,7 @@ def analyze(self): new_pkts.append([pid, pkt2]) for key in ['xmit', 'qdisc_xmit', 'xmit2', 'nic', 'id', 'msg_length', 'priority', 'tx_node', 'tx_core', - 'free_tx_skb', 'tx_qid']: + 'free_tx_skb', 'tx_qid', 'type']: if key in pkt: pkt2[key] = pkt[key] if pkt2['msg_length'] != None and pkt2['offset'] > pkt2['msg_length']: @@ -10653,36 +11054,37 @@ class AnalyzeTemp: debugging. Consult the code to see what it does right now. 
""" def __init__(self, dispatcher): - dispatcher.interest('AnalyzeTcp_rpcs') + # dispatcher.interest('AnalyzeTcp_rpcs') # dispatcher.interest('AnalyzeRpcs') - # dispatcher.interest('AnalyzePackets') + dispatcher.interest('AnalyzePackets') def output(self): - global tcp_rpcs + global packets, grants, tcp_packets - qdisc = 0 - rpcs = filter_tcp_rpcs(tcp_rpcs.values(), msglen='1500') - rtts = [] - for rpc in rpcs: - if not 'req_send' in rpc or not 'resp_recvd' in rpc: + nic_idle = -1e20 + range_pkts = [] + for pkt in tcp_packets.values(): + if not 'tx_node' in pkt or pkt['tx_node'] != 'node5': continue - if not rpc['req_pkts'] or not rpc['resp_pkts']: + if not 'nic' in pkt or not 'tso_length' in pkt: continue - if ('qdisc_xmit' in rpc['req_pkts'][0] or - 'qdisc_xmit' in rpc['resp_pkts'][0]): - qdisc += 1 + nic = pkt['nic'] + if nic < -500 or nic > 1000: continue - rtts.append(rpc['resp_recvd'] - rpc['req_send']) - - rtts.sort() - print('%d RPCS smaller than 1500 bytes left after filtering out ' - '%d deferred' % (len(rtts), qdisc)) - print('Min RTT: %8s' % (print_pctl(rtts, 0, '%.1f'))) - print('P10: %8s' % (print_pctl(rtts, 100, '%.1f'))) - print('P50: %8s' % (print_pctl(rtts, 500, '%.1f'))) - print('P90: %8s' % (print_pctl(rtts, 900, '%.1f'))) - print('P99: %8s' % (print_pctl(rtts, 990, '%.1f'))) - print('Max RTT: %8s' % (print_pctl(rtts, 1000, '%.1f'))) + range_pkts.append(pkt) + range_pkts.sort(key=lambda pkt: pkt['nic']) + for pkt in range_pkts: + nic = pkt['nic'] + length = pkt['tso_length'] + get_hdr_length(pkt) + 24 + xmit_usecs = length * 8e-5 + if nic_idle < nic: + nic_idle = nic + xmit_usecs + else: + nic_idle += xmit_usecs + print('%9.3f: C%02d length %5d, xmit %.2f us (%5.0f cycles), ' + 'new nic_idle %.3f (queue %5.1f us, %5.0f cycles)' % ( + nic, pkt['nic_core'], length, xmit_usecs, xmit_usecs*2100, + nic_idle, nic_idle - nic, (nic_idle - nic)*2100)) def output_slow_pkts(self): pkts = [] @@ -11758,7 +12160,7 @@ def output(self): 'max, inclusive).') parser.add_option('-h', '--help', dest='help', action='store_true', help='Show this help message and exit') -parser.add_option('--interval', dest='interval', type=int, default=20, +parser.add_option('--interval', dest='interval', type=int, default=50, metavar='T', help='Specifies the length of intervals for ' 'interval-based output, in microseconds (default: 20)') parser.add_option('--late', dest='late', type=int, default=100, @@ -11893,4 +12295,4 @@ def output(self): for name in analyzer_classes: analyzer = dispatcher.get_analyzer(name) if hasattr(analyzer, 'output'): - analyzer.output() \ No newline at end of file + analyzer.output() From 77c7da5f3b8e8802ddc8952ed72b712182586677 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 19 Dec 2025 16:26:11 -0800 Subject: [PATCH 602/625] Add note about pacer limitations to perf.txt --- perf.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/perf.txt b/perf.txt index 41e2aea0..9da2da25 100644 --- a/perf.txt +++ b/perf.txt @@ -2,6 +2,18 @@ This file contains various notes and lessons learned concerning performance of the Homa Linux kernel module. The notes are in reverse chronological order. +65. (December 2025) The pacer does not prevent NIC queue buildup. Under +"-w w4 -b 80" on cc620 machines (Intel NICs) it is not unusual to see +periods of 1ms or longer with more than 500 Kbytes of packet data queued +in the NIC. This happens because the NIC cannot always sustain 100 Gbps of +output. 
Even with large amounts of queued data, the NIC completion rate varies
+between 85 and 100 Gbps. Since the pacer will queue data at almost 100 Gbps,
+the NIC queue builds when there is a large backlog of data; over time, the
+queue for data tends to move from the pacer to the NIC. The pacer limit rate
+would have to be reduced considerably to eliminate this problem (e.g., 85 Gbps
+instead of 99 Gbps?) but that would waste a lot of NIC bandwidth since there
+are many times when the NIC can transmit at nearly 100 Gbps.
+
 64. (November 2025) Separating pacer traffic from non-paced traffic in
 homa_qdisc (use tx queue 0 for paced traffic; non-paced traffic is spread
 across other queues, using default queues except that traffic for queue 0

From b16eec6df6d2993540640615bf913eb1fef0a149 Mon Sep 17 00:00:00 2001
From: John Ousterhout 
Date: Fri, 19 Dec 2025 16:31:30 -0800
Subject: [PATCH 603/625] Rework nictx analyzer in tthoma.py

---
 util/tthoma.py | 285 +++++++++++++++++++++++++------------------------
 1 file changed, 143 insertions(+), 142 deletions(-)

diff --git a/util/tthoma.py b/util/tthoma.py
index bbb7b0c8..bc82b9d2 100755
--- a/util/tthoma.py
+++ b/util/tthoma.py
@@ -7614,9 +7614,9 @@ def output(self):
 #------------------------------------------------
 class AnalyzeNictx:
     """
-    Analyze NIC throughput for packet transmission and generate plots
-    of throughput as a function of number of bytes queued in the NIC
-    and number of active NIC queues. Requires the --plot option
+    Analyze various factors related to NIC queueing and throughput for packet
+    transmission; generate graphs displaying these factors. Requires the
+    --plot option.
     """
 
     def __init__(self, dispatcher):
@@ -7644,167 +7644,136 @@ def output(self):
             node_pkts[node].append(pkt)
             type_counts[pkt['type']] += 1
 
-        # List of <queues, bytes, pkts, freed, queued> tuples, one for each
-        # interval across all nodes, where queues is the number of tx queues
-        # in the NIC with packets at the start of the interval, bytes is
-        # the number of packet bytes queued in the NIC at the start of
-        # the interval, pkts is the total number of queued packets at the
-        # start of the interval, freed is the number of packet bytes freed
-        # after transmission during the interval, and queued is the number
-        # of new packet bytes queued during the interval.
+        # List of <queues, bytes, pkts, freed, queued, qdisc> tuples, one for
+        # each interval across all nodes:
+        # queues:  The number of tx queues in the NIC with packets at the
+        #          start of the interval
+        # bytes:   The number of packet bytes queued in the NIC at the start
+        #          of the interval
+        # pkts:    The total number of queued packets at the start of the
+        #          interval
+        # freed:   The number of packet bytes freed after transmission during
+        #          the interval
+        # queued:  The number of new packet bytes queued during the interval
+        # qdisc:   The number of bytes in packets that are queued in the
+        #          qdisc system (they have been passed to ip*xmit but have
+        #          not been handed off to the NIC)
         intervals = []
 
-        # Process node_pkts, one node at a time, to populate intervals.
+        # node -> dict containing data series for plotting:
+        # t:       list of time values for the other data series
+        # qdisc:   for each t, kbytes queued in qdiscs or NIC at t
+        # nic:     for each t, kbytes queued in the NIC at t
+        node_data = defaultdict(lambda: {'t': [], 'qdisc': [], 'nic': []})
+
+        # Process the packets in each node separately in order to populate
+        # intervals and node_data.
         for node in get_sorted_nodes():
-            # heapq with one entry for each packet currently in the NIC's
-            # possession. 
-            # where free is the packet's free_tx_skb time, index is the
-            # packet's index in the list of all packets (for resolving sorting
-            # ties), and pkt is the packet.
-            active = []
+            # List of <time, type, pkt>, where time is the time when an
+            # event occurred, type indicates what happened ('xmit',
+            # 'nic', or 'freed'), and pkt is the packet for which the
+            # particular event occurred. The list is eventually sorted
+            # in time order.
+            events = []
+
+            # Generate list of interesting events.
+            first_time = traces[node]['first_time']
+            last_time = traces[node]['last_time']
+            for pkt in node_pkts[node]:
+                events.append([pkt['xmit'] if 'xmit' in pkt else first_time,
+                        'xmit', pkt])
+                events.append([pkt['nic'] if 'nic' in pkt else first_time,
+                        'nic', pkt])
+                events.append([pkt['free_tx_skb'] if 'free_tx_skb' in pkt
+                        else last_time, 'freed', pkt])
+            events.sort(key=lambda t: t[0])
+
+            # Process the events in time order to generate statistics.

             # qid -> count of packets currently owned by this queue.
             qid_packets = defaultdict(lambda: 0)

-            # Total bytes owned by NIC
+            # qid -> count of bytes currently owned by this queue.
+            qid_bytes = defaultdict(lambda: 0)
+
+            # Total bytes currently owned by NIC
             nic_bytes = 0

-            # Total packets owned by NIC
+            # Total packets currently owned by NIC
             nic_pkts = 0

-            # The next tuple that will be added to intervals.
-            next = [0, 0, 0, 0, 0]
+            # Total bytes that have been passed to ip*xmit but have not
+            # yet been queued in the NIC (they are queued in the qdisc
+            # system).
+            qdisc_bytes = 0

-            pkts = sorted(node_pkts[node], key = lambda pkt :
-                    pkt['nic'] if 'nic' in pkt else -1e20)
-            cur = 0
-            t = traces[node]['first_time']
-            interval_end = (math.ceil(traces[node]['first_time'] /
-                    options.interval) * options.interval)
+            # The next tuple that will be added to intervals.
+            next = [0, 0, 0, 0, 0, 0]

             # True means there was at least one point in the current interval
             # where the total # of bytes queued in the NIC dropped below
             # a threshold value.
             below_threshold = False

-            interval_pkts = []
+            t = traces[node]['first_time']
+            interval_end = (math.ceil(first_time / options.interval) *
+                    options.interval)

-            # Each iteration of this loop processes one event: either a
-            # packet handed off to the NIC or a packet freed.
- while True: - # Decide on next event - if cur < len(pkts): - pkt = pkts[cur] - if 'nic' in pkt: - nic = pkt['nic'] - else: - nic = traces[node]['first_time'] - else: - nic = None - if nic != None and (not active or active[0][0] > nic): - free_event = False - t = nic - cur += 1 - elif active: - t, _, pkt = heapq.heappop(active) - free_event = True - else: - break + data = node_data[node] + for t, event, pkt in events: # Handle end of interval(s) while t >= interval_end: - if traces[node]['first_time'] <= (interval_end - - options.interval): + if first_time <= (interval_end - options.interval): gbps_in = next[4] * 8e-3 / options.interval gbps_out = next[3] * 8e-3 / options.interval - # if gbps_in >= gbps_out + 5: - if gbps_in > 100: + if 0 and gbps_in > 100: print('%9.1f %s has %.1f KB queued data, tput %.1f ' 'Gbps, input %.1f Gbps' % (interval_end, node, nic_bytes * 1e-3, gbps_out, gbps_in)) if not below_threshold: intervals.append(next) + data['t'].append(interval_end) + data['qdisc'].append((qdisc_bytes + nic_bytes) * 1e-3) + data['nic'].append(nic_bytes * 1e-3) active_queues = sum(n > 0 for n in qid_packets.values()) - next = [active_queues, nic_bytes, nic_pkts, 0, 0] + next = [active_queues, nic_bytes, nic_pkts, 0, 0, + qdisc_bytes] below_threshold = False - interval_pkts = [] interval_end += options.interval - # Update statistics with current event + # Process event qid = pkt['tx_qid'] if pkt['type'] == 'grant': length = get_hdr_length(pkt) else: length = pkt['tso_length'] + get_hdr_length(pkt) - if free_event: + if event == 'xmit': + qdisc_bytes += length + elif event == 'nic': + qid_packets[qid] += 1 + qid_bytes[qid] += length + nic_pkts += 1 + nic_bytes += length + qdisc_bytes -= length + if 'nic' in pkt: + next[4] += length + elif event == 'freed': qid_packets[qid] -= 1 + qid_bytes[qid] -= length nic_pkts -= 1 nic_bytes -= length # if nic_bytes < 1000000: # below_threshold = True next[3] += length else: - qid_packets[qid] += 1 - nic_pkts += 1 - nic_bytes += length - if 'nic' in pkt: - interval_pkts.append(pkt) - next[4] += length - free_event = (pkt['free_tx_skb'] if 'free_tx_skb' in pkt else - traces[node]['last_time']) - heapq.heappush(active, [free_event, cur, pkt]) - - # Generate scatter plots of throughput vs. queues occupied, NIC bytes, - # and NIC pkts. 
- occupied = [] - kbytes = [] - num_pkts = [] - gbps = [] - for queues, bytes, pkts, freed, queued in intervals: - occupied.append(queues) - kbytes.append(bytes * 1e-3) - num_pkts.append(pkts) - gbps.append(freed * 8e-3 / options.interval) - - fig = plt.figure(figsize=[6,4]) - ax = fig.add_subplot(111) - ax.set_xlim(0, 30) - ax.set_xlabel('# Nic Queues Occupied') - ax.set_ylim(0, 120) - ax.set_ylabel('Tx Completion Rate(Gbps)') - ax.scatter(occupied, gbps, marker='o', s=1) - plt.tight_layout() - plt.savefig('%s/nictx_vs_queues_scat.pdf' % (options.plot)) + raise Exception('unknown event type %s' % event) - fig = plt.figure(figsize=[6,4]) - ax = fig.add_subplot(111) - ax.set_xlim(0, 2500) - ax.set_xlabel('KBytes in NIC Queues') - ax.set_ylim(0, 120) - ax.set_ylabel('Tx Completion Rate (Gbps)') - ax.scatter(kbytes, gbps, marker='o', s=1) - plt.tight_layout() - plt.savefig('%s/nictx_vs_kb_scat.pdf' % (options.plot)) - - fig = plt.figure(figsize=[6,4]) - ax = fig.add_subplot(111) - ax.set_xlim(0, 30) - ax.set_xlabel('# Packets Owned by NIC') - ax.set_ylim(0, 120) - ax.set_ylabel('Tx Completion Rate (Gbps)') - ax.scatter(num_pkts, gbps, marker='o', s=1) - plt.tight_layout() - plt.savefig('%s/nictx_vs_pkts_scat.pdf' % (options.plot)) - - # Now generate point plots with mean and standard deviation for - # buckets of similar intervals. - - # Throughput vs. number of active queues + # Plot throughput vs. number of active queues xmax = 30 x = range(xmax) buckets = [[] for _ in x] - for queues, bytes, pkts, freed, queued in intervals: + for queues, bytes, pkts, freed, queued, qdisc in intervals: if queues < xmax: buckets[queues].append(freed * 8e-3 / options.interval) y = [] @@ -7831,12 +7800,12 @@ def output(self): plt.tight_layout() plt.savefig('%s/nictx_vs_queues.pdf' % (options.plot)) - # Throughput vs. Kbytes owned by NIC + # Plot throughput vs. Kbytes owned by NIC xmax = 2500 bucket_size = 100 x = range(0, xmax, bucket_size) buckets = [[] for _ in x] - for queues, bytes, pkts, freed, queued in intervals: + for queues, bytes, pkts, freed, queued, qdisc in intervals: kb = bytes//1000 if kb < xmax: buckets[kb//bucket_size].append(freed * 8e-3 / options.interval) @@ -7863,11 +7832,11 @@ def output(self): plt.tight_layout() plt.savefig('%s/nictx_vs_kb.pdf' % (options.plot)) - # Throughput vs. packets owned by NIC + # Plot throughput vs. packets owned by NIC xmax = 30 x = range(xmax) buckets = [[] for _ in x] - for queues, bytes, pkts, freed, queued in intervals: + for queues, bytes, pkts, freed, queued, qdisc in intervals: if pkts < xmax: buckets[pkts].append(freed * 8e-3 / options.interval) y = [] @@ -7928,7 +7897,7 @@ def output(self): # Generate CDF of KBytes in queued packets. kb = [i[1] * 1e-3 for i in intervals] kb.sort() - y = [i / len(tput) for i in range(len(tput))] + y = [i / len(kb) for i in range(len(kb))] fig = plt.figure(figsize=[6,4]) ax = fig.add_subplot(111) ax.set_xlim(0, 1500) @@ -7940,10 +7909,25 @@ def output(self): plt.tight_layout() plt.savefig('%s/nictx_kb_cdf.pdf' % (options.plot)) + # Generate CDF of KBytes in packets queued in a qdisc. 
+        kb = [i[5] * 1e-3 for i in intervals]
+        kb.sort()
+        y = [i / len(kb) for i in range(len(kb))]
+        fig = plt.figure(figsize=[6,4])
+        ax = fig.add_subplot(111)
+        ax.set_xlim(0, 1500)
+        ax.set_xlabel('Kbytes in Packets Queued in a Qdisc')
+        ax.set_ylim(0, 1.0)
+        ax.set_ylabel('Cumulative Fraction of Intervals')
+        plt.grid(which="major", axis="y")
+        plt.plot(kb, y)
+        plt.tight_layout()
+        plt.savefig('%s/nictx_qdisc_cdf.pdf' % (options.plot))
+
         # Generate CDF of queued packets for intervals.
         pkts = [i[2] for i in intervals]
         pkts.sort()
-        y = [i / len(active) for i in range(len(active))]
+        y = [i / len(pkts) for i in range(len(pkts))]
         fig = plt.figure(figsize=[6,4])
         ax = fig.add_subplot(111)
         ax.set_xlim(0, 50)
@@ -7971,6 +7955,35 @@ def output(self):
         plt.tight_layout()
         plt.savefig('%s/nictx_input_cdf.pdf' % (options.plot))
 
+        # Generate time-series plot showing queuing in the qdisc and NIC
+        # for each node.
+        x_min = get_last_start()
+        x_max = get_first_end()
+        nodes = get_sorted_nodes()
+        maxy = max(max(node_data[node]['qdisc']) for node in nodes)
+        fig, axes = plt.subplots(nrows=len(nodes), ncols=1, sharex=False,
+                figsize=[8, len(nodes)*2])
+        for i in range(len(nodes)):
+            node = nodes[i]
+            ax = axes[i]
+            ax.set_xlim(x_min, x_max)
+            ax.set_xlabel('Time')
+            ax.set_ylim(0, maxy)
+            ax.set_ylabel('Kbytes Queued')
+            ax.grid(which="major", axis="y")
+            ax.plot(node_data[node]['t'], node_data[node]['qdisc'],
+                    color=color_blue, label='Nic + Qdisc')
+            ax.plot(node_data[node]['t'], node_data[node]['nic'],
+                    color=color_red, label='Nic')
+        legend_handles = [
+            matplotlib.lines.Line2D([], [], color=c, marker='o',
+                linestyle='None', markersize=8, label=label)
+                for c, label in [[color_blue, 'Nic + Qdisc'],
+                [color_red, 'Nic']]
+        ]
+        fig.legend(handles=legend_handles)
+        plt.tight_layout()
+        plt.savefig("%s/nictx_qtrends.pdf" % (options.plot), bbox_inches='tight')

         print('\n---------------')
         print('Analyzer: nictx')
@@ -11061,30 +11074,18 @@ def __init__(self, dispatcher):

     def output(self):
         global packets, grants, tcp_packets

-        nic_idle = -1e20
-        range_pkts = []
-        for pkt in tcp_packets.values():
-            if not 'tx_node' in pkt or pkt['tx_node'] != 'node5':
-                continue
+        selected = []
+        for pkt in itertools.chain(packets.values(), grants.values(),
+                tcp_packets.values()):
             if not 'nic' in pkt or not 'tso_length' in pkt:
                 continue
-            nic = pkt['nic']
-            if nic < -500 or nic > 1000:
+            if not 'xmit' in pkt:
                 continue
-            range_pkts.append(pkt)
-        range_pkts.sort(key=lambda pkt: pkt['nic'])
-        for pkt in range_pkts:
-            nic = pkt['nic']
-            length = pkt['tso_length'] + get_hdr_length(pkt) + 24
-            xmit_usecs = length * 8e-5
-            if nic_idle < nic:
-                nic_idle = nic + xmit_usecs
-            else:
-                nic_idle += xmit_usecs
-            print('%9.3f: C%02d length %5d, xmit %.2f us (%5.0f cycles), '
-                    'new nic_idle %.3f (queue %5.1f us, %5.0f cycles)' % (
-                    nic, pkt['nic_core'], length, xmit_usecs, xmit_usecs*2100,
-                    nic_idle, nic_idle - nic, (nic_idle - nic)*2100))
+            if pkt['nic'] - pkt['xmit'] > 5:
+                selected.append(pkt)
+
+        selected.sort(key=lambda pkt: pkt['xmit'])
+        print(print_pkts(selected, header=True), end='')

     def output_slow_pkts(self):
         pkts = []
@@ -12295,4 +12296,4 @@ def output(self):
 for name in analyzer_classes:
     analyzer = dispatcher.get_analyzer(name)
     if hasattr(analyzer, 'output'):
-        analyzer.output()
+        analyzer.output()
\ No newline at end of file

From a491270921a4d11f42ef9dd6259799048a150603 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Fri, 19 Dec 2025 16:31:57 -0800
Subject: [PATCH 604/625] Update notes.txt

---
 notes.txt | 9 +++++++++
 1 file changed, 9 insertions(+)
diff --git a/notes.txt b/notes.txt
index 92642c10..923468a0 100755
--- a/notes.txt
+++ b/notes.txt
@@ -1,9 +1,18 @@
 Notes for Homa implementation in Linux:
 ---------------------------------------
 
+* (12/12/25) Something is wrong with the xl170 cluster: both Homa and TCP
+  are showing considerably worse performance than previously. I tried multiple
+  different clusters, and tried backing out to older versions of Homa and
+  Linux, but the problems don't go away.
+
 * Performance problems to track down:
   * On xl170s, both TCP and Homa run slower with qdisc than pacer
     (P99 for TCP small packets increases by 50%)
+  * P99 latency for W4 short packets is considerably worse with IPv6 than IPv4
+    (c6620).
+  * On c6620 cluster, "cp_node client --workload 500000" gets 20 Gbps each
+    way with TCP but only 15 Gbps with Homa.
 
 * Move interest cleanup code from homa_sock to a new function in
   homa_interest. Also move wakeup code from homa_rpc_handoff.

From 256d6ebc07d4ec3c0269ed128c60547ed729c32c Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Tue, 23 Dec 2025 15:19:27 -0800
Subject: [PATCH 605/625] Change the scheduling algorithm for deferred TCP
 packets

homa_qdisc used to schedule deferred packets in strict FIFO order, but
this caused head-of-line blocking where a short message for one NIC
queue got stuck behind a long message for a different NIC queue. The new
approach is round-robin between NIC queues (but FIFO within a queue).

---
 homa_qdisc.c           | 78 ++++++++++++++++++++++++------------------
 homa_qdisc.h           | 35 +++++++++++++------
 test/unit_homa_qdisc.c | 76 +++++++++++++++++++++++-----------------
 3 files changed, 114 insertions(+), 75 deletions(-)

diff --git a/homa_qdisc.c b/homa_qdisc.c
index c48856b9..7e4aff5a 100755
--- a/homa_qdisc.c
+++ b/homa_qdisc.c
@@ -167,7 +167,7 @@ struct homa_qdisc_shared *homa_qdisc_shared_alloc(void)
 * homa_qdisc_shared_free() - Invoked when a struct homa is being freed;
 * releases information related to all the associated homa_qdiscs.
 * @qshared:   Information about homa_qdisc_devs associated with a
- *             particular struct homa. 
+ *             particular struct homa.
 */
 void homa_qdisc_shared_free(struct homa_qdisc_shared *qshared)
 {
@@ -254,7 +254,8 @@ struct homa_qdisc_dev *homa_qdisc_qdev_get(struct net_device *dev)
 	homa_qdev_update_sysctl(qdev);
 	INIT_LIST_HEAD(&qdev->links);
 	qdev->deferred_rpcs = RB_ROOT_CACHED;
-	skb_queue_head_init(&qdev->deferred_tcp);
+	INIT_LIST_HEAD(&qdev->deferred_qdiscs);
+	qdev->next_qdisc = &qdev->deferred_qdiscs;
 	spin_lock_init(&qdev->defer_lock);
 	init_waitqueue_head(&qdev->pacer_sleep);
 	spin_lock_init(&qdev->pacer_mutex);
@@ -313,7 +314,7 @@ void homa_qdisc_dev_callback(struct rcu_head *head)
 
 	qdev = container_of(head, struct homa_qdisc_dev, rcu_head);
 	homa_qdisc_free_homa(qdev);
-	WARN_ON(!skb_queue_empty(&qdev->deferred_tcp));
+	WARN_ON(!list_empty(&qdev->deferred_qdiscs));
 	kfree(qdev);
 }
 
@@ -344,6 +345,8 @@ int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt,
 			break;
 		}
 	}
+	skb_queue_head_init(&q->deferred_tcp);
+	INIT_LIST_HEAD(&q->defer_links);
 	sch->limit = 10 * 1024;
 	return 0;
 
@@ -357,19 +360,15 @@ int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt,
 void homa_qdisc_destroy(struct Qdisc *qdisc)
 {
 	struct homa_qdisc *q = qdisc_priv(qdisc);
-	struct sk_buff *skb, *tmp;
 	unsigned long flags;
 
 	qdisc_reset_queue(qdisc);
 
-	/* Delete any deferred skb's for this qdisc. */
 	spin_lock_irqsave(&q->qdev->defer_lock, flags);
-	skb_queue_walk_safe(&q->qdev->deferred_tcp, skb, tmp) {
-		if (skb_get_queue_mapping(skb) == q->ix) {
-			__skb_unlink(skb, &q->qdev->deferred_tcp);
-			kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP);
-		}
-	}
+	while (!skb_queue_empty(&q->deferred_tcp))
+		kfree_skb_reason(__skb_dequeue(&q->deferred_tcp),
+				 SKB_DROP_REASON_QDISC_DROP);
+	list_del_init(&q->defer_links);
 	spin_unlock_irqrestore(&q->qdev->defer_lock, flags);
 	homa_qdisc_qdev_put(q->qdev);
 }
@@ -410,7 +409,7 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	 * already deferred for this qdisc.
	 */
 	INC_METRIC(qdisc_tcp_packets, 1);
-	if (atomic_read(&q->num_deferred_tcp) > 0) {
+	if (!skb_queue_empty(&q->deferred_tcp)) {
 		homa_qdisc_defer_tcp(q, skb);
 		return NET_XMIT_SUCCESS;
 	}
@@ -492,8 +491,9 @@ void homa_qdisc_defer_tcp(struct homa_qdisc *q, struct sk_buff *skb)
 		       skb, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
 
 	spin_lock_irqsave(&qdev->defer_lock, flags);
-	__skb_queue_tail(&qdev->deferred_tcp, skb);
-	atomic_inc(&q->num_deferred_tcp);
+	__skb_queue_tail(&q->deferred_tcp, skb);
+	if (list_empty(&q->defer_links))
+		list_add_tail(&q->defer_links, &qdev->deferred_qdiscs);
 	if (qdev->last_defer)
 		INC_METRIC(nic_backlog_cycles, now - qdev->last_defer);
 	else
@@ -572,22 +572,43 @@ void homa_qdisc_insert_rb(struct homa_qdisc_dev *qdev, struct homa_rpc *rpc)
 */
 int homa_qdisc_xmit_deferred_tcp(struct homa_qdisc_dev *qdev)
 {
-	struct netdev_queue *txq;
 	struct homa_qdisc *q;
 	unsigned long flags;
 	struct sk_buff *skb;
-	struct Qdisc *qdisc;
 	int pkt_len;
 
+	/* When there are deferred TCP packets on multiple queues, we
+	 * will cycle between the queues in round-robin style, transmitting
+	 * one packet from each queue. An earlier implementation kept all
+	 * of the deferred TCP packets on a single global queue for the qdev
+	 * and transmitted them in FIFO fashion. However, this resulted in
+	 * head-of-line blocking where a short message for one queue could
+	 * get stuck behind a long message for a different queue, resulting
+	 * in high tail latency. With the round-robin approach, shorter
+	 * messages get transmitted more quickly as long as they don't use
+	 * the same NIC queue as a long message.
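+	 *
+	 * For example, if queue 3 holds deferred packets [A1, A2] and queue
+	 * 7 holds [B1], round-robin hands them to the NIC in the order
+	 * A1, B1, A2 rather than the old FIFO order A1, A2, B1.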
+	 */
+
 	spin_lock_irqsave(&qdev->defer_lock, flags);
-	if (skb_queue_empty(&qdev->deferred_tcp)) {
+	if (list_empty(&qdev->deferred_qdiscs)) {
 		spin_unlock_irqrestore(&qdev->defer_lock, flags);
 		return 0;
 	}
-	skb = __skb_dequeue(&qdev->deferred_tcp);
-	if (!homa_qdisc_any_deferred(qdev)) {
-		INC_METRIC(nic_backlog_cycles, homa_clock() - qdev->last_defer);
-		qdev->last_defer = 0;
+	if (qdev->next_qdisc == &qdev->deferred_qdiscs)
+		q = list_first_entry(&qdev->deferred_qdiscs, struct homa_qdisc,
+				     defer_links);
+	else
+		q = list_entry(qdev->next_qdisc, struct homa_qdisc,
+			       defer_links);
+	qdev->next_qdisc = q->defer_links.next;
+	skb = __skb_dequeue(&q->deferred_tcp);
+	if (skb_queue_empty(&q->deferred_tcp)) {
+		list_del_init(&q->defer_links);
+		if (!homa_qdisc_any_deferred(qdev)) {
+			INC_METRIC(nic_backlog_cycles,
+				   homa_clock() - qdev->last_defer);
+			qdev->last_defer = 0;
+		}
 	}
 	spin_unlock_irqrestore(&qdev->defer_lock, flags);
 
@@ -597,18 +618,7 @@ int homa_qdisc_xmit_deferred_tcp(struct homa_qdisc_dev *qdev)
 	tt_record_tcp("homa_qdisc_pacer requeued TCP packet from "
 			"0x%x to 0x%x, data bytes %d, seq/ack %u",
 			skb, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
-
-	rcu_read_lock_bh();
-	txq = netdev_get_tx_queue(skb->dev, skb_get_queue_mapping(skb));
-	qdisc = rcu_dereference_bh(txq->qdisc);
-	if (qdisc->ops == &homa_qdisc_ops) {
-		q = qdisc_priv(qdisc);
-		atomic_dec(&q->num_deferred_tcp);
-		homa_qdisc_schedule_skb(skb, qdisc);
-	} else {
-		kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP);
-	}
-	rcu_read_unlock_bh();
+	homa_qdisc_schedule_skb(skb, qdisc_from_priv(q));
 	return pkt_len;
 }
 
@@ -869,7 +879,7 @@ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev, bool help)
	 * prevent negative credit buildup for the protocol
	 * with packets.
	 */
-		if (skb_queue_empty(&qdev->deferred_tcp)) {
+		if (list_empty(&qdev->deferred_qdiscs)) {
 			if (!rb_first_cached(&qdev->deferred_rpcs))
 				break;
 			qdev->homa_credit = 1;
diff --git a/homa_qdisc.h b/homa_qdisc.h
index 6adeec89..7c4ae7f3 100644
--- a/homa_qdisc.h
+++ b/homa_qdisc.h
@@ -38,11 +38,18 @@ struct homa_qdisc {
 	int ix;
 
 	/**
-	 * @num_deferred_tcp: Count of the number of TCP packets for this
-	 * qdisc that are currently in qdev->deferred_tcp. Incremented and
-	 * decremented without holding a lock.
+	 * @deferred_tcp: List of non-Homa packets for this qdisc that have
+	 * been deferred because of NIC overload, in order of arrival.
+	 * Synchronized with qdev->defer_lock.
	 */
-	atomic_t num_deferred_tcp;
+	struct sk_buff_head deferred_tcp;
+
+	/**
+	 * @defer_links: Used to link this object into qdev->deferred_qdiscs
+	 * when deferred_tcp is nonempty. This will be an empty list if
+	 * deferred_tcp is empty. Synchronized with qdev->defer_lock.
+	 */
+	struct list_head defer_links;
 };
 
 /**
@@ -98,11 +105,18 @@ struct homa_qdisc_dev {
 	struct rb_root_cached deferred_rpcs;
 
 	/**
-	 * @deferred_tcp: List of all non-Homa packets that have been deferred
-	 * because of NIC overload, in order of when they were deferred.
-	 * The internal lock isn't used (defer_lock is used instead)
+	 * @deferred_qdiscs: List of all homa_qdiscs with non-Homa packets
+	 * that have been deferred because of NIC overload.
	 */
-	struct sk_buff_head deferred_tcp;
+	struct list_head deferred_qdiscs;
+
+	/**
+	 * @next_qdisc: Points to either the defer_links field in a homa_qdisc
+	 * or to deferred_qdiscs above. Used to select the next non-Homa packet
+	 * for transmission. Note: this may refer to deferred_qdiscs even when
+	 * deferred_qdiscs is nonempty.
+ */ + struct list_head *next_qdisc; /** * @last_defer: The most recent homa_clock() time when a packet was @@ -124,7 +138,8 @@ struct homa_qdisc_dev { /** * @defer_lock: Synchronizes access to information about deferred - * packets, including deferred_rpcs, deferred_tcp, and last_defer. + * packets, including deferred_rpcs, deferred_qdiscs, next_qdisc, + * last_defer, and some information in homa_qdiscs. */ spinlock_t defer_lock; @@ -308,7 +323,7 @@ static inline void homa_qdisc_rpc_init(struct homa_rpc_qdisc *qrpc) static inline bool homa_qdisc_any_deferred(struct homa_qdisc_dev *qdev) { return rb_first_cached(&qdev->deferred_rpcs) || - !skb_queue_empty(&qdev->deferred_tcp); + !list_empty(&qdev->deferred_qdiscs); } /** diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c index 46399c70..cf253a79 100644 --- a/test/unit_homa_qdisc.c +++ b/test/unit_homa_qdisc.c @@ -442,16 +442,14 @@ TEST_F(homa_qdisc, homa_qdisc_destroy) EXPECT_NE(NULL, qdev); EXPECT_EQ(2, refcount_read(&qdev->refs)); - mock_queue_index = 3; homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1000)); - mock_queue_index = 4; - homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 6000, 1100)); - mock_queue_index = 3; + homa_qdisc_defer_tcp(q2, mock_tcp_skb(&self->addr, 6000, 1100)); homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 7000, 1100)); + EXPECT_EQ(2, unit_list_length(&qdev->deferred_qdiscs)); homa_qdisc_destroy(qdisc); EXPECT_EQ(1, refcount_read(&qdev->refs)); - EXPECT_EQ(1, skb_queue_len(&qdev->deferred_tcp)); + EXPECT_EQ(1, unit_list_length(&qdev->deferred_qdiscs)); homa_qdisc_destroy(qdisc2); EXPECT_EQ(0, unit_list_length(&self->homa.qshared->qdevs)); @@ -482,7 +480,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_short_tcp_packet) to_free = NULL; homa_qdisc_enqueue(skb, qdisc, &to_free); EXPECT_EQ(NULL, to_free); - EXPECT_EQ(1, atomic_read(&q->num_deferred_tcp)); + EXPECT_EQ(1, skb_queue_len(&q->deferred_tcp)); EXPECT_EQ(1000000, atomic64_read(&q->qdev->link_idle_time)); /* Second packet is short, but must be deferred to maintain order @@ -492,7 +490,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_short_tcp_packet) to_free = NULL; homa_qdisc_enqueue(skb, qdisc, &to_free); EXPECT_EQ(NULL, to_free); - EXPECT_EQ(2, atomic_read(&q->num_deferred_tcp)); + EXPECT_EQ(2, skb_queue_len(&q->deferred_tcp)); EXPECT_EQ(1000000, atomic64_read(&q->qdev->link_idle_time)); homa_qdisc_destroy(qdisc); @@ -563,7 +561,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_tcp_packet_because_of_homa_deferred to_free = NULL; homa_qdisc_enqueue(skb, qdisc, &to_free); EXPECT_EQ(NULL, to_free); - EXPECT_EQ(1, atomic_read(&q->num_deferred_tcp)); + EXPECT_EQ(1, skb_queue_len(&q->deferred_tcp)); EXPECT_EQ(1000000, atomic64_read(&q->qdev->link_idle_time)); homa_qdisc_destroy(qdisc); @@ -721,12 +719,12 @@ TEST_F(homa_qdisc, homa_qdisc_defer_tcp__basics) mock_queue_index = 2; homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1500)); - EXPECT_EQ(1, skb_queue_len(&q->qdev->deferred_tcp)); - EXPECT_EQ(1, atomic_read(&q->num_deferred_tcp)); + EXPECT_EQ(1, unit_list_length(&q->qdev->deferred_qdiscs)); + EXPECT_EQ(1, skb_queue_len(&q->deferred_tcp)); homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 7000, 1500)); - EXPECT_EQ(2, skb_queue_len(&q->qdev->deferred_tcp)); - EXPECT_EQ(2, atomic_read(&q->num_deferred_tcp)); + EXPECT_EQ(1, unit_list_length(&q->qdev->deferred_qdiscs)); + EXPECT_EQ(2, skb_queue_len(&q->deferred_tcp)); } TEST_F(homa_qdisc, homa_qdisc_defer_tcp__update_metrics_and_wakeup) { @@ -1010,7 +1008,7 @@ TEST_F(homa_qdisc, 
homa_qdisc_xmit_deferred_tcp__basics) EXPECT_EQ(1100, homa_qdisc_xmit_deferred_tcp(q->qdev)); EXPECT_EQ(1, self->qdiscs[2]->q.qlen); - EXPECT_EQ(0, skb_queue_len(&q->qdev->deferred_tcp)); + EXPECT_EQ(0, unit_list_length(&q->qdev->deferred_qdiscs)); EXPECT_LT(20000, atomic64_read(&q->qdev->link_idle_time)); } TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__no_deferred_packets) @@ -1023,6 +1021,40 @@ TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__no_deferred_packets) EXPECT_EQ(0, self->qdiscs[2]->q.qlen); homa_qdisc_qdev_put(qdev); } +TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__round_robin_between_qdiscs) +{ + struct homa_qdisc *q1, *q2, *q3; + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[1], NULL, NULL)); + q1 = qdisc_priv(self->qdiscs[1]); + q1->ix = 1; + mock_queue_index = 1; + homa_qdisc_defer_tcp(q1, mock_tcp_skb(&self->addr, 5000, 1000)); + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); + q2 = qdisc_priv(self->qdiscs[2]); + q2->ix = 2; + mock_queue_index = 2; + homa_qdisc_defer_tcp(q2, mock_tcp_skb(&self->addr, 5000, 1100)); + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + q3 = qdisc_priv(self->qdiscs[3]); + q3->ix = 3; + mock_queue_index = 3; + homa_qdisc_defer_tcp(q3, mock_tcp_skb(&self->addr, 5000, 1200)); + EXPECT_EQ(3, unit_list_length(&q3->qdev->deferred_qdiscs)); + + q2->qdev->next_qdisc = &q3->defer_links; + EXPECT_EQ(1300, homa_qdisc_xmit_deferred_tcp(q2->qdev)); + EXPECT_EQ(1, self->qdiscs[3]->q.qlen); + EXPECT_EQ(2, unit_list_length(&q3->qdev->deferred_qdiscs)); + EXPECT_EQ(1100, homa_qdisc_xmit_deferred_tcp(q2->qdev)); + EXPECT_EQ(1, self->qdiscs[1]->q.qlen); + EXPECT_EQ(1, unit_list_length(&q1->qdev->deferred_qdiscs)); + EXPECT_EQ(1200, homa_qdisc_xmit_deferred_tcp(q2->qdev)); + EXPECT_EQ(1, self->qdiscs[2]->q.qlen); + EXPECT_EQ(0, unit_list_length(&q2->qdev->deferred_qdiscs)); +} TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__backlog_cycles_metric) { struct homa_qdisc *q1; @@ -1047,24 +1079,6 @@ TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__backlog_cycles_metric) EXPECT_EQ(3000, homa_metrics_per_cpu()->nic_backlog_cycles); EXPECT_EQ(0, q1->qdev->last_defer); } -TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__qdisc_not_homa) -{ - const struct Qdisc_ops *saved_ops; - struct homa_qdisc *q; - - EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[2], NULL, NULL)); - q = qdisc_priv(self->qdiscs[2]); - q->ix = 2; - mock_queue_index = 2; - homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1000)); - saved_ops = self->qdiscs[2]->ops; - self->qdiscs[2]->ops = NULL; - - EXPECT_EQ(1100, homa_qdisc_xmit_deferred_tcp(q->qdev)); - EXPECT_EQ(0, self->qdiscs[2]->q.qlen); - EXPECT_EQ(0, skb_queue_len(&q->qdev->deferred_tcp)); - self->qdiscs[2]->ops = saved_ops; -} TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__no_deferred_rpcs) { From b2dca2f0580523ce1543109cd92f5e89756867f8 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 23 Dec 2025 17:30:31 -0800 Subject: [PATCH 606/625] Rename max_nic_queue_ns config param to max_nic_est_backlog_usecs This is in preparation for adding a second parameter for the measured length of individual queues. 
--- cloudlab/bin/config | 2 +- homa_pacer.c | 8 +++++--- homa_pacer.h | 4 ++-- homa_qdisc.c | 21 +++++++++++---------- homa_qdisc.h | 15 ++++++++------- man/homa.7 | 21 ++++++++++++--------- protocol.md | 2 +- test/unit_homa_outgoing.c | 8 ++++---- test/unit_homa_pacer.c | 28 ++++++++++++++-------------- test/unit_homa_qdisc.c | 22 +++++++++++----------- util/cp_config | 6 +++--- util/cp_tcp_config | 6 +++--- 12 files changed, 75 insertions(+), 68 deletions(-) diff --git a/cloudlab/bin/config b/cloudlab/bin/config index 682a396c..5ccf574f 100755 --- a/cloudlab/bin/config +++ b/cloudlab/bin/config @@ -488,7 +488,7 @@ def config_homa(mod): set_sysctl("num_priorities", "8") link_mbps = get_link_speed() set_sysctl ("link_mbps", str(link_mbps)) - set_sysctl("max_nic_queue_ns", "5000") + set_sysctl("max_nic_est_backlog_usecs", "5") if link_mbps == 10000: set_sysctl("unsched_bytes", "30000") set_sysctl("window", "50000") diff --git a/homa_pacer.c b/homa_pacer.c index 1b716af9..14c9ff71 100644 --- a/homa_pacer.c +++ b/homa_pacer.c @@ -87,8 +87,9 @@ int homa_pacer_check_nic_q(struct homa_pacer *pacer, struct sk_buff *skb, while (1) { clock = homa_clock(); idle = atomic64_read(&pacer->link_idle_time); - if ((clock + pacer->homa->qshared->max_nic_queue_cycles) < idle && - !force && !(pacer->homa->flags & HOMA_FLAG_DONT_THROTTLE)) + if ((clock + pacer->homa->qshared->max_nic_est_backlog_cycles) < + idle && !force && + !(pacer->homa->flags & HOMA_FLAG_DONT_THROTTLE)) return 0; if (!list_empty(&pacer->throttled_rpcs)) { INC_METRIC(pacer_homa_packets, 1); @@ -173,7 +174,8 @@ void homa_pacer_xmit(struct homa_pacer *pacer) while (1) { queue_cycles = atomic64_read(&pacer->link_idle_time) - homa_clock(); - if (queue_cycles >= pacer->homa->qshared->max_nic_queue_cycles) + if (queue_cycles >= + pacer->homa->qshared->max_nic_est_backlog_cycles) break; if (list_empty(&pacer->throttled_rpcs)) break; diff --git a/homa_pacer.h b/homa_pacer.h index 6379f012..0d537d4f 100644 --- a/homa_pacer.h +++ b/homa_pacer.h @@ -113,8 +113,8 @@ static inline void homa_pacer_check(struct homa_pacer *pacer) * to queue new packets; if the NIC queue becomes more than half * empty, then we will help out here. 
*/ - if ((homa_clock() + (pacer->homa->qshared->max_nic_queue_cycles >> 1)) < - atomic64_read(&pacer->link_idle_time)) + if ((homa_clock() + (pacer->homa->qshared->max_nic_est_backlog_cycles >> + 1)) < atomic64_read(&pacer->link_idle_time)) return; tt_record("homa_check_pacer calling homa_pacer_xmit"); homa_pacer_xmit(pacer); diff --git a/homa_qdisc.c b/homa_qdisc.c index 7e4aff5a..2200748d 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -27,8 +27,8 @@ #define OFFSET(field) ((void *)offsetof(struct homa_qdisc_shared, field)) static struct ctl_table homa_qdisc_ctl_table[] = { { - .procname = "max_nic_queue_ns", - .data = OFFSET(max_nic_queue_ns), + .procname = "max_nic_est_backlog_usecs", + .data = OFFSET(max_nic_est_backlog_usecs), .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_qdisc_dointvec @@ -148,7 +148,7 @@ struct homa_qdisc_shared *homa_qdisc_shared_alloc(void) mutex_init(&qshared->mutex); INIT_LIST_HEAD(&qshared->qdevs); qshared->fifo_fraction = 50; - qshared->max_nic_queue_ns = 5000; + qshared->max_nic_est_backlog_usecs = 5; qshared->defer_min_bytes = 1000; qshared->homa_share = 50; qshared->max_link_usage = 99; @@ -419,7 +419,7 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, } if (!homa_qdisc_any_deferred(qdev) && homa_qdisc_update_link_idle(qdev, pkt_len, - qshared->max_nic_queue_cycles)) + qshared->max_nic_est_backlog_cycles)) goto enqueue; homa_qdisc_defer_tcp(q, skb); return NET_XMIT_SUCCESS; @@ -446,7 +446,7 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (!homa_qdisc_any_deferred(qdev) && homa_qdisc_update_link_idle(qdev, pkt_len, - qshared->max_nic_queue_cycles)) + qshared->max_nic_est_backlog_cycles)) goto enqueue; /* This packet needs to be deferred until the NIC queue has @@ -854,7 +854,8 @@ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev, bool help) /* If the NIC queue is too long, wait until it gets shorter. */ now = homa_clock(); idle_time = atomic64_read(&qdev->link_idle_time); - while ((now + qdev->hnet->homa->qshared->max_nic_queue_cycles) < + while ((now + + qdev->hnet->homa->qshared->max_nic_est_backlog_cycles) < idle_time) { /* If we've xmitted at least one packet then * return (this helps with testing and also @@ -925,7 +926,7 @@ void homa_qdisc_pacer_check(struct homa *homa) u64 now = homa_clock(); int max_cycles; - max_cycles = homa->qshared->max_nic_queue_cycles; + max_cycles = homa->qshared->max_nic_est_backlog_cycles; rcu_read_lock(); list_for_each_entry_rcu(qdev, &homa->qshared->qdevs, links) { if (!homa_qdisc_any_deferred(qdev)) @@ -933,7 +934,7 @@ void homa_qdisc_pacer_check(struct homa *homa) /* The ">> 1" means that we only help out if the NIC queue has * dropped below half of its maximum allowed capacity. This - * gives the pacer thread the first shot at queuting new + * gives the pacer thread the first shot at queuing new * packets. 
	 */
 		if (now + (max_cycles >> 1) <
@@ -1028,8 +1029,8 @@ void homa_qdisc_update_sysctl_deps(struct homa_qdisc_shared *qshared)
 {
 	struct homa_qdisc_dev *qdev;
 
-	qshared->max_nic_queue_cycles =
-		homa_ns_to_cycles(qshared->max_nic_queue_ns);
+	qshared->max_nic_est_backlog_cycles = homa_ns_to_cycles(1000 *
+			qshared->max_nic_est_backlog_usecs);
 
 	if (qshared->homa_share < 0)
 		qshared->homa_share = 0;
diff --git a/homa_qdisc.h b/homa_qdisc.h
index 7c4ae7f3..b548234d 100644
--- a/homa_qdisc.h
+++ b/homa_qdisc.h
@@ -196,17 +196,18 @@ struct homa_qdisc_shared {
 	int fifo_fraction;
 
 	/**
-	 * @max_nic_queue_ns: Limits the NIC queue length: we won't queue
-	 * up a packet for transmission if link_idle_time is this many
-	 * nanoseconds in the future (or more). Set externally via sysctl.
+	 * @max_nic_est_backlog_usecs: Limits the NIC queue length: we won't
+	 * queue packets in the NIC for transmission if link_idle_time is
+	 * this many microseconds in the future (or more). Set externally via
+	 * sysctl.
	 */
-	int max_nic_queue_ns;
+	int max_nic_est_backlog_usecs;
 
 	/**
-	 * @max_nic_queue_cycles: Same as max_nic_queue_ns except in
-	 * homa_clock() units.
+	 * @max_nic_est_backlog_cycles: Same as max_nic_est_backlog_usecs
+	 * except in homa_clock() units.
	 */
-	int max_nic_queue_cycles;
+	int max_nic_est_backlog_cycles;
 
 	/**
 	 * @defer_min_bytes: If a packet has fewer bytes than this, then it
diff --git a/man/homa.7 b/man/homa.7
index afe49055..06d6642b 100644
--- a/man/homa.7
+++ b/man/homa.7
@@ -692,20 +692,23 @@ affect performance.
 Note that queues can sometimes form even with values less than 100,
 since most NICs cannot transmit at full link speed under all conditions.
 .TP
-.IR max_nic_queue_ns
-An integer value specifying a NIC queue length in units of nanoseconds
-(how long it will take the existing packets in the queue
-to be fully transmitted).
-If the NIC queue is longer than this, Homa will wait to queue additional
-packets until the queue length drops below this value.
-This parameter is used to throttle the NIC output queue in order to
-implement SRPT more accurately for outbound messages.
+.IR max_nic_est_backlog_usecs
+This value is used to prevent the buildup of large queues of packets
+waiting to be transmitted in the NIC. Homa keeps a running
+estimate of how many bytes are currently queued in the NIC, assuming
+the NIC transmits at full link speed. If the queue gets long enough
+that it will take more than
+.I max_nic_est_backlog_usecs
+microseconds to transmit all of the queued data, Homa will wait to queue
+additional packets until the estimated queue length drops below this value.
+Large NIC queues are bad because
+they interfere with Homa's SRPT scheduling policy.
 Once a packet has been queued in the NIC, Homa cannot schedule a higher
 priority back in front of it; the longer the queue, the longer the delay
 for a newly arriving high priority packet.
 Lower values for this parameter reduce preemption lag and result in a
 better approximation of SRPT, but the value must be high enough to
-queue the next packet before
+allow time to queue the next packet before
 the NIC becomes idle; otherwise, output bandwidth will be lost.
 .TP
 .IR max_overcommit
diff --git a/protocol.md b/protocol.md
index 42931927..6f9313e3 100644
--- a/protocol.md
+++ b/protocol.md
@@ -254,7 +254,7 @@ which violates SRPT. To prevent this problem, Homa employs a *pacer*
 mechanism. Homa maintains a running estimate of how many bytes have
 been passed to the NIC but not yet transmitted (the *NIC backlog*).
If this exceeds a -threshold value (specified in units of time with the `max_nic_queue_ns` +threshold value (specified in units of time with the `max_nic_est_backlog_usecs` parameter) then no more packets will be transmitted until the NIC backlog drops below the limit. Homa maintains a *throttled list*, which contains outgoing messages that have packets ready to transmit. diff --git a/test/unit_homa_outgoing.c b/test/unit_homa_outgoing.c index 520a2a21..efe136ab 100644 --- a/test/unit_homa_outgoing.c +++ b/test/unit_homa_outgoing.c @@ -831,7 +831,7 @@ TEST_F(homa_outgoing, homa_xmit_data__below_throttle_min) unit_log_clear(); atomic64_set(&self->homa.pacer->link_idle_time, 11000); - self->homa.qshared->max_nic_queue_cycles = 500; + self->homa.qshared->max_nic_est_backlog_cycles = 500; self->homa.qshared->defer_min_bytes = 250; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; homa_rpc_lock(crpc); @@ -853,7 +853,7 @@ TEST_F(homa_outgoing, homa_xmit_data__force) /* First, get an RPC on the throttled list. */ atomic64_set(&self->homa.pacer->link_idle_time, 11000); - self->homa.qshared->max_nic_queue_cycles = 3000; + self->homa.qshared->max_nic_est_backlog_cycles = 3000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; homa_rpc_lock(crpc1); XMIT_DATA(crpc1, false); @@ -884,7 +884,7 @@ TEST_F(homa_outgoing, homa_xmit_data__dont_throttle_because_homa_qdisc_in_use) self->client_id, 2000, 1000); unit_log_clear(); atomic64_set(&self->homa.pacer->link_idle_time, 1000000); - self->homa.qshared->max_nic_queue_cycles = 0; + self->homa.qshared->max_nic_est_backlog_cycles = 0; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; homa_rpc_lock(crpc); @@ -904,7 +904,7 @@ TEST_F(homa_outgoing, homa_xmit_data__throttle) unit_log_clear(); atomic64_set(&self->homa.pacer->link_idle_time, 11000); - self->homa.qshared->max_nic_queue_cycles = 3000; + self->homa.qshared->max_nic_est_backlog_cycles = 3000; self->homa.flags &= ~HOMA_FLAG_DONT_THROTTLE; homa_rpc_lock(crpc); diff --git a/test/unit_homa_pacer.c b/test/unit_homa_pacer.c index ab4a693f..02e01b2d 100644 --- a/test/unit_homa_pacer.c +++ b/test/unit_homa_pacer.c @@ -142,7 +142,7 @@ TEST_F(homa_pacer, homa_pacer_check_nic_q__success) unit_log_clear(); atomic64_set(&self->homa.pacer->link_idle_time, 9000); mock_clock = 8000; - self->homa.qshared->max_nic_queue_cycles = 1000; + self->homa.qshared->max_nic_est_backlog_cycles = 1000; EXPECT_EQ(1, homa_pacer_check_nic_q(self->homa.pacer, crpc->msgout.packets, false)); EXPECT_EQ(9500, atomic64_read(&self->homa.pacer->link_idle_time)); @@ -159,7 +159,7 @@ TEST_F(homa_pacer, homa_pacer_check_nic_q__queue_full) unit_log_clear(); atomic64_set(&self->homa.pacer->link_idle_time, 9000); mock_clock = 7999; - self->homa.qshared->max_nic_queue_cycles = 1000; + self->homa.qshared->max_nic_est_backlog_cycles = 1000; EXPECT_EQ(0, homa_pacer_check_nic_q(self->homa.pacer, crpc->msgout.packets, false)); EXPECT_EQ(9000, atomic64_read(&self->homa.pacer->link_idle_time)); @@ -176,7 +176,7 @@ TEST_F(homa_pacer, homa_pacer_check_nic_q__queue_full_but_force) unit_log_clear(); atomic64_set(&self->homa.pacer->link_idle_time, 9000); mock_clock = 7999; - self->homa.qshared->max_nic_queue_cycles = 1000; + self->homa.qshared->max_nic_est_backlog_cycles = 1000; EXPECT_EQ(1, homa_pacer_check_nic_q(self->homa.pacer, crpc->msgout.packets, true)); EXPECT_EQ(9500, atomic64_read(&self->homa.pacer->link_idle_time)); @@ -193,7 +193,7 @@ TEST_F(homa_pacer, homa_pacer_check_nic_q__queue_empty) unit_log_clear(); atomic64_set(&self->homa.pacer->link_idle_time, 9000); 
mock_clock = 10000; - self->homa.qshared->max_nic_queue_cycles = 1000; + self->homa.qshared->max_nic_est_backlog_cycles = 1000; EXPECT_EQ(1, homa_pacer_check_nic_q(self->homa.pacer, crpc->msgout.packets, true)); EXPECT_EQ(10500, atomic64_read(&self->homa.pacer->link_idle_time)); @@ -219,7 +219,7 @@ TEST_F(homa_pacer, homa_pacer_main__xmit_data) homa_pacer_manage_rpc(crpc1); homa_pacer_manage_rpc(crpc2); - self->homa.qshared->max_nic_queue_cycles = 3000; + self->homa.qshared->max_nic_est_backlog_cycles = 3000; mock_clock_tick = 200; unit_hook_register(exit_idle_hook); hook_pacer = self->homa.pacer; @@ -253,7 +253,7 @@ TEST_F(homa_pacer, homa_pacer_main__rpc_arrives_while_sleeping) mock_clock_tick = 200; unit_hook_register(manage_hook); hook_rpc = crpc; - self->homa.qshared->max_nic_queue_cycles = 2000; + self->homa.qshared->max_nic_est_backlog_cycles = 2000; unit_log_clear(); homa_pacer_main(self->homa.pacer); @@ -288,7 +288,7 @@ TEST_F(homa_pacer, homa_pacer_xmit__basics) homa_pacer_manage_rpc(crpc1); homa_pacer_manage_rpc(crpc2); homa_pacer_manage_rpc(crpc3); - self->homa.qshared->max_nic_queue_cycles = 2000; + self->homa.qshared->max_nic_est_backlog_cycles = 2000; unit_log_clear(); homa_pacer_xmit(self->homa.pacer); EXPECT_STREQ("xmit DATA 1400@0; xmit DATA 1400@1400", @@ -308,7 +308,7 @@ TEST_F(homa_pacer, homa_pacer_xmit__pacer_already_active) self->client_id, 10000, 1000); homa_pacer_manage_rpc(crpc); - self->homa.qshared->max_nic_queue_cycles = 2000; + self->homa.qshared->max_nic_est_backlog_cycles = 2000; mock_trylock_errors = 1; unit_log_clear(); homa_pacer_xmit(self->homa.pacer); @@ -326,7 +326,7 @@ TEST_F(homa_pacer, homa_pacer_xmit__nic_queue_fills) self->client_id, 10000, 1000); homa_pacer_manage_rpc(crpc); - self->homa.qshared->max_nic_queue_cycles = 2001; + self->homa.qshared->max_nic_est_backlog_cycles = 2001; mock_clock = 10000; atomic64_set(&self->homa.pacer->link_idle_time, 12000); unit_log_clear(); @@ -340,7 +340,7 @@ TEST_F(homa_pacer, homa_pacer_xmit__nic_queue_fills) } TEST_F(homa_pacer, homa_pacer_xmit__queue_empty) { - self->homa.qshared->max_nic_queue_cycles = 2000; + self->homa.qshared->max_nic_est_backlog_cycles = 2000; unit_log_clear(); homa_pacer_xmit(self->homa.pacer); unit_log_throttled(&self->homa); @@ -367,7 +367,7 @@ TEST_F(homa_pacer, homa_pacer_xmit__xmit_fifo) homa_pacer_manage_rpc(crpc3); /* First attempt: pacer->fifo_count doesn't reach zero. */ - self->homa.qshared->max_nic_queue_cycles = 1300; + self->homa.qshared->max_nic_est_backlog_cycles = 1300; self->homa.pacer->fifo_count = 200; self->homa.qshared->fifo_fraction = 150; mock_clock= 13000; @@ -406,7 +406,7 @@ TEST_F(homa_pacer, homa_pacer_xmit__rpc_removed_from_queue_before_locked) self->client_id, 10000, 1000); homa_pacer_manage_rpc(crpc); - self->homa.qshared->max_nic_queue_cycles = 10000; + self->homa.qshared->max_nic_est_backlog_cycles = 10000; unit_log_clear(); unit_hook_register(unmanage_hook); hook_rpc = crpc; @@ -431,7 +431,7 @@ TEST_F(homa_pacer, homa_pacer_xmit__remove_from_queue) homa_pacer_manage_rpc(crpc1); homa_pacer_manage_rpc(crpc2); - self->homa.qshared->max_nic_queue_cycles = 2000; + self->homa.qshared->max_nic_est_backlog_cycles = 2000; unit_log_clear(); /* First call completes id 2, but id 4 is still in the queue. */ @@ -445,7 +445,7 @@ TEST_F(homa_pacer, homa_pacer_xmit__remove_from_queue) /* Second call completes id 4, queue now empty. 
*/ unit_log_clear(); - self->homa.qshared->max_nic_queue_cycles = 10000; + self->homa.qshared->max_nic_est_backlog_cycles = 10000; homa_pacer_xmit(self->homa.pacer); EXPECT_STREQ("xmit DATA 600@1400; removing id 4 from throttled list", unit_log_get()); diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c index cf253a79..753b4a8b 100644 --- a/test/unit_homa_qdisc.c +++ b/test/unit_homa_qdisc.c @@ -643,7 +643,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_homa_packet) /* First packet is deferred because the NIC queue is full. */ EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); q = qdisc_priv(qdisc); - idle = mock_clock + 1 + self->homa.qshared->max_nic_queue_cycles + 1; + idle = mock_clock + 1 + self->homa.qshared->max_nic_est_backlog_cycles + 1; atomic64_set(&q->qdev->link_idle_time, idle); skb = new_test_skb(srpc, &self->addr, 0, 1500); to_free = NULL; @@ -1415,7 +1415,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__spin_until_link_idle) mock_clock = 0; mock_clock_tick = 1000; atomic64_set(&qdev->link_idle_time, 10000); - self->homa.qshared->max_nic_queue_cycles = 3500; + self->homa.qshared->max_nic_est_backlog_cycles = 3500; unit_log_clear(); unit_hook_register(xmit_hook); xmit_clock = 0; @@ -1457,7 +1457,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__return_after_one_packet) unit_log_get()); mock_clock = atomic64_read(&qdev->link_idle_time); - self->homa.qshared->max_nic_queue_cycles = 100; + self->homa.qshared->max_nic_est_backlog_cycles = 100; unit_log_clear(); homa_qdisc_pacer(qdev, false); @@ -1593,7 +1593,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__both_protocols_have_packets_choose_tcp) * packet is transmitted. */ atomic64_set(&qdev->link_idle_time, 1000000); - qdev->hnet->homa->qshared->max_nic_queue_cycles = 10000; + qdev->hnet->homa->qshared->max_nic_est_backlog_cycles = 10000; mock_clock = 1000000 - 10000 + 100; homa_qdisc_pacer(qdev, false); @@ -1623,7 +1623,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__xmit_multiple_packets) homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1300)); EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); qdev->hnet->homa->qshared->homa_share = 40; - qdev->hnet->homa->qshared->max_nic_queue_cycles = 100000; + qdev->hnet->homa->qshared->max_nic_est_backlog_cycles = 100000; homa_qdisc_pacer(qdev, false); EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); @@ -1684,7 +1684,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer_check__enqueue_packet) atomic64_set(&qdev->link_idle_time, 20000); mock_clock = 15000; - self->homa.qshared->max_nic_queue_cycles = 12000; + self->homa.qshared->max_nic_est_backlog_cycles = 12000; homa_qdisc_pacer_check(&self->homa); EXPECT_EQ(1, self->qdiscs[3]->q.qlen); @@ -1707,7 +1707,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer_check__no_deferred_rpcs) atomic64_set(&qdev->link_idle_time, 20000); mock_clock = 15000; - self->homa.qshared->max_nic_queue_cycles = 12000; + self->homa.qshared->max_nic_est_backlog_cycles = 12000; homa_qdisc_pacer_check(&self->homa); EXPECT_EQ(0, self->qdiscs[3]->q.qlen); @@ -1733,7 +1733,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer_check__lag_not_long_enough) atomic64_set(&qdev->link_idle_time, 20000); mock_clock = 13000; - self->homa.qshared->max_nic_queue_cycles = 12000; + self->homa.qshared->max_nic_est_backlog_cycles = 12000; homa_qdisc_pacer_check(&self->homa); EXPECT_EQ(0, self->qdiscs[3]->q.qlen); @@ -1775,12 +1775,12 @@ TEST_F(homa_qdisc, homa_qdev_update_sysctl__cant_get_link_speed_from_dev) homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_update_sysctl_deps__max_nic_queue_cycles) +TEST_F(homa_qdisc, 
homa_qdisc_update_sysctl_deps__max_nic_est_backlog_cycles) { - self->homa.qshared->max_nic_queue_ns = 6000; + self->homa.qshared->max_nic_est_backlog_usecs = 6; self->homa.link_mbps = 10000; homa_qdisc_update_sysctl_deps(self->homa.qshared); - EXPECT_EQ(6000, self->homa.qshared->max_nic_queue_cycles); + EXPECT_EQ(6000, self->homa.qshared->max_nic_est_backlog_cycles); } TEST_F(homa_qdisc, homa_qdisc_update_sysctl_deps__limit_homa_share) { diff --git a/util/cp_config b/util/cp_config index 482d6604..539f2ed4 100755 --- a/util/cp_config +++ b/util/cp_config @@ -19,7 +19,7 @@ parser.add_argument('-c', '--config', dest='config', choices=['balance', 'buffers', 'busy_usecs', 'client_threads', 'dctcp_buffers', 'defer_min_bytes', 'fifo', 'gbps', 'gen2', 'gen3', 'grant_policy', 'gro_busy_usecs', 'load', - 'max_gro', 'max_gso', 'mtu', 'nic_queue', + 'max_gro', 'max_gso', 'mtu', 'nic_backlog', 'poll', 'ports', 'prios', 'receivers', 'repeat', 'tcp_buffers', 'throttle', 'time', 'unsched_bytes'], required = True, @@ -193,12 +193,12 @@ elif options.config == 'mtu': 'label': 'MTU %d' % (length), 'mtu': length }) -elif options.config == 'nic_queue': +elif options.config == 'nic_backlog': # Vary the limit on an NIC queue length for micros in [1, 2, 5, 10, 10000]: specs.append({'exp_name': 'nic_%d' % (micros), 'label': 'nic queue %d us' % (micros), - 'sysctl': ['.net.homa.max_nic_queue_ns', micros*1000] + 'sysctl': ['.net.homa.max_nic_est_backlog_usecs', micros] }) elif options.config == 'poll': # Vary the polling interval diff --git a/util/cp_tcp_config b/util/cp_tcp_config index 1ab427f1..89a5cabd 100755 --- a/util/cp_tcp_config +++ b/util/cp_tcp_config @@ -16,7 +16,7 @@ parser = get_parser(description= 'varies.', usage='%(prog)s [options]') parser.add_argument('-c', '--config', dest='config', - choices=['cports', 'nic_queue', 'sports', 'threads'], + choices=['cports', 'nic_backlog', 'sports', 'threads'], required = True, help='Aspect of configuration to change') parser.add_argument('--dctcp', dest='dctcp', type=boolean, @@ -34,11 +34,11 @@ if options.config == 'cports': specs.append({'exp_name': "cports%d" % (ports), 'label': "%d client ports" % (ports), 'options': ['tcp_client_ports', ports]}) -elif options.config == 'nic_queue': +elif options.config == 'nic_backlog': for usec in [5, 10, 20, 40]: specs.append({'exp_name': "nicq%d" % (usec), 'label': r'NIC queue max %d µsec' % (usec), - 'sysctl': ['.net.homa.max_nic_queue_ns', usec * 1000]}) + 'sysctl': ['.net.homa.max_nic_est_backlog_usecs', usec]}) elif options.config == 'sports': for ports in [6, 9, 12, 15, 18]: specs.append({ 'exp_name': "sports%d" % (ports), From da21990c6a21e8d1ae33ab8353427fd6607e0570 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 5 Jan 2026 11:38:54 -0800 Subject: [PATCH 607/625] Implement max_nic_queue_usecs sysctl variable --- homa_qdisc.c | 123 +++++++++++++++--- homa_qdisc.h | 86 +++++++++++-- test/mock.c | 1 + test/unit_homa_qdisc.c | 276 ++++++++++++++++++++++++++++++++++++----- 4 files changed, 420 insertions(+), 66 deletions(-) diff --git a/homa_qdisc.c b/homa_qdisc.c index 2200748d..ea56365f 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -7,17 +7,64 @@ * - It implements the SRPT policy for Homa traffic (highest priority goes * to the message with the fewest bytes remaining to transmit). * - It manages TCP traffic as well as Homa traffic, so that TCP doesn't - * result in long NIC queues. + * create long NIC queues. * - When queues do build up, it balances output traffic between Homa and TCP. 
 */

+/* PACING:
+ *
+ * Preventing congestion in the NIC is essential for a proper implementation
+ * of SRPT (otherwise a short message could get stuck behind a long message
+ * in the NIC). This file implements a two-part strategy:
+ *
+ * First, it paces output traffic so that packets are passed to the NIC at
+ * a data rate no more than the uplink bandwidth. It implements this by
+ * keeping a variable qdev->link_idle_time, which is an estimate of when
+ * the NIC will have finished transmitting all data that has been passed to
+ * it (assuming transmission at full link speed). If this time gets too far
+ * into the future (determined by the max_nic_est_backlog_usecs sysctl
+ * variable) then Homa stops handing off packets to the NIC until
+ * link_idle_time is no longer too far in the future.
+ *
+ * Unfortunately, this technique is not adequate by itself because NICs
+ * cannot always transmit at full link bandwidth; for example, measurements
+ * of Intel NICs in December 2025 showed NIC output as low as 80% of link
+ * bandwidth even with a large backlog of (mixed-size) output packets. As a
+ * result, with this approach alone NIC queues frequently build up
+ * (measurements showed total NIC backlogs of 5 MB or more under high
+ * network load). If the pacing rate is reduced to a level where the NIC
+ * could always keep up, it would sacrifice link bandwidth in situations
+ * where the NIC can transmit at closer to line rate.
+ *
+ * Thus Homa also uses a second approach, which is based on information
+ * maintained by the dynamic queue limits mechanism (DQL). DQL keeps
+ * counters for each netdev_queue that indicate how many bytes are in the
+ * NIC's possession for each queue (i.e. packets that have been passed
+ * to the NIC but not yet returned after transmission). If the number of
+ * outstanding bytes for any queue exceeds a limit (determined by the
+ * max_nic_queue_usecs sysctl parameter) then the NIC is considered
+ * congested and Homa will stop queuing more packets until the congestion
+ * subsides.
+ *
+ * It might seem that the second approach is sufficient by itself, so the
+ * first approach is not needed. Unfortunately, updates to the DQL counters
+ * don't happen until packets are actually transmitted. This means that a
+ * large burst of packets could pass through the qdisc mechanism before the
+ * DQL counters are updated, resulting in significant queue buildup. The
+ * first technique prevents this from happening.
+ *
+ * There is one additional twist, which is that the rate limits above do
+ * not apply to small packets. The reasons for this are explained in a comment
+ * in homa_qdisc_enqueue.
+ */
+
 #include "homa_impl.h"
 #include "homa_qdisc.h"
 #include "homa_rpc.h"
 #include "timetrace.h"
 
 #include 
-#include 
 
 /* Used to enable sysctl access to configuration parameters related to
 * homa_qdisc.
The @data fields are actually offsets within a struct @@ -33,6 +80,13 @@ static struct ctl_table homa_qdisc_ctl_table[] = { .mode = 0644, .proc_handler = homa_qdisc_dointvec }, + { + .procname = "max_nic_queue_usecs", + .data = OFFSET(max_nic_queue_usecs), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_qdisc_dointvec + }, { .procname = "pacer_fifo_fraction", .data = OFFSET(fifo_fraction), @@ -149,6 +203,7 @@ struct homa_qdisc_shared *homa_qdisc_shared_alloc(void) INIT_LIST_HEAD(&qshared->qdevs); qshared->fifo_fraction = 50; qshared->max_nic_est_backlog_usecs = 5; + qshared->max_nic_queue_usecs = 20; qshared->defer_min_bytes = 1000; qshared->homa_share = 50; qshared->max_link_usage = 99; @@ -369,6 +424,8 @@ void homa_qdisc_destroy(struct Qdisc *qdisc) kfree_skb_reason(__skb_dequeue(&q->deferred_tcp), SKB_DROP_REASON_QDISC_DROP); list_del_init(&q->defer_links); + if (q->qdev->congested_qdisc == q) + q->qdev->congested_qdisc = NULL; spin_unlock_irqrestore(&q->qdev->defer_lock, flags); homa_qdisc_qdev_put(q->qdev); } @@ -391,6 +448,8 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, int pkt_len; int offset; + homa_qdisc_update_congested(q); + /* This function tries to transmit short packets immediately for both * Homa and TCP, even when the NIC queue is long. This is because * (a) we don't want to delay Homa control packets, (b) the pacer's @@ -417,7 +476,8 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, homa_qdisc_update_link_idle(qdev, pkt_len, -1); goto enqueue; } - if (!homa_qdisc_any_deferred(qdev) && + if (!READ_ONCE(qdev->congested_qdisc) && + !homa_qdisc_any_deferred(qdev) && homa_qdisc_update_link_idle(qdev, pkt_len, qshared->max_nic_est_backlog_cycles)) goto enqueue; @@ -444,7 +504,8 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, goto enqueue; } - if (!homa_qdisc_any_deferred(qdev) && + if (!READ_ONCE(qdev->congested_qdisc) && + !homa_qdisc_any_deferred(qdev) && homa_qdisc_update_link_idle(qdev, pkt_len, qshared->max_nic_est_backlog_cycles)) goto enqueue; @@ -619,6 +680,7 @@ int homa_qdisc_xmit_deferred_tcp(struct homa_qdisc_dev *qdev) "0x%x to 0x%x, data bytes %d, seq/ack %u", skb, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); homa_qdisc_schedule_skb(skb, qdisc_from_priv(q)); + homa_qdisc_update_congested(q); return pkt_len; } @@ -699,10 +761,12 @@ int homa_qdisc_xmit_deferred_homa(struct homa_qdisc_dev *qdev) rcu_read_lock_bh(); txq = netdev_get_tx_queue(skb->dev, skb_get_queue_mapping(skb)); qdisc = rcu_dereference_bh(txq->qdisc); - if (qdisc->ops == &homa_qdisc_ops) + if (qdisc->ops == &homa_qdisc_ops) { homa_qdisc_schedule_skb(skb, qdisc); - else + homa_qdisc_update_congested(qdisc_priv(qdisc)); + } else { kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP); + } rcu_read_unlock_bh(); return pkt_len; } @@ -828,19 +892,21 @@ int homa_qdisc_pacer_main(void *device) * well, to increase the likelihood that we keep the link busy. Those other * invocations are not guaranteed to happen, so the pacer thread provides a * backstop. - * @qdev: The device on which to transmit. - * @help: True means this function was invoked from homa_qdisc_pacer_check - * rather than homa_qdisc_pacer_main (indicating that the pacer - * thread wasn't keeping up and needs help). + * @qdev: The device on which to transmit. + * @dont_spin: If true, then return immediately if the NIC is congested, + * rather than spinning until congestion drops. 
If this value + * is false, then the caller must not be running at SoftIRQ + * level, and it must not have acquired a lock that disables + * BH processing (otherwise this function can self-deadlock). */ -void homa_qdisc_pacer(struct homa_qdisc_dev *qdev, bool help) +void homa_qdisc_pacer(struct homa_qdisc_dev *qdev, bool dont_spin) { - int i, xmit_bytes; + int i, xmit_bytes, max_cycles; /* Make sure only one instance of this function executes at a * time. */ - if (!spin_trylock_bh(&qdev->pacer_mutex)) + if (!spin_trylock(&qdev->pacer_mutex)) return; /* Each iteration through the following loop sends one packet. We @@ -848,23 +914,37 @@ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev, bool help) * time spent in one call to this function (see note in * homa_qdisc_pacer_main about interfering with softirq handlers). */ + max_cycles = qdev->hnet->homa->qshared->max_nic_est_backlog_cycles; for (i = 0; i < 5; i++) { u64 idle_time, now; - /* If the NIC queue is too long, wait until it gets shorter. */ + /* If the NIC is congested, wait for the congestion to + * subside. + */ now = homa_clock(); idle_time = atomic64_read(&qdev->link_idle_time); - while ((now + - qdev->hnet->homa->qshared->max_nic_est_backlog_cycles) < - idle_time) { + while (1) { + struct homa_qdisc *congested; + + congested = READ_ONCE(qdev->congested_qdisc); + if (congested && + homa_qdisc_bytes_pending(congested) + <= qdev->max_nic_queue_bytes) { + WRITE_ONCE(qdev->congested_qdisc, NULL); + congested = NULL; + } + if (!congested && (now + max_cycles) >= idle_time) + break; + /* If we've xmitted at least one packet then * return (this helps with testing and also * allows homa_qdisc_pacer_main to yield the core). */ - if (i != 0) + if (i != 0 || dont_spin) goto done; cpu_relax(); now = homa_clock(); + UNIT_HOOK("pacer spin"); } /* Note: when we get here, it's possible that the NIC queue is @@ -904,12 +984,12 @@ void homa_qdisc_pacer(struct homa_qdisc_dev *qdev, bool help) qdev->hnet->homa->qshared->homa_share; } } - if (help) + if (dont_spin) INC_METRIC(pacer_help_bytes, xmit_bytes); INC_METRIC(pacer_xmit_cycles, homa_clock() - now); } done: - spin_unlock_bh(&qdev->pacer_mutex); + spin_unlock(&qdev->pacer_mutex); } /** @@ -1017,6 +1097,9 @@ void homa_qdev_update_sysctl(struct homa_qdisc_dev *qdev) tmp2 = 10ULL * homa->qshared->max_link_usage * qdev->link_mbps; do_div(tmp, tmp2); qdev->cycles_per_mibyte = tmp; + + qdev->max_nic_queue_bytes = (homa->qshared->max_nic_queue_usecs * + qdev->link_mbps) >> 3; } /** diff --git a/homa_qdisc.h b/homa_qdisc.h index b548234d..6ae60c51 100644 --- a/homa_qdisc.h +++ b/homa_qdisc.h @@ -16,13 +16,14 @@ #endif /* __UNIT_TEST__*/ #include +#include #ifndef _HOMA_QDISC_H #define _HOMA_QDISC_H /** * struct homa_qdisc - Contains Homa-specific data for a single instance of - * the homa queuing discipline + * the homa queuing discipline. */ struct homa_qdisc { /** @sch: The Qdisc that this structure is associated with. */ @@ -94,7 +95,9 @@ struct homa_qdisc_dev { /** * @link_idle_time: The time, measured by homa_clock, at which we * estimate that all of the packets passed to @dev will have been - * transmitted. May be in the past. + * transmitted, assuming the NIC can transmit at full link speed. + * May be in the past. See the PACING comment at the top of + * homa_qdisc.c for a discussion of the pacing mechanism. 
*/ atomic64_t link_idle_time __aligned(L1_CACHE_BYTES); @@ -124,6 +127,29 @@ struct homa_qdisc_dev { */ u64 last_defer; + /** + * @max_nic_queue_bytes: The number of bytes corresponding to + * qdev->max_nic_queue_usecs. + */ + int max_nic_queue_bytes; + + /** + * @congested_qdisc: If non-NULL, this variable identifies a qdisc + * whose NIC queue is overloaded according to @max_nic_queue_bytes. + * NULL means no queue is currently known to be congested. This + * variable is accessed without synchronization. See the PACING comment + * at the top of homa_qdisc.c for a discussion of the packet pacing + * architecture. + */ + struct homa_qdisc *congested_qdisc; + + /** + * @defer_lock: Synchronizes access to information about deferred + * packets, including deferred_rpcs, deferred_qdiscs, next_qdisc, + * last_defer, and some information in homa_qdiscs. + */ + spinlock_t defer_lock; + /** * @homa_credit: When there are both Homa and TCP deferred packets, * this is used to balance output between them according to the @@ -136,13 +162,6 @@ struct homa_qdisc_dev { */ int homa_credit; - /** - * @defer_lock: Synchronizes access to information about deferred - * packets, including deferred_rpcs, deferred_qdiscs, next_qdisc, - * last_defer, and some information in homa_qdiscs. - */ - spinlock_t defer_lock; - /** * @pacer_kthread: Kernel thread that eventually transmits packets * on homa_deferred and tcp_deferred. @@ -158,9 +177,11 @@ struct homa_qdisc_dev { /** * @pacer_mutex: Ensures that only one instance of * homa_qdisc_pacer runs at a time. Only used in "try" mode: - * never block on this. + * never block on this. Note: must not disable bh when acquiring + * this lock, because the pacer may wait for bh activity to + * complete. */ - spinlock_t pacer_mutex __aligned(L1_CACHE_BYTES); + spinlock_t pacer_mutex ____cacheline_aligned_in_smp; /** * @rcu_head: Holds state of a pending call_rcu invocation when @@ -209,6 +230,15 @@ struct homa_qdisc_shared { */ int max_nic_est_backlog_cycles; + /** + * @max_nic_queue_usecs: An additional limit on NIC queue buildup: + * if any individual NIC queue reaches a length where it would + * take at least this many microseconds to transmit all of its packets, + * then no more packets will be queued for *any* NIC queue until + * the queue gets below this limit. Set externally via sysctl. + */ + int max_nic_queue_usecs; + /** * @defer_min_bytes: If a packet has fewer bytes than this, then it * will be transmitted immediately, regardless of NIC queue length. @@ -274,7 +304,7 @@ int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack); void homa_qdisc_insert_rb(struct homa_qdisc_dev *qdev, struct homa_rpc *rpc); -void homa_qdisc_pacer(struct homa_qdisc_dev *qdev, bool help); +void homa_qdisc_pacer(struct homa_qdisc_dev *qdev, bool dont_spin); void homa_qdisc_pacer_check(struct homa *homa); int homa_qdisc_pacer_main(void *device); struct homa_qdisc_dev * @@ -346,7 +376,8 @@ static inline void homa_qdisc_schedule_skb(struct sk_buff *skb, * @rpc2 (i.e. higher priority) for the purposes of qdev->deferred_rpcs, or * false if @rpc1 is considered "greater" (ties not allowed). * @rpc1: RPC to compare - * @rpc2: RPC to compare; must be different from rpc1.
+ * @rpc2: RPC to compare; must be different from rpc1 + * Return: See above */ static inline bool homa_qdisc_precedes(struct homa_rpc *rpc1, struct homa_rpc *rpc2) @@ -367,4 +398,33 @@ static inline bool homa_qdisc_precedes(struct homa_rpc *rpc1, return rpc1 < rpc2; } +/** + * homa_qdisc_bytes_pending() - Return the total number of bytes in skbs + * that have been enqueued in the NIC for transmission via a given queue + * but have not yet been returned after transmission. + * @q: Return the pending bytes for the dev_queue associated with + * this qdisc. + * Return: See above + */ +static inline int homa_qdisc_bytes_pending(struct homa_qdisc *q) +{ + /* Ideally this function would be provided by dynamic_queue_limits.h + * so that we don't have to root around in its data structures. + */ + struct dql *dql = &qdisc_from_priv(q)->dev_queue->dql; + + return READ_ONCE(dql->num_queued) - READ_ONCE(dql->num_completed); +} + +/** + * homa_qdisc_update_congested() - If the NIC queue for a qdisc has + * become too long, record the fact that this qdisc is congested. + * @q: qdisc whose netdev_queue should be checked. + */ +static inline void homa_qdisc_update_congested(struct homa_qdisc *q) +{ + if (homa_qdisc_bytes_pending(q) > q->qdev->max_nic_queue_bytes) + WRITE_ONCE(q->qdev->congested_qdisc, q); +} + #endif /* _HOMA_QDISC_H */ diff --git a/test/mock.c b/test/mock.c index 0566e44c..4bcf21a4 100644 --- a/test/mock.c +++ b/test/mock.c @@ -2447,6 +2447,7 @@ void mock_teardown(void) memset(mock_devices, 0, sizeof(mock_devices)); mock_peer_free_no_fail = 0; mock_link_mbps = 10000; + memset(&mock_net_queue, 0, sizeof(mock_net_queue)); mock_queue_index = 0; mock_netif_schedule_calls = 0; memset(inet_offloads, 0, sizeof(inet_offloads)); diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c index 753b4a8b..2b6efba6 100644 --- a/test/unit_homa_qdisc.c +++ b/test/unit_homa_qdisc.c @@ -111,6 +111,14 @@ static void xmit_hook(char *id) xmit_clock = mock_clock; } +static struct Qdisc *hook_qdisc; +static void complete_qdisc_hook(char *id) +{ + if (strcmp(id, "pacer spin") != 0) + return; + hook_qdisc->dev_queue->dql.num_completed += 1; +} + FIXTURE(homa_qdisc) { struct homa homa; struct homa_net *hnet; @@ -446,17 +454,46 @@ TEST_F(homa_qdisc, homa_qdisc_destroy) homa_qdisc_defer_tcp(q2, mock_tcp_skb(&self->addr, 6000, 1100)); homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 7000, 1100)); EXPECT_EQ(2, unit_list_length(&qdev->deferred_qdiscs)); + qdev->congested_qdisc = q2; homa_qdisc_destroy(qdisc); EXPECT_EQ(1, refcount_read(&qdev->refs)); EXPECT_EQ(1, unit_list_length(&qdev->deferred_qdiscs)); + EXPECT_EQ(q2, qdev->congested_qdisc); homa_qdisc_destroy(qdisc2); EXPECT_EQ(0, unit_list_length(&self->homa.qshared->qdevs)); + EXPECT_EQ(NULL, qdev->congested_qdisc); kfree(qdisc); kfree(qdisc2); } +TEST_F(homa_qdisc, homa_qdisc_enqueue__mark_congested_queue) +{ + struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct sk_buff *skb, *to_free; + struct homa_rpc *srpc; + struct homa_qdisc *q; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 100, 200); + ASSERT_NE(NULL, srpc); + + EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); + q = qdisc_priv(qdisc); + mock_net_queue.dql.num_queued = q->qdev->max_nic_queue_bytes + 1; + skb = new_test_skb(srpc, &self->addr, 0, 200); + to_free = NULL; + unit_log_clear(); + + EXPECT_EQ(NULL, q->qdev->congested_qdisc); + EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, qdisc, &to_free));
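+ /* dql.num_queued exceeds max_nic_queue_bytes, so the enqueue above + * should have recorded this qdisc as the congested one. + */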
+ EXPECT_EQ(q, q->qdev->congested_qdisc); + + homa_qdisc_destroy(qdisc); + kfree(qdisc); +} TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_short_tcp_packet) { struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); @@ -527,7 +564,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__xmit_short_tcp_packet) homa_qdisc_destroy(qdisc); kfree(qdisc); } -TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_tcp_packet_because_of_homa_deferred) +TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_tcp_packet_because_of_congested_qdisc) { struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); struct sk_buff *skb, *to_free; @@ -541,22 +578,73 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_tcp_packet_because_of_homa_deferred EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); q = qdisc_priv(qdisc); - atomic64_set(&q->qdev->link_idle_time, 1000000); q->ix = 3; mock_queue_index = 3; - /* First packet is Homa, gets deferred because of link_idle_time. */ - skb = new_test_skb(srpc, &self->addr, 0, 1500); + skb = mock_tcp_skb(&self->addr, 6000, 1500); to_free = NULL; + q->qdev->congested_qdisc = q; homa_qdisc_enqueue(skb, qdisc, &to_free); EXPECT_EQ(NULL, to_free); - EXPECT_TRUE(homa_qdisc_any_deferred(q->qdev)); - EXPECT_EQ(1000000, atomic64_read(&q->qdev->link_idle_time)); + EXPECT_EQ(1, skb_queue_len(&q->deferred_tcp)); + + homa_qdisc_destroy(qdisc); + kfree(qdisc); +} +TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_tcp_packet_because_of_other_deferred_packets) +{ + struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct sk_buff *skb, *to_free; + struct homa_rpc *srpc; + struct homa_qdisc *q; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 100, 7100); + ASSERT_NE(NULL, srpc); + + EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); + q = qdisc_priv(qdisc); + q->ix = 3; + mock_queue_index = 3; + + /* First packet gets deferred because of congested qdisc. */ + skb = mock_tcp_skb(&self->addr, 6000, 1500); + to_free = NULL; + q->qdev->congested_qdisc = q; + homa_qdisc_enqueue(skb, qdisc, &to_free); + EXPECT_EQ(NULL, to_free); + EXPECT_EQ(1, skb_queue_len(&q->deferred_tcp)); + + /* Second packet gets deferred because the first packet was deferred. */ + skb = mock_tcp_skb(&self->addr, 6000, 1500); + to_free = NULL; + q->qdev->congested_qdisc = NULL; + homa_qdisc_enqueue(skb, qdisc, &to_free); + EXPECT_EQ(NULL, to_free); + EXPECT_EQ(2, skb_queue_len(&q->deferred_tcp)); + + homa_qdisc_destroy(qdisc); + kfree(qdisc); +} +TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_tcp_packet_because_of_link_idle_time) +{ + struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct sk_buff *skb, *to_free; + struct homa_rpc *srpc; + struct homa_qdisc *q; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 100, 7100); + ASSERT_NE(NULL, srpc); + + EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); + q = qdisc_priv(qdisc); + q->ix = 3; + mock_queue_index = 3; + atomic64_set(&q->qdev->link_idle_time, 1000000); - /* Second packet is TCP, gets deferred because of deferred Homa - * packet. 
- */ - mock_clock = 1000000; skb = mock_tcp_skb(&self->addr, 6000, 1500); to_free = NULL; homa_qdisc_enqueue(skb, qdisc, &to_free); @@ -567,7 +655,7 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_tcp_packet_because_of_homa_deferred homa_qdisc_destroy(qdisc); kfree(qdisc); } -TEST_F(homa_qdisc, homa_qdisc_enqueue__short_message) +TEST_F(homa_qdisc, homa_qdisc_enqueue__short_homa_message) { struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); struct sk_buff *skb, *to_free; @@ -627,43 +715,65 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__short_final_packet_in_long_message) homa_qdisc_destroy(qdisc); kfree(qdisc); } -TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_homa_packet) +TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_homa_packet_congested_qdisc) { struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); struct sk_buff *skb, *to_free; struct homa_rpc *srpc; struct homa_qdisc *q; - u64 idle; srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 100, 7100); ASSERT_NE(NULL, srpc); - /* First packet is deferred because the NIC queue is full. */ EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); q = qdisc_priv(qdisc); - idle = mock_clock + 1 + self->homa.qshared->max_nic_est_backlog_cycles + 1; - atomic64_set(&q->qdev->link_idle_time, idle); skb = new_test_skb(srpc, &self->addr, 0, 1500); to_free = NULL; unit_log_clear(); mock_log_wakeups = 1; + q->qdev->congested_qdisc = q; EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, qdisc, &to_free)); EXPECT_EQ(NULL, to_free); EXPECT_TRUE(homa_qdisc_any_deferred(q->qdev)); - EXPECT_STREQ("wake_up", unit_log_get()); + log_deferred(q->qdev); + EXPECT_STREQ("wake_up; [id 1235, offsets 0]", unit_log_get()); - /* Second packet is deferred even though NIC not busy, because - * there are other packets waiting. - */ - atomic64_set(&q->qdev->link_idle_time, 0); - self->data.common.sender_id = cpu_to_be64(101); - skb = new_test_skb(srpc, &self->addr, 1500, 1500); + homa_qdisc_destroy(qdisc); + kfree(qdisc); +} +TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_homa_packet_other_packets_deferred) +{ + struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct sk_buff *skb, *to_free; + struct homa_rpc *srpc; + struct homa_qdisc *q; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 100, 7100); + ASSERT_NE(NULL, srpc); + + EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); + q = qdisc_priv(qdisc); + + /* First packet is deferred because of congested qdisc. */ + skb = new_test_skb(srpc, &self->addr, 0, 1500); to_free = NULL; + unit_log_clear(); + mock_log_wakeups = 1; + q->qdev->congested_qdisc = q; + EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, qdisc, &to_free)); + EXPECT_EQ(NULL, to_free); + EXPECT_TRUE(homa_qdisc_any_deferred(q->qdev)); + /* Second packet is deferred because first packet was deferred. 
*/ + skb = new_test_skb(srpc, &self->addr, 1500, 1500); + to_free = NULL; unit_log_clear(); + q->qdev->congested_qdisc = NULL; EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, qdisc, &to_free)); EXPECT_EQ(NULL, to_free); log_deferred(q->qdev); @@ -672,6 +782,36 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_homa_packet) homa_qdisc_destroy(qdisc); kfree(qdisc); } +TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_homa_packet_nic_idle_time) +{ + struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct sk_buff *skb, *to_free; + struct homa_rpc *srpc; + struct homa_qdisc *q; + u64 idle; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 100, 7100); + ASSERT_NE(NULL, srpc); + + EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); + q = qdisc_priv(qdisc); + idle = mock_clock + 1 + self->homa.qshared->max_nic_est_backlog_cycles + 1; + atomic64_set(&q->qdev->link_idle_time, idle); + skb = new_test_skb(srpc, &self->addr, 0, 1500); + to_free = NULL; + unit_log_clear(); + mock_log_wakeups = 1; + + EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, qdisc, &to_free)); + EXPECT_EQ(NULL, to_free); + EXPECT_TRUE(homa_qdisc_any_deferred(q->qdev)); + EXPECT_STREQ("wake_up", unit_log_get()); + + homa_qdisc_destroy(qdisc); + kfree(qdisc); +} TEST_F(homa_qdisc, homa_qdisc_enqueue__drop_packet_queue_over_limit) { struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); @@ -1003,6 +1143,8 @@ TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__basics) q = qdisc_priv(self->qdiscs[2]); q->ix = 2; mock_queue_index = 2; + self->qdiscs[2]->dev_queue->dql.num_queued = + q->qdev->max_nic_queue_bytes + 1; homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 5000, 1000)); atomic64_set(&q->qdev->link_idle_time, 20000); @@ -1010,6 +1152,7 @@ TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__basics) EXPECT_EQ(1, self->qdiscs[2]->q.qlen); EXPECT_EQ(0, unit_list_length(&q->qdev->deferred_qdiscs)); EXPECT_LT(20000, atomic64_read(&q->qdev->link_idle_time)); + EXPECT_EQ(q, q->qdev->congested_qdisc); } TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__no_deferred_packets) { @@ -1219,10 +1362,12 @@ TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_homa__packet_available) { struct homa_qdisc_dev *qdev; struct homa_rpc *srpc; + struct homa_qdisc *q; u64 link_idle; EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + q = qdisc_priv(self->qdiscs[3]); mock_clock = 10000; mock_queue_index = 3; qdev = homa_qdisc_qdev_get(self->dev); @@ -1232,6 +1377,8 @@ TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_homa__packet_available) ASSERT_NE(NULL, srpc); link_idle = atomic64_read(&qdev->link_idle_time); + self->qdiscs[3]->dev_queue->dql.num_queued = qdev->max_nic_queue_bytes + + 1; homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); @@ -1240,6 +1387,7 @@ TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_homa__packet_available) EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); EXPECT_EQ(1, self->qdiscs[3]->q.qlen); EXPECT_LT(link_idle, atomic64_read(&qdev->link_idle_time)); + EXPECT_EQ(q, qdev->congested_qdisc); homa_qdisc_qdev_put(qdev); } @@ -1395,6 +1543,64 @@ TEST_F(homa_qdisc, homa_qdisc_pacer_main) homa_qdisc_qdev_put(qdev); } +TEST_F(homa_qdisc, homa_qdisc_pacer__pacer_lock_unavailable) +{ + struct homa_qdisc_dev *qdev; + u64 link_idle; + struct homa_rpc *srpc; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + 
&self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc); + + link_idle = atomic64_read(&qdev->link_idle_time); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); + EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + unit_log_clear(); + + mock_spin_lock_held = 1; + homa_qdisc_pacer(qdev, false); + EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + EXPECT_EQ(link_idle, atomic64_read(&qdev->link_idle_time)); + + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_pacer__spin_until_qdisc_no_longer_congested) +{ + struct homa_qdisc_dev *qdev; + struct homa_rpc *srpc; + struct homa_qdisc *q; + + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + mock_queue_index = 3; + q = qdisc_priv(self->qdiscs[3]); + qdev = homa_qdisc_qdev_get(self->dev); + qdev->congested_qdisc = q; + self->qdiscs[3]->dev_queue->dql.num_queued = qdev->max_nic_queue_bytes + + 10; + + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc); + + homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); + hook_qdisc = self->qdiscs[3]; + unit_hook_register(complete_qdisc_hook); + + homa_qdisc_pacer(qdev, false); + EXPECT_FALSE(homa_qdisc_any_deferred(qdev)); + EXPECT_EQ(NULL, qdev->congested_qdisc); + EXPECT_EQ(10, hook_qdisc->dev_queue->dql.num_completed); + + homa_qdisc_qdev_put(qdev); +} TEST_F(homa_qdisc, homa_qdisc_pacer__spin_until_link_idle) { struct homa_qdisc_dev *qdev; @@ -1469,30 +1675,32 @@ TEST_F(homa_qdisc, homa_qdisc_pacer__return_after_one_packet) homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdisc_pacer__pacer_lock_unavailable) +TEST_F(homa_qdisc, homa_qdisc_pacer__dont_spin) { struct homa_qdisc_dev *qdev; - u64 link_idle; struct homa_rpc *srpc; + EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); + EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + mock_queue_index = 3; qdev = homa_qdisc_qdev_get(self->dev); + srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 10000, 10000); ASSERT_NE(NULL, srpc); - link_idle = atomic64_read(&qdev->link_idle_time); homa_qdisc_defer_homa(qdev, new_test_skb(srpc, &self->addr, 0, 1000)); - EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); - EXPECT_EQ(0, homa_qdisc_init(self->qdiscs[3], NULL, NULL)); - EXPECT_EQ(0, self->qdiscs[3]->q.qlen); + + mock_clock = 0; + mock_clock_tick = 1000; + atomic64_set(&qdev->link_idle_time, 10000); + self->homa.qshared->max_nic_est_backlog_cycles = 3500; unit_log_clear(); - mock_trylock_errors = 1; - homa_qdisc_pacer(qdev, false); + homa_qdisc_pacer(qdev, true); EXPECT_TRUE(homa_qdisc_any_deferred(qdev)); EXPECT_EQ(0, self->qdiscs[3]->q.qlen); - EXPECT_EQ(link_idle, atomic64_read(&qdev->link_idle_time)); homa_qdisc_qdev_put(qdev); } @@ -1742,7 +1950,7 @@ TEST_F(homa_qdisc, homa_qdisc_pacer_check__lag_not_long_enough) homa_qdisc_qdev_put(qdev); } -TEST_F(homa_qdisc, homa_qdevc_update_sysctl__basics) +TEST_F(homa_qdisc, homa_qdev_update_sysctl__basics) { struct homa_qdisc_dev *qdev; @@ -1752,9 +1960,11 @@ TEST_F(homa_qdisc, homa_qdevc_update_sysctl__basics) self->homa.link_mbps = 25000; mock_link_mbps = 8000; self->homa.qshared->max_link_usage = 90; + self->homa.qshared->max_nic_queue_usecs = 50; 
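+ /* With link_mbps = 8000, 50 usecs of queue corresponds to + * 50 * 8000 / 8 = 50000 bytes (see homa_qdev_update_sysctl). + */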
homa_qdev_update_sysctl(qdev); EXPECT_EQ(8000, qdev->link_mbps); EXPECT_EQ(1165084, qdev->cycles_per_mibyte); + EXPECT_EQ(50000, qdev->max_nic_queue_bytes); homa_qdisc_qdev_put(qdev); } From 8402da90b7d38efd19b5e48703524610baaa7c1e Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 5 Jan 2026 14:14:33 -0800 Subject: [PATCH 608/625] Add max_nic_queue_usecs tests to cp_config and cp_tcp_config --- util/cp_config | 12 +++++++++--- util/cp_tcp_config | 15 +++++++++++---- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/util/cp_config b/util/cp_config index 539f2ed4..0d56ca88 100755 --- a/util/cp_config +++ b/util/cp_config @@ -1,6 +1,6 @@ #!/usr/bin/python3 -# Copyright (c) 2020-2023 Homa Developers +# Copyright (c) 2020-2026 Homa Developers # SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This cperf benchmark measures Homa slowdown while varying one or more @@ -19,7 +19,7 @@ parser.add_argument('-c', '--config', dest='config', choices=['balance', 'buffers', 'busy_usecs', 'client_threads', 'dctcp_buffers', 'defer_min_bytes', 'fifo', 'gbps', 'gen2', 'gen3', 'grant_policy', 'gro_busy_usecs', 'load', - 'max_gro', 'max_gso', 'mtu', 'nic_backlog', + 'max_gro', 'max_gso', 'max_nic_queue', 'mtu', 'nic_backlog', 'poll', 'ports', 'prios', 'receivers', 'repeat', 'tcp_buffers', 'throttle', 'time', 'unsched_bytes'], required = True, @@ -186,6 +186,12 @@ elif options.config == 'max_gso': 'label': 'max_gso_size %d' % (count), 'sysctl': ['.net.homa.max_gso_size', count] }) +elif options.config == 'max_nic_queue': + # Vary the limit on length of any individual NIC queue + for usecs in [5, 10, 20, 40]: + specs.append({'exp_name': 'nicq_%d' % (usecs), + 'label': 'max_nic_queue_usecs %d' % (usecs), + 'sysctl': ['.net.homa.max_nic_queue_usecs', usecs]}) elif options.config == 'mtu': # Vary the maximum packet size for length in [1500, 3000, 5000, 7000, 9000]: @@ -195,7 +201,7 @@ elif options.config == 'mtu': }) elif options.config == 'nic_backlog': # Vary the limit on an NIC queue length - for micros in [1, 2, 5, 10, 10000]: + for micros in [5, 10, 20, 100, 10000]: specs.append({'exp_name': 'nic_%d' % (micros), 'label': 'nic queue %d us' % (micros), 'sysctl': ['.net.homa.max_nic_est_backlog_usecs', micros] diff --git a/util/cp_tcp_config b/util/cp_tcp_config index 89a5cabd..93db1517 100755 --- a/util/cp_tcp_config +++ b/util/cp_tcp_config @@ -1,6 +1,6 @@ #!/usr/bin/python3 -# Copyright (c) 2020-2022 Homa Developers +# Copyright (c) 2020-2026 Homa Developers # SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ # This cperf benchmark measures TCP and DCTCP while varying one or more @@ -12,11 +12,12 @@ from cperf import * load_info = [["w2", 3.2], ["w3", 14], ["w4", 20], ["w5", 20]] parser = get_parser(description= - 'Measures Homa slowdown as the number of available priority levels ' - 'varies.', + 'Measures TCP performance as a function of various system ' + 'configuration parameters.', usage='%(prog)s [options]') parser.add_argument('-c', '--config', dest='config', - choices=['cports', 'nic_backlog', 'sports', 'threads'], + choices=['cports', 'max_nic_queue', 'nic_backlog', 'sports', + 'threads'], required = True, help='Aspect of configuration to change') parser.add_argument('--dctcp', dest='dctcp', type=boolean, @@ -34,6 +35,12 @@ if options.config == 'cports': specs.append({'exp_name': "cports%d" % (ports), 'label': "%d client ports" % (ports), 'options': ['tcp_client_ports', ports]}) +elif options.config == 'max_nic_queue': + # Vary the limit on length of any individual NIC queue + for 
usecs in [10, 20, 40, 80]: + specs.append({'exp_name': 'nicq_%d' % (usecs), + 'label': 'max_nic_queue_usecs %d' % (usecs), + 'sysctl': ['.net.homa.max_nic_queue_usecs', usecs]}) elif options.config == 'nic_backlog': for usec in [5, 10, 20, 40]: specs.append({'exp_name': "nicq%d" % (usec), From 2d55f99f0e05ac2889f8d55e7adc96c5ec3c543f Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Wed, 7 Jan 2026 13:49:27 -0800 Subject: [PATCH 609/625] Rework handling of short TCP packets in homa_qdisc The previous approach (sorting deferred queue by packet length) didn't help much (a short packet for one flow could get stuck behind an even shorter packet for another flow, which was stuck behind a long packet for that flow). The new approach bypasses the queue entirely for short packets if there are no conflicts, otherwise queues in FIFO order. This provides a significant performance benefit (95% of short packets can bypass the queue). --- homa_qdisc.c | 106 +++++++++-- homa_qdisc.h | 6 +- test/mock.c | 2 + test/unit_homa_qdisc.c | 399 ++++++++++++++++++++++++----------------- 4 files changed, 328 insertions(+), 185 deletions(-) diff --git a/homa_qdisc.c b/homa_qdisc.c index ea56365f..780d5ad3 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -391,7 +391,7 @@ int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt, if (IS_ERR(qdev)) return PTR_ERR(qdev); - q->sch = sch; + q->qdisc = sch; q->qdev = qdev; q->ix = -1; for (i = 0; i < qdev->dev->num_tx_queues; i++) { @@ -451,31 +451,30 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, homa_qdisc_update_congested(q); /* This function tries to transmit short packets immediately for both - * Homa and TCP, even when the NIC queue is long. This is because - * (a) we don't want to delay Homa control packets, (b) the pacer's - * single thread doesn't have enough throughput to handle all the short - * packets at high load (whereas processing here happens concurrently - * on multiple cores), and (c) there is no way to generate enough - * short packets to cause NIC queue buildup, so bypassing the pacer - * won't impact the SRPT mechanism significantly. + * Homa and TCP, even when the NIC queue is long. We do this because + * (a) it reduces tail latency significantly for short packets, + * (b) there is no way to generate enough short packets to cause NIC + * queue buildup, and (c) the pacer's single thread doesn't have + * enough throughput to handle all the short packets at high load + * (whereas processing here happens concurrently on multiple cores). */ qshared = qdev->hnet->homa->qshared; pkt_len = qdisc_pkt_len(skb); if (!is_homa_pkt(skb)) { /* This is a TCP packet (or something else other than Homa). - * In order to maintain the order of packets within a stream - * we must defer short packets if there are other packets - * already deferred for this qdisc. + * Defer short TCP packets only if they are in the same flow + * as a previously deferred packet for this qdisc. 
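+ * (For example, a pure ACK can always bypass the deferred queue, but a + * short data segment whose flow already has a deferred segment must + * wait behind it to preserve ordering; see homa_qdisc_can_bypass.)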
*/ INC_METRIC(qdisc_tcp_packets, 1); - if (!skb_queue_empty(&q->deferred_tcp)) { + if (pkt_len < qshared->defer_min_bytes) { + if (skb_queue_empty(&q->deferred_tcp) || + homa_qdisc_can_bypass(skb, q)) { + homa_qdisc_update_link_idle(qdev, pkt_len, -1); + goto enqueue; + } homa_qdisc_defer_tcp(q, skb); return NET_XMIT_SUCCESS; } - if (pkt_len < qshared->defer_min_bytes) { - homa_qdisc_update_link_idle(qdev, pkt_len, -1); - goto enqueue; - } if (!READ_ONCE(qdev->congested_qdisc) && !homa_qdisc_any_deferred(qdev) && homa_qdisc_update_link_idle(qdev, pkt_len, @@ -535,6 +534,81 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, return qdisc_enqueue_tail(skb, sch); } +/** + * homa_qdisc_can_bypass() - Determine whether it is OK to transmit a given + * TCP packet before those already deferred for a qdisc. + * @skb: New packet + * @q: Qdisc with deferred TCP packets + * Return: True if @skb can be transmitted before the packets already + * deferred for @q without violating reordering rules. + */ +bool homa_qdisc_can_bypass(struct sk_buff *skb, struct homa_qdisc *q) +{ + struct sk_buff *skb2; + __be32 daddr, daddr2; + __be16 source, dest; + bool result; + int element; + + /* Collect information from skb. If it isn't a TCP packet then + * reordering constraints are unknown so deny reordering. + */ + if (skb->protocol == htons(ETH_P_IP)) { + if (ip_hdr(skb)->protocol != IPPROTO_TCP) + return false; + daddr = ip_hdr(skb)->daddr; + } else if (skb->protocol == htons(ETH_P_IPV6)) { + if (ipv6_hdr(skb)->nexthdr != IPPROTO_TCP) + return false; + daddr = ipv6_hdr(skb)->daddr.in6_u.u6_addr32[0] ^ + ipv6_hdr(skb)->daddr.in6_u.u6_addr32[1] ^ + ipv6_hdr(skb)->daddr.in6_u.u6_addr32[2] ^ + ipv6_hdr(skb)->daddr.in6_u.u6_addr32[3]; + } else { + return false; + } + + /* If skb is an ack (i.e. no payload) then reordering is fine. */ + if ((skb->len - skb_transport_offset(skb) - tcp_hdrlen(skb)) == 0) + return true; + + /* If any packets in the list are TCP packets on the same flow + * then deny reordering. The flow check is overconservative, in that + * it may sometimes deny even when the flows aren't the same. + */ + source = tcp_hdr(skb)->source; + dest = tcp_hdr(skb)->dest; + element = 0; + result = true; + spin_lock_bh(&q->qdev->defer_lock); + skb_queue_walk(&q->deferred_tcp, skb2) { + element++; + if (skb2->protocol == htons(ETH_P_IP)) { + if (ip_hdr(skb2)->protocol != IPPROTO_TCP) + continue; + daddr2 = ip_hdr(skb2)->daddr; + } else if (skb2->protocol == htons(ETH_P_IPV6)) { + if (ipv6_hdr(skb2)->nexthdr != IPPROTO_TCP) + continue; + daddr2 = ipv6_hdr(skb2)->daddr.in6_u.u6_addr32[0] ^ + ipv6_hdr(skb2)->daddr.in6_u.u6_addr32[1] ^ + ipv6_hdr(skb2)->daddr.in6_u.u6_addr32[2] ^ + ipv6_hdr(skb2)->daddr.in6_u.u6_addr32[3]; + + } else { + continue; + } + + if (daddr == daddr2 && dest == tcp_hdr(skb2)->dest && + source == tcp_hdr(skb2)->source) { + result = false; + break; + } + } + spin_unlock_bh(&q->qdev->defer_lock); + return result; +} + /** * homa_qdisc_defer_tcp() - Add a non-Homa packet to the deferred list for * a qdisc. diff --git a/homa_qdisc.h b/homa_qdisc.h index 6ae60c51..b952f3a1 100644 --- a/homa_qdisc.h +++ b/homa_qdisc.h @@ -26,8 +26,8 @@ * the homa queuing discipline. */ struct homa_qdisc { - /** @sch: The Qdisc that this structure is associated with. */ - struct Qdisc *sch; + /** @qdisc: The Qdisc that this structure is associated with. */ + struct Qdisc *qdisc; /** @qdev: Info shared among all qdiscs for a net_device.
*/ struct homa_qdisc_dev *qdev; @@ -289,6 +289,8 @@ struct homa_rcu_kfreer { }; void homa_qdev_update_sysctl(struct homa_qdisc_dev *qdev); +bool homa_qdisc_can_bypass(struct sk_buff *skb, + struct homa_qdisc *q); void homa_qdisc_defer_homa(struct homa_qdisc_dev *qdev, struct sk_buff *skb); void homa_qdisc_defer_tcp(struct homa_qdisc *q, struct sk_buff *skb); diff --git a/test/mock.c b/test/mock.c index 4bcf21a4..9a0e3e15 100644 --- a/test/mock.c +++ b/test/mock.c @@ -2022,11 +2022,13 @@ struct sk_buff *mock_raw_skb(struct in6_addr *saddr, int protocol, int length) ipv6_hdr(skb)->version = 6; ipv6_hdr(skb)->saddr = *saddr; ipv6_hdr(skb)->nexthdr = protocol; + skb->protocol = htons(ETH_P_IPV6); } else { ip_hdr(skb)->version = 4; ip_hdr(skb)->saddr = saddr->in6_u.u6_addr32[3]; ip_hdr(skb)->protocol = protocol; ip_hdr(skb)->check = 0; + skb->protocol = htons(ETH_P_IP); } skb->users.refs.counter = 1; skb->_skb_refdst = 0; diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c index 2b6efba6..183c3400 100644 --- a/test/unit_homa_qdisc.c +++ b/test/unit_homa_qdisc.c @@ -48,6 +48,17 @@ static struct sk_buff *new_test_skb(struct homa_rpc *rpc, return skb; } +/** + * init_qdisc() - Make a homa_qdisc out of a Qdisc. + * @qdisc: Qdisc to initialize for use as a homa_qdisc. + * Return: The homa_qdisc private data. + */ +struct homa_qdisc *init_qdisc(struct Qdisc *qdisc) +{ + EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); + return qdisc_priv(qdisc); +} + void log_deferred(struct homa_qdisc_dev *qdev) { struct homa_skb_info *info; @@ -67,6 +78,15 @@ void log_deferred(struct homa_qdisc_dev *qdev) } } +void log_tcp_deferred(struct homa_qdisc *q) +{ + struct sk_buff *skb; + + skb_queue_walk(&q->deferred_tcp, skb) { + unit_log_printf(" ", "%d", ntohl(tcp_hdr(skb)->seq)); + } +} + static struct homa_qdisc_dev *exit_hook_qdev; static int exit_hook_count; static void exit_hook(char *id) { @@ -123,6 +143,7 @@ FIXTURE(homa_qdisc) { struct homa homa; struct homa_net *hnet; struct in6_addr addr; + struct in6_addr addr2; struct net_device *dev; #define NUM_TXQS 4 struct netdev_queue txqs[NUM_TXQS]; @@ -145,6 +166,7 @@ FIXTURE_SETUP(homa_qdisc) homa_init(&self->homa); self->hnet = mock_hnet(0, &self->homa); self->addr = unit_get_in_addr("1.2.3.4"); + self->addr2 = unit_get_in_addr("1.2.3.5"); self->dev = mock_dev(0, &self->homa); self->dev->_tx = self->txqs; self->dev->num_tx_queues = NUM_TXQS; @@ -470,149 +492,95 @@ TEST_F(homa_qdisc, homa_qdisc_destroy) TEST_F(homa_qdisc, homa_qdisc_enqueue__mark_congested_queue) { - struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); struct sk_buff *skb, *to_free; struct homa_rpc *srpc; - struct homa_qdisc *q; srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 100, 200); ASSERT_NE(NULL, srpc); - EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); - q = qdisc_priv(qdisc); - mock_net_queue.dql.num_queued = q->qdev->max_nic_queue_bytes + 1; + q->qdisc->dev_queue->dql.num_queued = q->qdev->max_nic_queue_bytes + 1; skb = new_test_skb(srpc, &self->addr, 0, 200); to_free = NULL; unit_log_clear(); EXPECT_EQ(NULL, q->qdev->congested_qdisc); - EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, qdisc, &to_free)); + EXPECT_EQ(NET_XMIT_SUCCESS, + homa_qdisc_enqueue(skb, q->qdisc, &to_free)); EXPECT_EQ(q, q->qdev->congested_qdisc); - - homa_qdisc_destroy(qdisc); - kfree(qdisc); } -TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_short_tcp_packet) 
+TEST_F(homa_qdisc, homa_qdisc_enqueue__short_tcp_packet) { - struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); struct sk_buff *skb, *to_free; - struct homa_rpc *srpc; - struct homa_qdisc *q; - - srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, - &self->server_ip, self->client_port, - self->server_id, 100, 7100); - ASSERT_NE(NULL, srpc); - EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); - q = qdisc_priv(qdisc); atomic64_set(&q->qdev->link_idle_time, 1000000); - q->ix = 3; mock_queue_index = 3; - /* First packet is long and gets deferred because of link_idle_time. */ - skb = mock_tcp_skb(&self->addr, 5000, 1500); + /* First packet is short but gets transmitted in spite of + * link_idle_time. + */ + skb = mock_tcp_skb(&self->addr, 5000, 500); to_free = NULL; - homa_qdisc_enqueue(skb, qdisc, &to_free); + homa_qdisc_enqueue(skb, q->qdisc, &to_free); EXPECT_EQ(NULL, to_free); - EXPECT_EQ(1, skb_queue_len(&q->deferred_tcp)); - EXPECT_EQ(1000000, atomic64_read(&q->qdev->link_idle_time)); + EXPECT_EQ(0, skb_queue_len(&q->deferred_tcp)); + EXPECT_EQ(1, q->qdisc->q.qlen); - /* Second packet is short, but must be deferred to maintain order - * within qdisc. + /* Second packet is long, so it gets deferred because of + * link_idle_time. */ - skb = mock_tcp_skb(&self->addr, 6000, 500); - to_free = NULL; - homa_qdisc_enqueue(skb, qdisc, &to_free); + homa_qdisc_enqueue(mock_tcp_skb(&self->addr, 5000, 2000), q->qdisc, + &to_free); EXPECT_EQ(NULL, to_free); - EXPECT_EQ(2, skb_queue_len(&q->deferred_tcp)); - EXPECT_EQ(1000000, atomic64_read(&q->qdev->link_idle_time)); - - homa_qdisc_destroy(qdisc); - kfree(qdisc); -} -TEST_F(homa_qdisc, homa_qdisc_enqueue__xmit_short_tcp_packet) -{ - struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); - struct sk_buff *skb, *to_free; - struct homa_rpc *srpc; - struct homa_qdisc *q; - - srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, - &self->server_ip, self->client_port, - self->server_id, 100, 7100); - ASSERT_NE(NULL, srpc); + EXPECT_EQ(1, skb_queue_len(&q->deferred_tcp)); - EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); - q = qdisc_priv(qdisc); - atomic64_set(&q->qdev->link_idle_time, 1000000); - q->ix = 3; + /* Third packet also gets transmitted: the previously deferred + * packet is for a different flow. + */ - skb = mock_tcp_skb(&self->addr, 5000, 500); + skb = mock_tcp_skb(&self->addr, 7000, 500); + tcp_hdr(skb)->source = 13; to_free = NULL; - unit_log_clear(); - - homa_qdisc_enqueue(skb, qdisc, &to_free); + homa_qdisc_enqueue(skb, q->qdisc, &to_free); EXPECT_EQ(NULL, to_free); - EXPECT_FALSE(homa_qdisc_any_deferred(q->qdev)); - EXPECT_EQ(1, qdisc->q.qlen); - EXPECT_STREQ("", unit_log_get()); - EXPECT_LT(1000000, atomic64_read(&q->qdev->link_idle_time)); - EXPECT_EQ(1, homa_metrics_per_cpu()->qdisc_tcp_packets); + EXPECT_EQ(1, skb_queue_len(&q->deferred_tcp)); + EXPECT_EQ(2, q->qdisc->q.qlen); - homa_qdisc_destroy(qdisc); - kfree(qdisc); + /* Fourth packet gets deferred: same flow as previously deferred * packet.
+ */ + skb = mock_tcp_skb(&self->addr, 8000, 500); + to_free = NULL; + homa_qdisc_enqueue(skb, q->qdisc, &to_free); + EXPECT_EQ(NULL, to_free); + EXPECT_EQ(2, skb_queue_len(&q->deferred_tcp)); + EXPECT_EQ(2, q->qdisc->q.qlen); } TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_tcp_packet_because_of_congested_qdisc) { - struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); struct sk_buff *skb, *to_free; - struct homa_rpc *srpc; - struct homa_qdisc *q; - srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, - &self->server_ip, self->client_port, - self->server_id, 100, 7100); - ASSERT_NE(NULL, srpc); - - EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); - q = qdisc_priv(qdisc); - q->ix = 3; mock_queue_index = 3; skb = mock_tcp_skb(&self->addr, 6000, 1500); to_free = NULL; q->qdev->congested_qdisc = q; - homa_qdisc_enqueue(skb, qdisc, &to_free); + homa_qdisc_enqueue(skb, q->qdisc, &to_free); EXPECT_EQ(NULL, to_free); EXPECT_EQ(1, skb_queue_len(&q->deferred_tcp)); - - homa_qdisc_destroy(qdisc); - kfree(qdisc); } TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_tcp_packet_because_of_other_deferred_packets) { - struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); struct sk_buff *skb, *to_free; - struct homa_rpc *srpc; - struct homa_qdisc *q; - - srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, - &self->server_ip, self->client_port, - self->server_id, 100, 7100); - ASSERT_NE(NULL, srpc); - EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); - q = qdisc_priv(qdisc); - q->ix = 3; mock_queue_index = 3; /* First packet gets deferred because of congested qdisc. */ skb = mock_tcp_skb(&self->addr, 6000, 1500); to_free = NULL; q->qdev->congested_qdisc = q; - homa_qdisc_enqueue(skb, qdisc, &to_free); + homa_qdisc_enqueue(skb, q->qdisc, &to_free); EXPECT_EQ(NULL, to_free); EXPECT_EQ(1, skb_queue_len(&q->deferred_tcp)); @@ -620,152 +588,116 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_tcp_packet_because_of_other_deferre skb = mock_tcp_skb(&self->addr, 6000, 1500); to_free = NULL; q->qdev->congested_qdisc = NULL; - homa_qdisc_enqueue(skb, qdisc, &to_free); + homa_qdisc_enqueue(skb, q->qdisc, &to_free); EXPECT_EQ(NULL, to_free); EXPECT_EQ(2, skb_queue_len(&q->deferred_tcp)); - - homa_qdisc_destroy(qdisc); - kfree(qdisc); } TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_tcp_packet_because_of_link_idle_time) { - struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); struct sk_buff *skb, *to_free; - struct homa_rpc *srpc; - struct homa_qdisc *q; - srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, - &self->server_ip, self->client_port, - self->server_id, 100, 7100); - ASSERT_NE(NULL, srpc); - - EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); - q = qdisc_priv(qdisc); - q->ix = 3; mock_queue_index = 3; atomic64_set(&q->qdev->link_idle_time, 1000000); skb = mock_tcp_skb(&self->addr, 6000, 1500); to_free = NULL; - homa_qdisc_enqueue(skb, qdisc, &to_free); + homa_qdisc_enqueue(skb, q->qdisc, &to_free); EXPECT_EQ(NULL, to_free); EXPECT_EQ(1, skb_queue_len(&q->deferred_tcp)); EXPECT_EQ(1000000, atomic64_read(&q->qdev->link_idle_time)); - - homa_qdisc_destroy(qdisc); - kfree(qdisc); } TEST_F(homa_qdisc, homa_qdisc_enqueue__short_homa_message) { - struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); struct sk_buff *skb, *to_free; struct homa_rpc *srpc; - 
struct homa_qdisc *q; srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 100, 200); ASSERT_NE(NULL, srpc); - EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); - q = qdisc_priv(qdisc); atomic64_set(&q->qdev->link_idle_time, 1000000); - q->ix = 3; skb = new_test_skb(srpc, &self->addr, 0, 200); to_free = NULL; unit_log_clear(); - EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, qdisc, &to_free)); + EXPECT_EQ(NET_XMIT_SUCCESS, + homa_qdisc_enqueue(skb, q->qdisc, &to_free)); EXPECT_EQ(NULL, to_free); EXPECT_FALSE(homa_qdisc_any_deferred(q->qdev)); - EXPECT_EQ(1, qdisc->q.qlen); + EXPECT_EQ(1, q->qdisc->q.qlen); EXPECT_STREQ("", unit_log_get()); EXPECT_LT(1000000, atomic64_read(&q->qdev->link_idle_time)); - - homa_qdisc_destroy(qdisc); - kfree(qdisc); } TEST_F(homa_qdisc, homa_qdisc_enqueue__short_final_packet_in_long_message) { - struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); struct sk_buff *skb, *to_free; struct homa_rpc *srpc; - struct homa_qdisc *q; srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 100, 7100); ASSERT_NE(NULL, srpc); - EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); - q = qdisc_priv(qdisc); atomic64_set(&q->qdev->link_idle_time, 1000000); - q->ix = 3; self->data.message_length = htonl(3000); self->data.seg.offset = htonl(2800); skb = new_test_skb(srpc, &self->addr, 7000, 100); to_free = NULL; unit_log_clear(); - EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, qdisc, &to_free)); + EXPECT_EQ(NET_XMIT_SUCCESS, + homa_qdisc_enqueue(skb, q->qdisc, &to_free)); EXPECT_EQ(NULL, to_free); EXPECT_TRUE(homa_qdisc_any_deferred(q->qdev)); - EXPECT_EQ(0, qdisc->q.qlen); - - homa_qdisc_destroy(qdisc); - kfree(qdisc); + EXPECT_EQ(0, q->qdisc->q.qlen); } TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_homa_packet_congested_qdisc) { - struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); struct sk_buff *skb, *to_free; struct homa_rpc *srpc; - struct homa_qdisc *q; srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 100, 7100); ASSERT_NE(NULL, srpc); - EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); - q = qdisc_priv(qdisc); skb = new_test_skb(srpc, &self->addr, 0, 1500); to_free = NULL; unit_log_clear(); mock_log_wakeups = 1; q->qdev->congested_qdisc = q; - EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, qdisc, &to_free)); + EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, q->qdisc, + &to_free)); EXPECT_EQ(NULL, to_free); EXPECT_TRUE(homa_qdisc_any_deferred(q->qdev)); log_deferred(q->qdev); EXPECT_STREQ("wake_up; [id 1235, offsets 0]", unit_log_get()); - - homa_qdisc_destroy(qdisc); - kfree(qdisc); } TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_homa_packet_other_packets_deferred) { - struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); struct sk_buff *skb, *to_free; struct homa_rpc *srpc; - struct homa_qdisc *q; srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 100, 7100); ASSERT_NE(NULL, srpc); - EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); - q = qdisc_priv(qdisc); - /* First packet is deferred because of congested qdisc. 
*/ skb = new_test_skb(srpc, &self->addr, 0, 1500); to_free = NULL; unit_log_clear(); mock_log_wakeups = 1; q->qdev->congested_qdisc = q; - EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, qdisc, &to_free)); + EXPECT_EQ(NET_XMIT_SUCCESS, + homa_qdisc_enqueue(skb, q->qdisc, &to_free)); EXPECT_EQ(NULL, to_free); EXPECT_TRUE(homa_qdisc_any_deferred(q->qdev)); @@ -774,20 +706,17 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_homa_packet_other_packets_deferred) to_free = NULL; unit_log_clear(); q->qdev->congested_qdisc = NULL; - EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, qdisc, &to_free)); + EXPECT_EQ(NET_XMIT_SUCCESS, + homa_qdisc_enqueue(skb, q->qdisc, &to_free)); EXPECT_EQ(NULL, to_free); log_deferred(q->qdev); EXPECT_STREQ("[id 1235, offsets 0 1500]", unit_log_get()); - - homa_qdisc_destroy(qdisc); - kfree(qdisc); } TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_homa_packet_nic_idle_time) { - struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); struct sk_buff *skb, *to_free; struct homa_rpc *srpc; - struct homa_qdisc *q; u64 idle; srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, @@ -795,8 +724,6 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_homa_packet_nic_idle_time) self->server_id, 100, 7100); ASSERT_NE(NULL, srpc); - EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); - q = qdisc_priv(qdisc); idle = mock_clock + 1 + self->homa.qshared->max_nic_est_backlog_cycles + 1; atomic64_set(&q->qdev->link_idle_time, idle); skb = new_test_skb(srpc, &self->addr, 0, 1500); @@ -804,43 +731,181 @@ TEST_F(homa_qdisc, homa_qdisc_enqueue__defer_homa_packet_nic_idle_time) unit_log_clear(); mock_log_wakeups = 1; - EXPECT_EQ(NET_XMIT_SUCCESS, homa_qdisc_enqueue(skb, qdisc, &to_free)); + EXPECT_EQ(NET_XMIT_SUCCESS, + homa_qdisc_enqueue(skb, q->qdisc, &to_free)); EXPECT_EQ(NULL, to_free); EXPECT_TRUE(homa_qdisc_any_deferred(q->qdev)); EXPECT_STREQ("wake_up", unit_log_get()); - - homa_qdisc_destroy(qdisc); - kfree(qdisc); } TEST_F(homa_qdisc, homa_qdisc_enqueue__drop_packet_queue_over_limit) { - struct Qdisc *qdisc = mock_alloc_qdisc(&mock_net_queue); + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); struct sk_buff *skb, *to_free; struct homa_rpc *srpc; - struct homa_qdisc *q; srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, &self->server_ip, self->client_port, self->server_id, 100, 7100); ASSERT_NE(NULL, srpc); - EXPECT_EQ(0, homa_qdisc_init(qdisc, NULL, NULL)); - q = qdisc_priv(qdisc); - q->ix = 3; skb = new_test_skb(srpc, &self->addr, 0, 1500); - qdisc->limit = 1; - qdisc->q.qlen = 5; + q->qdisc->limit = 1; + q->qdisc->q.qlen = 5; to_free = NULL; unit_log_clear(); - EXPECT_EQ(NET_XMIT_DROP, homa_qdisc_enqueue(skb, qdisc, &to_free)); + EXPECT_EQ(NET_XMIT_DROP, homa_qdisc_enqueue(skb, q->qdisc, &to_free)); ASSERT_NE(NULL, to_free); EXPECT_FALSE(homa_qdisc_any_deferred(q->qdev)); - EXPECT_EQ(5, qdisc->q.qlen); + EXPECT_EQ(5, q->qdisc->q.qlen); kfree_skb(to_free); - homa_qdisc_destroy(qdisc); - kfree(qdisc); +} + +TEST_F(homa_qdisc, homa_qdisc_can_bypass__skb_not_tcp) +{ + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); + struct sk_buff *skb; + + /* First packet: IPv4 but not TCP. */ + mock_ipv6 = false; + skb = mock_tcp_skb(&self->addr, 4000, 100); + ip_hdr(skb)->protocol = IPPROTO_TCP + 1; + EXPECT_EQ(0, homa_qdisc_can_bypass(skb, q)); + kfree_skb(skb); + + /* Second packet: IPv6 but not TCP. 
*/ + mock_ipv6 = true; + skb = mock_tcp_skb(&self->addr, 4000, 100); + ipv6_hdr(skb)->nexthdr = IPPROTO_TCP + 1; + EXPECT_EQ(0, homa_qdisc_can_bypass(skb, q)); + kfree_skb(skb); + + /* Third packet: not IPv4 or IPv6. */ + skb = mock_tcp_skb(&self->addr, 4000, 100); + skb->protocol = 1; + EXPECT_EQ(0, homa_qdisc_can_bypass(skb, q)); + kfree_skb(skb); + + /* Fourth packet: TCP so reordering is allowed (no packets to + * conflict with). + */ + skb = mock_tcp_skb(&self->addr, 4000, 100); + EXPECT_EQ(1, homa_qdisc_can_bypass(skb, q)); + kfree_skb(skb); +} +TEST_F(homa_qdisc, homa_qdisc_can_bypass__ack) +{ + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); + struct sk_buff *skb; + + homa_qdisc_defer_tcp(q, mock_tcp_skb(&self->addr, 4000, 100)); + + /* First packet conflicts (not an ack). */ + skb = mock_tcp_skb(&self->addr, 5000, 200); + EXPECT_EQ(0, homa_qdisc_can_bypass(skb, q)); + kfree_skb(skb); + + /* Second packet is an ack. */ + skb = mock_tcp_skb(&self->addr, 5000, 0); + EXPECT_EQ(1, homa_qdisc_can_bypass(skb, q)); + kfree_skb(skb); +} +TEST_F(homa_qdisc, homa_qdisc_can_bypass__skb2_not_tcp) +{ + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); + struct sk_buff *skb, *skb2; + + skb = mock_tcp_skb(&self->addr, 4000, 100); + + /* First attempt: IPv4 but not TCP. */ + mock_ipv6 = false; + skb2 = mock_tcp_skb(&self->addr, 5000, 200); + ip_hdr(skb2)->protocol = IPPROTO_TCP + 1; + homa_qdisc_defer_tcp(q, skb2); + EXPECT_EQ(1, homa_qdisc_can_bypass(skb, q)); + __skb_queue_purge(&q->deferred_tcp); + + /* Second packet: IPv6 but not TCP. */ + mock_ipv6 = true; + skb2 = mock_tcp_skb(&self->addr, 5000, 200); + ipv6_hdr(skb2)->nexthdr = IPPROTO_TCP + 1; + homa_qdisc_defer_tcp(q, skb2); + EXPECT_EQ(1, homa_qdisc_can_bypass(skb, q)); + __skb_queue_purge(&q->deferred_tcp); + + /* Third packet: not IPv4 or IPv6. */ + skb2 = mock_tcp_skb(&self->addr, 4000, 100); + skb2->protocol = 1; + homa_qdisc_defer_tcp(q, skb2); + EXPECT_EQ(1, homa_qdisc_can_bypass(skb, q)); + + kfree_skb(skb); +} +TEST_F(homa_qdisc, homa_qdisc_can_bypass__test_address_and_ports) +{ + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); + struct sk_buff *skb; + + mock_ipv6 = true; + skb = mock_tcp_skb(&self->addr, 4000, 100); + ipv6_hdr(skb)->daddr = self->addr2; + tcp_hdr(skb)->source = 13; + tcp_hdr(skb)->dest = 42; + homa_qdisc_defer_tcp(q, skb); + + /* First packet differs on daddr. */ + skb = mock_tcp_skb(&self->addr, 5000, 200); + ipv6_hdr(skb)->daddr = self->addr; + tcp_hdr(skb)->source = 13; + tcp_hdr(skb)->dest = 42; + EXPECT_EQ(1, homa_qdisc_can_bypass(skb, q)); + + /* Second packet differs on source port. */ + ipv6_hdr(skb)->daddr = self->addr2; + tcp_hdr(skb)->source = 12; + EXPECT_EQ(1, homa_qdisc_can_bypass(skb, q)); + + /* Third packet differs on dest port. */ + tcp_hdr(skb)->source = 13; + tcp_hdr(skb)->dest = 43; + EXPECT_EQ(1, homa_qdisc_can_bypass(skb, q)); + + /* Fourth packet conflicts. */ + tcp_hdr(skb)->dest = 42; + EXPECT_EQ(0, homa_qdisc_can_bypass(skb, q)); + + kfree_skb(skb); +} +TEST_F(homa_qdisc, homa_qdisc_can_bypass__multiple_packets_in_list) +{ + struct homa_qdisc *q = init_qdisc(self->qdiscs[3]); + struct sk_buff *skb; + + skb = mock_tcp_skb(&self->addr, 4000, 100); + tcp_hdr(skb)->source = 13; + homa_qdisc_defer_tcp(q, skb); + skb = mock_tcp_skb(&self->addr, 4000, 100); + tcp_hdr(skb)->source = 14; + homa_qdisc_defer_tcp(q, skb); + skb = mock_tcp_skb(&self->addr, 4000, 100); + tcp_hdr(skb)->source = 15; + homa_qdisc_defer_tcp(q, skb); + + /* First packet conflicts. 
*/ + skb = mock_tcp_skb(&self->addr, 5000, 200); + tcp_hdr(skb)->source = 14; + EXPECT_EQ(0, homa_qdisc_can_bypass(skb, q)); + + /* Second packet conflicts. */ + tcp_hdr(skb)->source = 15; + EXPECT_EQ(0, homa_qdisc_can_bypass(skb, q)); + + /* Third packet doesn't conflict. */ + tcp_hdr(skb)->source = 16; + EXPECT_EQ(1, homa_qdisc_can_bypass(skb, q)); + + kfree_skb(skb); } TEST_F(homa_qdisc, homa_qdisc_defer_tcp__basics) @@ -2136,4 +2201,4 @@ TEST_F(homa_qdisc, homa_qdisc_precedes__rpc_struct_address) else result = homa_qdisc_precedes(srpc3, srpc1); EXPECT_EQ(1, result); -} \ No newline at end of file +} From f8ded1441e3a7202adb39597d160a506ecb51eb0 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 8 Jan 2026 10:12:56 -0800 Subject: [PATCH 610/625] Cleanup comment and help text in cp_node.cc --- util/cp_node.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util/cp_node.cc b/util/cp_node.cc index aad59d3f..4c77a3da 100644 --- a/util/cp_node.cc +++ b/util/cp_node.cc @@ -341,7 +341,7 @@ void print_help(const char *name) printf(" --protocol Transport protocol to use: homa or tcp (default: %s)\n", protocol); printf(" --port-threads Number of server threads to service each port\n" - " (Homa only, default: %d)\n", + " (default: %d)\n", port_threads); printf(" --ports Number of ports to listen on (default: %d)\n\n", server_ports); @@ -1219,7 +1219,7 @@ std::vector tcp_servers; * requests. * @id: Unique identifier for this server. * @num_threads: Number of threads to service this listening socket and - * all of the other sockets excepted from it. + * all of the other sockets accepted from it. * @experiment: Name of the experiment in which this server is participating. */ tcp_server::tcp_server(int port, int id, int num_threads, From 73d074b9394071256fc9869ee3136490c842657c Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 8 Jan 2026 10:13:19 -0800 Subject: [PATCH 611/625] Add new tcpdelay and tcppackets analyzers to tthoma.py --- util/tthoma.py | 514 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 385 insertions(+), 129 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index bc82b9d2..708578a5 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -292,7 +292,7 @@ def __missing__(self, key): # high 16 bits are the low-order bits of the client's IP address # server: Server port (same format as client) # slot: Slot allocated by cp_node on the client for the message; -# used to differentiate concurrente RPCs between the same +# used to differentiate concurrent RPCs between the same # client and server ports # req_send: Time when tcp_sendmsg was invoked for the first bytes of # the request @@ -1103,7 +1103,11 @@ def print_pkts(pkts, header=True, comment=False): 'number for TCP\n') buf.write(prefix + 'Offset: Offset of packet within message or ' '"TCP" if packet is TCP\n') - buf.write(prefix + 'Length: Size of packet (before segmentation)\n') + buf.write(prefix + 'Length: Size of packet; for the first segment ' + 'generated from a TSO\n') + buf.write(prefix + ' frame this is the size of the TSO ' + 'frame; for other segments\n') + buf.write(prefix + ' it is the size of the received packet\n') buf.write(prefix + 'Qid: Transmit queue on which packet was sent\n') buf.write(prefix + 'Nic: Time when packet was queued for NIC\n') buf.write(prefix + 'NDelay: Nic - Xmit\n') @@ -1358,8 +1362,8 @@ def sort_pkts(pkts, key): option); must be 'Xmit', 'Nic', 'Gro', 'SoftIRQ', or 'Free' """ - sort_keys = {'Xmit': 'xmit', 'Nic': 'nic', 'Gro': 'gro', - 
'SoftIRQ': 'softirq', 'Free': 'free_tx_skb'} + sort_keys = {'Xmit': 'xmit', 'Qdisc': 'qdisc_xmit', 'Nic': 'nic', + 'Gro': 'gro', 'SoftIRQ': 'softirq', 'Free': 'free_tx_skb'} if not key in sort_keys: raise Exception('Invalid sort option %s: must be one of %s' % ( key, sort_keys.keys())) @@ -6670,6 +6674,7 @@ class AnalyzeNicbacklog: def __init__(self, dispatcher): dispatcher.interest('AnalyzePackets') + dispatcher.interest('AnalyzeTcppackets') require_options('nicbacklog', 'data') def output(self): @@ -6996,6 +7001,7 @@ class AnalyzeNicbacklog2: def __init__(self, dispatcher): dispatcher.interest('AnalyzePackets') + dispatcher.interest('AnalyzeTcppackets') require_options('nicbacklog2', 'data') def output(self): @@ -7163,6 +7169,7 @@ class AnalyzeNicpkts: def __init__(self, dispatcher): dispatcher.interest('AnalyzePackets') + dispatcher.interest('AnalyzeTcppackets') require_options('nicpkts', 'data') def print_active(self, f, active, free_index): @@ -7440,6 +7447,7 @@ class AnalyzeNicsnapshot: def __init__(self, dispatcher): dispatcher.interest('AnalyzePackets') + dispatcher.interest('AnalyzeTcppackets') require_options('nicsnapshot', 'time', 'node') def output(self): @@ -7621,6 +7629,7 @@ class AnalyzeNictx: def __init__(self, dispatcher): dispatcher.interest('AnalyzePackets') + dispatcher.interest('AnalyzeTcppackets') require_options('nictx', 'plot') def output(self): @@ -7915,10 +7924,10 @@ def output(self): y = [i / len(kb) for i in range(len(kb))] fig = plt.figure(figsize=[6,4]) ax = fig.add_subplot(111) - ax.set_xlim(0, 1500) + ax.set_xlim(0, 5000) ax.set_xlabel('Kbytes in Packets Queued in a Qdisc') - ax.set_ylim(0, 5000) - ax.set_ylabel('KBytes Queued') + ax.set_ylim(0, 1.0) + ax.set_ylabel('Fraction of Intervals') plt.grid(which="major", axis="y") plt.plot(kb, y) plt.tight_layout() @@ -7957,8 +7966,8 @@ def output(self): # Generate time-series plot showing queuing in the qdisc and NIC # for each node. 
- x_min = get_last_start() - x_max = get_first_end() + x_min = get_first_time() + x_max = get_last_time() nodes = get_sorted_nodes() maxy = max(max(node_data[node]['qdisc']) for node in nodes) fig, axes = plt.subplots(nrows=len(nodes), ncols=1, sharex=False, @@ -8576,110 +8585,6 @@ def tt_softirq_grant(self, trace, t, core, id, offset, priority, increment): g['increment'] = increment g['rx_node'] = trace['node'] - def tt_tcp_xmit(self, trace, t, core, source, dest, data_bytes, seq_ack): - global tcp_hdr_length - - tcp_pkt = get_tcp_packet(source, dest, data_bytes, seq_ack) - # if 'xmit' in tcp_pkt: - # print('%9.3f: Duplicate TCP packet transmission on node %s (previous: %.3f)' % (t, - # trace['node'], tcp_pkt['xmit'])) - node = trace['node'] - tcp_pkt['xmit'] = t - tcp_pkt['xmit2'] = t - tcp_pkt['tso_length'] = data_bytes - tcp_pkt['tx_node'] = node - tcp_pkt['tx_core'] = core - set_tcp_ip_node(source, node) - - def tt_tcp_qdisc(self, trace, t, core, source, dest, data_bytes, seq_ack): - tcp_pkt = get_tcp_packet(source, dest, data_bytes, seq_ack) - node = trace['node'] - tcp_pkt['qdisc_xmit'] = t - tcp_pkt['xmit2'] = t - tcp_pkt['tso_length'] = data_bytes - tcp_pkt['tx_node'] = node - set_tcp_ip_node(source, node) - - def tt_tcp_nic(self, trace, t, core, source, dest, data_bytes, seq_ack): - tcp_pkt = get_tcp_packet(source, dest, data_bytes, seq_ack) - node = trace['node'] - tcp_pkt['nic'] = t - tcp_pkt['tso_length'] = data_bytes - tcp_pkt['tx_node'] = node - tcp_pkt['nic_core'] = core - set_tcp_ip_node(source, node) - - def tt_tcp_free(self, trace, t, core, source, dest, data_bytes, seq_ack, - qid): - tcp_pkt = get_tcp_packet(source, dest, data_bytes, seq_ack) - node = trace['node'] - tcp_pkt['free_tx_skb'] = t - tcp_pkt['tso_length'] = data_bytes - tcp_pkt['tx_qid'] = qid - tcp_pkt['tx_node'] = node - set_tcp_ip_node(source, node) - - def tt_tcp_gro(self, trace, t, core, source, dest, data_bytes, seq_ack): - global tcp_hdr_length - - tcp_pkt = get_tcp_packet(source, dest, data_bytes, seq_ack) - node = trace['node'] - tcp_pkt['length'] = data_bytes - tcp_pkt['gro'] = t - tcp_pkt['rx_node'] = node - set_tcp_ip_node(dest, node) - - def cleanup_tcp_pkts(self): - """ - This method post-processes all of the TCP packets to fill in missing - fields. - """ - global tcp_packets - - # -> list of data packets (nonzero length) from - # source to dest, where source and dest come from fields in packets - # with the same name. - stream_pkts = defaultdict(list) - - # Pass 1: divide data packets into buckets for unidirectional - # streams, and also fill in a fiew fields. - for pkt in tcp_packets.values(): - if not 'tx_node' in pkt: - node = get_tcp_node(pkt['source']) - if node != None: - pkt['tx_node'] = node - if not 'rx_node' in pkt: - node = get_tcp_node(pkt['source']) - if node != None: - pkt['rx_node'] = node - if not 'length' in pkt: - if not 'tso_length' in pkt: - print('No tso_length in packet: %s' % (pkt)) - pkt['length'] = pkt['tso_length'] - - if pkt['length'] == 0: - continue - stream_pkts[f"{pkt['source']} {pkt['dest']}"].append(pkt) - - # Pass 2: process the packets in a stream in sequence order, in - # order to copy information from a source TSO packet into each of - # the segments generated from it. 
- for pkts in stream_pkts.values(): - tso_pkt = None - tso_end = None - for pkt in sorted(pkts, key = lambda pkt: pkt['seq_ack']): - if 'tso_length' in pkt: - tso_pkt = pkt - tso_end = pkt['seq_ack'] + pkt['tso_length'] - continue - if tso_pkt == None or pkt['seq_ack'] >= tso_end: - continue - tso_pkt['segments'].append(pkt) - for field in ['xmit', 'qdisc_xmit', 'xmit2', 'tx_qid', - 'nic', 'free_tx_skb']: - if field in tso_pkt: - pkt[field] = tso_pkt[field] - def analyze(self): """ Try to deduce missing packet fields, such as message length. @@ -8771,8 +8676,6 @@ def analyze(self): for pid, pkt in new_pkts: packets[pid] = pkt - self.cleanup_tcp_pkts() - #------------------------------------------------ # Analyzer: pairs #------------------------------------------------ @@ -9008,6 +8911,7 @@ class AnalyzeQbytes: def __init__(self, dispatcher): require_options('qbytes', 'plot') dispatcher.interest('AnalyzePackets') + dispatcher.interest('AnalyzeTcppackets') dispatcher.interest('AnalyzeRpcs') dispatcher.interest('AnalyzeMinlatency') dispatcher.interest('AnalyzeIntervals') @@ -10760,7 +10664,7 @@ class AnalyzeTcp_rpcs: """ def __init__(self, dispatcher): - dispatcher.interest('AnalyzePackets') + dispatcher.interest('AnalyzeTcppackets') # "source dest" -> list of entries in tcp_rpcs whose client and # server fields match source and dest. @@ -10959,13 +10863,13 @@ def output(self): sort_keys = 'Start' for key in sort_keys.split(): if key == 'Start': - print_rpcs.sort(key = lambda rpc: + print_rpcs = sorted(print_rpcs, key = lambda rpc: rpc['req_send'] if 'req_send' in rpc else 1e20) elif key == 'End': - print_rpcs.sort(key = lambda rpc: + print_rpcs = sorted(print_rpcs, key = lambda rpc: rpc['resp_recvd'] if 'resp_recvd' in rpc else 1e20) elif key == 'Rtt': - print_rpcs.sort(reverse = True, key = lambda rpc: + print_rpcs = sorted(print_rpcs, reverse = True, key = lambda rpc: rpc['resp_recvd'] - rpc['req_send'] if 'resp_recvd' in rpc and 'req_send' in rpc else 0) else: @@ -11049,8 +10953,8 @@ def output(self): if options.verbose: first = True - print('\nPackets from the selected RPCs (in the same RPC order as') - print('above):') + print('\nPackets from the selected RPCs (in the same RPC order as ' + 'above):') for rpc in print_rpcs: if not first: print() @@ -11058,6 +10962,358 @@ def output(self): print(print_pkts(rpc['resp_pkts'], header=False), end='') first = False +#------------------------------------------------ +# Analyzer: tcpdelay +#------------------------------------------------ +class AnalyzeTcpdelay: + """ + Prints information about various delays in the transmission of + TCP packets. + """ + + def __init__(self, dispatcher): + dispatcher.interest('AnalyzeTcppackets') + + def get_pkt_delays(self, pkt, delays): + """ + Extract delays from a TCP packet, add to lists in delays. + """ + + # Note: negative delays below are probably caused by retransmits; + # ignore them. 
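# (How a retransmit can produce a negative delta: get_tcp_packet keys
# packets by source, dest, and sequence number, so a retransmission
# reuses the same entry and overwrites tx-side fields such as 'xmit'
# with a later time, while 'gro' may still record the arrival of the
# original copy; gro - xmit then comes out negative.)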
+ if 'xmit' in pkt: + if ('nic' in pkt): + delay = pkt['nic'] - pkt['xmit'] + if (delay >= 0): + delays['nic'].append(delay) + if 'qdisc_xmit' in pkt: + delay = pkt['qdisc_xmit'] - pkt['xmit'] + if (delay >= 0): + delays['qdisc'].append(delay) + if 'nic' in pkt: + if 'gro' in pkt: + delay = pkt['gro'] - pkt['nic'] + if (delay >= 0): + delays['gro'].append(delay) + if 'free_tx_skb' in pkt: + delay = pkt['free_tx_skb'] - pkt['nic'] + if (delay >= 0): + delays['free'].append(delay) + if 'free_tx_skb' in pkt and 'gro' in pkt: + delay = pkt['gro'] - pkt['free_tx_skb'] + if (delay >= 0): + delays['net'].append(delay) + if 'xmit' in pkt and 'gro' in pkt: + delay = pkt['gro'] - pkt['xmit'] + if (delay >= 0): + delays['total'].append(delay) + + def output(self): + global tcp_packets + + # Each of the following dictionaries holds lists of delays + # experienced by TCP packets; each variable covers a different + # range of packet lengths. The dictionary keys are: + # nic: Delay from 'xmit' to 'nic' + # qdisc: Delay from 'xmit' to 'qdisc_xmit' if 'qdisc_xmit' is + # present, else 0 + # gro: Delay from 'nic' to 'gro' + # free: Delay from 'nic' to 'free_tx_skb' + # net: Delay from 'free_tx_skb' to 'gro' + # total: Delay from 'nic' to 'gro' + short = defaultdict(list) + medium = defaultdict(list) + long = defaultdict(list) + ack = defaultdict(list) + + short_limit = 1500 + medium_limit = 10000 + for pkt in tcp_packets.values(): + if not 'tso_length' in pkt: + continue + tso_length = pkt['tso_length'] + if tso_length == 0: + self.get_pkt_delays(pkt, ack) + elif tso_length <= short_limit: + self.get_pkt_delays(pkt, short) + elif tso_length <= medium_limit: + self.get_pkt_delays(pkt, medium) + else: + self.get_pkt_delays(pkt, long) + + print('\n------------------') + print('Analyzer: tcpdelay') + print('------------------') + print('Delays in the transmission of TCP packets (all times in usecs):') + print('Xmit: Time from ip*xmit call until driver queued packet for NIC') + print('Qdisc: Time from ip*xmit call until homa_qdisc released ' + 'packet for') + print(' transmission (deferred packets only)') + print('Gro: Time from when NIC received packet until GRO started ' + 'processing') + print('Free: Time from when NIC received packet until packet was ' + 'returned to Linux') + print(' and freed (large values caused by queuing in NIC)') + print('Net: Slight underestimate of time from when NIC ' + 'transmitted packet') + print(' until GRO processing started (Gro - Free)') + print('Total: Time from ip*xmit call until GRO started processing') + + def print_pcts(delays): + if not delays: + return ' 0' + delays.sort() + return '%6d %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f' % ( + len(delays), delays[0], delays[10*len(delays)//100], + delays[50*len(delays)//100], delays[90*len(delays)//100], + delays[99*len(delays)//100], delays[len(delays)-1], + sum(delays)/len(delays)) + print('\nPhase Count Min P10 P50 P90 P99 Max Avg') + print('----------------------------------------------------------------') + + print('Data packets <= %d bytes:' % (short_limit)) + print('Xmit %s' % print_pcts(short['nic'])) + print('Qdisc %s' % print_pcts(short['qdisc'])) + print('Gro %s' % print_pcts(short['gro'])) + print('Free %s' % print_pcts(short['free'])) + print('Net %s' % print_pcts(short['net'])) + print('Total %s' % print_pcts(short['total'])) + + print('\nData packets %d-%d bytes:' % (short_limit + 1, medium_limit)) + print('Xmit %s' % print_pcts(medium['nic'])) + print('Qdisc %s' % print_pcts(medium['qdisc'])) + print('Gro %s' % 
print_pcts(medium['gro'])) + print('Free %s' % print_pcts(medium['free'])) + print('Net %s' % print_pcts(medium['net'])) + print('Total %s' % print_pcts(medium['total'])) + + print('\nData packets > %d bytes:' % (medium_limit)) + print('Xmit %s' % print_pcts(long['nic'])) + print('Qdisc %s' % print_pcts(long['qdisc'])) + print('Gro %s' % print_pcts(long['gro'])) + print('Free %s' % print_pcts(long['free'])) + print('Net %s' % print_pcts(long['net'])) + print('Total %s' % print_pcts(long['total'])) + + print('\nAcks:') + print('Xmit %s' % print_pcts(ack['nic'])) + print('Qdisc %s' % print_pcts(ack['qdisc'])) + print('Gro %s' % print_pcts(ack['gro'])) + print('Free %s' % print_pcts(ack['free'])) + print('Net %s' % print_pcts(ack['net'])) + print('Total %s' % print_pcts(ack['total'])) + + # Print stats for P98-99 small packets + p99 = defaultdict(list) + delays = sorted(short['total']) + if delays: + min_delay = delays[98*len(delays)//100] + max_delay = delays[99*len(delays)//100] + for pkt in tcp_packets.values(): + if not 'tso_length' in pkt or not 'xmit' in pkt or not 'gro' in pkt: + continue + if pkt['tso_length'] > short_limit: + continue + delay = pkt['gro'] - pkt['xmit'] + if delay >= min_delay and delay <= max_delay: + self.get_pkt_delays(pkt, p99) + + print('\nP98-P99 packets <= %d bytes:' % (short_limit)) + print('Xmit %s' % print_pcts(p99['nic'])) + print('Qdisc %s' % print_pcts(p99['qdisc'])) + print('Gro %s' % print_pcts(p99['gro'])) + print('Free %s' % print_pcts(p99['free'])) + print('Net %s' % print_pcts(p99['net'])) + print('Total %s' % print_pcts(p99['total'])) + + # Print stats for P98-99 medium packets + p99 = defaultdict(list) + delays = sorted(medium['total']) + if delays: + min_delay = delays[98*len(delays)//100] + max_delay = delays[99*len(delays)//100] + for pkt in tcp_packets.values(): + if not 'tso_length' in pkt or not 'xmit' in pkt or not 'gro' in pkt: + continue + tso_length = pkt['tso_length'] + if tso_length <= short_limit or tso_length > medium_limit: + continue + delay = pkt['gro'] - pkt['xmit'] + if delay >= min_delay and delay <= max_delay: + self.get_pkt_delays(pkt, p99) + + print('\nP98-P99 packets %d-%d bytes:' % (short_limit + 1, medium_limit)) + print('Xmit %s' % print_pcts(p99['nic'])) + print('Qdisc %s' % print_pcts(p99['qdisc'])) + print('Gro %s' % print_pcts(p99['gro'])) + print('Free %s' % print_pcts(p99['free'])) + print('Net %s' % print_pcts(p99['net'])) + print('Total %s' % print_pcts(p99['total'])) + + # Print stats for P98-99 long packets + p99 = defaultdict(list) + delays = sorted(long['total']) + if delays: + min_delay = delays[98*len(delays)//100] + max_delay = delays[99*len(delays)//100] + for pkt in tcp_packets.values(): + if not 'tso_length' in pkt or not 'xmit' in pkt or not 'gro' in pkt: + continue + tso_length = pkt['tso_length'] + if tso_length <= medium_limit: + continue + delay = pkt['gro'] - pkt['xmit'] + if delay >= min_delay and delay <= max_delay: + self.get_pkt_delays(pkt, p99) + + print('\nP98-P99 packets > %d bytes:' % (medium_limit)) + print('Xmit %s' % print_pcts(p99['nic'])) + print('Qdisc %s' % print_pcts(p99['qdisc'])) + print('Gro %s' % print_pcts(p99['gro'])) + print('Free %s' % print_pcts(p99['free'])) + print('Net %s' % print_pcts(p99['net'])) + print('Total %s' % print_pcts(p99['total'])) + + # Print stats for P98-99 acks + p99 = defaultdict(list) + delays = sorted(ack['total']) + if delays: + min_delay = delays[98*len(delays)//100] + max_delay = delays[99*len(delays)//100] + for pkt in tcp_packets.values(): + if 
not 'tso_length' in pkt or not 'xmit' in pkt or not 'gro' in pkt:
+ continue
+ tso_length = pkt['tso_length']
+ if tso_length != 0:
+ continue
+ delay = pkt['gro'] - pkt['xmit']
+ if delay >= min_delay and delay <= max_delay:
+ self.get_pkt_delays(pkt, p99)
+
+ print('\nP98-P99 acks:')
+ print('Xmit %s' % print_pcts(p99['nic']))
+ print('Qdisc %s' % print_pcts(p99['qdisc']))
+ print('Gro %s' % print_pcts(p99['gro']))
+ print('Free %s' % print_pcts(p99['free']))
+ print('Net %s' % print_pcts(p99['net']))
+ print('Total %s' % print_pcts(p99['total']))
+
+#------------------------------------------------
+# Analyzer: tcppackets
+#------------------------------------------------
+class AnalyzeTcppackets:
+ """
+ Collects information about each TCP packet but doesn't generate any
+ output. The data it collects is used by other analyzers.
+ """
+
+ def __init__(self, dispatcher):
+ return
+
+ def tt_tcp_xmit(self, trace, t, core, source, dest, data_bytes, seq_ack):
+ global tcp_hdr_length
+
+ tcp_pkt = get_tcp_packet(source, dest, data_bytes, seq_ack)
+ # if 'xmit' in tcp_pkt:
+ # print('%9.3f: Duplicate TCP packet transmission on node %s (previous: %.3f)' % (t,
+ # trace['node'], tcp_pkt['xmit']))
+ node = trace['node']
+ tcp_pkt['xmit'] = t
+ tcp_pkt['xmit2'] = t
+ tcp_pkt['tso_length'] = data_bytes
+ tcp_pkt['tx_node'] = node
+ tcp_pkt['tx_core'] = core
+ set_tcp_ip_node(source, node)
+
+ def tt_tcp_qdisc(self, trace, t, core, source, dest, data_bytes, seq_ack):
+ tcp_pkt = get_tcp_packet(source, dest, data_bytes, seq_ack)
+ node = trace['node']
+ tcp_pkt['qdisc_xmit'] = t
+ tcp_pkt['xmit2'] = t
+ tcp_pkt['tso_length'] = data_bytes
+ tcp_pkt['tx_node'] = node
+ set_tcp_ip_node(source, node)
+
+ def tt_tcp_nic(self, trace, t, core, source, dest, data_bytes, seq_ack):
+ tcp_pkt = get_tcp_packet(source, dest, data_bytes, seq_ack)
+ node = trace['node']
+ tcp_pkt['nic'] = t
+ tcp_pkt['tso_length'] = data_bytes
+ tcp_pkt['tx_node'] = node
+ tcp_pkt['nic_core'] = core
+ set_tcp_ip_node(source, node)
+
+ def tt_tcp_free(self, trace, t, core, source, dest, data_bytes, seq_ack,
+ qid):
+ tcp_pkt = get_tcp_packet(source, dest, data_bytes, seq_ack)
+ node = trace['node']
+ tcp_pkt['free_tx_skb'] = t
+ tcp_pkt['tso_length'] = data_bytes
+ tcp_pkt['tx_qid'] = qid
+ tcp_pkt['tx_node'] = node
+ set_tcp_ip_node(source, node)
+
+ def tt_tcp_gro(self, trace, t, core, source, dest, data_bytes, seq_ack):
+ global tcp_hdr_length
+
+ tcp_pkt = get_tcp_packet(source, dest, data_bytes, seq_ack)
+ node = trace['node']
+ tcp_pkt['length'] = data_bytes
+ tcp_pkt['gro'] = t
+ tcp_pkt['rx_node'] = node
+ set_tcp_ip_node(dest, node)
+
+ def analyze(self):
+ """
+ This method post-processes all of the TCP packets to fill in missing
+ fields.
+ """
+ global tcp_packets
+
+ # 'source dest' -> list of data packets (nonzero length) from
+ # source to dest, where source and dest come from fields in packets
+ # with the same name.
+ stream_pkts = defaultdict(list)
+
+ # Pass 1: divide data packets into buckets for unidirectional
+ # streams, and also fill in a few fields. 
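# (A worked example of the two passes, using made-up numbers: suppose
# the sender logged one 4448-byte TSO packet with seq 1000 and the
# receiver's GRO logged two 2224-byte segments with seq 1000 and 3224.
# Pass 1 below drops all three into the same stream bucket; pass 2
# then walks the bucket in seq order, remembers the TSO packet and its
# end (1000 + 4448 = 5448), and copies the TSO packet's tx-side fields
# ('xmit', 'nic', 'free_tx_skb', ...) into every segment whose seq
# falls in [1000, 5448).)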
+ for pkt in tcp_packets.values():
+ if not pkt['tx_node']:
+ node = get_tcp_node(pkt['source'])
+ if node != None:
+ pkt['tx_node'] = node
+ if not pkt['rx_node']:
+ node = get_tcp_node(pkt['dest'])
+ if node != None:
+ pkt['rx_node'] = node
+ if not 'length' in pkt:
+ if not 'tso_length' in pkt:
+ print('No tso_length in packet: %s' % (pkt))
+ pkt['length'] = pkt['tso_length']
+
+ if pkt['length'] == 0:
+ continue
+ stream_pkts[f"{pkt['source']} {pkt['dest']}"].append(pkt)
+
+ # Pass 2: process the packets in a stream in sequence order, in
+ # order to copy information from a source TSO packet into each of
+ # the segments generated from it.
+ for pkts in stream_pkts.values():
+ tso_pkt = None
+ tso_end = None
+ for pkt in sorted(pkts, key = lambda pkt: pkt['seq_ack']):
+ if 'tso_length' in pkt:
+ tso_pkt = pkt
+ tso_end = pkt['seq_ack'] + pkt['tso_length']
+ continue
+ if tso_pkt == None or pkt['seq_ack'] >= tso_end:
+ continue
+ tso_pkt['segments'].append(pkt)
+ for field in ['xmit', 'qdisc_xmit', 'xmit2', 'tx_qid',
+ 'nic', 'free_tx_skb']:
+ if field in tso_pkt:
+ pkt[field] = tso_pkt[field]
+
 #------------------------------------------------
 # Analyzer: temp
 #------------------------------------------------
@@ -11069,22 +11325,20 @@ class AnalyzeTemp:
 def __init__(self, dispatcher):
 # dispatcher.interest('AnalyzeTcp_rpcs')
 # dispatcher.interest('AnalyzeRpcs')
- dispatcher.interest('AnalyzePackets')
+ # dispatcher.interest('AnalyzePackets')
+ dispatcher.interest('AnalyzeTcppackets')
 
 def output(self):
 global packets, grants, tcp_packets
 
 selected = []
- for pkt in itertools.chain(packets.values(), grants.values(),
- tcp_packets.values()):
- if not 'nic' in pkt or not 'tso_length' in pkt:
- continue
- if not 'xmit' in pkt:
+ for pkt in tcp_packets.values():
+ if not 'tso_length' in pkt:
 continue
- if pkt['nic'] - pkt['xmit'] > 5:
+ if pkt['tso_length'] <= 1500:
 selected.append(pkt)
- selected.sort(key=lambda pkt: pkt['xmit'])
+ selected.sort(key=lambda pkt: pkt['xmit'] if 'xmit' in pkt else 1e20)
 print(print_pkts(selected, header=True), end='')
 
 def output_slow_pkts(self):
@@ -11307,6 +11561,7 @@ class AnalyzeTemp2:
 def __init__(self, dispatcher):
 dispatcher.interest('AnalyzeRpcs')
 dispatcher.interest('AnalyzePackets')
+ dispatcher.interest('AnalyzeTcppackets')
 
 def output(self):
 global packets
@@ -11647,6 +11902,7 @@ def __init__(self, dispatcher):
 global options
 require_options('txpkts', 'data')
 dispatcher.interest('AnalyzePackets')
+ dispatcher.interest('AnalyzeTcppackets')
 
 def output(self):
 global packets, tcp_packets, options, traces

From 9f3d0417d8de7755c19e9793f30311cd0ebcb99d Mon Sep 17 00:00:00 2001
From: breakertt
Date: Thu, 8 Jan 2026 10:49:08 -0800
Subject: [PATCH 612/625] Adjust homa.7 man page and cp_node --help with homa_api removal

Resolves #73
---
 man/homa.7      | 13 -------------
 util/cp_node.cc |  4 ++--
 2 files changed, 2 insertions(+), 15 deletions(-)

diff --git a/man/homa.7 b/man/homa.7
index 06d6642b..f1a3c317 100644
--- a/man/homa.7
+++ b/man/homa.7
@@ -204,19 +204,6 @@ system call can be used to send request and response messages; see Homa's
 .BR sendmsg (2)
 man page for details.
-In addition, Homa provides library functions
-.BR homa_send ,
-.BR homa_sendv ,
-.BR homa_reply ,
-and
-.BR homa_replyv ,
-which are layered on top of
-.BR sendmsg .
-See the man pages
-.BR homa_send (3)
-and
-.BR homa_reply (3)
-for details on these functions. 
 .SH RECEIVING MESSAGES
 .PP
 The
diff --git a/util/cp_node.cc b/util/cp_node.cc
index 4c77a3da..0870fdf4 100644
--- a/util/cp_node.cc
+++ b/util/cp_node.cc
@@ -334,7 +334,7 @@ void print_help(const char *name)
 printf(" (defaults to _)\n");
 printf(" --first-port Lowest port number to use (default: 4000 for Homa,\n");
 printf(" 5000 for TCP)\n");
- printf(" --iovec Use homa_replyv instead of homa_reply\n");
+ printf(" --iovec Use iovecs for reply instead of a single buffer\n");
 printf(" --ipv6 Use IPv6 instead of IPv4\n");
 printf(" --pin All server threads will be restricted to run only\n"
 " on the given core\n");
@@ -1139,7 +1139,7 @@ void homa_server::server(int thread_id, server_metrics *metrics)
 homa_args.id = receiver.id();
 result = sendmsg(fd, &msghdr, 0);
 if (result < 0) {
- log(NORMAL, "FATAL: homa_reply failed for server "
+ log(NORMAL, "FATAL: sendmsg failed for server "
 "port %d: %s\n", port, strerror(errno));
 fatal();

From 22d43393b2da16e50e160decede82fbe950ddba7 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Thu, 8 Jan 2026 21:22:02 -0800
Subject: [PATCH 613/625] Add warmup period for TCP in cp benchmarks

For some reason, TCP performance improves noticeably after 10-15
seconds (don't yet know why).
---
 util/cperf.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/util/cperf.py b/util/cperf.py
index cc2d80b3..a429e1e6 100644
--- a/util/cperf.py
+++ b/util/cperf.py
@@ -740,6 +740,9 @@ def run_experiment(name, clients, options):
 vlog("Recording initial metrics")
 for id in exp_nodes:
 do_subprocess(["ssh", "node%d" % (id), "metrics.py"])
+ if options.protocol == "tcp" or options.protocol == "dctcp":
+ log("Waiting for TCP to warm up...")
+ time.sleep(15)
 if not "no_rtt_files" in options:
 do_cmd("dump_times /dev/null %s" % (name), clients)
 if options.protocol == "homa" and options.tt_freeze:
@@ -914,6 +917,9 @@ def run_experiments(*args):
 if homa_clients:
 # Wait a bit so that homa_prio can set priorities appropriately
 time.sleep(2)
+ if tcp_nodes:
+ log("Waiting for TCP to warm up...")
+ time.sleep(15)
 if homa_nodes:
 if stripped:
 vlog("Skipping metrics initialization (Homa is stripped)")

From d460d72d61c8d424f4c02e4d0fe203946d569e01 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Tue, 13 Jan 2026 13:49:44 -0800
Subject: [PATCH 614/625] Improve error message in cp_node

---
 util/cp_node.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/util/cp_node.cc b/util/cp_node.cc
index 0870fdf4..f92ae47f 100644
--- a/util/cp_node.cc
+++ b/util/cp_node.cc
@@ -1868,8 +1868,11 @@ void client::record(uint64_t end_time, message_header *header)
 }
 rinfo *r = &rinfos[header->msg_id];
 if (!r->active) {
- log(NORMAL, "ERROR: response arrived for inactive msg_id %u\n",
- header->msg_id);
+ int *int_hdr = reinterpret_cast<int *>(header);
+ log(NORMAL, "ERROR: response arrived for inactive msg_id %u, "
+ "header 0x%x, 0x%x, 0x%x\n",
+ header->msg_id, int_hdr[0], int_hdr[1],
+ int_hdr[2]);
 return;
 }
 rtt = end_time - r->start_time;

From e316f00ea6c39baf2934992336403eb8e6f36b03 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Tue, 13 Jan 2026 13:51:29 -0800
Subject: [PATCH 615/625] Add linux_softirqd_actions metric

---
 homa_metrics.c | 4 +++-
 homa_metrics.h | 7 +++++++
 timetrace.c    | 5 +++++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/homa_metrics.c b/homa_metrics.c
index ca2d91de..5639eb70 100644
--- a/homa_metrics.c
+++ b/homa_metrics.c
@@ -241,13 +241,15 @@ char *homa_metrics_print(void)
 M("bypass_softirq_cycles", m->bypass_softirq_cycles,
 "Time spent in 
homa_softirq during bypass from GRO\n"); - /* Adjust stats gathered in Linux: they always use rdtsc. */ + /* Adjust stats gathered in Linux that use rdtsc. */ M("linux_softirq_cycles", m->linux_softirq_cycles * (homa_clock_khz() / 1000) / (tsc_khz / 1000), "Time spent in all Linux SoftIRQ\n"); M("napi_cycles", m->napi_cycles * (homa_clock_khz() / 1000) / (tsc_khz / 1000), "Time spent in NAPI-level packet handling\n"); + M("linux_softirqd_actions", m->linux_softirqd_actions, + "SoftIRQ actions taken in the background softirqd thread\n"); M("send_cycles", m->send_cycles, "Time spent in homa_sendmsg for requests\n"); M("send_calls", m->send_calls, diff --git a/homa_metrics.h b/homa_metrics.h index c1a09c61..8caa6df1 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -286,6 +286,13 @@ struct homa_metrics { */ u64 napi_cycles; + /** + * @linux_softirqd_actions: total number of times that a SoftIRQ + * action was taken in the softirqd daemon thread (slow path) rather + * than in the bottom-half SoftIRQ handler. + */ + u64 linux_softirqd_actions; + /** * @send_cycles: total time spent executing the homa_sendmsg kernel * call handler to send requests. diff --git a/timetrace.c b/timetrace.c index 958ede0a..60c56fd5 100644 --- a/timetrace.c +++ b/timetrace.c @@ -923,6 +923,11 @@ void tt_inc_metric(int metric, u64 count) offsetof(struct homa_metrics, napi_cycles), offsetof(struct homa_metrics, linux_softirq_cycles), offsetof(struct homa_metrics, linux_pkt_alloc_bytes), + offsetof(struct homa_metrics, temp[0]), + offsetof(struct homa_metrics, temp[1]), + offsetof(struct homa_metrics, temp[2]), + offsetof(struct homa_metrics, temp[3]), + offsetof(struct homa_metrics, linux_softirqd_actions) }; u64 *metric_addr = (u64 *)(((char *)homa_metrics_per_cpu()) + offsets[metric]); From 8f05dd23e5de5fefaf93ee7130bb37c735d6bf9c Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 15 Jan 2026 11:24:11 -0800 Subject: [PATCH 616/625] Improve comment in homa_qdisc.c --- homa_qdisc.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/homa_qdisc.c b/homa_qdisc.c index 780d5ad3..e9f60af6 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -32,9 +32,9 @@ * bandwidth even with a large backlog of (mixed-size) output packets. As a * result, with this approach alone NIC queues frequently build up * (measurements showed total NIC backlogs of 5 MB or more under high - * network load). If the pacing rate is reduced to a level where the NIC - * could always keep up, it would sacrifice link bandwidth in situations - * where the NIC can transmit at closer to line rate. + * network load, even with DQL). If the pacing rate is reduced to a level + * where the NIC could always keep up, it would sacrifice link bandwidth in + * situations where the NIC can transmit at closer to line rate. * * Thus Homa also uses a second approach, which is based on information * maintained by the dynamic queue limits mechanism (DQL). DQL keeps @@ -44,7 +44,8 @@ * outstanding bytes for any queue exceeds a limit (determined by the * max_nic_queue_usecs sysctl parameter) then the NIC is considered * congested and Homa will stop queuing more packets until the congestion - * subsides. + * subsides. This reduces worst-case total NIC queuing by 2-3x (as of + * January 2026). * * It might seem that the second approach is sufficient by itself, so the * first approach is not needed. 
Unfortunately, updates to the DQL counters @@ -57,6 +58,13 @@ * There is one additional twist, which is that the rate limits above do * not apply to small packets. The reasons for this are explained in a comment * in homa_qdisc_enqueue. + * + * In case you're wondering "why don't you just use DQL?", the DQL mechanism + * is inadequate in two ways. First, it allows large queues to accumulate in + * the NIC. Second, when queues build up, Homa wants to know so it can + * throttle long messages more than short ones. DQL provides no feedback + * to qdiscs; it simply stops the entire output queue, throttling short and + * long messages alike. This interferes with Homa's SRPT scheduler. */ #include "homa_impl.h" From a2fce56c92ccc9d080a3c4ea10aff42b180f7338 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 15 Jan 2026 11:29:48 -0800 Subject: [PATCH 617/625] Consider node type when computing option defaults in cperf.py --- util/cperf.py | 72 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 64 insertions(+), 8 deletions(-) diff --git a/util/cperf.py b/util/cperf.py index a429e1e6..551c12f2 100644 --- a/util/cperf.py +++ b/util/cperf.py @@ -16,6 +16,7 @@ import matplotlib.pyplot as plt import numpy as np import os +from pathlib import Path import platform import re import shutil @@ -72,28 +73,63 @@ "default": 25} # Defaults for command-line options; assumes that servers and clients -# share nodes. +# share nodes. Individual benchmarks may override some of these values. +# 'None' values will eventually be replaced with values from defaults_25g +# or defaults_100g. default_defaults = { 'gbps': 0.0, # Note: very large numbers for client_max hurt Homa throughput with # unlimited load (throttle queue inserts take a long time). 'client_max': 200, - 'client_ports': 3, + 'client_ports': None, 'log_dir': 'logs/' + time.strftime('%Y%m%d%H%M%S'), 'mtu': 0, 'no_trunc': '', 'protocol': 'homa', + 'port_receivers': None, + 'port_threads': None, + 'seconds': 30, + 'server_ports': None, + 'tcp_client_ports': None, + 'tcp_port_receivers': None, + 'tcp_server_ports': None, + 'tcp_port_threads': None, + 'unsched': 0, + 'unsched_boost': 0.0, + 'workload': '' +} + +# These defaults are used for 25 Gbps networks. +defaults_25g = { + 'client_ports': 3, 'port_receivers': 3, 'port_threads': 3, - 'seconds': 30, 'server_ports': 3, 'tcp_client_ports': 4, 'tcp_port_receivers': 1, + 'tcp_port_threads': 1, 'tcp_server_ports': 8, +} + +# These defaults are used for 100 Gbps networks. +defaults_100g = { + 'client_ports': 5, + 'port_receivers': 3, + 'port_threads': 3, + 'server_ports': 2, + 'tcp_client_ports': 10, + 'tcp_port_receivers': 1, 'tcp_port_threads': 1, - 'unsched': 0, - 'unsched_boost': 0.0, - 'workload': '' + 'tcp_server_ports': 20, +} + +# Maps from CloudLab node type ('xl170', 'c6620', etc.) to defaults +# appropriate for that cluster type. +type_defaults = { + 'xl170': defaults_25g, + 'c6620': defaults_100g, + 'c6525-25g': defaults_25g, + 'c6525-100g': defaults_100g } # Keys are experiment names, and each value is the digested data for that @@ -193,6 +229,26 @@ def get_parser(description, usage, defaults = {}): are defaults; used to modify the defaults for some of the options (there is a default default for each option). """ + + # Configure defaults for this particular node type (e.g. 
network speed) + p = Path("/var/emulab/boot/nodetype") + if p.is_file(): + type = p.read_text().strip() + if type in type_defaults: + node_defaults = type_defaults[type] + else: + print("Couldn't find option defaults for node type '%s'; " + "using 100 Gbps defaults" % (type), file=sys.stderr) + node_defaults = defaults_100g + else: + print("Couldn't read node type from /var/emulab/boot/nodetype; " + "using 100 Gbps defaults") + node_defaults = defaults_100g + for key, value in node_defaults.items(): + # Only set default if the application hasn't already specified a value + if default_defaults[key] == None: + default_defaults[key] = value + for key in default_defaults: if not key in defaults: defaults[key] = default_defaults[key] @@ -742,7 +798,7 @@ def run_experiment(name, clients, options): do_subprocess(["ssh", "node%d" % (id), "metrics.py"]) if options.protocol == "tcp" or options.protocol == "dctcp": log("Waiting for TCP to warm up...") - time.sleep(15) + time.sleep(10) if not "no_rtt_files" in options: do_cmd("dump_times /dev/null %s" % (name), clients) if options.protocol == "homa" and options.tt_freeze: @@ -919,7 +975,7 @@ def run_experiments(*args): time.sleep(2) if tcp_nodes: log("Waiting for TCP to warm up...") - time.sleep(15) + time.sleep(10) if homa_nodes: if stripped: vlog("Skipping metrics initialization (Homa is stripped)") From 2b6abb98ff8d5e01db0d0fb0dcb83af78c55cb9b Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 15 Jan 2026 11:30:40 -0800 Subject: [PATCH 618/625] Don't fiddle with option defaults in cp_both: use default defaults --- util/cp_both | 4 ---- 1 file changed, 4 deletions(-) diff --git a/util/cp_both b/util/cp_both index 94916de6..c7dad50e 100755 --- a/util/cp_both +++ b/util/cp_both @@ -9,9 +9,6 @@ from cperf import * -for option in ['client_max', 'client_ports', 'port_threads', 'server_ports', - 'tcp_client_ports', 'tcp_server_ports']: - default_defaults[option] = (default_defaults[option]+1)/2 parser = get_parser(description= 'Measures slowdown when TCP and Homa are competing for resources ' 'on the same nodes.', @@ -22,7 +19,6 @@ parser.add_argument('--homa-gbps', type=float, dest='homa_gbps', 'on each node (clients and servers combined); the remainder of ' '--gbps will be generated by TCP (default: split --gbps between ' 'Homa and TCP)') -default_defaults['client_max'] options = parser.parse_args() init(options) From bc4699d6f2ba7f1458cd669d5cca952ac819843d Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 15 Jan 2026 11:31:40 -0800 Subject: [PATCH 619/625] Various improvements to tthoma.py * Add 'output' method to rpcs analyzer, so it works similar to tcp_rpcs analyzer * Add graph of longest NIC queue to nictx analyzer * Collect softirq info for TCP --- util/tthoma.py | 677 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 516 insertions(+), 161 deletions(-) diff --git a/util/tthoma.py b/util/tthoma.py index 708578a5..2995be2b 100755 --- a/util/tthoma.py +++ b/util/tthoma.py @@ -30,9 +30,11 @@ import time # This global variable holds information about every RPC from every trace -# file; it is created by AnalyzeRpcs. Keys are RPC ids, values are dictionaries -# of info about that RPC, with the following elements (some elements may be -# missing if the RPC straddled the beginning or end of the timetrace): +# file; it is created by AnalyzeRpcs. There is a separate entry for the +# client side and the server side of each RPC. 
Keys are RPC ids, values are +# dictionaries of info about that RPC, with the following elements (some +# elements may be missing if the RPC straddled the beginning or end of the +# timetrace): # found: Last time when homa_wait_for_message found the RPC # gro_core: Core that handled GRO processing for this RPC # gro_data: List of tuples for all incoming @@ -189,12 +191,13 @@ def __missing__(self, id): # pacer: If this field exists it has the value True and it means that # this is a TSO packet that was transmitted by the pacer # priority: Priority at which packet was transmitted -# tx_node: Name of node from which the packet was transmitted (always -# present if xmit is present) +# tx_node: Name of node from which the packet was transmitted or empty +# string if unknown (always valid if xmit is present) # tx_core: Core on which ip*xmit was invoked # tx_qid: NIC channel on which packet was transmitted # tx_queue: Hex address of queue corresponding to tx_qid, if known -# rx_node: Name of node on which packet was received +# rx_node: Name of node on which packet was received or empty string +# if unknown # gro_core: Core on which homa_gro_receive was invoked # softirq_core: Core on which SoftIRQ processed the packet # free_tx_skb: Time when NAPI released the skb on the sender, which can't @@ -232,8 +235,8 @@ def __missing__(self, key): # free_tx_skb: Time when NAPI released the skb on the sender, which can't # happen until the packet has been transmitted. # tx_qid: NIC channel on which packet was transmitted -# tx_node: Node that sent grant (if known) -# rx_node: Node that received grant (if known) +# tx_node: Node that sent grant, or empty string if unknown +# rx_node: Node that received grant, or empty string if unknown # id: Id of the RPC on the node that sent the grant # offset: Offset specified in the grant # increment: How much previously ungranted data is covered by this grant; @@ -277,9 +280,12 @@ def __missing__(self, key): # free_tx_skb: Time when NAPI released the skb on the sender, which can't # happen until the packet has been fully transmitted. 
# gro: Time when GRO received the packet -# tx_node: Node that sent the packet (corresponds to saddr) +# softirq: Time when SoftIRQ received the packet +# tx_node: Node that sent the packet (corresponds to saddr), or empty +# string if unknown # tx_qid: NIC channel on which packet was transmitted -# rx_node: Node that received the packet (corresponds to daddr) +# rx_node: Node that received the packet (corresponds to daddr), or empty +# string if unknown # retransmits: Always empty (for compatibility with Homa packets) tcp_packets = {} @@ -529,6 +535,54 @@ def extract_num(s): return int(match.group(1)) return None +def filter_rpcs(rpcs, msglen=None, rpc_start=None, rtt=None): + """ + Returns a list of all the Homa RPCs that match a set of command-line + options + rpcs: List of RPCs to filter (must be entries in rpcs); only + client-side RPCs are considered + msglen: If not None, filter on msglen (see --msglen arg) + rpc_start: If not None, filter on RPC start time (see --rpc-start arg) + rtt: If not None, filter on round-trip time (see --rtt arg) + """ + if msglen != None: + min_length, max_length = get_range(msglen, + option_name='--msglen', one_value=True) + if max_length == None: + max_length = min_length + min_length = 0 + if rpc_start != None: + min_start, max_start = get_range(rpc_start, + parse_float=True, option_name='--rpc-start') + if rtt != None: + min_rtt, max_rtt = get_range(rtt, parse_float = True, + option_name='--rtt') + + result = [] + for rpc in rpcs: + if rpc['id'] & 1: + continue + if msglen != None: + if not 'out_length' in rpc: + continue + length = rpc['out_length'] + if length < min_length or length > max_length: + continue + if rpc_start != None: + if not 'sendmsg' in rpc: + continue + start = rpc['sendmsg'] + if start < min_start or start > max_start: + continue + if rtt != None: + if not 'sendmsg' in rpc or not 'recvmsg_done' in rpc: + continue + rtt = rpc['recvmsg_done'] - rpc['sendmsg'] + if rtt < min_rtt or rtt > max_rtt: + continue + result.append(rpc) + return result + def filter_tcp_rpcs(rpcs, msglen=None, rpc_start=None, rtt=None): """ Returns a list of all the TCP RPCs that match a set of command-line @@ -785,49 +839,6 @@ def get_range(s, option_name=None, parse_float=False, one_value=True): raise Exception('Bad range spec \'%s\'; must be \'value\' or ' '\'value1 value2\'' % (s)) -def get_tcp_node(addr_port): - """ - Return the name of the node corresponding to the argument, or None - if no corresponding node could be found. - addr_port: A hex string used in TCP timetrace entries: the lower - 16 bits are a port number and the upper 16 bits are - the low 16 bits of a node's IP address. - """ - global ip_to_node - - key = addr_port[:-4] - if key in ip_to_node: - return ip_to_node[key] - return None - -def get_tcp_packet(source, dest, data_bytes, seq_ack): - """ - Returns the entry in tcp_packets corresponding to the arguments. Creates - a new packet if it doesn't already exist. - - source: Hex string identifying source for packet; lower 16 bits are - port number, upper 16 bits are low-order 16-bits of IP address - dest: Hex string identifying destination for packet; same format - as source - data_bytes: Amount of payload data in the packet - seq_ack: Packet sequence number if data_bytes != 0, otherwise - ack sequence number from packet - """ - global tcp_packets - - # Distinguish data packets (those where data_bytes is nonzero) from - # packets that are purely acknowledgments (data_bytes is zero). 
- if data_bytes != 0: - key = f'{source} {dest} {seq_ack} data' - else: - key = f'{source} {dest} {seq_ack} ack' - if key in tcp_packets: - return tcp_packets[key] - pkt = {'type': 'tcp', 'source': source, 'dest': dest, 'seq_ack': seq_ack, - 'retransmits': [], 'segments': []} - tcp_packets[key] = pkt - return pkt - def get_recv_length(offset, msg_length=None): """ Compute the length of a received packet. Uses information collected in the @@ -935,6 +946,49 @@ def get_sorted_nodes(): return get_sorted_nodes.result get_sorted_nodes.result = None +def get_tcp_node(addr_port): + """ + Return the name of the node corresponding to the argument, or None + if no corresponding node could be found. + addr_port: A hex string used in TCP timetrace entries: the lower + 16 bits are a port number and the upper 16 bits are + the low 16 bits of a node's IP address. + """ + global ip_to_node + + key = addr_port[:-4] + if key in ip_to_node: + return ip_to_node[key] + return None + +def get_tcp_packet(source, dest, data_bytes, seq_ack): + """ + Returns the entry in tcp_packets corresponding to the arguments. Creates + a new packet if it doesn't already exist. + + source: Hex string identifying source for packet; lower 16 bits are + port number, upper 16 bits are low-order 16-bits of IP address + dest: Hex string identifying destination for packet; same format + as source + data_bytes: Amount of payload data in the packet + seq_ack: Packet sequence number if data_bytes != 0, otherwise + ack sequence number from packet + """ + global tcp_packets + + # Distinguish data packets (those where data_bytes is nonzero) from + # packets that are purely acknowledgments (data_bytes is zero). + if data_bytes != 0: + key = f'{source} {dest} {seq_ack} data' + else: + key = f'{source} {dest} {seq_ack} ack' + if key in tcp_packets: + return tcp_packets[key] + pkt = {'type': 'tcp', 'source': source, 'dest': dest, 'seq_ack': seq_ack, + 'retransmits': [], 'segments': [], 'tx_node': '', 'rx_node': ''} + tcp_packets[key] = pkt + return pkt + def get_time_stats(samples): """ Given a list of elapsed times, returns a string containing statistics @@ -1147,8 +1201,7 @@ def print_pkts(pkts, header=True, comment=False): rx += len(seg['retransmits']) rx_msg = str(rx) if rx > 0 else "" - line = '%-8s %-8s %10s %10s' % (pkt['tx_node'], - pkt['rx_node'] if 'rx_node' in pkt else "", + line = '%-8s %-8s %10s %10s' % (pkt['tx_node'], pkt['rx_node'], print_if(xmit, '%.3f'), qdisc_string) if pkt['type'] == 'data': line += ' %10d %6d' % (pkt['id'], pkt['offset']) @@ -1184,50 +1237,99 @@ def print_rpcs(client_rpcs, header=True): buf = StringIO() if header: - buf.write('# Client: Node that sent the RPC request\n') - buf.write('# Server: Node that handled the RPC and sent response\n') - buf.write('# Id: RPC identifier (client side)\n') - buf.write('# Length: Length of request message\n') - buf.write('# RqNic: Elapsed time from sendmsg until first ' + buf.write('Start: Time when homa_sendmsg was invoked for request\n') + buf.write('Client: Node that sent the RPC request\n') + buf.write('Server: Node that handled the RPC and sent response\n') + buf.write('Id: RPC identifier (client side)\n') + buf.write('Length: Length of request message\n') + buf.write('RqNic: Elapsed time from sendmsg until first ' 'request packet handed\n') - buf.write('# off to NIC\n') - buf.write('# RqGRO: Time from NIC handoff to GRO receipt for ' + buf.write(' off to NIC\n') + buf.write('RqGRO: Time from NIC handoff to GRO receipt for ' 'first request packet\n') - buf.write('# 
RqSoft: Time from GRO to SoftIRQ for first request ' + buf.write('RqSoft: Time from GRO to SoftIRQ for first request ' 'packet\n') - buf.write('# RqRecv: Time from SoftIRQ for first request packet ' + buf.write('RqRecv: Time from SoftIRQ for first request packet ' 'until recvmsg completes\n') - buf.write('# on server\n') - buf.write('# Srvc: Time from recvmsg return on server until ' + buf.write(' on server\n') + buf.write('Srvc: Time from recvmsg return on server until ' 'sendmsg for response\n') - buf.write('# RspNic: Elapsed time from sendmsg of response until ' + buf.write('RspNic: Elapsed time from sendmsg of response until ' 'first packet handed\n') - buf.write('# off to NIC\n') - buf.write('# RspGRO: Time from NIC handoff to GRO receipt for ' + buf.write(' off to NIC\n') + buf.write('RspGRO: Time from NIC handoff to GRO receipt for ' 'first response packet\n') - buf.write('# RspSoft: Time from GRO to SoftIRQ for first response ' + buf.write('RspSoft: Time from GRO to SoftIRQ for first response ' 'packet\n') - buf.write('# RspRecv: Time from SoftIRQ for first response packet ' + buf.write('RspRecv: Time from SoftIRQ for first response packet ' 'until RPC completes\n') - buf.write('# Total: End-to-end RTT\n\n') - buf.write('Client Server Id Length RqNic RqGRO ') + buf.write('End: Time when response was returned to client\n') + buf.write('Rtt: End-to-end RTT\n\n') + buf.write('Start Client Server Id Length RqNic RqGRO ') buf.write('RqSoft RqRecv Srvc RspNic RspGRO ') - buf.write('RspSoft RspRecv Total\n') + buf.write('RspSoft RspRecv End Rtt\n') for rpc in client_rpcs: - srpc = rpcs[rpc['id'] ^ 1] - tx = rpc['send_data_pkts'][0] - rx = rpc['softirq_data_pkts'][0] - buf.write('%-8s %-8s %10s %7d %6.1f %6.1f' % ( - tx['tx_node'], tx['rx_node'], rpc['id'], rpc['out_length'], - tx['nic'] - rpc['sendmsg'], tx['gro'] - tx['nic'])) - buf.write(' %6.1f %6.1f %6.1f %6.1f %6.1f' % ( - tx['softirq'] - tx['gro'], - srpc['recvmsg_done'] - tx['softirq'], - srpc['sendmsg'] - srpc['recvmsg_done'], - rx['nic'] - srpc['sendmsg'], rx['gro'] - rx['nic'])) - buf.write(' %7.1f %7.1f %6.1f\n' % ( - rx['softirq'] - rx['gro'], rpc['recvmsg_done'] - rx['softirq'], - rpc['recvmsg_done'] - rpc['sendmsg'])) + peer_id = rpc['id'] ^ 1 + if peer_id in rpcs: + srpc = rpcs[peer_id] + else: + srpc = {} + tx = rpc['send_data_pkts'][0] if rpc['send_data_pkts'] else {} + rx = rpc['softirq_data_pkts'][0] if rpc['softirq_data_pkts'] else {} + if 'sendmsg' in rpc: + start = '%.3f' % (rpc['sendmsg']) + else: + start = '' + if 'nic' in tx and 'sendmsg' in rpc: + rq_nic = '%.1f' % (tx['nic'] - rpc['sendmsg']) + else: + rq_nic = '' + if 'gro' in tx and 'nic' in tx: + rq_gro = '%.1f' % (tx['gro'] - tx['nic']) + else: + rq_gro = '' + if 'softirq' in tx and 'gro' in tx: + rq_soft = '%.1f' % (tx['softirq'] - tx['gro']) + else: + rq_soft = '' + if 'recvmsg_done' in srpc and 'softirq' in tx: + rq_recv = '%.1f' % (srpc['recvmsg_done'] - tx['softirq']) + else: + rq_recv = '' + if 'sendmsg' in srpc and 'recvmsg_done' in srpc: + srvc = '%.1f' % (srpc['sendmsg'] - srpc['recvmsg_done']) + else: + srvc = '' + if 'nic' in rx and 'sendmsg' in srpc: + rsp_nic = '%.1f' % (rx['nic'] - srpc['sendmsg']) + else: + rsp_nic = '' + if 'gro' in rx and 'nic' in rx: + rsp_gro = '%.1f' % (rx['gro'] - rx['nic']) + else: + rsp_gro = '' + if 'softirq' in rx and 'gro' in rx: + rsp_soft = '%.1f' % (rx['softirq'] - rx['gro']) + else: + rsp_soft = '' + if 'recvmsg_done' in rpc and 'softirq' in rx: + rsp_recv = '%.1f' % (rpc['recvmsg_done'] - rx['softirq']) + else: 
+ rsp_recv = '' + if 'recvmsg_done' in rpc and 'sendmsg' in rpc: + rtt = '%.1f' % (rpc['recvmsg_done'] - rpc['sendmsg']) + else: + rtt = '' + if 'recvmsg_done' in rpc: + end = '%.3f' % (rpc['recvmsg_done']) + else: + end = '' + buf.write('%10s %-8s %-8s %10s %7d %6s %6s' % (start, + rpc['node'], get_rpc_node(peer_id), rpc['id'], rpc['out_length'], + rq_nic, rq_gro)) + buf.write(' %6s %6s %6s %6s %6s' % ( + rq_soft, rq_recv, srvc, rsp_nic, rsp_gro)) + buf.write(' %7s %7s %10s %6s\n' % (rsp_soft, rsp_recv, end, rtt)) return buf.getvalue() def print_tcp_rpcs(rpcs, header=True): @@ -1251,7 +1353,9 @@ def print_tcp_rpcs(rpcs, header=True): buf.write(' off to NIC\n') buf.write('ReqNet: Time from NIC handoff to GRO receipt for ' 'first request packet\n') - buf.write('ReqRecv: Time from GRO for last request packet ' + buf.write('ReqSft: Time from GRO for last request packet until ' + 'SoftIRQ for it\n') + buf.write('ReqRecv: Time from SoftIRQ for last request packet ' 'until recvmsg completes\n') buf.write(' on server\n') buf.write('Srvc: Time from recvmsg return on server until ' @@ -1261,13 +1365,15 @@ def print_tcp_rpcs(rpcs, header=True): buf.write(' off to NIC\n') buf.write('RspNet: Time from NIC handoff to GRO receipt for ' 'first response packet\n') - buf.write('RspRecv: Time from GRO for last response packet ' + buf.write('RspSft: Time from GRO for last response packet until ' + 'SoftIRQ for it\n') + buf.write('RspRecv: Time from SoftIRQ for last response packet ' 'until End\n') buf.write('End: Time when response was returned to client\n') buf.write('Rtt: RspRecv - Start\n\n') - buf.write('Start Client Server Length ReqSeq RspSeq ') - buf.write('ReqXmit ReqNet ReqRecv Srvc ') - buf.write('RspXmit RspNet RspRecv End Rtt\n') + buf.write('Start Client Server Length ReqSeq RspSeq ') + buf.write('ReqXmit ReqNet ReqSft ReqRecv Srvc ') + buf.write('RspXmit RspNet RspSft RspRecv End Rtt\n') for rpc in rpcs: if rpc['req_pkts']: first_req_pkt = rpc['req_pkts'][0] @@ -1286,33 +1392,41 @@ def print_tcp_rpcs(rpcs, header=True): else: resp_seq = '' if 'nic' in first_req_pkt: - rqxmit = '%.1f' % (first_req_pkt['nic'] - rpc['req_send']) + rq_xmit = '%.1f' % (first_req_pkt['nic'] - rpc['req_send']) else: - rqxmit = '' + rq_xmit = '' if 'gro' in first_req_pkt and 'nic' in first_req_pkt: - rqnet = '%.1f' % (first_req_pkt['gro'] - first_req_pkt['nic']) + rq_net = '%.1f' % (first_req_pkt['gro'] - first_req_pkt['nic']) else: - rqnet = '' - if 'gro' in last_req_pkt and 'req_recvd' in rpc: - rqrecv = '%.1f' % (rpc['req_recvd'] - last_req_pkt['gro']) + rq_net = '' + if 'gro' in last_req_pkt and 'softirq' in last_req_pkt: + rq_soft = '%.1f' % (last_req_pkt['softirq'] - last_req_pkt['gro']) else: - rqrecv = '' + rq_soft = '' + if 'softirq' in last_req_pkt and 'req_recvd' in rpc: + rq_recv = '%.1f' % (rpc['req_recvd'] - last_req_pkt['gro']) + else: + rq_recv = '' if 'req_recvd' in rpc and 'resp_send' in rpc: srvc = '%.1f' % (rpc['resp_send'] - rpc['req_recvd']) else: srvc = '' if 'nic' in first_resp_pkt: - rspxmit = '%.1f' % (first_resp_pkt['nic'] - rpc['resp_send']) + rsp_xmit = '%.1f' % (first_resp_pkt['nic'] - rpc['resp_send']) else: - rspxmit = '' + rsp_xmit = '' if 'gro' in first_resp_pkt and 'nic' in first_resp_pkt: - rspnet = '%.1f' % (first_resp_pkt['gro'] - first_resp_pkt['nic']) + rsp_net = '%.1f' % (first_resp_pkt['gro'] - first_resp_pkt['nic']) + else: + rsp_net = '' + if 'gro' in last_resp_pkt and 'softirq' in last_resp_pkt: + rsp_soft = '%.1f' % (last_resp_pkt['softirq'] - last_resp_pkt['gro']) else: - 
rspnet = ''
- if 'gro' in last_resp_pkt and 'resp_recvd' in rpc:
- rsprecv = '%.1f' % (rpc['resp_recvd'] - last_resp_pkt['gro'])
+ rsp_soft = ''
+ if 'softirq' in last_resp_pkt and 'resp_recvd' in rpc:
+ rsp_recv = '%.1f' % (rpc['resp_recvd'] - last_resp_pkt['softirq'])
 else:
- rsprecv = ''
+ rsp_recv = ''
 if 'req_send' in rpc and 'resp_recvd' in rpc:
 rtt = '%.1f' % (rpc['resp_recvd'] - rpc['req_send'])
 else:
@@ -1321,12 +1435,14 @@ def print_tcp_rpcs(rpcs, header=True):
 end = '%.3f' % (rpc['resp_recvd'])
 else:
 end = ''
- line = ('%9.3f %-8s %-8s %7d %10d %10s' % (
+ line = ('%10.3f %-8s %-8s %7d %10d %10s' % (
 rpc['req_send'], get_tcp_node(rpc['client']),
 get_tcp_node(rpc['server']), rpc['req_length'], rpc['req_seq'],
 resp_seq))
- line += (' %7s %6s %7s %6s' % (rqxmit, rqnet, rqrecv, srvc))
- line += (' %7s %6s %7s %9s %7s' % (rspxmit, rspnet, rsprecv, end, rtt))
+ line += (' %7s %6s %6s %7s %6s' % (
+ rq_xmit, rq_net, rq_soft, rq_recv, srvc))
+ line += (' %7s %6s %6s %7s %10s %7s' % (
+ rsp_xmit, rsp_net, rsp_soft, rsp_recv, end, rtt))
 buf.write(line.rstrip())
 buf.write('\n')
 return buf.getvalue()
@@ -2458,6 +2574,20 @@ def __tcp_gro(self, trace, time, core, match, interests):
 '(0x[a-f0-9]+), data bytes ([0-9]+), seq/ack ([0-9]+)'
 })
 
+ def __tcp_softirq(self, trace, time, core, match, interests):
+ source = match.group(1)
+ dest = match.group(2)
+ data_bytes = int(match.group(3))
+ seq_ack = int(match.group(4))
+ for interest in interests:
+ interest.tt_tcp_softirq(trace, time, core, source, dest, data_bytes,
+ seq_ack)
+ patterns.append({
+ 'name': 'tcp_softirq',
+ 'regexp': 'softirq got TCP packet from (0x[a-f0-9]+) to '
+ '(0x[a-f0-9]+), data bytes ([0-9]+), seq/ack ([0-9]+)'
+ })
+
 def __tcp_recvmsg(self, trace, time, core, match, interests):
 source = match.group(1)
 dest = match.group(2)
@@ -2632,20 +2762,20 @@ def print_rates(self):
 nodes = defaultdict(lambda : defaultdict(lambda: 0))
 
 for pkt in packets.values():
- if not 'tx_node' in pkt or not 'tso_length' in pkt:
+ if not pkt['tx_node'] or not 'tso_length' in pkt:
 continue
 node_stats = nodes[pkt['tx_node']]
 node_stats['homa_pkts'] += 1
 node_stats['homa_bytes'] += pkt['tso_length']
 
 for pkt in grants.values():
- if not 'tx_node' in pkt:
+ if not pkt['tx_node']:
 continue
 node_stats = nodes[pkt['tx_node']]
 node_stats['homa_grants'] += 1
 
 for pkt in tcp_packets.values():
- if not 'tx_node' in pkt:
+ if not pkt['tx_node']:
 continue
 node_stats = nodes[pkt['tx_node']]
 if not 'tso_length' in pkt:
@@ -3700,8 +3830,6 @@ def print_worst(data, label):
 grant_to_softirq = []
 grant_free = []
 
- print('Number of packets is now %d' % (len(packets)))
-
 for p, pkt in packets.items():
 if (not 'softirq' in pkt) or (not 'xmit' in pkt):
 continue
@@ -5253,7 +5381,7 @@ def analyze(self):
 if (self.tx_qid != None) and ((not 'tx_qid' in pkt)
 or (pkt['tx_qid'] != self.tx_qid)):
 continue
- tx_node = pkt['tx_node'] if 'tx_node' in pkt else None
+ tx_node = pkt['tx_node']
 if not 'length' in pkt:
 print('Packet with no length: %s' % (pkt), file=sys.stderr)
 continue
@@ -5265,7 +5393,7 @@ def analyze(self):
 nic_interval = get_interval(tx_node, tnic)
 else:
 tnic = None
- if tx_node != None:
+ if tx_node:
 if not tx_node in traces:
 print('Bogus node name %s. 
Packet: %s' % (tx_node, pkt)) print('\nTraces: %s' % (traces)) @@ -5290,7 +5418,7 @@ def analyze(self): if ('tso_length' in pkt): tso_length = pkt['tso_length'] - if tx_node != None: + if tx_node: if nic_end < 1e20: add_to_intervals(tx_node, nic_start, nic_end, 'tx_in_nic', tso_length) @@ -5339,7 +5467,7 @@ def analyze(self): if interval != None: interval['tx_gro_bytes'] += length - if not 'rx_node' in pkt: + if not pkt['rx_node']: continue rx_node = pkt['rx_node'] if tnic != None: @@ -6048,7 +6176,7 @@ def analyze(self): if rx_node == '': continue if 'gro' in pkt: - if not 'tx_node' in pkt: + if not pkt['tx_node']: print('Strange packet: %s' % (pkt)) self.rx_core[pkt['tx_node']][rx_node] = pkt['gro_core'] continue @@ -6738,7 +6866,7 @@ def output(self): # average backlog data (this calculation will consider packets # that don't have enough data to use in later calculations). for pkt in itertools.chain(packets.values(), tcp_packets.values()): - if not 'tso_length' in pkt or not 'tx_node' in pkt: + if not 'tso_length' in pkt or not pkt['tx_node']: continue length = pkt['tso_length'] node = pkt['tx_node'] @@ -7012,7 +7140,7 @@ def output(self): # Bucket all of the packets by transmitting node. for pkt in itertools.chain(packets.values(), tcp_packets.values()): - if not 'tso_length' in pkt or not 'tx_node' in pkt: + if not 'tso_length' in pkt or not pkt['tx_node']: continue if not 'tx_qid' in pkt: continue @@ -7480,7 +7608,7 @@ def output(self): # Scan all packets and fill in the variables above. for pkt in itertools.chain(packets.values(), tcp_packets.values()): - if not 'tx_node' in pkt or pkt['tx_node'] != options.node: + if not pkt['tx_node'] or pkt['tx_node'] != options.node: continue if not 'tso_length' in pkt: continue @@ -7642,7 +7770,7 @@ def output(self): type_counts = defaultdict(lambda: 0) for pkt in itertools.chain(packets.values(), tcp_packets.values(), grants.values()): - if not 'tx_node' in pkt or not 'tx_qid' in pkt: + if not pkt['tx_node'] or not 'tx_qid' in pkt: continue if pkt['type'] == 'grant': length = 0 @@ -7673,7 +7801,9 @@ def output(self): # t: list of time values for the other data series # qdisc: for each t, kbytes queued in qdiscs or NIC at t # nic: for each t, kbytes queued in the NIC at t - node_data = defaultdict(lambda: {'t': [], 'qdisc': [], 'nic': []}) + # maxq: for each t, kbytes queued in the longest NIC queue at t + node_data = defaultdict(lambda: {'t': [], 'qdisc': [], 'nic': [], + 'maxq': []}) # Process the packets in each node separately in order to populate # intervals and node_data. 
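# (A simplified sketch of the per-interval bookkeeping performed in
# the processing loop; end_interval, qid_bytes, and node_data here are
# illustrative stand-ins, not the actual tthoma.py code. qid_bytes
# maps a NIC queue id to the bytes currently sitting in that queue,
# and each interval records both the total NIC backlog and the single
# longest queue, which feeds the new 'maxq' series added below.)
from collections import defaultdict

qid_bytes = defaultdict(int)
node_data = defaultdict(lambda: {'t': [], 'nic': [], 'maxq': []})

def end_interval(node, t):
    nic_bytes = sum(qid_bytes.values())
    data = node_data[node]
    data['t'].append(t)
    data['nic'].append(nic_bytes * 1e-3)       # KB queued in the NIC
    data['maxq'].append(max(qid_bytes.values(), default=0) * 1e-3)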
@@ -7745,6 +7875,7 @@ def output(self): data['t'].append(interval_end) data['qdisc'].append((qdisc_bytes + nic_bytes) * 1e-3) data['nic'].append(nic_bytes * 1e-3) + data['maxq'].append(max(qid_bytes.values()) * 1e-3) active_queues = sum(n > 0 for n in qid_packets.values()) next = [active_queues, nic_bytes, nic_pkts, 0, 0, qdisc_bytes] @@ -7880,7 +8011,7 @@ def output(self): ax.set_xlim(0, 120) ax.set_xlabel('Tx Completion Rate (Gbps)') ax.set_ylim(0, 1.0) - ax.set_ylabel('Fraction of Intervals') + ax.set_ylabel('Fraction of %d μs Intervals' % options.interval) plt.grid(which="major", axis="y") plt.grid(which="major", axis="x") plt.plot(tput, y) @@ -7897,8 +8028,9 @@ def output(self): ax.xaxis.set_major_locator(matplotlib.ticker.MultipleLocator(2)) ax.set_xlabel('Active NIC queues') ax.set_ylim(0, 1.0) - ax.set_ylabel('Fraction of Intervals') + ax.set_ylabel('Fraction of %d μs Intervals' % options.interval) plt.grid(which="major", axis="y") + plt.grid(which="major", axis="x") plt.plot(active, y) plt.tight_layout() plt.savefig('%s/nictx_queues_cdf.pdf' % (options.plot)) @@ -7909,11 +8041,12 @@ def output(self): y = [i / len(kb) for i in range(len(kb))] fig = plt.figure(figsize=[6,4]) ax = fig.add_subplot(111) - ax.set_xlim(0, 1500) + ax.set_xlim(0, kb[95*len(kb)//100]) ax.set_xlabel('KBytes in Queued Packets') ax.set_ylim(0, 1.0) - ax.set_ylabel('Fraction of Intervals') + ax.set_ylabel('Fraction of %d μs Intervals' % options.interval) plt.grid(which="major", axis="y") + plt.grid(which="major", axis="x") plt.plot(kb, y) plt.tight_layout() plt.savefig('%s/nictx_kb_cdf.pdf' % (options.plot)) @@ -7927,8 +8060,9 @@ def output(self): ax.set_xlim(0, 5000) ax.set_xlabel('Kbytes in Packets Queued in a Qdisc') ax.set_ylim(0, 1.0) - ax.set_ylabel('Fraction of Intervals') + ax.set_ylabel('Fraction of %d μs Intervals' % options.interval) plt.grid(which="major", axis="y") + plt.grid(which="major", axis="x") plt.plot(kb, y) plt.tight_layout() plt.savefig('%s/nictx_qdisc_cdf.pdf' % (options.plot)) @@ -7942,8 +8076,9 @@ def output(self): ax.set_xlim(0, 50) ax.set_xlabel('Packets Queued in NIC') ax.set_ylim(0, 1.0) - ax.set_ylabel('Fraction of Intervals') + ax.set_ylabel('Fraction of %d μs Intervals' % options.interval) plt.grid(which="major", axis="y") + plt.grid(which="major", axis="x") plt.plot(pkts, y) plt.tight_layout() plt.savefig('%s/nictx_pkts_cdf.pdf' % (options.plot)) @@ -7957,7 +8092,7 @@ def output(self): ax.set_xlim(0, 120) ax.set_xlabel('Rate of New Bytes Queued in NIC (Gbps)') ax.set_ylim(0, 1.0) - ax.set_ylabel('Fraction of Intervals') + ax.set_ylabel('Fraction of %d μs Intervals' % options.interval) plt.grid(which="major", axis="y") plt.grid(which="major", axis="x") plt.plot(input, y) @@ -7976,7 +8111,7 @@ def output(self): node = nodes[i] ax = axes[i] ax.set_xlim(x_min, x_max) - ax.set_xlabel('Time') + ax.set_xlabel('Time (%s)' % (node)) ax.set_ylim(0, maxy) ax.set_ylabel('Kbytes Queued') ax.grid(which="major", axis="y") @@ -7992,7 +8127,28 @@ def output(self): ] fig.legend(handles=legend_handles) plt.tight_layout() - plt.savefig("%s/nictx_qtrends.pdf" % (options.plot), bbox_inches='tight') + plt.savefig("%s/nictx_qtrend.pdf" % (options.plot), bbox_inches='tight') + + # Generate time-series plot showing length of the longest NIC queue + # for each node + x_min = get_first_time() + x_max = get_last_time() + nodes = get_sorted_nodes() + maxy = max(max(node_data[node]['maxq']) for node in nodes) + fig, axes = plt.subplots(nrows=len(nodes), ncols=1, sharex=False, + figsize=[8, len(nodes)*2]) + 
for i in range(len(nodes)): + node = nodes[i] + ax = axes[i] + ax.set_xlim(x_min, x_max) + ax.set_xlabel('Time (%s)' % (node)) + ax.set_ylim(0, maxy) + ax.set_ylabel('Longest NIC Queue (KB)') + ax.grid(which="major", axis="y") + ax.plot(node_data[node]['t'], node_data[node]['maxq'], + color=color_blue) + plt.tight_layout() + plt.savefig("%s/nictx_maxqtrend.pdf" % (options.plot), bbox_inches='tight') print('\n---------------') print('Analyzer: nictx') @@ -8628,9 +8784,11 @@ def analyze(self): else: pkt['tso_length'] = tso_length + if not 'tx_node' in pkt: + pkt['tx_node'] = get_rpc_node(id) + if not 'rx_node' in pkt: - if 'peer' in tx_rpc and tx_rpc['peer'] in ip_to_node: - pkt['rx_node'] = ip_to_node[tx_rpc['peer']] + pkt['rx_node'] = get_rpc_node(id^1) if 'qdisc_xmit' in pkt: pkt['xmit2'] = pkt['qdisc_xmit'] @@ -8676,6 +8834,12 @@ def analyze(self): for pid, pkt in new_pkts: packets[pid] = pkt + for pkt in grants.values(): + if not 'tx_node' in pkt: + pkt['tx_node'] = get_rpc_node(pkt['id']) + if not 'rx_node' in pkt: + pkt['rx_node'] = get_rpc_node(pkt['id']^1) + #------------------------------------------------ # Analyzer: pairs #------------------------------------------------ @@ -8709,9 +8873,9 @@ def output(self): for pkt in packets.values(): if not 'nic' in pkt: continue - if not 'tx_node' in pkt: + if not pkt['tx_node']: continue - if not 'rx_node' in pkt: + if not pkt['rx_node']: continue src = pkt['tx_node'] dst = pkt['rx_node'] @@ -8823,7 +8987,7 @@ def output(self): node_pkts = defaultdict(list) for pkt in packets.values(): - if not 'nic' in pkt or not 'gro' in pkt or not 'rx_node' in pkt: + if not 'nic' in pkt or not 'gro' in pkt or not pkt['rx_node']: continue if not 'priority' in pkt: continue @@ -9292,12 +9456,17 @@ def output(self): #------------------------------------------------ class AnalyzeRpcs: """ - Collects information about each RPC but doesn't actually print - anything. Intended for use by other analyzers. + Print information about Homa RPCs. The options --msglen, --rpc-start, + and --rtt may be used to filter the RPCs to print. By default the RPCs + are printed in order of start time, but that may be changed with the + --sort option. The --sort option is a list of the column names Start, + End, and Rtt; the RPCs will be sorted by each keyword in order before + printing. If --verbose is specified then the packets from the selected + RPCs are also printed. 
""" def __init__(self, dispatcher): - return + dispatcher.interest('AnalyzePackets') def append(self, trace, id, t, name, value): """ @@ -9592,6 +9761,146 @@ def analyze(self): if 'out_length' in sender: rpc['in_length'] = sender['out_length'] + def output(self): + global rpcs, options + + print('\n------------------') + print('Analyzer: rpcs') + print('------------------') + + rpcs_to_print = filter_rpcs(rpcs.values(), msglen=options.msglen, + rpc_start=options.rpc_start, rtt=options.rtt) + if (options.msglen != None or options.rpc_start != None or + options.rtt != None): + print('%d Homa RPCs were selected using the following filters:' % + (len(rpcs_to_print))) + if options.msglen: + print(' --msglen %s' % (options.msglen)) + if options.rpc_start: + print(' --rpc-start %s' % (options.rpc_start)) + if options.rtt: + print(' --rtt %s' % (options.rtt)) + else: + print('There are %d Homa RPCs in the traces' % (len(rpcs_to_print))) + + sort_keys = options.sort + if sort_keys == None: + sort_keys = 'Start' + for key in sort_keys.split(): + if key == 'Start': + rpcs_to_print = sorted(rpcs_to_print, key = lambda rpc: + rpc['sendmsg'] if 'sendmsg' in rpc else 1e20) + elif key == 'End': + rpcs_to_print = sorted(rpcs_to_print, key = lambda rpc: + rpc['recvmsg_done'] if 'recvmsg_done' in rpc else 1e20) + elif key == 'Rtt': + rpcs_to_print = sorted(rpcs_to_print, reverse = True, key = lambda rpc: + rpc['recvmsg_done'] - rpc['sendmsg'] + if 'recvmsg_done' in rpc and 'sendmsg' in rpc else 0) + else: + raise Exception('Unknown sort key \'%s\' for tcp_rpcs ' + 'analyzer' % (key)) + + # Collect and print overall statistics about the RPCs. + xmit = [] + net = [] + free = [] + softirq = [] + recv = [] + srvc = [] + rtt = [] + for rpc in rpcs_to_print: + sid = rpc['id'] ^ 1 + if sid in rpcs: + srpc = rpcs[sid] + else: + srpc = {} + if rpc['send_data_pkts']: + first_req_pkt = rpc['send_data_pkts'][0] + last_req_pkt = rpc['send_data_pkts'][-1] + else: + first_req_pkt = [] + last_req_pkt = [] + if rpc['gro_data_pkts']: + first_resp_pkt = rpc['gro_data_pkts'][0] + last_resp_pkt = rpc['gro_data_pkts'][-1] + else: + first_resp_pkt = [] + last_resp_pkt = [] + if 'nic' in first_req_pkt: + xmit.append(first_req_pkt['nic'] - rpc['sendmsg']) + if 'nic' in first_resp_pkt and 'sendmsg' in srpc: + xmit.append(first_resp_pkt['nic'] - srpc['sendmsg']) + for pkt in itertools.chain(rpc['send_data_pkts'], + rpc['gro_data_pkts']): + if 'gro' in pkt and 'nic' in pkt: + net.append(pkt['gro'] - pkt['nic']) + if 'free_tx_skb' in pkt and 'nic' in pkt: + free.append(pkt['free_tx_skb'] - pkt['nic']) + if 'softirq' in pkt and 'gro' in pkt: + softirq.append(pkt['softirq'] - pkt['gro']) + if 'softirq' in last_req_pkt and 'req_recvd' in rpc: + recv.append(rpc['req_recvd'] - last_req_pkt['softirq']) + if 'softirq' in last_resp_pkt and 'recvmsg_done' in rpc: + recv.append(rpc['recvmsg_done'] - last_resp_pkt['softirq']) + if 'recvmsg_done' in srpc and 'sendmsg' in srpc: + srvc.append(srpc['sendmsg'] - srpc['recvmsg_done']) + if 'sendmsg' in rpc and 'recvmsg_done' in rpc: + rtt.append(rpc['recvmsg_done'] - rpc['sendmsg']) + for l in [xmit, net, free, recv, srvc, rtt]: + l.sort() + + print('\nOverall statistics about the selected RPCs. 
Most of these ' + 'statistics') + print('combine data from request messages and response messages.') + print('Xmit: Time from sendmsg until driver queued first ' + 'packet for NIC') + print('Net: Time from NIC handoff to GRO receipt for packets') + print('Free: Time from when NIC received packet until packet ' + 'was returned') + print(' to Linux and freed') + print('SoftIrq: Time from when packet was received by GRO until it ' + 'was received') + print(' by SoftIRQ') + print('Recv: Time from SoftIRQ for last packet in a message ' + 'until recvmsg completes') + print('Srvc: Time from recvmsg return on server until ' + 'sendmsg for response') + print('Rtt: Total time from request sendmsg until recvmsg ' + 'completes for response\n') + + print(' Min P10 P50 P90 P99 Max') + pctls = [0, 100, 500, 900, 990, 1000] + print('Xmit %8s %8s %8s %8s %8s %8s' % tuple( + print_pctl(xmit, p, '%.1f') for p in pctls)) + print('Net %8s %8s %8s %8s %8s %8s' % tuple( + print_pctl(net, p, '%.1f') for p in pctls)) + print('Free %8s %8s %8s %8s %8s %8s' % tuple( + print_pctl(free, p, '%.1f') for p in pctls)) + print('SoftIrq %8s %8s %8s %8s %8s %8s' % tuple( + print_pctl(softirq, p, '%.1f') for p in pctls)) + print('Recv %8s %8s %8s %8s %8s %8s' % tuple( + print_pctl(recv, p, '%.1f') for p in pctls)) + print('Srvc %8s %8s %8s %8s %8s %8s' % tuple( + print_pctl(srvc, p, '%.1f') for p in pctls)) + print('Rtt %8s %8s %8s %8s %8s %8s' % tuple( + print_pctl(rtt, p, '%.1f') for p in pctls)) + + # Print a summary line for each RPC. + print('\nSummary information for each selected RPC:') + print(print_rpcs(rpcs_to_print, header = True), end='') + + if options.verbose: + first = True + print('\nPackets from the selected RPCs (in the same RPC order as ' + 'above):') + for rpc in rpcs_to_print: + if not first: + print() + print(print_pkts(rpc['send_data_pkts'], header=first), end='') + print(print_pkts(rpc['gro_data_pkts'], header=False), end='') + first = False + #------------------------------------------------ # Analyzer: rtt #------------------------------------------------ @@ -10837,7 +11146,7 @@ def output(self): global tcp_rpcs, options print('\n------------------') - print('Analyzer: tcp_rpcs') + print('Analyzer: rpcs') print('------------------') if (options.msglen != None or options.rpc_start != None or @@ -10873,13 +11182,14 @@ def output(self): rpc['resp_recvd'] - rpc['req_send'] if 'resp_recvd' in rpc and 'req_send' in rpc else 0) else: - raise Exception('Unknwon sort key \'%s\' for tcp_rpcs ' + raise Exception('Unknown sort key \'%s\' for tcp_rpcs ' 'analyzer' % (key)) # Collect and print overall statistics about the RPCs. 
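        # Each list below collects one sample per qualifying packet or RPC;
        # the lists are sorted before print_pctl extracts percentiles from
        # them.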
xmit = [] net = [] free = [] + softirq = [] recv = [] srvc = [] rtt = [] @@ -10905,10 +11215,12 @@ def output(self): net.append(pkt['gro'] - pkt['nic']) if 'free_tx_skb' in pkt and 'nic' in pkt: free.append(pkt['free_tx_skb'] - pkt['nic']) - if 'gro' in last_req_pkt and 'req_recvd' in rpc: - recv.append(rpc['req_recvd'] - last_req_pkt['gro']) - if 'gro' in last_resp_pkt and 'resp_recvd' in rpc: - recv.append(rpc['resp_recvd'] - last_resp_pkt['gro']) + if 'softirq' in pkt and 'gro' in pkt: + softirq.append(pkt['softirq'] - pkt['gro']) + if 'softirq' in last_req_pkt and 'req_recvd' in rpc: + recv.append(rpc['req_recvd'] - last_req_pkt['softirq']) + if 'softirq' in last_resp_pkt and 'resp_recvd' in rpc: + recv.append(rpc['resp_recvd'] - last_resp_pkt['softirq']) if 'req_recvd' in rpc and 'resp_send' in rpc: srvc.append(rpc['resp_send'] - rpc['req_recvd']) if 'req_send' in rpc and 'resp_recvd' in rpc: @@ -10925,7 +11237,10 @@ def output(self): print('Free: Time from when NIC received packet until packet ' 'was returned') print(' to Linux and freed') - print('Recv: Time from GRO for last packet in a message ' + print('SoftIrq: Time from when packet was received by GRO until it ' + 'was received') + print(' by SoftIRQ') + print('Recv: Time from SoftIRQ for last packet in a message ' 'until recvmsg completes') print('Srvc: Time from recvmsg return on server until ' 'sendmsg for response') @@ -10940,6 +11255,8 @@ def output(self): print_pctl(net, p, '%.1f') for p in pctls)) print('Free %8s %8s %8s %8s %8s %8s' % tuple( print_pctl(free, p, '%.1f') for p in pctls)) + print('SoftIrq %8s %8s %8s %8s %8s %8s' % tuple( + print_pctl(softirq, p, '%.1f') for p in pctls)) print('Recv %8s %8s %8s %8s %8s %8s' % tuple( print_pctl(recv, p, '%.1f') for p in pctls)) print('Srvc %8s %8s %8s %8s %8s %8s' % tuple( @@ -11263,6 +11580,16 @@ def tt_tcp_gro(self, trace, t, core, source, dest, data_bytes, seq_ack): tcp_pkt['rx_node'] = node set_tcp_ip_node(dest, node) + def tt_tcp_softirq(self, trace, t, core, source, dest, data_bytes, seq_ack): + global tcp_hdr_length + + tcp_pkt = get_tcp_packet(source, dest, data_bytes, seq_ack) + node = trace['node'] + tcp_pkt['length'] = data_bytes + tcp_pkt['softirq'] = t + tcp_pkt['rx_node'] = node + set_tcp_ip_node(dest, node) + def analyze(self): """ This method post-processes all of the TCP packets to fill in missing @@ -11278,11 +11605,11 @@ def analyze(self): # Pass 1: divide data packets into buckets for unidirectional # streams, and also fill in a fiew fields. 
for pkt in tcp_packets.values(): - if not 'tx_node' in pkt: + if not pkt['tx_node']: node = get_tcp_node(pkt['source']) if node != None: pkt['tx_node'] = node - if not 'rx_node' in pkt: + if not pkt['rx_node']: node = get_tcp_node(pkt['source']) if node != None: pkt['rx_node'] = node @@ -11325,21 +11652,37 @@ class AnalyzeTemp: def __init__(self, dispatcher): # dispatcher.interest('AnalyzeTcp_rpcs') # dispatcher.interest('AnalyzeRpcs') - # dispatcher.interest('AnalyzePackets') + dispatcher.interest('AnalyzePackets') dispatcher.interest('AnalyzeTcppackets') def output(self): global packets, grants, tcp_packets - selected = [] - for pkt in tcp_packets.values(): - if not 'tso_length' in pkt: - continue - if pkt['tso_length'] <= 1500: - selected.append(pkt) + # node -> dict of addr -> core, where addr is a sender address and + # core is the GRO core for that address (for Homa) + node_cores = defaultdict(dict) - selected.sort(key=lambda pkt: pkt['xmit'] if 'xmit' in pkt else 1e20) - print(print_pkts(selected, header=True), end='') + for pkt in packets.values(): + if 'gro_core' in pkt and pkt['tx_node'] and pkt['rx_node']: + node_cores[pkt['rx_node']][pkt['tx_node']] = pkt['gro_core'] + + print('\nNode Conflict Max') + total_conflicts = 0 + for node in get_sorted_nodes(): + cores = defaultdict(lambda: 0) + # print('Node %s: %s' % (node, node_cores[node])) + for addr, core in node_cores[node].items(): + cores[core] += 1 + conflicts = 0 + max_conflict = 0 + # print('Node %s core info: %s' % (node, cores)) + for count in cores.values(): + conflicts += count - 1 + if count - 1 > max_conflict: + max_conflict = count - 1 + total_conflicts += conflicts + print('%-8s %3d %3d' % (node, conflicts, max_conflict)) + print('Total conflicts: %d' % (total_conflicts)) def output_slow_pkts(self): pkts = [] @@ -11572,7 +11915,7 @@ def output(self): total_bytes = 0 pkts = 0 for pkt in itertools.chain(packets.values(), tcp_packets.values()): - if not 'tx_node' in pkt or pkt['tx_node'] != 'node4': + if pkt['tx_node'] != 'node4': continue if not 'nic' in pkt or pkt['nic'] < 17750 or pkt['nic'] >= 17950: continue @@ -12542,7 +12885,19 @@ def output(self): # Invoke 'analyze' methods in each analyzer, if present, to perform # postprocessing now that all the trace data has been read. +rpcs_invoked = False for analyzer in dispatcher.get_analyzers(): + # Special hack: AnalyzeRpcs and AnalyzePackets are mutually dependent, + # but we need to make sure that AnalyzeRpcs is always invoked first. + if analyzer.__class__.__name__ == 'AnalyzePackets' and not rpcs_invoked: + rpc_analyzer = dispatcher.get_analyzer('AnalyzeRpcs') + if rpc_analyzer != None: + rpc_analyzer.analyze() + rpcs_invoked = True + elif analyzer.__class__.__name__ == 'AnalyzeRpcs': + if rpcs_invoked: + continue + rpcs_invoked = True if hasattr(analyzer, 'analyze'): # print('Calling %s.analyze' % (type(analyzer).__name__), file=sys.stderr) analyzer.analyze() From bde9bee4b5677fed3d3158d338335832c311d386 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 15 Jan 2026 11:37:13 -0800 Subject: [PATCH 620/625] Add new entries in perf.txt --- perf.txt | 172 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 171 insertions(+), 1 deletion(-) diff --git a/perf.txt b/perf.txt index 9da2da25..9b1526ae 100644 --- a/perf.txt +++ b/perf.txt @@ -2,6 +2,176 @@ This file contains various notes and lessons learned concerning performance of the Homa Linux kernel module. The notes are in reverse chronological order. +67. 
(January 2026) Performance variation over reboots. On the c6620 CloudLab
+cluster, the performance of both Homa and TCP seems to vary from reboot to
+reboot; within a single boot, performance is relatively consistent.
+However, after observing this phenomenon one day, it completely disappeared
+the next day (reboots consistently resulted in "fast" behavior). There was a
+CloudLab datacenter shutdown overnight... perhaps that somehow changed the
+behavior?
+
+Each line below represents one reboot of a c6620 cluster running this command:
+
+cp_vs_tcp -w w4 -b 80 -s 20 -l /ouster/logs/test -n 6 --skip 0 --tcp yes
+--port-threads 3 --port-receivers 3 --client-ports 5 --server-ports 2
+--tcp-client-ports 10 --tcp-server-ports 20
+
+The command was run once with the old pacer and once with homa_qdisc
+enabled. In addition, for the "Both" measurements, cp_both was used
+to run Homa and TCP simultaneously (with homa_qdisc enabled):
+
+cp_both -w w4 -b 80 -s 20 -l /ouster/logs/test -n 6 --skip 0 --homa-gbps 40
+--port-threads 3 --port-receivers 3 --client-ports 5 --server-ports 2
+--tcp-client-ports 10 --tcp-server-ports 20
+
+Each measurement includes average slowdown and P99 short-message latency,
+as printed by cp_vs_tcp on the "avg slowdown" line.
+
+Homa no qdisc TCP no qdisc Homa Qdisc Tcp Qdisc Homa Both TCP Both
+ Avg P99 Avg P99 Avg P99 Avg P99 Avg P99 Avg P99
+----------------------------------------------------------------------------
+3.30 97 11.68 1258 3.30 90 10.91 850 3.67 125 9.09 766
+3.30 97 11.73 1277 3.29 90 10.72 816 3.69 125 9.17 768
+3.30 97 11.62 1273 3.31 91 10.77 837 3.68 125 9.15 784
+3.30 98 11.71 1268 3.30 90 10.75 828 3.68 124 9.15 769
+3.31 99 11.64 1268 3.60 97 11.54 906 4.38 144 11.60 892
+3.34 101 11.71 1253 3.35 94 10.92 860 3.71 125 9.19 774
+3.85 135 12.40 1501 4.08 117 12.04 961 4.77 158 13.19 1003
+3.94 143 12.53 1555 3.92 107 11.87 961 5.12 204 14.28 1126
+4.20 255 12.86 1694 4.20 133 12.71 1053
+
+The following experiments were run repeatedly without rebooting the nodes
+(two different reboots separated by a blank line):
+
+Homa no qdisc TCP no qdisc Homa Qdisc Tcp Qdisc Homa Both TCP Both
+ Avg P99 Avg P99 Avg P99 Avg P99 Avg P99 Avg P99
+----------------------------------------------------------------------------
+3.30 97 11.73 1277 3.29 90 10.72 816 3.69 125 9.17 768
+3.30 97 11.74 1270 3.31 90 10.72 823 3.67 125 9.05 764
+3.30 97 11.68 1259 3.30 90 10.92 852 3.67 124 9.11 762
+3.30 97 11.68 1267 3.30 90 10.79 831 3.69 125 9.21 766
+3.29 97 11.74 1291 3.30 90 10.79 837 3.68 125 9.12 767
+3.29 97 11.75 1276 3.30 90 10.68 814 3.68 125 9.16 773
+
+3.97 138 12.72 1599 4.06 110 12.33 1002 4.91 173 13.78 1083
+4.07 146 12.74 1632 4.03 112 12.04 966 4.72 162 13.49 1020
+4.09 150 12.63 1577 4.06 114 12.43 1013 5.05 181 14.41 1145
+4.08 148 12.59 1557 4.11 114 12.33 1000 5.02 177 14.39 1093
+4.17 180 12.65 1603 3.94 106 12.20 988 4.95 178 14.30 1130
+3.99 132 12.74 1629 4.05 111 12.25 989 5.01 172 14.01 1085
+4.02 143 12.53 1558 4.09 113 11.91 959 5.10 191 15.18 1208
+3.89 126 12.54 1590 4.28 120 12.08 981 5.14 182 14.07 1111
+
+66. (January 2026) Evaluated benchmarking parameters for 100 Gbps networks
+(c6620 CloudLab cluster). 
Overall, for W4 the best parameters for Homa are: + +--port-threads 3 --port-receivers 3 --client-ports 5 --server-ports 2 +('--client-ports 4 --server-ports 2' and '--client-ports 3 --server-ports 3' +are about the same) + +and for Tcp: + +--tcp-client-ports 10 --tcp-server-ports 20 + +Here are more detailed measurements: +Thr: --port-threads and --port-receivers +CPorts: --client-ports +SPorts: --server-ports +TcpCP: --tcp-client-ports +TcpSP: --tcp-server-ports +HomaS: Average slowdown for Homa +HomaP99: P99 latency for short messages for Homa (usecs) +TcpS: Average slowdown for TCP +TcpP99: P99 latency for short messages for TCP (usecs) + +Homa under cp_vs_tcp with homa_qdisc (c6620, 6 nodes): +Note: these measurements were taken with a "good" boot configuration +-w -b Thr CPorts SPorts HomaS HomaP99 +------------------------------------------- +w3 34 2 3 3 1.98 142 +w3 34 2 4 4 1.77 98 +w3 34 2 5 5 1.67 76 +w3 34 2 6 6 1.68 72 +w3 34 2 7 7 1.69 69 max tput (47.4 Gbps) +w3 34 2 8 8 1.70 69 + +w4 80 3 5 1 3.44 170 +w4 80 3 4 2 3.36 94 +w4 80 3 5 2 3.35 94 +w4 80 3 3 3 3.41 96 +w4 80 3 4 4 3.51 99 +w4 80 3 5 5 3.53 100 +w4 80 3 3 5 3.58 104 +w4 80 3 5 3 3.43 94 +w4 80 2 4 4 3.43 109 +w4 80 2 5 5 3.43 103 +w4 80 2 6 6 3.47 102 +w4 80 2 7 7 3.51 104 + +w5 80 2 6 4 8.04 177 +w5 80 3 4 2 7.74 136 +w5 80 3 3 3 8.25 141 +w5 80 3 4 4 8.42 141 + +TCP under cp_vs_tcp with homa_qdisc (c6620, 6 nodes): +Note: these measurements were taken with a "good" boot configuration +-w -b TcpCP TcpSP TcpS TcpP99 +------------------------------------ +w3 34 4 8 3.10 445 +w3 34 5 10 3.72 516 +w3 34 6 12 3.50 430 +w3 34 7 14 3.53 390 +w3 34 8 16 3.63 368 max tput (42.7 Gbps) +w3 34 9 18 3.80 361 + +w4 80 2 4 25.31 4040 +w4 80 3 6 13.83 1790 +w4 80 4 8 12.42 1536 +w4 80 5 10 12.23 1461 +w4 80 6 12 12.28 1342 +w4 80 7 14 11.68 1105 +w4 80 8 16 11.40 980 +w4 80 9 18 10.87 872 +w4 80 10 20 10.79 843 +w4 80 12 24 11.41 821 +w4 80 15 30 15.74 915 + +w5 80 6 12 16.00 1927 +w5 80 8 16 15.80 1866 +w5 80 10 20 15.27 1636 +w5 80 12 24 15.38 1478 + +Explored configuration for cp_both (c6620 cluster, -w w4 -b 80): +Note: these measurements were taken with a "good" boot configuration +(Hgbps is the --homa_gbps parameter) + +HGbps Thr CPorts SPorts TcpCP TcpSP HomaS HomaP99 TcpS TcpP99 +---------------------------------------------------------------------- +5 3 1 1 8 16 5.02 268 10.83 929 +5 3 2 2 10 20 4.54 205 10.50 842 +5 3 3 3 12 24 4.45 179 11.07 818 +5 3 4 2 10 15 4.23 173 10.23 892 +5 3 5 2 12 12 3.95 136 10.18 932 +5 3 6 2 8 24 4.36 181 10.30 830 +5 2 4 2 16 16 4.02 148 10.52 856 +5 2 5 2 16 20 4.32 182 11.40 844 + +20 3 4 2 10 20 3.93 154 10.14 828 +20 3 5 2 10 20 3.86 146 10.06 815 +20 3 5 3 10 20 3.89 145 10.13 824 + +40 3 5 2 10 20 3.71 125 9.19 774 +40 3 5 3 10 20 3.71 124 9.21 763 +40 3 5 3 8 16 3.72 126 8.99 794 + +60 3 4 2 10 20 3.48 106 7.60 643 +60 3 5 2 10 20 3.47 104 7.62 647 +60 3 5 3 10 20 3.48 103 7.60 635 + +75 3 4 2 10 20 3.27 94 7.38 560 +75 3 5 2 10 20 3.24 93 7.36 553 +75 3 5 3 3 6 3.42 93 7.04 602 +75 3 5 3 8 16 3.31 91 7.21 565 + 65. (December 2025) The pacer does not prevent NIC queue buildup. Under "-w w4 -b 80" on cc620 machines (Intel NICs) it is not unusual to see periods of 1ms or longer with more than 500 Kbytes of packet data queued @@ -996,7 +1166,7 @@ Event Median 1. Without RPS enabled, Homa performance is limited by a single core handling all softirq actions. 
In order for RPS to work well, Homa must implement its own hash function for mapping packets to cores (the default IP hasher - doesn't know about Homa ports, so it considers only the peer IP address. + doesn't know about Homa ports, so it considers only the peer IP address). However, with RPS, packets can get spread out over too many cores, which causes poor latency when there is a single client and the server is underloaded. From ee539b2d0ee894ca4071ae046b981127bab747f7 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 15 Jan 2026 11:48:49 -0800 Subject: [PATCH 621/625] Use spin_lock_bh instead of spin_lock_irqrestore in homa_qdisc --- homa_qdisc.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/homa_qdisc.c b/homa_qdisc.c index e9f60af6..5f356585 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -423,18 +423,17 @@ int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt, void homa_qdisc_destroy(struct Qdisc *qdisc) { struct homa_qdisc *q = qdisc_priv(qdisc); - unsigned long flags; qdisc_reset_queue(qdisc); - spin_lock_irqsave(&q->qdev->defer_lock, flags); + spin_lock_bh(&q->qdev->defer_lock); while (!skb_queue_empty(&q->deferred_tcp)) kfree_skb_reason(__skb_dequeue(&q->deferred_tcp), SKB_DROP_REASON_QDISC_DROP); list_del_init(&q->defer_links); if (q->qdev->congested_qdisc == q) q->qdev->congested_qdisc = NULL; - spin_unlock_irqrestore(&q->qdev->defer_lock, flags); + spin_unlock_bh(&q->qdev->defer_lock); homa_qdisc_qdev_put(q->qdev); } @@ -627,13 +626,12 @@ void homa_qdisc_defer_tcp(struct homa_qdisc *q, struct sk_buff *skb) { struct homa_qdisc_dev *qdev = q->qdev; u64 now = homa_clock(); - unsigned long flags; tt_record_tcp("homa_qdisc deferring TCP packet from " "0x%x to 0x%x, data bytes %d, seq/ack %u", skb, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); - spin_lock_irqsave(&qdev->defer_lock, flags); + spin_lock_bh(&qdev->defer_lock); __skb_queue_tail(&q->deferred_tcp, skb); if (list_empty(&q->defer_links)) list_add_tail(&q->defer_links, &qdev->deferred_qdiscs); @@ -642,7 +640,7 @@ void homa_qdisc_defer_tcp(struct homa_qdisc *q, struct sk_buff *skb) else wake_up(&qdev->pacer_sleep); qdev->last_defer = now; - spin_unlock_irqrestore(&qdev->defer_lock, flags); + spin_unlock_bh(&qdev->defer_lock); } /** @@ -656,9 +654,8 @@ void homa_qdisc_defer_homa(struct homa_qdisc_dev *qdev, struct sk_buff *skb) struct homa_skb_info *info = homa_get_skb_info(skb); struct homa_rpc *rpc = info->rpc; u64 now = homa_clock(); - unsigned long flags; - spin_lock_irqsave(&qdev->defer_lock, flags); + spin_lock_bh(&qdev->defer_lock); __skb_queue_tail(&rpc->qrpc.packets, skb); if (skb_queue_len(&rpc->qrpc.packets) == 1) { int bytes_left; @@ -673,7 +670,7 @@ void homa_qdisc_defer_homa(struct homa_qdisc_dev *qdev, struct sk_buff *skb) else wake_up(&qdev->pacer_sleep); qdev->last_defer = now; - spin_unlock_irqrestore(&qdev->defer_lock, flags); + spin_unlock_bh(&qdev->defer_lock); } /** @@ -716,7 +713,6 @@ void homa_qdisc_insert_rb(struct homa_qdisc_dev *qdev, struct homa_rpc *rpc) int homa_qdisc_xmit_deferred_tcp(struct homa_qdisc_dev *qdev) { struct homa_qdisc *q; - unsigned long flags; struct sk_buff *skb; int pkt_len; @@ -732,9 +728,9 @@ int homa_qdisc_xmit_deferred_tcp(struct homa_qdisc_dev *qdev) * the same NIC queue as a long message. 
*/ - spin_lock_irqsave(&qdev->defer_lock, flags); + spin_lock_bh(&qdev->defer_lock); if (list_empty(&qdev->deferred_qdiscs)) { - spin_unlock_irqrestore(&qdev->defer_lock, flags); + spin_unlock_bh(&qdev->defer_lock); return 0; } if (qdev->next_qdisc == &qdev->deferred_qdiscs) @@ -753,7 +749,7 @@ int homa_qdisc_xmit_deferred_tcp(struct homa_qdisc_dev *qdev) qdev->last_defer = 0; } } - spin_unlock_irqrestore(&qdev->defer_lock, flags); + spin_unlock_bh(&qdev->defer_lock); pkt_len = qdisc_pkt_len(skb); homa_qdisc_update_link_idle(qdev, pkt_len, -1); @@ -780,13 +776,12 @@ struct sk_buff *homa_qdisc_get_deferred_homa(struct homa_qdisc_dev *qdev) struct homa_rpc *rpc; struct rb_node *node; struct sk_buff *skb; - unsigned long flags; int bytes_left; - spin_lock_irqsave(&qdev->defer_lock, flags); + spin_lock_bh(&qdev->defer_lock); node = rb_first_cached(&qdev->deferred_rpcs); if (!node) { - spin_unlock_irqrestore(&qdev->defer_lock, flags); + spin_unlock_bh(&qdev->defer_lock); return NULL; } qrpc = container_of(node, struct homa_rpc_qdisc, rb_node); @@ -810,7 +805,7 @@ struct sk_buff *homa_qdisc_get_deferred_homa(struct homa_qdisc_dev *qdev) INC_METRIC(nic_backlog_cycles, homa_clock() - qdev->last_defer); qdev->last_defer = 0; } - spin_unlock_irqrestore(&qdev->defer_lock, flags); + spin_unlock_bh(&qdev->defer_lock); return skb; } From ed32399658358e5b036315fd0bcd167f925ce9af Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Mon, 19 Jan 2026 11:09:40 -0800 Subject: [PATCH 622/625] Add FIFO scheduling to homa_qdisc Also add new metric pacer_fifo_bytes. --- homa_metrics.c | 2 + homa_metrics.h | 6 + homa_qdisc.c | 82 +++++++++++- homa_qdisc.h | 25 ++++ test/unit_homa_qdisc.c | 285 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 393 insertions(+), 7 deletions(-) diff --git a/homa_metrics.c b/homa_metrics.c index 5639eb70..dee3b123 100644 --- a/homa_metrics.c +++ b/homa_metrics.c @@ -301,6 +301,8 @@ char *homa_metrics_print(void) "Homa packets transmitted by the pacer\n"); M("pacer_homa_bytes", m->pacer_homa_bytes, "Homa bytes transmitted by the pacer (including headers)\n"); + M("pacer_fifo_bytes", m->pacer_fifo_bytes, + "Homa bytes transmitted using FIFO priority (including headers)\n"); M("pacer_tcp_packets", m->pacer_tcp_packets, "TCP packets transmitted by the pacer\n"); M("pacer_tcp_bytes", m->pacer_tcp_bytes, diff --git a/homa_metrics.h b/homa_metrics.h index 8caa6df1..ed73dba5 100644 --- a/homa_metrics.h +++ b/homa_metrics.h @@ -415,6 +415,12 @@ struct homa_metrics { */ u64 pacer_homa_bytes; + /** + * @pacer_fifo_bytes: total number of bytes in Homa packets that + * were transmitted using FIFO priority rather than SRPC. + */ + u64 pacer_fifo_bytes; + /** * @pacer_tcp_packets: total number of TCP packets that were * transmitted by homa_qdisc_pacer (they were deferred because of diff --git a/homa_qdisc.c b/homa_qdisc.c index 5f356585..9fa07e64 100755 --- a/homa_qdisc.c +++ b/homa_qdisc.c @@ -701,6 +701,10 @@ void homa_qdisc_insert_rb(struct homa_qdisc_dev *qdev, struct homa_rpc *rpc) rb_link_node(&rpc->qrpc.rb_node, parent, new); rb_insert_color_cached(&rpc->qrpc.rb_node, &qdev->deferred_rpcs, leftmost); + + if (qdev->oldest_rpc && rpc->msgout.init_time < + qdev->oldest_rpc->msgout.init_time) + qdev->oldest_rpc = rpc; } /** @@ -762,6 +766,34 @@ int homa_qdisc_xmit_deferred_tcp(struct homa_qdisc_dev *qdev) return pkt_len; } +/** + * homa_qdisc_get_oldest() - Find and return the oldest Homa RPC with deferred + * packets for a qdev. + * @qdev: Info about deferred RPCs is stored here. 
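+ *
+ * The result is cached in qdev->oldest_rpc (the cache is invalidated
+ * when the oldest RPC's last deferred packet is dequeued), so repeated
+ * calls are usually cheap.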
+ * Return: See above. NULL is returned if there are no deferred RPCs in qdev.
+ */
+struct homa_rpc *homa_qdisc_get_oldest(struct homa_qdisc_dev *qdev)
+{
+	struct rb_node *node;
+	struct homa_rpc *rpc;
+	u64 oldest_time;
+
+	if (qdev->oldest_rpc)
+		return qdev->oldest_rpc;
+	qdev->oldest_rpc = NULL;
+	oldest_time = ~0;
+
+	for (node = rb_first_cached(&qdev->deferred_rpcs); node;
+	     node = rb_next(node)) {
+		rpc = container_of(node, struct homa_rpc, qrpc.rb_node);
+		if (rpc->msgout.init_time < oldest_time) {
+			oldest_time = rpc->msgout.init_time;
+			qdev->oldest_rpc = rpc;
+		}
+	}
+	return qdev->oldest_rpc;
+}
+
 /**
  * homa_qdisc_get_deferred_homa() - Return the highest-priority deferred Homa
  * packet and dequeue it from the structures that manage deferred packets.
@@ -776,6 +808,7 @@ struct sk_buff *homa_qdisc_get_deferred_homa(struct homa_qdisc_dev *qdev)
 	struct homa_rpc *rpc;
 	struct rb_node *node;
 	struct sk_buff *skb;
+	bool fifo = false;
 	int bytes_left;
 
 	spin_lock_bh(&qdev->defer_lock);
@@ -785,21 +818,43 @@ struct sk_buff *homa_qdisc_get_deferred_homa(struct homa_qdisc_dev *qdev)
 		return NULL;
 	}
 	qrpc = container_of(node, struct homa_rpc_qdisc, rb_node);
+	rpc = container_of(qrpc, struct homa_rpc, qrpc);
+	if (qdev->srpt_bytes <= 0 &&
+	    qdev->hnet->homa->qshared->fifo_fraction != 0) {
+		fifo = true;
+		rpc = homa_qdisc_get_oldest(qdev);
+		qrpc = &rpc->qrpc;
+		node = &qrpc->rb_node;
+	}
 	skb = skb_dequeue(&qrpc->packets);
-	if (skb_queue_len(&qrpc->packets) == 0)
+	if (skb_queue_len(&qrpc->packets) == 0) {
 		rb_erase_cached(node, &qdev->deferred_rpcs);
+		if (rpc == qdev->oldest_rpc)
+			qdev->oldest_rpc = NULL;
+	}
 
-	/* Update qrpc->bytes_left. This can change the priority of the RPC
-	 * in qdev->deferred_rpcs, but the RPC was already the highest-
-	 * priority one and its priority only gets higher, so its position
-	 * in the rbtree won't change (thus we don't need to remove and
-	 * reinsert it).
+	/* Update qrpc->tx_left and qdev->srpt_bytes. This can increase the
+	 * priority of the RPC in qdev->deferred_rpcs; if this is the FIFO RPC
+	 * then we have to remove it from the tree and reinsert it to make
+	 * sure it's in the right position (if this isn't the FIFO RPC then
+	 * its position won't change because it is already highest priority).
	 */
-	rpc = container_of(qrpc, struct homa_rpc, qrpc);
 	info = homa_get_skb_info(skb);
 	bytes_left = rpc->msgout.length - (info->offset + info->data_bytes);
 	if (bytes_left < qrpc->tx_left)
 		qrpc->tx_left = bytes_left;
+	if (fifo) {
+		if (skb_queue_len(&qrpc->packets) > 0) {
+			rb_erase_cached(node, &qdev->deferred_rpcs);
+			homa_qdisc_insert_rb(qdev, rpc);
+		}
+		qdev->srpt_bytes += (qdisc_pkt_len(skb) *
+				qdev->hnet->homa->qshared->fifo_weight) >>
+				HOMA_FIFO_WEIGHT_SHIFT;
+		INC_METRIC(pacer_fifo_bytes, qdisc_pkt_len(skb));
+	} else {
+		qdev->srpt_bytes -= qdisc_pkt_len(skb);
+	}
 	if (!homa_qdisc_any_deferred(qdev)) {
 		INC_METRIC(nic_backlog_cycles,
 			   homa_clock() - qdev->last_defer);
		qdev->last_defer = 0;
	}
@@ -1154,6 +1209,12 @@ void homa_qdev_update_sysctl(struct homa_qdisc_dev *qdev)
 		qdev->link_mbps = ksettings.base.speed;
 	}
 
+	/* Must reset srpt_bytes: if qshared->fifo_fraction was previously
+	 * zero, srpt_bytes could be an enormous negative number. Without
+	 * a reset, the pacer could transmit exclusively FIFO for a long time.
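+	 * (With fifo_fraction zero, homa_qdisc_get_deferred_homa decrements
+	 * srpt_bytes on every transmission but never credits it back.) For
+	 * scale: with fifo_fraction 200 (20%), fifo_weight is
+	 * ((1000 - 200) << HOMA_FIFO_WEIGHT_SHIFT) / 200, so a 1000-byte
+	 * FIFO packet adds 4000 to srpt_bytes: one FIFO byte for every
+	 * four SRPT bytes.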
+	 */
+	qdev->srpt_bytes = 0;
+
 	/* Compute cycles_per_mibyte based on the link speed (mibytes/sec)
 	 * and max_link_usage:
 	 *
@@ -1188,6 +1249,13 @@ void homa_qdisc_update_sysctl_deps(struct homa_qdisc_shared *qshared)
 {
 	struct homa_qdisc_dev *qdev;
+	u64 tmp;
+
+	if (qshared->fifo_fraction > 0) {
+		tmp = (1000 - qshared->fifo_fraction) << HOMA_FIFO_WEIGHT_SHIFT;
+		do_div(tmp, qshared->fifo_fraction);
+		qshared->fifo_weight = tmp;
+	}
 
 	qshared->max_nic_est_backlog_cycles = homa_ns_to_cycles(1000 *
 			qshared->max_nic_est_backlog_usecs);
diff --git a/homa_qdisc.h b/homa_qdisc.h
index b952f3a1..ab3c9ad9 100644
--- a/homa_qdisc.h
+++ b/homa_qdisc.h
@@ -107,6 +107,19 @@ struct homa_qdisc_dev {
 	 */
 	struct rb_root_cached deferred_rpcs;
 
+	/**
+	 * @oldest_rpc: The RPC in deferred_rpcs with the oldest init_time, or
+	 * NULL if not currently known.
+	 */
+	struct homa_rpc *oldest_rpc;
+
+	/**
+	 * @srpt_bytes: The number of bytes that should be transmitted from
+	 * SRPT packets before transmitting a FIFO packet. <= 0 means
+	 * the next packet transmission should be FIFO.
+	 */
+	s64 srpt_bytes;
+
 	/**
 	 * @deferred_qdiscs: List of all homa_qdiscs with non-Homa packets
 	 * that have been deferred because of NIC overload.
@@ -216,6 +229,16 @@ struct homa_qdisc_shared {
 	 */
 	int fifo_fraction;
 
+	/**
+	 * @fifo_weight: Determines how much qdev->srpt_bytes is updated
+	 * when a FIFO packet is transmitted (for each FIFO byte transmitted,
+	 * @fifo_weight >> HOMA_FIFO_WEIGHT_SHIFT SRPT bytes should be
+	 * transmitted); computed from @fifo_fraction. Valid only if
+	 * fifo_fraction is nonzero.
+	 */
+	int fifo_weight;
+#define HOMA_FIFO_WEIGHT_SHIFT 10
+
 	/**
 	 * @max_nic_est_backlog_usecs: Limits the NIC queue length: we won't
 	 * queue packets in the NIC for transmission if link_idle_time is
@@ -302,6 +325,8 @@ int homa_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		       struct sk_buff **to_free);
 void homa_qdisc_free_homa(struct homa_qdisc_dev *qdev);
 struct sk_buff *homa_qdisc_get_deferred_homa(struct homa_qdisc_dev *qdev);
+struct homa_rpc *
+	homa_qdisc_get_oldest(struct homa_qdisc_dev *qdev);
 int homa_qdisc_init(struct Qdisc *sch, struct nlattr *opt,
 		    struct netlink_ext_ack *extack);
 void homa_qdisc_insert_rb(struct homa_qdisc_dev *qdev,
diff --git a/test/unit_homa_qdisc.c b/test/unit_homa_qdisc.c
index 183c3400..5126f0d2 100644
--- a/test/unit_homa_qdisc.c
+++ b/test/unit_homa_qdisc.c
@@ -1199,6 +1199,43 @@ TEST_F(homa_qdisc, homa_qdisc_insert_rb__long_right_chain)
 		     unit_log_get());
 	homa_qdisc_qdev_put(qdev);
 }
+TEST_F(homa_qdisc, homa_qdisc_insert_rb__update_oldest_rpc)
+{
+	struct homa_rpc *srpc1, *srpc2, *srpc3;
+	struct homa_qdisc_dev *qdev;
+
+	qdev = homa_qdisc_qdev_get(self->dev);
+	srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id, 10000, 10000);
+	srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id + 2, 10000, 10000);
+	srpc3 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id + 4, 10000, 10000);
+
+	/* First insertion: oldest_rpc currently unknown, so can't update. */
+	EXPECT_EQ(NULL, qdev->oldest_rpc);
+	homa_qdisc_defer_homa(qdev,
+			      new_test_skb(srpc1, &self->addr, 5000, 1500));
+	EXPECT_EQ(NULL, qdev->oldest_rpc);
+
+	/* Second insertion: new RPC is older. 
*/ + srpc1->msgout.init_time = 10000; + srpc2->msgout.init_time = 5000; + qdev->oldest_rpc = srpc1; + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc2, &self->addr, 7000, 1500)); + EXPECT_EQ(srpc2, qdev->oldest_rpc); + + /* Third insertion: new RPC is younger than oldest_rpc. */ + srpc3->msgout.init_time = 5001; + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc3, &self->addr, 3000, 1500)); + EXPECT_EQ(srpc2, qdev->oldest_rpc); + homa_qdisc_qdev_put(qdev); +} TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__basics) { @@ -1288,6 +1325,63 @@ TEST_F(homa_qdisc, homa_qdisc_xmit_deferred_tcp__backlog_cycles_metric) EXPECT_EQ(0, q1->qdev->last_defer); } +TEST_F(homa_qdisc, homa_qdisc_get_oldest__return_cached_value) +{ + struct homa_rpc *srpc1; + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 5000); + + qdev->oldest_rpc = srpc1; + EXPECT_EQ(srpc1, homa_qdisc_get_oldest(qdev)); + + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_get_oldest__iterate_rbtree) +{ + struct homa_rpc *srpc1, *srpc2, *srpc3; + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 5000); + srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 2, 10000, 3000); + srpc3 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 4, 10000, 10000); + + srpc1->msgout.init_time = 10000; + srpc2->msgout.init_time = 5000; + srpc3->msgout.init_time = 7000; + + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc1, &self->addr, 0, 1000)); + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc2, &self->addr, 0, 1000)); + homa_qdisc_defer_homa(qdev, + new_test_skb(srpc3, &self->addr, 0, 1000)); + + EXPECT_EQ(srpc2, homa_qdisc_get_oldest(qdev)); + EXPECT_EQ(srpc2, qdev->oldest_rpc); + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_get_oldest__no_rpcs_in_rbtree) +{ + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->dev); + + EXPECT_EQ(NULL, homa_qdisc_get_oldest(qdev)); + + homa_qdisc_qdev_put(qdev); +} + TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__no_deferred_rpcs) { struct homa_qdisc_dev *qdev; @@ -1298,6 +1392,110 @@ TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__no_deferred_rpcs) EXPECT_EQ(NULL, homa_qdisc_get_deferred_homa(qdev)); homa_qdisc_qdev_put(qdev); } +TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__basics) +{ + struct homa_rpc *srpc1; + struct homa_qdisc_dev *qdev; + struct sk_buff *skb; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc1); + + skb = new_test_skb(srpc1, &self->addr, 5000, 500); + homa_qdisc_defer_homa(qdev, skb); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1235, offsets 5000]", unit_log_get()); + + qdev->srpt_bytes = 200; + EXPECT_EQ(skb, homa_qdisc_get_deferred_homa(qdev)); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("", unit_log_get()); + kfree_skb(skb); + EXPECT_EQ(-400, qdev->srpt_bytes); + EXPECT_EQ(4500, srpc1->qrpc.tx_left); + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__choose_fifo_rpc) +{ + 
struct homa_rpc *srpc1, *srpc2; + struct homa_qdisc_dev *qdev; + struct sk_buff *skb; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc1); + srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 2, 10000, 10000); + ASSERT_NE(NULL, srpc2); + + skb = new_test_skb(srpc1, &self->addr, 0, 900); + homa_qdisc_defer_homa(qdev, skb); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc2, &self->addr, 2000, + 900)); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1237, offsets 2000]; [id 1235, offsets 0]", + unit_log_get()); + srpc1->msgout.init_time = 5000; + srpc2->msgout.init_time = 6000; + + self->homa.qshared->fifo_fraction = 200; + homa_qdisc_update_sysctl_deps(self->homa.qshared); + qdev->srpt_bytes = -100; + + EXPECT_EQ(skb, homa_qdisc_get_deferred_homa(qdev)); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1237, offsets 2000]", unit_log_get()); + kfree_skb(skb); + EXPECT_EQ(3900, qdev->srpt_bytes); + EXPECT_EQ(NULL, qdev->oldest_rpc); + homa_qdisc_qdev_put(qdev); +} +TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__fifo_fraction_zero) +{ + struct homa_rpc *srpc1, *srpc2; + struct homa_qdisc_dev *qdev; + struct sk_buff *skb; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc1); + srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id + 2, 10000, 10000); + ASSERT_NE(NULL, srpc2); + + homa_qdisc_defer_homa(qdev, new_test_skb(srpc1, &self->addr, 0, 900)); + homa_qdisc_defer_homa(qdev, new_test_skb(srpc2, &self->addr, 2000, + 900)); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1237, offsets 2000]; [id 1235, offsets 0]", + unit_log_get()); + srpc1->msgout.init_time = 5000; + srpc2->msgout.init_time = 6000; + + self->homa.qshared->fifo_fraction = 0; + qdev->srpt_bytes = -100; + + skb = homa_qdisc_get_deferred_homa(qdev); + unit_log_clear(); + log_deferred(qdev); + EXPECT_STREQ("[id 1235, offsets 0]", unit_log_get()); + kfree_skb(skb); + homa_qdisc_qdev_put(qdev); +} TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__multiple_packets_for_rpc) { struct homa_qdisc_dev *qdev; @@ -1351,11 +1549,13 @@ TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__last_packet_for_rpc) log_deferred(qdev); EXPECT_STREQ("[id 1235, offsets 5000]; [id 1237, offsets 2000 3000]", unit_log_get()); + qdev->oldest_rpc = srpc1; EXPECT_EQ(skb, homa_qdisc_get_deferred_homa(qdev)); unit_log_clear(); log_deferred(qdev); EXPECT_STREQ("[id 1237, offsets 2000 3000]", unit_log_get()); + EXPECT_EQ(NULL, qdev->oldest_rpc); kfree_skb(skb); homa_qdisc_qdev_put(qdev); } @@ -1384,6 +1584,69 @@ TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__update_tx_left) homa_qdisc_qdev_put(qdev); } +TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__reposition_rpc_in_rbtree) +{ + struct homa_rpc *srpc1, *srpc2; + struct homa_qdisc_dev *qdev; + + qdev = homa_qdisc_qdev_get(self->dev); + srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + self->server_id, 10000, 10000); + ASSERT_NE(NULL, srpc1); + srpc2 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip, + &self->server_ip, self->client_port, + 
self->server_id + 2, 10000, 10000);
+	ASSERT_NE(NULL, srpc2);
+
+	homa_qdisc_defer_homa(qdev, new_test_skb(srpc1, &self->addr, 0, 1500));
+	homa_qdisc_defer_homa(qdev, new_test_skb(srpc1, &self->addr, 1500,
+						 1500));
+	homa_qdisc_defer_homa(qdev, new_test_skb(srpc2, &self->addr, 1000,
+						 1000));
+	unit_log_clear();
+	log_deferred(qdev);
+	EXPECT_STREQ("[id 1237, offsets 1000]; [id 1235, offsets 0 1500]",
+		     unit_log_get());
+
+	qdev->oldest_rpc = srpc1;
+	qdev->srpt_bytes = -100;
+
+	/* First extraction: FIFO RPC must be repositioned in rbtree. */
+	kfree_skb(homa_qdisc_get_deferred_homa(qdev));
+	unit_log_clear();
+	log_deferred(qdev);
+	EXPECT_STREQ("[id 1235, offsets 1500]; [id 1237, offsets 1000]",
+		     unit_log_get());
+
+	/* Second extraction: FIFO RPC removed from tree. */
+	qdev->oldest_rpc = srpc2;
+	qdev->srpt_bytes = -100;
+	kfree_skb(homa_qdisc_get_deferred_homa(qdev));
+	unit_log_clear();
+	log_deferred(qdev);
+	EXPECT_STREQ("[id 1235, offsets 1500]", unit_log_get());
+	homa_qdisc_qdev_put(qdev);
+}
+TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__pacer_fifo_bytes_metric)
+{
+	struct homa_rpc *srpc1;
+	struct homa_qdisc_dev *qdev;
+
+	qdev = homa_qdisc_qdev_get(self->dev);
+	srpc1 = unit_server_rpc(&self->hsk, UNIT_OUTGOING, &self->client_ip,
+				&self->server_ip, self->client_port,
+				self->server_id, 10000, 10000);
+	ASSERT_NE(NULL, srpc1);
+
+	homa_qdisc_defer_homa(qdev, new_test_skb(srpc1, &self->addr, 0, 1500));
+
+	qdev->oldest_rpc = srpc1;
+	qdev->srpt_bytes = -100;
+	kfree_skb(homa_qdisc_get_deferred_homa(qdev));
+	EXPECT_EQ(1600, homa_metrics_per_cpu()->pacer_fifo_bytes);
+	homa_qdisc_qdev_put(qdev);
+}
 TEST_F(homa_qdisc, homa_qdisc_get_deferred_homa__nic_backlog_cycles_metric)
 {
 	struct homa_qdisc_dev *qdev;
@@ -2026,10 +2289,12 @@ TEST_F(homa_qdisc, homa_qdev_update_sysctl__basics)
 	mock_link_mbps = 8000;
 	self->homa.qshared->max_link_usage = 90;
 	self->homa.qshared->max_nic_queue_usecs = 50;
+	qdev->srpt_bytes = -1000;
 	homa_qdev_update_sysctl(qdev);
 	EXPECT_EQ(8000, qdev->link_mbps);
 	EXPECT_EQ(1165084, qdev->cycles_per_mibyte);
 	EXPECT_EQ(50000, qdev->max_nic_queue_bytes);
+	EXPECT_EQ(0, qdev->srpt_bytes);
 
 	homa_qdisc_qdev_put(qdev);
 }
@@ -2050,6 +2315,26 @@ TEST_F(homa_qdisc, homa_qdev_update_sysctl__cant_get_link_speed_from_dev)
 
 	homa_qdisc_qdev_put(qdev);
 }
+TEST_F(homa_qdisc, homa_qdisc_update_sysctl_deps__fifo_fraction)
+{
+	self->homa.qshared->fifo_fraction = 500;
+	homa_qdisc_update_sysctl_deps(self->homa.qshared);
+	EXPECT_EQ(1 << HOMA_FIFO_WEIGHT_SHIFT, self->homa.qshared->fifo_weight);
+
+	self->homa.qshared->fifo_fraction = 200;
+	homa_qdisc_update_sysctl_deps(self->homa.qshared);
+	EXPECT_EQ(4 * (1 << HOMA_FIFO_WEIGHT_SHIFT),
+		  self->homa.qshared->fifo_weight);
+
+	self->homa.qshared->fifo_fraction = 800;
+	homa_qdisc_update_sysctl_deps(self->homa.qshared);
+	EXPECT_EQ((1 << HOMA_FIFO_WEIGHT_SHIFT) / 4, self->homa.qshared->fifo_weight);
+
+	self->homa.qshared->fifo_fraction = 0;
+	self->homa.qshared->fifo_weight = -1;
+	homa_qdisc_update_sysctl_deps(self->homa.qshared);
+	EXPECT_EQ(-1, self->homa.qshared->fifo_weight);
+}
 TEST_F(homa_qdisc, homa_qdisc_update_sysctl_deps__max_nic_est_backlog_cycles)
 {
 	self->homa.qshared->max_nic_est_backlog_usecs = 6;

From 796eb7ab71e569217b17582c0483cc21f21f87f2 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 19 Jan 2026 14:31:33 -0800
Subject: [PATCH 623/625] First version of homa_qdisc is now complete

This commit updates various documentation
---
 README.md | 12 ++++++++++++
 perf.txt | 29 +++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)

diff 
--git a/README.md b/README.md
index 24186b9b..d1ea532c 100644
--- a/README.md
+++ b/README.md
@@ -123,6 +123,18 @@ This repo contains an implementation of the Homa transport protocol as a Linux k
   sysctl mechanism. For details, see the man page `homa.7`.

## Significant changes
+- January 2026: introduced new 'homa_qdisc' queueing discipline to improve
+  performance when TCP and Homa run simultaneously. Results on the c6620
+  CloudLab cluster (100 Gbps network):
+  - Without homa_qdisc, if Homa and TCP run together, Homa performance
+    suffers (4x increase for P99 for short messages) but TCP performance
+    improves.
+  - Homa_qdisc improves performance for both Homa and TCP, whether
+    running stand-alone or together.
+  - homa_qdisc improves Homa short message P99 3x when running together
+    with TCP, but P99 is still slower than Homa standalone.
+  - TCP performance improves when running together with Homa, with or
+    without homa_qdisc.
- November 2025: upgraded to Linux 6.17.8.
- October 2025: added the HOMAIOCINFO ioctl for retrieving status
  information about a Homa socket. See man/homa.7 for details.
diff --git a/perf.txt b/perf.txt
index 9da2da25..68e2f00c 100644
--- a/perf.txt
+++ b/perf.txt
@@ -2,6 +2,35 @@
 This file contains various notes and lessons learned concerning performance
 of the Homa Linux kernel module. The notes are in reverse chronological
 order.

+68. (January 2026) Performance snapshot with and without pacer, using
+c6620 CloudLab nodes, "-w w4 -b 80 -s 20 -n 6". cp_vs_tcp is used unless
+cp_both is indicated.
+
+AvgSlow: "avg slowdown" from cp_vs_tcp log output
+Min: "min" from cp_vs_tcp "avg slowdown" line
+P50: "P50" from cp_vs_tcp "avg slowdown" line
+P99: "P99" from cp_vs_tcp "avg slowdown" line
+P99L: P99 for 1 MB messages, from *_w4.data file
+MaxT: Throughput under "-b100"
+
+ AvgSlow Min P50 P99 P99L MaxT
+Homa (old pacer) 3.33 22.1 50.9 98.5 3284 96.6
+Homa (homa_qdisc) 3.31 21.1 50.6 90.7 3698 94.6
+Homa (cp_both, old pacer) 4.56 23.6 56.4 379.6 4182
+Homa (cp_both, homa_qdisc) 3.72 23.5 53.6 124.6 4021
+TCP (no homa_qdisc) 11.81 32.9 180.4 1271.6 5235 94.8
+TCP (homa_qdisc) 10.80 32.6 157.1 832.6 4627 95.7
+TCP (cp_both, old pacer) 9.22 31.4 151.6 839.2 3062
+TCP (cp_both, homa_qdisc) 9.13 32.9 136.5 762.5 4127
+
+Summary:
+* Without homa_qdisc, Homa P99 suffers a lot under cp_both; with homa_qdisc
+  it improves 3x, but is still 30% slower than running without TCP.
+* homa_qdisc improves TCP performance even when running without Homa.
+* TCP performance is better running with Homa than standalone.
+* Homa_qdisc reduces Homa's maximum throughput slightly, increases TCP's
+  maximum throughput slightly.
+
 67. (January 2026) Performance variation over reboots. On the c6620 CloudLab
 cluster, the performance of both Homa and TCP seems to vary from reboot to
 reboot; within a single boot, performance is relatively consistent.
However, after From c0551f945b46a2fe828b8b57b75ddf4d39c65e93 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Tue, 20 Jan 2026 14:04:01 -0800 Subject: [PATCH 624/625] Update notes.txt --- notes.txt | 100 ++++++++++-------------------------------------------- 1 file changed, 17 insertions(+), 83 deletions(-) diff --git a/notes.txt b/notes.txt index 923468a0..b008fc3d 100755 --- a/notes.txt +++ b/notes.txt @@ -9,55 +9,29 @@ Notes for Homa implementation in Linux: * Performance problems to track down: * On xl170s, both TCP and Homa run slower with qdisc than pacer (P99 for TCP small packets increases by 50%) - * P99 latency for W4 short packets is considerably worse with IPv6 than IPv4 - (c6620). * On c6620 cluster, "cp_node client --workload 500000" gets 20 GBps each way with TCP but only 15 Gbps with Homa. +* Significant new functionality: + * Run Homa SoftIRQ on the GRO core? This would eliminate the latency and + cache overheads of switching cores, and RSS already provides adequate + load balancing. Possibly even invoke GRO during SoftIRQ? + * Don't transmit client port number. Instead, vary the contents of this + field to generate packet spraying. + * Implement zero-copy for output (may work only on client side). + * Change grant mechanism so that each message is either entirely + scheduled or entirely unscheduled (see July 2024 note below). + * Fix Homa to ensure at-most-once semantics (see "execution multiple + times" below). + * Implement some of the ideas from SIRD to reduce buffer utilization. + * Refactor the granting mechanism to grant separately for each device. + * Move interest cleanup code from homa_sock to a new function in homa_interest. Also move wakeup code from homa_rpc_handoff. * Find a way to reduce pacer core utilization? It currently takes about 0.7 core when running at high load. Maybe use polling threads instead? -* Use skb_attempt_defer_free once it has been properly exported. - -* Thoughts on making TCP and Homa play better together: - * Goals: - * Keep the NIC tx queue from growing long. - * Share bandwidth fairly between TCP and Homa - * If one protocol is using a lot less bandwidth, give it preference - for transmission? - * Balance queue lengths for the protocols? - * Approach #1: analogous to "fair share" CPU scheduling - * Keep separate tx queues for Homa and TCP - * Try to equalize the queue lengths while pacing packets at - network speed? - * Keep track of lengths for each of many queues - * Each queue paces itself based on relative lengths - * Do all pacing centrally, call back to individual queues for output? - * Approach #2: - * Keep track of recent bandwidth consumed by each protocol; when there - is overload, restrict each protocol to its fraction of recent bandwidth. - * "Consumed" has to be measured in terms of bytes offered, not bytes - actually transmitted (otherwise a protocol could get "stuck" at a low - transmittion rate?). - * Approach #3: token bucket - * Use a token bucket for each protocol with 50% of available bandwidth - (or maybe less?). Split any extra available bandwidth among the - protocols. Maybe adjust rates for the token buckets based on recent - traffic? - * Also consider the amount of data that is "stuck" in the NIC? - -* Consider eliminating SoftIRQ: process packets completely at NAPI level? 
- * This eliminates the latency and cache overheads of switching cores - for SoftIRQ - * Should also help with tail latency: eliminates one opportunity for - hot-spots - * Load balancing should still be fine (especially if port number is used - for packet spraying) - * Or, always do SoftIRQ processing on same node as NAPI? - * Eliminate use of link_mbps in homa_grant.c; perhaps replace with configuration parameter fifo_mbps? Maybe the grant mechanism needs to be net_device-specific? @@ -69,15 +43,9 @@ Notes for Homa implementation in Linux: napi_reuse_skb (return GRO_MERGED_FREE?). See also napi_get_frags (used by the driver?). * Apparently TCP has a faster way of eventually freeing the merged skb - (return things to the allocating core): see tcp_eat_recv_skb? - -* Remedies to consider for the performance problems at 100 Gbps, where - one tx channel gets very backed up: - * Implement zero-copy on output in order to reduce memory bandwidth - consumption (presumed with this will increase throughput?) - * Reserve one channel for the pacer, and don't send non-paced packets - on that channel; this should eliminate the latency problems caused - by short messages getting queued on that channel + (return things to the allocating core): see tcp_eat_recv_skb. + * This uses the function skb_attempt_defer_free, which is not currently + exported for extensions. * Rework cp_node so that there aren't separate senders and receivers on the client. Instead, have each client thread send, then conditionally receive, @@ -107,14 +75,6 @@ Notes for Homa implementation in Linux: * Eventually SoftIRQ wakes up to handle the original packet, which re-creates the RPC and it gets serviced a second time. -* Use vmap to map the user-space buffer pool so that the kernel can use - memcpy rather than copy_to_user? - -* For W3, throttle_min_bytes is a problem: a significant fraction of all - transmitted bytes aren't being counted; as a result, the NIC queue - can build up. Reducing throttle_min_bytes from 1000 to 200 reduced P99 - short message latency from 250 us to 200 us. - * Don't understand why W3 performance is so variable under Gen3. Also, it's worth comparing tthoma output for W4 and W3 under Gen2; e.g., W3 has way more active outgoing messages than W4. @@ -197,10 +157,6 @@ Notes for Homa implementation in Linux: * recvmsg doesn't seem to return an address if there is an error? May need to return the address in a different place? -* IPv6 issues: - * See if error checking made syscalls slower. - * GSO always uses SKB_GSO_TCPV6; sometimes it should be V4. - * Pinning memory: see mm.h and mm/gup.c * get_user_page * get_user_pages @@ -211,14 +167,6 @@ Notes for Homa implementation in Linux: (just find the oldest message that doesn't have a pity grant)? Also, it doesn't look like homa_grant_fifo is keeping track of pity grants precisely; perhaps add another RPC field for this? - * Re-implement the duty-cycle mechanism. Use a generalized pacer to - control grants: - * Parameters: - * Allowable throughput - * Max accumulation of credits - * Methods: - * Request (current time, amount) (possibly 2 stages: isItOk and doIt?) - * Or, just reduce the link speed and let the pacer handler this? * Perhaps limit the number of polling threads per socket, to solve the problems with having lots of receiver threads? * Move some reaping to the pacer? It has time to spare @@ -238,15 +186,6 @@ Notes for Homa implementation in Linux: everything up to the latest received offset. * Try more aggressive retries (e.g. 
if a missing packet is sufficiently long ago, don't wait for timeout). - * Eliminate hot spots involving NAPI: - * Arrange for incoming bursts to be divided into batches where - alternate batches do their NAPI on 2 different cores. - * To do this, use TCP for Homa! - * Send Homa packets using TCP, and use different ports to force - different NAPI cores - * Interpose on the TCP packet reception hooks, and redirect - real TCP packets back to TCP. - * Consider replacing grantable list with a heap? * Unimplemented interface functions. * Learn about CONFIG_COMPAT and whether it needs to be supported in struct proto and struct proto_ops. @@ -258,7 +197,6 @@ Notes for Homa implementation in Linux: * Socket not supported on server (or server process ends while processing request). * Server timeout - * Is it safe to use non-locking skb queue functions? * Is the RCU usage for sockets safe? In particular, how long is it safe to use a homa_sock returned by homa_find_socket? Could it be deleted from underneath us? This question may no longer be relevant, given the @@ -266,16 +204,12 @@ Notes for Homa implementation in Linux: * Can a packet input handler be invoked multiple times concurrently? * What is audit_sockaddr? Do I need to invoke it when I read sockaddrs from user space? - * When a struct homa is destroyed, all of its sockets end up in an unsafe - state in terms of their socktab links. * Clean up ports and ips in unit_homa_incoming.c * Plug into Linux capability mechanism (man(7) capabilities) * Don't return any errors on sends? * Homa-RAMCloud doesn't retransmit bytes if it transmitted other bytes recently; should HomaModule do the same? Otherwise, will retransmit for requests whose service time is just about equal to the resend timer. - * Check tcp_transmit_skb to make sure we are doing everything we need to - do with skbuffs (e.g., update sk_wmem_alloc?) * Add support for cgroups (e.g. 
to manage memory allocation) * Questions for Linux experts: From de3b271d24c6be2c559ca161878e523db37f655f Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Thu, 22 Jan 2026 20:04:22 -0800 Subject: [PATCH 625/625] Remove dependencies on kernel customizations from timetrace.c --- timetrace.c | 73 +---------------------------------------------------- timetrace.h | 1 - 2 files changed, 1 insertion(+), 73 deletions(-) diff --git a/timetrace.c b/timetrace.c index 60c56fd5..e196ee48 100644 --- a/timetrace.c +++ b/timetrace.c @@ -36,12 +36,6 @@ extern void homa_trace(u64 u0, u64 u1, int i0, int i1); extern void ltt_record_nop(struct tt_buffer *buffer, u64 timestamp, const char *format, u32 arg0, u32 arg1, u32 arg2, u32 arg3); -extern void (*ltt_record_sendmsg)(struct sock *sk, struct msghdr *msg); -extern void ltt_record_sendmsg_nop(struct sock *sk, struct msghdr *msg); -extern void (*ltt_record_tcp)(char *format, struct sk_buff *skb, - __be32 saddr, __be32 daddr); -extern void ltt_record_tcp_nop(char *format, struct sk_buff *skb, - __be32 saddr, __be32 daddr); #endif void tt_inc_metric(int metric, u64 count); @@ -145,8 +139,6 @@ int tt_init(char *proc_file) tt_linux_freeze_count = &tt_freeze_count; tt_linux_inc_metrics = tt_inc_metric; tt_linux_printk = tt_printk; - ltt_record_sendmsg = tt_record_sendmsg; - ltt_record_tcp = tt_record_tcp; tt_linux_dbg1 = tt_dbg1; tt_linux_dbg2 = tt_dbg2; tt_linux_dbg3 = tt_dbg3; @@ -209,8 +201,6 @@ void tt_destroy(void) tt_linux_buffers[i] = NULL; tt_linux_inc_metrics = tt_linux_skip_metrics; tt_linux_printk = tt_linux_nop; - ltt_record_sendmsg = ltt_record_sendmsg_nop; - ltt_record_tcp = ltt_record_tcp_nop; tt_linux_dbg1 = (void (*)(char *, ...)) tt_linux_nop; tt_linux_dbg2 = (void (*)(char *, ...)) tt_linux_nop; tt_linux_dbg3 = (void (*)(char *, ...)) tt_linux_nop; @@ -935,67 +925,6 @@ void tt_inc_metric(int metric, u64 count) #endif /* See strip.py */ } -/** - * tt_record_sendmsg() - Invoked by tcp_sendmsg to create a timetrace - * record for the kernel call (if a new message is being started). - * @sk: Socket on which tcp_sendmsg was invoked. - * msg: The data to transmit on the socket (in user space). - */ -void tt_record_sendmsg(struct sock *sk, struct msghdr *msg) -{ - struct inet_sock *inet = inet_sk(sk); - struct tcp_sock *tp = tcp_sk(sk); - struct iov_iter iter; - int header[3]; - int copied; - int length; - - /* This design assumes that cp_node is generating the requests, - * so new messages will always start at the beginning of - * msg (but in some cases it may take multiple calls to - * sendmsg to transmit an entire message). - */ - if (!tp->homa_init) { - tp->homa_next_seq = tp->write_seq; - tp->homa_init = 1; - } - - /* This function is intended only for use with requests generated - * by cp_node, in which case new messages will always start at the - * beginnign of msg (but but in some cases it may take multiple - * calls to sendmsg to transmit an entire message). Check to see - * if we're in the middle of a message, or if this isn't cp_node; - * if so, do nothing. - */ - iter = msg->msg_iter; - if (iov_iter_count(&iter) < sizeof(header)) - return; - copied = copy_from_iter(&header, sizeof(header), &iter); - if (copied != sizeof(header)) { - tt_record1("copy_from_iter returned %d in tt_record_sendmsg", - copied); - return; - } - length = header[0]; - if (length < iov_iter_count(&msg->msg_iter)) { - /* There isn't a Homa message at the expected place. Most - * likely this isn't a Homa socket. 
- */ - return; - } - if (tp->homa_next_seq != tp->write_seq) - return; - - tp->homa_next_seq += length; - tt_record2("tcp_sendmsg new message slot is %d, response %d", - header[2] & 0xffff, (header[2] & 0x40000) ? 1 : 0); - tt_record4("tcp_sendmsg invoked for message from 0x%x to 0x%x, " - "length %d, starting sequence %u", - (htonl(inet->inet_saddr) << 16) + htons(inet->inet_sport), - (htonl(inet->inet_daddr) << 16) + htons(inet->inet_dport), - length, tp->write_seq); -} - /** * tt_record_tcp() - Create a timetrace record for a TCP packet, formatting * data in a standard way. @@ -1011,7 +940,7 @@ void tt_record_tcp(char *format, struct sk_buff *skb, __be32 saddr, struct tcphdr *th; int data_length; - th = (struct tcphdr*) skb_transport_header(skb); + th = (struct tcphdr*)skb_transport_header(skb); data_length = skb->len - skb_transport_offset(skb) - th->doff * 4; tt_record4(format, (ntohl(saddr) << 16) + ntohs(th->source), (ntohl(daddr) << 16) + ntohs(th->dest), data_length, diff --git a/timetrace.h b/timetrace.h index 9e8eca38..1796ef6e 100644 --- a/timetrace.h +++ b/timetrace.h @@ -136,7 +136,6 @@ ssize_t tt_proc_read(struct file *file, char __user *user_buf, size_t length, loff_t *offset); int tt_proc_release(struct inode *inode, struct file *file); loff_t tt_proc_lseek(struct file *file, loff_t offset, int whence); -void tt_record_sendmsg(struct sock *sk, struct msghdr *msg); void tt_record_tcp(char *format, struct sk_buff *skb, __be32 saddr, __be32 daddr); extern struct tt_buffer *tt_buffers[];
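Note on the surviving hook: tt_record_tcp remains part of timetrace's
interface after this cleanup. For reference, homa_qdisc.c (patch 621 above)
invokes it like this when deferring a TCP segment:

	tt_record_tcp("homa_qdisc deferring TCP packet from "
		      "0x%x to 0x%x, data bytes %d, seq/ack %u",
		      skb, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);

tt_record_tcp extracts the TCP header from the skb itself (the transport
header must already be set), so callers pass only the skb and the IP
addresses; the format string must consume four values, as shown above.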